Update linux-x86 Go prebuilts from ab/9878432

https://ci.android.com/builds/branches/aosp-build-tools-release/grid?head=9878432&tail=9878432

Update script: toolchain/go/update-prebuilts.sh

Test: Treehugger presubmit
Change-Id: I07818c960e04b2ef4373ab22161590b088582d39
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md
index 61b5a51..ce0b42a 100644
--- a/src/runtime/HACKING.md
+++ b/src/runtime/HACKING.md
@@ -235,7 +235,7 @@
   objects of the same type.
 
 In general, types that are allocated using any of these should be
-marked `//go:notinheap` (see below).
+marked as not in heap by embedding `runtime/internal/sys.NotInHeap`.
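+
+For example, a type kept in unmanaged memory might be declared as follows
+(an illustrative sketch, not an actual runtime type):
+
+	type spanQueue struct {
+		_     sys.NotInHeap
+		first *mspan
+	}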
 
 Objects that are allocated in unmanaged memory **must not** contain
 heap pointers unless the following rules are also obeyed:
@@ -330,37 +330,3 @@
 The conversion from pointer to uintptr must appear in the argument list of any
 call to this function. This directive is used for some low-level system call
 implementations.
-
-go:notinheap
-------------
-
-`go:notinheap` applies to type declarations. It indicates that a type
-must never be allocated from the GC'd heap or on the stack.
-Specifically, pointers to this type must always fail the
-`runtime.inheap` check. The type may be used for global variables, or
-for objects in unmanaged memory (e.g., allocated with `sysAlloc`,
-`persistentalloc`, `fixalloc`, or from a manually-managed span).
-Specifically:
-
-1. `new(T)`, `make([]T)`, `append([]T, ...)` and implicit heap
-   allocation of T are disallowed. (Though implicit allocations are
-   disallowed in the runtime anyway.)
-
-2. A pointer to a regular type (other than `unsafe.Pointer`) cannot be
-   converted to a pointer to a `go:notinheap` type, even if they have
-   the same underlying type.
-
-3. Any type that contains a `go:notinheap` type is itself
-   `go:notinheap`. Structs and arrays are `go:notinheap` if their
-   elements are. Maps and channels of `go:notinheap` types are
-   disallowed. To keep things explicit, any type declaration where the
-   type is implicitly `go:notinheap` must be explicitly marked
-   `go:notinheap` as well.
-
-4. Write barriers on pointers to `go:notinheap` types can be omitted.
-
-The last point is the real benefit of `go:notinheap`. The runtime uses
-it for low-level internal structures to avoid memory barriers in the
-scheduler and the memory allocator where they are illegal or simply
-inefficient. This mechanism is reasonably safe and does not compromise
-the readability of the runtime.
diff --git a/src/runtime/align_runtime_test.go b/src/runtime/align_runtime_test.go
index ec7956d..d78b0b2 100644
--- a/src/runtime/align_runtime_test.go
+++ b/src/runtime/align_runtime_test.go
@@ -14,24 +14,7 @@
 // operations (all the *64 operations in runtime/internal/atomic).
 var AtomicFields = []uintptr{
 	unsafe.Offsetof(m{}.procid),
-	unsafe.Offsetof(p{}.timer0When),
-	unsafe.Offsetof(p{}.timerModifiedEarliest),
 	unsafe.Offsetof(p{}.gcFractionalMarkTime),
-	unsafe.Offsetof(schedt{}.goidgen),
-	unsafe.Offsetof(schedt{}.lastpoll),
-	unsafe.Offsetof(schedt{}.pollUntil),
-	unsafe.Offsetof(schedt{}.timeToRun),
-	unsafe.Offsetof(gcControllerState{}.bgScanCredit),
-	unsafe.Offsetof(gcControllerState{}.maxStackScan),
-	unsafe.Offsetof(gcControllerState{}.heapLive),
-	unsafe.Offsetof(gcControllerState{}.heapScan),
-	unsafe.Offsetof(gcControllerState{}.dedicatedMarkTime),
-	unsafe.Offsetof(gcControllerState{}.dedicatedMarkWorkersNeeded),
-	unsafe.Offsetof(gcControllerState{}.fractionalMarkTime),
-	unsafe.Offsetof(gcControllerState{}.idleMarkTime),
-	unsafe.Offsetof(gcControllerState{}.globalsScan),
-	unsafe.Offsetof(gcControllerState{}.lastStackScan),
-	unsafe.Offsetof(timeHistogram{}.underflow),
 	unsafe.Offsetof(profBuf{}.overflow),
 	unsafe.Offsetof(profBuf{}.overflowTime),
 	unsafe.Offsetof(heapStatsDelta{}.tinyAllocCount),
@@ -50,10 +33,7 @@
 	unsafe.Offsetof(lfnode{}.next),
 	unsafe.Offsetof(mstats{}.last_gc_nanotime),
 	unsafe.Offsetof(mstats{}.last_gc_unix),
-	unsafe.Offsetof(mstats{}.gcPauseDist),
-	unsafe.Offsetof(ticksType{}.val),
 	unsafe.Offsetof(workType{}.bytesMarked),
-	unsafe.Offsetof(timeHistogram{}.counts),
 }
 
 // AtomicVariables is the set of global variables on which we perform
diff --git a/src/runtime/align_test.go b/src/runtime/align_test.go
index 55cf783..5f225d6 100644
--- a/src/runtime/align_test.go
+++ b/src/runtime/align_test.go
@@ -5,7 +5,6 @@
 package runtime_test
 
 import (
-	"bytes"
 	"go/ast"
 	"go/build"
 	"go/importer"
@@ -13,6 +12,7 @@
 	"go/printer"
 	"go/token"
 	"go/types"
+	"internal/testenv"
 	"os"
 	"regexp"
 	"runtime"
@@ -23,6 +23,8 @@
 // Check that 64-bit fields on which we apply atomic operations
 // are aligned to 8 bytes. This can be a problem on 32-bit systems.
 func TestAtomicAlignment(t *testing.T) {
+	testenv.MustHaveGoBuild(t) // go command needed to resolve std .a files for importer.Default().
+
 	// Read the code making the tables above, to see which fields and
 	// variables we are currently checking.
 	checked := map[string]bool{}
@@ -180,7 +182,7 @@
 }
 
 func (v *Visitor) print(n ast.Node) string {
-	var b bytes.Buffer
+	var b strings.Builder
 	printer.Fprint(&b, v.fset, n)
 	return b.String()
 }
diff --git a/src/runtime/arena.go b/src/runtime/arena.go
new file mode 100644
index 0000000..c338d30
--- /dev/null
+++ b/src/runtime/arena.go
@@ -0,0 +1,1003 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implementation of (safe) user arenas.
+//
+// This file contains the implementation of user arenas wherein Go values can
+// be manually allocated and freed in bulk. The act of manually freeing memory,
+// potentially before a GC cycle, means that a garbage collection cycle can be
+// delayed, improving efficiency by reducing GC cycle frequency. There are other
+// potential efficiency benefits, such as improved locality and access to a more
+// efficient allocation strategy.
+//
+// What makes the arenas here safe is that once they are freed, accessing the
+// arena's memory will cause an explicit program fault, and the arena's address
+// space will not be reused until no more pointers into it are found. There's one
+// exception to this: if an arena's chunk still has unused memory when the arena is
+// freed, the chunk is placed back into a pool for reuse, so a crash is not always
+// guaranteed.
+//
+// While this may seem unsafe, it still prevents memory corruption, and is in fact
+// necessary in order to make new(T) a valid implementation of arenas. Such a property
+// is desirable to allow for a trivial implementation. (It also avoids complexities
+// that arise from synchronization with the GC when trying to set the arena chunks to
+// fault while the GC is active.)
+//
+// The implementation works in layers. At the bottom, arenas are managed in chunks.
+// Each chunk must be a multiple of the heap arena size, or the heap arena size must
+// be a multiple of the arena chunk size. The address space for each chunk, and each
+// corresponding heapArena for that address space, are eternally reserved for use as
+// arena chunks. That is, they can never be used for the general heap. Each chunk
+// is also represented by a single mspan, and is modeled as a single large heap
+// allocation. It must be, because each chunk contains ordinary Go values that may
+// point into the heap, so it must be scanned just like any other object. Any
+// pointer into a chunk will therefore always cause the whole chunk to be scanned
+// while its corresponding arena is still live.
+//
+// Chunks may be allocated either from new memory mapped by the OS on our behalf,
+// or by reusing old freed chunks. When chunks are freed, their underlying memory
+// is returned to the OS, set to fault on access, and may not be reused until the
+// program doesn't point into the chunk anymore (the code refers to this state as
+// "quarantined"), a property checked by the GC.
+//
+// The sweeper handles moving chunks out of this quarantine state to be ready for
+// reuse. When the chunk is placed into the quarantine state, its corresponding
+// span is marked as noscan so that the GC doesn't try to scan memory that would
+// cause a fault.
+//
+// At the next layer are the user arenas themselves. They consist of a single
+// active chunk which new Go values are bump-allocated into and a list of chunks
+// that were exhausted when allocating into the arena. Once the arena is freed,
+// it frees all full chunks it references, and places the active one onto a reuse
+// list for a future arena to use. Each arena keeps its list of referenced chunks
+// explicitly live until it is freed. Each user arena also maps to an object which
+// has a finalizer attached that ensures the arena's chunks are all freed even if
+// the arena itself is never explicitly freed.
+//
+// Pointer-ful memory is bump-allocated from low addresses to high addresses in each
+// chunk, while pointer-free memory is bump-allocated from high addresses to low
+// addresses. The reason for this is to take advantage of a GC optimization wherein
+// the GC will stop scanning an object when there are no more pointers in it, which
+// also allows us to elide clearing the heap bitmap for pointer-free Go values
+// allocated into arenas.
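+//
+// Schematically, a chunk's layout looks like this (illustrative, not to scale):
+//
+//	| pointer-ful data -> ... free space ... <- pointer-free data |
+//	  low addresses                                high addresses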
+//
+// Note that arenas are not safe to use concurrently.
+//
+// In summary, there are 2 resources: arenas, and arena chunks. They exist in the
+// following lifecycle:
+//
+// (1) A new arena is created via newArena.
+// (2) Chunks are allocated to hold memory allocated into the arena with new or slice.
+//    (a) Chunks are first allocated from the reuse list of partially-used chunks.
+//    (b) If there are no such chunks, then chunks on the ready list are taken.
+//    (c) Failing all the above, memory for a new chunk is mapped.
+// (3) The arena is freed, or all references to it are dropped, triggering its finalizer.
+//    (a) If the GC is not active, exhausted chunks are set to fault and placed on a
+//        quarantine list.
+//    (b) If the GC is active, exhausted chunks are placed on a fault list and will
+//        go through step (a) at a later point in time.
+//    (c) Any remaining partially-used chunk is placed on a reuse list.
+// (4) Once no more pointers are found into quarantined arena chunks, the sweeper
+//     takes these chunks out of quarantine and places them on the ready list.
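+//
+// A rough sketch of how the pieces below are driven (illustrative only):
+//
+//	a := newUserArena()   // (1) create; a finalizer backstops explicit free
+//	x := a.new(typ)       // (2) allocate; chunks come from reuse/ready lists or new mappings
+//	a.slice(&sl, n)       // (2) slice backing stores work the same way
+//	a.free()              // (3) chunks are quarantined, faulted, or put up for reuse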
+
+package runtime
+
+import (
+	"internal/goarch"
+	"runtime/internal/atomic"
+	"runtime/internal/math"
+	"unsafe"
+)
+
+// Functions starting with arena_ are meant to be exported to downstream users
+// of arenas. They should wrap these functions in a higher-level API.
+//
+// The underlying arena and its resources are managed through an opaque unsafe.Pointer.
+
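+// A minimal sketch of the downstream side (assuming a package named "arena";
+// the exact exported API is up to that package, this is illustrative only):
+//
+//	//go:linkname runtime_arena_newArena
+//	func runtime_arena_newArena() unsafe.Pointer
+//
+//	func NewArena() *Arena { return &Arena{a: runtime_arena_newArena()} }
+//	func (a *Arena) Free() { runtime_arena_arena_Free(a.a); a.a = nil }
+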
+// arena_newArena is a wrapper around newUserArena.
+//
+//go:linkname arena_newArena arena.runtime_arena_newArena
+func arena_newArena() unsafe.Pointer {
+	return unsafe.Pointer(newUserArena())
+}
+
+// arena_arena_New is a wrapper around (*userArena).new, except that typ
+// is an any (must be a *_type, still) and typ must be a type descriptor
+// for a pointer to the type to actually be allocated, i.e. pass a *T
+// to allocate a T. This is necessary because this function returns a *T.
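+//
+// For example (illustrative): to allocate an int in the arena, pass an any
+// holding (*int)(nil); the returned any holds a *int pointing into the arena.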
+//
+//go:linkname arena_arena_New arena.runtime_arena_arena_New
+func arena_arena_New(arena unsafe.Pointer, typ any) any {
+	t := (*_type)(efaceOf(&typ).data)
+	if t.kind&kindMask != kindPtr {
+		throw("arena_New: non-pointer type")
+	}
+	te := (*ptrtype)(unsafe.Pointer(t)).elem
+	x := ((*userArena)(arena)).new(te)
+	var result any
+	e := efaceOf(&result)
+	e._type = t
+	e.data = x
+	return result
+}
+
+// arena_arena_Slice is a wrapper around (*userArena).slice.
+//
+//go:linkname arena_arena_Slice arena.runtime_arena_arena_Slice
+func arena_arena_Slice(arena unsafe.Pointer, slice any, cap int) {
+	((*userArena)(arena)).slice(slice, cap)
+}
+
+// arena_arena_Free is a wrapper around (*userArena).free.
+//
+//go:linkname arena_arena_Free arena.runtime_arena_arena_Free
+func arena_arena_Free(arena unsafe.Pointer) {
+	((*userArena)(arena)).free()
+}
+
+// arena_heapify takes a value that lives in an arena and makes a copy
+// of it on the heap. Values that don't live in an arena are returned unmodified.
+//
+//go:linkname arena_heapify arena.runtime_arena_heapify
+func arena_heapify(s any) any {
+	var v unsafe.Pointer
+	e := efaceOf(&s)
+	t := e._type
+	switch t.kind & kindMask {
+	case kindString:
+		v = stringStructOf((*string)(e.data)).str
+	case kindSlice:
+		v = (*slice)(e.data).array
+	case kindPtr:
+		v = e.data
+	default:
+		panic("arena: Clone only supports pointers, slices, and strings")
+	}
+	span := spanOf(uintptr(v))
+	if span == nil || !span.isUserArenaChunk {
+		// Not stored in a user arena chunk.
+		return s
+	}
+	// Heap-allocate storage for a copy.
+	var x any
+	switch t.kind & kindMask {
+	case kindString:
+		s1 := s.(string)
+		s2, b := rawstring(len(s1))
+		copy(b, s1)
+		x = s2
+	case kindSlice:
+		len := (*slice)(e.data).len
+		et := (*slicetype)(unsafe.Pointer(t)).elem
+		sl := new(slice)
+		*sl = slice{makeslicecopy(et, len, len, (*slice)(e.data).array), len, len}
+		xe := efaceOf(&x)
+		xe._type = t
+		xe.data = unsafe.Pointer(sl)
+	case kindPtr:
+		et := (*ptrtype)(unsafe.Pointer(t)).elem
+		e2 := newobject(et)
+		typedmemmove(et, e2, e.data)
+		xe := efaceOf(&x)
+		xe._type = t
+		xe.data = e2
+	}
+	return x
+}
+
+const (
+	// userArenaChunkBytes is the size of a user arena chunk.
+	userArenaChunkBytesMax = 8 << 20
+	userArenaChunkBytes    = uintptr(int64(userArenaChunkBytesMax-heapArenaBytes)&(int64(userArenaChunkBytesMax-heapArenaBytes)>>63) + heapArenaBytes) // min(userArenaChunkBytesMax, heapArenaBytes)
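+	// The userArenaChunkBytes expression above is a branch-free min: if
+	// userArenaChunkBytesMax-heapArenaBytes is negative, shifting it right by 63 yields
+	// an all-ones mask, the AND keeps the (negative) difference, and adding
+	// heapArenaBytes gives userArenaChunkBytesMax; otherwise the mask is zero and the
+	// result is heapArenaBytes.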
+
+	// userArenaChunkPages is the number of pages a user arena chunk uses.
+	userArenaChunkPages = userArenaChunkBytes / pageSize
+
+	// userArenaChunkMaxAllocBytes is the maximum size of an object that can
+	// be allocated from an arena. This number is chosen to cap worst-case
+	// fragmentation of user arenas to 25%. Larger allocations are redirected
+	// to the heap.
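+	//
+	// As a worked example (assuming heapArenaBytes is 64 MiB and pageSize is 8 KiB, as
+	// on typical 64-bit Linux systems): userArenaChunkBytes is min(8 MiB, 64 MiB) = 8 MiB,
+	// userArenaChunkPages is 1024, and userArenaChunkMaxAllocBytes is 2 MiB, so a retired
+	// chunk wastes at most a quarter of its space.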
+	userArenaChunkMaxAllocBytes = userArenaChunkBytes / 4
+)
+
+func init() {
+	if userArenaChunkPages*pageSize != userArenaChunkBytes {
+		throw("user arena chunk size is not a mutliple of the page size")
+	}
+	if userArenaChunkBytes%physPageSize != 0 {
+		throw("user arena chunk size is not a mutliple of the physical page size")
+	}
+	if userArenaChunkBytes < heapArenaBytes {
+		if heapArenaBytes%userArenaChunkBytes != 0 {
+			throw("user arena chunk size is smaller than a heap arena, but doesn't divide it")
+		}
+	} else {
+		if userArenaChunkBytes%heapArenaBytes != 0 {
+			throw("user arena chunks size is larger than a heap arena, but not a multiple")
+		}
+	}
+	lockInit(&userArenaState.lock, lockRankUserArenaState)
+}
+
+type userArena struct {
+	// fullList is a list of full chunks that no longer have enough free memory
+	// left, and that we'll free once this user arena is freed.
+	//
+	// Can't use mSpanList here because it's not-in-heap.
+	fullList *mspan
+
+	// active is the user arena chunk we're currently allocating into.
+	active *mspan
+
+	// refs is a set of references to the arena chunks so that they're kept alive.
+	//
+	// The last reference in the list always refers to active, while the rest of
+	// them correspond to fullList. Specifically, the head of fullList is the
+	// second-to-last one, fullList.next is the third-to-last, and so on.
+	//
+	// In other words, every time a new chunk becomes active, it's appended to this
+	// list.
+	refs []unsafe.Pointer
+
+	// defunct is true if free has been called on this arena.
+	//
+	// This is just a best-effort way to discover a concurrent allocation
+	// and free. Also used to detect a double-free.
+	defunct atomic.Bool
+}
+
+// newUserArena creates a new userArena ready to be used.
+func newUserArena() *userArena {
+	a := new(userArena)
+	SetFinalizer(a, func(a *userArena) {
+		// If the arena handle is dropped without being freed, then call
+		// free on the arena. The garbage collector never reclaims arena
+		// chunks on its own, so this keeps them from being leaked.
+		a.free()
+	})
+	a.refill()
+	return a
+}
+
+// new allocates a new object of the provided type into the arena, and returns
+// its pointer.
+//
+// This operation is not safe to call concurrently with other operations on the
+// same arena.
+func (a *userArena) new(typ *_type) unsafe.Pointer {
+	return a.alloc(typ, -1)
+}
+
+// slice allocates a new slice backing store. slice must be a pointer to a slice
+// (i.e. *[]T), because slice updates the slice header in place.
+//
+// cap determines the capacity of the slice backing store and must be non-negative.
+//
+// This operation is not safe to call concurrently with other operations on the
+// same arena.
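+//
+// For example (illustrative):
+//
+//	var buf []byte
+//	a.slice(&buf, 128) // len(buf) == cap(buf) == 128, backed by arena memory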
+func (a *userArena) slice(sl any, cap int) {
+	if cap < 0 {
+		panic("userArena.slice: negative cap")
+	}
+	i := efaceOf(&sl)
+	typ := i._type
+	if typ.kind&kindMask != kindPtr {
+		panic("slice result of non-ptr type")
+	}
+	typ = (*ptrtype)(unsafe.Pointer(typ)).elem
+	if typ.kind&kindMask != kindSlice {
+		panic("slice of non-ptr-to-slice type")
+	}
+	typ = (*slicetype)(unsafe.Pointer(typ)).elem
+	// t is now the element type of the slice we want to allocate.
+
+	*((*slice)(i.data)) = slice{a.alloc(typ, cap), cap, cap}
+}
+
+// free returns the userArena's chunks back to mheap and marks it as defunct.
+//
+// Must be called at most once for any given arena.
+//
+// This operation is not safe to call concurrently with other operations on the
+// same arena.
+func (a *userArena) free() {
+	// Check for a double-free.
+	if a.defunct.Load() {
+		panic("arena double free")
+	}
+
+	// Mark ourselves as defunct.
+	a.defunct.Store(true)
+	SetFinalizer(a, nil)
+
+	// Free all the full arenas.
+	//
+	// The refs on this list are in reverse order from the second-to-last.
+	s := a.fullList
+	i := len(a.refs) - 2
+	for s != nil {
+		a.fullList = s.next
+		s.next = nil
+		freeUserArenaChunk(s, a.refs[i])
+		s = a.fullList
+		i--
+	}
+	if a.fullList != nil || i >= 0 {
+		// There's still something left on the full list, or we
+		// failed to actually iterate over the entire refs list.
+		throw("full list doesn't match refs list in length")
+	}
+
+	// Put the active chunk onto the reuse list.
+	//
+	// Note that active's reference is always the last reference in refs.
+	s = a.active
+	if s != nil {
+		if raceenabled || msanenabled || asanenabled {
+			// Don't reuse arenas with sanitizers enabled. We want to catch
+			// any use-after-free errors aggressively.
+			freeUserArenaChunk(s, a.refs[len(a.refs)-1])
+		} else {
+			lock(&userArenaState.lock)
+			userArenaState.reuse = append(userArenaState.reuse, liveUserArenaChunk{s, a.refs[len(a.refs)-1]})
+			unlock(&userArenaState.lock)
+		}
+	}
+	// nil out a.active so that a race with freeing will more likely cause a crash.
+	a.active = nil
+	a.refs = nil
+}
+
+// alloc reserves space in the current chunk or calls refill and reserves space
+// in a new chunk. If cap is negative, the type will be taken literally, otherwise
+// it will be considered as an element type for a slice backing store with capacity
+// cap.
+func (a *userArena) alloc(typ *_type, cap int) unsafe.Pointer {
+	s := a.active
+	var x unsafe.Pointer
+	for {
+		x = s.userArenaNextFree(typ, cap)
+		if x != nil {
+			break
+		}
+		s = a.refill()
+	}
+	return x
+}
+
+// refill inserts the current arena chunk onto the full list and obtains a new
+// one, either from the partial-reuse list or by allocating a new one from mheap.
+func (a *userArena) refill() *mspan {
+	// If there's an active chunk, assume it's full.
+	s := a.active
+	if s != nil {
+		if s.userArenaChunkFree.size() > userArenaChunkMaxAllocBytes {
+			// It's difficult to tell when we're actually out of memory
+			// in a chunk because the allocation that failed may still leave
+			// some free space available. However, that amount of free space
+			// should never exceed the maximum allocation size.
+			throw("wasted too much memory in an arena chunk")
+		}
+		s.next = a.fullList
+		a.fullList = s
+		a.active = nil
+		s = nil
+	}
+	var x unsafe.Pointer
+
+	// Check the partially-used list.
+	lock(&userArenaState.lock)
+	if len(userArenaState.reuse) > 0 {
+		// Pick off the last arena chunk from the list.
+		n := len(userArenaState.reuse) - 1
+		x = userArenaState.reuse[n].x
+		s = userArenaState.reuse[n].mspan
+		userArenaState.reuse[n].x = nil
+		userArenaState.reuse[n].mspan = nil
+		userArenaState.reuse = userArenaState.reuse[:n]
+	}
+	unlock(&userArenaState.lock)
+	if s == nil {
+		// Allocate a new one.
+		x, s = newUserArenaChunk()
+		if s == nil {
+			throw("out of memory")
+		}
+	}
+	a.refs = append(a.refs, x)
+	a.active = s
+	return s
+}
+
+type liveUserArenaChunk struct {
+	*mspan // Must represent a user arena chunk.
+
+	// Reference to mspan.base() to keep the chunk alive.
+	x unsafe.Pointer
+}
+
+var userArenaState struct {
+	lock mutex
+
+	// reuse contains a list of partially-used and already-live
+	// user arena chunks that can be quickly reused for another
+	// arena.
+	//
+	// Protected by lock.
+	reuse []liveUserArenaChunk
+
+	// fault contains full user arena chunks that need to be faulted.
+	//
+	// Protected by lock.
+	fault []liveUserArenaChunk
+}
+
+// userArenaNextFree reserves space in the user arena for an item of the specified
+// type. If cap is not -1, this is for an array of cap elements of type t.
+func (s *mspan) userArenaNextFree(typ *_type, cap int) unsafe.Pointer {
+	size := typ.size
+	if cap > 0 {
+		if size > ^uintptr(0)/uintptr(cap) {
+			// Overflow.
+			throw("out of memory")
+		}
+		size *= uintptr(cap)
+	}
+	if size == 0 || cap == 0 {
+		return unsafe.Pointer(&zerobase)
+	}
+	if size > userArenaChunkMaxAllocBytes {
+		// Redirect allocations that don't fit well into a chunk directly
+		// to the heap.
+		if cap >= 0 {
+			return newarray(typ, cap)
+		}
+		return newobject(typ)
+	}
+
+	// Prevent preemption as we set up the space for a new object.
+	//
+	// Act like we're allocating.
+	mp := acquirem()
+	if mp.mallocing != 0 {
+		throw("malloc deadlock")
+	}
+	if mp.gsignal == getg() {
+		throw("malloc during signal")
+	}
+	mp.mallocing = 1
+
+	var ptr unsafe.Pointer
+	if typ.ptrdata == 0 {
+		// Allocate pointer-less objects from the tail end of the chunk.
+		v, ok := s.userArenaChunkFree.takeFromBack(size, typ.align)
+		if ok {
+			ptr = unsafe.Pointer(v)
+		}
+	} else {
+		v, ok := s.userArenaChunkFree.takeFromFront(size, typ.align)
+		if ok {
+			ptr = unsafe.Pointer(v)
+		}
+	}
+	if ptr == nil {
+		// Failed to allocate.
+		mp.mallocing = 0
+		releasem(mp)
+		return nil
+	}
+	if s.needzero != 0 {
+		throw("arena chunk needs zeroing, but should already be zeroed")
+	}
+	// Set up heap bitmap and do extra accounting.
+	if typ.ptrdata != 0 {
+		if cap >= 0 {
+			userArenaHeapBitsSetSliceType(typ, cap, ptr, s.base())
+		} else {
+			userArenaHeapBitsSetType(typ, ptr, s.base())
+		}
+		c := getMCache(mp)
+		if c == nil {
+			throw("mallocgc called without a P or outside bootstrapping")
+		}
+		if cap > 0 {
+			c.scanAlloc += size - (typ.size - typ.ptrdata)
+		} else {
+			c.scanAlloc += typ.ptrdata
+		}
+	}
+
+	// Ensure that the stores above that initialize x to
+	// type-safe memory and set the heap bits occur before
+	// the caller can make ptr observable to the garbage
+	// collector. Otherwise, on weakly ordered machines,
+	// the garbage collector could follow a pointer to x,
+	// but see uninitialized memory or stale heap bits.
+	publicationBarrier()
+
+	mp.mallocing = 0
+	releasem(mp)
+
+	return ptr
+}
+
+// userArenaHeapBitsSetType is the equivalent of heapBitsSetType but for
+// non-slice-backing-store Go values allocated in a user arena chunk. It
+// sets up the heap bitmap for the value with type typ allocated at address ptr.
+// base is the base address of the arena chunk.
+func userArenaHeapBitsSetType(typ *_type, ptr unsafe.Pointer, base uintptr) {
+	h := writeHeapBitsForAddr(uintptr(ptr))
+
+	// Our last allocation might have ended right at a noMorePtrs mark,
+	// which we would not have erased. We need to erase that mark here,
+	// because we're going to start adding new heap bitmap bits.
+	// We only need to clear one mark, because below we make sure to
+	// pad out the bits with zeroes and only write one noMorePtrs bit
+	// for each new object.
+	// (This is only necessary at noMorePtrs boundaries, as noMorePtrs
+	// marks within an object allocated with newAt will be erased by
+	// the normal writeHeapBitsForAddr mechanism.)
+	//
+	// Note that we skip this if this is the first allocation in the
+	// arena because there's definitely no previous noMorePtrs mark
+	// (in fact, we *must* skip it, because backing up a pointer here would
+	// step before the start of the chunk).
+	if uintptr(ptr)%(8*goarch.PtrSize*goarch.PtrSize) == 0 && uintptr(ptr) != base {
+		// Back up one pointer and rewrite that pointer. That will
+		// cause the writeHeapBits implementation to clear the
+		// noMorePtrs bit we need to clear.
+		r := heapBitsForAddr(uintptr(ptr)-goarch.PtrSize, goarch.PtrSize)
+		_, p := r.next()
+		b := uintptr(0)
+		if p == uintptr(ptr)-goarch.PtrSize {
+			b = 1
+		}
+		h = writeHeapBitsForAddr(uintptr(ptr) - goarch.PtrSize)
+		h = h.write(b, 1)
+	}
+
+	p := typ.gcdata // start of 1-bit pointer mask (or GC program)
+	var gcProgBits uintptr
+	if typ.kind&kindGCProg != 0 {
+		// Expand gc program, using the object itself for storage.
+		gcProgBits = runGCProg(addb(p, 4), (*byte)(ptr))
+		p = (*byte)(ptr)
+	}
+	nb := typ.ptrdata / goarch.PtrSize
+
+	for i := uintptr(0); i < nb; i += ptrBits {
+		k := nb - i
+		if k > ptrBits {
+			k = ptrBits
+		}
+		h = h.write(readUintptr(addb(p, i/8)), k)
+	}
+	// Note: we call pad here to ensure we emit explicit 0 bits
+	// for the pointerless tail of the object. This ensures that
+	// there's only a single noMorePtrs mark for the next object
+	// to clear. We don't need to do this to clear stale noMorePtrs
+	// markers from previous uses because arena chunk pointer bitmaps
+	// are always fully cleared when reused.
+	h = h.pad(typ.size - typ.ptrdata)
+	h.flush(uintptr(ptr), typ.size)
+
+	if typ.kind&kindGCProg != 0 {
+		// Zero out temporary ptrmask buffer inside object.
+		memclrNoHeapPointers(ptr, (gcProgBits+7)/8)
+	}
+
+	// Double-check that the bitmap was written out correctly.
+	//
+	// Derived from heapBitsSetType.
+	const doubleCheck = false
+	if doubleCheck {
+		size := typ.size
+		x := uintptr(ptr)
+		h := heapBitsForAddr(x, size)
+		for i := uintptr(0); i < size; i += goarch.PtrSize {
+			// Compute the pointer bit we want at offset i.
+			want := false
+			off := i % typ.size
+			if off < typ.ptrdata {
+				j := off / goarch.PtrSize
+				want = *addb(typ.gcdata, j/8)>>(j%8)&1 != 0
+			}
+			if want {
+				var addr uintptr
+				h, addr = h.next()
+				if addr != x+i {
+					throw("userArenaHeapBitsSetType: pointer entry not correct")
+				}
+			}
+		}
+		if _, addr := h.next(); addr != 0 {
+			throw("userArenaHeapBitsSetType: extra pointer")
+		}
+	}
+}
+
+// userArenaHeapBitsSetSliceType is the equivalent of heapBitsSetType but for
+// Go slice backing store values allocated in a user arena chunk. It sets up the
+// heap bitmap for n consecutive values with type typ allocated at address ptr.
+func userArenaHeapBitsSetSliceType(typ *_type, n int, ptr unsafe.Pointer, base uintptr) {
+	mem, overflow := math.MulUintptr(typ.size, uintptr(n))
+	if overflow || n < 0 || mem > maxAlloc {
+		panic(plainError("runtime: allocation size out of range"))
+	}
+	for i := 0; i < n; i++ {
+		userArenaHeapBitsSetType(typ, add(ptr, uintptr(i)*typ.size), base)
+	}
+}
+
+// newUserArenaChunk allocates a user arena chunk, which maps to a single
+// heap arena and single span. Returns a pointer to the base of the chunk
+// (this is really important: we need to keep the chunk alive) and the span.
+func newUserArenaChunk() (unsafe.Pointer, *mspan) {
+	if gcphase == _GCmarktermination {
+		throw("newUserArenaChunk called with gcphase == _GCmarktermination")
+	}
+
+	// Deduct assist credit. Because user arena chunks are modeled as one
+	// giant heap object which counts toward heapLive, we're obligated to
+	// assist the GC proportionally (and it's worth noting that the arena
+	// does represent additional work for the GC, but we also have no idea
+	// what that looks like until we actually allocate things into the
+	// arena).
+	deductAssistCredit(userArenaChunkBytes)
+
+	// Set mp.mallocing to keep from being preempted by GC.
+	mp := acquirem()
+	if mp.mallocing != 0 {
+		throw("malloc deadlock")
+	}
+	if mp.gsignal == getg() {
+		throw("malloc during signal")
+	}
+	mp.mallocing = 1
+
+	// Allocate a new user arena.
+	var span *mspan
+	systemstack(func() {
+		span = mheap_.allocUserArenaChunk()
+	})
+	if span == nil {
+		throw("out of memory")
+	}
+	x := unsafe.Pointer(span.base())
+
+	// Allocate black during GC.
+	// All slots hold nil so no scanning is needed.
+	// This may be racing with GC so do it atomically if there can be
+	// a race marking the bit.
+	if gcphase != _GCoff {
+		gcmarknewobject(span, span.base(), span.elemsize)
+	}
+
+	if raceenabled {
+		// TODO(mknyszek): Track individual objects.
+		racemalloc(unsafe.Pointer(span.base()), span.elemsize)
+	}
+
+	if msanenabled {
+		// TODO(mknyszek): Track individual objects.
+		msanmalloc(unsafe.Pointer(span.base()), span.elemsize)
+	}
+
+	if asanenabled {
+		// TODO(mknyszek): Track individual objects.
+		rzSize := computeRZlog(span.elemsize)
+		span.elemsize -= rzSize
+		span.limit -= rzSize
+		span.userArenaChunkFree = makeAddrRange(span.base(), span.limit)
+		asanpoison(unsafe.Pointer(span.limit), span.npages*pageSize-span.elemsize)
+		asanunpoison(unsafe.Pointer(span.base()), span.elemsize)
+	}
+
+	if rate := MemProfileRate; rate > 0 {
+		c := getMCache(mp)
+		if c == nil {
+			throw("newUserArenaChunk called without a P or outside bootstrapping")
+		}
+		// Note cache c only valid while m acquired; see #47302
+		if rate != 1 && userArenaChunkBytes < c.nextSample {
+			c.nextSample -= userArenaChunkBytes
+		} else {
+			profilealloc(mp, unsafe.Pointer(span.base()), userArenaChunkBytes)
+		}
+	}
+	mp.mallocing = 0
+	releasem(mp)
+
+	// Again, because this chunk counts toward heapLive, potentially trigger a GC.
+	if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+		gcStart(t)
+	}
+
+	if debug.malloc {
+		if debug.allocfreetrace != 0 {
+			tracealloc(unsafe.Pointer(span.base()), userArenaChunkBytes, nil)
+		}
+
+		if inittrace.active && inittrace.id == getg().goid {
+			// Init functions are executed sequentially in a single goroutine.
+			inittrace.bytes += uint64(userArenaChunkBytes)
+		}
+	}
+
+	// Double-check it's aligned to the physical page size. Based on the current
+	// implementation this is trivially true, but it need not be in the future.
+	// However, if it's not aligned to the physical page size then we can't properly
+	// set it to fault later.
+	if uintptr(x)%physPageSize != 0 {
+		throw("user arena chunk is not aligned to the physical page size")
+	}
+
+	return x, span
+}
+
+// isUnusedUserArenaChunk indicates that the arena chunk has been set to fault
+// and doesn't contain any scannable memory anymore. However, it might still be
+// mSpanInUse as it sits on the quarantine list, since it needs to be swept.
+//
+// This is not safe to execute unless the caller has ownership of the mspan or
+// the world is stopped (preemption is prevented while the relevant state changes).
+//
+// This is really only meant to be used by accounting tests in the runtime to
+// distinguish when a span shouldn't be counted (since mSpanInUse might not be
+// enough).
+func (s *mspan) isUnusedUserArenaChunk() bool {
+	return s.isUserArenaChunk && s.spanclass == makeSpanClass(0, true)
+}
+
+// setUserArenaChunkToFault sets the address space for the user arena chunk to fault
+// and releases any underlying memory resources.
+//
+// Must be in a non-preemptible state to ensure the consistency of statistics
+// exported to MemStats.
+func (s *mspan) setUserArenaChunkToFault() {
+	if !s.isUserArenaChunk {
+		throw("invalid span in heapArena for user arena")
+	}
+	if s.npages*pageSize != userArenaChunkBytes {
+		throw("span on userArena.faultList has invalid size")
+	}
+
+	// Update the span class to be noscan. What we want to happen is that
+	// any pointer into the span keeps it from getting recycled, so we want
+	// the mark bit to get set, but we're about to set the address space to fault,
+	// so we have to prevent the GC from scanning this memory.
+	//
+	// It's OK to set it here because (1) a GC isn't in progress, so the scanning code
+	// won't make a bad decision, (2) we're currently non-preemptible and in the runtime,
+	// so a GC is blocked from starting. We might race with sweeping, which could
+	// put it on the "wrong" sweep list, but we really don't care because the chunk is
+	// treated as a large object span and there's no meaningful difference between scan
+	// and noscan large objects in the sweeper. The STW at the start of the GC acts as a
+	// barrier for this update.
+	s.spanclass = makeSpanClass(0, true)
+
+	// Actually set the arena chunk to fault, so we'll get dangling pointer errors.
+	// sysFault currently uses a method on each OS that forces it to evacuate all
+	// memory backing the chunk.
+	sysFault(unsafe.Pointer(s.base()), s.npages*pageSize)
+
+	// Everything on the list is counted as in-use, however sysFault transitions to
+	// Reserved, not Prepared, so we skip updating heapFree or heapReleased and just
+	// remove the memory from the total altogether; it's just address space now.
+	gcController.heapInUse.add(-int64(s.npages * pageSize))
+
+	// Count this as a free of an object right now as opposed to when
+	// the span gets off the quarantine list. The main reason is so that the
+	// amount of bytes allocated doesn't exceed how much is counted as
+	// "mapped ready," which could cause a deadlock in the pacer.
+	gcController.totalFree.Add(int64(s.npages * pageSize))
+
+	// Update consistent stats to match.
+	//
+	// We're non-preemptible, so it's safe to update consistent stats (our P
+	// won't change out from under us).
+	stats := memstats.heapStats.acquire()
+	atomic.Xaddint64(&stats.committed, -int64(s.npages*pageSize))
+	atomic.Xaddint64(&stats.inHeap, -int64(s.npages*pageSize))
+	atomic.Xadd64(&stats.largeFreeCount, 1)
+	atomic.Xadd64(&stats.largeFree, int64(s.npages*pageSize))
+	memstats.heapStats.release()
+
+	// This counts as a free, so update heapLive.
+	gcController.update(-int64(s.npages*pageSize), 0)
+
+	// Mark it as free for the race detector.
+	if raceenabled {
+		racefree(unsafe.Pointer(s.base()), s.elemsize)
+	}
+
+	systemstack(func() {
+		// Add the user arena to the quarantine list.
+		lock(&mheap_.lock)
+		mheap_.userArena.quarantineList.insert(s)
+		unlock(&mheap_.lock)
+	})
+}
+
+// inUserArenaChunk returns true if p points to a user arena chunk.
+func inUserArenaChunk(p uintptr) bool {
+	s := spanOf(p)
+	if s == nil {
+		return false
+	}
+	return s.isUserArenaChunk
+}
+
+// freeUserArenaChunk releases the user arena represented by s back to the runtime.
+//
+// x must be a live pointer within s.
+//
+// The runtime will set the user arena to fault once it's safe (the GC is no longer running)
+// and then once the user arena is no longer referenced by the application, will allow it to
+// be reused.
+func freeUserArenaChunk(s *mspan, x unsafe.Pointer) {
+	if !s.isUserArenaChunk {
+		throw("span is not for a user arena")
+	}
+	if s.npages*pageSize != userArenaChunkBytes {
+		throw("invalid user arena span size")
+	}
+
+	// Mark the region as free to various sanitizers immediately instead
+	// of handling them at sweep time.
+	if raceenabled {
+		racefree(unsafe.Pointer(s.base()), s.elemsize)
+	}
+	if msanenabled {
+		msanfree(unsafe.Pointer(s.base()), s.elemsize)
+	}
+	if asanenabled {
+		asanpoison(unsafe.Pointer(s.base()), s.elemsize)
+	}
+
+	// Make ourselves non-preemptible as we manipulate state and statistics.
+	//
+	// Also required by setUserArenaChunkToFault.
+	mp := acquirem()
+
+	// We can only set user arenas to fault if we're in the _GCoff phase.
+	if gcphase == _GCoff {
+		lock(&userArenaState.lock)
+		faultList := userArenaState.fault
+		userArenaState.fault = nil
+		unlock(&userArenaState.lock)
+
+		s.setUserArenaChunkToFault()
+		for _, lc := range faultList {
+			lc.mspan.setUserArenaChunkToFault()
+		}
+
+		// Until the chunks are set to fault, keep them alive via the fault list.
+		KeepAlive(x)
+		KeepAlive(faultList)
+	} else {
+		// Put the user arena on the fault list.
+		lock(&userArenaState.lock)
+		userArenaState.fault = append(userArenaState.fault, liveUserArenaChunk{s, x})
+		unlock(&userArenaState.lock)
+	}
+	releasem(mp)
+}
+
+// allocUserArenaChunk attempts to reuse a free user arena chunk represented
+// as a span.
+//
+// Must be in a non-preemptible state to ensure the consistency of statistics
+// exported to MemStats.
+//
+// Acquires the heap lock. Must run on the system stack for that reason.
+//
+//go:systemstack
+func (h *mheap) allocUserArenaChunk() *mspan {
+	var s *mspan
+	var base uintptr
+
+	// First check the free list.
+	lock(&h.lock)
+	if !h.userArena.readyList.isEmpty() {
+		s = h.userArena.readyList.first
+		h.userArena.readyList.remove(s)
+		base = s.base()
+	} else {
+		// Free list was empty, so allocate a new arena.
+		hintList := &h.userArena.arenaHints
+		if raceenabled {
+			// In race mode just use the regular heap hints. We might fragment
+			// the address space, but the race detector requires that the heap
+			// is mapped contiguously.
+			hintList = &h.arenaHints
+		}
+		v, size := h.sysAlloc(userArenaChunkBytes, hintList, false)
+		if size%userArenaChunkBytes != 0 {
+			throw("sysAlloc size is not divisible by userArenaChunkBytes")
+		}
+		if size > userArenaChunkBytes {
+			// We got more than we asked for. This can happen if
+			// heapArenaBytes > userArenaChunkBytes, or if sysAlloc just returns
+			// some extra as a result of trying to find an aligned region.
+			//
+			// Divide it up and put it on the ready list.
+			for i := uintptr(userArenaChunkBytes); i < size; i += userArenaChunkBytes {
+				s := h.allocMSpanLocked()
+				s.init(uintptr(v)+i, userArenaChunkPages)
+				h.userArena.readyList.insertBack(s)
+			}
+			size = userArenaChunkBytes
+		}
+		base = uintptr(v)
+		if base == 0 {
+			// Out of memory.
+			unlock(&h.lock)
+			return nil
+		}
+		s = h.allocMSpanLocked()
+	}
+	unlock(&h.lock)
+
+	// sysAlloc returns Reserved address space, and any span we're
+	// reusing is set to fault (so, also Reserved), so transition
+	// it to Prepared and then Ready.
+	//
+	// Unlike (*mheap).grow, just map in everything that we
+	// asked for. We're likely going to use it all.
+	sysMap(unsafe.Pointer(base), userArenaChunkBytes, &gcController.heapReleased)
+	sysUsed(unsafe.Pointer(base), userArenaChunkBytes, userArenaChunkBytes)
+
+	// Model the user arena as a heap span for a large object.
+	spc := makeSpanClass(0, false)
+	h.initSpan(s, spanAllocHeap, spc, base, userArenaChunkPages)
+	s.isUserArenaChunk = true
+
+	// Account for this new arena chunk memory.
+	gcController.heapInUse.add(int64(userArenaChunkBytes))
+	gcController.heapReleased.add(-int64(userArenaChunkBytes))
+
+	stats := memstats.heapStats.acquire()
+	atomic.Xaddint64(&stats.inHeap, int64(userArenaChunkBytes))
+	atomic.Xaddint64(&stats.committed, int64(userArenaChunkBytes))
+
+	// Model the arena as a single large malloc.
+	atomic.Xadd64(&stats.largeAlloc, int64(userArenaChunkBytes))
+	atomic.Xadd64(&stats.largeAllocCount, 1)
+	memstats.heapStats.release()
+
+	// Count the alloc in inconsistent, internal stats.
+	gcController.totalAlloc.Add(int64(userArenaChunkBytes))
+
+	// Update heapLive.
+	gcController.update(int64(userArenaChunkBytes), 0)
+
+	// Put the large span in the mcentral swept list so that it's
+	// visible to the background sweeper.
+	h.central[spc].mcentral.fullSwept(h.sweepgen).push(s)
+	s.limit = s.base() + userArenaChunkBytes
+	s.freeindex = 1
+	s.allocCount = 1
+
+	// This must clear the entire heap bitmap so that it's safe
+	// to allocate noscan data without writing anything out.
+	s.initHeapBits(true)
+
+	// Clear the span preemptively. It's an arena chunk, so let's assume
+	// everything is going to be used.
+	//
+	// This also seems to make a massive difference as to whether or
+	// not Linux decides to back this memory with transparent huge
+	// pages. There's latency involved in this zeroing, but the hugepage
+	// gains are almost always worth it. Note: it's important that we
+	// clear even if it's freshly mapped and we know there's no point
+	// to zeroing as *that* is the critical signal to use huge pages.
+	memclrNoHeapPointers(unsafe.Pointer(s.base()), s.elemsize)
+	s.needzero = 0
+
+	s.freeIndexForScan = 1
+
+	// Set up the range for allocation.
+	s.userArenaChunkFree = makeAddrRange(base, s.limit)
+	return s
+}
diff --git a/src/runtime/arena_test.go b/src/runtime/arena_test.go
new file mode 100644
index 0000000..7e121ad
--- /dev/null
+++ b/src/runtime/arena_test.go
@@ -0,0 +1,529 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"internal/goarch"
+	"reflect"
+	. "runtime"
+	"runtime/debug"
+	"runtime/internal/atomic"
+	"testing"
+	"time"
+	"unsafe"
+)
+
+type smallScalar struct {
+	X uintptr
+}
+type smallPointer struct {
+	X *smallPointer
+}
+type smallPointerMix struct {
+	A *smallPointer
+	B byte
+	C *smallPointer
+	D [11]byte
+}
+type mediumScalarEven [8192]byte
+type mediumScalarOdd [3321]byte
+type mediumPointerEven [1024]*smallPointer
+type mediumPointerOdd [1023]*smallPointer
+
+type largeScalar [UserArenaChunkBytes + 1]byte
+type largePointer [UserArenaChunkBytes/unsafe.Sizeof(&smallPointer{}) + 1]*smallPointer
+
+func TestUserArena(t *testing.T) {
+	// Set GOMAXPROCS to 2 so we don't run too many of these
+	// tests in parallel.
+	defer GOMAXPROCS(GOMAXPROCS(2))
+
+	// Start a subtest so that we can clean up after any parallel tests within.
+	t.Run("Alloc", func(t *testing.T) {
+		ss := &smallScalar{5}
+		runSubTestUserArenaNew(t, ss, true)
+
+		sp := &smallPointer{new(smallPointer)}
+		runSubTestUserArenaNew(t, sp, true)
+
+		spm := &smallPointerMix{sp, 5, nil, [11]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}}
+		runSubTestUserArenaNew(t, spm, true)
+
+		mse := new(mediumScalarEven)
+		for i := range mse {
+			mse[i] = 121
+		}
+		runSubTestUserArenaNew(t, mse, true)
+
+		mso := new(mediumScalarOdd)
+		for i := range mso {
+			mso[i] = 122
+		}
+		runSubTestUserArenaNew(t, mso, true)
+
+		mpe := new(mediumPointerEven)
+		for i := range mpe {
+			mpe[i] = sp
+		}
+		runSubTestUserArenaNew(t, mpe, true)
+
+		mpo := new(mediumPointerOdd)
+		for i := range mpo {
+			mpo[i] = sp
+		}
+		runSubTestUserArenaNew(t, mpo, true)
+
+		ls := new(largeScalar)
+		for i := range ls {
+			ls[i] = 123
+		}
+		// Not in parallel because we don't want to hold this large allocation live.
+		runSubTestUserArenaNew(t, ls, false)
+
+		lp := new(largePointer)
+		for i := range lp {
+			lp[i] = sp
+		}
+		// Not in parallel because we don't want to hold this large allocation live.
+		runSubTestUserArenaNew(t, lp, false)
+
+		sss := make([]smallScalar, 25)
+		for i := range sss {
+			sss[i] = smallScalar{12}
+		}
+		runSubTestUserArenaSlice(t, sss, true)
+
+		mpos := make([]mediumPointerOdd, 5)
+		for i := range mpos {
+			mpos[i] = *mpo
+		}
+		runSubTestUserArenaSlice(t, mpos, true)
+
+		sps := make([]smallPointer, UserArenaChunkBytes/unsafe.Sizeof(smallPointer{})+1)
+		for i := range sps {
+			sps[i] = *sp
+		}
+		// Not in parallel because we don't want to hold this large allocation live.
+		runSubTestUserArenaSlice(t, sps, false)
+
+		// Test zero-sized types.
+		t.Run("struct{}", func(t *testing.T) {
+			arena := NewUserArena()
+			var x any
+			x = (*struct{})(nil)
+			arena.New(&x)
+			if v := unsafe.Pointer(x.(*struct{})); v != ZeroBase {
+				t.Errorf("expected zero-sized type to be allocated as zerobase: got %x, want %x", v, ZeroBase)
+			}
+			arena.Free()
+		})
+		t.Run("[]struct{}", func(t *testing.T) {
+			arena := NewUserArena()
+			var sl []struct{}
+			arena.Slice(&sl, 10)
+			if v := unsafe.Pointer(&sl[0]); v != ZeroBase {
+				t.Errorf("expected zero-sized type to be allocated as zerobase: got %x, want %x", v, ZeroBase)
+			}
+			arena.Free()
+		})
+		t.Run("[]int (cap 0)", func(t *testing.T) {
+			arena := NewUserArena()
+			var sl []int
+			arena.Slice(&sl, 0)
+			if len(sl) != 0 {
+				t.Errorf("expected requested zero-sized slice to still have zero length: got %x, want 0", len(sl))
+			}
+			arena.Free()
+		})
+	})
+
+	// Run a GC cycle to get any arenas off the quarantine list.
+	GC()
+
+	if n := GlobalWaitingArenaChunks(); n != 0 {
+		t.Errorf("expected zero waiting arena chunks, found %d", n)
+	}
+}
+
+func runSubTestUserArenaNew[S comparable](t *testing.T, value *S, parallel bool) {
+	t.Run(reflect.TypeOf(value).Elem().Name(), func(t *testing.T) {
+		if parallel {
+			t.Parallel()
+		}
+
+		// Allocate and write data, enough to exhaust the arena.
+		//
+		// This is an underestimate, likely leaving some space in the arena. That's a good thing,
+		// because it gives us coverage of boundary cases.
+		n := int(UserArenaChunkBytes / unsafe.Sizeof(*value))
+		if n == 0 {
+			n = 1
+		}
+
+		// Create a new arena and do a bunch of operations on it.
+		arena := NewUserArena()
+
+		arenaValues := make([]*S, 0, n)
+		for j := 0; j < n; j++ {
+			var x any
+			x = (*S)(nil)
+			arena.New(&x)
+			s := x.(*S)
+			*s = *value
+			arenaValues = append(arenaValues, s)
+		}
+		// Check integrity of allocated data.
+		for _, s := range arenaValues {
+			if *s != *value {
+				t.Errorf("failed integrity check: got %#v, want %#v", *s, *value)
+			}
+		}
+
+		// Release the arena.
+		arena.Free()
+	})
+}
+
+func runSubTestUserArenaSlice[S comparable](t *testing.T, value []S, parallel bool) {
+	t.Run("[]"+reflect.TypeOf(value).Elem().Name(), func(t *testing.T) {
+		if parallel {
+			t.Parallel()
+		}
+
+		// Allocate and write data, enough to exhaust the arena.
+		//
+		// This is an underestimate, likely leaving some space in the arena. That's a good thing,
+		// because it gives us coverage of boundary cases.
+		n := int(UserArenaChunkBytes / (unsafe.Sizeof(*new(S)) * uintptr(cap(value))))
+		if n == 0 {
+			n = 1
+		}
+
+		// Create a new arena and do a bunch of operations on it.
+		arena := NewUserArena()
+
+		arenaValues := make([][]S, 0, n)
+		for j := 0; j < n; j++ {
+			var sl []S
+			arena.Slice(&sl, cap(value))
+			copy(sl, value)
+			arenaValues = append(arenaValues, sl)
+		}
+		// Check integrity of allocated data.
+		for _, sl := range arenaValues {
+			for i := range sl {
+				got := sl[i]
+				want := value[i]
+				if got != want {
+					t.Errorf("failed integrity check: got %#v, want %#v at index %d", got, want, i)
+				}
+			}
+		}
+
+		// Release the arena.
+		arena.Free()
+	})
+}
+
+func TestUserArenaLiveness(t *testing.T) {
+	t.Run("Free", func(t *testing.T) {
+		testUserArenaLiveness(t, false)
+	})
+	t.Run("Finalizer", func(t *testing.T) {
+		testUserArenaLiveness(t, true)
+	})
+}
+
+func testUserArenaLiveness(t *testing.T, useArenaFinalizer bool) {
+	// Disable the GC so that there's zero chance we try doing anything arena related *during*
+	// a mark phase, since otherwise a bunch of arenas could end up on the fault list.
+	defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+	// Defensively ensure that any full arena chunks leftover from previous tests have been cleared.
+	GC()
+	GC()
+
+	arena := NewUserArena()
+
+	// Allocate a few pointer-ful but un-initialized objects so that later we can
+	// place a reference to a heap object at a more interesting location.
+	for i := 0; i < 3; i++ {
+		var x any
+		x = (*mediumPointerOdd)(nil)
+		arena.New(&x)
+	}
+
+	var x any
+	x = (*smallPointerMix)(nil)
+	arena.New(&x)
+	v := x.(*smallPointerMix)
+
+	var safeToFinalize atomic.Bool
+	var finalized atomic.Bool
+	v.C = new(smallPointer)
+	SetFinalizer(v.C, func(_ *smallPointer) {
+		if !safeToFinalize.Load() {
+			t.Error("finalized arena-referenced object unexpectedly")
+		}
+		finalized.Store(true)
+	})
+
+	// Make sure it stays alive.
+	GC()
+	GC()
+
+	// In order to ensure the object can be freed, we now need to make sure to use
+	// the entire arena. Exhaust the rest of the arena.
+
+	for i := 0; i < int(UserArenaChunkBytes/unsafe.Sizeof(mediumScalarEven{})); i++ {
+		var x any
+		x = (*mediumScalarEven)(nil)
+		arena.New(&x)
+	}
+
+	// Make sure it stays alive again.
+	GC()
+	GC()
+
+	v = nil
+
+	safeToFinalize.Store(true)
+	if useArenaFinalizer {
+		arena = nil
+
+		// Try to queue the arena finalizer.
+		GC()
+		GC()
+
+		// In order for the finalizer we actually want to run to execute,
+		// we need to make sure this one runs first.
+		if !BlockUntilEmptyFinalizerQueue(int64(2 * time.Second)) {
+			t.Fatal("finalizer queue was never emptied")
+		}
+	} else {
+		// Free the arena explicitly.
+		arena.Free()
+	}
+
+	// Try to queue the object's finalizer that we set earlier.
+	GC()
+	GC()
+
+	if !BlockUntilEmptyFinalizerQueue(int64(2 * time.Second)) {
+		t.Fatal("finalizer queue was never emptied")
+	}
+	if !finalized.Load() {
+		t.Error("expected arena-referenced object to be finalized")
+	}
+}
+
+func TestUserArenaClearsPointerBits(t *testing.T) {
+	// This is a regression test for a serious issue wherein if pointer bits
+	// aren't properly cleared, it's possible to allocate scalar data down
+	// into a previously pointer-ful area, causing misinterpretation by the GC.
+
+	// Create a large object, grab a pointer into it, and free it.
+	x := new([8 << 20]byte)
+	xp := uintptr(unsafe.Pointer(&x[124]))
+	var finalized atomic.Bool
+	SetFinalizer(x, func(_ *[8 << 20]byte) {
+		finalized.Store(true)
+	})
+
+	// Write three chunks worth of pointer data. Three gives us a
+	// high likelihood that when we write 2 later, we'll get the behavior
+	// we want.
+	a := NewUserArena()
+	for i := 0; i < int(UserArenaChunkBytes/goarch.PtrSize*3); i++ {
+		var x any
+		x = (*smallPointer)(nil)
+		a.New(&x)
+	}
+	a.Free()
+
+	// Recycle the arena chunks.
+	GC()
+	GC()
+
+	a = NewUserArena()
+	for i := 0; i < int(UserArenaChunkBytes/goarch.PtrSize*2); i++ {
+		var x any
+		x = (*smallScalar)(nil)
+		a.New(&x)
+		v := x.(*smallScalar)
+		// Write a pointer that should not keep x alive.
+		*v = smallScalar{xp}
+	}
+	KeepAlive(x)
+	x = nil
+
+	// Try to free x.
+	GC()
+	GC()
+
+	if !BlockUntilEmptyFinalizerQueue(int64(2 * time.Second)) {
+		t.Fatal("finalizer queue was never emptied")
+	}
+	if !finalized.Load() {
+		t.Fatal("heap allocation kept alive through non-pointer reference")
+	}
+
+	// Clean up the arena.
+	a.Free()
+	GC()
+	GC()
+}
+
+func TestUserArenaCloneString(t *testing.T) {
+	a := NewUserArena()
+
+	// A static string (not on heap or arena)
+	var s = "abcdefghij"
+
+	// Create a byte slice in the arena, initialize it with s
+	var b []byte
+	a.Slice(&b, len(s))
+	copy(b, s)
+
+	// Create a string using the same memory as the byte slice, hence in
+	// the arena. This could be an arena API, but hasn't really been needed
+	// yet.
+	var as string
+	asHeader := (*reflect.StringHeader)(unsafe.Pointer(&as))
+	asHeader.Data = (*reflect.SliceHeader)(unsafe.Pointer(&b)).Data
+	asHeader.Len = len(b)
+
+	// Clone should make a copy of as, since it is in the arena.
+	asCopy := UserArenaClone(as)
+	if (*reflect.StringHeader)(unsafe.Pointer(&as)).Data == (*reflect.StringHeader)(unsafe.Pointer(&asCopy)).Data {
+		t.Error("Clone did not make a copy")
+	}
+
+	// Clone should make a copy of subAs, since subAs is just part of as and so is in the arena.
+	subAs := as[1:3]
+	subAsCopy := UserArenaClone(subAs)
+	if (*reflect.StringHeader)(unsafe.Pointer(&subAs)).Data == (*reflect.StringHeader)(unsafe.Pointer(&subAsCopy)).Data {
+		t.Error("Clone did not make a copy")
+	}
+	if len(subAs) != len(subAsCopy) {
+		t.Errorf("Clone made an incorrect copy (bad length): %d -> %d", len(subAs), len(subAsCopy))
+	} else {
+		for i := range subAs {
+			if subAs[i] != subAsCopy[i] {
+				t.Errorf("Clone made an incorrect copy (data at index %d): %d -> %d", i, subAs[i], subAs[i])
+			}
+		}
+	}
+
+	// Clone should not make a copy of doubleAs, since doubleAs will be on the heap.
+	doubleAs := as + as
+	doubleAsCopy := UserArenaClone(doubleAs)
+	if (*reflect.StringHeader)(unsafe.Pointer(&doubleAs)).Data != (*reflect.StringHeader)(unsafe.Pointer(&doubleAsCopy)).Data {
+		t.Error("Clone should not have made a copy")
+	}
+
+	// Clone should not make a copy of s, since s is a static string.
+	sCopy := UserArenaClone(s)
+	if (*reflect.StringHeader)(unsafe.Pointer(&s)).Data != (*reflect.StringHeader)(unsafe.Pointer(&sCopy)).Data {
+		t.Error("Clone should not have made a copy")
+	}
+
+	a.Free()
+}
+
+func TestUserArenaClonePointer(t *testing.T) {
+	a := NewUserArena()
+
+	// Clone should not make a copy of a heap-allocated smallScalar.
+	x := Escape(new(smallScalar))
+	xCopy := UserArenaClone(x)
+	if unsafe.Pointer(x) != unsafe.Pointer(xCopy) {
+		t.Errorf("Clone should not have made a copy: %#v -> %#v", x, xCopy)
+	}
+
+	// Clone should make a copy of an arena-allocated smallScalar.
+	var i any
+	i = (*smallScalar)(nil)
+	a.New(&i)
+	xArena := i.(*smallScalar)
+	xArenaCopy := UserArenaClone(xArena)
+	if unsafe.Pointer(xArena) == unsafe.Pointer(xArenaCopy) {
+		t.Errorf("Clone should have made a copy: %#v -> %#v", xArena, xArenaCopy)
+	}
+	if *xArena != *xArenaCopy {
+		t.Errorf("Clone made an incorrect copy copy: %#v -> %#v", *xArena, *xArenaCopy)
+	}
+
+	a.Free()
+}
+
+func TestUserArenaCloneSlice(t *testing.T) {
+	a := NewUserArena()
+
+	// A static string (not on heap or arena)
+	var s = "klmnopqrstuv"
+
+	// Create a byte slice in the arena, initialize it with s
+	var b []byte
+	a.Slice(&b, len(s))
+	copy(b, s)
+
+	// Clone should make a copy of b, since it is in the arena.
+	bCopy := UserArenaClone(b)
+	if unsafe.Pointer(&b[0]) == unsafe.Pointer(&bCopy[0]) {
+		t.Errorf("Clone did not make a copy: %#v -> %#v", b, bCopy)
+	}
+	if len(b) != len(bCopy) {
+		t.Errorf("Clone made an incorrect copy (bad length): %d -> %d", len(b), len(bCopy))
+	} else {
+		for i := range b {
+			if b[i] != bCopy[i] {
+				t.Errorf("Clone made an incorrect copy (data at index %d): %d -> %d", i, b[i], bCopy[i])
+			}
+		}
+	}
+
+	// Clone should make a copy of bSub, since bSub is just part of b and so is in the arena.
+	bSub := b[1:3]
+	bSubCopy := UserArenaClone(bSub)
+	if unsafe.Pointer(&bSub[0]) == unsafe.Pointer(&bSubCopy[0]) {
+		t.Errorf("Clone did not make a copy: %#v -> %#v", bSub, bSubCopy)
+	}
+	if len(bSub) != len(bSubCopy) {
+		t.Errorf("Clone made an incorrect copy (bad length): %d -> %d", len(bSub), len(bSubCopy))
+	} else {
+		for i := range bSub {
+			if bSub[i] != bSubCopy[i] {
+				t.Errorf("Clone made an incorrect copy (data at index %d): %d -> %d", i, bSub[i], bSubCopy[i])
+			}
+		}
+	}
+
+	// Clone should not make a copy of bNotArena, since it will not be in an arena.
+	bNotArena := make([]byte, len(s))
+	copy(bNotArena, s)
+	bNotArenaCopy := UserArenaClone(bNotArena)
+	if unsafe.Pointer(&bNotArena[0]) != unsafe.Pointer(&bNotArenaCopy[0]) {
+		t.Error("Clone should not have made a copy")
+	}
+
+	a.Free()
+}
+
+func TestUserArenaClonePanic(t *testing.T) {
+	var s string
+	func() {
+		x := smallScalar{2}
+		defer func() {
+			if v := recover(); v != nil {
+				s = v.(string)
+			}
+		}()
+		UserArenaClone(x)
+	}()
+	if s == "" {
+		t.Errorf("expected panic from Clone")
+	}
+}
diff --git a/src/runtime/asan/asan.go b/src/runtime/asan/asan.go
index 4359f41..25f15ae 100644
--- a/src/runtime/asan/asan.go
+++ b/src/runtime/asan/asan.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build asan && linux && (arm64 || amd64 || riscv64)
+//go:build asan && linux && (arm64 || amd64 || riscv64 || ppc64le)
 
 package asan
 
@@ -34,7 +34,7 @@
 	__asan_poison_memory_region(addr, sz);
 }
 
-// Keep in sync with the defination in compiler-rt
+// Keep in sync with the definition in compiler-rt
 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/asan/asan_interface_internal.h#L41
 // This structure is used to describe the source location of
 // a place where global was defined.
@@ -44,7 +44,7 @@
 	int column_no;
 };
 
-// Keep in sync with the defination in compiler-rt
+// Keep in sync with the definition in compiler-rt
 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/asan/asan_interface_internal.h#L48
 // So far, the current implementation is only compatible with the ASan library from version v7 to v9.
 // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/asan/asan_init_version.h
diff --git a/src/runtime/asan_ppc64le.s b/src/runtime/asan_ppc64le.s
new file mode 100644
index 0000000..d13301a
--- /dev/null
+++ b/src/runtime/asan_ppc64le.s
@@ -0,0 +1,87 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build asan
+
+#include "go_asm.h"
+#include "textflag.h"
+
+#define RARG0 R3
+#define RARG1 R4
+#define RARG2 R5
+#define RARG3 R6
+#define FARG R12
+
+// Called from instrumented code.
+// func runtime·doasanread(addr unsafe.Pointer, sz, sp, pc uintptr)
+TEXT	runtime·doasanread(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	addr+0(FP), RARG0
+	MOVD	sz+8(FP), RARG1
+	MOVD	sp+16(FP), RARG2
+	MOVD	pc+24(FP), RARG3
+	// void __asan_read_go(void *addr, uintptr_t sz, void *sp, void *pc);
+	MOVD	$__asan_read_go(SB), FARG
+	BR	asancall<>(SB)
+
+// func runtime·doasanwrite(addr unsafe.Pointer, sz, sp, pc uintptr)
+TEXT	runtime·doasanwrite(SB),NOSPLIT|NOFRAME,$0-32
+	MOVD	addr+0(FP), RARG0
+	MOVD	sz+8(FP), RARG1
+	MOVD	sp+16(FP), RARG2
+	MOVD	pc+24(FP), RARG3
+	// void __asan_write_go(void *addr, uintptr_t sz, void *sp, void *pc);
+	MOVD	$__asan_write_go(SB), FARG
+	BR	asancall<>(SB)
+
+// func runtime·asanunpoison(addr unsafe.Pointer, sz uintptr)
+TEXT	runtime·asanunpoison(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	addr+0(FP), RARG0
+	MOVD	sz+8(FP), RARG1
+	// void __asan_unpoison_go(void *addr, uintptr_t sz);
+	MOVD	$__asan_unpoison_go(SB), FARG
+	BR	asancall<>(SB)
+
+// func runtime·asanpoison(addr unsafe.Pointer, sz uintptr)
+TEXT	runtime·asanpoison(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	addr+0(FP), RARG0
+	MOVD	sz+8(FP), RARG1
+	// void __asan_poison_go(void *addr, uintptr_t sz);
+	MOVD	$__asan_poison_go(SB), FARG
+	BR	asancall<>(SB)
+
+// func runtime·asanregisterglobals(addr unsafe.Pointer, n uintptr)
+TEXT	runtime·asanregisterglobals(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	addr+0(FP), RARG0
+	MOVD	n+8(FP), RARG1
+	// void __asan_register_globals_go(void *addr, uintptr_t n);
+	MOVD	$__asan_register_globals_go(SB), FARG
+	BR	asancall<>(SB)
+
+// Switches SP to g0 stack and calls (FARG). Arguments already set.
+TEXT	asancall<>(SB), NOSPLIT, $0-0
+	// LR saved in generated prologue
+	// Get info from the current goroutine
+	MOVD	runtime·tls_g(SB), R10  // g offset in TLS
+	MOVD	0(R10), g
+	MOVD	g_m(g), R7		// m for g
+	MOVD	R1, R16			// callee-saved, preserved across C call
+	MOVD	m_g0(R7), R10		// g0 for m
+	CMP	R10, g			// same g0?
+	BEQ	call			// already on g0
+	MOVD	(g_sched+gobuf_sp)(R10), R1 // switch R1
+call:
+	// prepare frame for C ABI
+	SUB	$32, R1			// create frame for callee saving LR, CR, R2 etc.
+	RLDCR	$0, R1, $~15, R1	// align SP to 16 bytes
+	MOVD	FARG, CTR		// address of function to be called
+	MOVD	R0, 0(R1)		// clear back chain pointer
+	BL	(CTR)
+	MOVD	$0, R0			// C code can clobber R0; set it back to 0
+	MOVD	R16, R1			// restore R1;
+	MOVD	runtime·tls_g(SB), R10	// find correct g
+	MOVD	0(R10), g
+	RET
+
+// tls_g, g value for each thread in TLS
+GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8
diff --git a/src/runtime/asm_amd64.h b/src/runtime/asm_amd64.h
index 49e0ee2..f7a8896 100644
--- a/src/runtime/asm_amd64.h
+++ b/src/runtime/asm_amd64.h
@@ -5,10 +5,21 @@
 // Define features that are guaranteed to be supported by setting the AMD64 variable.
 // If a feature is supported, there's no need to check it at runtime every time.
 
+#ifdef GOAMD64_v2
+#define hasPOPCNT
+#define hasSSE42
+#endif
+
 #ifdef GOAMD64_v3
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
 
 #ifdef GOAMD64_v4
+#define hasAVX
 #define hasAVX2
+#define hasPOPCNT
+#define hasSSE42
 #endif
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index d2f7984..13c8de4 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -201,16 +201,16 @@
 	JZ	needtls
 	// arg 1: g0, already in DI
 	MOVQ	$setg_gcc<>(SB), SI // arg 2: setg_gcc
+	MOVQ	$0, DX	// arg 3, 4: not used when using platform's TLS
+	MOVQ	$0, CX
 #ifdef GOOS_android
 	MOVQ	$runtime·tls_g(SB), DX 	// arg 3: &tls_g
 	// arg 4: TLS base, stored in slot 0 (Android's TLS_SLOT_SELF).
 	// Compensate for tls_g (+16).
 	MOVQ	-16(TLS), CX
-#else
-	MOVQ	$0, DX	// arg 3, 4: not used when using platform's TLS
-	MOVQ	$0, CX
 #endif
 #ifdef GOOS_windows
+	MOVQ	$runtime·tls_g(SB), DX 	// arg 3: &tls_g
 	// Adjust for the Win64 calling convention.
 	MOVQ	CX, R9 // arg 4
 	MOVQ	DX, R8 // arg 3
@@ -251,6 +251,10 @@
 	JMP ok
 #endif
 
+#ifdef GOOS_windows
+	CALL	runtime·wintls(SB)
+#endif
+
 	LEAQ	runtime·m0+m_tls(SB), DI
 	CALL	runtime·settls(SB)
 
@@ -2026,6 +2030,9 @@
 DATA runtime·tls_g+0(SB)/8, $16
 GLOBL runtime·tls_g+0(SB), NOPTR, $8
 #endif
+#ifdef GOOS_windows
+GLOBL runtime·tls_g+0(SB), NOPTR, $8
+#endif
 
 // The compiler and assembler's -spectre=ret mode rewrites
 // all indirect CALL AX / JMP AX instructions to be
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index b47184e..591ef2a 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -387,6 +387,13 @@
 	RET
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R3), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOVW	R13, R13
+
 	MOVW	$0, R7
 	B runtime·morestack(SB)
 
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 7836ba1..7eb5bcf 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -320,6 +320,13 @@
 	UNDEF
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R3), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOVD	RSP, RSP
+
 	MOVW	$0, R26
 	B runtime·morestack(SB)
 
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index 3597ebe..1abadb9 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -258,6 +258,13 @@
 	UNDEF
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R3), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOVV	R29, R29
+
 	MOVV	R0, REGCTXT
 	JMP	runtime·morestack(SB)
 
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 4a086b8..877c1bb 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -257,6 +257,13 @@
 	UNDEF
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R3), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOVW	R29, R29
+
 	MOVW	R0, REGCTXT
 	JMP	runtime·morestack(SB)
 
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index c6bcf82..61ff17a 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -334,6 +334,16 @@
 	UNDEF
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R5), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	// Use OR R0, R1 instead of MOVD R1, R1 as the MOVD instruction
+	// has a special effect on Power8, 9 and 10 by lowering the thread
+	// priority and causing a slowdown in execution time.
+
+	OR	R0, R1
 	MOVD	R0, R11
 	BR	runtime·morestack(SB)
 
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
index 00caa9f..31b81ae 100644
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -158,8 +158,8 @@
  */
 
 // Called during function prolog when more stack is needed.
-// Caller has already loaded:
-// R1: framesize, R2: argsize, R3: LR
+// Called with return address (i.e. caller's PC) in X5 (aka T0),
+// and the LR register contains the caller's LR.
 //
 // The traceback routines see morestack on a g0 as being
 // the top of a stack (for example, morestack calling newstack
@@ -209,6 +209,13 @@
 
 // func morestack_noctxt()
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register, and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOV	X2, X2
+
 	MOV	ZERO, CTXT
 	JMP	runtime·morestack(SB)
 
@@ -261,11 +268,7 @@
 
 // func mcall(fn func(*g))
 TEXT runtime·mcall<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-8
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	X10, CTXT
-#else
-	MOV	fn+0(FP), CTXT
-#endif
 
 	// Save caller state in g->sched
 	MOV	X2, (g_sched+gobuf_sp)(g)
@@ -637,7 +640,6 @@
 	MOV	T0, ret+0(FP)
 	RET
 
-#ifdef GOEXPERIMENT_regabiargs
 // spillArgs stores return values from registers to a *internal/abi.RegArgs in X25.
 TEXT ·spillArgs(SB),NOSPLIT,$0-0
 	MOV	X10, (0*8)(X25)
@@ -709,13 +711,6 @@
 	MOVD	(30*8)(X25), F22
 	MOVD	(31*8)(X25), F23
 	RET
-#else
-TEXT ·spillArgs(SB),NOSPLIT,$0-0
-	RET
-
-TEXT ·unspillArgs(SB),NOSPLIT,$0-0
-	RET
-#endif
 
 // gcWriteBarrier performs a heap pointer write and informs the GC.
 //
@@ -825,157 +820,72 @@
 // corresponding runtime handler.
 // The tail call makes these stubs disappear in backtraces.
 TEXT runtime·panicIndex<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicIndex<ABIInternal>(SB)
 TEXT runtime·panicIndexU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicIndexU<ABIInternal>(SB)
 TEXT runtime·panicSliceAlen<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceAlen<ABIInternal>(SB)
 TEXT runtime·panicSliceAlenU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceAlenU<ABIInternal>(SB)
 TEXT runtime·panicSliceAcap<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceAcap<ABIInternal>(SB)
 TEXT runtime·panicSliceAcapU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceAcapU<ABIInternal>(SB)
 TEXT runtime·panicSliceB<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceB<ABIInternal>(SB)
 TEXT runtime·panicSliceBU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceBU<ABIInternal>(SB)
 TEXT runtime·panicSlice3Alen<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T2, X10
 	MOV	T3, X11
-#else
-	MOV	T2, x+0(FP)
-	MOV	T3, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3Alen<ABIInternal>(SB)
 TEXT runtime·panicSlice3AlenU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T2, X10
 	MOV	T3, X11
-#else
-	MOV	T2, x+0(FP)
-	MOV	T3, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3AlenU<ABIInternal>(SB)
 TEXT runtime·panicSlice3Acap<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T2, X10
 	MOV	T3, X11
-#else
-	MOV	T2, x+0(FP)
-	MOV	T3, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3Acap<ABIInternal>(SB)
 TEXT runtime·panicSlice3AcapU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T2, X10
 	MOV	T3, X11
-#else
-	MOV	T2, x+0(FP)
-	MOV	T3, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3AcapU<ABIInternal>(SB)
 TEXT runtime·panicSlice3B<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3B<ABIInternal>(SB)
 TEXT runtime·panicSlice3BU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T1, X10
 	MOV	T2, X11
-#else
-	MOV	T1, x+0(FP)
-	MOV	T2, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3BU<ABIInternal>(SB)
 TEXT runtime·panicSlice3C<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3C<ABIInternal>(SB)
 TEXT runtime·panicSlice3CU<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T0, X10
 	MOV	T1, X11
-#else
-	MOV	T0, x+0(FP)
-	MOV	T1, y+8(FP)
-#endif
 	JMP	runtime·goPanicSlice3CU<ABIInternal>(SB)
 TEXT runtime·panicSliceConvert<ABIInternal>(SB),NOSPLIT,$0-16
-#ifdef GOEXPERIMENT_regabiargs
 	MOV	T2, X10
 	MOV	T3, X11
-#else
-	MOV	T2, x+0(FP)
-	MOV	T3, y+8(FP)
-#endif
 	JMP	runtime·goPanicSliceConvert<ABIInternal>(SB)
 
 DATA	runtime·mainPC+0(SB)/8,$runtime·main<ABIInternal>(SB)
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index 9159a67..334e1aa 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -346,6 +346,13 @@
 	UNDEF
 
 TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	// Force SPWRITE. This function doesn't actually write SP,
+	// but it is called with a special calling convention where
+	// the caller doesn't save LR on stack but passes it as a
+	// register (R5), and the unwinder currently doesn't understand.
+	// Make it SPWRITE to stop unwinding. (See issue 54332)
+	MOVD	R15, R15
+
 	MOVD	$0, R12
 	BR	runtime·morestack(SB)
 
diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s
index d885da6..e075c72 100644
--- a/src/runtime/asm_wasm.s
+++ b/src/runtime/asm_wasm.s
@@ -320,10 +320,8 @@
 		I64Load stackArgs+16(FP); \
 		I32WrapI64; \
 		I64Load stackArgsSize+24(FP); \
-		I64Const $3; \
-		I64ShrU; \
 		I32WrapI64; \
-		Call runtime·wasmMove(SB); \
+		MemoryCopy; \
 	End; \
 	\
 	MOVD f+8(FP), CTXT; \
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go
index b8f0c22..25e0e65 100644
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -35,6 +35,27 @@
 	atomic.StorepNoWB(noescape(ptr), new)
 }
 
+// atomic_storePointer is the implementation of runtime/internal/atomic.UnsafePointer.Store
+// (like StoreNoWB but with the write barrier).
+//
+//go:nosplit
+//go:linkname atomic_storePointer runtime/internal/atomic.storePointer
+func atomic_storePointer(ptr *unsafe.Pointer, new unsafe.Pointer) {
+	atomicstorep(unsafe.Pointer(ptr), new)
+}
+
+// atomic_casPointer is the implementation of runtime/internal/atomic.UnsafePointer.CompareAndSwap
+// (like CompareAndSwapNoWB but with the write barrier).
+//
+//go:nosplit
+//go:linkname atomic_casPointer runtime/internal/atomic.casPointer
+func atomic_casPointer(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
+	if writeBarrier.enabled {
+		atomicwb(ptr, new)
+	}
+	return atomic.Casp1(ptr, old, new)
+}
+
 // Like above, but implement in terms of sync/atomic's uintptr operations.
 // We cannot just call the runtime routines, because the race detector expects
 // to be able to intercept the sync/atomic forms but not the runtime forms.
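
The two helpers above hook runtime/internal/atomic's UnsafePointer type up to
write-barrier-aware Store and CompareAndSwap implementations via
//go:linkname. Outside the runtime, the corresponding GC-safe pattern is
sync/atomic.Pointer[T] (Go 1.19+); a small sketch, where the config type is
only an illustrative stand-in:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    type config struct{ limit int }

    func main() {
    	var cur atomic.Pointer[config]

    	old := &config{limit: 10}
    	cur.Store(old) // atomic pointer store, safe for the garbage collector

    	// CompareAndSwap installs the new value only if no other goroutine
    	// has replaced 'old' in the meantime.
    	if cur.CompareAndSwap(old, &config{limit: 20}) {
    		fmt.Println("new limit:", cur.Load().limit)
    	}
    }
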
diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go
index 298aa63..b8473e5 100644
--- a/src/runtime/cgo/cgo.go
+++ b/src/runtime/cgo/cgo.go
@@ -23,9 +23,18 @@
 #cgo solaris LDFLAGS: -lxnet
 #cgo solaris LDFLAGS: -lsocket
 
-#cgo CFLAGS: -Wall -Werror
+// We use -fno-stack-protector because internal linking won't find
+// the support functions. See issues #52919 and #54313.
+#cgo CFLAGS: -Wall -Werror -fno-stack-protector
 
 #cgo solaris CPPFLAGS: -D_POSIX_PTHREAD_SEMANTICS
 
 */
 import "C"
+
+import "runtime/internal/sys"
+
+// Incomplete is used specifically for the semantics of incomplete C types.
+type Incomplete struct {
+	_ sys.NotInHeap
+}
diff --git a/src/runtime/cgo/gcc_386.S b/src/runtime/cgo/gcc_386.S
index ff55b2c..5e6d715 100644
--- a/src/runtime/cgo/gcc_386.S
+++ b/src/runtime/cgo/gcc_386.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_386.S"
+
 /*
  * Apple still insists on underscore prefixes for C function names.
  */
diff --git a/src/runtime/cgo/gcc_aix_ppc64.S b/src/runtime/cgo/gcc_aix_ppc64.S
index a00fae2..a77363e 100644
--- a/src/runtime/cgo/gcc_aix_ppc64.S
+++ b/src/runtime/cgo/gcc_aix_ppc64.S
@@ -5,6 +5,8 @@
 // +build ppc64
 // +build aix
 
+.file "gcc_aix_ppc64.S"
+
 /*
  * void crosscall_ppc64(void (*fn)(void), void *g)
  *
diff --git a/src/runtime/cgo/gcc_amd64.S b/src/runtime/cgo/gcc_amd64.S
index 46699d1..5a1629e 100644
--- a/src/runtime/cgo/gcc_amd64.S
+++ b/src/runtime/cgo/gcc_amd64.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_amd64.S"
+
 /*
  * Apple still insists on underscore prefixes for C function names.
  */
diff --git a/src/runtime/cgo/gcc_arm.S b/src/runtime/cgo/gcc_arm.S
index fe1c48b..6e8c14a 100644
--- a/src/runtime/cgo/gcc_arm.S
+++ b/src/runtime/cgo/gcc_arm.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_arm.S"
+
 /*
  * Apple still insists on underscore prefixes for C function names.
  */
diff --git a/src/runtime/cgo/gcc_arm64.S b/src/runtime/cgo/gcc_arm64.S
index 9154d2a..865f67c 100644
--- a/src/runtime/cgo/gcc_arm64.S
+++ b/src/runtime/cgo/gcc_arm64.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_arm64.S"
+
 /*
  * Apple still insists on underscore prefixes for C function names.
  */
diff --git a/src/runtime/cgo/gcc_darwin_amd64.c b/src/runtime/cgo/gcc_darwin_amd64.c
index d5b7fd8..955b81d 100644
--- a/src/runtime/cgo/gcc_darwin_amd64.c
+++ b/src/runtime/cgo/gcc_darwin_amd64.c
@@ -14,15 +14,12 @@
 void
 x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
 {
-	pthread_attr_t attr;
 	size_t size;
 
 	setg_gcc = setg;
 
-	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
-	g->stacklo = (uintptr)&attr - size + 4096;
-	pthread_attr_destroy(&attr);
+	size = pthread_get_stacksize_np(pthread_self());
+	g->stacklo = (uintptr)&size - size + 4096;
 }
 
 
@@ -38,8 +35,9 @@
 	sigfillset(&ign);
 	pthread_sigmask(SIG_SETMASK, &ign, &oset);
 
+	size = pthread_get_stacksize_np(pthread_self());
 	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
+	pthread_attr_setstacksize(&attr, size);
 	// Leave stacklo=0 and set stackhi=size; mstart will do the rest.
 	ts->g->stackhi = size;
 	err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c
index 24be675..5b77a42 100644
--- a/src/runtime/cgo/gcc_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_darwin_arm64.c
@@ -36,8 +36,9 @@
 	sigfillset(&ign);
 	pthread_sigmask(SIG_SETMASK, &ign, &oset);
 
+	size = pthread_get_stacksize_np(pthread_self());
 	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
+	pthread_attr_setstacksize(&attr, size);
 	// Leave stacklo=0 and set stackhi=size; mstart will do the rest.
 	ts->g->stackhi = size;
 	err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
@@ -126,15 +127,12 @@
 void
 x_cgo_init(G *g, void (*setg)(void*))
 {
-	pthread_attr_t attr;
 	size_t size;
 
 	//fprintf(stderr, "x_cgo_init = %p\n", &x_cgo_init); // aid debugging in presence of ASLR
 	setg_gcc = setg;
-	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
-	g->stacklo = (uintptr)&attr - size + 4096;
-	pthread_attr_destroy(&attr);
+	size = pthread_get_stacksize_np(pthread_self());
+	g->stacklo = (uintptr)&size - size + 4096;
 
 #if TARGET_OS_IPHONE
 	darwin_arm_init_mach_exception_handler();
diff --git a/src/runtime/cgo/gcc_freebsd_riscv64.c b/src/runtime/cgo/gcc_freebsd_riscv64.c
new file mode 100644
index 0000000..6ce5e65
--- /dev/null
+++ b/src/runtime/cgo/gcc_freebsd_riscv64.c
@@ -0,0 +1,67 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <sys/types.h>
+#include <errno.h>
+#include <sys/signalvar.h>
+#include <pthread.h>
+#include <signal.h>
+#include <string.h>
+#include "libcgo.h"
+#include "libcgo_unix.h"
+
+static void* threadentry(void*);
+static void (*setg_gcc)(void*);
+
+void
+x_cgo_init(G *g, void (*setg)(void*))
+{
+	pthread_attr_t attr;
+	size_t size;
+
+	setg_gcc = setg;
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	g->stacklo = (uintptr)&attr - size + 4096;
+	pthread_attr_destroy(&attr);
+}
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+	pthread_attr_t attr;
+	sigset_t ign, oset;
+	pthread_t p;
+	size_t size;
+	int err;
+
+	SIGFILLSET(ign);
+	pthread_sigmask(SIG_SETMASK, &ign, &oset);
+
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	// Leave stacklo=0 and set stackhi=size; mstart will do the rest.
+	ts->g->stackhi = size;
+	err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
+
+	pthread_sigmask(SIG_SETMASK, &oset, nil);
+
+	if (err != 0) {
+		fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err));
+		abort();
+	}
+}
+
+extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;
+	free(v);
+
+	crosscall1(ts.fn, setg_gcc, (void*)ts.g);
+	return nil;
+}
diff --git a/src/runtime/cgo/gcc_linux_ppc64x.S b/src/runtime/cgo/gcc_linux_ppc64x.S
index 595eb38..957ef3a 100644
--- a/src/runtime/cgo/gcc_linux_ppc64x.S
+++ b/src/runtime/cgo/gcc_linux_ppc64x.S
@@ -5,6 +5,8 @@
 // +build ppc64 ppc64le
 // +build linux
 
+.file "gcc_linux_ppc64x.S"
+
 /*
  * Apple still insists on underscore prefixes for C function names.
  */
diff --git a/src/runtime/cgo/gcc_loong64.S b/src/runtime/cgo/gcc_loong64.S
index 100aa33..6b7668f 100644
--- a/src/runtime/cgo/gcc_loong64.S
+++ b/src/runtime/cgo/gcc_loong64.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_loong64.S"
+
 /*
  * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
diff --git a/src/runtime/cgo/gcc_mips64x.S b/src/runtime/cgo/gcc_mips64x.S
index 908dd21..ec24d71 100644
--- a/src/runtime/cgo/gcc_mips64x.S
+++ b/src/runtime/cgo/gcc_mips64x.S
@@ -4,6 +4,8 @@
 
 // +build mips64 mips64le
 
+.file "gcc_mips64x.S"
+
 /*
  * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
diff --git a/src/runtime/cgo/gcc_mipsx.S b/src/runtime/cgo/gcc_mipsx.S
index 54f4b82..2867f6a 100644
--- a/src/runtime/cgo/gcc_mipsx.S
+++ b/src/runtime/cgo/gcc_mipsx.S
@@ -4,6 +4,8 @@
 
 // +build mips mipsle
 
+.file "gcc_mipsx.S"
+
 /*
  * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
diff --git a/src/runtime/cgo/gcc_mmap.c b/src/runtime/cgo/gcc_mmap.c
index 698a7e3..83d857f 100644
--- a/src/runtime/cgo/gcc_mmap.c
+++ b/src/runtime/cgo/gcc_mmap.c
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux,amd64 linux,arm64 linux,ppc64le
+// +build linux,amd64 linux,arm64 linux,ppc64le freebsd,amd64
 
 #include <errno.h>
 #include <stdint.h>
diff --git a/src/runtime/cgo/gcc_riscv64.S b/src/runtime/cgo/gcc_riscv64.S
index f429dc6..8f07649 100644
--- a/src/runtime/cgo/gcc_riscv64.S
+++ b/src/runtime/cgo/gcc_riscv64.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_riscv64.S"
+
 /*
  * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
  *
diff --git a/src/runtime/cgo/gcc_s390x.S b/src/runtime/cgo/gcc_s390x.S
index 614de4b..8bd30fe 100644
--- a/src/runtime/cgo/gcc_s390x.S
+++ b/src/runtime/cgo/gcc_s390x.S
@@ -2,6 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+.file "gcc_s390x.S"
+
 /*
  * void crosscall_s390x(void (*fn)(void), void *g)
  *
diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c
index 996947e..3ff3c64 100644
--- a/src/runtime/cgo/gcc_windows_amd64.c
+++ b/src/runtime/cgo/gcc_windows_amd64.c
@@ -13,11 +13,13 @@
 
 static void threadentry(void*);
 static void (*setg_gcc)(void*);
+static DWORD *tls_g;
 
 void
 x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
 {
 	setg_gcc = setg;
+	tls_g = (DWORD *)tlsg;
 }
 
 
@@ -41,8 +43,8 @@
 	 * Set specific keys in thread local storage.
 	 */
 	asm volatile (
-	  "movq %0, %%gs:0x28\n"	// MOVL tls0, 0x28(GS)
-	  :: "r"(ts.tls)
+	  "movq %0, %%gs:0(%1)\n"	// MOVL tls0, 0(tls_g)(GS)
+	  :: "r"(ts.tls), "r"(*tls_g)
 	);
 
 	crosscall_amd64(ts.fn, setg_gcc, (void*)ts.g);
diff --git a/src/runtime/cgo/mmap.go b/src/runtime/cgo/mmap.go
index eae0a9e..2f7e83b 100644
--- a/src/runtime/cgo/mmap.go
+++ b/src/runtime/cgo/mmap.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (linux && amd64) || (linux && arm64)
+//go:build (linux && amd64) || (linux && arm64) || (freebsd && amd64)
 
 package cgo
 
diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go
index 4cb3e65..30660f7 100644
--- a/src/runtime/cgo_mmap.go
+++ b/src/runtime/cgo_mmap.go
@@ -4,7 +4,7 @@
 
 // Support for memory sanitizer. See runtime/cgo/mmap.go.
 
-//go:build (linux && amd64) || (linux && arm64)
+//go:build (linux && amd64) || (linux && arm64) || (freebsd && amd64)
 
 package runtime
 
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 892654e..9c75280 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -86,7 +86,6 @@
 
 import (
 	"internal/goarch"
-	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -259,7 +258,7 @@
 	// We must still stay on the same m.
 	defer unlockOSThread()
 
-	if gp.m.needextram || atomic.Load(&extraMWaiters) > 0 {
+	if gp.m.needextram || extraMWaiters.Load() > 0 {
 		gp.m.needextram = false
 		systemstack(newextram)
 	}
@@ -347,12 +346,12 @@
 	}
 }
 
-// called from assembly
+// called from assembly.
 func badcgocallback() {
 	throw("misaligned stack in cgocallback")
 }
 
-// called from (incomplete) assembly
+// called from (incomplete) assembly.
 func cgounimpl() {
 	throw("cgo not implemented")
 }
@@ -568,17 +567,16 @@
 		if base == 0 {
 			return
 		}
-		hbits := heapBitsForAddr(base)
 		n := span.elemsize
-		for i = uintptr(0); i < n; i += goarch.PtrSize {
-			if !hbits.morePointers() {
-				// No more possible pointers.
+		hbits := heapBitsForAddr(base, n)
+		for {
+			var addr uintptr
+			if hbits, addr = hbits.next(); addr == 0 {
 				break
 			}
-			if hbits.isPointer() && cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) {
+			if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(addr))) {
 				panic(errorString(msg))
 			}
-			hbits = hbits.next()
 		}
 
 		return
diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go
index 74a2ec0..84e7516 100644
--- a/src/runtime/cgocheck.go
+++ b/src/runtime/cgocheck.go
@@ -32,14 +32,14 @@
 
 	// If we are running on the system stack then dst might be an
 	// address on the stack, which is OK.
-	g := getg()
-	if g == g.m.g0 || g == g.m.gsignal {
+	gp := getg()
+	if gp == gp.m.g0 || gp == gp.m.gsignal {
 		return
 	}
 
 	// Allocating memory can write to various mfixalloc structs
 	// that look like they are non-Go memory.
-	if g.m.mallocing != 0 {
+	if gp.m.mallocing != 0 {
 		return
 	}
 
@@ -153,16 +153,16 @@
 
 	// src must be in the regular heap.
 
-	hbits := heapBitsForAddr(uintptr(src))
-	for i := uintptr(0); i < off+size; i += goarch.PtrSize {
-		bits := hbits.bits()
-		if i >= off && bits&bitPointer != 0 {
-			v := *(*unsafe.Pointer)(add(src, i))
-			if cgoIsGoPointer(v) {
-				throw(cgoWriteBarrierFail)
-			}
+	hbits := heapBitsForAddr(uintptr(src), size)
+	for {
+		var addr uintptr
+		if hbits, addr = hbits.next(); addr == 0 {
+			break
 		}
-		hbits = hbits.next()
+		v := *(*unsafe.Pointer)(unsafe.Pointer(addr))
+		if cgoIsGoPointer(v) {
+			throw(cgoWriteBarrierFail)
+		}
 	}
 }
 
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index ca516ad..6a0ad35 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -138,7 +138,7 @@
 	return c.qcount == c.dataqsiz
 }
 
-// entry point for c <- x from compiled code
+// entry point for c <- x from compiled code.
 //
 //go:nosplit
 func chansend1(c *hchan, elem unsafe.Pointer) {
@@ -255,7 +255,7 @@
 	// to park on a channel. The window between when this G's status
 	// changes and when we set gp.activeStackChans is not safe for
 	// stack shrinking.
-	atomic.Store8(&gp.parkingOnChan, 1)
+	gp.parkingOnChan.Store(true)
 	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2)
 	// Ensure the value being sent is kept alive until the
 	// receiver copies it out. The sudog has a pointer to the
@@ -435,7 +435,7 @@
 	return atomic.Loaduint(&c.qcount) == 0
 }
 
-// entry points for <- c from compiled code
+// entry points for <- c from compiled code.
 //
 //go:nosplit
 func chanrecv1(c *hchan, elem unsafe.Pointer) {
@@ -579,7 +579,7 @@
 	// to park on a channel. The window between when this G's status
 	// changes and when we set gp.activeStackChans is not safe for
 	// stack shrinking.
-	atomic.Store8(&gp.parkingOnChan, 1)
+	gp.parkingOnChan.Store(true)
 	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2)
 
 	// someone woke us up
@@ -664,7 +664,7 @@
 	// Mark that it's safe for stack shrinking to occur now,
 	// because any thread acquiring this G's stack for shrinking
 	// is guaranteed to observe activeStackChans after this store.
-	atomic.Store8(&gp.parkingOnChan, 0)
+	gp.parkingOnChan.Store(false)
 	// Make sure we unlock after setting activeStackChans and
 	// unsetting parkingOnChan. The moment we unlock chanLock
 	// we risk gp getting readied by a channel operation and
@@ -791,7 +791,7 @@
 		// We use a flag in the G struct to tell us when someone
 		// else has won the race to signal this goroutine but the goroutine
 		// hasn't removed itself from the queue yet.
-		if sgp.isSelect && !atomic.Cas(&sgp.g.selectDone, 0, 1) {
+		if sgp.isSelect && !sgp.g.selectDone.CompareAndSwap(0, 1) {
 			continue
 		}
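
The chan.go changes fold raw atomic.Store8 and atomic.Cas calls into typed
atomics on the g struct (parkingOnChan.Store, selectDone.CompareAndSwap). The
selectDone pattern, where several wakers race but only the CAS winner may
signal the goroutine, has a direct user-level analog with sync/atomic; a
sketch with purely illustrative names:

    package main

    import (
    	"fmt"
    	"sync"
    	"sync/atomic"
    )

    func main() {
    	var done atomic.Uint32 // plays the role of g.selectDone
    	var winners atomic.Int32
    	var wg sync.WaitGroup

    	for i := 0; i < 4; i++ {
    		wg.Add(1)
    		go func() {
    			defer wg.Done()
    			// Only the goroutine that wins the CAS gets to act.
    			if done.CompareAndSwap(0, 1) {
    				winners.Add(1)
    			}
    		}()
    	}
    	wg.Wait()
    	fmt.Println("winners:", winners.Load()) // always 1
    }
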
 
diff --git a/src/runtime/checkptr_test.go b/src/runtime/checkptr_test.go
index 15011ec..811c0f0 100644
--- a/src/runtime/checkptr_test.go
+++ b/src/runtime/checkptr_test.go
@@ -39,6 +39,8 @@
 		{"CheckPtrSmall", "fatal error: checkptr: pointer arithmetic computed bad pointer value\n"},
 		{"CheckPtrSliceOK", ""},
 		{"CheckPtrSliceFail", "fatal error: checkptr: unsafe.Slice result straddles multiple allocations\n"},
+		{"CheckPtrStringOK", ""},
+		{"CheckPtrStringFail", "fatal error: checkptr: unsafe.String result straddles multiple allocations\n"},
 	}
 
 	for _, tc := range testCases {
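
The two new cases cover the unsafe.String checks that parallel the existing
unsafe.Slice ones: with checkptr instrumentation enabled (for example via
-race or -gcflags=all=-d=checkptr), a string header whose length runs past its
backing allocation is reported as straddling multiple allocations. A sketch of
the passing case only (the failing case is deliberately not reproduced, since
it terminates the program):

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    func main() {
    	buf := []byte("hello, world")

    	// OK: the resulting string covers bytes from a single allocation.
    	s := unsafe.String(&buf[0], 5)
    	fmt.Println(s) // "hello"

    	// Asking for a length larger than the allocation would trigger the
    	// "unsafe.String result straddles multiple allocations" fatal error
    	// that the CheckPtrStringFail case above expects.
    }
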
diff --git a/src/runtime/coverage/apis.go b/src/runtime/coverage/apis.go
new file mode 100644
index 0000000..7d851f9
--- /dev/null
+++ b/src/runtime/coverage/apis.go
@@ -0,0 +1,178 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import (
+	"fmt"
+	"internal/coverage"
+	"io"
+	"reflect"
+	"sync/atomic"
+	"unsafe"
+)
+
+// WriteMetaDir writes a coverage meta-data file for the currently
+// running program to the directory specified in 'dir'. An error will
+// be returned if the operation can't be completed successfully (for
+// example, if the currently running program was not built with
+// "-cover", or if the directory does not exist).
+func WriteMetaDir(dir string) error {
+	if !finalHashComputed {
+		return fmt.Errorf("error: no meta-data available (binary not built with -cover?)")
+	}
+	return emitMetaDataToDirectory(dir, getCovMetaList())
+}
+
+// WriteMeta writes the meta-data content (the payload that would
+// normally be emitted to a meta-data file) for the currently running
+// program to the writer 'w'. An error will be returned if the
+// operation can't be completed successfully (for example, if the
+// currently running program was not built with "-cover", or if a
+// write fails).
+func WriteMeta(w io.Writer) error {
+	if w == nil {
+		return fmt.Errorf("error: nil writer in WriteMeta")
+	}
+	if !finalHashComputed {
+		return fmt.Errorf("error: no meta-data available (binary not built with -cover?)")
+	}
+	ml := getCovMetaList()
+	return writeMetaData(w, ml, cmode, cgran, finalHash)
+}
+
+// WriteCountersDir writes a coverage counter-data file for the
+// currently running program to the directory specified in 'dir'. An
+// error will be returned if the operation can't be completed
+// successfully (for example, if the currently running program was not
+// built with "-cover", or if the directory does not exist). The
+// counter data written will be a snapshot taken at the point of the
+// call.
+func WriteCountersDir(dir string) error {
+	return emitCounterDataToDirectory(dir)
+}
+
+// WriteCounters writes coverage counter-data content for
+// the currently running program to the writer 'w'. An error will be
+// returned if the operation can't be completed successfully (for
+// example, if the currently running program was not built with
+// "-cover", or if a write fails). The counter data written will be a
+// snapshot taken at the point of the invocation.
+func WriteCounters(w io.Writer) error {
+	if w == nil {
+		return fmt.Errorf("error: nil writer in WriteCounters")
+	}
+	// Ask the runtime for the list of coverage counter symbols.
+	cl := getCovCounterList()
+	if len(cl) == 0 {
+		return fmt.Errorf("program not built with -cover")
+	}
+	if !finalHashComputed {
+		return fmt.Errorf("meta-data not written yet, unable to write counter data")
+	}
+
+	pm := getCovPkgMap()
+	s := &emitState{
+		counterlist: cl,
+		pkgmap:      pm,
+	}
+	return s.emitCounterDataToWriter(w)
+}
+
+// ClearCounters clears/resets all coverage counter variables in the
+// currently running program. It returns an error if the program in
+// question was not built with the "-cover" flag. Clearing of coverage
+// counters is also not supported for programs not using atomic
+// counter mode (see more detailed comments below for the rationale
+// here).
+func ClearCounters() error {
+	cl := getCovCounterList()
+	if len(cl) == 0 {
+		return fmt.Errorf("program not built with -cover")
+	}
+	if cmode != coverage.CtrModeAtomic {
+		return fmt.Errorf("ClearCounters invoked for program built with -covermode=%s (please use -covermode=atomic)", cmode.String())
+	}
+
+	// Implementation note: this function would be faster and simpler
+	// if we could just zero out the entire counter array, but for the
+	// moment we go through and zero out just the slots in the array
+	// corresponding to the counter values. We do this to avoid the
+	// following bad scenario: suppose that a user builds their Go
+	// program with "-cover", and that program has a function (call it
+	// main.XYZ) that invokes ClearCounters:
+	//
+	//     func XYZ() {
+	//       ... do some stuff ...
+	//       coverage.ClearCounters()
+	//       if someCondition {   <<--- HERE
+	//         ...
+	//       }
+	//     }
+	//
+	// At the point where ClearCounters executes, main.XYZ has not yet
+	// finished running, thus as soon as the call returns the line
+	// marked "HERE" above will trigger the writing of a non-zero
+	// value into main.XYZ's counter slab. However since we've just
+	// finished clearing the entire counter segment, we will have lost
+	// the values in the prolog portion of main.XYZ's counter slab
+	// (nctrs, pkgid, funcid). This means that later on at the end of
+	// program execution as we walk through the entire counter array
+	// for the program looking for executed functions, we'll zoom past
+	// main.XYZ's prolog (which was zero'd) and hit the non-zero
+	// counter value corresponding to the "HERE" block, which will
+	// then be interpreted as the start of another live function.
+	// Things will go downhill from there.
+	//
+	// This same scenario is also a potential risk if the program is
+	// running on an architecture that permits reordering of
+	// writes/stores, since the inconsistency described above could
+	// arise here. Example scenario:
+	//
+	//     func ABC() {
+	//       ...                    // prolog
+	//       if alwaysTrue() {
+	//         XYZ()                // counter update here
+	//       }
+	//     }
+	//
+	// In the instrumented version of ABC, the prolog of the function
+	// will contain a series of stores to the initial portion of the
+	// counter array to write number-of-counters, pkgid, funcid. Later
+	// in the function there is also a store to increment a counter
+	// for the block containing the call to XYZ(). If the CPU is
+	// allowed to reorder stores and decides to issue the XYZ store
+	// before the prolog stores, this could be observable as an
+	// inconsistency similar to the one above. Hence the requirement
+	// for atomic counter mode: according to package atomic docs,
+	// "...operations that happen in a specific order on one thread,
+	// will always be observed to happen in exactly that order by
+	// another thread". Thus we can be sure that there will be no
+	// inconsistency when reading the counter array from the thread
+	// running ClearCounters.
+
+	var sd []atomic.Uint32
+
+	bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&sd))
+	for _, c := range cl {
+		bufHdr.Data = uintptr(unsafe.Pointer(c.Counters))
+		bufHdr.Len = int(c.Len)
+		bufHdr.Cap = int(c.Len)
+		for i := 0; i < len(sd); i++ {
+			// Skip ahead until the next non-zero value.
+			sdi := sd[i].Load()
+			if sdi == 0 {
+				continue
+			}
+			// We found a function that was executed; clear its counters.
+			nCtrs := sdi
+			for j := 0; j < int(nCtrs); j++ {
+				sd[i+coverage.FirstCtrOffset+j].Store(0)
+			}
+			// Move to next function.
+			i += coverage.FirstCtrOffset + int(nCtrs) - 1
+		}
+	}
+	return nil
+}
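
Taken together, these entry points let a long-running binary that was built
with -cover snapshot its own coverage data without exiting. A sketch of how a
server might use them; the directory handling and logging here are
illustrative, not part of the API:

    package main

    import (
    	"log"
    	"os"
    	"runtime/coverage"
    )

    // snapshot writes a meta-data file and a counter-data snapshot into dir,
    // then resets the counters so the next snapshot only reflects activity
    // since this one. Resetting requires -covermode=atomic, as documented
    // above for ClearCounters.
    func snapshot(dir string) {
    	if err := coverage.WriteMetaDir(dir); err != nil {
    		log.Printf("coverage meta-data: %v", err)
    		return
    	}
    	if err := coverage.WriteCountersDir(dir); err != nil {
    		log.Printf("coverage counters: %v", err)
    		return
    	}
    	if err := coverage.ClearCounters(); err != nil {
    		log.Printf("coverage clear: %v", err)
    	}
    }

    func main() {
    	if dir := os.Getenv("GOCOVERDIR"); dir != "" {
    		snapshot(dir)
    	}
    }
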
diff --git a/src/runtime/coverage/dummy.s b/src/runtime/coverage/dummy.s
new file mode 100644
index 0000000..7592859
--- /dev/null
+++ b/src/runtime/coverage/dummy.s
@@ -0,0 +1,8 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The runtime package uses //go:linkname to push a few functions into this
+// package but we still need a .s file so the Go tool does not pass -complete
+// to 'go tool compile' so the latter does not complain about Go functions
+// with no bodies.
diff --git a/src/runtime/coverage/emit.go b/src/runtime/coverage/emit.go
new file mode 100644
index 0000000..2aed99c
--- /dev/null
+++ b/src/runtime/coverage/emit.go
@@ -0,0 +1,667 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import (
+	"crypto/md5"
+	"fmt"
+	"internal/coverage"
+	"internal/coverage/encodecounter"
+	"internal/coverage/encodemeta"
+	"internal/coverage/rtcov"
+	"io"
+	"os"
+	"path/filepath"
+	"reflect"
+	"runtime"
+	"sync/atomic"
+	"time"
+	"unsafe"
+)
+
+// This file contains functions that support the writing of data files
+// emitted at the end of code coverage testing runs, from instrumented
+// executables.
+
+// getCovMetaList returns a list of meta-data blobs registered
+// for the currently executing instrumented program. It is defined in the
+// runtime.
+func getCovMetaList() []rtcov.CovMetaBlob
+
+// getCovCounterList returns a list of counter-data blobs registered
+// for the currently executing instrumented program. It is defined in the
+// runtime.
+func getCovCounterList() []rtcov.CovCounterBlob
+
+// getCovPkgMap returns a map storing the remapped package IDs for
+// hard-coded runtime packages (see internal/coverage/pkgid.go for
+// more on why hard-coded package IDs are needed). This function
+// is defined in the runtime.
+func getCovPkgMap() map[int]int
+
+// emitState holds useful state information during the emit process.
+//
+// When an instrumented program finishes execution and starts the
+// process of writing out coverage data, it's possible that a
+// meta-data file already exists in the output directory. In
+// this case openOutputFiles() below will leave the 'mf' field
+// as nil. If a new meta-data file is needed, field 'mfname' will be
+// the final desired path of the meta file, 'mftmp' will be a
+// temporary file, and 'mf' will be an open os.File pointer for
+// 'mftmp'. The meta-data file payload will be written to 'mf', the
+// temp file will be then closed and renamed (from 'mftmp' to
+// 'mfname'), so as to ensure that the meta-data file is created
+// atomically; we want this so that things work smoothly in cases
+// where there are several instances of a given instrumented program
+// all terminating at the same time and trying to create meta-data
+// files simultaneously.
+//
+// For counter data files there is less chance of a collision, hence
+// openOutputFiles() records the counter data file path in 'cfname' and
+// then places the open *os.File in 'cf'.
+type emitState struct {
+	mfname string   // path of final meta-data output file
+	mftmp  string   // path to meta-data temp file (if needed)
+	mf     *os.File // open os.File for meta-data temp file
+	cfname string   // path of final counter data file
+	cftmp  string   // path to counter data temp file
+	cf     *os.File // open os.File for counter data file
+	outdir string   // output directory
+
+	// List of meta-data symbols obtained from the runtime
+	metalist []rtcov.CovMetaBlob
+
+	// List of counter-data symbols obtained from the runtime
+	counterlist []rtcov.CovCounterBlob
+
+	// Table to use for remapping hard-coded pkg ids.
+	pkgmap map[int]int
+
+	// emit debug trace output
+	debug bool
+}
+
+var (
+	// finalHash is computed at init time from the list of meta-data
+	// symbols registered during init. It is used both for writing the
+	// meta-data file and counter-data files.
+	finalHash [16]byte
+	// Set to true when we've computed finalHash + finalMetaLen.
+	finalHashComputed bool
+	// Total meta-data length.
+	finalMetaLen uint64
+	// Records whether we've already attempted to write meta-data.
+	metaDataEmitAttempted bool
+	// Counter mode for this instrumented program run.
+	cmode coverage.CounterMode
+	// Counter granularity for this instrumented program run.
+	cgran coverage.CounterGranularity
+	// Cached value of GOCOVERDIR environment variable.
+	goCoverDir string
+	// Copy of os.Args made at init time, converted into map format.
+	capturedOsArgs map[string]string
+	// Flag used in tests to signal that coverage data has already been written.
+	covProfileAlreadyEmitted bool
+)
+
+// fileType is used to select between counter-data files and
+// meta-data files.
+type fileType int
+
+const (
+	noFile = 1 << iota
+	metaDataFile
+	counterDataFile
+)
+
+// emitMetaData emits the meta-data output file for this coverage run.
+// This entry point is intended to be invoked by the compiler from
+// an instrumented program's main package init func.
+func emitMetaData() {
+	if covProfileAlreadyEmitted {
+		return
+	}
+	ml, err := prepareForMetaEmit()
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "error: coverage meta-data prep failed: %v\n", err)
+		if os.Getenv("GOCOVERDEBUG") != "" {
+			panic("meta-data write failure")
+		}
+	}
+	if len(ml) == 0 {
+		fmt.Fprintf(os.Stderr, "program not built with -cover\n")
+		return
+	}
+
+	goCoverDir = os.Getenv("GOCOVERDIR")
+	if goCoverDir == "" {
+		fmt.Fprintf(os.Stderr, "warning: GOCOVERDIR not set, no coverage data emitted\n")
+		return
+	}
+
+	if err := emitMetaDataToDirectory(goCoverDir, ml); err != nil {
+		fmt.Fprintf(os.Stderr, "error: coverage meta-data emit failed: %v\n", err)
+		if os.Getenv("GOCOVERDEBUG") != "" {
+			panic("meta-data write failure")
+		}
+	}
+}
+
+func modeClash(m coverage.CounterMode) bool {
+	if m == coverage.CtrModeRegOnly || m == coverage.CtrModeTestMain {
+		return false
+	}
+	if cmode == coverage.CtrModeInvalid {
+		cmode = m
+		return false
+	}
+	return cmode != m
+}
+
+func granClash(g coverage.CounterGranularity) bool {
+	if cgran == coverage.CtrGranularityInvalid {
+		cgran = g
+		return false
+	}
+	return cgran != g
+}
+
+// prepareForMetaEmit performs preparatory steps needed prior to
+// emitting a meta-data file, notably computing a final hash of
+// all meta-data blobs and capturing os args.
+func prepareForMetaEmit() ([]rtcov.CovMetaBlob, error) {
+	// Ask the runtime for the list of coverage meta-data symbols.
+	ml := getCovMetaList()
+
+	// In the normal case (go build -o prog.exe ... ; ./prog.exe)
+	// len(ml) will always be non-zero, but we check here since at
+	// some point this function will be reachable via user-callable
+	// APIs (for example, to write out coverage data from a server
+	// program that doesn't ever call os.Exit).
+	if len(ml) == 0 {
+		return nil, nil
+	}
+
+	s := &emitState{
+		metalist: ml,
+		debug:    os.Getenv("GOCOVERDEBUG") != "",
+	}
+
+	// Capture os.Args() now so as to avoid issues if args
+	// are rewritten during program execution.
+	capturedOsArgs = captureOsArgs()
+
+	if s.debug {
+		fmt.Fprintf(os.Stderr, "=+= GOCOVERDIR is %s\n", os.Getenv("GOCOVERDIR"))
+		fmt.Fprintf(os.Stderr, "=+= contents of covmetalist:\n")
+		for k, b := range ml {
+			fmt.Fprintf(os.Stderr, "=+= slot: %d path: %s ", k, b.PkgPath)
+			if b.PkgID != -1 {
+				fmt.Fprintf(os.Stderr, " hcid: %d", b.PkgID)
+			}
+			fmt.Fprintf(os.Stderr, "\n")
+		}
+		pm := getCovPkgMap()
+		fmt.Fprintf(os.Stderr, "=+= remap table:\n")
+		for from, to := range pm {
+			fmt.Fprintf(os.Stderr, "=+= from %d to %d\n",
+				uint32(from), uint32(to))
+		}
+	}
+
+	h := md5.New()
+	tlen := uint64(unsafe.Sizeof(coverage.MetaFileHeader{}))
+	for _, entry := range ml {
+		if _, err := h.Write(entry.Hash[:]); err != nil {
+			return nil, err
+		}
+		tlen += uint64(entry.Len)
+		ecm := coverage.CounterMode(entry.CounterMode)
+		if modeClash(ecm) {
+			return nil, fmt.Errorf("coverage counter mode clash: package %s uses mode=%d, but package %s uses mode=%s\n", ml[0].PkgPath, cmode, entry.PkgPath, ecm)
+		}
+		ecg := coverage.CounterGranularity(entry.CounterGranularity)
+		if granClash(ecg) {
+			return nil, fmt.Errorf("coverage counter granularity clash: package %s uses gran=%d, but package %s uses gran=%s\n", ml[0].PkgPath, cgran, entry.PkgPath, ecg)
+		}
+	}
+
+	// Hash mode and granularity as well.
+	h.Write([]byte(cmode.String()))
+	h.Write([]byte(cgran.String()))
+
+	// Compute final digest.
+	fh := h.Sum(nil)
+	copy(finalHash[:], fh)
+	finalHashComputed = true
+	finalMetaLen = tlen
+
+	return ml, nil
+}
+
+// emitMetaDataToDirectory emits the meta-data output file to the specified
+// directory, returning an error if something went wrong.
+func emitMetaDataToDirectory(outdir string, ml []rtcov.CovMetaBlob) error {
+	ml, err := prepareForMetaEmit()
+	if err != nil {
+		return err
+	}
+	if len(ml) == 0 {
+		return nil
+	}
+
+	metaDataEmitAttempted = true
+
+	s := &emitState{
+		metalist: ml,
+		debug:    os.Getenv("GOCOVERDEBUG") != "",
+		outdir:   outdir,
+	}
+
+	// Open output files.
+	if err := s.openOutputFiles(finalHash, finalMetaLen, metaDataFile); err != nil {
+		return err
+	}
+
+	// Emit meta-data file only if needed (may already be present).
+	if s.needMetaDataFile() {
+		if err := s.emitMetaDataFile(finalHash, finalMetaLen); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// emitCounterData emits the counter data output file for this coverage run.
+// This entry point is intended to be invoked by the runtime when an
+// instrumented program is terminating or calling os.Exit().
+func emitCounterData() {
+	if goCoverDir == "" || !finalHashComputed || covProfileAlreadyEmitted {
+		return
+	}
+	if err := emitCounterDataToDirectory(goCoverDir); err != nil {
+		fmt.Fprintf(os.Stderr, "error: coverage counter data emit failed: %v\n", err)
+		if os.Getenv("GOCOVERDEBUG") != "" {
+			panic("counter-data write failure")
+		}
+	}
+}
+
+// emitCounterDataToDirectory emits the counter-data output file for this
+// coverage run.
+func emitCounterDataToDirectory(outdir string) error {
+	// Ask the runtime for the list of coverage counter symbols.
+	cl := getCovCounterList()
+	if len(cl) == 0 {
+		// no work to do here.
+		return nil
+	}
+
+	if !finalHashComputed {
+		return fmt.Errorf("error: meta-data not available (binary not built with -cover?)")
+	}
+
+	// Ask the runtime for the list of coverage counter symbols.
+	pm := getCovPkgMap()
+	s := &emitState{
+		counterlist: cl,
+		pkgmap:      pm,
+		outdir:      outdir,
+		debug:       os.Getenv("GOCOVERDEBUG") != "",
+	}
+
+	// Open output file.
+	if err := s.openOutputFiles(finalHash, finalMetaLen, counterDataFile); err != nil {
+		return err
+	}
+	if s.cf == nil {
+		return fmt.Errorf("counter data output file open failed (no additional info)")
+	}
+
+	// Emit counter data file.
+	if err := s.emitCounterDataFile(finalHash, s.cf); err != nil {
+		return err
+	}
+	if err := s.cf.Close(); err != nil {
+		return fmt.Errorf("closing counter data file: %v", err)
+	}
+
+	// Counter file has now been closed. Rename the temp to the
+	// final desired path.
+	if err := os.Rename(s.cftmp, s.cfname); err != nil {
+		return fmt.Errorf("writing %s: rename from %s failed: %v\n", s.cfname, s.cftmp, err)
+	}
+
+	return nil
+}
+
+// emitCounterDataToWriter emits counter data for this coverage run to an
+// io.Writer.
+func (s *emitState) emitCounterDataToWriter(w io.Writer) error {
+	if err := s.emitCounterDataFile(finalHash, w); err != nil {
+		return err
+	}
+	return nil
+}
+
+// openMetaFile determines whether we need to emit a meta-data output
+// file, or whether we can reuse the existing file in the coverage out
+// dir. It updates mfname/mftmp/mf fields in 's', returning an error
+// if something went wrong. See the comment on the emitState type
+// definition above for more on how file opening is managed.
+func (s *emitState) openMetaFile(metaHash [16]byte, metaLen uint64) error {
+
+	// Open meta-outfile for reading to see if it exists.
+	fn := fmt.Sprintf("%s.%x", coverage.MetaFilePref, metaHash)
+	s.mfname = filepath.Join(s.outdir, fn)
+	fi, err := os.Stat(s.mfname)
+	if err != nil || fi.Size() != int64(metaLen) {
+		// We need a new meta-file.
+		tname := "tmp." + fn + fmt.Sprintf("%d", time.Now().UnixNano())
+		s.mftmp = filepath.Join(s.outdir, tname)
+		s.mf, err = os.Create(s.mftmp)
+		if err != nil {
+			return fmt.Errorf("creating meta-data file %s: %v", s.mftmp, err)
+		}
+	}
+	return nil
+}
+
+// openCounterFile opens an output file for the counter data portion
+// of a test coverage run. It updates the 'cfname' and 'cf' fields in
+// 's', returning an error if something went wrong.
+func (s *emitState) openCounterFile(metaHash [16]byte) error {
+	processID := os.Getpid()
+	fn := fmt.Sprintf(coverage.CounterFileTempl, coverage.CounterFilePref, metaHash, processID, time.Now().UnixNano())
+	s.cfname = filepath.Join(s.outdir, fn)
+	s.cftmp = filepath.Join(s.outdir, "tmp."+fn)
+	var err error
+	s.cf, err = os.Create(s.cftmp)
+	if err != nil {
+		return fmt.Errorf("creating counter data file %s: %v", s.cftmp, err)
+	}
+	return nil
+}
+
+// openOutputFiles opens output files in preparation for emitting
+// coverage data. In the case of the meta-data file, openOutputFiles
+// may determine that we can reuse an existing meta-data file in the
+// outdir, in which case it will leave the 'mf' field in the state
+// struct as nil. If a new meta-file is needed, the field 'mfname'
+// will be the final desired path of the meta file, 'mftmp' will be a
+// temporary file, and 'mf' will be an open os.File pointer for
+// 'mftmp'. The idea is that the client/caller will write content into
+// 'mf', close it, and then rename 'mftmp' to 'mfname'. This function
+// also opens the counter data output file, setting 'cf' and 'cfname'
+// in the state struct.
+func (s *emitState) openOutputFiles(metaHash [16]byte, metaLen uint64, which fileType) error {
+	fi, err := os.Stat(s.outdir)
+	if err != nil {
+		return fmt.Errorf("output directory %q inaccessible (err: %v); no coverage data written", s.outdir, err)
+	}
+	if !fi.IsDir() {
+		return fmt.Errorf("output directory %q not a directory; no coverage data written", s.outdir)
+	}
+
+	if (which & metaDataFile) != 0 {
+		if err := s.openMetaFile(metaHash, metaLen); err != nil {
+			return err
+		}
+	}
+	if (which & counterDataFile) != 0 {
+		if err := s.openCounterFile(metaHash); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// emitMetaDataFile emits coverage meta-data to a previously opened
+// temporary file (s.mftmp), then renames the generated file to the
+// final path (s.mfname).
+func (s *emitState) emitMetaDataFile(finalHash [16]byte, tlen uint64) error {
+	if err := writeMetaData(s.mf, s.metalist, cmode, cgran, finalHash); err != nil {
+		return fmt.Errorf("writing %s: %v\n", s.mftmp, err)
+	}
+	if err := s.mf.Close(); err != nil {
+		return fmt.Errorf("closing meta data temp file: %v", err)
+	}
+
+	// Temp file has now been flushed and closed. Rename the temp to the
+	// final desired path.
+	if err := os.Rename(s.mftmp, s.mfname); err != nil {
+		return fmt.Errorf("writing %s: rename from %s failed: %v\n", s.mfname, s.mftmp, err)
+	}
+
+	return nil
+}
+
+// needMetaDataFile returns TRUE if we need to emit a meta-data file
+// for this program run. It should be used only after
+// openOutputFiles() has been invoked.
+func (s *emitState) needMetaDataFile() bool {
+	return s.mf != nil
+}
+
+func writeMetaData(w io.Writer, metalist []rtcov.CovMetaBlob, cmode coverage.CounterMode, gran coverage.CounterGranularity, finalHash [16]byte) error {
+	mfw := encodemeta.NewCoverageMetaFileWriter("<io.Writer>", w)
+
+	// Note: "sd" is re-initialized on each iteration of the loop
+	// below, and would normally be declared inside the loop, but
+	// is placed here due to escape analysis, since we capture it in bufHdr.
+	var sd []byte
+	bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&sd))
+
+	var blobs [][]byte
+	for _, e := range metalist {
+		bufHdr.Data = uintptr(unsafe.Pointer(e.P))
+		bufHdr.Len = int(e.Len)
+		bufHdr.Cap = int(e.Len)
+		blobs = append(blobs, sd)
+	}
+	return mfw.Write(finalHash, blobs, cmode, gran)
+}
+
+func (s *emitState) NumFuncs() (int, error) {
+	var sd []atomic.Uint32
+	bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&sd))
+
+	totalFuncs := 0
+	for _, c := range s.counterlist {
+		bufHdr.Data = uintptr(unsafe.Pointer(c.Counters))
+		bufHdr.Len = int(c.Len)
+		bufHdr.Cap = int(c.Len)
+		for i := 0; i < len(sd); i++ {
+			// Skip ahead until the next non-zero value.
+			sdi := sd[i].Load()
+			if sdi == 0 {
+				continue
+			}
+
+			// We found a function that was executed.
+			nCtrs := sdi
+
+			// Check to make sure that we have at least one live
+			// counter. See the implementation note in ClearCoverageCounters
+			// for a description of why this is needed.
+			isLive := false
+			st := i + coverage.FirstCtrOffset
+			counters := sd[st : st+int(nCtrs)]
+			for i := 0; i < len(counters); i++ {
+				if counters[i].Load() != 0 {
+					isLive = true
+					break
+				}
+			}
+			if !isLive {
+				// Skip this function.
+				i += coverage.FirstCtrOffset + int(nCtrs) - 1
+				continue
+			}
+
+			totalFuncs++
+
+			// Move to the next function.
+			i += coverage.FirstCtrOffset + int(nCtrs) - 1
+		}
+	}
+	return totalFuncs, nil
+}
+
+func (s *emitState) VisitFuncs(f encodecounter.CounterVisitorFn) error {
+	var sd []atomic.Uint32
+	var tcounters []uint32
+	bufHdr := (*reflect.SliceHeader)(unsafe.Pointer(&sd))
+
+	rdCounters := func(actrs []atomic.Uint32, ctrs []uint32) []uint32 {
+		ctrs = ctrs[:0]
+		for i := range actrs {
+			ctrs = append(ctrs, actrs[i].Load())
+		}
+		return ctrs
+	}
+
+	dpkg := uint32(0)
+	for _, c := range s.counterlist {
+		bufHdr.Data = uintptr(unsafe.Pointer(c.Counters))
+		bufHdr.Len = int(c.Len)
+		bufHdr.Cap = int(c.Len)
+		for i := 0; i < len(sd); i++ {
+			// Skip ahead until the next non-zero value.
+			sdi := sd[i].Load()
+			if sdi == 0 {
+				continue
+			}
+
+			// We found a function that was executed.
+			nCtrs := sd[i+coverage.NumCtrsOffset].Load()
+			pkgId := sd[i+coverage.PkgIdOffset].Load()
+			funcId := sd[i+coverage.FuncIdOffset].Load()
+			cst := i + coverage.FirstCtrOffset
+			counters := sd[cst : cst+int(nCtrs)]
+
+			// Check to make sure that we have at least one live
+			// counter. See the implementation note in ClearCoverageCounters
+			// for a description of why this is needed.
+			isLive := false
+			for i := 0; i < len(counters); i++ {
+				if counters[i].Load() != 0 {
+					isLive = true
+					break
+				}
+			}
+			if !isLive {
+				// Skip this function.
+				i += coverage.FirstCtrOffset + int(nCtrs) - 1
+				continue
+			}
+
+			if s.debug {
+				if pkgId != dpkg {
+					dpkg = pkgId
+					fmt.Fprintf(os.Stderr, "\n=+= %d: pk=%d visit live fcn",
+						i, pkgId)
+				}
+				fmt.Fprintf(os.Stderr, " {i=%d F%d NC%d}", i, funcId, nCtrs)
+			}
+
+			// Vet and/or fix up package ID. A package ID of zero
+			// indicates that there is some new package X that is a
+			// runtime dependency, and this package has code that
+			// executes before its corresponding init package runs.
+			// This is a fatal error that we should only see during
+			// Go development (e.g. tip).
+			ipk := int32(pkgId)
+			if ipk == 0 {
+				fmt.Fprintf(os.Stderr, "\n")
+				reportErrorInHardcodedList(int32(i), ipk, funcId, nCtrs)
+			} else if ipk < 0 {
+				if newId, ok := s.pkgmap[int(ipk)]; ok {
+					pkgId = uint32(newId)
+				} else {
+					fmt.Fprintf(os.Stderr, "\n")
+					reportErrorInHardcodedList(int32(i), ipk, funcId, nCtrs)
+				}
+			} else {
+				// The package ID value stored in the counter array
+				// has 1 added to it (so as to preclude the
+				// possibility of a zero value; see
+				// runtime.addCovMeta), so subtract off 1 here to form
+				// the real package ID.
+				pkgId--
+			}
+
+			tcounters = rdCounters(counters, tcounters)
+			if err := f(pkgId, funcId, tcounters); err != nil {
+				return err
+			}
+
+			// Skip over this function.
+			i += coverage.FirstCtrOffset + int(nCtrs) - 1
+		}
+		if s.debug {
+			fmt.Fprintf(os.Stderr, "\n")
+		}
+	}
+	return nil
+}
+
+// captureOsArgs converts os.Args() into the format we use to store
+// this info in the counter data file (counter data file "args"
+// section is a generic key-value collection). See the 'args' section
+// in internal/coverage/defs.go for more info. The args map is also
+// used to capture the GOOS and GOARCH values for the run.
+func captureOsArgs() map[string]string {
+	m := make(map[string]string)
+	m["argc"] = fmt.Sprintf("%d", len(os.Args))
+	for k, a := range os.Args {
+		m[fmt.Sprintf("argv%d", k)] = a
+	}
+	m["GOOS"] = runtime.GOOS
+	m["GOARCH"] = runtime.GOARCH
+	return m
+}
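+
+// For example (illustrative), a run of "prog -x" on linux/amd64 would
+// produce a map along the lines of:
+//
+//	{"argc": "2", "argv0": "prog", "argv1": "-x", "GOOS": "linux", "GOARCH": "amd64"}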
+
+// emitCounterDataFile emits the counter data portion of a
+// coverage output file (to the file 's.cf').
+func (s *emitState) emitCounterDataFile(finalHash [16]byte, w io.Writer) error {
+	cfw := encodecounter.NewCoverageDataWriter(w, coverage.CtrULeb128)
+	if err := cfw.Write(finalHash, capturedOsArgs, s); err != nil {
+		return err
+	}
+	return nil
+}
+
+// markProfileEmitted signals the runtime/coverage machinery that
+// coverage data output files have already been written out, and there
+// is no need to take any additional action at exit time. This
+// function is called (via linknamed reference) from the
+// coverage-related boilerplate code in _testmain.go emitted for go
+// unit tests.
+func markProfileEmitted(val bool) {
+	covProfileAlreadyEmitted = val
+}
+
+func reportErrorInHardcodedList(slot, pkgID int32, fnID, nCtrs uint32) {
+	metaList := getCovMetaList()
+	pkgMap := getCovPkgMap()
+
+	println("internal error in coverage meta-data tracking:")
+	println("encountered bad pkgID:", pkgID, " at slot:", slot,
+		" fnID:", fnID, " numCtrs:", nCtrs)
+	println("list of hard-coded runtime package IDs needs revising.")
+	println("[see the comment on the 'rtPkgs' var in ")
+	println(" <goroot>/src/internal/coverage/pkid.go]")
+	println("registered list:")
+	for k, b := range metaList {
+		print("slot: ", k, " path='", b.PkgPath, "' ")
+		if b.PkgID != -1 {
+			print(" hard-coded id: ", b.PkgID)
+		}
+		println("")
+	}
+	println("remap table:")
+	for from, to := range pkgMap {
+		println("from ", from, " to ", to)
+	}
+}
diff --git a/src/runtime/coverage/emitdata_test.go b/src/runtime/coverage/emitdata_test.go
new file mode 100644
index 0000000..3839e44
--- /dev/null
+++ b/src/runtime/coverage/emitdata_test.go
@@ -0,0 +1,451 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import (
+	"fmt"
+	"internal/coverage"
+	"internal/goexperiment"
+	"internal/platform"
+	"internal/testenv"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+// Set to true for debugging (linux only).
+const fixedTestDir = false
+
+func TestCoverageApis(t *testing.T) {
+	if testing.Short() {
+		t.Skipf("skipping test: too long for short mode")
+	}
+	if !goexperiment.CoverageRedesign {
+		t.Skipf("skipping new coverage tests (experiment not enabled)")
+	}
+	testenv.MustHaveGoBuild(t)
+	dir := t.TempDir()
+	if fixedTestDir {
+		dir = "/tmp/qqqzzz"
+		os.RemoveAll(dir)
+		mkdir(t, dir)
+	}
+
+	// Build harness.
+	bdir := mkdir(t, filepath.Join(dir, "build"))
+	hargs := []string{"-cover", "-coverpkg=all"}
+	if testing.CoverMode() != "" {
+		hargs = append(hargs, "-covermode="+testing.CoverMode())
+	}
+	harnessPath := buildHarness(t, bdir, hargs)
+
+	t.Logf("harness path is %s", harnessPath)
+
+	// Sub-tests for each API we want to inspect, plus
+	// extras for error testing.
+	t.Run("emitToDir", func(t *testing.T) {
+		t.Parallel()
+		testEmitToDir(t, harnessPath, dir)
+	})
+	t.Run("emitToWriter", func(t *testing.T) {
+		t.Parallel()
+		testEmitToWriter(t, harnessPath, dir)
+	})
+	t.Run("emitToNonexistentDir", func(t *testing.T) {
+		t.Parallel()
+		testEmitToNonexistentDir(t, harnessPath, dir)
+	})
+	t.Run("emitToNilWriter", func(t *testing.T) {
+		t.Parallel()
+		testEmitToNilWriter(t, harnessPath, dir)
+	})
+	t.Run("emitToFailingWriter", func(t *testing.T) {
+		t.Parallel()
+		testEmitToFailingWriter(t, harnessPath, dir)
+	})
+	t.Run("emitWithCounterClear", func(t *testing.T) {
+		t.Parallel()
+		testEmitWithCounterClear(t, harnessPath, dir)
+	})
+
+}
+
+// upmergeCoverData helps improve coverage data for this package
+// itself. If this test itself is being invoked with "-cover", then
+// what we'd like is for package coverage data (that is, coverage for
+// routines in "runtime/coverage") to be incorporated into the test
+// run from the "harness.exe" runs we've just done. We can accomplish
+// this by doing a merge from the harness gocoverdir's to the test
+// gocoverdir.
+func upmergeCoverData(t *testing.T, gocoverdir string) {
+	if testing.CoverMode() == "" {
+		return
+	}
+	testGoCoverDir := os.Getenv("GOCOVERDIR")
+	if testGoCoverDir == "" {
+		return
+	}
+	args := []string{"tool", "covdata", "merge", "-pkg=runtime/coverage",
+		"-o", testGoCoverDir, "-i", gocoverdir}
+	t.Logf("up-merge of covdata from %s to %s", gocoverdir, testGoCoverDir)
+	t.Logf("executing: go %+v", args)
+	cmd := exec.Command(testenv.GoToolPath(t), args...)
+	if b, err := cmd.CombinedOutput(); err != nil {
+		t.Fatalf("covdata merge failed (%v): %s", err, b)
+	}
+}
+
+// buildHarness builds the helper program "harness.exe".
+func buildHarness(t *testing.T, dir string, opts []string) string {
+	harnessPath := filepath.Join(dir, "harness.exe")
+	harnessSrc := filepath.Join("testdata", "harness.go")
+	args := []string{"build", "-o", harnessPath}
+	args = append(args, opts...)
+	args = append(args, harnessSrc)
+	//t.Logf("harness build: go %+v\n", args)
+	cmd := exec.Command(testenv.GoToolPath(t), args...)
+	if b, err := cmd.CombinedOutput(); err != nil {
+		t.Fatalf("build failed (%v): %s", err, b)
+	}
+	return harnessPath
+}
+
+func mkdir(t *testing.T, d string) string {
+	t.Helper()
+	if err := os.Mkdir(d, 0777); err != nil {
+		t.Fatalf("mkdir failed: %v", err)
+	}
+	return d
+}
+
+// updateGoCoverDir updates the specified environment 'env' to set
+// GOCOVERDIR to 'gcd' (if setGoCoverDir is true) or removes
+// GOCOVERDIR from the environment (if setGoCoverDir is false).
+func updateGoCoverDir(env []string, gcd string, setGoCoverDir bool) []string {
+	rv := []string{}
+	found := false
+	for _, v := range env {
+		if strings.HasPrefix(v, "GOCOVERDIR=") {
+			if !setGoCoverDir {
+				continue
+			}
+			v = "GOCOVERDIR=" + gcd
+			found = true
+		}
+		rv = append(rv, v)
+	}
+	if !found && setGoCoverDir {
+		rv = append(rv, "GOCOVERDIR="+gcd)
+	}
+	return rv
+}
+
+func runHarness(t *testing.T, harnessPath string, tp string, setGoCoverDir bool, rdir, edir string) (string, error) {
+	t.Logf("running: %s -tp %s -o %s with rdir=%s and GOCOVERDIR=%v", harnessPath, tp, edir, rdir, setGoCoverDir)
+	cmd := exec.Command(harnessPath, "-tp", tp, "-o", edir)
+	cmd.Dir = rdir
+	cmd.Env = updateGoCoverDir(os.Environ(), rdir, setGoCoverDir)
+	b, err := cmd.CombinedOutput()
+	//t.Logf("harness run output: %s\n", string(b))
+	return string(b), err
+}
+
+func testForSpecificFunctions(t *testing.T, dir string, want []string, avoid []string) string {
+	args := []string{"tool", "covdata", "debugdump",
+		"-live", "-pkg=command-line-arguments", "-i=" + dir}
+	t.Logf("running: go %v\n", args)
+	cmd := exec.Command(testenv.GoToolPath(t), args...)
+	b, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("go tool covdata failed (%v): %s", err, b)
+	}
+	output := string(b)
+	rval := ""
+	for _, f := range want {
+		wf := "Func: " + f + "\n"
+		if strings.Contains(output, wf) {
+			continue
+		}
+		rval += fmt.Sprintf("error: output should contain %q but does not\n", wf)
+	}
+	for _, f := range avoid {
+		wf := "Func: " + f + "\n"
+		if strings.Contains(output, wf) {
+			rval += fmt.Sprintf("error: output should not contain %q but does\n", wf)
+		}
+	}
+	if rval != "" {
+		t.Logf("=-= begin output:\n" + output + "\n=-= end output\n")
+	}
+	return rval
+}
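+
+// The check above is purely textual: "go tool covdata debugdump -live"
+// prints one "Func: <name>" line per function with live counters, so a
+// successful emitToWriter run, for example, is expected to contain lines
+// like:
+//
+//	Func: main
+//	Func: emitToWriter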
+
+func withAndWithoutRunner(f func(setit bool, tag string)) {
+	// Run 'f' with and without GOCOVERDIR set.
+	for i := 0; i < 2; i++ {
+		tag := "x"
+		setGoCoverDir := true
+		if i == 0 {
+			setGoCoverDir = false
+			tag = "y"
+		}
+		f(setGoCoverDir, tag)
+	}
+}
+
+func mktestdirs(t *testing.T, tag, tp, dir string) (string, string) {
+	t.Helper()
+	rdir := mkdir(t, filepath.Join(dir, tp+"-rdir-"+tag))
+	edir := mkdir(t, filepath.Join(dir, tp+"-edir-"+tag))
+	return rdir, edir
+}
+
+func testEmitToDir(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		tp := "emitToDir"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+		output, err := runHarness(t, harnessPath, tp,
+			setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+
+		// Just check to make sure meta-data file and counter data file were
+		// written. An alternative would be to run "go tool covdata"
+		// or equivalent, but for now, this is what we've got.
+		dents, err := os.ReadDir(edir)
+		if err != nil {
+			t.Fatalf("os.ReadDir(%s) failed: %v", edir, err)
+		}
+		mfc := 0
+		cdc := 0
+		for _, e := range dents {
+			if e.IsDir() {
+				continue
+			}
+			if strings.HasPrefix(e.Name(), coverage.MetaFilePref) {
+				mfc++
+			} else if strings.HasPrefix(e.Name(), coverage.CounterFilePref) {
+				cdc++
+			}
+		}
+		wantmf := 1
+		wantcf := 1
+		if mfc != wantmf {
+			t.Errorf("EmitToDir: want %d meta-data files, got %d\n", wantmf, mfc)
+		}
+		if cdc != wantcf {
+			t.Errorf("EmitToDir: want %d counter-data files, got %d\n", wantcf, cdc)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitToWriter(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		tp := "emitToWriter"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+		output, err := runHarness(t, harnessPath, tp, setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		want := []string{"main", tp}
+		avoid := []string{"final"}
+		if msg := testForSpecificFunctions(t, edir, want, avoid); msg != "" {
+			t.Errorf("coverage data from %q output match failed: %s", tp, msg)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitToNonexistentDir(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		tp := "emitToNonexistentDir"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+		output, err := runHarness(t, harnessPath, tp, setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitToUnwritableDir(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+
+		tp := "emitToUnwritableDir"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+
+		// Make edir unwritable.
+		if err := os.Chmod(edir, 0555); err != nil {
+			t.Fatalf("chmod failed: %v", err)
+		}
+		defer os.Chmod(edir, 0777)
+
+		output, err := runHarness(t, harnessPath, tp, setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitToNilWriter(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		tp := "emitToNilWriter"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+		output, err := runHarness(t, harnessPath, tp, setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitToFailingWriter(t *testing.T, harnessPath string, dir string) {
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		tp := "emitToFailingWriter"
+		rdir, edir := mktestdirs(t, tag, tp, dir)
+		output, err := runHarness(t, harnessPath, tp, setGoCoverDir, rdir, edir)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		upmergeCoverData(t, edir)
+		upmergeCoverData(t, rdir)
+	})
+}
+
+func testEmitWithCounterClear(t *testing.T, harnessPath string, dir string) {
+	// Ensure that we have two versions of the harness: one built with
+	// -covermode=atomic and one built with -covermode=set (we need
+	// both modes to test all of the functionality).
+	var nonatomicHarnessPath, atomicHarnessPath string
+	if testing.CoverMode() != "atomic" {
+		nonatomicHarnessPath = harnessPath
+		bdir2 := mkdir(t, filepath.Join(dir, "build2"))
+		hargs := []string{"-covermode=atomic", "-coverpkg=all"}
+		atomicHarnessPath = buildHarness(t, bdir2, hargs)
+	} else {
+		atomicHarnessPath = harnessPath
+		mode := "set"
+		if testing.CoverMode() != "" && testing.CoverMode() != "atomic" {
+			mode = testing.CoverMode()
+		}
+		// Build a special nonatomic covermode version of the harness
+		// (we need both modes to test all of the functionality).
+		bdir2 := mkdir(t, filepath.Join(dir, "build2"))
+		hargs := []string{"-covermode=" + mode, "-coverpkg=all"}
+		nonatomicHarnessPath = buildHarness(t, bdir2, hargs)
+	}
+
+	withAndWithoutRunner(func(setGoCoverDir bool, tag string) {
+		// First a run with the nonatomic harness path, which we
+		// expect to fail.
+		tp := "emitWithCounterClear"
+		rdir1, edir1 := mktestdirs(t, tag, tp+"1", dir)
+		output, err := runHarness(t, nonatomicHarnessPath, tp,
+			setGoCoverDir, rdir1, edir1)
+		if err == nil {
+			t.Logf("%s", output)
+			t.Fatalf("running '%s -tp %s': unexpected success",
+				nonatomicHarnessPath, tp)
+		}
+
+		// Next a run with the atomic harness path, which we
+		// expect to succeed.
+		rdir2, edir2 := mktestdirs(t, tag, tp+"2", dir)
+		output, err = runHarness(t, atomicHarnessPath, tp,
+			setGoCoverDir, rdir2, edir2)
+		if err != nil {
+			t.Logf("%s", output)
+			t.Fatalf("running 'harness -tp %s': %v", tp, err)
+		}
+		want := []string{tp, "postClear"}
+		avoid := []string{"preClear", "main", "final"}
+		if msg := testForSpecificFunctions(t, edir2, want, avoid); msg != "" {
+			t.Logf("%s", output)
+			t.Errorf("coverage data from %q output match failed: %s", tp, msg)
+		}
+
+		if testing.CoverMode() == "atomic" {
+			upmergeCoverData(t, edir2)
+			upmergeCoverData(t, rdir2)
+		} else {
+			upmergeCoverData(t, edir1)
+			upmergeCoverData(t, rdir1)
+		}
+	})
+}
+
+func TestApisOnNocoverBinary(t *testing.T) {
+	if testing.Short() {
+		t.Skipf("skipping test: too long for short mode")
+	}
+	testenv.MustHaveGoBuild(t)
+	dir := t.TempDir()
+
+	// Build harness with no -cover.
+	bdir := mkdir(t, filepath.Join(dir, "nocover"))
+	edir := mkdir(t, filepath.Join(dir, "emitDirNo"))
+	harnessPath := buildHarness(t, bdir, nil)
+	output, err := runHarness(t, harnessPath, "emitToDir", false, edir, edir)
+	if err == nil {
+		t.Fatalf("expected error on TestApisOnNocoverBinary harness run")
+	}
+	const want = "not built with -cover"
+	if !strings.Contains(output, want) {
+		t.Errorf("error output does not contain %q: %s", want, output)
+	}
+}
+
+func TestIssue56006EmitDataRaceCoverRunningGoroutine(t *testing.T) {
+	if testing.Short() {
+		t.Skipf("skipping test: too long for short mode")
+	}
+	if !goexperiment.CoverageRedesign {
+		t.Skipf("skipping new coverage tests (experiment not enabled)")
+	}
+
+	// This test requires "go test -race -cover", meaning that we need
+	// go build, go run, and "-race" support.
+	testenv.MustHaveGoRun(t)
+	if !platform.RaceDetectorSupported(runtime.GOOS, runtime.GOARCH) ||
+		!testenv.HasCGO() {
+		t.Skip("skipped due to lack of race detector support / CGO")
+	}
+
+	// This will run a program with -cover and -race where we have a
+	// goroutine still running (and updating counters) at the point where
+	// the test runtime is trying to write out counter data.
+	cmd := exec.Command(testenv.GoToolPath(t), "test", "-cover", "-race")
+	cmd.Dir = filepath.Join("testdata", "issue56006")
+	b, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("go test -cover -race failed: %v", err)
+	}
+
+	// Don't want to see any data races in output.
+	avoid := []string{"DATA RACE"}
+	for _, no := range avoid {
+		if strings.Contains(string(b), no) {
+			t.Logf("%s\n", string(b))
+			t.Fatalf("found %s in test output, not permitted", no)
+		}
+	}
+}
diff --git a/src/runtime/coverage/hooks.go b/src/runtime/coverage/hooks.go
new file mode 100644
index 0000000..a9fbf9d
--- /dev/null
+++ b/src/runtime/coverage/hooks.go
@@ -0,0 +1,42 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import _ "unsafe"
+
+// initHook is invoked from the main package "init" routine in
+// programs built with "-cover". This function is intended to be
+// called only by the compiler.
+//
+// If 'istest' is false, it indicates we're building a regular program
+// ("go build -cover ..."), in which case we immediately try to write
+// out the meta-data file, and register emitCounterData as an exit
+// hook.
+//
+// If 'istest' is true (indicating that the program in question is a
+// Go test binary), then we tentatively queue up both emitMetaData and
+// emitCounterData as exit hooks. In the normal case (e.g. regular "go
+// test -cover" run) the testmain.go boilerplate will run at the end
+// of the test, write out the coverage percentage, and then invoke
+// markProfileEmitted() to indicate that no more work needs to be
+// done. If, however, that call is never made, this is a sign that the
+// test binary is being used as a replacement binary for the tool
+// being tested, hence we do want to run exit hooks when the program
+// terminates.
+func initHook(istest bool) {
+	// Note: hooks are run in reverse registration order, so
+	// register the counter data hook before the meta-data hook
+	// (in the case where two hooks are needed).
+	runOnNonZeroExit := true
+	runtime_addExitHook(emitCounterData, runOnNonZeroExit)
+	if istest {
+		runtime_addExitHook(emitMetaData, runOnNonZeroExit)
+	} else {
+		emitMetaData()
+	}
+}
+
+//go:linkname runtime_addExitHook runtime.addExitHook
+func runtime_addExitHook(f func(), runOnNonZeroExit bool)
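+
+// Sketch of the expected sequence for a "go test -cover" binary
+// (illustrative; the actual boilerplate is generated by the Go command):
+//
+//	initHook(true)           // from the instrumented main package init
+//	...                      // tests run, counters update
+//	markProfileEmitted(true) // from _testmain.go once coverage output is written
+//
+// If markProfileEmitted is never called, the exit hooks registered in
+// initHook write out the meta-data and counter data when the process exits.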
diff --git a/src/runtime/coverage/testdata/harness.go b/src/runtime/coverage/testdata/harness.go
new file mode 100644
index 0000000..5c87e4c
--- /dev/null
+++ b/src/runtime/coverage/testdata/harness.go
@@ -0,0 +1,259 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"flag"
+	"fmt"
+	"internal/coverage/slicewriter"
+	"io"
+	"io/ioutil"
+	"log"
+	"path/filepath"
+	"runtime/coverage"
+	"strings"
+)
+
+var verbflag = flag.Int("v", 0, "Verbose trace output level")
+var testpointflag = flag.String("tp", "", "Testpoint to run")
+var outdirflag = flag.String("o", "", "Output dir into which to emit")
+
+func emitToWriter() {
+	log.SetPrefix("emitToWriter: ")
+	var slwm slicewriter.WriteSeeker
+	if err := coverage.WriteMeta(&slwm); err != nil {
+		log.Fatalf("error: WriteMeta returns %v", err)
+	}
+	mf := filepath.Join(*outdirflag, "covmeta.0abcdef")
+	if err := ioutil.WriteFile(mf, slwm.BytesWritten(), 0666); err != nil {
+		log.Fatalf("error: writing %s: %v", mf, err)
+	}
+	var slwc slicewriter.WriteSeeker
+	if err := coverage.WriteCounters(&slwc); err != nil {
+		log.Fatalf("error: WriteCounters returns %v", err)
+	}
+	cf := filepath.Join(*outdirflag, "covcounters.0abcdef.99.77")
+	if err := ioutil.WriteFile(cf, slwc.BytesWritten(), 0666); err != nil {
+		log.Fatalf("error: writing %s: %v", cf, err)
+	}
+}
+
+func emitToDir() {
+	log.SetPrefix("emitToDir: ")
+	if err := coverage.WriteMetaDir(*outdirflag); err != nil {
+		log.Fatalf("error: WriteMetaDir returns %v", err)
+	}
+	if err := coverage.WriteCountersDir(*outdirflag); err != nil {
+		log.Fatalf("error: WriteCountersDir returns %v", err)
+	}
+}
+
+func emitToNonexistentDir() {
+	log.SetPrefix("emitToNonexistentDir: ")
+
+	want := []string{
+		"no such file or directory",             // linux-ish
+		"system cannot find the file specified", // windows
+		"does not exist",                        // plan9
+	}
+
+	checkWant := func(which string, got string) {
+		found := false
+		for _, w := range want {
+			if strings.Contains(got, w) {
+				found = true
+				break
+			}
+		}
+		if !found {
+			log.Fatalf("%s emit to bad dir: got error:\n  %v\nwanted error with one of:\n  %+v", which, got, want)
+		}
+	}
+
+	// Mangle the output directory to produce something nonexistent.
+	mangled := *outdirflag + "_MANGLED"
+	if err := coverage.WriteMetaDir(mangled); err == nil {
+		log.Fatal("expected error from WriteMetaDir to nonexistent dir")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		checkWant("meta data", got)
+	}
+
+	// Now try to emit counter data file to a bad dir.
+	if err := coverage.WriteCountersDir(mangled); err == nil {
+		log.Fatal("expected error emitting counter data to bad dir")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		checkWant("counter data", got)
+	}
+}
+
+func emitToUnwritableDir() {
+	log.SetPrefix("emitToUnwritableDir: ")
+
+	want := "permission denied"
+
+	if err := coverage.WriteMetaDir(*outdirflag); err == nil {
+		log.Fatal("expected error from WriteMetaDir to unwritable dir")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		if !strings.Contains(got, want) {
+			log.Fatalf("meta-data emit to unwritable dir: wanted error containing %q got %q", want, got)
+		}
+	}
+
+	// Similarly with writing counter data.
+	if err := coverage.WriteCountersDir(*outdirflag); err == nil {
+		log.Fatal("expected error emitting counter data to unwritable dir")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		if !strings.Contains(got, want) {
+			log.Fatalf("emitting counter data to unwritable dir: wanted error containing %q got %q", want, got)
+		}
+	}
+}
+
+func emitToNilWriter() {
+	log.SetPrefix("emitToNilWriter: ")
+	want := "nil writer"
+	var bad io.WriteSeeker
+	if err := coverage.WriteMeta(bad); err == nil {
+		log.Fatal("expected error passing nil writer for meta emit")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		if !strings.Contains(got, want) {
+			log.Fatalf("emitting meta-data passing nil writer: wanted error containing %q got %q", want, got)
+		}
+	}
+
+	if err := coverage.WriteCounters(bad); err == nil {
+		log.Fatal("expected error passing nil writer for counter emit")
+	} else {
+		got := fmt.Sprintf("%v", err)
+		if !strings.Contains(got, want) {
+			log.Fatalf("emitting counter data passing nil writer: wanted error containing %q got %q", want, got)
+		}
+	}
+}
+
+type failingWriter struct {
+	writeCount int
+	writeLimit int
+	slws       slicewriter.WriteSeeker
+}
+
+func (f *failingWriter) Write(p []byte) (n int, err error) {
+	c := f.writeCount
+	f.writeCount++
+	if f.writeLimit < 0 || c < f.writeLimit {
+		return f.slws.Write(p)
+	}
+	return 0, fmt.Errorf("manufactured write error")
+}
+
+func (f *failingWriter) Seek(offset int64, whence int) (int64, error) {
+	return f.slws.Seek(offset, whence)
+}
+
+func (f *failingWriter) reset(lim int) {
+	f.writeCount = 0
+	f.writeLimit = lim
+	f.slws = slicewriter.WriteSeeker{}
+}
+
+func writeStressTest(tag string, testf func(testf *failingWriter) error) {
+	// Invoke the function initially without the write limit
+	// set, to capture the number of writes performed.
+	fw := &failingWriter{writeLimit: -1}
+	testf(fw)
+
+	// Now that we know how many writes are going to happen, run the
+	// function repeatedly, each time with a Write operation set to
+	// fail at a new spot. The goal here is to make sure that:
+	// A) an error is reported, and B) nothing crashes.
+	tot := fw.writeCount
+	for i := 0; i < tot; i++ {
+		fw.reset(i)
+		err := testf(fw)
+		if err == nil {
+			log.Fatalf("no error from write %d tag %s", i, tag)
+		}
+	}
+}
+
+func postClear() int {
+	return 42
+}
+
+func preClear() int {
+	return 42
+}
+
+// This test is designed to ensure that write errors are properly
+// handled by the code that writes out coverage data. It repeatedly
+// invokes the 'emit to writer' APIs using a specially crafted writer
+// that captures the total number of expected writes, then replays the
+// execution N times with a manufactured write error at the
+// appropriate spot.
+func emitToFailingWriter() {
+	log.SetPrefix("emitToFailingWriter: ")
+
+	writeStressTest("emit-meta", func(f *failingWriter) error {
+		return coverage.WriteMeta(f)
+	})
+	writeStressTest("emit-counter", func(f *failingWriter) error {
+		return coverage.WriteCounters(f)
+	})
+}
+
+func emitWithCounterClear() {
+	log.SetPrefix("emitWithCounterClear: ")
+	preClear()
+	if err := coverage.ClearCounters(); err != nil {
+		log.Fatalf("clear failed: %v", err)
+	}
+	postClear()
+	if err := coverage.WriteMetaDir(*outdirflag); err != nil {
+		log.Fatalf("error: WriteMetaDir returns %v", err)
+	}
+	if err := coverage.WriteCountersDir(*outdirflag); err != nil {
+		log.Fatalf("error: WriteCountersDir returns %v", err)
+	}
+}
+
+func final() int {
+	println("I run last.")
+	return 43
+}
+
+func main() {
+	log.SetFlags(0)
+	flag.Parse()
+	if *testpointflag == "" {
+		log.Fatalf("error: no testpoint (use -tp flag)")
+	}
+	if *outdirflag == "" {
+		log.Fatalf("error: no output dir specified (use -o flag)")
+	}
+	switch *testpointflag {
+	case "emitToDir":
+		emitToDir()
+	case "emitToWriter":
+		emitToWriter()
+	case "emitToNonexistentDir":
+		emitToNonexistentDir()
+	case "emitToUnwritableDir":
+		emitToUnwritableDir()
+	case "emitToNilWriter":
+		emitToNilWriter()
+	case "emitToFailingWriter":
+		emitToFailingWriter()
+	case "emitWithCounterClear":
+		emitWithCounterClear()
+	default:
+		log.Fatalf("error: unknown testpoint %q", *testpointflag)
+	}
+	final()
+}
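+
+// Example invocation (illustrative paths; the emitdata tests build this
+// file with "-cover" and drive it via the -tp and -o flags):
+//
+//	GOCOVERDIR=/tmp/rdir ./harness.exe -tp emitToDir -o /tmp/edir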
diff --git a/src/runtime/coverage/testdata/issue56006/repro.go b/src/runtime/coverage/testdata/issue56006/repro.go
new file mode 100644
index 0000000..60a4925
--- /dev/null
+++ b/src/runtime/coverage/testdata/issue56006/repro.go
@@ -0,0 +1,26 @@
+package main
+
+//go:noinline
+func blah(x int) int {
+	if x != 0 {
+		return x + 42
+	}
+	return x - 42
+}
+
+func main() {
+	go infloop()
+	println(blah(1) + blah(0))
+}
+
+var G int
+
+func infloop() {
+	for {
+		G += blah(1)
+		G += blah(0)
+		if G > 10000 {
+			G = 0
+		}
+	}
+}
diff --git a/src/runtime/coverage/testdata/issue56006/repro_test.go b/src/runtime/coverage/testdata/issue56006/repro_test.go
new file mode 100644
index 0000000..674d819
--- /dev/null
+++ b/src/runtime/coverage/testdata/issue56006/repro_test.go
@@ -0,0 +1,8 @@
+package main
+
+import "testing"
+
+func TestSomething(t *testing.T) {
+	go infloop()
+	println(blah(1) + blah(0))
+}
diff --git a/src/runtime/coverage/testsupport.go b/src/runtime/coverage/testsupport.go
new file mode 100644
index 0000000..a481bbb
--- /dev/null
+++ b/src/runtime/coverage/testsupport.go
@@ -0,0 +1,234 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import (
+	"fmt"
+	"internal/coverage"
+	"internal/coverage/calloc"
+	"internal/coverage/cformat"
+	"internal/coverage/cmerge"
+	"internal/coverage/decodecounter"
+	"internal/coverage/decodemeta"
+	"internal/coverage/pods"
+	"io"
+	"os"
+	"strings"
+)
+
+// processCoverTestDir is called (via a linknamed reference) from
+// testmain code when "go test -cover" is in effect. It is not
+// intended to be used other than internally by the Go command's
+// generated code.
+func processCoverTestDir(dir string, cfile string, cm string, cpkg string) error {
+	return processCoverTestDirInternal(dir, cfile, cm, cpkg, os.Stdout)
+}
+
+// processCoverTestDirInternal is an io.Writer version of processCoverTestDir,
+// exposed for unit testing.
+func processCoverTestDirInternal(dir string, cfile string, cm string, cpkg string, w io.Writer) error {
+	cmode := coverage.ParseCounterMode(cm)
+	if cmode == coverage.CtrModeInvalid {
+		return fmt.Errorf("invalid counter mode %q", cm)
+	}
+
+	// Emit meta-data and counter data.
+	ml := getCovMetaList()
+	if len(ml) == 0 {
+		// This corresponds to the case where we have a package that
+		// contains test code but no functions (which is fine). In this
+		// case there is no need to emit anything.
+	} else {
+		if err := emitMetaDataToDirectory(dir, ml); err != nil {
+			return err
+		}
+		if err := emitCounterDataToDirectory(dir); err != nil {
+			return err
+		}
+	}
+
+	// Collect pods from test run. For the majority of cases we would
+	// expect to see a single pod here, but allow for multiple pods in
+	// case the test harness is doing extra work to collect data files
+	// from builds that it kicks off as part of the testing.
+	podlist, err := pods.CollectPods([]string{dir}, false)
+	if err != nil {
+		return fmt.Errorf("reading from %s: %v", dir, err)
+	}
+
+	// Open text output file if appropriate.
+	var tf *os.File
+	var tfClosed bool
+	if cfile != "" {
+		var err error
+		tf, err = os.Create(cfile)
+		if err != nil {
+			return fmt.Errorf("internal error: opening coverage data output file %q: %v", cfile, err)
+		}
+		defer func() {
+			if !tfClosed {
+				tfClosed = true
+				tf.Close()
+			}
+		}()
+	}
+
+	// Read/process the pods.
+	ts := &tstate{
+		cm:    &cmerge.Merger{},
+		cf:    cformat.NewFormatter(cmode),
+		cmode: cmode,
+	}
+	// Generate the expected hash string based on the final meta-data
+	// hash for this test, then look only for pods that refer to that
+	// hash (just in case there are multiple instrumented executables
+	// in play). See issue #57924 for more on this.
+	hashstring := fmt.Sprintf("%x", finalHash)
+	for _, p := range podlist {
+		if !strings.Contains(p.MetaFile, hashstring) {
+			continue
+		}
+		if err := ts.processPod(p); err != nil {
+			return err
+		}
+	}
+
+	// Emit percent.
+	if err := ts.cf.EmitPercent(w, cpkg, true); err != nil {
+		return err
+	}
+
+	// Emit text output.
+	if tf != nil {
+		if err := ts.cf.EmitTextual(tf); err != nil {
+			return err
+		}
+		tfClosed = true
+		if err := tf.Close(); err != nil {
+			return fmt.Errorf("closing %s: %v", cfile, err)
+		}
+	}
+
+	return nil
+}
+
+type tstate struct {
+	calloc.BatchCounterAlloc
+	cm    *cmerge.Merger
+	cf    *cformat.Formatter
+	cmode coverage.CounterMode
+}
+
+// processPod reads coverage counter data for a specific pod.
+func (ts *tstate) processPod(p pods.Pod) error {
+	// Open meta-data file
+	f, err := os.Open(p.MetaFile)
+	if err != nil {
+		return fmt.Errorf("unable to open meta-data file %s: %v", p.MetaFile, err)
+	}
+	defer func() {
+		f.Close()
+	}()
+	var mfr *decodemeta.CoverageMetaFileReader
+	mfr, err = decodemeta.NewCoverageMetaFileReader(f, nil)
+	if err != nil {
+		return fmt.Errorf("error reading meta-data file %s: %v", p.MetaFile, err)
+	}
+	newmode := mfr.CounterMode()
+	if newmode != ts.cmode {
+		return fmt.Errorf("internal error: counter mode clash: %q from test harness, %q from data file %s", ts.cmode.String(), newmode.String(), p.MetaFile)
+	}
+	newgran := mfr.CounterGranularity()
+	if err := ts.cm.SetModeAndGranularity(p.MetaFile, cmode, newgran); err != nil {
+		return err
+	}
+
+	// A map to store counter data, indexed by pkgid/fnid tuple.
+	pmm := make(map[pkfunc][]uint32)
+
+	// Helper to read a single counter data file.
+	readcdf := func(cdf string) error {
+		cf, err := os.Open(cdf)
+		if err != nil {
+			return fmt.Errorf("opening counter data file %s: %s", cdf, err)
+		}
+		defer cf.Close()
+		var cdr *decodecounter.CounterDataReader
+		cdr, err = decodecounter.NewCounterDataReader(cdf, cf)
+		if err != nil {
+			return fmt.Errorf("reading counter data file %s: %s", cdf, err)
+		}
+		var data decodecounter.FuncPayload
+		for {
+			ok, err := cdr.NextFunc(&data)
+			if err != nil {
+				return fmt.Errorf("reading counter data file %s: %v", cdf, err)
+			}
+			if !ok {
+				break
+			}
+
+			// NB: sanity check on pkg and func IDs?
+			key := pkfunc{pk: data.PkgIdx, fcn: data.FuncIdx}
+			if prev, found := pmm[key]; found {
+				// Note: no overflow reporting here.
+				if err, _ := ts.cm.MergeCounters(data.Counters, prev); err != nil {
+					return fmt.Errorf("processing counter data file %s: %v", cdf, err)
+				}
+			}
+			c := ts.AllocateCounters(len(data.Counters))
+			copy(c, data.Counters)
+			pmm[key] = c
+		}
+		return nil
+	}
+
+	// Read counter data files.
+	for _, cdf := range p.CounterDataFiles {
+		if err := readcdf(cdf); err != nil {
+			return err
+		}
+	}
+
+	// Visit meta-data file.
+	np := uint32(mfr.NumPackages())
+	payload := []byte{}
+	for pkIdx := uint32(0); pkIdx < np; pkIdx++ {
+		var pd *decodemeta.CoverageMetaDataDecoder
+		pd, payload, err = mfr.GetPackageDecoder(pkIdx, payload)
+		if err != nil {
+			return fmt.Errorf("reading pkg %d from meta-file %s: %s", pkIdx, p.MetaFile, err)
+		}
+		ts.cf.SetPackage(pd.PackagePath())
+		var fd coverage.FuncDesc
+		nf := pd.NumFuncs()
+		for fnIdx := uint32(0); fnIdx < nf; fnIdx++ {
+			if err := pd.ReadFunc(fnIdx, &fd); err != nil {
+				return fmt.Errorf("reading meta-data file %s: %v",
+					p.MetaFile, err)
+			}
+			key := pkfunc{pk: pkIdx, fcn: fnIdx}
+			counters, haveCounters := pmm[key]
+			for i := 0; i < len(fd.Units); i++ {
+				u := fd.Units[i]
+				// Skip units with non-zero parent (no way to represent
+				// these in the existing format).
+				if u.Parent != 0 {
+					continue
+				}
+				count := uint32(0)
+				if haveCounters {
+					count = counters[i]
+				}
+				ts.cf.AddUnit(fd.Srcfile, fd.Funcname, fd.Lit, u, count)
+			}
+		}
+	}
+	return nil
+}
+
+type pkfunc struct {
+	pk, fcn uint32
+}
diff --git a/src/runtime/coverage/ts_test.go b/src/runtime/coverage/ts_test.go
new file mode 100644
index 0000000..b826058
--- /dev/null
+++ b/src/runtime/coverage/ts_test.go
@@ -0,0 +1,58 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package coverage
+
+import (
+	"internal/goexperiment"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+	_ "unsafe"
+)
+
+//go:linkname testing_testGoCoverDir testing.testGoCoverDir
+func testing_testGoCoverDir() string
+
+// TestTestSupport does a basic verification of the functionality in
+// runtime/coverage.processCoverTestDir (doing this here as opposed to
+// relying on other test paths will provide a better signal when
+// running "go test -cover" for this package).
+func TestTestSupport(t *testing.T) {
+	if !goexperiment.CoverageRedesign {
+		return
+	}
+	if testing.CoverMode() == "" {
+		return
+	}
+	t.Logf("testing.testGoCoverDir() returns %s mode=%s\n",
+		testing_testGoCoverDir(), testing.CoverMode())
+
+	textfile := filepath.Join(t.TempDir(), "file.txt")
+	var sb strings.Builder
+	err := processCoverTestDirInternal(testing_testGoCoverDir(), textfile,
+		testing.CoverMode(), "", &sb)
+	if err != nil {
+		t.Fatalf("bad: %v", err)
+	}
+
+	// Check for existence of text file.
+	if inf, err := os.Open(textfile); err != nil {
+		t.Fatalf("problems opening text file %s: %v", textfile, err)
+	} else {
+		inf.Close()
+	}
+
+	// Check for percent output with expected tokens.
+	strout := sb.String()
+	want1 := "runtime/coverage"
+	want2 := "of statements"
+	if !strings.Contains(strout, want1) ||
+		!strings.Contains(strout, want2) {
+		t.Logf("output from run: %s\n", strout)
+		t.Fatalf("percent output missing key tokens: %q and %q",
+			want1, want2)
+	}
+}
diff --git a/src/runtime/covercounter.go b/src/runtime/covercounter.go
new file mode 100644
index 0000000..72842bd
--- /dev/null
+++ b/src/runtime/covercounter.go
@@ -0,0 +1,26 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/coverage/rtcov"
+	"unsafe"
+)
+
+//go:linkname runtime_coverage_getCovCounterList runtime/coverage.getCovCounterList
+func runtime_coverage_getCovCounterList() []rtcov.CovCounterBlob {
+	res := []rtcov.CovCounterBlob{}
+	u32sz := unsafe.Sizeof(uint32(0))
+	for datap := &firstmoduledata; datap != nil; datap = datap.next {
+		if datap.covctrs == datap.ecovctrs {
+			continue
+		}
+		res = append(res, rtcov.CovCounterBlob{
+			Counters: (*uint32)(unsafe.Pointer(datap.covctrs)),
+			Len:      uint64((datap.ecovctrs - datap.covctrs) / u32sz),
+		})
+	}
+	return res
+}
diff --git a/src/runtime/covermeta.go b/src/runtime/covermeta.go
new file mode 100644
index 0000000..54ef42a
--- /dev/null
+++ b/src/runtime/covermeta.go
@@ -0,0 +1,72 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/coverage/rtcov"
+	"unsafe"
+)
+
+// covMeta is the top-level container for bits of state related to
+// code coverage meta-data in the runtime.
+var covMeta struct {
+	// metaList contains the list of currently registered meta-data
+	// blobs for the running program.
+	metaList []rtcov.CovMetaBlob
+
+	// pkgMap records mappings from hard-coded package IDs to
+	// slots in the covMetaList above.
+	pkgMap map[int]int
+
+	// Set to true if we discover a package mapping glitch.
+	hardCodedListNeedsUpdating bool
+}
+
+// addCovMeta is invoked during package "init" functions by the
+// compiler when compiling for coverage instrumentation; here 'p' is a
+// meta-data blob of length 'dlen' for the package in question, 'hash'
+// is a compiler-computed MD5 sum of the blob, 'pkpath' is the
+// package path, 'pkid' is the hard-coded ID that the compiler is
+// using for the package (or -1 if the compiler doesn't think a
+// hard-coded ID is needed), and 'cmode'/'cgran' are the coverage
+// counter mode and granularity requested by the user. Return value is
+// the ID for the package for use by the package code itself.
+func addCovMeta(p unsafe.Pointer, dlen uint32, hash [16]byte, pkpath string, pkid int, cmode uint8, cgran uint8) uint32 {
+	slot := len(covMeta.metaList)
+	covMeta.metaList = append(covMeta.metaList,
+		rtcov.CovMetaBlob{
+			P:                  (*byte)(p),
+			Len:                dlen,
+			Hash:               hash,
+			PkgPath:            pkpath,
+			PkgID:              pkid,
+			CounterMode:        cmode,
+			CounterGranularity: cgran,
+		})
+	if pkid != -1 {
+		if covMeta.pkgMap == nil {
+			covMeta.pkgMap = make(map[int]int)
+		}
+		if _, ok := covMeta.pkgMap[pkid]; ok {
+			throw("runtime.addCovMeta: coverage package map collision")
+		}
+		// Record the real slot (position on meta-list) for this
+		// package; we'll use the map to fix things up later on.
+		covMeta.pkgMap[pkid] = slot
+	}
+
+	// ID zero is reserved as invalid.
+	return uint32(slot + 1)
+}
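+
+// Conceptually, the compiler emits a call like the following into each
+// instrumented package's init code (illustrative sketch only; metaBlob,
+// md5OfBlob and pkgIDVar are placeholder names, not the generated ones):
+//
+//	pkgIDVar = addCovMeta(unsafe.Pointer(&metaBlob[0]), uint32(len(metaBlob)),
+//		md5OfBlob, "example.com/mypkg", -1, cmode, cgran)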
+
+//go:linkname runtime_coverage_getCovMetaList runtime/coverage.getCovMetaList
+func runtime_coverage_getCovMetaList() []rtcov.CovMetaBlob {
+	return covMeta.metaList
+}
+
+//go:linkname runtime_coverage_getCovPkgMap runtime/coverage.getCovPkgMap
+func runtime_coverage_getCovPkgMap() map[int]int {
+	return covMeta.pkgMap
+}
diff --git a/src/runtime/cpuflags_arm64.go b/src/runtime/cpuflags_arm64.go
index 7576bef..a0f1d11 100644
--- a/src/runtime/cpuflags_arm64.go
+++ b/src/runtime/cpuflags_arm64.go
@@ -11,7 +11,7 @@
 var arm64UseAlignedLoads bool
 
 func init() {
-	if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsZeus {
+	if cpu.ARM64.IsNeoverseN1 || cpu.ARM64.IsNeoverseV1 {
 		arm64UseAlignedLoads = true
 	}
 }
diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go
index 2f7f6b4..6ef374e 100644
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -14,7 +14,6 @@
 
 import (
 	"internal/abi"
-	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -106,12 +105,12 @@
 //go:nowritebarrierrec
 func (p *cpuProfile) add(tagPtr *unsafe.Pointer, stk []uintptr) {
 	// Simple cas-lock to coordinate with setcpuprofilerate.
-	for !atomic.Cas(&prof.signalLock, 0, 1) {
+	for !prof.signalLock.CompareAndSwap(0, 1) {
 		// TODO: Is it safe to osyield here? https://go.dev/issue/52672
 		osyield()
 	}
 
-	if prof.hz != 0 { // implies cpuprof.log != nil
+	if prof.hz.Load() != 0 { // implies cpuprof.log != nil
 		if p.numExtra > 0 || p.lostExtra > 0 || p.lostAtomic > 0 {
 			p.addExtra()
 		}
@@ -123,7 +122,7 @@
 		cpuprof.log.write(tagPtr, nanotime(), hdr[:], stk)
 	}
 
-	atomic.Store(&prof.signalLock, 0)
+	prof.signalLock.Store(0)
 }
 
 // addNonGo adds the non-Go stack trace to the profile.
@@ -143,7 +142,7 @@
 	// process at a time. If not, this lock will serialize those too.
 	// The use of timer_create(2) on Linux to request process-targeted
 	// signals may have changed this.)
-	for !atomic.Cas(&prof.signalLock, 0, 1) {
+	for !prof.signalLock.CompareAndSwap(0, 1) {
 		// TODO: Is it safe to osyield here? https://go.dev/issue/52672
 		osyield()
 	}
@@ -157,7 +156,7 @@
 		cpuprof.lostExtra++
 	}
 
-	atomic.Store(&prof.signalLock, 0)
+	prof.signalLock.Store(0)
 }
 
 // addExtra adds the "extra" profiling events,
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 5e58712..51d7bb5 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -8,6 +8,7 @@
 
 import (
 	"fmt"
+	"internal/goos"
 	"internal/testenv"
 	"os"
 	"os/exec"
@@ -217,7 +218,9 @@
 }
 
 func TestCgoPprofCallback(t *testing.T) {
-	t.Parallel()
+	if testing.Short() {
+		t.Skip("skipping in short mode") // takes a full second
+	}
 	switch runtime.GOOS {
 	case "windows", "plan9":
 		t.Skipf("skipping cgo pprof callback test on %s", runtime.GOOS)
@@ -603,8 +606,14 @@
 		t.Skipf("no signals on %s", runtime.GOOS)
 	}
 
-	for _, test := range []string{"Segv", "SegvInCgo"} {
+	for _, test := range []string{"Segv", "SegvInCgo", "TgkillSegv", "TgkillSegvInCgo"} {
 		test := test
+
+		// The tgkill variants only run on Linux.
+		if runtime.GOOS != "linux" && strings.HasPrefix(test, "Tgkill") {
+			continue
+		}
+
 		t.Run(test, func(t *testing.T) {
 			t.Parallel()
 			got := runTestProg(t, "testprogcgo", test)
@@ -633,9 +642,14 @@
 				testenv.SkipFlaky(t, 50979)
 			}
 
-			nowant := "runtime: "
-			if strings.Contains(got, nowant) {
-				t.Errorf("unexpectedly saw %q in output", nowant)
+			for _, nowant := range []string{"fatal error: ", "runtime: "} {
+				if strings.Contains(got, nowant) {
+					if runtime.GOOS == "darwin" && strings.Contains(got, "0xb01dfacedebac1e") {
+						// See the comment in signal_darwin_amd64.go.
+						t.Skip("skipping due to Darwin handling of malformed addresses")
+					}
+					t.Errorf("unexpectedly saw %q in output", nowant)
+				}
 			}
 		})
 	}
@@ -710,3 +724,47 @@
 		t.Fatalf("want %s, got %s\n", want, output)
 	}
 }
+
+func TestCgoTraceParser(t *testing.T) {
+	// Test issue 29707.
+	switch runtime.GOOS {
+	case "plan9", "windows":
+		t.Skipf("no pthreads on %s", runtime.GOOS)
+	}
+	output := runTestProg(t, "testprogcgo", "CgoTraceParser")
+	want := "OK\n"
+	ErrTimeOrder := "ErrTimeOrder\n"
+	if output == ErrTimeOrder {
+		t.Skipf("skipping due to golang.org/issue/16755: %v", output)
+	} else if output != want {
+		t.Fatalf("want %s, got %s\n", want, output)
+	}
+}
+
+func TestCgoTraceParserWithOneProc(t *testing.T) {
+	// Test issue 29707.
+	switch runtime.GOOS {
+	case "plan9", "windows":
+		t.Skipf("no pthreads on %s", runtime.GOOS)
+	}
+	output := runTestProg(t, "testprogcgo", "CgoTraceParser", "GOMAXPROCS=1")
+	want := "OK\n"
+	ErrTimeOrder := "ErrTimeOrder\n"
+	if output == ErrTimeOrder {
+		t.Skipf("skipping due to golang.org/issue/16755: %v", output)
+	} else if output != want {
+		t.Fatalf("GOMAXPROCS=1, want %s, got %s\n", want, output)
+	}
+}
+
+func TestCgoSigfwd(t *testing.T) {
+	t.Parallel()
+	if !goos.IsUnix {
+		t.Skipf("no signals on %s", runtime.GOOS)
+	}
+
+	got := runTestProg(t, "testprogcgo", "CgoSigfwd", "GO_TEST_CGOSIGFWD=1")
+	if want := "OK\n"; got != want {
+		t.Fatalf("expected %q, but got:\n%s", want, got)
+	}
+}
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 01d7cbe..309777d 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -18,6 +18,7 @@
 	"strings"
 	"sync"
 	"testing"
+	"time"
 )
 
 var toRemove []string
@@ -58,18 +59,31 @@
 }
 
 func runBuiltTestProg(t *testing.T, exe, name string, env ...string) string {
+	t.Helper()
+
 	if *flagQuick {
 		t.Skip("-quick")
 	}
 
-	testenv.MustHaveGoBuild(t)
+	start := time.Now()
 
-	cmd := testenv.CleanCmdEnv(exec.Command(exe, name))
+	cmd := testenv.CleanCmdEnv(testenv.Command(t, exe, name))
 	cmd.Env = append(cmd.Env, env...)
 	if testing.Short() {
 		cmd.Env = append(cmd.Env, "RUNTIME_TEST_SHORT=1")
 	}
-	out, _ := testenv.RunWithTimeout(t, cmd)
+	out, err := cmd.CombinedOutput()
+	if err == nil {
+		t.Logf("%v (%v): ok", cmd, time.Since(start))
+	} else {
+		if _, ok := err.(*exec.ExitError); ok {
+			t.Logf("%v: %v", cmd, err)
+		} else if errors.Is(err, exec.ErrWaitDelay) {
+			t.Fatalf("%v: %v", cmd, err)
+		} else {
+			t.Fatalf("%v failed to start: %v", cmd, err)
+		}
+	}
 	return string(out)
 }
 
@@ -844,3 +858,11 @@
 		}
 	}
 }
+
+func TestPanicOnUnsafeSlice(t *testing.T) {
+	output := runTestProg(t, "testprog", "panicOnNilAndEleSizeIsZero")
+	want := "panic: runtime error: unsafe.Slice: ptr is nil and len is not zero"
+	if !strings.Contains(output, want) {
+		t.Errorf("output does not contain %q:\n%s", want, output)
+	}
+}
diff --git a/src/runtime/create_file_nounix.go b/src/runtime/create_file_nounix.go
new file mode 100644
index 0000000..60f7517
--- /dev/null
+++ b/src/runtime/create_file_nounix.go
@@ -0,0 +1,14 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !unix
+
+package runtime
+
+const canCreateFile = false
+
+func create(name *byte, perm int32) int32 {
+	throw("unimplemented")
+	return -1
+}
diff --git a/src/runtime/create_file_unix.go b/src/runtime/create_file_unix.go
new file mode 100644
index 0000000..7280810
--- /dev/null
+++ b/src/runtime/create_file_unix.go
@@ -0,0 +1,14 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix
+
+package runtime
+
+const canCreateFile = true
+
+// create returns an fd to a write-only file.
+func create(name *byte, perm int32) int32 {
+	return open(name, _O_CREAT|_O_WRONLY|_O_TRUNC, perm)
+}
diff --git a/src/runtime/debug.go b/src/runtime/debug.go
index 0ab23e0..669c36f 100644
--- a/src/runtime/debug.go
+++ b/src/runtime/debug.go
@@ -85,13 +85,13 @@
 //go:linkname mayMoreStackPreempt
 func mayMoreStackPreempt() {
 	// Don't do anything on the g0 or gsignal stack.
-	g := getg()
-	if g == g.m.g0 || g == g.m.gsignal {
+	gp := getg()
+	if gp == gp.m.g0 || gp == gp.m.gsignal {
 		return
 	}
 	// Force a preemption, unless the stack is already poisoned.
-	if g.stackguard0 < stackPoisonMin {
-		g.stackguard0 = stackPreempt
+	if gp.stackguard0 < stackPoisonMin {
+		gp.stackguard0 = stackPreempt
 	}
 }
 
@@ -104,12 +104,12 @@
 //go:linkname mayMoreStackMove
 func mayMoreStackMove() {
 	// Don't do anything on the g0 or gsignal stack.
-	g := getg()
-	if g == g.m.g0 || g == g.m.gsignal {
+	gp := getg()
+	if gp == gp.m.g0 || gp == gp.m.gsignal {
 		return
 	}
 	// Force stack movement, unless the stack is already poisoned.
-	if g.stackguard0 < stackPoisonMin {
-		g.stackguard0 = stackForceMove
+	if gp.stackguard0 < stackPoisonMin {
+		gp.stackguard0 = stackForceMove
 	}
 }
diff --git a/src/runtime/debug/mod.go b/src/runtime/debug/mod.go
index 688e258..8b7a423 100644
--- a/src/runtime/debug/mod.go
+++ b/src/runtime/debug/mod.go
@@ -11,7 +11,7 @@
 	"strings"
 )
 
-// exported from runtime
+// exported from runtime.
 func modinfo() string
 
 // ReadBuildInfo returns the build information embedded
@@ -39,14 +39,26 @@
 
 // BuildInfo represents the build information read from a Go binary.
 type BuildInfo struct {
-	GoVersion string         // Version of Go that produced this binary.
-	Path      string         // The main package path
-	Main      Module         // The module containing the main package
-	Deps      []*Module      // Module dependencies
-	Settings  []BuildSetting // Other information about the build.
+	// GoVersion is the version of the Go toolchain that built the binary
+	// (for example, "go1.19.2").
+	GoVersion string
+
+	// Path is the package path of the main package for the binary
+	// (for example, "golang.org/x/tools/cmd/stringer").
+	Path string
+
+	// Main describes the module that contains the main package for the binary.
+	Main Module
+
+	// Deps describes all the dependency modules, both direct and indirect,
+	// that contributed packages to the build of this binary.
+	Deps []*Module
+
+	// Settings describes the build settings used to build the binary.
+	Settings []BuildSetting
 }
 
-// Module represents a module.
+// A Module describes a single module included in a build.
 type Module struct {
 	Path    string  // module path
 	Version string  // module version
@@ -54,8 +66,24 @@
 	Replace *Module // replaced by this module
 }
 
-// BuildSetting describes a setting that may be used to understand how the
-// binary was built. For example, VCS commit and dirty status is stored here.
+// A BuildSetting is a key-value pair describing one setting that influenced a build.
+//
+// Defined keys include:
+//
+//   - -buildmode: the buildmode flag used (typically "exe")
+//   - -compiler: the compiler toolchain flag used (typically "gc")
+//   - CGO_ENABLED: the effective CGO_ENABLED environment variable
+//   - CGO_CFLAGS: the effective CGO_CFLAGS environment variable
+//   - CGO_CPPFLAGS: the effective CGO_CPPFLAGS environment variable
+//   - CGO_CXXFLAGS: the effective CGO_CXXFLAGS environment variable
+//   - CGO_LDFLAGS: the effective CGO_LDFLAGS environment variable
+//   - GOARCH: the architecture target
+//   - GOAMD64/GOARM64/GO386/etc: the architecture feature level for GOARCH
+//   - GOOS: the operating system target
+//   - vcs: the version control system for the source tree where the build ran
+//   - vcs.revision: the revision identifier for the current commit or checkout
+//   - vcs.time: the modification time associated with vcs.revision, in RFC3339 format
+//   - vcs.modified: true or false indicating whether the source tree had local modifications
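+//
+// For example (illustrative), a program can report its own build settings:
+//
+//	if info, ok := debug.ReadBuildInfo(); ok {
+//		for _, s := range info.Settings {
+//			fmt.Printf("%s=%s\n", s.Key, s.Value)
+//		}
+//	}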
 type BuildSetting struct {
 	// Key and Value describe the build setting.
 	// Key must not contain an equals sign, space, tab, or newline.
diff --git a/src/runtime/debugcall.go b/src/runtime/debugcall.go
index 2f164e7..a4393b1 100644
--- a/src/runtime/debugcall.go
+++ b/src/runtime/debugcall.go
@@ -158,11 +158,10 @@
 		gp.schedlink = 0
 
 		// Park the calling goroutine.
-		gp.waitreason = waitReasonDebugCall
 		if trace.enabled {
 			traceGoPark(traceEvGoBlock, 1)
 		}
-		casgstatus(gp, _Grunning, _Gwaiting)
+		casGToWaiting(gp, _Grunning, waitReasonDebugCall)
 		dropg()
 
 		// Directly execute the new goroutine. The debug
diff --git a/src/runtime/debuglog.go b/src/runtime/debuglog.go
index ca1a791..b18774e 100644
--- a/src/runtime/debuglog.go
+++ b/src/runtime/debuglog.go
@@ -17,6 +17,7 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -63,7 +64,7 @@
 		allp := (*uintptr)(unsafe.Pointer(&allDloggers))
 		all := (*dlogger)(unsafe.Pointer(atomic.Loaduintptr(allp)))
 		for l1 := all; l1 != nil; l1 = l1.allLink {
-			if atomic.Load(&l1.owned) == 0 && atomic.Cas(&l1.owned, 0, 1) {
+			if l1.owned.Load() == 0 && l1.owned.CompareAndSwap(0, 1) {
 				l = l1
 				break
 			}
@@ -79,7 +80,7 @@
 			throw("failed to allocate debug log")
 		}
 		l.w.r.data = &l.w.data
-		l.owned = 1
+		l.owned.Store(1)
 
 		// Prepend to allDloggers list.
 		headp := (*uintptr)(unsafe.Pointer(&allDloggers))
@@ -121,9 +122,8 @@
 //
 // To obtain a dlogger, call dlog(). When done with the dlogger, call
 // end().
-//
-//go:notinheap
 type dlogger struct {
+	_ sys.NotInHeap
 	w debugLogWriter
 
 	// allLink is the next dlogger in the allDloggers list.
@@ -131,7 +131,7 @@
 
 	// owned indicates that this dlogger is owned by an M. This is
 	// accessed atomically.
-	owned uint32
+	owned atomic.Uint32
 }
 
 // allDloggers is a list of all dloggers, linked through
@@ -160,7 +160,7 @@
 	}
 
 	// Return the logger to the global pool.
-	atomic.Store(&l.owned, 0)
+	l.owned.Store(0)
 }
 
 const (
@@ -292,21 +292,24 @@
 	if !dlogEnabled {
 		return l
 	}
-	str := stringStructOf(&x)
+
+	strData := unsafe.StringData(x)
 	datap := &firstmoduledata
-	if len(x) > 4 && datap.etext <= uintptr(str.str) && uintptr(str.str) < datap.end {
+	if len(x) > 4 && datap.etext <= uintptr(unsafe.Pointer(strData)) && uintptr(unsafe.Pointer(strData)) < datap.end {
 		// String constants are in the rodata section, which
 		// isn't recorded in moduledata. But it has to be
 		// somewhere between etext and end.
 		l.w.byte(debugLogConstString)
-		l.w.uvarint(uint64(str.len))
-		l.w.uvarint(uint64(uintptr(str.str) - datap.etext))
+		l.w.uvarint(uint64(len(x)))
+		l.w.uvarint(uint64(uintptr(unsafe.Pointer(strData)) - datap.etext))
 	} else {
 		l.w.byte(debugLogString)
+		// We can't use unsafe.Slice as it may panic, which isn't safe
+		// in this (potentially) nowritebarrier context.
 		var b []byte
 		bb := (*slice)(unsafe.Pointer(&b))
-		bb.array = str.str
-		bb.len, bb.cap = str.len, str.len
+		bb.array = unsafe.Pointer(strData)
+		bb.len, bb.cap = len(x), len(x)
 		if len(b) > debugLogStringLimit {
 			b = b[:debugLogStringLimit]
 		}
@@ -356,9 +359,8 @@
 // overwrite old records. Hence, it maintains a reader that consumes
 // the log as it gets overwritten. That reader state is where an
 // actual log reader would start.
-//
-//go:notinheap
 type debugLogWriter struct {
+	_     sys.NotInHeap
 	write uint64
 	data  debugLogBuf
 
@@ -376,8 +378,10 @@
 	buf [10]byte
 }
 
-//go:notinheap
-type debugLogBuf [debugLogBytes]byte
+type debugLogBuf struct {
+	_ sys.NotInHeap
+	b [debugLogBytes]byte
+}
 
 const (
 	// debugLogHeaderSize is the number of bytes in the framing
@@ -390,7 +394,7 @@
 
 //go:nosplit
 func (l *debugLogWriter) ensure(n uint64) {
-	for l.write+n >= l.r.begin+uint64(len(l.data)) {
+	for l.write+n >= l.r.begin+uint64(len(l.data.b)) {
 		// Consume record at begin.
 		if l.r.skip() == ^uint64(0) {
 			// Wrapped around within a record.
@@ -406,8 +410,8 @@
 
 //go:nosplit
 func (l *debugLogWriter) writeFrameAt(pos, size uint64) bool {
-	l.data[pos%uint64(len(l.data))] = uint8(size)
-	l.data[(pos+1)%uint64(len(l.data))] = uint8(size >> 8)
+	l.data.b[pos%uint64(len(l.data.b))] = uint8(size)
+	l.data.b[(pos+1)%uint64(len(l.data.b))] = uint8(size >> 8)
 	return size <= 0xFFFF
 }
 
@@ -441,7 +445,7 @@
 	l.ensure(1)
 	pos := l.write
 	l.write++
-	l.data[pos%uint64(len(l.data))] = x
+	l.data.b[pos%uint64(len(l.data.b))] = x
 }
 
 //go:nosplit
@@ -450,7 +454,7 @@
 	pos := l.write
 	l.write += uint64(len(x))
 	for len(x) > 0 {
-		n := copy(l.data[pos%uint64(len(l.data)):], x)
+		n := copy(l.data.b[pos%uint64(len(l.data.b)):], x)
 		pos += uint64(n)
 		x = x[n:]
 	}
@@ -513,15 +517,15 @@
 
 //go:nosplit
 func (r *debugLogReader) readUint16LEAt(pos uint64) uint16 {
-	return uint16(r.data[pos%uint64(len(r.data))]) |
-		uint16(r.data[(pos+1)%uint64(len(r.data))])<<8
+	return uint16(r.data.b[pos%uint64(len(r.data.b))]) |
+		uint16(r.data.b[(pos+1)%uint64(len(r.data.b))])<<8
 }
 
 //go:nosplit
 func (r *debugLogReader) readUint64LEAt(pos uint64) uint64 {
 	var b [8]byte
 	for i := range b {
-		b[i] = r.data[pos%uint64(len(r.data))]
+		b[i] = r.data.b[pos%uint64(len(r.data.b))]
 		pos++
 	}
 	return uint64(b[0]) | uint64(b[1])<<8 |
@@ -557,7 +561,7 @@
 	pos := r.begin + debugLogHeaderSize
 	var u uint64
 	for i := uint(0); ; i += 7 {
-		b := r.data[pos%uint64(len(r.data))]
+		b := r.data.b[pos%uint64(len(r.data.b))]
 		pos++
 		u |= uint64(b&^0x80) << i
 		if b&0x80 == 0 {
@@ -588,7 +592,7 @@
 func (r *debugLogReader) uvarint() uint64 {
 	var u uint64
 	for i := uint(0); ; i += 7 {
-		b := r.data[r.begin%uint64(len(r.data))]
+		b := r.data.b[r.begin%uint64(len(r.data.b))]
 		r.begin++
 		u |= uint64(b&^0x80) << i
 		if b&0x80 == 0 {
@@ -610,7 +614,7 @@
 }
 
 func (r *debugLogReader) printVal() bool {
-	typ := r.data[r.begin%uint64(len(r.data))]
+	typ := r.data.b[r.begin%uint64(len(r.data.b))]
 	r.begin++
 
 	switch typ {
@@ -644,7 +648,7 @@
 			break
 		}
 		for sl > 0 {
-			b := r.data[r.begin%uint64(len(r.data)):]
+			b := r.data.b[r.begin%uint64(len(r.data.b)):]
 			if uint64(len(b)) > sl {
 				b = b[:sl]
 			}
@@ -656,6 +660,8 @@
 	case debugLogConstString:
 		len, ptr := int(r.uvarint()), uintptr(r.uvarint())
 		ptr += firstmoduledata.etext
+		// We can't use unsafe.String as it may panic, which isn't safe
+		// in this (potentially) nowritebarrier context.
 		str := stringStruct{
 			str: unsafe.Pointer(ptr),
 			len: len,
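The debuglog changes above drop the internal stringStructOf helper in favor of unsafe.StringData, building the byte slice header by hand only because unsafe.Slice may panic in that nowritebarrier path. In ordinary code, where a panic is acceptable, the same bytes are reachable directly; a minimal sketch:

    package main

    import (
        "fmt"
        "unsafe"
    )

    func main() {
        s := "hello"
        // Pointer to the string's backing bytes (must be treated as read-only).
        data := unsafe.StringData(s)
        // unsafe.Slice can panic on bad inputs, which is fine in normal code
        // but not in the runtime's logging path above.
        b := unsafe.Slice(data, len(s))
        fmt.Println(b[0], len(b)) // 104 5
    }
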
diff --git a/src/runtime/debuglog_test.go b/src/runtime/debuglog_test.go
index 2570e35..18c54a8 100644
--- a/src/runtime/debuglog_test.go
+++ b/src/runtime/debuglog_test.go
@@ -23,8 +23,8 @@
 package runtime_test
 
 import (
-	"bytes"
 	"fmt"
+	"internal/testenv"
 	"regexp"
 	"runtime"
 	"strings"
@@ -94,7 +94,7 @@
 		}
 		wg.Done()
 	}()
-	var want bytes.Buffer
+	var want strings.Builder
 	for i := 0; i < 1000; i++ {
 		runtime.Dlog().I(i).End()
 		fmt.Fprintf(&want, "[] %d\n", i)
@@ -122,7 +122,7 @@
 
 	runtime.ResetDebugLog()
 	var longString = strings.Repeat("a", 128)
-	var want bytes.Buffer
+	var want strings.Builder
 	for i, j := 0, 0; j < 2*runtime.DebugLogBytes; i, j = i+1, j+len(longString) {
 		runtime.Dlog().I(i).S(longString).End()
 		fmt.Fprintf(&want, "[] %d %s\n", i, longString)
@@ -156,3 +156,14 @@
 		t.Fatalf("want %q, got %q", want, got)
 	}
 }
+
+// TestDebugLogBuild verifies that the runtime builds with -tags=debuglog.
+func TestDebugLogBuild(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+
+	// It doesn't matter which program we build, anything will rebuild the
+	// runtime.
+	if _, err := buildTestProg(t, "testprog", "-tags=debuglog"); err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/src/runtime/defs1_netbsd_386.go b/src/runtime/defs1_netbsd_386.go
index b6e47a0..f7fe45b 100644
--- a/src/runtime/defs1_netbsd_386.go
+++ b/src/runtime/defs1_netbsd_386.go
@@ -8,7 +8,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
@@ -20,7 +23,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs1_netbsd_amd64.go b/src/runtime/defs1_netbsd_amd64.go
index b8292fa..80908cd 100644
--- a/src/runtime/defs1_netbsd_amd64.go
+++ b/src/runtime/defs1_netbsd_amd64.go
@@ -8,7 +8,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
@@ -20,7 +23,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs1_netbsd_arm.go b/src/runtime/defs1_netbsd_arm.go
index d2cb486..c63e592 100644
--- a/src/runtime/defs1_netbsd_arm.go
+++ b/src/runtime/defs1_netbsd_arm.go
@@ -8,7 +8,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
@@ -20,7 +23,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs1_netbsd_arm64.go b/src/runtime/defs1_netbsd_arm64.go
index 7776fe1..804b5b0 100644
--- a/src/runtime/defs1_netbsd_arm64.go
+++ b/src/runtime/defs1_netbsd_arm64.go
@@ -8,7 +8,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
@@ -20,7 +23,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs1_solaris_amd64.go b/src/runtime/defs1_solaris_amd64.go
index 3c13f33..bb53c22 100644
--- a/src/runtime/defs1_solaris_amd64.go
+++ b/src/runtime/defs1_solaris_amd64.go
@@ -23,7 +23,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x8
 	_SA_RESTART = 0x4
@@ -90,7 +91,10 @@
 
 	_MAXHOSTNAMELEN = 0x100
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x80
+	_O_TRUNC    = 0x200
+	_O_CREAT    = 0x100
 	_O_CLOEXEC  = 0x800000
 	_FD_CLOEXEC = 0x1
 	_F_GETFL    = 0x3
diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go
index 41ad735..5d6730a 100644
--- a/src/runtime/defs2_linux.go
+++ b/src/runtime/defs2_linux.go
@@ -121,17 +121,6 @@
 
 	O_RDONLY  = C.O_RDONLY
 	O_CLOEXEC = C.O_CLOEXEC
-
-	EPOLLIN       = C.POLLIN
-	EPOLLOUT      = C.POLLOUT
-	EPOLLERR      = C.POLLERR
-	EPOLLHUP      = C.POLLHUP
-	EPOLLRDHUP    = C.POLLRDHUP
-	EPOLLET       = C.EPOLLET
-	EPOLL_CLOEXEC = C.EPOLL_CLOEXEC
-	EPOLL_CTL_ADD = C.EPOLL_CTL_ADD
-	EPOLL_CTL_DEL = C.EPOLL_CTL_DEL
-	EPOLL_CTL_MOD = C.EPOLL_CTL_MOD
 )
 
 type Fpreg C.struct__fpreg
diff --git a/src/runtime/defs_aix.go b/src/runtime/defs_aix.go
index b794cd5..3895989 100644
--- a/src/runtime/defs_aix.go
+++ b/src/runtime/defs_aix.go
@@ -124,7 +124,10 @@
 	_ITIMER_PROF    = C.ITIMER_PROF
 
 	_O_RDONLY   = C.O_RDONLY
+	_O_WRONLY   = C.O_WRONLY
 	_O_NONBLOCK = C.O_NONBLOCK
+	_O_CREAT    = C.O_CREAT
+	_O_TRUNC    = C.O_TRUNC
 
 	_SS_DISABLE  = C.SS_DISABLE
 	_SI_USER     = C.SI_USER
diff --git a/src/runtime/defs_aix_ppc64.go b/src/runtime/defs_aix_ppc64.go
index 4e20c85..2d25b7c 100644
--- a/src/runtime/defs_aix_ppc64.go
+++ b/src/runtime/defs_aix_ppc64.go
@@ -81,7 +81,10 @@
 	_ITIMER_PROF    = 0x2
 
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x100
+	_O_TRUNC    = 0x200
 
 	_SS_DISABLE  = 0x2
 	_SI_USER     = 0x0
diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go
index 59b81cf..89e4253 100644
--- a/src/runtime/defs_darwin.go
+++ b/src/runtime/defs_darwin.go
@@ -120,7 +120,10 @@
 	F_SETFL    = C.F_SETFL
 	FD_CLOEXEC = C.FD_CLOEXEC
 
+	O_WRONLY   = C.O_WRONLY
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CREAT    = C.O_CREAT
+	O_TRUNC    = C.O_TRUNC
 )
 
 type StackT C.struct_sigaltstack
diff --git a/src/runtime/defs_darwin_amd64.go b/src/runtime/defs_darwin_amd64.go
index cbc26bf..84e6f37 100644
--- a/src/runtime/defs_darwin_amd64.go
+++ b/src/runtime/defs_darwin_amd64.go
@@ -99,7 +99,10 @@
 	_F_SETFL    = 0x4
 	_FD_CLOEXEC = 0x1
 
-	_O_NONBLOCK = 4
+	_O_WRONLY   = 0x1
+	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 )
 
 type stackt struct {
diff --git a/src/runtime/defs_darwin_arm64.go b/src/runtime/defs_darwin_arm64.go
index 9076e8b..30d7443 100644
--- a/src/runtime/defs_darwin_arm64.go
+++ b/src/runtime/defs_darwin_arm64.go
@@ -101,7 +101,10 @@
 	_F_SETFL    = 0x4
 	_FD_CLOEXEC = 0x1
 
-	_O_NONBLOCK = 4
+	_O_WRONLY   = 0x1
+	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 )
 
 type stackt struct {
diff --git a/src/runtime/defs_dragonfly.go b/src/runtime/defs_dragonfly.go
index 952163b..9dcfdf0 100644
--- a/src/runtime/defs_dragonfly.go
+++ b/src/runtime/defs_dragonfly.go
@@ -32,7 +32,10 @@
 	EBUSY  = C.EBUSY
 	EAGAIN = C.EAGAIN
 
+	O_WRONLY   = C.O_WRONLY
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CREAT    = C.O_CREAT
+	O_TRUNC    = C.O_TRUNC
 	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
@@ -44,7 +47,8 @@
 	MAP_PRIVATE = C.MAP_PRIVATE
 	MAP_FIXED   = C.MAP_FIXED
 
-	MADV_FREE = C.MADV_FREE
+	MADV_DONTNEED = C.MADV_DONTNEED
+	MADV_FREE     = C.MADV_FREE
 
 	SA_SIGINFO = C.SA_SIGINFO
 	SA_RESTART = C.SA_RESTART
diff --git a/src/runtime/defs_dragonfly_amd64.go b/src/runtime/defs_dragonfly_amd64.go
index 4358c1e..f1a2302 100644
--- a/src/runtime/defs_dragonfly_amd64.go
+++ b/src/runtime/defs_dragonfly_amd64.go
@@ -11,7 +11,10 @@
 	_EBUSY  = 0x10
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x20000
 
 	_PROT_NONE  = 0x0
@@ -23,7 +26,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go
index 3fbd580..d86ae91 100644
--- a/src/runtime/defs_freebsd.go
+++ b/src/runtime/defs_freebsd.go
@@ -16,10 +16,11 @@
 
 /*
 #include <sys/types.h>
+#include <unistd.h>
+#include <fcntl.h>
 #include <sys/time.h>
 #include <signal.h>
 #include <errno.h>
-#define _WANT_FREEBSD11_KEVENT 1
 #include <sys/event.h>
 #include <sys/mman.h>
 #include <sys/ucontext.h>
@@ -45,11 +46,15 @@
 )
 
 const (
-	EINTR  = C.EINTR
-	EFAULT = C.EFAULT
-	EAGAIN = C.EAGAIN
+	EINTR     = C.EINTR
+	EFAULT    = C.EFAULT
+	EAGAIN    = C.EAGAIN
+	ETIMEDOUT = C.ETIMEDOUT
 
+	O_WRONLY   = C.O_WRONLY
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CREAT    = C.O_CREAT
+	O_TRUNC    = C.O_TRUNC
 	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
@@ -62,7 +67,8 @@
 	MAP_PRIVATE = C.MAP_PRIVATE
 	MAP_FIXED   = C.MAP_FIXED
 
-	MADV_FREE = C.MADV_FREE
+	MADV_DONTNEED = C.MADV_DONTNEED
+	MADV_FREE     = C.MADV_FREE
 
 	SA_SIGINFO = C.SA_SIGINFO
 	SA_RESTART = C.SA_RESTART
@@ -154,7 +160,7 @@
 
 type Umtx_time C.struct__umtx_time
 
-type Kevent C.struct_kevent_freebsd11
+type KeventT C.struct_kevent
 
 type bintime C.struct_bintime
 type vdsoTimehands C.struct_vdso_timehands
diff --git a/src/runtime/defs_freebsd_386.go b/src/runtime/defs_freebsd_386.go
index ff4dcfa..ee82741 100644
--- a/src/runtime/defs_freebsd_386.go
+++ b/src/runtime/defs_freebsd_386.go
@@ -1,5 +1,6 @@
-// created by cgo -cdefs and then converted to Go
-// cgo -cdefs defs_freebsd.go
+// Code generated by cgo, then manually converted into appropriate naming and code
+// for the Go runtime.
+// go tool cgo -godefs defs_freebsd.go
 
 package runtime
 
@@ -18,7 +19,10 @@
 	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
@@ -31,7 +35,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
@@ -228,8 +233,9 @@
 	filter int16
 	flags  uint16
 	fflags uint32
-	data   int32
+	data   int64
 	udata  *byte
+	ext    [4]uint64
 }
 
 type bintime struct {
diff --git a/src/runtime/defs_freebsd_amd64.go b/src/runtime/defs_freebsd_amd64.go
index f537c89..9003f92 100644
--- a/src/runtime/defs_freebsd_amd64.go
+++ b/src/runtime/defs_freebsd_amd64.go
@@ -1,5 +1,6 @@
-// created by cgo -cdefs and then converted to Go
-// cgo -cdefs defs_freebsd.go
+// Code generated by cgo, then manually converted into appropriate naming and code
+// for the Go runtime.
+// go tool cgo -godefs defs_freebsd.go
 
 package runtime
 
@@ -18,7 +19,10 @@
 	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
@@ -31,7 +35,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
@@ -241,6 +246,7 @@
 	fflags uint32
 	data   int64
 	udata  *byte
+	ext    [4]uint64
 }
 
 type bintime struct {
diff --git a/src/runtime/defs_freebsd_arm.go b/src/runtime/defs_freebsd_arm.go
index 2e20ae7..68cc1b9 100644
--- a/src/runtime/defs_freebsd_arm.go
+++ b/src/runtime/defs_freebsd_arm.go
@@ -1,5 +1,6 @@
-// created by cgo -cdefs and then converted to Go
-// cgo -cdefs defs_freebsd.go
+// Code generated by cgo, then manually converted into appropriate naming and code
+// for the Go runtime.
+// go tool cgo -godefs defs_freebsd.go
 
 package runtime
 
@@ -18,7 +19,10 @@
 	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
@@ -31,7 +35,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
@@ -197,12 +202,15 @@
 }
 
 type keventt struct {
-	ident  uint32
-	filter int16
-	flags  uint16
-	fflags uint32
-	data   int32
-	udata  *byte
+	ident     uint32
+	filter    int16
+	flags     uint16
+	fflags    uint32
+	pad_cgo_0 [4]byte
+	data      int64
+	udata     *byte
+	pad_cgo_1 [4]byte
+	ext       [4]uint64
 }
 
 type bintime struct {
diff --git a/src/runtime/defs_freebsd_arm64.go b/src/runtime/defs_freebsd_arm64.go
index 1838108..1d67236 100644
--- a/src/runtime/defs_freebsd_arm64.go
+++ b/src/runtime/defs_freebsd_arm64.go
@@ -1,5 +1,6 @@
-// created by cgo -cdefs and then converted to Go
-// cgo -cdefs defs_freebsd.go
+// Code generated by cgo, then manually converted into appropriate naming and code
+// for the Go runtime.
+// go tool cgo -godefs defs_freebsd.go
 
 package runtime
 
@@ -18,7 +19,10 @@
 	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
@@ -31,7 +35,8 @@
 	_MAP_PRIVATE = 0x2
 	_MAP_FIXED   = 0x10
 
-	_MADV_FREE = 0x5
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
@@ -225,6 +230,7 @@
 	fflags uint32
 	data   int64
 	udata  *byte
+	ext    [4]uint64
 }
 
 type bintime struct {
diff --git a/src/runtime/defs_freebsd_riscv64.go b/src/runtime/defs_freebsd_riscv64.go
new file mode 100644
index 0000000..b977bde
--- /dev/null
+++ b/src/runtime/defs_freebsd_riscv64.go
@@ -0,0 +1,266 @@
+// created by cgo -cdefs and then converted to Go
+// cgo -cdefs defs_freebsd.go
+
+package runtime
+
+import "unsafe"
+
+const (
+	_NBBY            = 0x8
+	_CTL_MAXNAME     = 0x18
+	_CPU_LEVEL_WHICH = 0x3
+	_CPU_WHICH_PID   = 0x2
+)
+
+const (
+	_EINTR     = 0x4
+	_EFAULT    = 0xe
+	_EAGAIN    = 0x23
+	_ETIMEDOUT = 0x3c
+
+	_O_WRONLY   = 0x1
+	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
+	_O_CLOEXEC  = 0x100000
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x1000
+	_MAP_SHARED  = 0x1
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x5
+
+	_SA_SIGINFO = 0x40
+	_SA_RESTART = 0x2
+	_SA_ONSTACK = 0x1
+
+	_CLOCK_MONOTONIC = 0x4
+	_CLOCK_REALTIME  = 0x0
+
+	_UMTX_OP_WAIT_UINT         = 0xb
+	_UMTX_OP_WAIT_UINT_PRIVATE = 0xf
+	_UMTX_OP_WAKE              = 0x3
+	_UMTX_OP_WAKE_PRIVATE      = 0x10
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGEMT    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGBUS    = 0xa
+	_SIGSEGV   = 0xb
+	_SIGSYS    = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGTERM   = 0xf
+	_SIGURG    = 0x10
+	_SIGSTOP   = 0x11
+	_SIGTSTP   = 0x12
+	_SIGCONT   = 0x13
+	_SIGCHLD   = 0x14
+	_SIGTTIN   = 0x15
+	_SIGTTOU   = 0x16
+	_SIGIO     = 0x17
+	_SIGXCPU   = 0x18
+	_SIGXFSZ   = 0x19
+	_SIGVTALRM = 0x1a
+	_SIGPROF   = 0x1b
+	_SIGWINCH  = 0x1c
+	_SIGINFO   = 0x1d
+	_SIGUSR1   = 0x1e
+	_SIGUSR2   = 0x1f
+
+	_FPE_INTDIV = 0x2
+	_FPE_INTOVF = 0x1
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EV_ADD       = 0x1
+	_EV_DELETE    = 0x2
+	_EV_CLEAR     = 0x20
+	_EV_RECEIPT   = 0x40
+	_EV_ERROR     = 0x4000
+	_EV_EOF       = 0x8000
+	_EVFILT_READ  = -0x1
+	_EVFILT_WRITE = -0x2
+)
+
+type rtprio struct {
+	_type uint16
+	prio  uint16
+}
+
+type thrparam struct {
+	start_func uintptr
+	arg        unsafe.Pointer
+	stack_base uintptr
+	stack_size uintptr
+	tls_base   unsafe.Pointer
+	tls_size   uintptr
+	child_tid  unsafe.Pointer // *int64
+	parent_tid *int64
+	flags      int32
+	pad_cgo_0  [4]byte
+	rtp        *rtprio
+	spare      [3]uintptr
+}
+
+type thread int64 // long
+
+type sigset struct {
+	__bits [4]uint32
+}
+
+type stackt struct {
+	ss_sp     uintptr
+	ss_size   uintptr
+	ss_flags  int32
+	pad_cgo_0 [4]byte
+}
+
+type siginfo struct {
+	si_signo  int32
+	si_errno  int32
+	si_code   int32
+	si_pid    int32
+	si_uid    uint32
+	si_status int32
+	si_addr   uint64
+	si_value  [8]byte
+	_reason   [40]byte
+}
+
+type gpregs struct {
+	gp_ra      uint64
+	gp_sp      uint64
+	gp_gp      uint64
+	gp_tp      uint64
+	gp_t       [7]uint64
+	gp_s       [12]uint64
+	gp_a       [8]uint64
+	gp_sepc    uint64
+	gp_sstatus uint64
+}
+
+type fpregs struct {
+	fp_x     [64]uint64 // actually __uint64_t fp_x[32][2]
+	fp_fcsr  uint64
+	fp_flags int32
+	pad      int32
+}
+
+type mcontext struct {
+	mc_gpregs gpregs
+	mc_fpregs fpregs
+	mc_flags  int32
+	mc_pad    int32
+	mc_spare  [8]uint64
+}
+
+type ucontext struct {
+	uc_sigmask  sigset
+	uc_mcontext mcontext
+	uc_link     *ucontext
+	uc_stack    stackt
+	uc_flags    int32
+	__spare__   [4]int32
+	pad_cgo_0   [12]byte
+}
+
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+//go:nosplit
+func (ts *timespec) setNsec(ns int64) {
+	ts.tv_sec = ns / 1e9
+	ts.tv_nsec = ns % 1e9
+}
+
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type umtx_time struct {
+	_timeout timespec
+	_flags   uint32
+	_clockid uint32
+}
+
+type keventt struct {
+	ident  uint64
+	filter int16
+	flags  uint16
+	fflags uint32
+	data   int64
+	udata  *byte
+	ext    [4]uint64
+}
+
+type bintime struct {
+	sec  int64
+	frac uint64
+}
+
+type vdsoTimehands struct {
+	algo         uint32
+	gen          uint32
+	scale        uint64
+	offset_count uint32
+	counter_mask uint32
+	offset       bintime
+	boottime     bintime
+	physical     uint32
+	res          [7]uint32
+}
+
+type vdsoTimekeep struct {
+	ver       uint32
+	enabled   uint32
+	current   uint32
+	pad_cgo_0 [4]byte
+}
+
+const (
+	_VDSO_TK_VER_CURR = 0x1
+
+	vdsoTimehandsSize = 0x58
+	vdsoTimekeepSize  = 0x10
+)
diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go
index e55bb6b..296fcb4 100644
--- a/src/runtime/defs_linux.go
+++ b/src/runtime/defs_linux.go
@@ -115,17 +115,6 @@
 	CLOCK_THREAD_CPUTIME_ID = C.CLOCK_THREAD_CPUTIME_ID
 
 	SIGEV_THREAD_ID = C.SIGEV_THREAD_ID
-
-	EPOLLIN       = C.POLLIN
-	EPOLLOUT      = C.POLLOUT
-	EPOLLERR      = C.POLLERR
-	EPOLLHUP      = C.POLLHUP
-	EPOLLRDHUP    = C.POLLRDHUP
-	EPOLLET       = C.EPOLLET
-	EPOLL_CLOEXEC = C.EPOLL_CLOEXEC
-	EPOLL_CTL_ADD = C.EPOLL_CTL_ADD
-	EPOLL_CTL_DEL = C.EPOLL_CTL_DEL
-	EPOLL_CTL_MOD = C.EPOLL_CTL_MOD
 )
 
 type Sigset C.sigset_t
@@ -136,4 +125,3 @@
 type Itimerspec C.struct_itimerspec
 type Itimerval C.struct_itimerval
 type Sigevent C.struct_sigevent
-type EpollEvent C.struct_epoll_event
diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go
index 5376bde..72339f4 100644
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -90,20 +90,12 @@
 	_SIGEV_THREAD_ID = 0x4
 
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
+	_O_CREAT    = 0x40
+	_O_TRUNC    = 0x200
 	_O_NONBLOCK = 0x800
 	_O_CLOEXEC  = 0x80000
 
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
-
 	_AF_UNIX    = 0x1
 	_SOCK_DGRAM = 0x2
 )
@@ -254,11 +246,6 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events uint32
-	data   [8]byte // to match amd64
-}
-
 type sockaddr_un struct {
 	family uint16
 	path   [108]byte
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go
index da4d357..298f3eb 100644
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -89,17 +89,6 @@
 
 	_SIGEV_THREAD_ID = 0x4
 
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
-
 	_AF_UNIX    = 0x1
 	_SOCK_DGRAM = 0x2
 )
@@ -171,16 +160,14 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events uint32
-	data   [8]byte // unaligned uintptr
-}
-
 // created by cgo -cdefs and then converted to Go
 // cgo -cdefs defs_linux.go defs1_linux.go
 
 const (
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
+	_O_CREAT    = 0x40
+	_O_TRUNC    = 0x200
 	_O_NONBLOCK = 0x800
 	_O_CLOEXEC  = 0x80000
 )
diff --git a/src/runtime/defs_linux_arm.go b/src/runtime/defs_linux_arm.go
index 18aa093..6fee57d 100644
--- a/src/runtime/defs_linux_arm.go
+++ b/src/runtime/defs_linux_arm.go
@@ -80,6 +80,9 @@
 	_ITIMER_PROF    = 0x2
 	_ITIMER_VIRTUAL = 0x1
 	_O_RDONLY       = 0
+	_O_WRONLY       = 0x1
+	_O_CREAT        = 0x40
+	_O_TRUNC        = 0x200
 	_O_NONBLOCK     = 0x800
 	_O_CLOEXEC      = 0x80000
 
@@ -87,17 +90,6 @@
 
 	_SIGEV_THREAD_ID = 0x4
 
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
-
 	_AF_UNIX    = 0x1
 	_SOCK_DGRAM = 0x2
 )
@@ -208,12 +200,6 @@
 	sa_mask     uint64
 }
 
-type epollevent struct {
-	events uint32
-	_pad   uint32
-	data   [8]byte // to match amd64
-}
-
 type sockaddr_un struct {
 	family uint16
 	path   [108]byte
diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go
index c5d7d7e..0216096 100644
--- a/src/runtime/defs_linux_arm64.go
+++ b/src/runtime/defs_linux_arm64.go
@@ -89,17 +89,6 @@
 
 	_SIGEV_THREAD_ID = 0x4
 
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
-
 	_AF_UNIX    = 0x1
 	_SOCK_DGRAM = 0x2
 )
@@ -171,17 +160,14 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events uint32
-	_pad   uint32
-	data   [8]byte // to match amd64
-}
-
 // Created by cgo -cdefs and then converted to Go by hand
 // ../cmd/cgo/cgo -cdefs defs_linux.go defs1_linux.go defs2_linux.go
 
 const (
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
+	_O_CREAT    = 0x40
+	_O_TRUNC    = 0x200
 	_O_NONBLOCK = 0x800
 	_O_CLOEXEC  = 0x80000
 )
diff --git a/src/runtime/defs_linux_loong64.go b/src/runtime/defs_linux_loong64.go
index dda4009..6eca18b 100644
--- a/src/runtime/defs_linux_loong64.go
+++ b/src/runtime/defs_linux_loong64.go
@@ -89,17 +89,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 type timespec struct {
@@ -146,14 +135,11 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 const (
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
+	_O_CREAT    = 0x40
+	_O_TRUNC    = 0x200
 	_O_NONBLOCK = 0x800
 	_O_CLOEXEC  = 0x80000
 )
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go
index e645248..2e8c405 100644
--- a/src/runtime/defs_linux_mips64x.go
+++ b/src/runtime/defs_linux_mips64x.go
@@ -90,17 +90,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 //struct Sigset {
@@ -178,14 +167,11 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 const (
 	_O_RDONLY    = 0x0
+	_O_WRONLY    = 0x1
+	_O_CREAT     = 0x100
+	_O_TRUNC     = 0x200
 	_O_NONBLOCK  = 0x80
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go
index 5afb6f4..7593600 100644
--- a/src/runtime/defs_linux_mipsx.go
+++ b/src/runtime/defs_linux_mipsx.go
@@ -90,17 +90,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 type timespec struct {
@@ -172,15 +161,12 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      uint64
-}
-
 const (
 	_O_RDONLY    = 0x0
+	_O_WRONLY    = 0x1
 	_O_NONBLOCK  = 0x80
+	_O_CREAT     = 0x100
+	_O_TRUNC     = 0x200
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_linux_ppc64.go b/src/runtime/defs_linux_ppc64.go
index f3e305e..bb3ac01 100644
--- a/src/runtime/defs_linux_ppc64.go
+++ b/src/runtime/defs_linux_ppc64.go
@@ -87,17 +87,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 //struct Sigset {
@@ -172,17 +161,14 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 // created by cgo -cdefs and then converted to Go
 // cgo -cdefs defs_linux.go defs3_linux.go
 
 const (
 	_O_RDONLY    = 0x0
+	_O_WRONLY    = 0x1
+	_O_CREAT     = 0x40
+	_O_TRUNC     = 0x200
 	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
diff --git a/src/runtime/defs_linux_ppc64le.go b/src/runtime/defs_linux_ppc64le.go
index f3e305e..bb3ac01 100644
--- a/src/runtime/defs_linux_ppc64le.go
+++ b/src/runtime/defs_linux_ppc64le.go
@@ -87,17 +87,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 //struct Sigset {
@@ -172,17 +161,14 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 // created by cgo -cdefs and then converted to Go
 // cgo -cdefs defs_linux.go defs3_linux.go
 
 const (
 	_O_RDONLY    = 0x0
+	_O_WRONLY    = 0x1
+	_O_CREAT     = 0x40
+	_O_TRUNC     = 0x200
 	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
diff --git a/src/runtime/defs_linux_riscv64.go b/src/runtime/defs_linux_riscv64.go
index 29496ac..ce4a7f3 100644
--- a/src/runtime/defs_linux_riscv64.go
+++ b/src/runtime/defs_linux_riscv64.go
@@ -89,17 +89,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 type timespec struct {
@@ -171,14 +160,11 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 const (
 	_O_RDONLY   = 0x0
+	_O_WRONLY   = 0x1
+	_O_CREAT    = 0x40
+	_O_TRUNC    = 0x200
 	_O_NONBLOCK = 0x800
 	_O_CLOEXEC  = 0x80000
 )
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go
index 817a29e..36497dd 100644
--- a/src/runtime/defs_linux_s390x.go
+++ b/src/runtime/defs_linux_s390x.go
@@ -88,17 +88,6 @@
 	_CLOCK_THREAD_CPUTIME_ID = 0x3
 
 	_SIGEV_THREAD_ID = 0x4
-
-	_EPOLLIN       = 0x1
-	_EPOLLOUT      = 0x4
-	_EPOLLERR      = 0x8
-	_EPOLLHUP      = 0x10
-	_EPOLLRDHUP    = 0x2000
-	_EPOLLET       = 0x80000000
-	_EPOLL_CLOEXEC = 0x80000
-	_EPOLL_CTL_ADD = 0x1
-	_EPOLL_CTL_DEL = 0x2
-	_EPOLL_CTL_MOD = 0x3
 )
 
 type timespec struct {
@@ -168,14 +157,11 @@
 	_ [_sigev_max_size - unsafe.Sizeof(sigeventFields{})]byte
 }
 
-type epollevent struct {
-	events    uint32
-	pad_cgo_0 [4]byte
-	data      [8]byte // unaligned uintptr
-}
-
 const (
 	_O_RDONLY    = 0x0
+	_O_WRONLY    = 0x1
+	_O_CREAT     = 0x40
+	_O_TRUNC     = 0x200
 	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
diff --git a/src/runtime/defs_netbsd.go b/src/runtime/defs_netbsd.go
index 6b084c0..43923e3 100644
--- a/src/runtime/defs_netbsd.go
+++ b/src/runtime/defs_netbsd.go
@@ -34,7 +34,10 @@
 	EFAULT = C.EFAULT
 	EAGAIN = C.EAGAIN
 
+	O_WRONLY   = C.O_WRONLY
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CREAT    = C.O_CREAT
+	O_TRUNC    = C.O_TRUNC
 	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
@@ -46,7 +49,8 @@
 	MAP_PRIVATE = C.MAP_PRIVATE
 	MAP_FIXED   = C.MAP_FIXED
 
-	MADV_FREE = C.MADV_FREE
+	MADV_DONTNEED = C.MADV_DONTNEED
+	MADV_FREE     = C.MADV_FREE
 
 	SA_SIGINFO = C.SA_SIGINFO
 	SA_RESTART = C.SA_RESTART
diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go
index cbf53eb..4161e21 100644
--- a/src/runtime/defs_openbsd.go
+++ b/src/runtime/defs_openbsd.go
@@ -48,7 +48,8 @@
 	MAP_FIXED   = C.MAP_FIXED
 	MAP_STACK   = C.MAP_STACK
 
-	MADV_FREE = C.MADV_FREE
+	MADV_DONTNEED = C.MADV_DONTNEED
+	MADV_FREE     = C.MADV_FREE
 
 	SA_SIGINFO = C.SA_SIGINFO
 	SA_RESTART = C.SA_RESTART
diff --git a/src/runtime/defs_openbsd_386.go b/src/runtime/defs_openbsd_386.go
index 35c559b..25524c5 100644
--- a/src/runtime/defs_openbsd_386.go
+++ b/src/runtime/defs_openbsd_386.go
@@ -10,7 +10,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
@@ -23,7 +26,8 @@
 	_MAP_FIXED   = 0x10
 	_MAP_STACK   = 0x4000
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_openbsd_amd64.go b/src/runtime/defs_openbsd_amd64.go
index d7432da..a31d03b 100644
--- a/src/runtime/defs_openbsd_amd64.go
+++ b/src/runtime/defs_openbsd_amd64.go
@@ -10,7 +10,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
@@ -23,7 +26,8 @@
 	_MAP_FIXED   = 0x10
 	_MAP_STACK   = 0x4000
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_openbsd_arm.go b/src/runtime/defs_openbsd_arm.go
index 471b306..1d1767b 100644
--- a/src/runtime/defs_openbsd_arm.go
+++ b/src/runtime/defs_openbsd_arm.go
@@ -10,7 +10,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
@@ -23,7 +26,8 @@
 	_MAP_FIXED   = 0x10
 	_MAP_STACK   = 0x4000
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_openbsd_arm64.go b/src/runtime/defs_openbsd_arm64.go
index 5300ab0..745d0d3 100644
--- a/src/runtime/defs_openbsd_arm64.go
+++ b/src/runtime/defs_openbsd_arm64.go
@@ -11,7 +11,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
@@ -24,7 +27,8 @@
 	_MAP_FIXED   = 0x10
 	_MAP_STACK   = 0x4000
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_openbsd_mips64.go b/src/runtime/defs_openbsd_mips64.go
index a8789ef..1e469e4 100644
--- a/src/runtime/defs_openbsd_mips64.go
+++ b/src/runtime/defs_openbsd_mips64.go
@@ -17,7 +17,10 @@
 	_EFAULT = 0xe
 	_EAGAIN = 0x23
 
+	_O_WRONLY   = 0x1
 	_O_NONBLOCK = 0x4
+	_O_CREAT    = 0x200
+	_O_TRUNC    = 0x400
 	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
@@ -30,7 +33,8 @@
 	_MAP_FIXED   = 0x10
 	_MAP_STACK   = 0x4000
 
-	_MADV_FREE = 0x6
+	_MADV_DONTNEED = 0x4
+	_MADV_FREE     = 0x6
 
 	_SA_SIGINFO = 0x40
 	_SA_RESTART = 0x2
diff --git a/src/runtime/defs_solaris.go b/src/runtime/defs_solaris.go
index f626498..406304d 100644
--- a/src/runtime/defs_solaris.go
+++ b/src/runtime/defs_solaris.go
@@ -53,7 +53,8 @@
 	MAP_PRIVATE = C.MAP_PRIVATE
 	MAP_FIXED   = C.MAP_FIXED
 
-	MADV_FREE = C.MADV_FREE
+	MADV_DONTNEED = C.MADV_DONTNEED
+	MADV_FREE     = C.MADV_FREE
 
 	SA_SIGINFO = C.SA_SIGINFO
 	SA_RESTART = C.SA_RESTART
@@ -119,7 +120,10 @@
 
 	MAXHOSTNAMELEN = C.MAXHOSTNAMELEN
 
+	O_WRONLY   = C.O_WRONLY
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CREAT    = C.O_CREAT
+	O_TRUNC    = C.O_TRUNC
 	O_CLOEXEC  = C.O_CLOEXEC
 	FD_CLOEXEC = C.FD_CLOEXEC
 	F_GETFL    = C.F_GETFL
diff --git a/src/runtime/ehooks_test.go b/src/runtime/ehooks_test.go
new file mode 100644
index 0000000..ee286ec
--- /dev/null
+++ b/src/runtime/ehooks_test.go
@@ -0,0 +1,91 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"internal/platform"
+	"internal/testenv"
+	"os/exec"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+func TestExitHooks(t *testing.T) {
+	bmodes := []string{""}
+	if testing.Short() {
+		t.Skip("skipping due to -short")
+	}
+	// Note the HasCGO() test below; this is to prevent the test
+	// running if CGO_ENABLED=0 is in effect.
+	haverace := platform.RaceDetectorSupported(runtime.GOOS, runtime.GOARCH)
+	if haverace && testenv.HasCGO() {
+		bmodes = append(bmodes, "-race")
+	}
+	for _, bmode := range bmodes {
+		scenarios := []struct {
+			mode     string
+			expected string
+			musthave string
+		}{
+			{
+				mode:     "simple",
+				expected: "bar foo",
+				musthave: "",
+			},
+			{
+				mode:     "goodexit",
+				expected: "orange apple",
+				musthave: "",
+			},
+			{
+				mode:     "badexit",
+				expected: "blub blix",
+				musthave: "",
+			},
+			{
+				mode:     "panics",
+				expected: "",
+				musthave: "fatal error: internal error: exit hook invoked panic",
+			},
+			{
+				mode:     "callsexit",
+				expected: "",
+				musthave: "fatal error: internal error: exit hook invoked exit",
+			},
+		}
+
+		exe, err := buildTestProg(t, "testexithooks", bmode)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		bt := ""
+		if bmode != "" {
+			bt = " bmode: " + bmode
+		}
+		for _, s := range scenarios {
+			cmd := exec.Command(exe, []string{"-mode", s.mode}...)
+			out, _ := cmd.CombinedOutput()
+			outs := strings.ReplaceAll(string(out), "\n", " ")
+			outs = strings.TrimSpace(outs)
+			if s.expected != "" {
+				if s.expected != outs {
+					t.Logf("raw output: %q", outs)
+					t.Errorf("failed%s mode %s: wanted %q got %q", bt,
+						s.mode, s.expected, outs)
+				}
+			} else if s.musthave != "" {
+				if !strings.Contains(outs, s.musthave) {
+					t.Logf("raw output: %q", outs)
+					t.Errorf("failed mode %s: output does not contain %q",
+						s.mode, s.musthave)
+				}
+			} else {
+				panic("badly written scenario")
+			}
+		}
+	}
+}
diff --git a/src/runtime/env_plan9.go b/src/runtime/env_plan9.go
index 65480c8..d206c5d 100644
--- a/src/runtime/env_plan9.go
+++ b/src/runtime/env_plan9.go
@@ -17,7 +17,7 @@
 	nameOffset = 39
 )
 
-// Goenvs caches the Plan 9 environment variables at start of execution into
+// goenvs caches the Plan 9 environment variables at start of execution into
 // string array envs, to supply the initial contents for os.Environ.
 // Subsequent calls to os.Setenv will change this cache, without writing back
 // to the (possibly shared) Plan 9 environment, so that Setenv and Getenv
@@ -70,7 +70,7 @@
 	})
 }
 
-// Dofiles reads the directory opened with file descriptor fd, applying function f
+// dofiles reads the directory opened with file descriptor fd, applying function f
 // to each filename in it.
 //
 //go:nosplit
@@ -95,7 +95,7 @@
 	}
 }
 
-// Gdirname returns the first filename from a buffer of directory entries,
+// gdirname returns the first filename from a buffer of directory entries,
 // and a slice containing the remaining directory entries.
 // If the buffer doesn't start with a valid directory entry, the returned name is nil.
 //
@@ -117,7 +117,7 @@
 	return
 }
 
-// Gbit16 reads a 16-bit little-endian binary number from b and returns it
+// gbit16 reads a 16-bit little-endian binary number from b and returns it
 // with the remaining slice of b.
 //
 //go:nosplit
diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go
index 94a19d8..0eb4f0d 100644
--- a/src/runtime/env_posix.go
+++ b/src/runtime/env_posix.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build unix || (js && wasm) || windows || plan9
-
 package runtime
 
 import "unsafe"
@@ -48,10 +46,7 @@
 var _cgo_unsetenv unsafe.Pointer // pointer to C function
 
 // Update the C environment if cgo is loaded.
-// Called from syscall.Setenv.
-//
-//go:linkname syscall_setenv_c syscall.setenv_c
-func syscall_setenv_c(k string, v string) {
+func setenv_c(k string, v string) {
 	if _cgo_setenv == nil {
 		return
 	}
@@ -60,10 +55,7 @@
 }
 
 // Update the C environment if cgo is loaded.
-// Called from syscall.unsetenv.
-//
-//go:linkname syscall_unsetenv_c syscall.unsetenv_c
-func syscall_unsetenv_c(k string) {
+func unsetenv_c(k string) {
 	if _cgo_unsetenv == nil {
 		return
 	}
diff --git a/src/runtime/error.go b/src/runtime/error.go
index b11473c..a211fbf 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -151,7 +151,7 @@
 	boundsSlice3Acap: "slice bounds out of range [::%x] with capacity %y",
 	boundsSlice3B:    "slice bounds out of range [:%x:%y]",
 	boundsSlice3C:    "slice bounds out of range [%x:%y:]",
-	boundsConvert:    "cannot convert slice with length %y to pointer to array with length %x",
+	boundsConvert:    "cannot convert slice with length %y to array or pointer to array with length %x",
 }
 
 // boundsNegErrorFmts are overriding formats if x is negative. In this case there's no need to report y.
diff --git a/src/runtime/exithook.go b/src/runtime/exithook.go
new file mode 100644
index 0000000..bb29a94
--- /dev/null
+++ b/src/runtime/exithook.go
@@ -0,0 +1,69 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// addExitHook registers the specified function 'f' to be run at
+// program termination (e.g. when someone invokes os.Exit(), or when
+// main.main returns). Hooks are run in reverse order of registration:
+// the first hook added is the last one run.
+//
+// CAREFUL: the expectation is that addExitHook should only be called
+// from a safe context (e.g. not an error/panic path or signal
+// handler, preemption enabled, allocation allowed, write barriers
+// allowed, etc), and that the exit function 'f' will be invoked under
+// similar circumstances. That is to say, we are expecting that 'f'
+// uses normal / high-level Go code as opposed to one of the more
+// restricted dialects used for the trickier parts of the runtime.
+func addExitHook(f func(), runOnNonZeroExit bool) {
+	exitHooks.hooks = append(exitHooks.hooks, exitHook{f: f, runOnNonZeroExit: runOnNonZeroExit})
+}
+
+// exitHook stores a function to be run on program exit, registered
+// by the utility runtime.addExitHook.
+type exitHook struct {
+	f                func() // func to run
+	runOnNonZeroExit bool   // whether to run on non-zero exit code
+}
+
+// exitHooks stores state related to hook functions registered to
+// run when program execution terminates.
+var exitHooks struct {
+	hooks            []exitHook
+	runningExitHooks bool
+}
+
+// runExitHooks runs any registered exit hook functions (funcs
+// previously registered using runtime.addExitHook). Here 'exitCode'
+// is the status code being passed to os.Exit, or zero if the program
+// is terminating normally (without calling os.Exit).
+func runExitHooks(exitCode int) {
+	if exitHooks.runningExitHooks {
+		throw("internal error: exit hook invoked exit")
+	}
+	exitHooks.runningExitHooks = true
+
+	runExitHook := func(f func()) (caughtPanic bool) {
+		defer func() {
+			if x := recover(); x != nil {
+				caughtPanic = true
+			}
+		}()
+		f()
+		return
+	}
+
+	finishPageTrace()
+	for i := range exitHooks.hooks {
+		h := exitHooks.hooks[len(exitHooks.hooks)-i-1]
+		if exitCode != 0 && !h.runOnNonZeroExit {
+			continue
+		}
+		if caughtPanic := runExitHook(h.f); caughtPanic {
+			throw("internal error: exit hook invoked panic")
+		}
+	}
+	exitHooks.hooks = nil
+	exitHooks.runningExitHooks = false
+}
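The hook machinery above is internal to the runtime, but the behavior its comments describe (reverse registration order, runOnNonZeroExit filtering) can be illustrated with an ordinary, self-contained sketch; the names below are hypothetical, not the runtime's:

    package main

    import "fmt"

    type hook struct {
        f                func()
        runOnNonZeroExit bool
    }

    var hooks []hook

    func add(f func(), runOnNonZeroExit bool) {
        hooks = append(hooks, hook{f, runOnNonZeroExit})
    }

    // run executes hooks in reverse registration order, skipping those not
    // marked runOnNonZeroExit when the exit code is non-zero.
    func run(exitCode int) {
        for i := len(hooks) - 1; i >= 0; i-- {
            h := hooks[i]
            if exitCode != 0 && !h.runOnNonZeroExit {
                continue
            }
            h.f()
        }
    }

    func main() {
        add(func() { fmt.Println("first registered, runs last") }, true)
        add(func() { fmt.Println("second registered, runs first") }, false)
        run(0) // prints second, then first
    }
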
diff --git a/src/runtime/export_debug_test.go b/src/runtime/export_debug_test.go
index 09e9779..2d8a133 100644
--- a/src/runtime/export_debug_test.go
+++ b/src/runtime/export_debug_test.go
@@ -109,7 +109,7 @@
 	// a signal handler. Add the go:nowritebarrierrec annotation and restructure
 	// this to avoid write barriers.
 
-	switch h.gp.atomicstatus {
+	switch h.gp.atomicstatus.Load() {
 	case _Grunning:
 		if getg().m != h.mp {
 			println("trap on wrong M", getg().m, h.mp)
diff --git a/src/runtime/export_debuglog_test.go b/src/runtime/export_debuglog_test.go
index 1a9074e..c9dfdcb 100644
--- a/src/runtime/export_debuglog_test.go
+++ b/src/runtime/export_debuglog_test.go
@@ -25,11 +25,11 @@
 func (l *dlogger) PC(x uintptr) *dlogger { return l.pc(x) }
 
 func DumpDebugLog() string {
-	g := getg()
-	g.writebuf = make([]byte, 0, 1<<20)
+	gp := getg()
+	gp.writebuf = make([]byte, 0, 1<<20)
 	printDebugLog()
-	buf := g.writebuf
-	g.writebuf = nil
+	buf := gp.writebuf
+	gp.writebuf = nil
 
 	return string(buf)
 }
diff --git a/src/runtime/export_linux_test.go b/src/runtime/export_linux_test.go
index dea94a9..a441c0e 100644
--- a/src/runtime/export_linux_test.go
+++ b/src/runtime/export_linux_test.go
@@ -6,19 +6,17 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/syscall"
+)
 
 const SiginfoMaxSize = _si_max_size
 const SigeventMaxSize = _sigev_max_size
 
+var Closeonexec = syscall.CloseOnExec
 var NewOSProc0 = newosproc0
 var Mincore = mincore
 var Add = add
 
-type EpollEvent epollevent
 type Siginfo siginfo
 type Sigevent sigevent
-
-func Epollctl(epfd, op, fd int32, ev unsafe.Pointer) int32 {
-	return epollctl(epfd, op, fd, (*epollevent)(ev))
-}
diff --git a/src/runtime/export_openbsd_test.go b/src/runtime/export_openbsd_test.go
new file mode 100644
index 0000000..ef680dc
--- /dev/null
+++ b/src/runtime/export_openbsd_test.go
@@ -0,0 +1,15 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build openbsd && !mips64
+
+package runtime
+
+func Fcntl(fd, cmd, arg uintptr) (uintptr, uintptr) {
+	r := fcntl(int32(fd), int32(cmd), int32(arg))
+	if r < 0 {
+		return ^uintptr(0), uintptr(-r)
+	}
+	return uintptr(r), 0
+}
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 9639946..e7476e6 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -69,6 +69,9 @@
 func LFStackPop(head *uint64) *LFNode {
 	return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
 }
+func LFNodeValidate(node *LFNode) {
+	lfnodeValidate((*lfnode)(unsafe.Pointer(node)))
+}
 
 func Netpoll(delta int64) {
 	systemstack(func() {
@@ -84,23 +87,23 @@
 }
 
 func RunSchedLocalQueueTest() {
-	_p_ := new(p)
-	gs := make([]g, len(_p_.runq))
+	pp := new(p)
+	gs := make([]g, len(pp.runq))
 	Escape(gs) // Ensure gs doesn't move, since we use guintptrs
-	for i := 0; i < len(_p_.runq); i++ {
-		if g, _ := runqget(_p_); g != nil {
+	for i := 0; i < len(pp.runq); i++ {
+		if g, _ := runqget(pp); g != nil {
 			throw("runq is not empty initially")
 		}
 		for j := 0; j < i; j++ {
-			runqput(_p_, &gs[i], false)
+			runqput(pp, &gs[i], false)
 		}
 		for j := 0; j < i; j++ {
-			if g, _ := runqget(_p_); g != &gs[i] {
+			if g, _ := runqget(pp); g != &gs[i] {
 				print("bad element at iter ", i, "/", j, "\n")
 				throw("bad element")
 			}
 		}
-		if g, _ := runqget(_p_); g != nil {
+		if g, _ := runqget(pp); g != nil {
 			throw("runq is not empty afterwards")
 		}
 	}
@@ -362,6 +365,9 @@
 			if s.state.get() != mSpanInUse {
 				continue
 			}
+			if s.isUnusedUserArenaChunk() {
+				continue
+			}
 			if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
 				slow.Mallocs++
 				slow.Alloc += uint64(s.elemsize)
@@ -460,17 +466,17 @@
 }
 
 func LockOSCounts() (external, internal uint32) {
-	g := getg()
-	if g.m.lockedExt+g.m.lockedInt == 0 {
-		if g.lockedm != 0 {
+	gp := getg()
+	if gp.m.lockedExt+gp.m.lockedInt == 0 {
+		if gp.lockedm != 0 {
 			panic("lockedm on non-locked goroutine")
 		}
 	} else {
-		if g.lockedm == 0 {
+		if gp.lockedm == 0 {
 			panic("nil lockedm on locked goroutine")
 		}
 	}
-	return g.m.lockedExt, g.m.lockedInt
+	return gp.m.lockedExt, gp.m.lockedInt
 }
 
 //go:noinline
@@ -500,7 +506,10 @@
 // MapNextArenaHint reserves a page at the next arena growth hint,
 // preventing the arena from growing there, and returns the range of
 // addresses that are no longer viable.
-func MapNextArenaHint() (start, end uintptr) {
+//
+// This may fail to reserve memory. If it fails, it still returns the
+// address range it attempted to reserve.
+func MapNextArenaHint() (start, end uintptr, ok bool) {
 	hint := mheap_.arenaHints
 	addr := hint.addr
 	if hint.down {
@@ -509,7 +518,13 @@
 	} else {
 		start, end = addr, addr+heapArenaBytes
 	}
-	sysReserve(unsafe.Pointer(addr), physPageSize)
+	got := sysReserve(unsafe.Pointer(addr), physPageSize)
+	ok = (addr == uintptr(got))
+	if !ok {
+		// We were unable to get the requested reservation.
+		// Release what we did get and fail.
+		sysFreeOS(got, physPageSize)
+	}
 	return
 }
 
@@ -525,6 +540,12 @@
 	return getg()
 }
 
+func GIsWaitingOnMutex(gp *G) bool {
+	return readgstatus(gp) == _Gwaiting && gp.waitreason.isMutexWait()
+}
+
+var CasGStatusAlwaysTrack = &casgstatusAlwaysTrack
+
 //go:noinline
 func PanicForTesting(b []byte, i int) byte {
 	return unexportedPanicForTesting(b, i)
@@ -1164,7 +1185,7 @@
 
 func SemNwait(addr *uint32) uint32 {
 	root := semtable.rootFor(addr)
-	return atomic.Load(&root.nwait)
+	return root.nwait.Load()
 }
 
 const SemTableSize = semTabSize
@@ -1196,8 +1217,6 @@
 }
 
 // mspan wrapper for testing.
-//
-//go:notinheap
 type MSpan mspan
 
 // Allocate an mspan for testing.
@@ -1230,23 +1249,29 @@
 }
 
 const (
-	TimeHistSubBucketBits   = timeHistSubBucketBits
-	TimeHistNumSubBuckets   = timeHistNumSubBuckets
-	TimeHistNumSuperBuckets = timeHistNumSuperBuckets
+	TimeHistSubBucketBits = timeHistSubBucketBits
+	TimeHistNumSubBuckets = timeHistNumSubBuckets
+	TimeHistNumBuckets    = timeHistNumBuckets
+	TimeHistMinBucketBits = timeHistMinBucketBits
+	TimeHistMaxBucketBits = timeHistMaxBucketBits
 )
 
 type TimeHistogram timeHistogram
 
 // Counts returns the counts for the given bucket, subBucket indices.
 // Returns true if the bucket was valid, otherwise returns the counts
-// for the underflow bucket and false.
-func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) {
+// for the overflow bucket if bucket > 0 or the underflow bucket if
+// bucket < 0, and false.
+func (th *TimeHistogram) Count(bucket, subBucket int) (uint64, bool) {
 	t := (*timeHistogram)(th)
-	i := bucket*TimeHistNumSubBuckets + subBucket
-	if i >= uint(len(t.counts)) {
-		return t.underflow, false
+	if bucket < 0 {
+		return t.underflow.Load(), false
 	}
-	return t.counts[i], true
+	i := bucket*TimeHistNumSubBuckets + subBucket
+	if i >= len(t.counts) {
+		return t.overflow.Load(), false
+	}
+	return t.counts[i].Load(), true
 }
 
 func (th *TimeHistogram) Record(duration int64) {
@@ -1266,10 +1291,7 @@
 }
 
 func FinalizerGAsleep() bool {
-	lock(&finlock)
-	result := fingwait
-	unlock(&finlock)
-	return result
+	return fingStatus.Load()&fingWait != 0
 }
 
 // For GCTestMoveStackOnNextCall, it's important not to introduce an
@@ -1322,10 +1344,10 @@
 	if c.heapMarked > trigger {
 		trigger = c.heapMarked
 	}
-	c.maxStackScan = stackSize
-	c.globalsScan = globalsSize
-	c.heapLive = trigger
-	c.heapScan += uint64(float64(trigger-c.heapMarked) * scannableFrac)
+	c.maxStackScan.Store(stackSize)
+	c.globalsScan.Store(globalsSize)
+	c.heapLive.Store(trigger)
+	c.heapScan.Add(int64(float64(trigger-c.heapMarked) * scannableFrac))
 	c.startCycle(0, gomaxprocs, gcTrigger{kind: gcTriggerHeap})
 }
 
@@ -1338,7 +1360,7 @@
 }
 
 func (c *GCController) HeapLive() uint64 {
-	return c.heapLive
+	return c.heapLive.Load()
 }
 
 func (c *GCController) HeapMarked() uint64 {
@@ -1358,8 +1380,8 @@
 }
 
 func (c *GCController) Revise(d GCControllerReviseDelta) {
-	c.heapLive += uint64(d.HeapLive)
-	c.heapScan += uint64(d.HeapScan)
+	c.heapLive.Add(d.HeapLive)
+	c.heapScan.Add(d.HeapScan)
 	c.heapScanWork.Add(d.HeapScanWork)
 	c.stackScanWork.Add(d.StackScanWork)
 	c.globalsScanWork.Add(d.GlobalsScanWork)
@@ -1616,3 +1638,83 @@
 func (s *ScavengeIndex) Clear(ci ChunkIdx) {
 	s.i.clear(chunkIdx(ci))
 }
+
+const GTrackingPeriod = gTrackingPeriod
+
+var ZeroBase = unsafe.Pointer(&zerobase)
+
+const UserArenaChunkBytes = userArenaChunkBytes
+
+type UserArena struct {
+	arena *userArena
+}
+
+func NewUserArena() *UserArena {
+	return &UserArena{newUserArena()}
+}
+
+func (a *UserArena) New(out *any) {
+	i := efaceOf(out)
+	typ := i._type
+	if typ.kind&kindMask != kindPtr {
+		panic("new result of non-ptr type")
+	}
+	typ = (*ptrtype)(unsafe.Pointer(typ)).elem
+	i.data = a.arena.new(typ)
+}
+
+func (a *UserArena) Slice(sl any, cap int) {
+	a.arena.slice(sl, cap)
+}
+
+func (a *UserArena) Free() {
+	a.arena.free()
+}
+
+func GlobalWaitingArenaChunks() int {
+	n := 0
+	systemstack(func() {
+		lock(&mheap_.lock)
+		for s := mheap_.userArena.quarantineList.first; s != nil; s = s.next {
+			n++
+		}
+		unlock(&mheap_.lock)
+	})
+	return n
+}
+
+func UserArenaClone[T any](s T) T {
+	return arena_heapify(s).(T)
+}
+
+var AlignUp = alignUp
+
+// BlockUntilEmptyFinalizerQueue blocks until either the finalizer
+// queue is emptied (and the finalizers have executed) or the timeout
+// is reached. Returns true if the finalizer queue was emptied.
+func BlockUntilEmptyFinalizerQueue(timeout int64) bool {
+	start := nanotime()
+	for nanotime()-start < timeout {
+		lock(&finlock)
+		// We know the queue has been drained when both finq is nil
+		// and the finalizer g has stopped executing.
+		empty := finq == nil
+		empty = empty && readgstatus(fing) == _Gwaiting && fing.waitreason == waitReasonFinalizerWait
+		unlock(&finlock)
+		if empty {
+			return true
+		}
+		Gosched()
+	}
+	return false
+}
+
+func FrameStartLine(f *Frame) int {
+	return f.startLine
+}
+
+// PersistentAlloc allocates some memory that lives outside the Go heap.
+// This memory will never be freed; use sparingly.
+func PersistentAlloc(n uintptr) unsafe.Pointer {
+	return persistentalloc(n, 0, &memstats.other_sys)
+}
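BlockUntilEmptyFinalizerQueue above is a test-only export. Outside the runtime's own tests, the usual way to wait for a finalizer is to have it signal a channel, as in this illustrative sketch (not part of this change):

    package main

    import (
        "fmt"
        "runtime"
        "time"
    )

    func main() {
        done := make(chan struct{})
        obj := new([64]byte)
        runtime.SetFinalizer(obj, func(*[64]byte) { close(done) })
        obj = nil // drop the only reference

        // Trigger a collection so the now-unreachable object's finalizer is
        // queued; the finalizer goroutine then runs it asynchronously.
        runtime.GC()

        select {
        case <-done:
            fmt.Println("finalizer ran")
        case <-time.After(time.Second):
            fmt.Println("timed out waiting for finalizer")
        }
    }
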
diff --git a/src/runtime/export_unix2_test.go b/src/runtime/export_unix2_test.go
new file mode 100644
index 0000000..360565f
--- /dev/null
+++ b/src/runtime/export_unix2_test.go
@@ -0,0 +1,10 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix && !linux
+
+package runtime
+
+// for linux close-on-exec implemented in runtime/internal/syscall
+var Closeonexec = closeonexec
diff --git a/src/runtime/export_unix_test.go b/src/runtime/export_unix_test.go
index a548cf7..71a55d8 100644
--- a/src/runtime/export_unix_test.go
+++ b/src/runtime/export_unix_test.go
@@ -9,7 +9,6 @@
 import "unsafe"
 
 var NonblockingPipe = nonblockingPipe
-var Closeonexec = closeonexec
 
 func sigismember(mask *sigset, i int) bool {
 	clear := *mask
@@ -90,3 +89,9 @@
 func SendSigusr1(mp *M) {
 	signalM(mp, _SIGUSR1)
 }
+
+const (
+	O_WRONLY = _O_WRONLY
+	O_CREAT  = _O_CREAT
+	O_TRUNC  = _O_TRUNC
+)
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 15c519d..6c41c62 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -42,6 +42,12 @@
 	clobber the memory content of an object with bad content when it frees
 	the object.
 
+	cpu.*: cpu.all=off disables the use of all optional instruction set extensions.
+	cpu.extension=off disables the use of instructions from the specified instruction set extension.
+	extension is the lower-case name of the instruction set extension, such as sse41 or avx,
+	as listed in the internal/cpu package. For example, cpu.avx=off disables runtime detection
+	and thereby the use of AVX instructions.
+
 	cgocheck: setting cgocheck=0 disables all checks for packages
 	using cgo to incorrectly pass Go pointers to non-Go code.
 	Setting cgocheck=1 (the default) enables relatively cheap
@@ -73,7 +79,7 @@
 	error at each collection, summarizing the amount of memory collected and the
 	length of the pause. The format of this line is subject to change.
 	Currently, it is:
-		gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # P
+		gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # MB stacks, # MB globals, # P
 	where the fields are as follows:
 		gc #         the GC number, incremented at each GC
 		@#s          time in seconds since program start
@@ -112,12 +118,22 @@
 	madvdontneed: setting madvdontneed=0 will use MADV_FREE
 	instead of MADV_DONTNEED on Linux when returning memory to the
 	kernel. This is more efficient, but means RSS numbers will
-	drop only when the OS is under memory pressure.
+	drop only when the OS is under memory pressure. On the BSDs and
+	Illumos/Solaris, setting madvdontneed=1 will use MADV_DONTNEED instead
+	of MADV_FREE. This is less efficient, but causes RSS numbers to drop
+	more quickly.
 
 	memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate.
 	When set to 0 memory profiling is disabled.  Refer to the description of
 	MemProfileRate for the default value.
 
+	pagetrace: setting pagetrace=/path/to/file will write out a trace of page events
+	that can be viewed, analyzed, and visualized using the x/debug/cmd/pagetrace tool.
+	Build your program with GOEXPERIMENT=pagetrace to enable this functionality. Do not
+	enable this functionality if your program is a setuid binary as it introduces a security
+	risk in that scenario. Currently not supported on Windows, plan9 or js/wasm. Setting this
+	option for some applications can produce large traces, so use with care.
+
 	invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
 	copier to crash the program if an invalid pointer value (for example, 1)
 	is found in a pointer-typed location. Setting invalidptr=0 disables this check.
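
As a concrete illustration of the GODEBUG settings documented above, here is a hedged sketch that launches a placeholder binary with AVX detection disabled, MADV_DONTNEED enabled, and GC tracing turned on; the binary path is hypothetical.

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	// Placeholder binary; any Go program would do.
	cmd := exec.Command("./myprog")
	// Combine several GODEBUG settings with commas, as documented above.
	cmd.Env = append(os.Environ(), "GODEBUG=cpu.avx=off,madvdontneed=1,gctrace=1")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, "run failed:", err)
	}
}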
diff --git a/src/runtime/float.go b/src/runtime/float.go
index c80c8b7..9f281c4 100644
--- a/src/runtime/float.go
+++ b/src/runtime/float.go
@@ -24,12 +24,12 @@
 	return !isNaN(f) && !isFinite(f)
 }
 
-// Abs returns the absolute value of x.
+// abs returns the absolute value of x.
 //
 // Special cases are:
 //
-//	Abs(±Inf) = +Inf
-//	Abs(NaN) = NaN
+//	abs(±Inf) = +Inf
+//	abs(NaN) = NaN
 func abs(x float64) float64 {
 	const sign = 1 << 63
 	return float64frombits(float64bits(x) &^ sign)
@@ -42,12 +42,12 @@
 	return float64frombits(float64bits(x)&^sign | float64bits(y)&sign)
 }
 
-// Float64bits returns the IEEE 754 binary representation of f.
+// float64bits returns the IEEE 754 binary representation of f.
 func float64bits(f float64) uint64 {
 	return *(*uint64)(unsafe.Pointer(&f))
 }
 
-// Float64frombits returns the floating point number corresponding
+// float64frombits returns the floating point number corresponding
 // the IEEE 754 binary representation b.
 func float64frombits(b uint64) float64 {
 	return *(*float64)(unsafe.Pointer(&b))
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 122818f..0b2c972 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -689,7 +689,7 @@
 		time.Sleep(100 * time.Millisecond)
 		start := time.Now()
 		runtime.ReadMemStats(&ms)
-		latencies = append(latencies, time.Now().Sub(start))
+		latencies = append(latencies, time.Since(start))
 	}
 	// Make sure to stop the timer before we wait! The load created above
 	// is very heavy-weight and not easy to stop, so we could end up
diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go
index e726006..d4a2b3f 100644
--- a/src/runtime/hash_test.go
+++ b/src/runtime/hash_test.go
@@ -6,6 +6,7 @@
 
 import (
 	"fmt"
+	"internal/race"
 	"math"
 	"math/rand"
 	. "runtime"
@@ -125,6 +126,9 @@
 
 // All 0-3 byte strings have distinct hashes.
 func TestSmhasherSmallKeys(t *testing.T) {
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	h := newHashSet()
 	var b [3]byte
 	for i := 0; i < 256; i++ {
@@ -166,6 +170,9 @@
 	if testing.Short() {
 		t.Skip("Skipping in short mode")
 	}
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	h := newHashSet()
 	for n := 2; n <= 16; n++ {
 		twoNonZero(h, n)
@@ -208,6 +215,9 @@
 	if testing.Short() {
 		t.Skip("Skipping in short mode")
 	}
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	r := rand.New(rand.NewSource(1234))
 	const REPEAT = 8
 	const N = 1000000
@@ -275,6 +285,9 @@
 	if testing.Short() {
 		t.Skip("Skipping in short mode")
 	}
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
 	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
 	permutation(t, []uint32{0, 1}, 20)
@@ -447,6 +460,9 @@
 	if testing.Short() {
 		t.Skip("Skipping in short mode")
 	}
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	avalancheTest1(t, &BytesKey{make([]byte, 2)})
 	avalancheTest1(t, &BytesKey{make([]byte, 4)})
 	avalancheTest1(t, &BytesKey{make([]byte, 8)})
@@ -514,6 +530,9 @@
 
 // All bit rotations of a set of distinct keys
 func TestSmhasherWindowed(t *testing.T) {
+	if race.Enabled {
+		t.Skip("Too long for race mode")
+	}
 	t.Logf("32 bit keys")
 	windowed(t, &Int32Key{})
 	t.Logf("64 bit keys")
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index c7f2b7a..f57a1a1 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -120,7 +120,7 @@
 
 var typecache [typeCacheBuckets]typeCacheBucket
 
-// dump a uint64 in a varint format parseable by encoding/binary
+// dump a uint64 in a varint format parseable by encoding/binary.
 func dumpint(v uint64) {
 	var buf [10]byte
 	var n int
@@ -142,7 +142,7 @@
 	}
 }
 
-// dump varint uint64 length followed by memory contents
+// dump varint uint64 length followed by memory contents.
 func dumpmemrange(data unsafe.Pointer, len uintptr) {
 	dumpint(uint64(len))
 	dwrite(data, len)
@@ -156,11 +156,10 @@
 }
 
 func dumpstr(s string) {
-	sp := stringStructOf(&s)
-	dumpmemrange(sp.str, uintptr(sp.len))
+	dumpmemrange(unsafe.Pointer(unsafe.StringData(s)), uintptr(len(s)))
 }
 
-// dump information for a type
+// dump information for a type.
 func dumptype(t *_type) {
 	if t == nil {
 		return
@@ -197,19 +196,17 @@
 	if x := t.uncommon(); x == nil || t.nameOff(x.pkgpath).name() == "" {
 		dumpstr(t.string())
 	} else {
-		pkgpathstr := t.nameOff(x.pkgpath).name()
-		pkgpath := stringStructOf(&pkgpathstr)
-		namestr := t.name()
-		name := stringStructOf(&namestr)
-		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
-		dwrite(pkgpath.str, uintptr(pkgpath.len))
+		pkgpath := t.nameOff(x.pkgpath).name()
+		name := t.name()
+		dumpint(uint64(uintptr(len(pkgpath)) + 1 + uintptr(len(name))))
+		dwrite(unsafe.Pointer(unsafe.StringData(pkgpath)), uintptr(len(pkgpath)))
 		dwritebyte('.')
-		dwrite(name.str, uintptr(name.len))
+		dwrite(unsafe.Pointer(unsafe.StringData(name)), uintptr(len(name)))
 	}
 	dumpbool(t.kind&kindDirectIface == 0 || t.ptrdata != 0)
 }
 
-// dump an object
+// dump an object.
 func dumpobj(obj unsafe.Pointer, size uintptr, bv bitvector) {
 	dumpint(tagObject)
 	dumpint(uint64(uintptr(obj)))
@@ -242,7 +239,7 @@
 	depth  uintptr   // depth in call stack (0 == most recent)
 }
 
-// dump kinds & offsets of interesting fields in bv
+// dump kinds & offsets of interesting fields in bv.
 func dumpbv(cbv *bitvector, offset uintptr) {
 	for i := uintptr(0); i < uintptr(cbv.n); i++ {
 		if cbv.ptrbit(i) == 1 {
@@ -327,7 +324,7 @@
 
 	// Record arg info for parent.
 	child.argoff = s.argp - s.fp
-	child.arglen = s.arglen
+	child.arglen = s.argBytes()
 	child.sp = (*uint8)(unsafe.Pointer(s.sp))
 	child.depth++
 	stkmap = (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
@@ -354,7 +351,7 @@
 	dumpint(tagGoroutine)
 	dumpint(uint64(uintptr(unsafe.Pointer(gp))))
 	dumpint(uint64(sp))
-	dumpint(uint64(gp.goid))
+	dumpint(gp.goid)
 	dumpint(uint64(gp.gopc))
 	dumpint(uint64(readgstatus(gp)))
 	dumpbool(isSystemGoroutine(gp, false))
@@ -693,9 +690,8 @@
 func writeheapdump_m(fd uintptr, m *MemStats) {
 	assertWorldStopped()
 
-	_g_ := getg()
-	casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
-	_g_.waitreason = waitReasonDumpingHeap
+	gp := getg()
+	casGToWaiting(gp.m.curg, _Grunning, waitReasonDumpingHeap)
 
 	// Set dump file.
 	dumpfd = fd
@@ -710,7 +706,7 @@
 		tmpbuf = nil
 	}
 
-	casgstatus(_g_.m.curg, _Gwaiting, _Grunning)
+	casgstatus(gp.m.curg, _Gwaiting, _Grunning)
 }
 
 // dumpint() the kind & offset of each field in an object.
@@ -737,16 +733,16 @@
 	for i := uintptr(0); i < nptr/8+1; i++ {
 		tmpbuf[i] = 0
 	}
-	i := uintptr(0)
-	hbits := heapBitsForAddr(p)
-	for ; i < nptr; i++ {
-		if !hbits.morePointers() {
-			break // end of object
+
+	hbits := heapBitsForAddr(p, size)
+	for {
+		var addr uintptr
+		hbits, addr = hbits.next()
+		if addr == 0 {
+			break
 		}
-		if hbits.isPointer() {
-			tmpbuf[i/8] |= 1 << (i % 8)
-		}
-		hbits = hbits.next()
+		i := (addr - p) / goarch.PtrSize
+		tmpbuf[i/8] |= 1 << (i % 8)
 	}
-	return bitvector{int32(i), &tmpbuf[0]}
+	return bitvector{int32(nptr), &tmpbuf[0]}
 }
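
The dumpstr/dumptype changes above replace the internal stringStructOf helper with unsafe.StringData (new in Go 1.20). A minimal standalone sketch of what that accessor yields, in ordinary user code rather than runtime code:

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	s := "runtime"
	// unsafe.StringData returns a pointer to the string's backing bytes;
	// unsafe.Slice turns it into a byte slice of the same length.
	// The bytes must be treated as read-only.
	b := unsafe.Slice(unsafe.StringData(s), len(s))
	fmt.Println(b) // [114 117 110 116 105 109 101]
}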
diff --git a/src/runtime/histogram.go b/src/runtime/histogram.go
index eddfbab..43dfe61 100644
--- a/src/runtime/histogram.go
+++ b/src/runtime/histogram.go
@@ -12,72 +12,88 @@
 
 const (
 	// For the time histogram type, we use an HDR histogram.
-	// Values are placed in super-buckets based solely on the most
-	// significant set bit. Thus, super-buckets are power-of-2 sized.
+	// Values are placed in buckets based solely on the most
+	// significant set bit. Thus, buckets are power-of-2 sized.
 	// Values are then placed into sub-buckets based on the value of
 	// the next timeHistSubBucketBits most significant bits. Thus,
-	// sub-buckets are linear within a super-bucket.
+	// sub-buckets are linear within a bucket.
 	//
 	// Therefore, the number of sub-buckets (timeHistNumSubBuckets)
 	// defines the error. This error may be computed as
 	// 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets
-	// per super-bucket the error is approximately 6%.
+	// per bucket the error is approximately 6%.
 	//
-	// The number of super-buckets (timeHistNumSuperBuckets), on the
-	// other hand, defines the range. To reserve room for sub-buckets,
-	// bit timeHistSubBucketBits is the first bit considered for
-	// super-buckets, so super-bucket indices are adjusted accordingly.
+	// The number of buckets (timeHistNumBuckets), on the
+	// other hand, defines the range. To avoid producing a large number
+	// of buckets that are close together, especially for small numbers
+	// (e.g. 1, 2, 3, 4, 5 ns) that aren't very useful, timeHistNumBuckets
+	// is defined in terms of the least significant bit (timeHistMinBucketBits)
+	// that needs to be set before we start bucketing and the most
+	// significant bit (timeHistMaxBucketBits) that we bucket before we just
+	// dump it into a catch-all bucket.
 	//
-	// As an example, consider 45 super-buckets with 16 sub-buckets.
+	// As an example, consider the configuration:
 	//
-	//    00110
-	//    ^----
-	//    │  ^
-	//    │  └---- Lowest 4 bits -> sub-bucket 6
-	//    └------- Bit 4 unset -> super-bucket 0
+	//    timeHistMinBucketBits = 9
+	//    timeHistMaxBucketBits = 48
+	//    timeHistSubBucketBits = 2
 	//
-	//    10110
-	//    ^----
-	//    │  ^
-	//    │  └---- Next 4 bits -> sub-bucket 6
-	//    └------- Bit 4 set -> super-bucket 1
-	//    100010
-	//    ^----^
-	//    │  ^ └-- Lower bits ignored
-	//    │  └---- Next 4 bits -> sub-bucket 1
-	//    └------- Bit 5 set -> super-bucket 2
+	// Then:
 	//
-	// Following this pattern, super-bucket 44 will have the bit 47 set. We don't
-	// have any buckets for higher values, so the highest sub-bucket will
-	// contain values of 2^48-1 nanoseconds or approx. 3 days. This range is
-	// more than enough to handle durations produced by the runtime.
-	timeHistSubBucketBits   = 4
-	timeHistNumSubBuckets   = 1 << timeHistSubBucketBits
-	timeHistNumSuperBuckets = 45
-	timeHistTotalBuckets    = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1
+	//    011000001
+	//    ^--
+	//    │ ^
+	//    │ └---- Next 2 bits -> sub-bucket 3
+	//    └------- Bit 9 unset -> bucket 0
+	//
+	//    110000001
+	//    ^--
+	//    │ ^
+	//    │ └---- Next 2 bits -> sub-bucket 2
+	//    └------- Bit 9 set -> bucket 1
+	//
+	//    1000000010
+	//    ^-- ^
+	//    │ ^ └-- Lower bits ignored
+	//    │ └---- Next 2 bits -> sub-bucket 0
+	//    └------- Bit 10 set -> bucket 2
+	//
+	// Following this pattern, bucket 38 will have the bit 46 set. We don't
+	// have any buckets for higher values, so we spill the rest into an overflow
+	// bucket containing values of 2^47-1 nanoseconds or approx. 1 day or more.
+	// This range is more than enough to handle durations produced by the runtime.
+	timeHistMinBucketBits = 9
+	timeHistMaxBucketBits = 48 // Note that this is exclusive; 1 higher than the actual range.
+	timeHistSubBucketBits = 2
+	timeHistNumSubBuckets = 1 << timeHistSubBucketBits
+	timeHistNumBuckets    = timeHistMaxBucketBits - timeHistMinBucketBits + 1
+	// Two extra buckets, one for underflow, one for overflow.
+	timeHistTotalBuckets = timeHistNumBuckets*timeHistNumSubBuckets + 2
 )
 
 // timeHistogram represents a distribution of durations in
 // nanoseconds.
 //
 // The accuracy and range of the histogram is defined by the
-// timeHistSubBucketBits and timeHistNumSuperBuckets constants.
+// timeHistSubBucketBits and timeHistNumBuckets constants.
 //
 // It is an HDR histogram with exponentially-distributed
 // buckets and linearly distributed sub-buckets.
 //
-// Counts in the histogram are updated atomically, so it is safe
-// for concurrent use. It is also safe to read all the values
-// atomically.
+// The histogram is safe for concurrent reads and writes.
 type timeHistogram struct {
-	counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64
+	counts [timeHistNumBuckets * timeHistNumSubBuckets]atomic.Uint64
 
 	// underflow counts all the times we got a negative duration
 	// sample. Because of how time works on some platforms, it's
 	// possible to measure negative durations. We could ignore them,
 	// but we record them anyway because it's better to have some
 	// signal that it's happening than just missing samples.
-	underflow uint64
+	underflow atomic.Uint64
+
+	// overflow counts all the times we got a duration that exceeded
+	// the range counts represents.
+	overflow atomic.Uint64
 }
 
 // record adds the given duration to the distribution.
@@ -87,36 +103,35 @@
 //
 //go:nosplit
 func (h *timeHistogram) record(duration int64) {
+	// If the duration is negative, capture that in underflow.
 	if duration < 0 {
-		atomic.Xadd64(&h.underflow, 1)
+		h.underflow.Add(1)
 		return
 	}
-	// The index of the exponential bucket is just the index
-	// of the highest set bit adjusted for how many bits we
-	// use for the subbucket. Note that it's timeHistSubBucketsBits-1
-	// because we use the 0th bucket to hold values < timeHistNumSubBuckets.
-	var superBucket, subBucket uint
-	if duration >= timeHistNumSubBuckets {
-		// At this point, we know the duration value will always be
-		// at least timeHistSubBucketsBits long.
-		superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits
-		if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) {
-			// The bucket index we got is larger than what we support, so
-			// include this count in the highest bucket, which extends to
-			// infinity.
-			superBucket = timeHistNumSuperBuckets - 1
-			subBucket = timeHistNumSubBuckets - 1
-		} else {
-			// The linear subbucket index is just the timeHistSubBucketsBits
-			// bits after the top bit. To extract that value, shift down
-			// the duration such that we leave the top bit and the next bits
-			// intact, then extract the index.
-			subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets)
-		}
+	// bucketBit is the target bit for the bucket which is usually the
+	// highest 1 bit, but if we're less than the minimum, is the highest
+	// 1 bit of the minimum (which will be zero in the duration).
+	//
+	// bucket is the bucket index, which is the bucketBit minus the
+	// highest bit of the minimum, plus one to leave room for the catch-all
+	// bucket for samples lower than the minimum.
+	var bucketBit, bucket uint
+	if l := sys.Len64(uint64(duration)); l < timeHistMinBucketBits {
+		bucketBit = timeHistMinBucketBits
+		bucket = 0 // bucketBit - timeHistMinBucketBits
 	} else {
-		subBucket = uint(duration)
+		bucketBit = uint(l)
+		bucket = bucketBit - timeHistMinBucketBits + 1
 	}
-	atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1)
+	// If the bucket we computed is greater than the number of buckets,
+	// count that in overflow.
+	if bucket >= timeHistNumBuckets {
+		h.overflow.Add(1)
+		return
+	}
+	// The sub-bucket index is just the next timeHistSubBucketBits bits after the bucketBit.
+	subBucket := uint(duration>>(bucketBit-1-timeHistSubBucketBits)) % timeHistNumSubBuckets
+	h.counts[bucket*timeHistNumSubBuckets+subBucket].Add(1)
 }
 
 const (
@@ -139,33 +154,37 @@
 // not nanoseconds like the timeHistogram represents durations.
 func timeHistogramMetricsBuckets() []float64 {
 	b := make([]float64, timeHistTotalBuckets+1)
+	// Underflow bucket.
 	b[0] = float64NegInf()
-	// Super-bucket 0 has no bits above timeHistSubBucketBits
-	// set, so just iterate over each bucket and assign the
-	// incrementing bucket.
-	for i := 0; i < timeHistNumSubBuckets; i++ {
-		bucketNanos := uint64(i)
-		b[i+1] = float64(bucketNanos) / 1e9
+
+	for j := 0; j < timeHistNumSubBuckets; j++ {
+		// No bucket bit for the first few buckets. Just sub-bucket bits after the
+		// min bucket bit.
+		bucketNanos := uint64(j) << (timeHistMinBucketBits - 1 - timeHistSubBucketBits)
+		// Convert nanoseconds to seconds via a division.
+		// These values will all be exactly representable by a float64.
+		b[j+1] = float64(bucketNanos) / 1e9
 	}
-	// Generate the rest of the super-buckets. It's easier to reason
-	// about if we cut out the 0'th bucket, so subtract one since
-	// we just handled that bucket.
-	for i := 0; i < timeHistNumSuperBuckets-1; i++ {
+	// Generate the rest of the buckets. It's easier to reason
+	// about if we cut out the 0'th bucket.
+	for i := timeHistMinBucketBits; i < timeHistMaxBucketBits; i++ {
 		for j := 0; j < timeHistNumSubBuckets; j++ {
-			// Set the super-bucket bit.
-			bucketNanos := uint64(1) << (i + timeHistSubBucketBits)
+			// Set the bucket bit.
+			bucketNanos := uint64(1) << (i - 1)
 			// Set the sub-bucket bits.
-			bucketNanos |= uint64(j) << i
-			// The index for this bucket is going to be the (i+1)'th super bucket
-			// (note that we're starting from zero, but handled the first super-bucket
+			bucketNanos |= uint64(j) << (i - 1 - timeHistSubBucketBits)
+			// The index for this bucket is going to be the (i+1)'th bucket
+			// (note that we're starting from zero, but handled the first bucket
 			// earlier, so we need to compensate), and the j'th sub bucket.
 			// Add 1 because we left space for -Inf.
-			bucketIndex := (i+1)*timeHistNumSubBuckets + j + 1
+			bucketIndex := (i-timeHistMinBucketBits+1)*timeHistNumSubBuckets + j + 1
 			// Convert nanoseconds to seconds via a division.
 			// These values will all be exactly representable by a float64.
 			b[bucketIndex] = float64(bucketNanos) / 1e9
 		}
 	}
+	// Overflow bucket.
+	b[len(b)-2] = float64(uint64(1)<<(timeHistMaxBucketBits-1)) / 1e9
 	b[len(b)-1] = float64Inf()
 	return b
 }
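
To make the new bucketing scheme concrete, here is a small standalone sketch of the arithmetic in timeHistogram.record, using local copies of the constants above and math/bits in place of runtime/internal/sys; it is illustrative only.

package main

import (
	"fmt"
	"math/bits"
)

// Local copies of the constants introduced above, for illustration only.
const (
	minBucketBits = 9
	maxBucketBits = 48
	subBucketBits = 2
	numSubBuckets = 1 << subBucketBits
	numBuckets    = maxBucketBits - minBucketBits + 1
)

// bucketOf mirrors the arithmetic in timeHistogram.record: durations below
// 2^(minBucketBits-1) ns land in bucket 0, durations of 2^(maxBucketBits-1) ns
// and above go to overflow, and everything else gets a power-of-two bucket
// with a linear sub-bucket.
func bucketOf(duration int64) (bucket, subBucket int, overflow bool) {
	var bucketBit int
	if l := bits.Len64(uint64(duration)); l < minBucketBits {
		bucketBit = minBucketBits
		bucket = 0
	} else {
		bucketBit = l
		bucket = bucketBit - minBucketBits + 1
	}
	if bucket >= numBuckets {
		return 0, 0, true
	}
	subBucket = int(uint64(duration)>>(bucketBit-1-subBucketBits)) % numSubBuckets
	return bucket, subBucket, false
}

func main() {
	for _, d := range []int64{0, 100, 255, 256, 512, 1 << 20, 1 << 47} {
		b, sb, of := bucketOf(d)
		fmt.Printf("%12d ns -> bucket %2d, sub-bucket %d, overflow %v\n", d, b, sb, of)
	}
}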
diff --git a/src/runtime/histogram_test.go b/src/runtime/histogram_test.go
index b12b65a..5246e86 100644
--- a/src/runtime/histogram_test.go
+++ b/src/runtime/histogram_test.go
@@ -20,50 +20,54 @@
 	h := &dummyTimeHistogram
 
 	// Record exactly one sample in each bucket.
-	for i := 0; i < TimeHistNumSuperBuckets; i++ {
-		var base int64
-		if i > 0 {
-			base = int64(1) << (i + TimeHistSubBucketBits - 1)
-		}
-		for j := 0; j < TimeHistNumSubBuckets; j++ {
-			v := int64(j)
-			if i > 0 {
-				v <<= i - 1
-			}
-			h.Record(base + v)
+	for j := 0; j < TimeHistNumSubBuckets; j++ {
+		v := int64(j) << (TimeHistMinBucketBits - 1 - TimeHistSubBucketBits)
+		for k := 0; k < j; k++ {
+			// Record a number of times equal to the bucket index.
+			h.Record(v)
 		}
 	}
-	// Hit the underflow bucket.
+	for i := TimeHistMinBucketBits; i < TimeHistMaxBucketBits; i++ {
+		base := int64(1) << (i - 1)
+		for j := 0; j < TimeHistNumSubBuckets; j++ {
+			v := int64(j) << (i - 1 - TimeHistSubBucketBits)
+			for k := 0; k < (i+1-TimeHistMinBucketBits)*TimeHistNumSubBuckets+j; k++ {
+				// Record a number of times equal to the bucket index.
+				h.Record(base + v)
+			}
+		}
+	}
+	// Hit the underflow and overflow buckets.
 	h.Record(int64(-1))
+	h.Record(math.MaxInt64)
+	h.Record(math.MaxInt64)
 
 	// Check to make sure there's exactly one count in each
 	// bucket.
-	for i := uint(0); i < TimeHistNumSuperBuckets; i++ {
-		for j := uint(0); j < TimeHistNumSubBuckets; j++ {
+	for i := 0; i < TimeHistNumBuckets; i++ {
+		for j := 0; j < TimeHistNumSubBuckets; j++ {
 			c, ok := h.Count(i, j)
 			if !ok {
-				t.Errorf("hit underflow bucket unexpectedly: (%d, %d)", i, j)
-			} else if c != 1 {
-				t.Errorf("bucket (%d, %d) has count that is not 1: %d", i, j, c)
+				t.Errorf("unexpected invalid bucket: (%d, %d)", i, j)
+			} else if idx := uint64(i*TimeHistNumSubBuckets + j); c != idx {
+				t.Errorf("bucket (%d, %d) has count that is not %d: %d", i, j, idx, c)
 			}
 		}
 	}
-	c, ok := h.Count(TimeHistNumSuperBuckets, 0)
+	c, ok := h.Count(-1, 0)
 	if ok {
-		t.Errorf("expected to hit underflow bucket: (%d, %d)", TimeHistNumSuperBuckets, 0)
+		t.Errorf("expected to hit underflow bucket: (%d, %d)", -1, 0)
 	}
 	if c != 1 {
-		t.Errorf("underflow bucket has count that is not 1: %d", c)
+		t.Errorf("underflow bucket has count that is not 1: %d", c)
 	}
 
-	// Check overflow behavior.
-	// By hitting a high value, we should just be adding into the highest bucket.
-	h.Record(math.MaxInt64)
-	c, ok = h.Count(TimeHistNumSuperBuckets-1, TimeHistNumSubBuckets-1)
-	if !ok {
-		t.Error("hit underflow bucket in highest bucket unexpectedly")
-	} else if c != 2 {
-		t.Errorf("highest has count that is not 2: %d", c)
+	c, ok = h.Count(TimeHistNumBuckets+1, 0)
+	if ok {
+		t.Errorf("expected to hit overflow bucket: (%d, %d)", TimeHistNumBuckets+1, 0)
+	}
+	if c != 2 {
+		t.Errorf("overflow bucket has count that is not 2: %d", c)
 	}
 
 	dummyTimeHistogram = TimeHistogram{}
@@ -72,34 +76,32 @@
 func TestTimeHistogramMetricsBuckets(t *testing.T) {
 	buckets := TimeHistogramMetricsBuckets()
 
-	nonInfBucketsLen := TimeHistNumSubBuckets * TimeHistNumSuperBuckets
-	expBucketsLen := nonInfBucketsLen + 2 // Count -Inf and +Inf.
+	nonInfBucketsLen := TimeHistNumSubBuckets * TimeHistNumBuckets
+	expBucketsLen := nonInfBucketsLen + 3 // Count -Inf, the edge for the overflow bucket, and +Inf.
 	if len(buckets) != expBucketsLen {
 		t.Fatalf("unexpected length of buckets: got %d, want %d", len(buckets), expBucketsLen)
 	}
-	// Check the first non-Inf 2*TimeHistNumSubBuckets buckets in order, skipping the
-	// first bucket which should be -Inf (checked later).
-	//
-	// Because of the way this scheme works, the bottom TimeHistNumSubBuckets
-	// buckets are fully populated, and then the next TimeHistNumSubBuckets
-	// have the TimeHistSubBucketBits'th bit set, while the bottom are once
-	// again fully populated.
-	for i := 1; i <= 2*TimeHistNumSubBuckets+1; i++ {
-		if got, want := buckets[i], float64(i-1)/1e9; got != want {
-			t.Errorf("expected bucket %d to have value %e, got %e", i, want, got)
-		}
-	}
 	// Check some values.
 	idxToBucket := map[int]float64{
 		0:                 math.Inf(-1),
-		33:                float64(0x10<<1) / 1e9,
-		34:                float64(0x11<<1) / 1e9,
-		49:                float64(0x10<<2) / 1e9,
-		58:                float64(0x19<<2) / 1e9,
-		65:                float64(0x10<<3) / 1e9,
-		513:               float64(0x10<<31) / 1e9,
-		519:               float64(0x16<<31) / 1e9,
-		expBucketsLen - 2: float64(0x1f<<43) / 1e9,
+		1:                 0.0,
+		2:                 float64(0x040) / 1e9,
+		3:                 float64(0x080) / 1e9,
+		4:                 float64(0x0c0) / 1e9,
+		5:                 float64(0x100) / 1e9,
+		6:                 float64(0x140) / 1e9,
+		7:                 float64(0x180) / 1e9,
+		8:                 float64(0x1c0) / 1e9,
+		9:                 float64(0x200) / 1e9,
+		10:                float64(0x280) / 1e9,
+		11:                float64(0x300) / 1e9,
+		12:                float64(0x380) / 1e9,
+		13:                float64(0x400) / 1e9,
+		15:                float64(0x600) / 1e9,
+		81:                float64(0x8000000) / 1e9,
+		82:                float64(0xa000000) / 1e9,
+		108:               float64(0x380000000) / 1e9,
+		expBucketsLen - 2: float64(0x1<<47) / 1e9,
 		expBucketsLen - 1: math.Inf(1),
 	}
 	for idx, bucket := range idxToBucket {
diff --git a/src/runtime/internal/atomic/atomic_loong64.go b/src/runtime/internal/atomic/atomic_loong64.go
index 908a7d6..d82a5b8 100644
--- a/src/runtime/internal/atomic/atomic_loong64.go
+++ b/src/runtime/internal/atomic/atomic_loong64.go
@@ -42,6 +42,9 @@
 func LoadAcq(ptr *uint32) uint32
 
 //go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
 func LoadAcquintptr(ptr *uintptr) uintptr
 
 //go:noescape
@@ -80,4 +83,7 @@
 func StoreRel(ptr *uint32, val uint32)
 
 //go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
 func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_loong64.s b/src/runtime/internal/atomic/atomic_loong64.s
index bfb6c7e..3d802be 100644
--- a/src/runtime/internal/atomic/atomic_loong64.s
+++ b/src/runtime/internal/atomic/atomic_loong64.s
@@ -156,6 +156,9 @@
 TEXT ·StoreRel(SB), NOSPLIT, $0-12
 	JMP	·Store(SB)
 
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
 TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
 	JMP     ·Store64(SB)
 
@@ -293,6 +296,10 @@
 TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
 	JMP	atomic·Load(SB)
 
+// uint64 ·LoadAcq64(uint64 volatile* ptr)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	atomic·Load64(SB)
+
 // uintptr ·LoadAcquintptr(uintptr volatile* ptr)
 TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
 	JMP     atomic·Load64(SB)
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
index 2ae60b8..2427bfd 100644
--- a/src/runtime/internal/atomic/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -345,6 +345,36 @@
 	}
 }
 
+func TestCasRel(t *testing.T) {
+	const _magic = 0x5a5aa5a5
+	var x struct {
+		before uint32
+		i      uint32
+		after  uint32
+		o      uint32
+		n      uint32
+	}
+
+	x.before = _magic
+	x.after = _magic
+	for j := 0; j < 32; j += 1 {
+		x.i = (1 << j) + 0
+		x.o = (1 << j) + 0
+		x.n = (1 << j) + 1
+		if !atomic.CasRel(&x.i, x.o, x.n) {
+			t.Fatalf("should have swapped %#x %#x", x.o, x.n)
+		}
+
+		if x.i != x.n {
+			t.Fatalf("wrong x.i after swap: x.i=%#x x.n=%#x", x.i, x.n)
+		}
+
+		if x.before != _magic || x.after != _magic {
+			t.Fatalf("wrong magic: %#x _ %#x != %#x _ %#x", x.before, x.after, _magic, _magic)
+		}
+	}
+}
+
 func TestStorepNoWB(t *testing.T) {
 	var p [2]*int
 	for i := range p {
diff --git a/src/runtime/internal/atomic/sys_linux_arm.s b/src/runtime/internal/atomic/sys_linux_arm.s
index 0cc7fa7..9225df8 100644
--- a/src/runtime/internal/atomic/sys_linux_arm.s
+++ b/src/runtime/internal/atomic/sys_linux_arm.s
@@ -15,9 +15,6 @@
 //	LR = return address
 // The function returns with CS true if the swap happened.
 // http://lxr.linux.no/linux+v2.6.37.2/arch/arm/kernel/entry-armv.S#L850
-// On older kernels (before 2.6.24) the function can incorrectly
-// report a conflict, so we have to double-check the compare ourselves
-// and retry if necessary.
 //
 // https://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=b49c0f24cf6744a3f4fd09289fe7cade349dead5
 //
@@ -37,20 +34,13 @@
 	// because we don't know how to traceback through __kuser_cmpxchg
 	MOVW    (R2), R0
 	MOVW	old+4(FP), R0
-loop:
 	MOVW	new+8(FP), R1
 	BL	cas<>(SB)
-	BCC	check
+	BCC	ret0
 	MOVW	$1, R0
 	MOVB	R0, ret+12(FP)
 	RET
-check:
-	// Kernel lies; double-check.
-	MOVW	ptr+0(FP), R2
-	MOVW	old+4(FP), R0
-	MOVW	0(R2), R3
-	CMP	R0, R3
-	BEQ	loop
+ret0:
 	MOVW	$0, R0
 	MOVB	R0, ret+12(FP)
 	RET
diff --git a/src/runtime/internal/atomic/types.go b/src/runtime/internal/atomic/types.go
index d346a76..0d75226 100644
--- a/src/runtime/internal/atomic/types.go
+++ b/src/runtime/internal/atomic/types.go
@@ -15,25 +15,32 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (i *Int32) Load() int32 {
 	return Loadint32(&i.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (i *Int32) Store(value int32) {
 	Storeint32(&i.value, value)
 }
 
 // CompareAndSwap atomically compares i's value with old,
 // and if they're equal, swaps i's value with new.
+// It reports whether the swap ran.
 //
-// Returns true if the operation succeeded.
+//go:nosplit
 func (i *Int32) CompareAndSwap(old, new int32) bool {
 	return Casint32(&i.value, old, new)
 }
 
 // Swap replaces i's value with new, returning
 // i's value before the replacement.
+//
+//go:nosplit
 func (i *Int32) Swap(new int32) int32 {
 	return Xchgint32(&i.value, new)
 }
@@ -43,6 +50,8 @@
 //
 // This operation wraps around in the usual
 // two's-complement way.
+//
+//go:nosplit
 func (i *Int32) Add(delta int32) int32 {
 	return Xaddint32(&i.value, delta)
 }
@@ -59,25 +68,32 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (i *Int64) Load() int64 {
 	return Loadint64(&i.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (i *Int64) Store(value int64) {
 	Storeint64(&i.value, value)
 }
 
 // CompareAndSwap atomically compares i's value with old,
 // and if they're equal, swaps i's value with new.
+// It reports whether the swap ran.
 //
-// Returns true if the operation succeeded.
+//go:nosplit
 func (i *Int64) CompareAndSwap(old, new int64) bool {
 	return Casint64(&i.value, old, new)
 }
 
 // Swap replaces i's value with new, returning
 // i's value before the replacement.
+//
+//go:nosplit
 func (i *Int64) Swap(new int64) int64 {
 	return Xchgint64(&i.value, new)
 }
@@ -87,6 +103,8 @@
 //
 // This operation wraps around in the usual
 // two's-complement way.
+//
+//go:nosplit
 func (i *Int64) Add(delta int64) int64 {
 	return Xaddint64(&i.value, delta)
 }
@@ -100,11 +118,15 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (u *Uint8) Load() uint8 {
 	return Load8(&u.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (u *Uint8) Store(value uint8) {
 	Store8(&u.value, value)
 }
@@ -114,6 +136,8 @@
 // the result into u.
 //
 // The full process is performed atomically.
+//
+//go:nosplit
 func (u *Uint8) And(value uint8) {
 	And8(&u.value, value)
 }
@@ -123,6 +147,8 @@
 // the result into u.
 //
 // The full process is performed atomically.
+//
+//go:nosplit
 func (u *Uint8) Or(value uint8) {
 	Or8(&u.value, value)
 }
@@ -136,11 +162,15 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (b *Bool) Load() bool {
 	return b.u.Load() != 0
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (b *Bool) Store(value bool) {
 	s := uint8(0)
 	if value {
@@ -158,6 +188,8 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (u *Uint32) Load() uint32 {
 	return Load(&u.value)
 }
@@ -169,11 +201,15 @@
 // on this thread can be observed to occur before it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uint32) LoadAcquire() uint32 {
 	return LoadAcq(&u.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (u *Uint32) Store(value uint32) {
 	Store(&u.value, value)
 }
@@ -185,14 +221,17 @@
 // on this thread can be observed to occur after it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uint32) StoreRelease(value uint32) {
 	StoreRel(&u.value, value)
 }
 
 // CompareAndSwap atomically compares u's value with old,
 // and if they're equal, swaps u's value with new.
+// It reports whether the swap ran.
 //
-// Returns true if the operation succeeded.
+//go:nosplit
 func (u *Uint32) CompareAndSwap(old, new uint32) bool {
 	return Cas(&u.value, old, new)
 }
@@ -202,16 +241,19 @@
 // may observe operations that occur after this operation to
 // precede it, but no operation that precedes it
 // on this thread can be observed to occur after it.
-//
-// Returns true if the operation succeeded.
+// It reports whether the swap ran.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uint32) CompareAndSwapRelease(old, new uint32) bool {
 	return CasRel(&u.value, old, new)
 }
 
 // Swap replaces u's value with new, returning
 // u's value before the replacement.
+//
+//go:nosplit
 func (u *Uint32) Swap(value uint32) uint32 {
 	return Xchg(&u.value, value)
 }
@@ -221,6 +263,8 @@
 // the result into u.
 //
 // The full process is performed atomically.
+//
+//go:nosplit
 func (u *Uint32) And(value uint32) {
 	And(&u.value, value)
 }
@@ -230,6 +274,8 @@
 // the result into u.
 //
 // The full process is performed atomically.
+//
+//go:nosplit
 func (u *Uint32) Or(value uint32) {
 	Or(&u.value, value)
 }
@@ -239,6 +285,8 @@
 //
 // This operation wraps around in the usual
 // two's-complement way.
+//
+//go:nosplit
 func (u *Uint32) Add(delta int32) uint32 {
 	return Xadd(&u.value, delta)
 }
@@ -255,25 +303,32 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (u *Uint64) Load() uint64 {
 	return Load64(&u.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (u *Uint64) Store(value uint64) {
 	Store64(&u.value, value)
 }
 
 // CompareAndSwap atomically compares u's value with old,
 // and if they're equal, swaps u's value with new.
+// It reports whether the swap ran.
 //
-// Returns true if the operation succeeded.
+//go:nosplit
 func (u *Uint64) CompareAndSwap(old, new uint64) bool {
 	return Cas64(&u.value, old, new)
 }
 
 // Swap replaces u's value with new, returning
 // u's value before the replacement.
+//
+//go:nosplit
 func (u *Uint64) Swap(value uint64) uint64 {
 	return Xchg64(&u.value, value)
 }
@@ -283,6 +338,8 @@
 //
 // This operation wraps around in the usual
 // two's-complement way.
+//
+//go:nosplit
 func (u *Uint64) Add(delta int64) uint64 {
 	return Xadd64(&u.value, delta)
 }
@@ -296,6 +353,8 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (u *Uintptr) Load() uintptr {
 	return Loaduintptr(&u.value)
 }
@@ -307,11 +366,15 @@
 // on this thread can be observed to occur before it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uintptr) LoadAcquire() uintptr {
 	return LoadAcquintptr(&u.value)
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (u *Uintptr) Store(value uintptr) {
 	Storeuintptr(&u.value, value)
 }
@@ -323,20 +386,25 @@
 // on this thread can be observed to occur after it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uintptr) StoreRelease(value uintptr) {
 	StoreReluintptr(&u.value, value)
 }
 
 // CompareAndSwap atomically compares u's value with old,
 // and if they're equal, swaps u's value with new.
+// It reports whether the swap ran.
 //
-// Returns true if the operation succeeded.
+//go:nosplit
 func (u *Uintptr) CompareAndSwap(old, new uintptr) bool {
 	return Casuintptr(&u.value, old, new)
 }
 
 // Swap replaces u's value with new, returning
 // u's value before the replacement.
+//
+//go:nosplit
 func (u *Uintptr) Swap(value uintptr) uintptr {
 	return Xchguintptr(&u.value, value)
 }
@@ -346,6 +414,8 @@
 //
 // This operation wraps around in the usual
 // two's-complement way.
+//
+//go:nosplit
 func (u *Uintptr) Add(delta uintptr) uintptr {
 	return Xadduintptr(&u.value, delta)
 }
@@ -361,12 +431,16 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (f *Float64) Load() float64 {
 	r := f.u.Load()
 	return *(*float64)(unsafe.Pointer(&r))
 }
 
 // Store updates the value atomically.
+//
+//go:nosplit
 func (f *Float64) Store(value float64) {
 	f.u.Store(*(*uint64)(unsafe.Pointer(&value)))
 }
@@ -386,6 +460,8 @@
 }
 
 // Load accesses and returns the value atomically.
+//
+//go:nosplit
 func (u *UnsafePointer) Load() unsafe.Pointer {
 	return Loadp(unsafe.Pointer(&u.value))
 }
@@ -396,24 +472,102 @@
 // perform a write barrier on value, and so this operation may
 // hide pointers from the GC. Use with care and sparingly.
 // It is safe to use with values not found in the Go heap.
+// Prefer Store instead.
+//
+//go:nosplit
 func (u *UnsafePointer) StoreNoWB(value unsafe.Pointer) {
 	StorepNoWB(unsafe.Pointer(&u.value), value)
 }
 
+// Store updates the value atomically.
+func (u *UnsafePointer) Store(value unsafe.Pointer) {
+	storePointer(&u.value, value)
+}
+
+// provided by runtime
+//go:linkname storePointer
+func storePointer(ptr *unsafe.Pointer, new unsafe.Pointer)
+
 // CompareAndSwapNoWB atomically (with respect to other methods)
 // compares u's value with old, and if they're equal,
 // swaps u's value with new.
-//
-// Returns true if the operation succeeded.
+// It reports whether the swap ran.
 //
 // WARNING: As the name implies this operation does *not*
 // perform a write barrier on value, and so this operation may
 // hide pointers from the GC. Use with care and sparingly.
 // It is safe to use with values not found in the Go heap.
+// Prefer CompareAndSwap instead.
+//
+//go:nosplit
 func (u *UnsafePointer) CompareAndSwapNoWB(old, new unsafe.Pointer) bool {
 	return Casp1(&u.value, old, new)
 }
 
+// CompareAndSwap atomically compares u's value with old,
+// and if they're equal, swaps u's value with new.
+// It reports whether the swap ran.
+func (u *UnsafePointer) CompareAndSwap(old, new unsafe.Pointer) bool {
+	return casPointer(&u.value, old, new)
+}
+
+func casPointer(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
+
+// Pointer is an atomic pointer of type *T.
+type Pointer[T any] struct {
+	u UnsafePointer
+}
+
+// Load accesses and returns the value atomically.
+//
+//go:nosplit
+func (p *Pointer[T]) Load() *T {
+	return (*T)(p.u.Load())
+}
+
+// StoreNoWB updates the value atomically.
+//
+// WARNING: As the name implies this operation does *not*
+// perform a write barrier on value, and so this operation may
+// hide pointers from the GC. Use with care and sparingly.
+// It is safe to use with values not found in the Go heap.
+// Prefer Store instead.
+//
+//go:nosplit
+func (p *Pointer[T]) StoreNoWB(value *T) {
+	p.u.StoreNoWB(unsafe.Pointer(value))
+}
+
+// Store updates the value atomically.
+//
+//go:nosplit
+func (p *Pointer[T]) Store(value *T) {
+	p.u.Store(unsafe.Pointer(value))
+}
+
+// CompareAndSwapNoWB atomically (with respect to other methods)
+// compares u's value with old, and if they're equal,
+// swaps u's value with new.
+// It reports whether the swap ran.
+//
+// WARNING: As the name implies this operation does *not*
+// perform a write barrier on value, and so this operation may
+// hide pointers from the GC. Use with care and sparingly.
+// It is safe to use with values not found in the Go heap.
+// Prefer CompareAndSwap instead.
+//
+//go:nosplit
+func (p *Pointer[T]) CompareAndSwapNoWB(old, new *T) bool {
+	return p.u.CompareAndSwapNoWB(unsafe.Pointer(old), unsafe.Pointer(new))
+}
+
+// CompareAndSwap atomically (with respect to other methods)
+// compares u's value with old, and if they're equal,
+// swaps u's value with new.
+// It reports whether the swap ran.
+func (p *Pointer[T]) CompareAndSwap(old, new *T) bool {
+	return p.u.CompareAndSwap(unsafe.Pointer(old), unsafe.Pointer(new))
+}
+
 // noCopy may be embedded into structs which must not be copied
 // after the first use.
 //
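
The generic Pointer[T] added above mirrors the public sync/atomic.Pointer[T] API (Go 1.19+). A hedged usage sketch written against the standard-library type, since runtime/internal/atomic is not importable outside the runtime:

package main

import (
	"fmt"
	"sync/atomic"
)

type config struct {
	limit int
}

func main() {
	var p atomic.Pointer[config]

	old := &config{limit: 10}
	p.Store(old)

	// Atomically install a new value only if the current one is still old.
	updated := &config{limit: 20}
	if p.CompareAndSwap(old, updated) {
		fmt.Println("swapped, limit =", p.Load().limit)
	}
}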
diff --git a/src/runtime/internal/atomic/types_64bit.go b/src/runtime/internal/atomic/types_64bit.go
index 43c1ba2..006e83b 100644
--- a/src/runtime/internal/atomic/types_64bit.go
+++ b/src/runtime/internal/atomic/types_64bit.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build amd64 || arm64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
+//go:build amd64 || arm64 || loong64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || wasm
 
 package atomic
 
@@ -13,6 +13,8 @@
 // on this thread can be observed to occur before it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uint64) LoadAcquire() uint64 {
 	return LoadAcq64(&u.value)
 }
@@ -24,6 +26,8 @@
 // on this thread can be observed to occur after it.
 //
 // WARNING: Use sparingly and with great care.
+//
+//go:nosplit
 func (u *Uint64) StoreRelease(value uint64) {
 	StoreRel64(&u.value, value)
 }
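
The 64-bit LoadAcquire/StoreRelease pair enabled here for loong64 supports the usual publish/consume pattern. A sketch of that pattern using the public sync/atomic package, whose operations are sequentially consistent and therefore at least as strong as the runtime-internal acquire/release variants:

package main

import (
	"fmt"
	"sync/atomic"
)

// The writer fills in data, then stores a flag; a reader that loads the flag
// and sees it set is guaranteed to observe the data (Go memory model).
var (
	data  int
	ready atomic.Uint64
)

func main() {
	go func() {
		data = 42      // plain write, published by the flag store below
		ready.Store(1) // release-like: the data write cannot move past this
	}()
	for ready.Load() == 0 { // acquire-like: the data read cannot move above this
	}
	fmt.Println(data) // guaranteed to print 42
}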
diff --git a/src/runtime/internal/startlinetest/func_amd64.go b/src/runtime/internal/startlinetest/func_amd64.go
new file mode 100644
index 0000000..ab7063d
--- /dev/null
+++ b/src/runtime/internal/startlinetest/func_amd64.go
@@ -0,0 +1,13 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package startlinetest contains helpers for runtime_test.TestStartLineAsm.
+package startlinetest
+
+// Defined in func_amd64.s, this is a trivial assembly function that calls
+// runtime_test.callerStartLine.
+func AsmFunc() int
+
+// Provided by runtime_test.
+var CallerStartLine func(bool) int
diff --git a/src/runtime/internal/startlinetest/func_amd64.s b/src/runtime/internal/startlinetest/func_amd64.s
new file mode 100644
index 0000000..96982be
--- /dev/null
+++ b/src/runtime/internal/startlinetest/func_amd64.s
@@ -0,0 +1,28 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "funcdata.h"
+#include "textflag.h"
+
+// Assembly function for runtime_test.TestStartLineAsm.
+//
+// Note that this file can't be built directly as part of runtime_test, as assembly
+// files can't declare an alternative package. Building it into runtime is
+// possible, but linkshared complicates things:
+//
+//  1. linkshared mode leaves the function around in the final output of
+//     non-test builds.
+//  2. Due to (1), the linker can't resolve the callerStartLine relocation
+//     (as runtime_test isn't built for non-test builds).
+//
+// Thus it is simpler to just put this in its own package, imported only by
+// runtime_test. We use ABIInternal as no ABI wrapper is generated for
+// callerStartLine since it is in a different package.
+
+TEXT	·AsmFunc<ABIInternal>(SB),NOSPLIT,$8-0
+	NO_LOCAL_POINTERS
+	MOVQ	$0, AX // wantInlined
+	MOVQ	·CallerStartLine(SB), DX
+	CALL	(DX)
+	RET
diff --git a/src/runtime/internal/sys/consts.go b/src/runtime/internal/sys/consts.go
index fffcf81..98c0f09 100644
--- a/src/runtime/internal/sys/consts.go
+++ b/src/runtime/internal/sys/consts.go
@@ -10,7 +10,9 @@
 )
 
 // AIX requires a larger stack for syscalls.
-const StackGuardMultiplier = StackGuardMultiplierDefault*(1-goos.IsAix) + 2*goos.IsAix
+// The race build also needs more stack. See issue 54291.
+// This arithmetic must match that in cmd/internal/objabi/stack.go:stackGuardMultiplier.
+const StackGuardMultiplier = 1 + goos.IsAix + isRace
 
 // DefaultPhysPageSize is the default physical page size.
 const DefaultPhysPageSize = goarch.DefaultPhysPageSize
diff --git a/src/runtime/internal/sys/consts_norace.go b/src/runtime/internal/sys/consts_norace.go
new file mode 100644
index 0000000..a9613b8
--- /dev/null
+++ b/src/runtime/internal/sys/consts_norace.go
@@ -0,0 +1,9 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !race
+
+package sys
+
+const isRace = 0
diff --git a/src/runtime/internal/sys/consts_race.go b/src/runtime/internal/sys/consts_race.go
new file mode 100644
index 0000000..f824fb3
--- /dev/null
+++ b/src/runtime/internal/sys/consts_race.go
@@ -0,0 +1,9 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build race
+
+package sys
+
+const isRace = 1
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
index 5af4901..902d893 100644
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -5,56 +5,75 @@
 //go:build !386
 
 // TODO finish intrinsifying 386, deadcode the assembly, remove build tags, merge w/ intrinsics_common
-// TODO replace all uses of CtzXX with TrailingZerosXX; they are the same.
 
 package sys
 
-// Using techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf
+// Copied from math/bits to avoid dependence.
 
-const deBruijn64ctz = 0x0218a392cd3d5dbf
-
-var deBruijnIdx64ctz = [64]byte{
-	0, 1, 2, 7, 3, 13, 8, 19,
-	4, 25, 14, 28, 9, 34, 20, 40,
-	5, 17, 26, 38, 15, 46, 29, 48,
-	10, 31, 35, 54, 21, 50, 41, 57,
-	63, 6, 12, 18, 24, 27, 33, 39,
-	16, 37, 45, 47, 30, 53, 49, 56,
-	62, 11, 23, 32, 36, 44, 52, 55,
-	61, 22, 43, 51, 60, 42, 59, 58,
+var deBruijn32tab = [32]byte{
+	0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+	31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9,
 }
 
-const deBruijn32ctz = 0x04653adf
+const deBruijn32 = 0x077CB531
 
-var deBruijnIdx32ctz = [32]byte{
-	0, 1, 2, 6, 3, 11, 7, 16,
-	4, 14, 12, 21, 8, 23, 17, 26,
-	31, 5, 10, 15, 13, 20, 22, 25,
-	30, 9, 19, 24, 29, 18, 28, 27,
+var deBruijn64tab = [64]byte{
+	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
+	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
+	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
+	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
 }
 
-// Ctz64 counts trailing (low-order) zeroes,
-// and if all are zero, then 64.
-func Ctz64(x uint64) int {
-	x &= -x                       // isolate low-order bit
-	y := x * deBruijn64ctz >> 58  // extract part of deBruijn sequence
-	i := int(deBruijnIdx64ctz[y]) // convert to bit index
-	z := int((x - 1) >> 57 & 64)  // adjustment if zero
-	return i + z
+const deBruijn64 = 0x03f79d71b4ca8b09
+
+const ntz8tab = "" +
+	"\x08\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x05\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x06\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x05\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x07\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x05\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x06\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x05\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00" +
+	"\x04\x00\x01\x00\x02\x00\x01\x00\x03\x00\x01\x00\x02\x00\x01\x00"
+
+// TrailingZeros32 returns the number of trailing zero bits in x; the result is 32 for x == 0.
+func TrailingZeros32(x uint32) int {
+	if x == 0 {
+		return 32
+	}
+	// see comment in TrailingZeros64
+	return int(deBruijn32tab[(x&-x)*deBruijn32>>(32-5)])
 }
 
-// Ctz32 counts trailing (low-order) zeroes,
-// and if all are zero, then 32.
-func Ctz32(x uint32) int {
-	x &= -x                       // isolate low-order bit
-	y := x * deBruijn32ctz >> 27  // extract part of deBruijn sequence
-	i := int(deBruijnIdx32ctz[y]) // convert to bit index
-	z := int((x - 1) >> 26 & 32)  // adjustment if zero
-	return i + z
+// TrailingZeros64 returns the number of trailing zero bits in x; the result is 64 for x == 0.
+func TrailingZeros64(x uint64) int {
+	if x == 0 {
+		return 64
+	}
+	// If popcount is fast, replace code below with return popcount(^x & (x - 1)).
+	//
+	// x & -x leaves only the right-most bit set in the word. Let k be the
+	// index of that bit. Since only a single bit is set, the value is two
+	// to the power of k. Multiplying by a power of two is equivalent to
+	// left shifting, in this case by k bits. The de Bruijn (64 bit) constant
+	// is such that all six bit, consecutive substrings are distinct.
+	// Therefore, if we have a left shifted version of this constant we can
+	// find by how many bits it was shifted by looking at which six bit
+	// substring ended up at the top of the word.
+	// (Knuth, volume 4, section 7.3.1)
+	return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)])
 }
 
-// Ctz8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
-func Ctz8(x uint8) int {
+// TrailingZeros8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
+func TrailingZeros8(x uint8) int {
 	return int(ntz8tab[x])
 }
 
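
The de Bruijn table lookup described in the TrailingZeros64 comment can be checked independently. A standalone sketch that reproduces the lookup and cross-checks it against math/bits:

package main

import (
	"fmt"
	"math/bits"
)

const deBruijn64 = 0x03f79d71b4ca8b09

var deBruijn64tab = [64]byte{
	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
}

// trailingZeros64 is the de Bruijn variant from the diff above: isolate the
// lowest set bit, multiply by the de Bruijn constant, and use the top six
// bits of the product as a table index.
func trailingZeros64(x uint64) int {
	if x == 0 {
		return 64
	}
	return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)])
}

func main() {
	// Cross-check against the standard library for a few inputs.
	for _, x := range []uint64{0, 1, 2, 96, 1 << 40, ^uint64(0)} {
		fmt.Printf("x=%#x got=%d want=%d\n", x, trailingZeros64(x), bits.TrailingZeros64(x))
	}
}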
diff --git a/src/runtime/internal/sys/intrinsics_386.s b/src/runtime/internal/sys/intrinsics_386.s
index 784b246..f33ade0 100644
--- a/src/runtime/internal/sys/intrinsics_386.s
+++ b/src/runtime/internal/sys/intrinsics_386.s
@@ -4,7 +4,7 @@
 
 #include "textflag.h"
 
-TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-12
+TEXT runtime∕internal∕sys·TrailingZeros64(SB), NOSPLIT, $0-12
 	// Try low 32 bits.
 	MOVL	x_lo+0(FP), AX
 	BSFL	AX, AX
@@ -26,7 +26,7 @@
 	MOVL	$64, ret+8(FP)
 	RET
 
-TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
+TEXT runtime∕internal∕sys·TrailingZeros32(SB), NOSPLIT, $0-8
 	MOVL	x+0(FP), AX
 	BSFL	AX, AX
 	JNZ	2(PC)
@@ -34,7 +34,7 @@
 	MOVL	AX, ret+4(FP)
 	RET
 
-TEXT runtime∕internal∕sys·Ctz8(SB), NOSPLIT, $0-8
+TEXT runtime∕internal∕sys·TrailingZeros8(SB), NOSPLIT, $0-8
 	MOVBLZX	x+0(FP), AX
 	BSFL	AX, AX
 	JNZ	2(PC)
diff --git a/src/runtime/internal/sys/intrinsics_common.go b/src/runtime/internal/sys/intrinsics_common.go
index 48d9759..1461551 100644
--- a/src/runtime/internal/sys/intrinsics_common.go
+++ b/src/runtime/internal/sys/intrinsics_common.go
@@ -6,45 +6,29 @@
 
 // Copied from math/bits to avoid dependence.
 
-var len8tab = [256]uint8{
-	0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
-	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
-	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
-	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
-}
+const len8tab = "" +
+	"\x00\x01\x02\x02\x03\x03\x03\x03\x04\x04\x04\x04\x04\x04\x04\x04" +
+	"\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05\x05" +
+	"\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06" +
+	"\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06\x06" +
+	"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" +
+	"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" +
+	"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" +
+	"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" +
+	"\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08"
 
-var ntz8tab = [256]uint8{
-	0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-}
-
-// len64 returns the minimum number of bits required to represent x; the result is 0 for x == 0.
+// Len64 returns the minimum number of bits required to represent x; the result is 0 for x == 0.
+//
+// nosplit because this is used in src/runtime/histogram.go, which may run in sensitive contexts.
+//
+//go:nosplit
 func Len64(x uint64) (n int) {
 	if x >= 1<<32 {
 		x >>= 32
@@ -98,45 +82,12 @@
 	return int(x) & (1<<7 - 1)
 }
 
-var deBruijn64tab = [64]byte{
-	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
-	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
-	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
-	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
-}
-
-const deBruijn64 = 0x03f79d71b4ca8b09
-
-// TrailingZeros64 returns the number of trailing zero bits in x; the result is 64 for x == 0.
-func TrailingZeros64(x uint64) int {
-	if x == 0 {
-		return 64
-	}
-	// If popcount is fast, replace code below with return popcount(^x & (x - 1)).
-	//
-	// x & -x leaves only the right-most bit set in the word. Let k be the
-	// index of that bit. Since only a single bit is set, the value is two
-	// to the power of k. Multiplying by a power of two is equivalent to
-	// left shifting, in this case by k bits. The de Bruijn (64 bit) constant
-	// is such that all six bit, consecutive substrings are distinct.
-	// Therefore, if we have a left shifted version of this constant we can
-	// find by how many bits it was shifted by looking at which six bit
-	// substring ended up at the top of the word.
-	// (Knuth, volume 4, section 7.3.1)
-	return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)])
-}
-
 // LeadingZeros64 returns the number of leading zero bits in x; the result is 64 for x == 0.
 func LeadingZeros64(x uint64) int { return 64 - Len64(x) }
 
 // LeadingZeros8 returns the number of leading zero bits in x; the result is 8 for x == 0.
 func LeadingZeros8(x uint8) int { return 8 - Len8(x) }
 
-// TrailingZeros8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
-func TrailingZeros8(x uint8) int {
-	return int(ntz8tab[x])
-}
-
 // Len8 returns the minimum number of bits required to represent x; the result is 0 for x == 0.
 func Len8(x uint8) int {
 	return int(len8tab[x])
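As context for the Len64/LeadingZeros64 helpers touched above, here is a standalone sketch (illustration only, not part of the change) of the same bit-length computation; the final loop stands in for the runtime's len8tab lookup:

package main

import "fmt"

// len64 mirrors the shape of the runtime's Len64: it returns the minimum
// number of bits required to represent x, and 0 for x == 0.
func len64(x uint64) (n int) {
	if x >= 1<<32 {
		x >>= 32
		n = 32
	}
	if x >= 1<<16 {
		x >>= 16
		n += 16
	}
	if x >= 1<<8 {
		x >>= 8
		n += 8
	}
	for x != 0 { // stands in for the len8tab table lookup
		x >>= 1
		n++
	}
	return n
}

func main() {
	for _, x := range []uint64{0, 1, 255, 256, 1 << 40} {
		// LeadingZeros64 is defined as 64 - Len64(x), as in the diff above.
		fmt.Printf("len64(%d) = %d, leading zeros = %d\n", x, len64(x), 64-len64(x))
	}
}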
diff --git a/src/runtime/internal/sys/intrinsics_stubs.go b/src/runtime/internal/sys/intrinsics_stubs.go
index a020652..66cfcde 100644
--- a/src/runtime/internal/sys/intrinsics_stubs.go
+++ b/src/runtime/internal/sys/intrinsics_stubs.go
@@ -6,8 +6,8 @@
 
 package sys
 
-func Ctz64(x uint64) int
-func Ctz32(x uint32) int
-func Ctz8(x uint8) int
+func TrailingZeros64(x uint64) int
+func TrailingZeros32(x uint32) int
+func TrailingZeros8(x uint8) int
 func Bswap64(x uint64) uint64
 func Bswap32(x uint32) uint32
diff --git a/src/runtime/internal/sys/intrinsics_test.go b/src/runtime/internal/sys/intrinsics_test.go
index 0444183..bf75f19 100644
--- a/src/runtime/internal/sys/intrinsics_test.go
+++ b/src/runtime/internal/sys/intrinsics_test.go
@@ -5,19 +5,19 @@
 	"testing"
 )
 
-func TestCtz64(t *testing.T) {
+func TestTrailingZeros64(t *testing.T) {
 	for i := 0; i <= 64; i++ {
 		x := uint64(5) << uint(i)
-		if got := sys.Ctz64(x); got != i {
-			t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
+		if got := sys.TrailingZeros64(x); got != i {
+			t.Errorf("TrailingZeros64(%d)=%d, want %d", x, got, i)
 		}
 	}
 }
-func TestCtz32(t *testing.T) {
+func TestTrailingZeros32(t *testing.T) {
 	for i := 0; i <= 32; i++ {
 		x := uint32(5) << uint(i)
-		if got := sys.Ctz32(x); got != i {
-			t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
+		if got := sys.TrailingZeros32(x); got != i {
+			t.Errorf("TrailingZeros32(%d)=%d, want %d", x, got, i)
 		}
 	}
 }
diff --git a/src/runtime/internal/sys/nih.go b/src/runtime/internal/sys/nih.go
new file mode 100644
index 0000000..17eab67
--- /dev/null
+++ b/src/runtime/internal/sys/nih.go
@@ -0,0 +1,41 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+// NOTE: keep in sync with cmd/compile/internal/types.CalcSize
+// to make the compiler recognize this as an intrinsic type.
+type nih struct{}
+
+// NotInHeap is a type that must never be allocated from the GC'd heap or on
+// the stack; such types are called not-in-heap.
+//
+// Other types can embed NotInHeap to make themselves not-in-heap. Specifically,
+// pointers to these types must always fail the `runtime.inheap` check. The type may
+// be used for global variables, or for objects in unmanaged memory (e.g., allocated
+// with `sysAlloc`, `persistentalloc`, `fixalloc`, or from a manually-managed span).
+//
+// Specifically:
+//
+// 1. `new(T)`, `make([]T)`, `append([]T, ...)` and implicit heap
+// allocation of T are disallowed. (Though implicit allocations are
+// disallowed in the runtime anyway.)
+//
+// 2. A pointer to a regular type (other than `unsafe.Pointer`) cannot be
+// converted to a pointer to a not-in-heap type, even if they have the
+// same underlying type.
+//
+// 3. Any type that contains a not-in-heap type is itself considered not-in-heap.
+//
+// - Structs and arrays are not-in-heap if their elements are not-in-heap.
+// - Maps and channels containing not-in-heap types are disallowed.
+//
+// 4. Write barriers on pointers to not-in-heap types can be omitted.
+//
+// The last point is the real benefit of NotInHeap. The runtime uses
+// it for low-level internal structures to avoid memory barriers in the
+// scheduler and the memory allocator where they are illegal or simply
+// inefficient. This mechanism is reasonably safe and does not compromise
+// the readability of the runtime.
+type NotInHeap struct{ _ nih }
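To illustrate the embedding pattern this file documents, a minimal sketch follows. The type names are hypothetical, and since runtime/internal/sys is only importable from within the runtime, the stand-in NotInHeap below has no compiler effect; it only mirrors the shape of real usage:

package main

type nih struct{}

// Stand-in for runtime/internal/sys.NotInHeap; outside the runtime the
// compiler does not treat it specially.
type NotInHeap struct{ _ nih }

// node is a hypothetical not-in-heap type: in the runtime, embedding
// NotInHeap means new(node) is rejected and write barriers on *node are
// omitted (rule 4 above). Such nodes would come from sysAlloc, fixalloc,
// or persistentalloc instead.
type node struct {
	_    NotInHeap
	next *node
	data uintptr
}

func main() {}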
diff --git a/src/runtime/internal/sys/zversion.go b/src/runtime/internal/sys/zversion.go
index b058a3d..184c263 100644
--- a/src/runtime/internal/sys/zversion.go
+++ b/src/runtime/internal/sys/zversion.go
@@ -1,5 +1,3 @@
 // Code generated by go tool dist; DO NOT EDIT.
 
 package sys
-
-const StackGuardMultiplierDefault = 1
diff --git a/src/runtime/internal/syscall/asm_linux_mips64x.s b/src/runtime/internal/syscall/asm_linux_mips64x.s
index 0e88a2d..6b7c524 100644
--- a/src/runtime/internal/syscall/asm_linux_mips64x.s
+++ b/src/runtime/internal/syscall/asm_linux_mips64x.s
@@ -15,6 +15,7 @@
 	MOVV	a4+32(FP), R7
 	MOVV	a5+40(FP), R8
 	MOVV	a6+48(FP), R9
+	MOVV	R0, R3	// reset R3 to 0 as 1-ret SYSCALL keeps it
 	SYSCALL
 	BEQ	R7, ok
 	MOVV	$-1, R1
diff --git a/src/runtime/internal/syscall/asm_linux_mipsx.s b/src/runtime/internal/syscall/asm_linux_mipsx.s
index 050029e..561310f 100644
--- a/src/runtime/internal/syscall/asm_linux_mipsx.s
+++ b/src/runtime/internal/syscall/asm_linux_mipsx.s
@@ -20,6 +20,7 @@
 	MOVW	a6+24(FP), R9
 	MOVW	R8, 16(R29)
 	MOVW	R9, 20(R29)
+	MOVW	R0, R3	// reset R3 to 0 as 1-ret SYSCALL keeps it
 	SYSCALL
 	BEQ	R7, ok
 	MOVW	$-1, R1
diff --git a/src/runtime/internal/syscall/asm_linux_ppc64x.s b/src/runtime/internal/syscall/asm_linux_ppc64x.s
index 8cf8737..3e985ed 100644
--- a/src/runtime/internal/syscall/asm_linux_ppc64x.s
+++ b/src/runtime/internal/syscall/asm_linux_ppc64x.s
@@ -7,22 +7,17 @@
 #include "textflag.h"
 
 // func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr)
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	MOVD	num+0(FP), R9	// syscall entry
-	MOVD	a1+8(FP), R3
-	MOVD	a2+16(FP), R4
-	MOVD	a3+24(FP), R5
-	MOVD	a4+32(FP), R6
-	MOVD	a5+40(FP), R7
-	MOVD	a6+48(FP), R8
-	SYSCALL	R9
-	MOVD	R0, r2+64(FP) // r2 is not used. Always set to 0.
-	BVC	ok
-	MOVD	$-1, R4
-	MOVD	R4, r1+56(FP)
-	MOVD	R3, errno+72(FP)
-	RET
-ok:
-	MOVD	R3, r1+56(FP)
-	MOVD	R0, errno+72(FP)
+TEXT ·Syscall6<ABIInternal>(SB),NOSPLIT,$0-80
+	MOVD	R3, R10	// Move syscall number to R10. SYSCALL will move it to R0, and restore R0.
+	MOVD	R4, R3
+	MOVD	R5, R4
+	MOVD	R6, R5
+	MOVD	R7, R6
+	MOVD	R8, R7
+	MOVD	R9, R8
+	SYSCALL	R10
+	MOVD	$-1, R6
+	ISEL	CR0SO, R3, R0, R5 // errno = (error) ? R3 : 0
+	ISEL	CR0SO, R6, R3, R3 // r1 = (error) ? -1 : r1
+	MOVD	$0, R4            // r2 is not used on linux/ppc64
 	RET
diff --git a/src/runtime/internal/syscall/asm_linux_riscv64.s b/src/runtime/internal/syscall/asm_linux_riscv64.s
index a8652fd..15e50ec 100644
--- a/src/runtime/internal/syscall/asm_linux_riscv64.s
+++ b/src/runtime/internal/syscall/asm_linux_riscv64.s
@@ -5,25 +5,39 @@
 #include "textflag.h"
 
 // func Syscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr)
-TEXT ·Syscall6(SB),NOSPLIT,$0-80
-	MOV	num+0(FP), A7	// syscall entry
-	MOV	a1+8(FP), A0
-	MOV	a2+16(FP), A1
-	MOV	a3+24(FP), A2
-	MOV	a4+32(FP), A3
-	MOV	a5+40(FP), A4
-	MOV	a6+48(FP), A5
+//
+// We need to convert to the syscall ABI.
+//
+// arg | ABIInternal | Syscall
+// ---------------------------
+// num | A0          | A7
+// a1  | A1          | A0
+// a2  | A2          | A1
+// a3  | A3          | A2
+// a4  | A4          | A3
+// a5  | A5          | A4
+// a6  | A6          | A5
+//
+// r1  | A0          | A0
+// r2  | A1          | A1
+// err | A2          | part of A0
+TEXT ·Syscall6<ABIInternal>(SB),NOSPLIT,$0-80
+	MOV	A0, A7
+	MOV	A1, A0
+	MOV	A2, A1
+	MOV	A3, A2
+	MOV	A4, A3
+	MOV	A5, A4
+	MOV	A6, A5
 	ECALL
 	MOV	$-4096, T0
 	BLTU	T0, A0, err
-	MOV	A0, r1+56(FP)
-	MOV	A1, r2+64(FP)
-	MOV	ZERO, errno+72(FP)
+	// r1 already in A0
+	// r2 already in A1
+	MOV	ZERO, A2 // errno
 	RET
 err:
-	MOV	$-1, T0
-	MOV	T0, r1+56(FP)
-	MOV	ZERO, r2+64(FP)
-	SUB	A0, ZERO, A0
-	MOV	A0, errno+72(FP)
+	SUB	A0, ZERO, A2 // errno
+	MOV	$-1, A0	     // r1
+	MOV	ZERO, A1     // r2
 	RET
diff --git a/src/runtime/internal/syscall/defs_linux.go b/src/runtime/internal/syscall/defs_linux.go
new file mode 100644
index 0000000..71f1fa1
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux.go
@@ -0,0 +1,10 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	F_SETFD    = 2
+	FD_CLOEXEC = 1
+)
diff --git a/src/runtime/internal/syscall/defs_linux_386.go b/src/runtime/internal/syscall/defs_linux_386.go
new file mode 100644
index 0000000..dc723a6
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_386.go
@@ -0,0 +1,29 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_FCNTL         = 55
+	SYS_EPOLL_CTL     = 255
+	SYS_EPOLL_PWAIT   = 319
+	SYS_EPOLL_CREATE1 = 329
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events uint32
+	Data   [8]byte // to match amd64
+}
diff --git a/src/runtime/internal/syscall/defs_linux_amd64.go b/src/runtime/internal/syscall/defs_linux_amd64.go
new file mode 100644
index 0000000..886eb5b
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_amd64.go
@@ -0,0 +1,29 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_FCNTL         = 72
+	SYS_EPOLL_CTL     = 233
+	SYS_EPOLL_PWAIT   = 281
+	SYS_EPOLL_CREATE1 = 291
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events uint32
+	Data   [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/defs_linux_arm.go b/src/runtime/internal/syscall/defs_linux_arm.go
new file mode 100644
index 0000000..8f812a2
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_arm.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_FCNTL         = 55
+	SYS_EPOLL_CTL     = 251
+	SYS_EPOLL_PWAIT   = 346
+	SYS_EPOLL_CREATE1 = 357
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events uint32
+	_pad   uint32
+	Data   [8]byte // to match amd64
+}
diff --git a/src/runtime/internal/syscall/defs_linux_arm64.go b/src/runtime/internal/syscall/defs_linux_arm64.go
new file mode 100644
index 0000000..48e11b0
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_arm64.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_EPOLL_CREATE1 = 20
+	SYS_EPOLL_CTL     = 21
+	SYS_EPOLL_PWAIT   = 22
+	SYS_FCNTL         = 25
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events uint32
+	_pad   uint32
+	Data   [8]byte // to match amd64
+}
diff --git a/src/runtime/internal/syscall/defs_linux_loong64.go b/src/runtime/internal/syscall/defs_linux_loong64.go
new file mode 100644
index 0000000..b78ef81
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_loong64.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_EPOLL_CREATE1 = 20
+	SYS_EPOLL_CTL     = 21
+	SYS_EPOLL_PWAIT   = 22
+	SYS_FCNTL         = 25
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/defs_linux_mips64x.go b/src/runtime/internal/syscall/defs_linux_mips64x.go
new file mode 100644
index 0000000..92b49ca
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_mips64x.go
@@ -0,0 +1,32 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && (mips64 || mips64le)
+
+package syscall
+
+const (
+	SYS_FCNTL         = 5070
+	SYS_EPOLL_CTL     = 5208
+	SYS_EPOLL_PWAIT   = 5272
+	SYS_EPOLL_CREATE1 = 5285
+	SYS_EPOLL_PWAIT2  = 5441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/defs_linux_mipsx.go b/src/runtime/internal/syscall/defs_linux_mipsx.go
new file mode 100644
index 0000000..e28d09c
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_mipsx.go
@@ -0,0 +1,32 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && (mips || mipsle)
+
+package syscall
+
+const (
+	SYS_FCNTL         = 4055
+	SYS_EPOLL_CTL     = 4249
+	SYS_EPOLL_PWAIT   = 4313
+	SYS_EPOLL_CREATE1 = 4326
+	SYS_EPOLL_PWAIT2  = 4441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      uint64
+}
diff --git a/src/runtime/internal/syscall/defs_linux_ppc64x.go b/src/runtime/internal/syscall/defs_linux_ppc64x.go
new file mode 100644
index 0000000..a74483e
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_ppc64x.go
@@ -0,0 +1,32 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && (ppc64 || ppc64le)
+
+package syscall
+
+const (
+	SYS_FCNTL         = 55
+	SYS_EPOLL_CTL     = 237
+	SYS_EPOLL_PWAIT   = 303
+	SYS_EPOLL_CREATE1 = 315
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/defs_linux_riscv64.go b/src/runtime/internal/syscall/defs_linux_riscv64.go
new file mode 100644
index 0000000..b78ef81
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_riscv64.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_EPOLL_CREATE1 = 20
+	SYS_EPOLL_CTL     = 21
+	SYS_EPOLL_PWAIT   = 22
+	SYS_FCNTL         = 25
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/defs_linux_s390x.go b/src/runtime/internal/syscall/defs_linux_s390x.go
new file mode 100644
index 0000000..a7bb1ba
--- /dev/null
+++ b/src/runtime/internal/syscall/defs_linux_s390x.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall
+
+const (
+	SYS_FCNTL         = 55
+	SYS_EPOLL_CTL     = 250
+	SYS_EPOLL_PWAIT   = 312
+	SYS_EPOLL_CREATE1 = 327
+	SYS_EPOLL_PWAIT2  = 441
+
+	EPOLLIN       = 0x1
+	EPOLLOUT      = 0x4
+	EPOLLERR      = 0x8
+	EPOLLHUP      = 0x10
+	EPOLLRDHUP    = 0x2000
+	EPOLLET       = 0x80000000
+	EPOLL_CLOEXEC = 0x80000
+	EPOLL_CTL_ADD = 0x1
+	EPOLL_CTL_DEL = 0x2
+	EPOLL_CTL_MOD = 0x3
+)
+
+type EpollEvent struct {
+	Events    uint32
+	pad_cgo_0 [4]byte
+	Data      [8]byte // unaligned uintptr
+}
diff --git a/src/runtime/internal/syscall/syscall_linux.go b/src/runtime/internal/syscall/syscall_linux.go
index 7f268e8..a103d31 100644
--- a/src/runtime/internal/syscall/syscall_linux.go
+++ b/src/runtime/internal/syscall/syscall_linux.go
@@ -6,7 +6,7 @@
 package syscall
 
 import (
-	_ "unsafe" // for go:linkname
+	"unsafe"
 )
 
 // TODO(https://go.dev/issue/51087): This package is incomplete and currently
@@ -37,3 +37,30 @@
 func syscall_RawSyscall6(num, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, errno uintptr) {
 	return Syscall6(num, a1, a2, a3, a4, a5, a6)
 }
+
+func EpollCreate1(flags int32) (fd int32, errno uintptr) {
+	r1, _, e := Syscall6(SYS_EPOLL_CREATE1, uintptr(flags), 0, 0, 0, 0, 0)
+	return int32(r1), e
+}
+
+var _zero uintptr
+
+func EpollWait(epfd int32, events []EpollEvent, maxev, waitms int32) (n int32, errno uintptr) {
+	var ev unsafe.Pointer
+	if len(events) > 0 {
+		ev = unsafe.Pointer(&events[0])
+	} else {
+		ev = unsafe.Pointer(&_zero)
+	}
+	r1, _, e := Syscall6(SYS_EPOLL_PWAIT, uintptr(epfd), uintptr(ev), uintptr(maxev), uintptr(waitms), 0, 0)
+	return int32(r1), e
+}
+
+func EpollCtl(epfd, op, fd int32, event *EpollEvent) (errno uintptr) {
+	_, _, e := Syscall6(SYS_EPOLL_CTL, uintptr(epfd), uintptr(op), uintptr(fd), uintptr(unsafe.Pointer(event)), 0, 0)
+	return e
+}
+
+func CloseOnExec(fd int32) {
+	Syscall6(SYS_FCNTL, uintptr(fd), F_SETFD, FD_CLOEXEC, 0, 0, 0)
+}
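A hedged sketch of how these new wrappers fit together (the sequence below only type-checks inside the runtime, since the package is internal; the descriptor and timeout values are placeholders):

// epollSketch shows the intended call sequence: create an epoll instance,
// register a descriptor, then wait for events. Errors come back as raw errno
// values rather than error interfaces, matching the wrappers above.
func epollSketch() {
	epfd, errno := EpollCreate1(EPOLL_CLOEXEC)
	if errno != 0 {
		println("epoll_create1 failed, errno =", errno)
		return
	}
	var ev EpollEvent
	ev.Events = EPOLLIN | EPOLLRDHUP | EPOLLET
	fd := int32(3) // placeholder descriptor
	if errno := EpollCtl(epfd, EPOLL_CTL_ADD, fd, &ev); errno != 0 {
		println("epoll_ctl failed, errno =", errno)
		return
	}
	var events [128]EpollEvent
	n, errno := EpollWait(epfd, events[:], int32(len(events)), 10 /* ms */)
	if errno != 0 {
		return
	}
	_ = n // events[:n] would be handled here
}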
diff --git a/src/runtime/internal/syscall/syscall_linux_test.go b/src/runtime/internal/syscall/syscall_linux_test.go
new file mode 100644
index 0000000..1976da5
--- /dev/null
+++ b/src/runtime/internal/syscall/syscall_linux_test.go
@@ -0,0 +1,19 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall_test
+
+import (
+	"runtime/internal/syscall"
+	"testing"
+)
+
+func TestEpollctlErrorSign(t *testing.T) {
+	v := syscall.EpollCtl(-1, 1, -1, &syscall.EpollEvent{})
+
+	const EBADF = 0x09
+	if v != EBADF {
+		t.Errorf("epollctl = %v, want %v", v, EBADF)
+	}
+}
diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go
index 406561a..306a8e8 100644
--- a/src/runtime/lfstack.go
+++ b/src/runtime/lfstack.go
@@ -18,8 +18,7 @@
 // This stack is intrusive. Nodes must embed lfnode as the first field.
 //
 // The stack does not keep GC-visible pointers to nodes, so the caller
-// is responsible for ensuring the nodes are not garbage collected
-// (typically by allocating them from manually-managed memory).
+// must ensure the nodes are allocated outside the Go heap.
 type lfstack uint64
 
 func (head *lfstack) push(node *lfnode) {
@@ -59,6 +58,9 @@
 // lfnodeValidate panics if node is not a valid address for use with
 // lfstack.push. This only needs to be called when node is allocated.
 func lfnodeValidate(node *lfnode) {
+	if base, _, _ := findObject(uintptr(unsafe.Pointer(node)), 0, 0); base != 0 {
+		throw("lfstack node allocated from the heap")
+	}
 	if lfstackUnpack(lfstackPack(node, ^uintptr(0))) != node {
 		printlock()
 		println("runtime: bad lfnode address", hex(uintptr(unsafe.Pointer(node))))
diff --git a/src/runtime/lfstack_64bit.go b/src/runtime/lfstack_64bit.go
index 154130c..88cbd3b 100644
--- a/src/runtime/lfstack_64bit.go
+++ b/src/runtime/lfstack_64bit.go
@@ -36,12 +36,21 @@
 	// We use one bit to distinguish between the two ranges.
 	aixAddrBits = 57
 	aixCntBits  = 64 - aixAddrBits + 3
+
+	// riscv64 SV57 mode gives 56 bits of userspace VA.
+	// lfstack code supports it, but broader support for SV57 mode is incomplete,
+	// and there may be other issues (see #54104).
+	riscv64AddrBits = 56
+	riscv64CntBits  = 64 - riscv64AddrBits + 3
 )
 
 func lfstackPack(node *lfnode, cnt uintptr) uint64 {
 	if GOARCH == "ppc64" && GOOS == "aix" {
 		return uint64(uintptr(unsafe.Pointer(node)))<<(64-aixAddrBits) | uint64(cnt&(1<<aixCntBits-1))
 	}
+	if GOARCH == "riscv64" {
+		return uint64(uintptr(unsafe.Pointer(node)))<<(64-riscv64AddrBits) | uint64(cnt&(1<<riscv64CntBits-1))
+	}
 	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
 }
 
@@ -54,5 +63,8 @@
 	if GOARCH == "ppc64" && GOOS == "aix" {
 		return (*lfnode)(unsafe.Pointer(uintptr((val >> aixCntBits << 3) | 0xa<<56)))
 	}
+	if GOARCH == "riscv64" {
+		return (*lfnode)(unsafe.Pointer(uintptr(val >> riscv64CntBits << 3)))
+	}
 	return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
 }
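A standalone sketch of the riscv64 packing arithmetic added above: 56 address bits plus 3 recovered alignment bits leave 64-56+3 = 11 bits for the ABA counter. The address below is a made-up 8-byte-aligned value, just to show the round trip:

package main

import "fmt"

const (
	riscv64AddrBits = 56
	riscv64CntBits  = 64 - riscv64AddrBits + 3 // 11 counter bits
)

// pack mirrors lfstackPack for riscv64: the 8-byte-aligned address is shifted
// into the top bits and the counter takes the low riscv64CntBits bits.
func pack(addr, cnt uint64) uint64 {
	return addr<<(64-riscv64AddrBits) | cnt&(1<<riscv64CntBits-1)
}

// unpack mirrors lfstackUnpack: drop the counter bits, then restore the three
// alignment bits that pack relied on being zero.
func unpack(val uint64) uint64 {
	return val >> riscv64CntBits << 3
}

func main() {
	addr := uint64(0x00ff_1234_5678_9ab8) // hypothetical aligned address within 56 bits
	v := pack(addr, 0x7ff)
	fmt.Printf("packed=%#x unpacked=%#x roundtrip=%v\n", v, unpack(v), unpack(v) == addr)
}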
diff --git a/src/runtime/lfstack_test.go b/src/runtime/lfstack_test.go
index d0a1b6b..e36297e 100644
--- a/src/runtime/lfstack_test.go
+++ b/src/runtime/lfstack_test.go
@@ -16,6 +16,17 @@
 	data int
 }
 
+// allocMyNode allocates nodes that are stored in an lfstack
+// outside the Go heap.
+// We require lfstack objects to live outside the heap so that
+// checkptr passes on the unsafe shenanigans used.
+func allocMyNode(data int) *MyNode {
+	n := (*MyNode)(PersistentAlloc(unsafe.Sizeof(MyNode{})))
+	LFNodeValidate(&n.LFNode)
+	n.data = data
+	return n
+}
+
 func fromMyNode(node *MyNode) *LFNode {
 	return (*LFNode)(unsafe.Pointer(node))
 }
@@ -30,22 +41,17 @@
 	stack := new(uint64)
 	global = stack // force heap allocation
 
-	// Need to keep additional references to nodes, the stack is not all that type-safe.
-	var nodes []*MyNode
-
 	// Check the stack is initially empty.
 	if LFStackPop(stack) != nil {
 		t.Fatalf("stack is not empty")
 	}
 
 	// Push one element.
-	node := &MyNode{data: 42}
-	nodes = append(nodes, node)
+	node := allocMyNode(42)
 	LFStackPush(stack, fromMyNode(node))
 
 	// Push another.
-	node = &MyNode{data: 43}
-	nodes = append(nodes, node)
+	node = allocMyNode(43)
 	LFStackPush(stack, fromMyNode(node))
 
 	// Pop one element.
@@ -75,8 +81,6 @@
 	}
 }
 
-var stress []*MyNode
-
 func TestLFStackStress(t *testing.T) {
 	const K = 100
 	P := 4 * GOMAXPROCS(-1)
@@ -86,15 +90,11 @@
 	}
 	// Create 2 stacks.
 	stacks := [2]*uint64{new(uint64), new(uint64)}
-	// Need to keep additional references to nodes,
-	// the lock-free stack is not type-safe.
-	stress = nil
 	// Push K elements randomly onto the stacks.
 	sum := 0
 	for i := 0; i < K; i++ {
 		sum += i
-		node := &MyNode{data: i}
-		stress = append(stress, node)
+		node := allocMyNode(i)
 		LFStackPush(stacks[i%2], fromMyNode(node))
 	}
 	c := make(chan bool, P)
@@ -134,7 +134,4 @@
 	if sum2 != sum {
 		t.Fatalf("Wrong sum %d/%d", sum2, sum)
 	}
-
-	// Let nodes be collected now.
-	stress = nil
 }
diff --git a/src/runtime/libfuzzer.go b/src/runtime/libfuzzer.go
index 6bfaef8..0ece035 100644
--- a/src/runtime/libfuzzer.go
+++ b/src/runtime/libfuzzer.go
@@ -20,49 +20,49 @@
 // This may result in these functions having callers that are nosplit. That is why they must be nosplit.
 //
 //go:nosplit
-func libfuzzerTraceCmp1(arg0, arg1 uint8, fakePC int) {
+func libfuzzerTraceCmp1(arg0, arg1 uint8, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_cmp1, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceCmp2(arg0, arg1 uint16, fakePC int) {
+func libfuzzerTraceCmp2(arg0, arg1 uint16, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_cmp2, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceCmp4(arg0, arg1 uint32, fakePC int) {
+func libfuzzerTraceCmp4(arg0, arg1 uint32, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_cmp4, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceCmp8(arg0, arg1 uint64, fakePC int) {
+func libfuzzerTraceCmp8(arg0, arg1 uint64, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_cmp8, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceConstCmp1(arg0, arg1 uint8, fakePC int) {
+func libfuzzerTraceConstCmp1(arg0, arg1 uint8, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_const_cmp1, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceConstCmp2(arg0, arg1 uint16, fakePC int) {
+func libfuzzerTraceConstCmp2(arg0, arg1 uint16, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_const_cmp2, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceConstCmp4(arg0, arg1 uint32, fakePC int) {
+func libfuzzerTraceConstCmp4(arg0, arg1 uint32, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_const_cmp4, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
 
 //go:nosplit
-func libfuzzerTraceConstCmp8(arg0, arg1 uint64, fakePC int) {
+func libfuzzerTraceConstCmp8(arg0, arg1 uint64, fakePC uint) {
 	fakePC = fakePC % retSledSize
 	libfuzzerCallTraceIntCmp(&__sanitizer_cov_trace_const_cmp8, uintptr(arg0), uintptr(arg1), uintptr(fakePC))
 }
@@ -148,13 +148,8 @@
 //go:cgo_import_static __sanitizer_cov_8bit_counters_init
 var __sanitizer_cov_8bit_counters_init byte
 
-//go:linkname __start___sancov_cntrs __start___sancov_cntrs
-//go:cgo_import_static __start___sancov_cntrs
-var __start___sancov_cntrs byte
-
-//go:linkname __stop___sancov_cntrs __stop___sancov_cntrs
-//go:cgo_import_static __stop___sancov_cntrs
-var __stop___sancov_cntrs byte
+// Start and stop markers of the coverage counters section, set by the linker.
+var __start___sancov_cntrs, __stop___sancov_cntrs byte
 
 //go:linkname __sanitizer_cov_pcs_init __sanitizer_cov_pcs_init
 //go:cgo_import_static __sanitizer_cov_pcs_init
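One way to read the int-to-uint change for fakePC above (a hedged interpretation, not stated in the diff): with a signed operand, Go's % can yield a negative remainder and thus a sled index outside [0, retSledSize), while an unsigned modulo cannot. A tiny standalone sketch, with retSledSize chosen here only for illustration:

package main

import "fmt"

const retSledSize = 512 // illustrative value; the real constant is runtime-internal

func main() {
	signedPC := -7
	fmt.Println(signedPC % retSledSize) // -7: would index before the sled

	unsignedPC := uint(1<<32 - 7)
	fmt.Println(unsignedPC % retSledSize) // 505: always within [0, retSledSize)
}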
diff --git a/src/runtime/libfuzzer_amd64.s b/src/runtime/libfuzzer_amd64.s
index 65ac7a3..4355369 100644
--- a/src/runtime/libfuzzer_amd64.s
+++ b/src/runtime/libfuzzer_amd64.s
@@ -52,7 +52,7 @@
 // manipulating the return address so that libfuzzer's integer compare hooks
 // work
 // libFuzzer's compare hooks obtain the caller's address from the compiler
-// builtin __builtin_return_adress. Since we invoke the hooks always
+// builtin __builtin_return_address. Since we invoke the hooks always
 // from the same native function, this builtin would always return the same
 // value. Internally, the libFuzzer hooks call through to the always inlined
 // HandleCmp and thus can't be mimicked without patching libFuzzer.
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 1578984..cc7d465 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -226,7 +226,7 @@
 }
 
 // same as runtime·notetsleep, but called on user g (not g0)
-// calls only nosplit functions between entersyscallblock/exitsyscall
+// calls only nosplit functions between entersyscallblock/exitsyscall.
 func notetsleepg(n *note, ns int64) bool {
 	gp := getg()
 	if gp == gp.m.g0 {
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index c5e8cfe..e15bbf7 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -284,7 +284,7 @@
 }
 
 // same as runtime·notetsleep, but called on user g (not g0)
-// calls only nosplit functions between entersyscallblock/exitsyscall
+// calls only nosplit functions between entersyscallblock/exitsyscall.
 func notetsleepg(n *note, ns int64) bool {
 	gp := getg()
 	if gp == gp.m.g0 {
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index bb0b189..284a61e 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -1,183 +1,120 @@
-// Copyright 2020 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file records the static ranks of the locks in the runtime. If a lock
-// is not given a rank, then it is assumed to be a leaf lock, which means no other
-// lock can be acquired while it is held. Therefore, leaf locks do not need to be
-// given an explicit rank. We list all of the architecture-independent leaf locks
-// for documentation purposes, but don't list any of the architecture-dependent
-// locks (which are all leaf locks). debugLock is ignored for ranking, since it is used
-// when printing out lock ranking errors.
-//
-// lockInit(l *mutex, rank int) is used to set the rank of lock before it is used.
-// If there is no clear place to initialize a lock, then the rank of a lock can be
-// specified during the lock call itself via lockWithrank(l *mutex, rank int).
-//
-// Besides the static lock ranking (which is a total ordering of the locks), we
-// also represent and enforce the actual partial order among the locks in the
-// arcs[] array below. That is, if it is possible that lock B can be acquired when
-// lock A is the previous acquired lock that is still held, then there should be
-// an entry for A in arcs[B][]. We will currently fail not only if the total order
-// (the lock ranking) is violated, but also if there is a missing entry in the
-// partial order.
+// Code generated by mklockrank.go; DO NOT EDIT.
 
 package runtime
 
 type lockRank int
 
-// Constants representing the lock rank of the architecture-independent locks in
-// the runtime. Locks with lower rank must be taken before locks with higher
-// rank.
+// Constants representing the ranks of all non-leaf runtime locks, in rank order.
+// Locks with lower rank must be taken before locks with higher rank,
+// in addition to satisfying the partial order in lockPartialOrder.
+// A few ranks allow self-cycles, which are specified in lockPartialOrder.
 const (
-	lockRankDummy lockRank = iota
+	lockRankUnknown lockRank = iota
 
-	// Locks held above sched
 	lockRankSysmon
 	lockRankScavenge
 	lockRankForcegc
+	lockRankDefer
 	lockRankSweepWaiters
 	lockRankAssistQueue
-	lockRankCpuprof
 	lockRankSweep
-
 	lockRankPollDesc
+	lockRankCpuprof
 	lockRankSched
-	lockRankDeadlock
 	lockRankAllg
 	lockRankAllp
-
-	lockRankTimers // Multiple timers locked simultaneously in destroy()
+	lockRankTimers
+	lockRankNetpollInit
+	lockRankHchan
+	lockRankNotifyList
+	lockRankSudog
+	lockRankRwmutexW
+	lockRankRwmutexR
+	lockRankRoot
 	lockRankItab
 	lockRankReflectOffs
-	lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs()
+	lockRankUserArenaState
+	// TRACEGLOBAL
 	lockRankTraceBuf
-	lockRankFin
-	lockRankNotifyList
 	lockRankTraceStrings
+	// MALLOC
+	lockRankFin
+	lockRankGcBitsArenas
+	lockRankMheapSpecial
 	lockRankMspanSpecial
+	lockRankSpanSetSpine
+	// MPROF
 	lockRankProfInsert
 	lockRankProfBlock
 	lockRankProfMemActive
 	lockRankProfMemFuture
-	lockRankGcBitsArenas
-	lockRankRoot
-	lockRankTrace
-	lockRankTraceStackTab
-	lockRankNetpollInit
-
-	lockRankRwmutexW
-	lockRankRwmutexR
-
-	lockRankSpanSetSpine
+	// STACKGROW
 	lockRankGscan
 	lockRankStackpool
 	lockRankStackLarge
-	lockRankDefer
-	lockRankSudog
-
-	// Memory-related non-leaf locks
+	lockRankHchanLeaf
+	// WB
 	lockRankWbufSpans
 	lockRankMheap
-	lockRankMheapSpecial
-
-	// Memory-related leaf locks
 	lockRankGlobalAlloc
-	lockRankPageAllocScav
-
-	// Other leaf locks
-	lockRankGFree
-	// Generally, hchan must be acquired before gscan. But in one specific
-	// case (in syncadjustsudogs from markroot after the g has been suspended
-	// by suspendG), we allow gscan to be acquired, and then an hchan lock. To
-	// allow this case, we get this lockRankHchanLeaf rank in
-	// syncadjustsudogs(), rather than lockRankHchan. By using this special
-	// rank, we don't allow any further locks to be acquired other than more
-	// hchan locks.
-	lockRankHchanLeaf
+	// TRACE
+	lockRankTrace
+	lockRankTraceStackTab
 	lockRankPanic
-
-	// Leaf locks with no dependencies, so these constants are not actually used anywhere.
-	// There are other architecture-dependent leaf locks as well.
-	lockRankNewmHandoff
-	lockRankDebugPtrmask
-	lockRankFaketimeState
-	lockRankTicks
-	lockRankRaceFini
-	lockRankPollCache
-	lockRankDebug
+	lockRankDeadlock
 )
 
-// lockRankLeafRank is the rank of lock that does not have a declared rank, and hence is
-// a leaf lock.
+// lockRankLeafRank is the rank of lock that does not have a declared rank,
+// and hence is a leaf lock.
 const lockRankLeafRank lockRank = 1000
 
-// lockNames gives the names associated with each of the above ranks
+// lockNames gives the names associated with each of the above ranks.
 var lockNames = []string{
-	lockRankDummy: "",
-
-	lockRankSysmon:       "sysmon",
-	lockRankScavenge:     "scavenge",
-	lockRankForcegc:      "forcegc",
-	lockRankSweepWaiters: "sweepWaiters",
-	lockRankAssistQueue:  "assistQueue",
-	lockRankCpuprof:      "cpuprof",
-	lockRankSweep:        "sweep",
-
-	lockRankPollDesc: "pollDesc",
-	lockRankSched:    "sched",
-	lockRankDeadlock: "deadlock",
-	lockRankAllg:     "allg",
-	lockRankAllp:     "allp",
-
-	lockRankTimers:      "timers",
-	lockRankItab:        "itab",
-	lockRankReflectOffs: "reflectOffs",
-
-	lockRankHchan:         "hchan",
-	lockRankTraceBuf:      "traceBuf",
-	lockRankFin:           "fin",
-	lockRankNotifyList:    "notifyList",
-	lockRankTraceStrings:  "traceStrings",
-	lockRankMspanSpecial:  "mspanSpecial",
-	lockRankProfInsert:    "profInsert",
-	lockRankProfBlock:     "profBlock",
-	lockRankProfMemActive: "profMemActive",
-	lockRankProfMemFuture: "profMemFuture",
-	lockRankGcBitsArenas:  "gcBitsArenas",
-	lockRankRoot:          "root",
-	lockRankTrace:         "trace",
-	lockRankTraceStackTab: "traceStackTab",
-	lockRankNetpollInit:   "netpollInit",
-
-	lockRankRwmutexW: "rwmutexW",
-	lockRankRwmutexR: "rwmutexR",
-
-	lockRankSpanSetSpine: "spanSetSpine",
-	lockRankGscan:        "gscan",
-	lockRankStackpool:    "stackpool",
-	lockRankStackLarge:   "stackLarge",
-	lockRankDefer:        "defer",
-	lockRankSudog:        "sudog",
-
-	lockRankWbufSpans:    "wbufSpans",
-	lockRankMheap:        "mheap",
-	lockRankMheapSpecial: "mheapSpecial",
-
-	lockRankGlobalAlloc:   "globalAlloc.mutex",
-	lockRankPageAllocScav: "pageAlloc.scav.lock",
-
-	lockRankGFree:     "gFree",
-	lockRankHchanLeaf: "hchanLeaf",
-	lockRankPanic:     "panic",
-
-	lockRankNewmHandoff:   "newmHandoff.lock",
-	lockRankDebugPtrmask:  "debugPtrmask.lock",
-	lockRankFaketimeState: "faketimeState.lock",
-	lockRankTicks:         "ticks.lock",
-	lockRankRaceFini:      "raceFiniLock",
-	lockRankPollCache:     "pollCache.lock",
-	lockRankDebug:         "debugLock",
+	lockRankSysmon:         "sysmon",
+	lockRankScavenge:       "scavenge",
+	lockRankForcegc:        "forcegc",
+	lockRankDefer:          "defer",
+	lockRankSweepWaiters:   "sweepWaiters",
+	lockRankAssistQueue:    "assistQueue",
+	lockRankSweep:          "sweep",
+	lockRankPollDesc:       "pollDesc",
+	lockRankCpuprof:        "cpuprof",
+	lockRankSched:          "sched",
+	lockRankAllg:           "allg",
+	lockRankAllp:           "allp",
+	lockRankTimers:         "timers",
+	lockRankNetpollInit:    "netpollInit",
+	lockRankHchan:          "hchan",
+	lockRankNotifyList:     "notifyList",
+	lockRankSudog:          "sudog",
+	lockRankRwmutexW:       "rwmutexW",
+	lockRankRwmutexR:       "rwmutexR",
+	lockRankRoot:           "root",
+	lockRankItab:           "itab",
+	lockRankReflectOffs:    "reflectOffs",
+	lockRankUserArenaState: "userArenaState",
+	lockRankTraceBuf:       "traceBuf",
+	lockRankTraceStrings:   "traceStrings",
+	lockRankFin:            "fin",
+	lockRankGcBitsArenas:   "gcBitsArenas",
+	lockRankMheapSpecial:   "mheapSpecial",
+	lockRankMspanSpecial:   "mspanSpecial",
+	lockRankSpanSetSpine:   "spanSetSpine",
+	lockRankProfInsert:     "profInsert",
+	lockRankProfBlock:      "profBlock",
+	lockRankProfMemActive:  "profMemActive",
+	lockRankProfMemFuture:  "profMemFuture",
+	lockRankGscan:          "gscan",
+	lockRankStackpool:      "stackpool",
+	lockRankStackLarge:     "stackLarge",
+	lockRankHchanLeaf:      "hchanLeaf",
+	lockRankWbufSpans:      "wbufSpans",
+	lockRankMheap:          "mheap",
+	lockRankGlobalAlloc:    "globalAlloc",
+	lockRankTrace:          "trace",
+	lockRankTraceStackTab:  "traceStackTab",
+	lockRankPanic:          "panic",
+	lockRankDeadlock:       "deadlock",
 }
 
 func (rank lockRank) String() string {
@@ -187,74 +124,61 @@
 	if rank == lockRankLeafRank {
 		return "LEAF"
 	}
+	if rank < 0 || int(rank) >= len(lockNames) {
+		return "BAD RANK"
+	}
 	return lockNames[rank]
 }
 
-// lockPartialOrder is a partial order among the various lock types, listing the
-// immediate ordering that has actually been observed in the runtime. Each entry
-// (which corresponds to a particular lock rank) specifies the list of locks
-// that can already be held immediately "above" it.
+// lockPartialOrder is the transitive closure of the lock rank graph.
+// An entry for rank X lists all of the ranks that can already be held
+// when rank X is acquired.
 //
-// So, for example, the lockRankSched entry shows that all the locks preceding
-// it in rank can actually be held. The allp lock shows that only the sysmon or
-// sched lock can be held immediately above it when it is acquired.
+// Lock ranks that allow self-cycles list themselves.
 var lockPartialOrder [][]lockRank = [][]lockRank{
-	lockRankDummy:         {},
-	lockRankSysmon:        {},
-	lockRankScavenge:      {lockRankSysmon},
-	lockRankForcegc:       {lockRankSysmon},
-	lockRankSweepWaiters:  {},
-	lockRankAssistQueue:   {},
-	lockRankCpuprof:       {},
-	lockRankSweep:         {},
-	lockRankPollDesc:      {},
-	lockRankSched:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
-	lockRankDeadlock:      {lockRankDeadlock},
-	lockRankAllg:          {lockRankSysmon, lockRankSched},
-	lockRankAllp:          {lockRankSysmon, lockRankSched},
-	lockRankTimers:        {lockRankSysmon, lockRankScavenge, lockRankPollDesc, lockRankSched, lockRankAllp, lockRankTimers},
-	lockRankItab:          {},
-	lockRankReflectOffs:   {lockRankItab},
-	lockRankHchan:         {lockRankScavenge, lockRankSweep, lockRankHchan},
-	lockRankTraceBuf:      {lockRankSysmon, lockRankScavenge},
-	lockRankFin:           {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf},
-	lockRankNotifyList:    {},
-	lockRankTraceStrings:  {lockRankTraceBuf},
-	lockRankMspanSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankProfInsert:    {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankProfBlock:     {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankProfMemActive: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankProfMemFuture: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings, lockRankProfMemActive},
-	lockRankGcBitsArenas:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankRoot:          {},
-	lockRankTrace:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot},
-	lockRankTraceStackTab: {lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankRoot, lockRankTrace},
-	lockRankNetpollInit:   {lockRankTimers},
-
-	lockRankRwmutexW: {},
-	lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW},
-
-	lockRankSpanSetSpine:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankGscan:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine},
-	lockRankStackpool:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan},
-	lockRankStackLarge:    {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan},
-	lockRankDefer:         {},
-	lockRankSudog:         {lockRankHchan, lockRankNotifyList},
-	lockRankWbufSpans:     {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankRoot, lockRankTrace, lockRankGscan, lockRankDefer, lockRankSudog},
-	lockRankMheap:         {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans},
-	lockRankMheapSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings},
-	lockRankGlobalAlloc:   {lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
-	lockRankPageAllocScav: {lockRankMheap},
-
-	lockRankGFree:     {lockRankSched},
-	lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
-	lockRankPanic:     {lockRankDeadlock}, // plus any other lock held on throw.
-
-	lockRankNewmHandoff:   {},
-	lockRankDebugPtrmask:  {},
-	lockRankFaketimeState: {},
-	lockRankTicks:         {},
-	lockRankRaceFini:      {},
-	lockRankPollCache:     {},
-	lockRankDebug:         {},
+	lockRankSysmon:         {},
+	lockRankScavenge:       {lockRankSysmon},
+	lockRankForcegc:        {lockRankSysmon},
+	lockRankDefer:          {},
+	lockRankSweepWaiters:   {},
+	lockRankAssistQueue:    {},
+	lockRankSweep:          {},
+	lockRankPollDesc:       {},
+	lockRankCpuprof:        {},
+	lockRankSched:          {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof},
+	lockRankAllg:           {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched},
+	lockRankAllp:           {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched},
+	lockRankTimers:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllp, lockRankTimers},
+	lockRankNetpollInit:    {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllp, lockRankTimers},
+	lockRankHchan:          {lockRankSysmon, lockRankScavenge, lockRankSweep, lockRankHchan},
+	lockRankNotifyList:     {},
+	lockRankSudog:          {lockRankSysmon, lockRankScavenge, lockRankSweep, lockRankHchan, lockRankNotifyList},
+	lockRankRwmutexW:       {},
+	lockRankRwmutexR:       {lockRankSysmon, lockRankRwmutexW},
+	lockRankRoot:           {},
+	lockRankItab:           {},
+	lockRankReflectOffs:    {lockRankItab},
+	lockRankUserArenaState: {},
+	lockRankTraceBuf:       {lockRankSysmon, lockRankScavenge},
+	lockRankTraceStrings:   {lockRankSysmon, lockRankScavenge, lockRankTraceBuf},
+	lockRankFin:            {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankGcBitsArenas:   {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankMheapSpecial:   {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankMspanSpecial:   {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankSpanSetSpine:   {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProfInsert:     {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProfBlock:      {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProfMemActive:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProfMemFuture:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankHchan, lockRankNotifyList, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankProfMemActive},
+	lockRankGscan:          {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture},
+	lockRankStackpool:      {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankRwmutexW, lockRankRwmutexR, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+	lockRankStackLarge:     {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+	lockRankHchanLeaf:      {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankHchanLeaf},
+	lockRankWbufSpans:      {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankSudog, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankMspanSpecial, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan},
+	lockRankMheap:          {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankSudog, lockRankRwmutexW, lockRankRwmutexR, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankMspanSpecial, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans},
+	lockRankGlobalAlloc:    {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankSudog, lockRankRwmutexW, lockRankRwmutexR, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankMheapSpecial, lockRankMspanSpecial, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
+	lockRankTrace:          {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankSudog, lockRankRwmutexW, lockRankRwmutexR, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankMspanSpecial, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap},
+	lockRankTraceStackTab:  {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankDefer, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankNetpollInit, lockRankHchan, lockRankNotifyList, lockRankSudog, lockRankRwmutexW, lockRankRwmutexR, lockRankRoot, lockRankItab, lockRankReflectOffs, lockRankUserArenaState, lockRankTraceBuf, lockRankTraceStrings, lockRankFin, lockRankGcBitsArenas, lockRankMspanSpecial, lockRankSpanSetSpine, lockRankProfInsert, lockRankProfBlock, lockRankProfMemActive, lockRankProfMemFuture, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankWbufSpans, lockRankMheap, lockRankTrace},
+	lockRankPanic:          {},
+	lockRankDeadlock:       {lockRankPanic, lockRankDeadlock},
 }
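A small sketch of how the generated table is meant to be read (runtime-internal shapes, so this only type-checks inside the runtime; the real enforcement lives in lockrank_on.go and also handles leaf ranks and the no-lock case):

// mayAcquire reports whether a lock of rank next may be acquired while a lock
// of rank held is the most recently acquired lock still held: the entry for
// next lists every rank allowed to be held at that point, including next
// itself for the few self-cycle ranks.
func mayAcquire(held, next lockRank) bool {
	for _, allowed := range lockPartialOrder[next] {
		if allowed == held {
			return true
		}
	}
	return false
}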
diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go
index a170569..5dcc79b 100644
--- a/src/runtime/lockrank_on.go
+++ b/src/runtime/lockrank_on.go
@@ -13,7 +13,7 @@
 
 // worldIsStopped is accessed atomically to track world-stops. 1 == world
 // stopped.
-var worldIsStopped uint32
+var worldIsStopped atomic.Uint32
 
 // lockRankStruct is embedded in mutex
 type lockRankStruct struct {
@@ -24,6 +24,9 @@
 	pad int
 }
 
+// lockInit(l *mutex, rank int) sets the rank of a lock before it is used.
+// If there is no clear place to initialize a lock, then the rank of a lock can be
+// specified during the lock call itself via lockWithRank(l *mutex, rank int).
 func lockInit(l *mutex, rank lockRank) {
 	l.rank = rank
 }
@@ -298,7 +301,7 @@
 //
 //go:nosplit
 func worldStopped() {
-	if stopped := atomic.Xadd(&worldIsStopped, 1); stopped != 1 {
+	if stopped := worldIsStopped.Add(1); stopped != 1 {
 		systemstack(func() {
 			print("world stop count=", stopped, "\n")
 			throw("recursive world stop")
@@ -314,7 +317,7 @@
 //
 //go:nosplit
 func worldStarted() {
-	if stopped := atomic.Xadd(&worldIsStopped, -1); stopped != 0 {
+	if stopped := worldIsStopped.Add(-1); stopped != 0 {
 		systemstack(func() {
 			print("world stop count=", stopped, "\n")
 			throw("released non-stopped world stop")
@@ -326,7 +329,7 @@
 //
 //go:nosplit
 func checkWorldStopped() bool {
-	stopped := atomic.Load(&worldIsStopped)
+	stopped := worldIsStopped.Load()
 	if stopped > 1 {
 		systemstack(func() {
 			print("inconsistent world stop count=", stopped, "\n")
diff --git a/src/runtime/lockrank_test.go b/src/runtime/lockrank_test.go
index 4b2fc0e..a7b1b8d 100644
--- a/src/runtime/lockrank_test.go
+++ b/src/runtime/lockrank_test.go
@@ -1,41 +1,29 @@
-// Copyright 2021 The Go Authors. All rights reserved.
+// Copyright 2022 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime_test
 
 import (
-	. "runtime"
+	"bytes"
+	"internal/testenv"
+	"os"
+	"os/exec"
 	"testing"
 )
 
-// Check that the partial order in lockPartialOrder fits within the total order
-// determined by the order of the lockRank constants.
-func TestLockRankPartialOrder(t *testing.T) {
-	for r, list := range LockPartialOrder {
-		rank := LockRank(r)
-		for _, e := range list {
-			entry := LockRank(e)
-			if entry > rank {
-				t.Errorf("lockPartialOrder row %v entry %v is inconsistent with total lock ranking order", rank, entry)
-			}
-		}
+// Test that the generated code for the lock rank graph is up-to-date.
+func TestLockRankGenerated(t *testing.T) {
+	testenv.MustHaveGoRun(t)
+	want, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "run", "mklockrank.go")).CombinedOutput()
+	if err != nil {
+		t.Fatal(err)
 	}
-}
-
-// Verify that partial order lists are kept sorted. This is a purely cosemetic
-// check to make manual reviews simpler. It does not affect correctness, unlike
-// the above test.
-func TestLockRankPartialOrderSortedEntries(t *testing.T) {
-	for r, list := range LockPartialOrder {
-		rank := LockRank(r)
-		var prev LockRank
-		for _, e := range list {
-			entry := LockRank(e)
-			if entry <= prev {
-				t.Errorf("Partial order for rank %v out of order: %v <= %v in %v", rank, entry, prev, list)
-			}
-			prev = entry
-		}
+	got, err := os.ReadFile("lockrank.go")
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !bytes.Equal(want, got) {
+		t.Fatalf("lockrank.go is out of date. Please run go generate.")
 	}
 }
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index eb24fdb..7ff2190 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -247,13 +247,15 @@
 	// memory.
 	heapArenaBytes = 1 << logHeapArenaBytes
 
+	heapArenaWords = heapArenaBytes / goarch.PtrSize
+
 	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
 	// prefer using heapArenaBytes where possible (we need the
 	// constant to compute some other constants).
 	logHeapArenaBytes = (6+20)*(_64bit*(1-goos.IsWindows)*(1-goarch.IsWasm)*(1-goos.IsIos*goarch.IsArm64)) + (2+20)*(_64bit*goos.IsWindows) + (2+20)*(1-_64bit) + (2+20)*goarch.IsWasm + (2+20)*goos.IsIos*goarch.IsArm64
 
-	// heapArenaBitmapBytes is the size of each heap arena's bitmap.
-	heapArenaBitmapBytes = heapArenaBytes / (goarch.PtrSize * 8 / 2)
+	// heapArenaBitmapWords is the size of each heap arena's bitmap in uintptrs.
+	heapArenaBitmapWords = heapArenaWords / (8 * goarch.PtrSize)
 
 	pagesPerArena = heapArenaBytes / pageSize
 
@@ -353,10 +355,10 @@
 		throw("bad TinySizeClass")
 	}
 
-	if heapArenaBitmapBytes&(heapArenaBitmapBytes-1) != 0 {
+	if heapArenaBitmapWords&(heapArenaBitmapWords-1) != 0 {
 		// heapBits expects modular arithmetic on bitmap
 		// addresses to work.
-		throw("heapArenaBitmapBytes not a power of 2")
+		throw("heapArenaBitmapWords not a power of 2")
 	}
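
For concreteness, a worked evaluation of these constants under the usual 64-bit, non-Windows, non-wasm, non-iOS case assumed here (logHeapArenaBytes = 6+20 = 26): each 64 MiB arena covers 2^23 words, so the one-bit bitmap needs 2^17 uintptrs, i.e. 1 MiB per arena, half of what the old two-bit encoding used, and trivially a power of two.

package main

import "fmt"

func main() {
	// Assumed 64-bit linux/amd64 values for the constants in the hunk above.
	const (
		ptrSize              = 8
		logHeapArenaBytes    = 26 // (6+20) on 64-bit, non-Windows, non-wasm, non-iOS
		heapArenaBytes       = 1 << logHeapArenaBytes
		heapArenaWords       = heapArenaBytes / ptrSize
		heapArenaBitmapWords = heapArenaWords / (8 * ptrSize)
	)
	fmt.Println("arena bytes:      ", heapArenaBytes)       // 67108864 (64 MiB)
	fmt.Println("arena words:      ", heapArenaWords)       // 8388608
	fmt.Println("bitmap words:     ", heapArenaBitmapWords) // 131072 (1 MiB of uintptrs)
	fmt.Println("bitmap power of 2:", heapArenaBitmapWords&(heapArenaBitmapWords-1) == 0)
}
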
 
 	// Check physPageSize.
@@ -450,6 +452,14 @@
 		//
 		// On AIX, mmap starts at 0x0A00000000000000 for 64-bit
 		// processes.
+		//
+		// Space mapped for user arenas comes immediately after the range
+		// originally reserved for the regular heap when race mode is not
+		// enabled because user arena chunks can never be used for regular heap
+		// allocations and we want to avoid fragmenting the address space.
+		//
+		// In race mode we have no choice but to just use the same hints because
+		// the race detector requires that the heap be mapped contiguously.
 		for i := 0x7f; i >= 0; i-- {
 			var p uintptr
 			switch {
@@ -475,9 +485,16 @@
 			default:
 				p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
 			}
+			// Switch to generating hints for user arenas if we've gone
+			// through about half the hints. In race mode, take only about
+			// a quarter; we don't have very much space to work with.
+			hintList := &mheap_.arenaHints
+			if (!raceenabled && i > 0x3f) || (raceenabled && i > 0x5f) {
+				hintList = &mheap_.userArena.arenaHints
+			}
 			hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
 			hint.addr = p
-			hint.next, mheap_.arenaHints = mheap_.arenaHints, hint
+			hint.next, *hintList = *hintList, hint
 		}
 	} else {
 		// On a 32-bit machine, we're much more concerned
@@ -545,6 +562,14 @@
 		hint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
 		hint.addr = p
 		hint.next, mheap_.arenaHints = mheap_.arenaHints, hint
+
+		// Place the hint for user arenas just after the large reservation.
+		//
+		// While this potentially competes with the hint above, in practice we probably
+		// aren't going to be getting this far anyway on 32-bit platforms.
+		userArenaHint := (*arenaHint)(mheap_.arenaHintAlloc.alloc())
+		userArenaHint.addr = p
+		userArenaHint.next, mheap_.userArena.arenaHints = mheap_.userArena.arenaHints, userArenaHint
 	}
 }
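
To make the hint split in the 64-bit loop above concrete, a small sketch (assuming the default, non-AIX address formula and the race detector disabled) evaluating p = i<<40 | 0x00c0<<32 for a few indices; with !raceenabled, indices above 0x3f feed mheap_.userArena.arenaHints and the rest feed the regular heap hints:

package main

import "fmt"

func main() {
	const raceenabled = false // assumption: race detector off
	for _, i := range []uint64{0x7f, 0x40, 0x3f, 0x00} {
		p := i<<40 | 0x00c0<<32 // default case of the switch above
		userArena := (!raceenabled && i > 0x3f) || (raceenabled && i > 0x5f)
		fmt.Printf("i=%#x hint=%#014x userArena=%v\n", i, p, userArena)
	}
}
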
 
@@ -554,26 +579,37 @@
 // heapArenaBytes. sysAlloc returns nil on failure.
 // There is no corresponding free function.
 //
+// hintList is a list of hint addresses for where to allocate new
+// heap arenas. It must be non-nil.
+//
+// register indicates whether the heap arena should be registered
+// in allArenas.
+//
 // sysAlloc returns a memory region in the Reserved state. This region must
 // be transitioned to Prepared and then Ready before use.
 //
 // h must be locked.
-func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
+func (h *mheap) sysAlloc(n uintptr, hintList **arenaHint, register bool) (v unsafe.Pointer, size uintptr) {
 	assertLockHeld(&h.lock)
 
 	n = alignUp(n, heapArenaBytes)
 
-	// First, try the arena pre-reservation.
-	// Newly-used mappings are considered released.
-	v = h.arena.alloc(n, heapArenaBytes, &gcController.heapReleased)
-	if v != nil {
-		size = n
-		goto mapped
+	if hintList == &h.arenaHints {
+		// First, try the arena pre-reservation.
+		// Newly-used mappings are considered released.
+		//
+		// Only do this if we're using the regular heap arena hints.
+		// This behavior is only for the heap.
+		v = h.arena.alloc(n, heapArenaBytes, &gcController.heapReleased)
+		if v != nil {
+			size = n
+			goto mapped
+		}
 	}
 
 	// Try to grow the heap at a hint address.
-	for h.arenaHints != nil {
-		hint := h.arenaHints
+	for *hintList != nil {
+		hint := *hintList
 		p := hint.addr
 		if hint.down {
 			p -= n
@@ -605,7 +641,7 @@
 		if v != nil {
 			sysFreeOS(v, n)
 		}
-		h.arenaHints = hint.next
+		*hintList = hint.next
 		h.arenaHintAlloc.free(unsafe.Pointer(hint))
 	}
 
@@ -690,26 +726,28 @@
 			}
 		}
 
-		// Add the arena to the arenas list.
-		if len(h.allArenas) == cap(h.allArenas) {
-			size := 2 * uintptr(cap(h.allArenas)) * goarch.PtrSize
-			if size == 0 {
-				size = physPageSize
+		// Register the arena in allArenas if requested.
+		if register {
+			if len(h.allArenas) == cap(h.allArenas) {
+				size := 2 * uintptr(cap(h.allArenas)) * goarch.PtrSize
+				if size == 0 {
+					size = physPageSize
+				}
+				newArray := (*notInHeap)(persistentalloc(size, goarch.PtrSize, &memstats.gcMiscSys))
+				if newArray == nil {
+					throw("out of memory allocating allArenas")
+				}
+				oldSlice := h.allArenas
+				*(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / goarch.PtrSize)}
+				copy(h.allArenas, oldSlice)
+				// Do not free the old backing array because
+				// there may be concurrent readers. Since we
+				// double the array each time, this can lead
+				// to at most 2x waste.
 			}
-			newArray := (*notInHeap)(persistentalloc(size, goarch.PtrSize, &memstats.gcMiscSys))
-			if newArray == nil {
-				throw("out of memory allocating allArenas")
-			}
-			oldSlice := h.allArenas
-			*(*notInHeapSlice)(unsafe.Pointer(&h.allArenas)) = notInHeapSlice{newArray, len(h.allArenas), int(size / goarch.PtrSize)}
-			copy(h.allArenas, oldSlice)
-			// Do not free the old backing array because
-			// there may be concurrent readers. Since we
-			// double the array each time, this can lead
-			// to at most 2x waste.
+			h.allArenas = h.allArenas[:len(h.allArenas)+1]
+			h.allArenas[len(h.allArenas)-1] = ri
 		}
-		h.allArenas = h.allArenas[:len(h.allArenas)+1]
-		h.allArenas[len(h.allArenas)-1] = ri
 
 		// Store atomically just in case an object from the
 		// new heap arena becomes visible before the heap lock
@@ -740,8 +778,6 @@
 	case p == 0:
 		return nil, 0
 	case p&(align-1) == 0:
-		// We got lucky and got an aligned region, so we can
-		// use the whole thing.
 		return unsafe.Pointer(p), size + align
 	case GOOS == "windows":
 		// On Windows we can't release pieces of a
@@ -780,7 +816,7 @@
 // nextFreeFast returns the next free object if one is quickly available.
 // Otherwise it returns 0.
 func nextFreeFast(s *mspan) gclinkptr {
-	theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
+	theBit := sys.TrailingZeros64(s.allocCache) // Is there a free object in the allocCache?
 	if theBit < 64 {
 		result := s.freeindex + uintptr(theBit)
 		if result < s.nelems {
@@ -847,6 +883,11 @@
 	if size == 0 {
 		return unsafe.Pointer(&zerobase)
 	}
+
+	// It's possible for any malloc to trigger sweeping, which may in
+	// turn queue finalizers. Record this dynamic lock edge.
+	lockRankMayQueueFinalizer()
+
 	userSize := size
 	if asanenabled {
 		// Refer to ASAN runtime library, the malloc() function allocates extra memory,
@@ -888,24 +929,7 @@
 
 	// assistG is the G to charge for this allocation, or nil if
 	// GC is not currently active.
-	var assistG *g
-	if gcBlackenEnabled != 0 {
-		// Charge the current user G for this allocation.
-		assistG = getg()
-		if assistG.m.curg != nil {
-			assistG = assistG.m.curg
-		}
-		// Charge the allocation against the G. We'll account
-		// for internal fragmentation at the end of mallocgc.
-		assistG.gcAssistBytes -= int64(size)
-
-		if assistG.gcAssistBytes < 0 {
-			// This G is in debt. Assist the GC to correct
-			// this before allocating. This must happen
-			// before disabling preemption.
-			gcAssistAlloc(assistG)
-		}
-	}
+	assistG := deductAssistCredit(size)
 
 	// Set mp.mallocing to keep from being preempted by GC.
 	mp := acquirem()
@@ -1019,7 +1043,7 @@
 			}
 			x = unsafe.Pointer(v)
 			if needzero && span.needzero != 0 {
-				memclrNoHeapPointers(unsafe.Pointer(v), size)
+				memclrNoHeapPointers(x, size)
 			}
 		}
 	} else {
@@ -1045,8 +1069,8 @@
 		}
 	}
 
-	var scanSize uintptr
 	if !noscan {
+		var scanSize uintptr
 		heapBitsSetType(uintptr(x), size, dataSize, typ)
 		if dataSize > typ.size {
 			// Array allocation. If there are any
@@ -1068,13 +1092,23 @@
 	// the garbage collector could follow a pointer to x,
 	// but see uninitialized memory or stale heap bits.
 	publicationBarrier()
+	// As x and the heap bits are initialized, update
+	// freeIndexForScan now so x is seen by the GC
+	// (including conservative scan) as an allocated object.
+	// While this pointer can't escape into user code as a
+	// _live_ pointer until we return, conservative scanning
+	// may find a dead pointer that happens to point into this
+	// object. Delaying this update until now ensures that
+	// conservative scanning considers this pointer dead until
+	// this point.
+	span.freeIndexForScan = span.freeindex
 
 	// Allocate black during GC.
 	// All slots hold nil so no scanning is needed.
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
 	if gcphase != _GCoff {
-		gcmarknewobject(span, uintptr(x), size, scanSize)
+		gcmarknewobject(span, uintptr(x), size)
 	}
 
 	if raceenabled {
@@ -1158,6 +1192,34 @@
 	return x
 }
 
+// deductAssistCredit reduces the current G's assist credit
+// by size bytes, and assists the GC if necessary.
+//
+// Caller must be preemptible.
+//
+// Returns the G for which the assist credit was accounted.
+func deductAssistCredit(size uintptr) *g {
+	var assistG *g
+	if gcBlackenEnabled != 0 {
+		// Charge the current user G for this allocation.
+		assistG = getg()
+		if assistG.m.curg != nil {
+			assistG = assistG.m.curg
+		}
+		// Charge the allocation against the G. We'll account
+		// for internal fragmentation at the end of mallocgc.
+		assistG.gcAssistBytes -= int64(size)
+
+		if assistG.gcAssistBytes < 0 {
+			// This G is in debt. Assist the GC to correct
+			// this before allocating. This must happen
+			// before disabling preemption.
+			gcAssistAlloc(assistG)
+		}
+	}
+	return assistG
+}
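
deductAssistCredit only factors out logic that mallocgc already contained: each allocation spends assist credit, and a G that goes into debt must do GC assist work before its allocation proceeds. A rough standalone analog of that budgeting (names and numbers here are invented for illustration, not the runtime's implementation):

package main

import "fmt"

// assistBudget is a toy stand-in for g.gcAssistBytes: allocations spend
// credit, and a negative balance forces the allocator to do assist work
// before continuing.
type assistBudget struct {
	credit int64
}

// deduct charges size bytes against the budget and, if the budget goes
// negative, performs (simulated) assist work to earn credit back.
func (b *assistBudget) deduct(size int64) {
	b.credit -= size
	if b.credit < 0 {
		// In the runtime this is gcAssistAlloc: scan objects until the
		// debt is repaid. Here we just refill the budget.
		earned := -b.credit + 1024
		fmt.Printf("in debt by %d bytes, assisting and earning %d\n", -b.credit, earned)
		b.credit += earned
	}
}

func main() {
	b := &assistBudget{credit: 4096}
	for _, size := range []int64{1024, 2048, 4096} {
		b.deduct(size)
		fmt.Println("credit now:", b.credit)
	}
}
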
+
 // memclrNoHeapPointersChunked repeatedly calls memclrNoHeapPointers
 // on chunks of the buffer to be zeroed, with opportunities for preemption
 // along the way.  memclrNoHeapPointers contains no safepoints and also
@@ -1187,7 +1249,7 @@
 
 // implementation of new builtin
 // compiler (both frontend and SSA backend) knows the signature
-// of this function
+// of this function.
 func newobject(typ *_type) unsafe.Pointer {
 	return mallocgc(typ.size, typ, true)
 }
@@ -1245,7 +1307,7 @@
 	}
 	if GOOS == "plan9" {
 		// Plan 9 doesn't support floating point in note handler.
-		if g := getg(); g == g.m.gsignal {
+		if gp := getg(); gp == gp.m.gsignal {
 			return nextSampleNoFP()
 		}
 	}
@@ -1323,7 +1385,8 @@
 // The returned memory will be zeroed.
 // sysStat must be non-nil.
 //
-// Consider marking persistentalloc'd types go:notinheap.
+// Consider marking persistentalloc'd types not in heap by embedding
+// runtime/internal/sys.NotInHeap.
 func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
 	var p *notInHeap
 	systemstack(func() {
@@ -1464,14 +1527,12 @@
 // notInHeap is off-heap memory allocated by a lower-level allocator
 // like sysAlloc or persistentAlloc.
 //
-// In general, it's better to use real types marked as go:notinheap,
-// but this serves as a generic type for situations where that isn't
-// possible (like in the allocators).
+// In general, it's better to use real types which embed
+// runtime/internal/sys.NotInHeap, but this serves as a generic type
+// for situations where that isn't possible (like in the allocators).
 //
 // TODO: Use this as the return type of sysAlloc, persistentAlloc, etc?
-//
-//go:notinheap
-type notInHeap struct{}
+type notInHeap struct{ _ sys.NotInHeap }
 
 func (p *notInHeap) add(bytes uintptr) *notInHeap {
 	return (*notInHeap)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) + bytes))
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index cc20076..5b9ce98 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -294,7 +294,11 @@
 	for i := 0; i < 5; i++ {
 		// Reserve memory at the next hint so it can't be used
 		// for the heap.
-		start, end := MapNextArenaHint()
+		start, end, ok := MapNextArenaHint()
+		if !ok {
+			t.Skipf("failed to reserve memory at next arena hint [%#x, %#x)", start, end)
+		}
+		t.Logf("reserved [%#x, %#x)", start, end)
 		disallowed = append(disallowed, [2]uintptr{start, end})
 		// Allocate until the runtime tries to use the hint we
 		// just mapped over.
@@ -314,46 +318,36 @@
 	}
 }
 
-var mallocSink uintptr
-
 func BenchmarkMalloc8(b *testing.B) {
-	var x uintptr
 	for i := 0; i < b.N; i++ {
 		p := new(int64)
-		x ^= uintptr(unsafe.Pointer(p))
+		Escape(p)
 	}
-	mallocSink = x
 }
 
 func BenchmarkMalloc16(b *testing.B) {
-	var x uintptr
 	for i := 0; i < b.N; i++ {
 		p := new([2]int64)
-		x ^= uintptr(unsafe.Pointer(p))
+		Escape(p)
 	}
-	mallocSink = x
 }
 
 func BenchmarkMallocTypeInfo8(b *testing.B) {
-	var x uintptr
 	for i := 0; i < b.N; i++ {
 		p := new(struct {
 			p [8 / unsafe.Sizeof(uintptr(0))]*int
 		})
-		x ^= uintptr(unsafe.Pointer(p))
+		Escape(p)
 	}
-	mallocSink = x
 }
 
 func BenchmarkMallocTypeInfo16(b *testing.B) {
-	var x uintptr
 	for i := 0; i < b.N; i++ {
 		p := new(struct {
 			p [16 / unsafe.Sizeof(uintptr(0))]*int
 		})
-		x ^= uintptr(unsafe.Pointer(p))
+		Escape(p)
 	}
-	mallocSink = x
 }
 
 type LargeStruct struct {
@@ -361,12 +355,10 @@
 }
 
 func BenchmarkMallocLargeStruct(b *testing.B) {
-	var x uintptr
 	for i := 0; i < b.N; i++ {
 		p := make([]LargeStruct, 2)
-		x ^= uintptr(unsafe.Pointer(&p[0]))
+		Escape(p)
 	}
-	mallocSink = x
 }
 
 var n = flag.Int("n", 1000, "number of goroutines")
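
The benchmark rewrites above replace the package-level XOR sink with the runtime's Escape test helper, which forces each allocation to escape so the compiler cannot optimize the loop body away. Outside the runtime's export_test machinery, a similar effect is commonly obtained with a //go:noinline sink function; a sketch (the package and helper names below are made up):

package bench

import "testing"

var sink any

// escape is a hypothetical stand-in for the runtime's Escape test helper:
// storing the value in a package-level variable inside a noinline function
// keeps the allocation observable to the compiler.
//
//go:noinline
func escape[T any](x T) T {
	sink = x
	return x
}

func BenchmarkAllocInt64(b *testing.B) {
	for i := 0; i < b.N; i++ {
		p := new(int64)
		escape(p)
	}
}
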
diff --git a/src/runtime/map.go b/src/runtime/map.go
index 65be472..f546ce8 100644
--- a/src/runtime/map.go
+++ b/src/runtime/map.go
@@ -514,7 +514,7 @@
 	return unsafe.Pointer(&zeroVal[0]), false
 }
 
-// returns both key and elem. Used by map iterator
+// returns both key and elem. Used by map iterator.
 func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe.Pointer) {
 	if h == nil || h.count == 0 {
 		return nil, nil
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index c3b4541..46ef42f 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -196,7 +196,7 @@
 	reflect_typedmemmove(typ, dst, src)
 }
 
-// typedmemmovepartial is like typedmemmove but assumes that
+// reflect_typedmemmovepartial is like typedmemmove but assumes that
 // dst and src point off bytes into the value and only copies size bytes.
 // off must be a multiple of goarch.PtrSize.
 //
@@ -311,6 +311,8 @@
 // If the caller knows that typ has pointers, it can alternatively
 // call memclrHasPointers.
 //
+// TODO: A "go:nosplitrec" annotation would be perfect for this.
+//
 //go:nosplit
 func typedmemclr(typ *_type, ptr unsafe.Pointer) {
 	if writeBarrier.needed && typ.ptrdata != 0 {
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index a3a6590..088b566 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -14,34 +14,28 @@
 //
 // Heap bitmap
 //
-// The heap bitmap comprises 2 bits for each pointer-sized word in the heap,
-// stored in the heapArena metadata backing each heap arena.
-// That is, if ha is the heapArena for the arena starting a start,
-// then ha.bitmap[0] holds the 2-bit entries for the four words start
-// through start+3*ptrSize, ha.bitmap[1] holds the entries for
-// start+4*ptrSize through start+7*ptrSize, and so on.
+// The heap bitmap comprises 1 bit for each pointer-sized word in the heap,
+// recording whether a pointer is stored in that word or not. This bitmap
+// is stored in the heapArena metadata backing each heap arena.
+// That is, if ha is the heapArena for the arena starting at "start",
+// then ha.bitmap[0] holds the 64 bits for the 64 words "start"
+// through start+63*ptrSize, ha.bitmap[1] holds the entries for
+// start+64*ptrSize through start+127*ptrSize, and so on.
+// Bits correspond to words in little-endian order. ha.bitmap[0]&1 represents
+// the word at "start", ha.bitmap[0]>>1&1 represents the word at start+8, etc.
+// (For 32-bit platforms, s/64/32/.)
 //
-// In each 2-bit entry, the lower bit is a pointer/scalar bit, just
-// like in the stack/data bitmaps described above. The upper bit
-// indicates scan/dead: a "1" value ("scan") indicates that there may
-// be pointers in later words of the allocation, and a "0" value
-// ("dead") indicates there are no more pointers in the allocation. If
-// the upper bit is 0, the lower bit must also be 0, and this
-// indicates scanning can ignore the rest of the allocation.
+// We also keep a noMorePtrs bitmap which allows us to stop scanning
+// the heap bitmap early in certain situations. If ha.noMorePtrs[i]>>j&1
+// is 1, then the object containing the last word described by ha.bitmap[8*i+j]
+// has no more pointers beyond those described by ha.bitmap[8*i+j].
+// If ha.noMorePtrs[i]>>j&1 is set, the entries in ha.bitmap[8*i+j+1] and
+// beyond must all be zero until the start of the next object.
 //
-// The 2-bit entries are split when written into the byte, so that the top half
-// of the byte contains 4 high (scan) bits and the bottom half contains 4 low
-// (pointer) bits. This form allows a copy from the 1-bit to the 4-bit form to
-// keep the pointer bits contiguous, instead of having to space them out.
+// The bitmap for noscan spans is set to all zero at span allocation time.
 //
-// The code makes use of the fact that the zero value for a heap
-// bitmap means scalar/dead. This property must be preserved when
-// modifying the encoding.
-//
-// The bitmap for noscan spans is not maintained. Code must ensure
-// that an object is scannable before consulting its bitmap by
-// checking either the noscan bit in the span or by consulting its
-// type's information.
+// The bitmap for unallocated objects in scannable spans is not maintained
+// (can be junk).
 
 package runtime
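
A worked example of the indexing this layout implies, matching the arithmetic in heapBitsForAddr later in this file, with assumed 64-bit constants (heapArenaWords = 1<<23, ptrBits = 64) and an arbitrary arena-aligned base address:

package main

import "fmt"

func main() {
	const (
		ptrSize        = 8
		ptrBits        = 8 * ptrSize
		heapArenaWords = 1 << 23 // 64 MiB arena / 8-byte words (assumed 64-bit layout)
	)
	// Pretend the arena starts at 0xc000000000 and we ask about start+0x108.
	const arenaStart = uintptr(0xc000000000)
	addr := arenaStart + 0x108

	word := addr / ptrSize % heapArenaWords // word index within the arena
	idx := word / ptrBits                   // index into ha.bitmap
	off := word % ptrBits                   // bit offset within ha.bitmap[idx]
	fmt.Printf("word=%d bitmap index=%d bit=%d\n", word, idx, off)
	// The pointer/scalar bit for addr is ha.bitmap[idx]>>off&1, and
	// ha.noMorePtrs[idx/8]>>(idx%8)&1 says whether scanning can stop
	// after this bitmap word.
}
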
 
@@ -52,18 +46,6 @@
 	"unsafe"
 )
 
-const (
-	bitPointer = 1 << 0
-	bitScan    = 1 << 4
-
-	heapBitsShift      = 1     // shift offset between successive bitPointer or bitScan entries
-	wordsPerBitmapByte = 8 / 2 // heap words described by one bitmap byte
-
-	// all scan/pointer bits in a byte
-	bitScanAll    = bitScan | bitScan<<heapBitsShift | bitScan<<(2*heapBitsShift) | bitScan<<(3*heapBitsShift)
-	bitPointerAll = bitPointer | bitPointer<<heapBitsShift | bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
-)
-
 // addb returns the byte pointer p+n.
 //
 //go:nowritebarrier
@@ -110,21 +92,6 @@
 	return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
 }
 
-// heapBits provides access to the bitmap bits for a single heap word.
-// The methods on heapBits take value receivers so that the compiler
-// can more easily inline calls to those methods and registerize the
-// struct fields independently.
-type heapBits struct {
-	bitp  *uint8
-	shift uint32
-	arena uint32 // Index of heap arena containing bitp
-	last  *uint8 // Last byte arena's bitmap
-}
-
-// Make the compiler check that heapBits.arena is large enough to hold
-// the maximum arena frame number.
-var _ = heapBits{arena: (1<<heapAddrBits)/heapArenaBytes - 1}
-
 // markBits provides access to the mark bit for an object in the heap.
 // bytep points to the byte holding the mark bit.
 // mask is a byte with a single bit set that can be &ed with *bytep
@@ -180,7 +147,7 @@
 
 	aCache := s.allocCache
 
-	bitIndex := sys.Ctz64(aCache)
+	bitIndex := sys.TrailingZeros64(aCache)
 	for bitIndex == 64 {
 		// Move index to start of next cached bits.
 		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
@@ -192,7 +159,7 @@
 		// Refill s.allocCache with the next 64 alloc bits.
 		s.refillAllocCache(whichByte)
 		aCache = s.allocCache
-		bitIndex = sys.Ctz64(aCache)
+		bitIndex = sys.TrailingZeros64(aCache)
 		// nothing available in cached bits
 		// grab the next 8 bytes and try again.
 	}
@@ -224,7 +191,7 @@
 // been no preemption points since ensuring this (which could allow a
 // GC transition, which would allow the state to change).
 func (s *mspan) isFree(index uintptr) bool {
-	if index < s.freeindex {
+	if index < s.freeIndexForScan {
 		return false
 	}
 	bytep, mask := s.allocBits.bitp(index)
@@ -264,7 +231,7 @@
 }
 
 func (s *mspan) markBitsForBase() markBits {
-	return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
+	return markBits{&s.gcmarkBits.x, uint8(1), 0}
 }
 
 // isMarked reports whether mark bit m is set.
@@ -313,32 +280,6 @@
 	m.index++
 }
 
-// heapBitsForAddr returns the heapBits for the address addr.
-// The caller must ensure addr is in an allocated span.
-// In particular, be careful not to point past the end of an object.
-//
-// nosplit because it is used during write barriers and must not be preempted.
-//
-//go:nosplit
-func heapBitsForAddr(addr uintptr) (h heapBits) {
-	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
-	arena := arenaIndex(addr)
-	ha := mheap_.arenas[arena.l1()][arena.l2()]
-	// The compiler uses a load for nil checking ha, but in this
-	// case we'll almost never hit that cache line again, so it
-	// makes more sense to do a value check.
-	if ha == nil {
-		// addr is not in the heap. Return nil heapBits, which
-		// we expect to crash in the caller.
-		return
-	}
-	h.bitp = &ha.bitmap[(addr/(goarch.PtrSize*4))%heapArenaBitmapBytes]
-	h.shift = uint32((addr / goarch.PtrSize) & 3)
-	h.arena = uint32(arena)
-	h.last = &ha.bitmap[len(ha.bitmap)-1]
-	return
-}
-
 // clobberdeadPtr is a special value that is used by the compiler to
 // clobber dead stack slots, when -clobberdead flag is set.
 const clobberdeadPtr = uintptr(0xdeaddead | 0xdeaddead<<((^uintptr(0)>>63)*32))
@@ -423,7 +364,7 @@
 	return
 }
 
-// verifyNotInHeapPtr reports whether converting the not-in-heap pointer into a unsafe.Pointer is ok.
+// reflect_verifyNotInHeapPtr reports whether converting the not-in-heap pointer into a unsafe.Pointer is ok.
 //
 //go:linkname reflect_verifyNotInHeapPtr reflect.verifyNotInHeapPtr
 func reflect_verifyNotInHeapPtr(p uintptr) bool {
@@ -433,121 +374,134 @@
 	return spanOf(p) == nil && p != clobberdeadPtr
 }
 
-// next returns the heapBits describing the next pointer-sized word in memory.
-// That is, if h describes address p, h.next() describes p+ptrSize.
+const ptrBits = 8 * goarch.PtrSize
+
+// heapBits provides access to the bitmap bits for a single heap word.
+// The methods on heapBits take value receivers so that the compiler
+// can more easily inline calls to those methods and registerize the
+// struct fields independently.
+type heapBits struct {
+	// heapBits will report on pointers in the range [addr,addr+size).
+	// The low bit of mask contains the pointerness of the word at addr
+	// (assuming valid>0).
+	addr, size uintptr
+
+	// The next few pointer bits representing words starting at addr.
+	// Those bits already returned by next() are zeroed.
+	mask uintptr
+	// Number of bits in mask that are valid. mask is always less than 1<<valid.
+	valid uintptr
+}
+
+// heapBitsForAddr returns the heapBits for the address addr.
+// The caller must ensure [addr,addr+size) is in an allocated span.
+// In particular, be careful not to point past the end of an object.
+//
+// nosplit because it is used during write barriers and must not be preempted.
+//
+//go:nosplit
+func heapBitsForAddr(addr, size uintptr) heapBits {
+	// Find arena
+	ai := arenaIndex(addr)
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+
+	// Word index in arena.
+	word := addr / goarch.PtrSize % heapArenaWords
+
+	// Word index and bit offset in bitmap array.
+	idx := word / ptrBits
+	off := word % ptrBits
+
+	// Grab relevant bits of bitmap.
+	mask := ha.bitmap[idx] >> off
+	valid := ptrBits - off
+
+	// Process depending on where the object ends.
+	nptr := size / goarch.PtrSize
+	if nptr < valid {
+		// Bits for this object end before the end of this bitmap word.
+		// Squash bits for the following objects.
+		mask &= 1<<(nptr&(ptrBits-1)) - 1
+		valid = nptr
+	} else if nptr == valid {
+		// Bits for this object end at exactly the end of this bitmap word.
+		// All good.
+	} else {
+		// Bits for this object extend into the next bitmap word. See if there
+		// may be any pointers recorded there.
+		if uintptr(ha.noMorePtrs[idx/8])>>(idx%8)&1 != 0 {
+			// No more pointers in this object after this bitmap word.
+			// Update size so we know not to look there.
+			size = valid * goarch.PtrSize
+		}
+	}
+
+	return heapBits{addr: addr, size: size, mask: mask, valid: valid}
+}
+
+// Returns the (absolute) address of the next known pointer and
+// a heapBits iterator representing any remaining pointers.
+// If there are no more pointers, returns address 0.
 // Note that next does not modify h. The caller must record the result.
 //
 // nosplit because it is used during write barriers and must not be preempted.
 //
 //go:nosplit
-func (h heapBits) next() heapBits {
-	if h.shift < 3*heapBitsShift {
-		h.shift += heapBitsShift
-	} else if h.bitp != h.last {
-		h.bitp, h.shift = add1(h.bitp), 0
+func (h heapBits) next() (heapBits, uintptr) {
+	for {
+		if h.mask != 0 {
+			var i int
+			if goarch.PtrSize == 8 {
+				i = sys.TrailingZeros64(uint64(h.mask))
+			} else {
+				i = sys.TrailingZeros32(uint32(h.mask))
+			}
+			h.mask ^= uintptr(1) << (i & (ptrBits - 1))
+			return h, h.addr + uintptr(i)*goarch.PtrSize
+		}
+
+		// Skip words that we've already processed.
+		h.addr += h.valid * goarch.PtrSize
+		h.size -= h.valid * goarch.PtrSize
+		if h.size == 0 {
+			return h, 0 // no more pointers
+		}
+
+		// Grab more bits and try again.
+		h = heapBitsForAddr(h.addr, h.size)
+	}
+}
+
+// nextFast is like next, but can return 0 even when there are more pointers
+// to be found. Callers should call next if nextFast returns 0 as its second
+// return value.
+//
+//	if h, addr = h.nextFast(); addr == 0 {
+//	    if h, addr = h.next(); addr == 0 {
+//	        ... no more pointers ...
+//	    }
+//	}
+//	... process pointer at addr ...
+//
+// nextFast is designed to be inlineable.
+//
+//go:nosplit
+func (h heapBits) nextFast() (heapBits, uintptr) {
+	// TESTQ/JEQ
+	if h.mask == 0 {
+		return h, 0
+	}
+	// BSFQ
+	var i int
+	if goarch.PtrSize == 8 {
+		i = sys.TrailingZeros64(uint64(h.mask))
 	} else {
-		// Move to the next arena.
-		return h.nextArena()
+		i = sys.TrailingZeros32(uint32(h.mask))
 	}
-	return h
-}
-
-// nextArena advances h to the beginning of the next heap arena.
-//
-// This is a slow-path helper to next. gc's inliner knows that
-// heapBits.next can be inlined even though it calls this. This is
-// marked noinline so it doesn't get inlined into next and cause next
-// to be too big to inline.
-//
-//go:nosplit
-//go:noinline
-func (h heapBits) nextArena() heapBits {
-	h.arena++
-	ai := arenaIdx(h.arena)
-	l2 := mheap_.arenas[ai.l1()]
-	if l2 == nil {
-		// We just passed the end of the object, which
-		// was also the end of the heap. Poison h. It
-		// should never be dereferenced at this point.
-		return heapBits{}
-	}
-	ha := l2[ai.l2()]
-	if ha == nil {
-		return heapBits{}
-	}
-	h.bitp, h.shift = &ha.bitmap[0], 0
-	h.last = &ha.bitmap[len(ha.bitmap)-1]
-	return h
-}
-
-// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
-// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
-// h.forward(1) is equivalent to h.next(), just slower.
-// Note that forward does not modify h. The caller must record the result.
-// bits returns the heap bits for the current word.
-//
-//go:nosplit
-func (h heapBits) forward(n uintptr) heapBits {
-	n += uintptr(h.shift) / heapBitsShift
-	nbitp := uintptr(unsafe.Pointer(h.bitp)) + n/4
-	h.shift = uint32(n%4) * heapBitsShift
-	if nbitp <= uintptr(unsafe.Pointer(h.last)) {
-		h.bitp = (*uint8)(unsafe.Pointer(nbitp))
-		return h
-	}
-
-	// We're in a new heap arena.
-	past := nbitp - (uintptr(unsafe.Pointer(h.last)) + 1)
-	h.arena += 1 + uint32(past/heapArenaBitmapBytes)
-	ai := arenaIdx(h.arena)
-	if l2 := mheap_.arenas[ai.l1()]; l2 != nil && l2[ai.l2()] != nil {
-		a := l2[ai.l2()]
-		h.bitp = &a.bitmap[past%heapArenaBitmapBytes]
-		h.last = &a.bitmap[len(a.bitmap)-1]
-	} else {
-		h.bitp, h.last = nil, nil
-	}
-	return h
-}
-
-// forwardOrBoundary is like forward, but stops at boundaries between
-// contiguous sections of the bitmap. It returns the number of words
-// advanced over, which will be <= n.
-func (h heapBits) forwardOrBoundary(n uintptr) (heapBits, uintptr) {
-	maxn := 4 * ((uintptr(unsafe.Pointer(h.last)) + 1) - uintptr(unsafe.Pointer(h.bitp)))
-	if n > maxn {
-		n = maxn
-	}
-	return h.forward(n), n
-}
-
-// The caller can test morePointers and isPointer by &-ing with bitScan and bitPointer.
-// The result includes in its higher bits the bits for subsequent words
-// described by the same bitmap byte.
-//
-// nosplit because it is used during write barriers and must not be preempted.
-//
-//go:nosplit
-func (h heapBits) bits() uint32 {
-	// The (shift & 31) eliminates a test and conditional branch
-	// from the generated code.
-	return uint32(*h.bitp) >> (h.shift & 31)
-}
-
-// morePointers reports whether this word and all remaining words in this object
-// are scalars.
-// h must not describe the second word of the object.
-func (h heapBits) morePointers() bool {
-	return h.bits()&bitScan != 0
-}
-
-// isPointer reports whether the heap bits describe a pointer word.
-//
-// nosplit because it is used during write barriers and must not be preempted.
-//
-//go:nosplit
-func (h heapBits) isPointer() bool {
-	return h.bits()&bitPointer != 0
+	// BTCQ
+	h.mask ^= uintptr(1) << (i & (ptrBits - 1))
+	// LEAQ (XX)(XX*8)
+	return h, h.addr + uintptr(i)*goarch.PtrSize
 }
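
Callers now iterate "give me the next pointer word" instead of testing one bit per word; bulkBarrierPreWrite below shows the real loop. A simplified standalone sketch of the same iteration shape over a plain []uintptr bitmap (a stand-in, not the runtime's heapBits type):

package main

import (
	"fmt"
	"math/bits"
)

const ptrSize = 8

// pointers scans a little-endian pointer bitmap covering
// [base, base+len(bitmap)*64*ptrSize) and returns the addresses of the words
// whose bits are set, mimicking the shape of heapBits.next: find the lowest
// set bit, clear it, convert the bit position to an address.
func pointers(base uintptr, bitmap []uintptr) []uintptr {
	var out []uintptr
	for idx, word := range bitmap {
		for word != 0 {
			i := bits.TrailingZeros64(uint64(word)) // like sys.TrailingZeros64
			word &^= 1 << i                         // like h.mask ^= 1 << i
			out = append(out, base+uintptr(idx*64+i)*ptrSize)
		}
	}
	return out
}

func main() {
	// Bits 0 and 3 set in the first bitmap word: pointers at base and base+3*8.
	fmt.Printf("%#x\n", pointers(0x1000, []uintptr{0b1001}))
}
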
 
 // bulkBarrierPreWrite executes a write barrier
@@ -611,27 +565,29 @@
 	}
 
 	buf := &getg().m.p.ptr().wbBuf
-	h := heapBitsForAddr(dst)
+	h := heapBitsForAddr(dst, size)
 	if src == 0 {
-		for i := uintptr(0); i < size; i += goarch.PtrSize {
-			if h.isPointer() {
-				dstx := (*uintptr)(unsafe.Pointer(dst + i))
-				if !buf.putFast(*dstx, 0) {
-					wbBufFlush(nil, 0)
-				}
+		for {
+			var addr uintptr
+			if h, addr = h.next(); addr == 0 {
+				break
 			}
-			h = h.next()
+			dstx := (*uintptr)(unsafe.Pointer(addr))
+			if !buf.putFast(*dstx, 0) {
+				wbBufFlush(nil, 0)
+			}
 		}
 	} else {
-		for i := uintptr(0); i < size; i += goarch.PtrSize {
-			if h.isPointer() {
-				dstx := (*uintptr)(unsafe.Pointer(dst + i))
-				srcx := (*uintptr)(unsafe.Pointer(src + i))
-				if !buf.putFast(*dstx, *srcx) {
-					wbBufFlush(nil, 0)
-				}
+		for {
+			var addr uintptr
+			if h, addr = h.next(); addr == 0 {
+				break
 			}
-			h = h.next()
+			dstx := (*uintptr)(unsafe.Pointer(addr))
+			srcx := (*uintptr)(unsafe.Pointer(src + (addr - dst)))
+			if !buf.putFast(*dstx, *srcx) {
+				wbBufFlush(nil, 0)
+			}
 		}
 	}
 }
@@ -654,15 +610,16 @@
 		return
 	}
 	buf := &getg().m.p.ptr().wbBuf
-	h := heapBitsForAddr(dst)
-	for i := uintptr(0); i < size; i += goarch.PtrSize {
-		if h.isPointer() {
-			srcx := (*uintptr)(unsafe.Pointer(src + i))
-			if !buf.putFast(0, *srcx) {
-				wbBufFlush(nil, 0)
-			}
+	h := heapBitsForAddr(dst, size)
+	for {
+		var addr uintptr
+		if h, addr = h.next(); addr == 0 {
+			break
 		}
-		h = h.next()
+		srcx := (*uintptr)(unsafe.Pointer(addr - dst + src))
+		if !buf.putFast(0, *srcx) {
+			wbBufFlush(nil, 0)
+		}
 	}
 }
 
@@ -759,43 +716,31 @@
 	}
 }
 
-// The methods operating on spans all require that h has been returned
-// by heapBitsForSpan and that size, n, total are the span layout description
-// returned by the mspan's layout method.
-// If total > size*n, it means that there is extra leftover memory in the span,
-// usually due to rounding.
-//
-// TODO(rsc): Perhaps introduce a different heapBitsSpan type.
-
-// initSpan initializes the heap bitmap for a span.
-// If this is a span of pointer-sized objects, it initializes all
-// words to pointer/scan.
-// Otherwise, it initializes all words to scalar/dead.
-func (h heapBits) initSpan(s *mspan) {
-	// Clear bits corresponding to objects.
-	nw := (s.npages << _PageShift) / goarch.PtrSize
-	if nw%wordsPerBitmapByte != 0 {
-		throw("initSpan: unaligned length")
-	}
-	if h.shift != 0 {
-		throw("initSpan: unaligned base")
+// initHeapBits initializes the heap bitmap for a span.
+// If this is a span of single pointer allocations, it initializes all
+// words to pointer. If force is true, clears all bits.
+func (s *mspan) initHeapBits(forceClear bool) {
+	if forceClear || s.spanclass.noscan() {
+		// Set all the pointer bits to zero. We do this once
+		// when the span is allocated so we don't have to do it
+		// for each object allocation.
+		base := s.base()
+		size := s.npages * pageSize
+		h := writeHeapBitsForAddr(base)
+		h.flush(base, size)
+		return
 	}
 	isPtrs := goarch.PtrSize == 8 && s.elemsize == goarch.PtrSize
-	for nw > 0 {
-		hNext, anw := h.forwardOrBoundary(nw)
-		nbyte := anw / wordsPerBitmapByte
-		if isPtrs {
-			bitp := h.bitp
-			for i := uintptr(0); i < nbyte; i++ {
-				*bitp = bitPointerAll | bitScanAll
-				bitp = add1(bitp)
-			}
-		} else {
-			memclrNoHeapPointers(unsafe.Pointer(h.bitp), nbyte)
-		}
-		h = hNext
-		nw -= anw
+	if !isPtrs {
+		return // nothing to do
 	}
+	h := writeHeapBitsForAddr(s.base())
+	size := s.npages * pageSize
+	nptrs := size / goarch.PtrSize
+	for i := uintptr(0); i < nptrs; i += ptrBits {
+		h = h.write(^uintptr(0), ptrBits)
+	}
+	h.flush(s.base(), size)
 }
 
 // countAlloc returns the number of objects allocated in span s by
@@ -818,6 +763,159 @@
 	return count
 }
 
+type writeHeapBits struct {
+	addr  uintptr // address that the low bit of mask represents the pointer state of.
+	mask  uintptr // some pointer bits starting at the address addr.
+	valid uintptr // number of bits in mask that are valid (including low)
+	low   uintptr // number of low-order bits to not overwrite
+}
+
+func writeHeapBitsForAddr(addr uintptr) (h writeHeapBits) {
+	// We start writing bits maybe in the middle of a heap bitmap word.
+	// Remember how many bits into the word we started, so we can be sure
+	// not to overwrite the previous bits.
+	h.low = addr / goarch.PtrSize % ptrBits
+
+	// round down to heap word that starts the bitmap word.
+	h.addr = addr - h.low*goarch.PtrSize
+
+	// We don't have any bits yet.
+	h.mask = 0
+	h.valid = h.low
+
+	return
+}
+
+// write appends the pointerness of the next valid pointer slots
+// using the low valid bits of bits. 1=pointer, 0=scalar.
+func (h writeHeapBits) write(bits, valid uintptr) writeHeapBits {
+	if h.valid+valid <= ptrBits {
+		// Fast path - just accumulate the bits.
+		h.mask |= bits << h.valid
+		h.valid += valid
+		return h
+	}
+	// Too many bits to fit in this word. Write the current word
+	// out and move on to the next word.
+
+	data := h.mask | bits<<h.valid       // mask for this word
+	h.mask = bits >> (ptrBits - h.valid) // leftover for next word
+	h.valid += valid - ptrBits           // have h.valid+valid bits, writing ptrBits of them
+
+	// Flush mask to the memory bitmap.
+	// TODO: figure out how to cache arena lookup.
+	ai := arenaIndex(h.addr)
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+	idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords
+	m := uintptr(1)<<h.low - 1
+	ha.bitmap[idx] = ha.bitmap[idx]&m | data
+	// Note: no synchronization required for this write because
+	// the allocator has exclusive access to the page, and the bitmap
+	// entries are all for a single page. Also, visibility of these
+	// writes is guaranteed by the publication barrier in mallocgc.
+
+	// Clear noMorePtrs bit, since we're going to be writing bits
+	// into the following word.
+	ha.noMorePtrs[idx/8] &^= uint8(1) << (idx % 8)
+	// Note: same as above
+
+	// Move to next word of bitmap.
+	h.addr += ptrBits * goarch.PtrSize
+	h.low = 0
+	return h
+}
+
+// Add padding of size bytes.
+func (h writeHeapBits) pad(size uintptr) writeHeapBits {
+	if size == 0 {
+		return h
+	}
+	words := size / goarch.PtrSize
+	for words > ptrBits {
+		h = h.write(0, ptrBits)
+		words -= ptrBits
+	}
+	return h.write(0, words)
+}
+
+// Flush the bits that have been written, and add zeros as needed
+// to cover the full object [addr, addr+size).
+func (h writeHeapBits) flush(addr, size uintptr) {
+	// zeros counts the number of bits needed to represent the object minus the
+	// number of bits we've already written. This is the number of 0 bits
+	// that need to be added.
+	zeros := (addr+size-h.addr)/goarch.PtrSize - h.valid
+
+	// Add zero bits up to the bitmap word boundary
+	if zeros > 0 {
+		z := ptrBits - h.valid
+		if z > zeros {
+			z = zeros
+		}
+		h.valid += z
+		zeros -= z
+	}
+
+	// Find word in bitmap that we're going to write.
+	ai := arenaIndex(h.addr)
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+	idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords
+
+	// Write remaining bits.
+	if h.valid != h.low {
+		m := uintptr(1)<<h.low - 1      // don't clear existing bits below "low"
+		m |= ^(uintptr(1)<<h.valid - 1) // don't clear existing bits above "valid"
+		ha.bitmap[idx] = ha.bitmap[idx]&m | h.mask
+	}
+	if zeros == 0 {
+		return
+	}
+
+	// Record in the noMorePtrs map that there won't be any more 1 bits,
+	// so readers can stop early.
+	ha.noMorePtrs[idx/8] |= uint8(1) << (idx % 8)
+
+	// Advance to next bitmap word.
+	h.addr += ptrBits * goarch.PtrSize
+
+	// Continue on writing zeros for the rest of the object.
+	// For standard use of the ptr bits this is not required, as
+	// the bits are read from the beginning of the object. Some uses,
+	// like noscan spans, oblets, bulk write barriers, and cgocheck, might
+	// start mid-object, so these writes are still required.
+	for {
+		// Write zero bits.
+		ai := arenaIndex(h.addr)
+		ha := mheap_.arenas[ai.l1()][ai.l2()]
+		idx := h.addr / (ptrBits * goarch.PtrSize) % heapArenaBitmapWords
+		if zeros < ptrBits {
+			ha.bitmap[idx] &^= uintptr(1)<<zeros - 1
+			break
+		} else if zeros == ptrBits {
+			ha.bitmap[idx] = 0
+			break
+		} else {
+			ha.bitmap[idx] = 0
+			zeros -= ptrBits
+		}
+		ha.noMorePtrs[idx/8] |= uint8(1) << (idx % 8)
+		h.addr += ptrBits * goarch.PtrSize
+	}
+}
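
The write path accumulates pointer bits into a single word-sized buffer and only spills to the arena bitmap when a full word is ready; flush then pads the remainder of the object with zero bits. A minimal sketch of that accumulate-and-spill shape against an ordinary []uintptr (an illustration only; it omits the low-bit preservation and noMorePtrs bookkeeping that writeHeapBits does):

package main

import "fmt"

const ptrBits = 64

// bitWriter packs variable-length groups of bits into 64-bit words,
// following the shape of writeHeapBits.write: the fast path appends into
// mask, the slow path emits a full word and carries the leftover bits.
type bitWriter struct {
	out   []uintptr
	mask  uintptr
	valid uint
}

func (w *bitWriter) write(bits uintptr, n uint) {
	if w.valid+n <= ptrBits {
		w.mask |= bits << w.valid
		w.valid += n
		return
	}
	w.out = append(w.out, w.mask|bits<<w.valid) // emit the completed word
	w.mask = bits >> (ptrBits - w.valid)        // leftover bits for the next word
	w.valid += n - ptrBits
}

func (w *bitWriter) flush() []uintptr {
	if w.valid > 0 {
		w.out = append(w.out, w.mask) // remaining bits, zero-padded
	}
	return w.out
}

func main() {
	var w bitWriter
	w.write(0b1011, 4) // e.g. a small type's pointer mask
	w.write(0, 60)     // pad to a word boundary, like writeHeapBits.pad
	w.write(1, 1)      // first word of the next object is a pointer
	fmt.Printf("%#x\n", w.flush())
}
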
+
+// Read the bytes starting at the aligned pointer p into a uintptr.
+// Read is little-endian.
+func readUintptr(p *byte) uintptr {
+	x := *(*uintptr)(unsafe.Pointer(p))
+	if goarch.BigEndian {
+		if goarch.PtrSize == 8 {
+			return uintptr(sys.Bswap64(uint64(x)))
+		}
+		return uintptr(sys.Bswap32(uint32(x)))
+	}
+	return x
+}
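
readUintptr depends on the gcdata pointer masks being byte-ordered little-endian (bit i of byte j describes word 8*j+i), so big-endian targets must byte-swap the loaded word before bit 0 lines up with the first heap word. A portable sketch of the same idea using encoding/binary in place of the runtime's sys.Bswap helpers:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// A gcdata-style 1-bit pointer mask for 16 words: words 0, 1 and 9 are pointers.
	mask := []byte{0b0000_0011, 0b0000_0010}

	// Interpreting the bytes little-endian yields a word whose bit i
	// describes word i, which is what readUintptr produces on all targets.
	buf := make([]byte, 8)
	copy(buf, mask)
	x := binary.LittleEndian.Uint64(buf)
	for i := 0; i < 16; i++ {
		if x>>uint(i)&1 != 0 {
			fmt.Println("word", i, "is a pointer")
		}
	}
}
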
+
 // heapBitsSetType records that the new allocation [x, x+size)
 // holds in [x, x+dataSize) one or more values of type typ.
 // (The number of values is given by dataSize / typ.size.)
@@ -829,7 +927,7 @@
 // heapBitsSweepSpan.
 //
 // There can only be one allocation from a given span active at a time,
-// and the bitmap for a span always falls on byte boundaries,
+// and the bitmap for a span always falls on word boundaries,
 // so there are no write-write races for access to the heap bitmap.
 // Hence, heapBitsSetType can access the bitmap without atomics.
 //
@@ -844,209 +942,61 @@
 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	const doubleCheck = false // slow but helpful; enable to test modifications to this code
 
-	const (
-		mask1 = bitPointer | bitScan                        // 00010001
-		mask2 = bitPointer | bitScan | mask1<<heapBitsShift // 00110011
-		mask3 = bitPointer | bitScan | mask2<<heapBitsShift // 01110111
-	)
-
-	// dataSize is always size rounded up to the next malloc size class,
-	// except in the case of allocating a defer block, in which case
-	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
-	// arbitrarily larger.
-	//
-	// The checks for size == goarch.PtrSize and size == 2*goarch.PtrSize can therefore
-	// assume that dataSize == size without checking it explicitly.
+	if doubleCheck && dataSize%typ.size != 0 {
+		throw("heapBitsSetType: dataSize not a multiple of typ.size")
+	}
 
 	if goarch.PtrSize == 8 && size == goarch.PtrSize {
 		// It's one word and it has pointers, it must be a pointer.
 		// Since all allocated one-word objects are pointers
 		// (non-pointers are aggregated into tinySize allocations),
-		// initSpan sets the pointer bits for us. Nothing to do here.
+		// (*mspan).initHeapBits sets the pointer bits for us.
+		// Nothing to do here.
 		if doubleCheck {
-			h := heapBitsForAddr(x)
-			if !h.isPointer() {
+			h, addr := heapBitsForAddr(x, size).next()
+			if addr != x {
 				throw("heapBitsSetType: pointer bit missing")
 			}
-			if !h.morePointers() {
-				throw("heapBitsSetType: scan bit missing")
+			_, addr = h.next()
+			if addr != 0 {
+				throw("heapBitsSetType: second pointer bit found")
 			}
 		}
 		return
 	}
 
-	h := heapBitsForAddr(x)
-	ptrmask := typ.gcdata // start of 1-bit pointer mask (or GC program, handled below)
+	h := writeHeapBitsForAddr(x)
 
-	// 2-word objects only have 4 bitmap bits and 3-word objects only have 6 bitmap bits.
-	// Therefore, these objects share a heap bitmap byte with the objects next to them.
-	// These are called out as a special case primarily so the code below can assume all
-	// objects are at least 4 words long and that their bitmaps start either at the beginning
-	// of a bitmap byte, or half-way in (h.shift of 0 and 2 respectively).
-
-	if size == 2*goarch.PtrSize {
-		if typ.size == goarch.PtrSize {
-			// We're allocating a block big enough to hold two pointers.
-			// On 64-bit, that means the actual object must be two pointers,
-			// or else we'd have used the one-pointer-sized block.
-			// On 32-bit, however, this is the 8-byte block, the smallest one.
-			// So it could be that we're allocating one pointer and this was
-			// just the smallest block available. Distinguish by checking dataSize.
-			// (In general the number of instances of typ being allocated is
-			// dataSize/typ.size.)
-			if goarch.PtrSize == 4 && dataSize == goarch.PtrSize {
-				// 1 pointer object. On 32-bit machines clear the bit for the
-				// unused second word.
-				*h.bitp &^= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
-				*h.bitp |= (bitPointer | bitScan) << h.shift
-			} else {
-				// 2-element array of pointer.
-				*h.bitp |= (bitPointer | bitScan | (bitPointer|bitScan)<<heapBitsShift) << h.shift
-			}
-			return
-		}
-		// Otherwise typ.size must be 2*goarch.PtrSize,
-		// and typ.kind&kindGCProg == 0.
-		if doubleCheck {
-			if typ.size != 2*goarch.PtrSize || typ.kind&kindGCProg != 0 {
-				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
-				throw("heapBitsSetType")
-			}
-		}
-		b := uint32(*ptrmask)
-		hb := b & 3
-		hb |= bitScanAll & ((bitScan << (typ.ptrdata / goarch.PtrSize)) - 1)
-		// Clear the bits for this object so we can set the
-		// appropriate ones.
-		*h.bitp &^= (bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << h.shift
-		*h.bitp |= uint8(hb << h.shift)
-		return
-	} else if size == 3*goarch.PtrSize {
-		b := uint8(*ptrmask)
-		if doubleCheck {
-			if b == 0 {
-				println("runtime: invalid type ", typ.string())
-				throw("heapBitsSetType: called with non-pointer type")
-			}
-			if goarch.PtrSize != 8 {
-				throw("heapBitsSetType: unexpected 3 pointer wide size class on 32 bit")
-			}
-			if typ.kind&kindGCProg != 0 {
-				throw("heapBitsSetType: unexpected GC prog for 3 pointer wide size class")
-			}
-			if typ.size == 2*goarch.PtrSize {
-				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, "\n")
-				throw("heapBitsSetType: inconsistent object sizes")
-			}
-		}
-		if typ.size == goarch.PtrSize {
-			// The type contains a pointer otherwise heapBitsSetType wouldn't have been called.
-			// Since the type is only 1 pointer wide and contains a pointer, its gcdata must be exactly 1.
-			if doubleCheck && *typ.gcdata != 1 {
-				print("runtime: heapBitsSetType size=", size, " typ.size=", typ.size, "but *typ.gcdata", *typ.gcdata, "\n")
-				throw("heapBitsSetType: unexpected gcdata for 1 pointer wide type size in 3 pointer wide size class")
-			}
-			// 3 element array of pointers. Unrolling ptrmask 3 times into p yields 00000111.
-			b = 7
-		}
-
-		hb := b & 7
-		// Set bitScan bits for all pointers.
-		hb |= hb << wordsPerBitmapByte
-		// First bitScan bit is always set since the type contains pointers.
-		hb |= bitScan
-		// Second bitScan bit needs to also be set if the third bitScan bit is set.
-		hb |= hb & (bitScan << (2 * heapBitsShift)) >> 1
-
-		// For h.shift > 1 heap bits cross a byte boundary and need to be written part
-		// to h.bitp and part to the next h.bitp.
-		switch h.shift {
-		case 0:
-			*h.bitp &^= mask3 << 0
-			*h.bitp |= hb << 0
-		case 1:
-			*h.bitp &^= mask3 << 1
-			*h.bitp |= hb << 1
-		case 2:
-			*h.bitp &^= mask2 << 2
-			*h.bitp |= (hb & mask2) << 2
-			// Two words written to the first byte.
-			// Advance two words to get to the next byte.
-			h = h.next().next()
-			*h.bitp &^= mask1
-			*h.bitp |= (hb >> 2) & mask1
-		case 3:
-			*h.bitp &^= mask1 << 3
-			*h.bitp |= (hb & mask1) << 3
-			// One word written to the first byte.
-			// Advance one word to get to the next byte.
-			h = h.next()
-			*h.bitp &^= mask2
-			*h.bitp |= (hb >> 1) & mask2
-		}
-		return
-	}
-
-	// Copy from 1-bit ptrmask into 2-bit bitmap.
-	// The basic approach is to use a single uintptr as a bit buffer,
-	// alternating between reloading the buffer and writing bitmap bytes.
-	// In general, one load can supply two bitmap byte writes.
-	// This is a lot of lines of code, but it compiles into relatively few
-	// machine instructions.
-
-	outOfPlace := false
-	if arenaIndex(x+size-1) != arenaIdx(h.arena) || (doubleCheck && fastrandn(2) == 0) {
-		// This object spans heap arenas, so the bitmap may be
-		// discontiguous. Unroll it into the object instead
-		// and then copy it out.
-		//
-		// In doubleCheck mode, we randomly do this anyway to
-		// stress test the bitmap copying path.
-		outOfPlace = true
-		h.bitp = (*uint8)(unsafe.Pointer(x))
-		h.last = nil
-	}
-
-	var (
-		// Ptrmask input.
-		p     *byte   // last ptrmask byte read
-		b     uintptr // ptrmask bits already loaded
-		nb    uintptr // number of bits in b at next read
-		endp  *byte   // final ptrmask byte to read (then repeat)
-		endnb uintptr // number of valid bits in *endp
-		pbits uintptr // alternate source of bits
-
-		// Heap bitmap output.
-		w     uintptr // words processed
-		nw    uintptr // number of words to process
-		hbitp *byte   // next heap bitmap byte to write
-		hb    uintptr // bits being prepared for *hbitp
-	)
-
-	hbitp = h.bitp
-
-	// Handle GC program. Delayed until this part of the code
-	// so that we can use the same double-checking mechanism
-	// as the 1-bit case. Nothing above could have encountered
-	// GC programs: the cases were all too small.
+	// Handle GC program.
 	if typ.kind&kindGCProg != 0 {
-		heapBitsSetTypeGCProg(h, typ.ptrdata, typ.size, dataSize, size, addb(typ.gcdata, 4))
-		if doubleCheck {
-			// Double-check the heap bits written by GC program
-			// by running the GC program to create a 1-bit pointer mask
-			// and then jumping to the double-check code below.
-			// This doesn't catch bugs shared between the 1-bit and 4-bit
-			// GC program execution, but it does catch mistakes specific
-			// to just one of those and bugs in heapBitsSetTypeGCProg's
-			// implementation of arrays.
-			lock(&debugPtrmask.lock)
-			if debugPtrmask.data == nil {
-				debugPtrmask.data = (*byte)(persistentalloc(1<<20, 1, &memstats.other_sys))
+		// Expand the gc program into the storage we're going to use for the actual object.
+		obj := (*uint8)(unsafe.Pointer(x))
+		n := runGCProg(addb(typ.gcdata, 4), obj)
+		// Use the expanded program to set the heap bits.
+		for i := uintptr(0); true; i += typ.size {
+			// Copy expanded program to heap bitmap.
+			p := obj
+			j := n
+			for j > 8 {
+				h = h.write(uintptr(*p), 8)
+				p = add1(p)
+				j -= 8
 			}
-			ptrmask = debugPtrmask.data
-			runGCProg(addb(typ.gcdata, 4), nil, ptrmask, 1)
+			h = h.write(uintptr(*p), j)
+
+			if i+typ.size == dataSize {
+				break // no padding after last element
+			}
+
+			// Pad with zeros to the start of the next element.
+			h = h.pad(typ.size - n*goarch.PtrSize)
 		}
-		goto Phase4
+
+		h.flush(x, size)
+
+		// Erase the expanded GC program.
+		memclrNoHeapPointers(unsafe.Pointer(obj), (n+7)/8)
+		return
 	}
 
 	// Note about sizes:
@@ -1061,424 +1011,98 @@
 	// to scan the buffer's heap bitmap at all.
 	// The 1-bit ptrmasks are sized to contain only bits for
 	// the typ.ptrdata prefix, zero padded out to a full byte
-	// of bitmap. This code sets nw (below) so that heap bitmap
-	// bits are only written for the typ.ptrdata prefix; if there is
-	// more room in the allocated object, the next heap bitmap
-	// entry is a 00, indicating that there are no more pointers
-	// to scan. So only the ptrmask for the ptrdata bytes is needed.
+	// of bitmap. If there is more room in the allocated object,
+	// that space is pointerless. The noMorePtrs bitmap will prevent
+	// scanning large pointerless tails of an object.
 	//
 	// Replicated copies are not as nice: if there is an array of
 	// objects with scalar tails, all but the last tail does have to
 	// be initialized, because there is no way to say "skip forward".
-	// However, because of the possibility of a repeated type with
-	// size not a multiple of 4 pointers (one heap bitmap byte),
-	// the code already must handle the last ptrmask byte specially
-	// by treating it as containing only the bits for endnb pointers,
-	// where endnb <= 4. We represent large scalar tails that must
-	// be expanded in the replication by setting endnb larger than 4.
-	// This will have the effect of reading many bits out of b,
-	// but once the real bits are shifted out, b will supply as many
-	// zero bits as we try to read, which is exactly what we need.
 
-	p = ptrmask
-	if typ.size < dataSize {
-		// Filling in bits for an array of typ.
-		// Set up for repetition of ptrmask during main loop.
-		// Note that ptrmask describes only a prefix of
-		const maxBits = goarch.PtrSize*8 - 7
-		if typ.ptrdata/goarch.PtrSize <= maxBits {
-			// Entire ptrmask fits in uintptr with room for a byte fragment.
-			// Load into pbits and never read from ptrmask again.
-			// This is especially important when the ptrmask has
-			// fewer than 8 bits in it; otherwise the reload in the middle
-			// of the Phase 2 loop would itself need to loop to gather
-			// at least 8 bits.
-
-			// Accumulate ptrmask into b.
-			// ptrmask is sized to describe only typ.ptrdata, but we record
-			// it as describing typ.size bytes, since all the high bits are zero.
-			nb = typ.ptrdata / goarch.PtrSize
-			for i := uintptr(0); i < nb; i += 8 {
-				b |= uintptr(*p) << i
-				p = add1(p)
-			}
-			nb = typ.size / goarch.PtrSize
-
-			// Replicate ptrmask to fill entire pbits uintptr.
-			// Doubling and truncating is fewer steps than
-			// iterating by nb each time. (nb could be 1.)
-			// Since we loaded typ.ptrdata/goarch.PtrSize bits
-			// but are pretending to have typ.size/goarch.PtrSize,
-			// there might be no replication necessary/possible.
-			pbits = b
-			endnb = nb
-			if nb+nb <= maxBits {
-				for endnb <= goarch.PtrSize*8 {
-					pbits |= pbits << endnb
-					endnb += endnb
+	ptrs := typ.ptrdata / goarch.PtrSize
+	if typ.size == dataSize { // Single element
+		if ptrs <= ptrBits { // Single small element
+			m := readUintptr(typ.gcdata)
+			h = h.write(m, ptrs)
+		} else { // Single large element
+			p := typ.gcdata
+			for {
+				h = h.write(readUintptr(p), ptrBits)
+				p = addb(p, ptrBits/8)
+				ptrs -= ptrBits
+				if ptrs <= ptrBits {
+					break
 				}
-				// Truncate to a multiple of original ptrmask.
-				// Because nb+nb <= maxBits, nb fits in a byte.
-				// Byte division is cheaper than uintptr division.
-				endnb = uintptr(maxBits/byte(nb)) * nb
-				pbits &= 1<<endnb - 1
-				b = pbits
-				nb = endnb
 			}
-
-			// Clear p and endp as sentinel for using pbits.
-			// Checked during Phase 2 loop.
-			p = nil
-			endp = nil
-		} else {
-			// Ptrmask is larger. Read it multiple times.
-			n := (typ.ptrdata/goarch.PtrSize+7)/8 - 1
-			endp = addb(ptrmask, n)
-			endnb = typ.size/goarch.PtrSize - n*8
+			m := readUintptr(p)
+			h = h.write(m, ptrs)
 		}
-	}
-	if p != nil {
-		b = uintptr(*p)
-		p = add1(p)
-		nb = 8
-	}
-
-	if typ.size == dataSize {
-		// Single entry: can stop once we reach the non-pointer data.
-		nw = typ.ptrdata / goarch.PtrSize
-	} else {
-		// Repeated instances of typ in an array.
-		// Have to process first N-1 entries in full, but can stop
-		// once we reach the non-pointer data in the final entry.
-		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / goarch.PtrSize
-	}
-	if nw == 0 {
-		// No pointers! Caller was supposed to check.
-		println("runtime: invalid type ", typ.string())
-		throw("heapBitsSetType: called with non-pointer type")
-		return
-	}
-
-	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==2).
-	// The leading byte is special because it contains the bits for word 1,
-	// which does not have the scan bit set.
-	// The leading half-byte is special because it's a half a byte,
-	// so we have to be careful with the bits already there.
-	switch {
-	default:
-		throw("heapBitsSetType: unexpected shift")
-
-	case h.shift == 0:
-		// Ptrmask and heap bitmap are aligned.
-		//
-		// This is a fast path for small objects.
-		//
-		// The first byte we write out covers the first four
-		// words of the object. The scan/dead bit on the first
-		// word must be set to scan since there are pointers
-		// somewhere in the object.
-		// In all following words, we set the scan/dead
-		// appropriately to indicate that the object continues
-		// to the next 2-bit entry in the bitmap.
-		//
-		// We set four bits at a time here, but if the object
-		// is fewer than four words, phase 3 will clear
-		// unnecessary bits.
-		hb = b & bitPointerAll
-		hb |= bitScanAll
-		if w += 4; w >= nw {
-			goto Phase3
-		}
-		*hbitp = uint8(hb)
-		hbitp = add1(hbitp)
-		b >>= 4
-		nb -= 4
-
-	case h.shift == 2:
-		// Ptrmask and heap bitmap are misaligned.
-		//
-		// On 32 bit architectures only the 6-word object that corresponds
-		// to a 24 bytes size class can start with h.shift of 2 here since
-		// all other non 16 byte aligned size classes have been handled by
-		// special code paths at the beginning of heapBitsSetType on 32 bit.
-		//
-		// Many size classes are only 16 byte aligned. On 64 bit architectures
-		// this results in a heap bitmap position starting with a h.shift of 2.
-		//
-		// The bits for the first two words are in a byte shared
-		// with another object, so we must be careful with the bits
-		// already there.
-		//
-		// We took care of 1-word, 2-word, and 3-word objects above,
-		// so this is at least a 6-word object.
-		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
-		hb |= bitScan << (2 * heapBitsShift)
-		if nw > 1 {
-			hb |= bitScan << (3 * heapBitsShift)
-		}
-		b >>= 2
-		nb -= 2
-		*hbitp &^= uint8((bitPointer | bitScan | ((bitPointer | bitScan) << heapBitsShift)) << (2 * heapBitsShift))
-		*hbitp |= uint8(hb)
-		hbitp = add1(hbitp)
-		if w += 2; w >= nw {
-			// We know that there is more data, because we handled 2-word and 3-word objects above.
-			// This must be at least a 6-word object. If we're out of pointer words,
-			// mark no scan in next bitmap byte and finish.
-			hb = 0
-			w += 4
-			goto Phase3
-		}
-	}
-
-	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
-	// The loop computes the bits for that last write but does not execute the write;
-	// it leaves the bits in hb for processing by phase 3.
-	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
-	// use in the first half of the loop right now, and then we only adjust nb explicitly
-	// if the 8 bits used by each iteration isn't balanced by 8 bits loaded mid-loop.
-	nb -= 4
-	for {
-		// Emit bitmap byte.
-		// b has at least nb+4 bits, with one exception:
-		// if w+4 >= nw, then b has only nw-w bits,
-		// but we'll stop at the break and then truncate
-		// appropriately in Phase 3.
-		hb = b & bitPointerAll
-		hb |= bitScanAll
-		if w += 4; w >= nw {
-			break
-		}
-		*hbitp = uint8(hb)
-		hbitp = add1(hbitp)
-		b >>= 4
-
-		// Load more bits. b has nb right now.
-		if p != endp {
-			// Fast path: keep reading from ptrmask.
-			// nb unmodified: we just loaded 8 bits,
-			// and the next iteration will consume 8 bits,
-			// leaving us with the same nb the next time we're here.
-			if nb < 8 {
-				b |= uintptr(*p) << nb
-				p = add1(p)
-			} else {
-				// Reduce the number of bits in b.
-				// This is important if we skipped
-				// over a scalar tail, since nb could
-				// be larger than the bit width of b.
-				nb -= 8
+	} else { // Repeated element
+		words := typ.size / goarch.PtrSize // total words, including scalar tail
+		if words <= ptrBits {              // Repeated small element
+			n := dataSize / typ.size
+			m := readUintptr(typ.gcdata)
+			// Make larger unit to repeat
+			for words <= ptrBits/2 {
+				if n&1 != 0 {
+					h = h.write(m, words)
+				}
+				n /= 2
+				m |= m << words
+				ptrs += words
+				words *= 2
+				if n == 1 {
+					break
+				}
 			}
-		} else if p == nil {
-			// Almost as fast path: track bit count and refill from pbits.
-			// For short repetitions.
-			if nb < 8 {
-				b |= pbits << nb
-				nb += endnb
+			for n > 1 {
+				h = h.write(m, words)
+				n--
 			}
-			nb -= 8 // for next iteration
-		} else {
-			// Slow path: reached end of ptrmask.
-			// Process final partial byte and rewind to start.
-			b |= uintptr(*p) << nb
-			nb += endnb
-			if nb < 8 {
-				b |= uintptr(*ptrmask) << nb
-				p = add1(ptrmask)
-			} else {
-				nb -= 8
-				p = ptrmask
+			h = h.write(m, ptrs)
+		} else { // Repeated large element
+			for i := uintptr(0); true; i += typ.size {
+				p := typ.gcdata
+				j := ptrs
+				for j > ptrBits {
+					h = h.write(readUintptr(p), ptrBits)
+					p = addb(p, ptrBits/8)
+					j -= ptrBits
+				}
+				m := readUintptr(p)
+				h = h.write(m, j)
+				if i+typ.size == dataSize {
+					break // don't need the trailing nonptr bits on the last element.
+				}
+				// Pad with zeros to the start of the next element.
+				h = h.pad(typ.size - typ.ptrdata)
 			}
 		}
-
-		// Emit bitmap byte.
-		hb = b & bitPointerAll
-		hb |= bitScanAll
-		if w += 4; w >= nw {
-			break
-		}
-		*hbitp = uint8(hb)
-		hbitp = add1(hbitp)
-		b >>= 4
 	}
+	h.flush(x, size)
 
-Phase3:
-	// Phase 3: Write last byte or partial byte and zero the rest of the bitmap entries.
-	if w > nw {
-		// Counting the 4 entries in hb not yet written to memory,
-		// there are more entries than possible pointer slots.
-		// Discard the excess entries (can't be more than 3).
-		mask := uintptr(1)<<(4-(w-nw)) - 1
-		hb &= mask | mask<<4 // apply mask to both pointer bits and scan bits
-	}
-
-	// Change nw from counting possibly-pointer words to total words in allocation.
-	nw = size / goarch.PtrSize
-
-	// Write whole bitmap bytes.
-	// The first is hb, the rest are zero.
-	if w <= nw {
-		*hbitp = uint8(hb)
-		hbitp = add1(hbitp)
-		hb = 0 // for possible final half-byte below
-		for w += 4; w <= nw; w += 4 {
-			*hbitp = 0
-			hbitp = add1(hbitp)
-		}
-	}
-
-	// Write final partial bitmap byte if any.
-	// We know w > nw, or else we'd still be in the loop above.
-	// It can be bigger only due to the 4 entries in hb that it counts.
-	// If w == nw+4 then there's nothing left to do: we wrote all nw entries
-	// and can discard the 4 sitting in hb.
-	// But if w == nw+2, we need to write first two in hb.
-	// The byte is shared with the next object, so be careful with
-	// existing bits.
-	if w == nw+2 {
-		*hbitp = *hbitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | uint8(hb)
-	}
-
-Phase4:
-	// Phase 4: Copy unrolled bitmap to per-arena bitmaps, if necessary.
-	if outOfPlace {
-		// TODO: We could probably make this faster by
-		// handling [x+dataSize, x+size) specially.
-		h := heapBitsForAddr(x)
-		// cnw is the number of heap words, or bit pairs
-		// remaining (like nw above).
-		cnw := size / goarch.PtrSize
-		src := (*uint8)(unsafe.Pointer(x))
-		// We know the first and last byte of the bitmap are
-		// not the same, but it's still possible for small
-		// objects span arenas, so it may share bitmap bytes
-		// with neighboring objects.
-		//
-		// Handle the first byte specially if it's shared. See
-		// Phase 1 for why this is the only special case we need.
-		if doubleCheck {
-			if !(h.shift == 0 || h.shift == 2) {
-				print("x=", x, " size=", size, " cnw=", h.shift, "\n")
-				throw("bad start shift")
-			}
-		}
-		if h.shift == 2 {
-			*h.bitp = *h.bitp&^((bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift)<<(2*heapBitsShift)) | *src
-			h = h.next().next()
-			cnw -= 2
-			src = addb(src, 1)
-		}
-		// We're now byte aligned. Copy out to per-arena
-		// bitmaps until the last byte (which may again be
-		// partial).
-		for cnw >= 4 {
-			// This loop processes four words at a time,
-			// so round cnw down accordingly.
-			hNext, words := h.forwardOrBoundary(cnw / 4 * 4)
-
-			// n is the number of bitmap bytes to copy.
-			n := words / 4
-			memmove(unsafe.Pointer(h.bitp), unsafe.Pointer(src), n)
-			cnw -= words
-			h = hNext
-			src = addb(src, n)
-		}
-		if doubleCheck && h.shift != 0 {
-			print("cnw=", cnw, " h.shift=", h.shift, "\n")
-			throw("bad shift after block copy")
-		}
-		// Handle the last byte if it's shared.
-		if cnw == 2 {
-			*h.bitp = *h.bitp&^(bitPointer|bitScan|(bitPointer|bitScan)<<heapBitsShift) | *src
-			src = addb(src, 1)
-			h = h.next().next()
-		}
-		if doubleCheck {
-			if uintptr(unsafe.Pointer(src)) > x+size {
-				throw("copy exceeded object size")
-			}
-			if !(cnw == 0 || cnw == 2) {
-				print("x=", x, " size=", size, " cnw=", cnw, "\n")
-				throw("bad number of remaining words")
-			}
-			// Set up hbitp so doubleCheck code below can check it.
-			hbitp = h.bitp
-		}
-		// Zero the object where we wrote the bitmap.
-		memclrNoHeapPointers(unsafe.Pointer(x), uintptr(unsafe.Pointer(src))-x)
-	}
-
-	// Double check the whole bitmap.
 	if doubleCheck {
-		// x+size may not point to the heap, so back up one
-		// word and then advance it the way we do above.
-		end := heapBitsForAddr(x + size - goarch.PtrSize)
-		if outOfPlace {
-			// In out-of-place copying, we just advance
-			// using next.
-			end = end.next()
-		} else {
-			// Don't use next because that may advance to
-			// the next arena and the in-place logic
-			// doesn't do that.
-			end.shift += heapBitsShift
-			if end.shift == 4*heapBitsShift {
-				end.bitp, end.shift = add1(end.bitp), 0
+		h := heapBitsForAddr(x, size)
+		for i := uintptr(0); i < size; i += goarch.PtrSize {
+			// Compute the pointer bit we want at offset i.
+			want := false
+			if i < dataSize {
+				off := i % typ.size
+				if off < typ.ptrdata {
+					j := off / goarch.PtrSize
+					want = *addb(typ.gcdata, j/8)>>(j%8)&1 != 0
+				}
+			}
+			if want {
+				var addr uintptr
+				h, addr = h.next()
+				if addr != x+i {
+					throw("heapBitsSetType: pointer entry not correct")
+				}
 			}
 		}
-		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
-			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
-			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
-			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
-			h0 := heapBitsForAddr(x)
-			print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
-			print("ended at hbitp=", hbitp, " but next starts at bitp=", end.bitp, " shift=", end.shift, "\n")
-			throw("bad heapBitsSetType")
-		}
-
-		// Double-check that bits to be written were written correctly.
-		// Does not check that other bits were not written, unfortunately.
-		h := heapBitsForAddr(x)
-		nptr := typ.ptrdata / goarch.PtrSize
-		ndata := typ.size / goarch.PtrSize
-		count := dataSize / typ.size
-		totalptr := ((count-1)*typ.size + typ.ptrdata) / goarch.PtrSize
-		for i := uintptr(0); i < size/goarch.PtrSize; i++ {
-			j := i % ndata
-			var have, want uint8
-			have = (*h.bitp >> h.shift) & (bitPointer | bitScan)
-			if i >= totalptr {
-				if typ.kind&kindGCProg != 0 && i < (totalptr+3)/4*4 {
-					// heapBitsSetTypeGCProg always fills
-					// in full nibbles of bitScan.
-					want = bitScan
-				}
-			} else {
-				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
-					want |= bitPointer
-				}
-				want |= bitScan
-			}
-			if have != want {
-				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
-				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
-				print("kindGCProg=", typ.kind&kindGCProg != 0, " outOfPlace=", outOfPlace, "\n")
-				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
-				h0 := heapBitsForAddr(x)
-				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
-				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
-				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
-				println("at word", i, "offset", i*goarch.PtrSize, "have", hex(have), "want", hex(want))
-				if typ.kind&kindGCProg != 0 {
-					println("GC program:")
-					dumpGCProg(addb(typ.gcdata, 4))
-				}
-				throw("bad heapBitsSetType")
-			}
-			h = h.next()
-		}
-		if ptrmask == debugPtrmask.data {
-			unlock(&debugPtrmask.lock)
+		if _, addr := h.next(); addr != 0 {
+			throw("heapBitsSetType: extra pointer")
 		}
 	}
 }
@@ -1488,92 +1112,6 @@
 	data *byte
 }
 
-// heapBitsSetTypeGCProg implements heapBitsSetType using a GC program.
-// progSize is the size of the memory described by the program.
-// elemSize is the size of the element that the GC program describes (a prefix of).
-// dataSize is the total size of the intended data, a multiple of elemSize.
-// allocSize is the total size of the allocated memory.
-//
-// GC programs are only used for large allocations.
-// heapBitsSetType requires that allocSize is a multiple of 4 words,
-// so that the relevant bitmap bytes are not shared with surrounding
-// objects.
-func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
-	if goarch.PtrSize == 8 && allocSize%(4*goarch.PtrSize) != 0 {
-		// Alignment will be wrong.
-		throw("heapBitsSetTypeGCProg: small allocation")
-	}
-	var totalBits uintptr
-	if elemSize == dataSize {
-		totalBits = runGCProg(prog, nil, h.bitp, 2)
-		if totalBits*goarch.PtrSize != progSize {
-			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
-			throw("heapBitsSetTypeGCProg: unexpected bit count")
-		}
-	} else {
-		count := dataSize / elemSize
-
-		// Piece together program trailer to run after prog that does:
-		//	literal(0)
-		//	repeat(1, elemSize-progSize-1) // zeros to fill element size
-		//	repeat(elemSize, count-1) // repeat that element for count
-		// This zero-pads the data remaining in the first element and then
-		// repeats that first element to fill the array.
-		var trailer [40]byte // 3 varints (max 10 each) + some bytes
-		i := 0
-		if n := elemSize/goarch.PtrSize - progSize/goarch.PtrSize; n > 0 {
-			// literal(0)
-			trailer[i] = 0x01
-			i++
-			trailer[i] = 0
-			i++
-			if n > 1 {
-				// repeat(1, n-1)
-				trailer[i] = 0x81
-				i++
-				n--
-				for ; n >= 0x80; n >>= 7 {
-					trailer[i] = byte(n | 0x80)
-					i++
-				}
-				trailer[i] = byte(n)
-				i++
-			}
-		}
-		// repeat(elemSize/ptrSize, count-1)
-		trailer[i] = 0x80
-		i++
-		n := elemSize / goarch.PtrSize
-		for ; n >= 0x80; n >>= 7 {
-			trailer[i] = byte(n | 0x80)
-			i++
-		}
-		trailer[i] = byte(n)
-		i++
-		n = count - 1
-		for ; n >= 0x80; n >>= 7 {
-			trailer[i] = byte(n | 0x80)
-			i++
-		}
-		trailer[i] = byte(n)
-		i++
-		trailer[i] = 0
-		i++
-
-		runGCProg(prog, &trailer[0], h.bitp, 2)
-
-		// Even though we filled in the full array just now,
-		// record that we only filled in up to the ptrdata of the
-		// last element. This will cause the code below to
-		// memclr the dead section of the final array element,
-		// so that scanobject can stop early in the final element.
-		totalBits = (elemSize*(count-1) + progSize) / goarch.PtrSize
-	}
-	endProg := unsafe.Pointer(addb(h.bitp, (totalBits+3)/4))
-	endAlloc := unsafe.Pointer(addb(h.bitp, allocSize/goarch.PtrSize/wordsPerBitmapByte))
-	memclrNoHeapPointers(endProg, uintptr(endAlloc)-uintptr(endProg))
-}
-
 // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
 // size is the size of the region described by prog, in bytes.
 // The resulting bitvector will have no more than size/goarch.PtrSize bits.
@@ -1581,7 +1119,7 @@
 	n := (size/goarch.PtrSize + 7) / 8
 	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
 	x[len(x)-1] = 0xa1 // overflow check sentinel
-	n = runGCProg(prog, nil, &x[0], 1)
+	n = runGCProg(prog, &x[0])
 	if x[len(x)-1] != 0xa1 {
 		throw("progToPointerMask: overflow")
 	}
@@ -1602,15 +1140,8 @@
 //	10000000 n c: repeat the previous n bits c times; n, c are varints
 //	1nnnnnnn c: repeat the previous n bits c times; c is a varint
 
-// runGCProg executes the GC program prog, and then trailer if non-nil,
-// writing to dst with entries of the given size.
-// If size == 1, dst is a 1-bit pointer mask laid out moving forward from dst.
-// If size == 2, dst is the 2-bit heap bitmap, and writes move backward
-// starting at dst (because the heap bitmap does). In this case, the caller guarantees
-// that only whole bytes in dst need to be written.
-//
-// runGCProg returns the number of 1- or 2-bit entries written to memory.
-func runGCProg(prog, trailer, dst *byte, size int) uintptr {
+// runGCProg returns the number of 1-bit entries written to memory.
+func runGCProg(prog, dst *byte) uintptr {
 	dstStart := dst
 
 	// Bits waiting to be written to memory.
@@ -1623,20 +1154,9 @@
 		// Flush accumulated full bytes.
 		// The rest of the loop assumes that nbits <= 7.
 		for ; nbits >= 8; nbits -= 8 {
-			if size == 1 {
-				*dst = uint8(bits)
-				dst = add1(dst)
-				bits >>= 8
-			} else {
-				v := bits&bitPointerAll | bitScanAll
-				*dst = uint8(v)
-				dst = add1(dst)
-				bits >>= 4
-				v = bits&bitPointerAll | bitScanAll
-				*dst = uint8(v)
-				dst = add1(dst)
-				bits >>= 4
-			}
+			*dst = uint8(bits)
+			dst = add1(dst)
+			bits >>= 8
 		}
 
 		// Process one instruction.
@@ -1646,32 +1166,16 @@
 		if inst&0x80 == 0 {
 			// Literal bits; n == 0 means end of program.
 			if n == 0 {
-				// Program is over; continue in trailer if present.
-				if trailer != nil {
-					p = trailer
-					trailer = nil
-					continue
-				}
+				// Program is over.
 				break Run
 			}
 			nbyte := n / 8
 			for i := uintptr(0); i < nbyte; i++ {
 				bits |= uintptr(*p) << nbits
 				p = add1(p)
-				if size == 1 {
-					*dst = uint8(bits)
-					dst = add1(dst)
-					bits >>= 8
-				} else {
-					v := bits&0xf | bitScanAll
-					*dst = uint8(v)
-					dst = add1(dst)
-					bits >>= 4
-					v = bits&0xf | bitScanAll
-					*dst = uint8(v)
-					dst = add1(dst)
-					bits >>= 4
-				}
+				*dst = uint8(bits)
+				dst = add1(dst)
+				bits >>= 8
 			}
 			if n %= 8; n > 0 {
 				bits |= uintptr(*p) << nbits
@@ -1720,22 +1224,12 @@
 			npattern := nbits
 
 			// If we need more bits, fetch them from memory.
-			if size == 1 {
+			src = subtract1(src)
+			for npattern < n {
+				pattern <<= 8
+				pattern |= uintptr(*src)
 				src = subtract1(src)
-				for npattern < n {
-					pattern <<= 8
-					pattern |= uintptr(*src)
-					src = subtract1(src)
-					npattern += 8
-				}
-			} else {
-				src = subtract1(src)
-				for npattern < n {
-					pattern <<= 4
-					pattern |= uintptr(*src) & 0xf
-					src = subtract1(src)
-					npattern += 4
-				}
+				npattern += 8
 			}
 
 			// We started with the whole bit output buffer,
@@ -1785,20 +1279,11 @@
 			for ; c >= npattern; c -= npattern {
 				bits |= pattern << nbits
 				nbits += npattern
-				if size == 1 {
-					for nbits >= 8 {
-						*dst = uint8(bits)
-						dst = add1(dst)
-						bits >>= 8
-						nbits -= 8
-					}
-				} else {
-					for nbits >= 4 {
-						*dst = uint8(bits&0xf | bitScanAll)
-						dst = add1(dst)
-						bits >>= 4
-						nbits -= 4
-					}
+				for nbits >= 8 {
+					*dst = uint8(bits)
+					dst = add1(dst)
+					bits >>= 8
+					nbits -= 8
 				}
 			}
 
@@ -1815,75 +1300,38 @@
 		// Since nbits <= 7, we know the first few bytes of repeated data
 		// are already written to memory.
 		off := n - nbits // n > nbits because n > maxBits and nbits <= 7
-		if size == 1 {
-			// Leading src fragment.
-			src = subtractb(src, (off+7)/8)
-			if frag := off & 7; frag != 0 {
-				bits |= uintptr(*src) >> (8 - frag) << nbits
-				src = add1(src)
-				nbits += frag
-				c -= frag
-			}
-			// Main loop: load one byte, write another.
-			// The bits are rotating through the bit buffer.
-			for i := c / 8; i > 0; i-- {
-				bits |= uintptr(*src) << nbits
-				src = add1(src)
-				*dst = uint8(bits)
-				dst = add1(dst)
-				bits >>= 8
-			}
-			// Final src fragment.
-			if c %= 8; c > 0 {
-				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
-				nbits += c
-			}
-		} else {
-			// Leading src fragment.
-			src = subtractb(src, (off+3)/4)
-			if frag := off & 3; frag != 0 {
-				bits |= (uintptr(*src) & 0xf) >> (4 - frag) << nbits
-				src = add1(src)
-				nbits += frag
-				c -= frag
-			}
-			// Main loop: load one byte, write another.
-			// The bits are rotating through the bit buffer.
-			for i := c / 4; i > 0; i-- {
-				bits |= (uintptr(*src) & 0xf) << nbits
-				src = add1(src)
-				*dst = uint8(bits&0xf | bitScanAll)
-				dst = add1(dst)
-				bits >>= 4
-			}
-			// Final src fragment.
-			if c %= 4; c > 0 {
-				bits |= (uintptr(*src) & (1<<c - 1)) << nbits
-				nbits += c
-			}
+		// Leading src fragment.
+		src = subtractb(src, (off+7)/8)
+		if frag := off & 7; frag != 0 {
+			bits |= uintptr(*src) >> (8 - frag) << nbits
+			src = add1(src)
+			nbits += frag
+			c -= frag
 		}
-	}
-
-	// Write any final bits out, using full-byte writes, even for the final byte.
-	var totalBits uintptr
-	if size == 1 {
-		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
-		nbits += -nbits & 7
-		for ; nbits > 0; nbits -= 8 {
+		// Main loop: load one byte, write another.
+		// The bits are rotating through the bit buffer.
+		for i := c / 8; i > 0; i-- {
+			bits |= uintptr(*src) << nbits
+			src = add1(src)
 			*dst = uint8(bits)
 			dst = add1(dst)
 			bits >>= 8
 		}
-	} else {
-		totalBits = (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*4 + nbits
-		nbits += -nbits & 3
-		for ; nbits > 0; nbits -= 4 {
-			v := bits&0xf | bitScanAll
-			*dst = uint8(v)
-			dst = add1(dst)
-			bits >>= 4
+		// Final src fragment.
+		if c %= 8; c > 0 {
+			bits |= (uintptr(*src) & (1<<c - 1)) << nbits
+			nbits += c
 		}
 	}
+
+	// Write any final bits out, using full-byte writes, even for the final byte.
+	totalBits := (uintptr(unsafe.Pointer(dst))-uintptr(unsafe.Pointer(dstStart)))*8 + nbits
+	nbits += -nbits & 7
+	for ; nbits > 0; nbits -= 8 {
+		*dst = uint8(bits)
+		dst = add1(dst)
+		bits >>= 8
+	}
 	return totalBits
 }
 
@@ -1898,7 +1346,7 @@
 	// Compute the number of pages needed for bitmapBytes.
 	pages := divRoundUp(bitmapBytes, pageSize)
 	s := mheap_.allocManual(pages, spanAllocPtrScalarBits)
-	runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
+	runGCProg(addb(prog, 4), (*byte)(unsafe.Pointer(s.startAddr)))
 	return s
 }
 func dematerializeGCProg(s *mspan) {
@@ -1961,18 +1409,12 @@
 	return true
 }
 
-// gcbits returns the GC type info for x, for testing.
+// reflect_gcbits returns the GC type info for x, for testing.
 // The result is the bitmap entries (0 or 1), one entry per byte.
 //
 //go:linkname reflect_gcbits reflect.gcbits
 func reflect_gcbits(x any) []byte {
-	ret := getgcmask(x)
-	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
-	nptr := typ.ptrdata / goarch.PtrSize
-	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
-		ret = ret[:len(ret)-1]
-	}
-	return ret
+	return getgcmask(x)
 }
 
 // Returns GC type info for the pointer stored in ep for testing.
@@ -2011,30 +1453,33 @@
 
 	// heap
 	if base, s, _ := findObject(uintptr(p), 0, 0); base != 0 {
-		hbits := heapBitsForAddr(base)
+		if s.spanclass.noscan() {
+			return nil
+		}
 		n := s.elemsize
+		hbits := heapBitsForAddr(base, n)
 		mask = make([]byte, n/goarch.PtrSize)
-		for i := uintptr(0); i < n; i += goarch.PtrSize {
-			if hbits.isPointer() {
-				mask[i/goarch.PtrSize] = 1
-			}
-			if !hbits.morePointers() {
-				mask = mask[:i/goarch.PtrSize]
+		for {
+			var addr uintptr
+			if hbits, addr = hbits.next(); addr == 0 {
 				break
 			}
-			hbits = hbits.next()
+			mask[(addr-base)/goarch.PtrSize] = 1
+		}
+		// Callers expect this mask to end at the last pointer.
+		for len(mask) > 0 && mask[len(mask)-1] == 0 {
+			mask = mask[:len(mask)-1]
 		}
 		return
 	}
 
 	// stack
-	if _g_ := getg(); _g_.m.curg.stack.lo <= uintptr(p) && uintptr(p) < _g_.m.curg.stack.hi {
+	if gp := getg(); gp.m.curg.stack.lo <= uintptr(p) && uintptr(p) < gp.m.curg.stack.hi {
 		var frame stkframe
 		frame.sp = uintptr(p)
-		_g_ := getg()
-		gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
+		gentraceback(gp.m.curg.sched.pc, gp.m.curg.sched.sp, 0, gp.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
 		if frame.fn.valid() {
-			locals, _, _ := getStackMap(&frame, nil, false)
+			locals, _, _ := frame.getStackMap(nil, false)
 			if locals.n == 0 {
 				return
 			}
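
The repeated-small-element path added above ("Make larger unit to repeat") writes n copies of a short pointer mask with only O(log n) bitmap writes: whenever the remaining count is odd it emits the current unit once, then doubles the unit and halves the count. Below is a minimal, self-contained Go sketch of that doubling idea, using a hypothetical bitWriter in place of the runtime's heapBits writer; the real code additionally avoids emitting bits for the pointer-free tail of the final element, which the sketch ignores.

	package main

	import (
		"fmt"
		"strings"
	)

	// bitWriter collects bits as a string of '0'/'1' so the result is easy to check.
	type bitWriter struct{ sb strings.Builder }

	// write appends the low width bits of pattern, least significant bit first.
	func (w *bitWriter) write(pattern uint64, width uint) {
		for i := uint(0); i < width; i++ {
			w.sb.WriteByte('0' + byte(pattern>>i&1))
		}
	}

	// writeRepeated appends count copies of the low width bits of pattern.
	// Instead of count separate writes, it doubles the unit while halving
	// count, emitting the current unit once for each odd remainder.
	func writeRepeated(w *bitWriter, pattern uint64, width uint, count int) {
		for count > 1 && width <= 32 { // keep the doubled unit inside a uint64
			if count&1 != 0 {
				w.write(pattern, width)
			}
			count /= 2
			pattern |= pattern << width
			width *= 2
		}
		for ; count > 0; count-- {
			w.write(pattern, width)
		}
	}

	func main() {
		var fast, naive bitWriter
		writeRepeated(&fast, 0b101, 3, 13) // 13 copies of "101"
		for i := 0; i < 13; i++ {
			naive.write(0b101, 3)
		}
		fmt.Println(fast.sb.String() == naive.sb.String()) // true
	}
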
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 1f484fb..acfd99b 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -6,6 +6,7 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -15,9 +16,9 @@
 //
 // mcaches are allocated from non-GC'd memory, so any heap pointers
 // must be specially handled.
-//
-//go:notinheap
 type mcache struct {
+	_ sys.NotInHeap
+
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
 	nextSample uintptr // trigger heap sample after allocating this many bytes
@@ -49,7 +50,7 @@
 	// was last flushed. If flushGen != mheap_.sweepgen, the spans
 	// in this mcache are stale and need to be flushed so they
 	// can be swept. This is done in acquirep.
-	flushGen uint32
+	flushGen atomic.Uint32
 }
 
 // A gclink is a node in a linked list of blocks, like mlink,
@@ -86,7 +87,7 @@
 	systemstack(func() {
 		lock(&mheap_.lock)
 		c = (*mcache)(mheap_.cachealloc.alloc())
-		c.flushGen = mheap_.sweepgen
+		c.flushGen.Store(mheap_.sweepgen)
 		unlock(&mheap_.lock)
 	})
 	for i := range c.alloc {
@@ -251,7 +252,7 @@
 	// visible to the background sweeper.
 	mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
 	s.limit = s.base() + size
-	heapBitsForAddr(s.base()).initSpan(s)
+	s.initHeapBits(false)
 	return s
 }
 
@@ -317,13 +318,14 @@
 	// allocate-black. However, with this approach it's difficult
 	// to avoid spilling mark bits into the *next* GC cycle.
 	sg := mheap_.sweepgen
-	if c.flushGen == sg {
+	flushGen := c.flushGen.Load()
+	if flushGen == sg {
 		return
-	} else if c.flushGen != sg-2 {
-		println("bad flushGen", c.flushGen, "in prepareForSweep; sweepgen", sg)
+	} else if flushGen != sg-2 {
+		println("bad flushGen", flushGen, "in prepareForSweep; sweepgen", sg)
 		throw("bad flushGen")
 	}
 	c.releaseAll()
 	stackcache_clear(c)
-	atomic.Store(&c.flushGen, mheap_.sweepgen) // Synchronizes with gcStart
+	c.flushGen.Store(mheap_.sweepgen) // Synchronizes with gcStart
 }
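
Two mechanical patterns recur in the mcache change above: the //go:notinheap pragma is replaced by embedding runtime/internal/sys.NotInHeap, and the bare flushGen uint32 plus atomic.Store calls become a typed atomic.Uint32 that can only be accessed through Load/Store, so a forgotten atomic access becomes a compile-time error rather than a latent race. A small sketch of the second pattern using the public sync/atomic analogue (the cache type here is illustrative, not the runtime's mcache):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	// cache stands in for mcache: flushGen can no longer be read or written
	// as a plain uint32, so every access is forced through Load/Store.
	type cache struct {
		flushGen atomic.Uint32 // the runtime uses runtime/internal/atomic.Uint32
	}

	func main() {
		var c cache
		c.flushGen.Store(7) // was: atomic.StoreUint32(&c.flushGen, 7)
		fmt.Println(c.flushGen.Load())
	}
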
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index e4bdf35..3382c54 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -12,12 +12,14 @@
 
 package runtime
 
-import "runtime/internal/atomic"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+)
 
 // Central list of free objects of a given size.
-//
-//go:notinheap
 type mcentral struct {
+	_         sys.NotInHeap
 	spanclass spanClass
 
 	// partial and full contain two mspan sets: one of swept in-use
@@ -250,6 +252,6 @@
 	// n := (npages << _PageShift) / size
 	n := s.divideByElemSize(npages << _PageShift)
 	s.limit = s.base() + size*n
-	heapBitsForAddr(s.base()).initSpan(s)
+	s.initHeapBits(false)
 	return s
 }
diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go
index 1dd2858..73c1a10 100644
--- a/src/runtime/mcheckmark.go
+++ b/src/runtime/mcheckmark.go
@@ -15,6 +15,7 @@
 import (
 	"internal/goarch"
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -22,9 +23,10 @@
 // per-arena bitmap with a bit for every word in the arena. The mark
 // is stored on the bit corresponding to the first word of the marked
 // allocation.
-//
-//go:notinheap
-type checkmarksMap [heapArenaBytes / goarch.PtrSize / 8]uint8
+type checkmarksMap struct {
+	_ sys.NotInHeap
+	b [heapArenaBytes / goarch.PtrSize / 8]uint8
+}
 
 // If useCheckmark is true, marking of an object uses the checkmark
 // bits instead of the standard mark bits.
@@ -50,8 +52,8 @@
 			arena.checkmarks = bitmap
 		} else {
 			// Otherwise clear the existing bitmap.
-			for i := range bitmap {
-				bitmap[i] = 0
+			for i := range bitmap.b {
+				bitmap.b[i] = 0
 			}
 		}
 	}
@@ -88,9 +90,9 @@
 
 	ai := arenaIndex(obj)
 	arena := mheap_.arenas[ai.l1()][ai.l2()]
-	arenaWord := (obj / heapArenaBytes / 8) % uintptr(len(arena.checkmarks))
+	arenaWord := (obj / heapArenaBytes / 8) % uintptr(len(arena.checkmarks.b))
 	mask := byte(1 << ((obj / heapArenaBytes) % 8))
-	bytep := &arena.checkmarks[arenaWord]
+	bytep := &arena.checkmarks.b[arenaWord]
 
 	if atomic.Load8(bytep)&mask != 0 {
 		// Already checkmarked.
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index 782465a..6c5edb1 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -23,7 +23,11 @@
 }
 
 func sysUnusedOS(v unsafe.Pointer, n uintptr) {
-	madvise(v, n, _MADV_FREE)
+	if debug.madvdontneed != 0 {
+		madvise(v, n, _MADV_DONTNEED)
+	} else {
+		madvise(v, n, _MADV_FREE)
+	}
 }
 
 func sysUsedOS(v unsafe.Pointer, n uintptr) {
diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go
index 0e8bf74..88e7d92 100644
--- a/src/runtime/mem_plan9.go
+++ b/src/runtime/mem_plan9.go
@@ -92,7 +92,7 @@
 }
 
 func memCheck() {
-	if memDebug == false {
+	if !memDebug {
 		return
 	}
 	for p := memFreelist.ptr(); p != nil && p.next != 0; p = p.next.ptr() {
diff --git a/src/runtime/memclr_riscv64.s b/src/runtime/memclr_riscv64.s
index f0e517a..d12b545 100644
--- a/src/runtime/memclr_riscv64.s
+++ b/src/runtime/memclr_riscv64.s
@@ -8,41 +8,96 @@
 
 // void runtime·memclrNoHeapPointers(void*, uintptr)
 TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
-#ifndef GOEXPERIMENT_regabiargs
-	MOV	ptr+0(FP), A0
-	MOV	n+8(FP), A1
-#endif
-	ADD	A0, A1, T4
+	// X10 = ptr
+	// X11 = n
 
-	// If less than eight bytes, do one byte at a time.
-	SLTU	$8, A1, T3
-	BNE	T3, ZERO, outcheck
+	// If less than 8 bytes, do single byte zeroing.
+	MOV	$8, X9
+	BLT	X11, X9, check4
 
-	// Do one byte at a time until eight-aligned.
-	JMP	aligncheck
+	// Check alignment
+	AND	$3, X10, X5
+	BEQZ	X5, aligned
+
+	// Zero one byte at a time until we reach 8 byte alignment.
+	SUB	X5, X11, X11
 align:
-	MOVB	ZERO, (A0)
-	ADD	$1, A0
-aligncheck:
-	AND	$7, A0, T3
-	BNE	T3, ZERO, align
+	ADD	$-1, X5
+	MOVB	ZERO, 0(X10)
+	ADD	$1, X10
+	BNEZ	X5, align
 
-	// Do eight bytes at a time as long as there is room.
-	ADD	$-7, T4, T5
-	JMP	wordscheck
-words:
-	MOV	ZERO, (A0)
-	ADD	$8, A0
-wordscheck:
-	SLTU	T5, A0, T3
-	BNE	T3, ZERO, words
+aligned:
+	MOV	$8, X9
+	BLT	X11, X9, check4
+	MOV	$16, X9
+	BLT	X11, X9, zero8
+	MOV	$32, X9
+	BLT	X11, X9, zero16
+	MOV	$64, X9
+	BLT	X11, X9, zero32
+loop64:
+	MOV	ZERO, 0(X10)
+	MOV	ZERO, 8(X10)
+	MOV	ZERO, 16(X10)
+	MOV	ZERO, 24(X10)
+	MOV	ZERO, 32(X10)
+	MOV	ZERO, 40(X10)
+	MOV	ZERO, 48(X10)
+	MOV	ZERO, 56(X10)
+	ADD	$64, X10
+	ADD	$-64, X11
+	BGE	X11, X9, loop64
+	BEQZ	X11, done
 
-	JMP	outcheck
-out:
-	MOVB	ZERO, (A0)
-	ADD	$1, A0
-outcheck:
-	BNE	A0, T4, out
+check32:
+	MOV	$32, X9
+	BLT	X11, X9, check16
+zero32:
+	MOV	ZERO, 0(X10)
+	MOV	ZERO, 8(X10)
+	MOV	ZERO, 16(X10)
+	MOV	ZERO, 24(X10)
+	ADD	$32, X10
+	ADD	$-32, X11
+	BEQZ	X11, done
+
+check16:
+	MOV	$16, X9
+	BLT	X11, X9, check8
+zero16:
+	MOV	ZERO, 0(X10)
+	MOV	ZERO, 8(X10)
+	ADD	$16, X10
+	ADD	$-16, X11
+	BEQZ	X11, done
+
+check8:
+	MOV	$8, X9
+	BLT	X11, X9, check4
+zero8:
+	MOV	ZERO, 0(X10)
+	ADD	$8, X10
+	ADD	$-8, X11
+	BEQZ	X11, done
+
+check4:
+	MOV	$4, X9
+	BLT	X11, X9, loop1
+zero4:
+	MOVB	ZERO, 0(X10)
+	MOVB	ZERO, 1(X10)
+	MOVB	ZERO, 2(X10)
+	MOVB	ZERO, 3(X10)
+	ADD	$4, X10
+	ADD	$-4, X11
+
+loop1:
+	BEQZ	X11, done
+	MOVB	ZERO, 0(X10)
+	ADD	$1, X10
+	ADD	$-1, X11
+	JMP	loop1
 
 done:
 	RET
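
For readers who do not follow RISC-V assembly: the new memclrNoHeapPointers zeroes byte by byte up to word alignment, then in wide stores, then finishes the remaining tail. A rough Go-level sketch of that shape, assuming unsafe word stores; this is only an illustration, not the actual routine, and it omits the unrolled 16/32/64-byte blocks.

	package main

	import (
		"fmt"
		"unsafe"
	)

	// zero clears b in the same shape as the assembly: byte stores until the
	// pointer is 8-byte aligned, 8-byte stores for the bulk, bytes for the tail.
	func zero(b []byte) {
		i := 0
		for i < len(b) && uintptr(unsafe.Pointer(&b[i]))%8 != 0 {
			b[i] = 0
			i++
		}
		for ; i+8 <= len(b); i += 8 {
			*(*uint64)(unsafe.Pointer(&b[i])) = 0 // aligned 8-byte store
		}
		for ; i < len(b); i++ {
			b[i] = 0
		}
	}

	func main() {
		b := []byte("some bytes to clear, longer than one word")
		zero(b)
		sum := 0
		for _, c := range b {
			sum += int(c)
		}
		fmt.Println(sum == 0) // true
	}
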
diff --git a/src/runtime/memclr_wasm.s b/src/runtime/memclr_wasm.s
index 5a05304..19d08ff 100644
--- a/src/runtime/memclr_wasm.s
+++ b/src/runtime/memclr_wasm.s
@@ -11,29 +11,10 @@
 	MOVD ptr+0(FP), R0
 	MOVD n+8(FP), R1
 
-loop:
-	Loop
-		Get R1
-		I64Eqz
-		If
-			RET
-		End
-
-		Get R0
-		I32WrapI64
-		I64Const $0
-		I64Store8 $0
-
-		Get R0
-		I64Const $1
-		I64Add
-		Set R0
-
-		Get R1
-		I64Const $1
-		I64Sub
-		Set R1
-
-		Br loop
-	End
-	UNDEF
+	Get R0
+	I32WrapI64
+	I32Const $0
+	Get R1
+	I32WrapI64
+	MemoryFill
+	RET
diff --git a/src/runtime/memmove_linux_amd64_test.go b/src/runtime/memmove_linux_amd64_test.go
index b3ccd90..5f90062 100644
--- a/src/runtime/memmove_linux_amd64_test.go
+++ b/src/runtime/memmove_linux_amd64_test.go
@@ -6,7 +6,6 @@
 
 import (
 	"os"
-	"reflect"
 	"syscall"
 	"testing"
 	"unsafe"
@@ -45,11 +44,7 @@
 		defer syscall.Syscall(syscall.SYS_MUNMAP, base+off, 65536, 0)
 	}
 
-	var s []byte
-	sp := (*reflect.SliceHeader)(unsafe.Pointer(&s))
-	sp.Data = base
-	sp.Len, sp.Cap = 3<<30, 3<<30
-
+	s := unsafe.Slice((*byte)(unsafe.Pointer(base)), 3<<30)
 	n := copy(s[1:], s)
 	if n != 3<<30-1 {
 		t.Fatalf("copied %d bytes, expected %d", n, 3<<30-1)
diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s
index 538aee3..ea622ed 100644
--- a/src/runtime/memmove_riscv64.s
+++ b/src/runtime/memmove_riscv64.s
@@ -8,93 +8,311 @@
 
 // void runtime·memmove(void*, void*, uintptr)
 TEXT runtime·memmove<ABIInternal>(SB),NOSPLIT,$-0-24
-#ifndef GOEXPERIMENT_regabiargs
-	MOV	to+0(FP), A0
-	MOV	from+8(FP), A1
-	MOV	n+16(FP), A2
-#endif
-	ADD	A1, A2, T5
+	// X10 = to
+	// X11 = from
+	// X12 = n
+	BEQ	X10, X11, done
+	BEQZ	X12, done
 
 	// If the destination is ahead of the source, start at the end of the
 	// buffer and go backward.
-	BLTU	A1, A0, b
+	BGTU	X10, X11, backward
 
-	// If less than eight bytes, do one byte at a time.
-	SLTU	$8, A2, T3
-	BNE	T3, ZERO, f_outcheck
+	// If less than 8 bytes, do single byte copies.
+	MOV	$8, X9
+	BLT	X12, X9, f_loop4_check
 
-	// Do one byte at a time until from is eight-aligned.
-	JMP	f_aligncheck
+	// Check alignment - if alignment differs we have to do one byte at a time.
+	AND	$3, X10, X5
+	AND	$3, X11, X6
+	BNE	X5, X6, f_loop8_unaligned_check
+	BEQZ	X5, f_loop_check
+
+	// Move one byte at a time until we reach 8 byte alignment.
+	SUB	X5, X12, X12
 f_align:
-	MOVB	(A1), T3
-	MOVB	T3, (A0)
-	ADD	$1, A0
-	ADD	$1, A1
-f_aligncheck:
-	AND	$7, A1, T3
-	BNE	T3, ZERO, f_align
+	ADD	$-1, X5
+	MOVB	0(X11), X14
+	MOVB	X14, 0(X10)
+	ADD	$1, X10
+	ADD	$1, X11
+	BNEZ	X5, f_align
 
-	// Do eight bytes at a time as long as there is room.
-	ADD	$-7, T5, T6
-	JMP	f_wordscheck
-f_words:
-	MOV	(A1), T3
-	MOV	T3, (A0)
-	ADD	$8, A0
-	ADD	$8, A1
-f_wordscheck:
-	SLTU	T6, A1, T3
-	BNE	T3, ZERO, f_words
+f_loop_check:
+	MOV	$16, X9
+	BLT	X12, X9, f_loop8_check
+	MOV	$32, X9
+	BLT	X12, X9, f_loop16_check
+	MOV	$64, X9
+	BLT	X12, X9, f_loop32_check
+f_loop64:
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	16(X11), X16
+	MOV	24(X11), X17
+	MOV	32(X11), X18
+	MOV	40(X11), X19
+	MOV	48(X11), X20
+	MOV	56(X11), X21
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	MOV	X16, 16(X10)
+	MOV	X17, 24(X10)
+	MOV	X18, 32(X10)
+	MOV	X19, 40(X10)
+	MOV	X20, 48(X10)
+	MOV	X21, 56(X10)
+	ADD	$64, X10
+	ADD	$64, X11
+	ADD	$-64, X12
+	BGE	X12, X9, f_loop64
+	BEQZ	X12, done
 
-	// Finish off the remaining partial word.
-	JMP 	f_outcheck
-f_out:
-	MOVB	(A1), T3
-	MOVB	T3, (A0)
-	ADD	$1, A0
-	ADD	$1, A1
-f_outcheck:
-	BNE	A1, T5, f_out
+f_loop32_check:
+	MOV	$32, X9
+	BLT	X12, X9, f_loop16_check
+f_loop32:
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	16(X11), X16
+	MOV	24(X11), X17
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	MOV	X16, 16(X10)
+	MOV	X17, 24(X10)
+	ADD	$32, X10
+	ADD	$32, X11
+	ADD	$-32, X12
+	BGE	X12, X9, f_loop32
+	BEQZ	X12, done
 
-	RET
+f_loop16_check:
+	MOV	$16, X9
+	BLT	X12, X9, f_loop8_check
+f_loop16:
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	ADD	$16, X10
+	ADD	$16, X11
+	ADD	$-16, X12
+	BGE	X12, X9, f_loop16
+	BEQZ	X12, done
 
-b:
-	ADD	A0, A2, T4
-	// If less than eight bytes, do one byte at a time.
-	SLTU	$8, A2, T3
-	BNE	T3, ZERO, b_outcheck
+f_loop8_check:
+	MOV	$8, X9
+	BLT	X12, X9, f_loop4_check
+f_loop8:
+	MOV	0(X11), X14
+	MOV	X14, 0(X10)
+	ADD	$8, X10
+	ADD	$8, X11
+	ADD	$-8, X12
+	BGE	X12, X9, f_loop8
+	BEQZ	X12, done
+	JMP	f_loop4_check
 
-	// Do one byte at a time until from+n is eight-aligned.
-	JMP	b_aligncheck
+f_loop8_unaligned_check:
+	MOV	$8, X9
+	BLT	X12, X9, f_loop4_check
+f_loop8_unaligned:
+	MOVB	0(X11), X14
+	MOVB	1(X11), X15
+	MOVB	2(X11), X16
+	MOVB	3(X11), X17
+	MOVB	4(X11), X18
+	MOVB	5(X11), X19
+	MOVB	6(X11), X20
+	MOVB	7(X11), X21
+	MOVB	X14, 0(X10)
+	MOVB	X15, 1(X10)
+	MOVB	X16, 2(X10)
+	MOVB	X17, 3(X10)
+	MOVB	X18, 4(X10)
+	MOVB	X19, 5(X10)
+	MOVB	X20, 6(X10)
+	MOVB	X21, 7(X10)
+	ADD	$8, X10
+	ADD	$8, X11
+	ADD	$-8, X12
+	BGE	X12, X9, f_loop8_unaligned
+
+f_loop4_check:
+	MOV	$4, X9
+	BLT	X12, X9, f_loop1
+f_loop4:
+	MOVB	0(X11), X14
+	MOVB	1(X11), X15
+	MOVB	2(X11), X16
+	MOVB	3(X11), X17
+	MOVB	X14, 0(X10)
+	MOVB	X15, 1(X10)
+	MOVB	X16, 2(X10)
+	MOVB	X17, 3(X10)
+	ADD	$4, X10
+	ADD	$4, X11
+	ADD	$-4, X12
+	BGE	X12, X9, f_loop4
+
+f_loop1:
+	BEQZ	X12, done
+	MOVB	0(X11), X14
+	MOVB	X14, 0(X10)
+	ADD	$1, X10
+	ADD	$1, X11
+	ADD	$-1, X12
+	JMP	f_loop1
+
+backward:
+	ADD	X10, X12, X10
+	ADD	X11, X12, X11
+
+	// If less than 8 bytes, do single byte copies.
+	MOV	$8, X9
+	BLT	X12, X9, b_loop4_check
+
+	// Check alignment - if alignment differs we have to do one byte at a time.
+	AND	$3, X10, X5
+	AND	$3, X11, X6
+	BNE	X5, X6, b_loop8_unaligned_check
+	BEQZ	X5, b_loop_check
+
+	// Move one byte at a time until we reach 8 byte alignment.
+	SUB	X5, X12, X12
 b_align:
-	ADD	$-1, T4
-	ADD	$-1, T5
-	MOVB	(T5), T3
-	MOVB	T3, (T4)
-b_aligncheck:
-	AND	$7, T5, T3
-	BNE	T3, ZERO, b_align
+	ADD	$-1, X5
+	ADD	$-1, X10
+	ADD	$-1, X11
+	MOVB	0(X11), X14
+	MOVB	X14, 0(X10)
+	BNEZ	X5, b_align
 
-	// Do eight bytes at a time as long as there is room.
-	ADD	$7, A1, T6
-	JMP	b_wordscheck
-b_words:
-	ADD	$-8, T4
-	ADD	$-8, T5
-	MOV	(T5), T3
-	MOV	T3, (T4)
-b_wordscheck:
-	SLTU	T5, T6, T3
-	BNE	T3, ZERO, b_words
+b_loop_check:
+	MOV	$16, X9
+	BLT	X12, X9, b_loop8_check
+	MOV	$32, X9
+	BLT	X12, X9, b_loop16_check
+	MOV	$64, X9
+	BLT	X12, X9, b_loop32_check
+b_loop64:
+	ADD	$-64, X10
+	ADD	$-64, X11
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	16(X11), X16
+	MOV	24(X11), X17
+	MOV	32(X11), X18
+	MOV	40(X11), X19
+	MOV	48(X11), X20
+	MOV	56(X11), X21
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	MOV	X16, 16(X10)
+	MOV	X17, 24(X10)
+	MOV	X18, 32(X10)
+	MOV	X19, 40(X10)
+	MOV	X20, 48(X10)
+	MOV	X21, 56(X10)
+	ADD	$-64, X12
+	BGE	X12, X9, b_loop64
+	BEQZ	X12, done
 
-	// Finish off the remaining partial word.
-	JMP	b_outcheck
-b_out:
-	ADD	$-1, T4
-	ADD	$-1, T5
-	MOVB	(T5), T3
-	MOVB	T3, (T4)
-b_outcheck:
-	BNE	T5, A1, b_out
+b_loop32_check:
+	MOV	$32, X9
+	BLT	X12, X9, b_loop16_check
+b_loop32:
+	ADD	$-32, X10
+	ADD	$-32, X11
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	16(X11), X16
+	MOV	24(X11), X17
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	MOV	X16, 16(X10)
+	MOV	X17, 24(X10)
+	ADD	$-32, X12
+	BGE	X12, X9, b_loop32
+	BEQZ	X12, done
 
+b_loop16_check:
+	MOV	$16, X9
+	BLT	X12, X9, b_loop8_check
+b_loop16:
+	ADD	$-16, X10
+	ADD	$-16, X11
+	MOV	0(X11), X14
+	MOV	8(X11), X15
+	MOV	X14, 0(X10)
+	MOV	X15, 8(X10)
+	ADD	$-16, X12
+	BGE	X12, X9, b_loop16
+	BEQZ	X12, done
+
+b_loop8_check:
+	MOV	$8, X9
+	BLT	X12, X9, b_loop4_check
+b_loop8:
+	ADD	$-8, X10
+	ADD	$-8, X11
+	MOV	0(X11), X14
+	MOV	X14, 0(X10)
+	ADD	$-8, X12
+	BGE	X12, X9, b_loop8
+	BEQZ	X12, done
+	JMP	b_loop4_check
+
+b_loop8_unaligned_check:
+	MOV	$8, X9
+	BLT	X12, X9, b_loop4_check
+b_loop8_unaligned:
+	ADD	$-8, X10
+	ADD	$-8, X11
+	MOVB	0(X11), X14
+	MOVB	1(X11), X15
+	MOVB	2(X11), X16
+	MOVB	3(X11), X17
+	MOVB	4(X11), X18
+	MOVB	5(X11), X19
+	MOVB	6(X11), X20
+	MOVB	7(X11), X21
+	MOVB	X14, 0(X10)
+	MOVB	X15, 1(X10)
+	MOVB	X16, 2(X10)
+	MOVB	X17, 3(X10)
+	MOVB	X18, 4(X10)
+	MOVB	X19, 5(X10)
+	MOVB	X20, 6(X10)
+	MOVB	X21, 7(X10)
+	ADD	$-8, X12
+	BGE	X12, X9, b_loop8_unaligned
+
+b_loop4_check:
+	MOV	$4, X9
+	BLT	X12, X9, b_loop1
+b_loop4:
+	ADD	$-4, X10
+	ADD	$-4, X11
+	MOVB	0(X11), X14
+	MOVB	1(X11), X15
+	MOVB	2(X11), X16
+	MOVB	3(X11), X17
+	MOVB	X14, 0(X10)
+	MOVB	X15, 1(X10)
+	MOVB	X16, 2(X10)
+	MOVB	X17, 3(X10)
+	ADD	$-4, X12
+	BGE	X12, X9, b_loop4
+
+b_loop1:
+	BEQZ	X12, done
+	ADD	$-1, X10
+	ADD	$-1, X11
+	MOVB	0(X11), X14
+	MOVB	X14, 0(X10)
+	ADD	$-1, X12
+	JMP	b_loop1
+
+done:
 	RET
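
The rewritten memmove keeps the classic overlap rule: copy forward unless the destination is ahead of the source, in which case start at the end and copy backward so no byte is overwritten before it has been read. A tiny index-based Go sketch of that rule (move is a toy helper, not the runtime's memmove):

	package main

	import "fmt"

	// move copies n bytes within buf from offset src to offset dst, choosing
	// direction the way the assembly does: forward normally, backward when
	// the destination is ahead of the source, so overlapping bytes are not
	// clobbered before they are read.
	func move(buf []byte, dst, src, n int) {
		if dst == src || n == 0 {
			return
		}
		if dst < src {
			for i := 0; i < n; i++ {
				buf[dst+i] = buf[src+i]
			}
			return
		}
		for i := n - 1; i >= 0; i-- {
			buf[dst+i] = buf[src+i]
		}
	}

	func main() {
		b := []byte("abcdefgh")
		move(b, 2, 0, 6)       // overlapping shift right by two
		fmt.Println(string(b)) // "ababcdef"
	}
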
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 8887320..f1247f6 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -244,23 +244,23 @@
 					dst[i] = nil
 				}
 
-				var ready uint32
+				var ready atomic.Uint32
 				go func() {
 					sp := unsafe.Pointer(&src[0])
 					dp := unsafe.Pointer(&dst[0])
-					atomic.StoreUint32(&ready, 1)
+					ready.Store(1)
 					for i := 0; i < 10000; i++ {
 						Memmove(dp, sp, sz)
 						MemclrNoHeapPointers(dp, sz)
 					}
-					atomic.StoreUint32(&ready, 2)
+					ready.Store(2)
 				}()
 
-				for atomic.LoadUint32(&ready) == 0 {
+				for ready.Load() == 0 {
 					Gosched()
 				}
 
-				for atomic.LoadUint32(&ready) != 2 {
+				for ready.Load() != 2 {
 					for i := range dst {
 						p := dst[i]
 						if p != nil && p != &x {
@@ -417,20 +417,20 @@
 	}
 
 	benchSizes := []RunData{
-		RunData{[]int{1043, 1078, 1894, 1582, 1044, 1165, 1467, 1100, 1919, 1562, 1932, 1645,
+		{[]int{1043, 1078, 1894, 1582, 1044, 1165, 1467, 1100, 1919, 1562, 1932, 1645,
 			1412, 1038, 1576, 1200, 1029, 1336, 1095, 1494, 1350, 1025, 1502, 1548, 1316, 1296,
 			1868, 1639, 1546, 1626, 1642, 1308, 1726, 1665, 1678, 1187, 1515, 1598, 1353, 1237,
 			1977, 1452, 2012, 1914, 1514, 1136, 1975, 1618, 1536, 1695, 1600, 1733, 1392, 1099,
 			1358, 1996, 1224, 1783, 1197, 1838, 1460, 1556, 1554, 2020}}, // 1kb-2kb
-		RunData{[]int{3964, 5139, 6573, 7775, 6553, 2413, 3466, 5394, 2469, 7336, 7091, 6745,
+		{[]int{3964, 5139, 6573, 7775, 6553, 2413, 3466, 5394, 2469, 7336, 7091, 6745,
 			4028, 5643, 6164, 3475, 4138, 6908, 7559, 3335, 5660, 4122, 3945, 2082, 7564, 6584,
 			5111, 2288, 6789, 2797, 4928, 7986, 5163, 5447, 2999, 4968, 3174, 3202, 7908, 8137,
 			4735, 6161, 4646, 7592, 3083, 5329, 3687, 2754, 3599, 7231, 6455, 2549, 8063, 2189,
 			7121, 5048, 4277, 6626, 6306, 2815, 7473, 3963, 7549, 7255}}, // 2kb-8kb
-		RunData{[]int{16304, 15936, 15760, 4736, 9136, 11184, 10160, 5952, 14560, 15744,
+		{[]int{16304, 15936, 15760, 4736, 9136, 11184, 10160, 5952, 14560, 15744,
 			6624, 5872, 13088, 14656, 14192, 10304, 4112, 10384, 9344, 4496, 11392, 7024,
 			5200, 10064, 14784, 5808, 13504, 10480, 8512, 4896, 13264, 5600}}, // 4kb-16kb
-		RunData{[]int{164576, 233136, 220224, 183280, 214112, 217248, 228560, 201728}}, // 128kb-256kb
+		{[]int{164576, 233136, 220224, 183280, 214112, 217248, 228560, 201728}}, // 128kb-256kb
 	}
 
 	for _, t := range benchSizes {
@@ -468,160 +468,382 @@
 	}
 }
 
+func BenchmarkClearFat7(b *testing.B) {
+	p := new([7]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [7]byte{}
+	}
+}
+
 func BenchmarkClearFat8(b *testing.B) {
+	p := new([8 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [8 / 4]uint32
-		_ = x
+		*p = [8 / 4]uint32{}
 	}
 }
+
+func BenchmarkClearFat11(b *testing.B) {
+	p := new([11]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [11]byte{}
+	}
+}
+
 func BenchmarkClearFat12(b *testing.B) {
+	p := new([12 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [12 / 4]uint32
-		_ = x
+		*p = [12 / 4]uint32{}
 	}
 }
+
+func BenchmarkClearFat13(b *testing.B) {
+	p := new([13]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [13]byte{}
+	}
+}
+
+func BenchmarkClearFat14(b *testing.B) {
+	p := new([14]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [14]byte{}
+	}
+}
+
+func BenchmarkClearFat15(b *testing.B) {
+	p := new([15]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [15]byte{}
+	}
+}
+
 func BenchmarkClearFat16(b *testing.B) {
+	p := new([16 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [16 / 4]uint32
-		_ = x
+		*p = [16 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat24(b *testing.B) {
+	p := new([24 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [24 / 4]uint32
-		_ = x
+		*p = [24 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat32(b *testing.B) {
+	p := new([32 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [32 / 4]uint32
-		_ = x
+		*p = [32 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat40(b *testing.B) {
+	p := new([40 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [40 / 4]uint32
-		_ = x
+		*p = [40 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat48(b *testing.B) {
+	p := new([48 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [48 / 4]uint32
-		_ = x
+		*p = [48 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat56(b *testing.B) {
+	p := new([56 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [56 / 4]uint32
-		_ = x
+		*p = [56 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat64(b *testing.B) {
+	p := new([64 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [64 / 4]uint32
-		_ = x
+		*p = [64 / 4]uint32{}
 	}
 }
+
+func BenchmarkClearFat72(b *testing.B) {
+	p := new([72 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [72 / 4]uint32{}
+	}
+}
+
 func BenchmarkClearFat128(b *testing.B) {
+	p := new([128 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [128 / 4]uint32
-		_ = x
+		*p = [128 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat256(b *testing.B) {
+	p := new([256 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [256 / 4]uint32
-		_ = x
+		*p = [256 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat512(b *testing.B) {
+	p := new([512 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [512 / 4]uint32
-		_ = x
+		*p = [512 / 4]uint32{}
 	}
 }
+
 func BenchmarkClearFat1024(b *testing.B) {
+	p := new([1024 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		var x [1024 / 4]uint32
-		_ = x
+		*p = [1024 / 4]uint32{}
+	}
+}
+
+func BenchmarkClearFat1032(b *testing.B) {
+	p := new([1032 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [1032 / 4]uint32{}
+	}
+}
+
+func BenchmarkClearFat1040(b *testing.B) {
+	p := new([1040 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = [1040 / 4]uint32{}
+	}
+}
+
+func BenchmarkCopyFat7(b *testing.B) {
+	var x [7]byte
+	p := new([7]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
 	}
 }
 
 func BenchmarkCopyFat8(b *testing.B) {
 	var x [8 / 4]uint32
+	p := new([8 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
+func BenchmarkCopyFat11(b *testing.B) {
+	var x [11]byte
+	p := new([11]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
 func BenchmarkCopyFat12(b *testing.B) {
 	var x [12 / 4]uint32
+	p := new([12 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
+func BenchmarkCopyFat13(b *testing.B) {
+	var x [13]byte
+	p := new([13]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
+func BenchmarkCopyFat14(b *testing.B) {
+	var x [14]byte
+	p := new([14]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
+func BenchmarkCopyFat15(b *testing.B) {
+	var x [15]byte
+	p := new([15]byte)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
 func BenchmarkCopyFat16(b *testing.B) {
 	var x [16 / 4]uint32
+	p := new([16 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat24(b *testing.B) {
 	var x [24 / 4]uint32
+	p := new([24 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat32(b *testing.B) {
 	var x [32 / 4]uint32
+	p := new([32 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat64(b *testing.B) {
 	var x [64 / 4]uint32
+	p := new([64 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
+func BenchmarkCopyFat72(b *testing.B) {
+	var x [72 / 4]uint32
+	p := new([72 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
 func BenchmarkCopyFat128(b *testing.B) {
 	var x [128 / 4]uint32
+	p := new([128 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat256(b *testing.B) {
 	var x [256 / 4]uint32
+	p := new([256 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat512(b *testing.B) {
 	var x [512 / 4]uint32
+	p := new([512 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat520(b *testing.B) {
 	var x [520 / 4]uint32
+	p := new([520 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
 	}
 }
+
 func BenchmarkCopyFat1024(b *testing.B) {
 	var x [1024 / 4]uint32
+	p := new([1024 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		y := x
-		_ = y
+		*p = x
+	}
+}
+
+func BenchmarkCopyFat1032(b *testing.B) {
+	var x [1032 / 4]uint32
+	p := new([1032 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
+	}
+}
+
+func BenchmarkCopyFat1040(b *testing.B) {
+	var x [1040 / 4]uint32
+	p := new([1040 / 4]uint32)
+	Escape(p)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		*p = x
 	}
 }
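
The benchmark changes above all follow one pattern: allocate the target with new, force it to escape via the runtime test helper Escape, and write through the pointer, presumably so the compiler can no longer prove the clear or copy dead and delete the loop body the way it could with the old `var x ...; _ = x` form. Outside the runtime's export_test helpers, the usual equivalent is a package-level sink, sketched below (sink and BenchmarkClearExample are illustrative names):

	package membench

	import "testing"

	var sink *[64]byte // a global sink keeps writes through p observable

	func BenchmarkClearExample(b *testing.B) {
		p := new([64]byte)
		sink = p // force p to escape; *p is now visible outside the loop
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			*p = [64]byte{}
		}
	}
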
 
diff --git a/src/runtime/memmove_wasm.s b/src/runtime/memmove_wasm.s
index 8525fea..1be8487 100644
--- a/src/runtime/memmove_wasm.s
+++ b/src/runtime/memmove_wasm.s
@@ -13,142 +13,10 @@
 	MOVD n+16(FP), R2
 
 	Get R0
+	I32WrapI64
 	Get R1
-	I64LtU
-	If // forward
-exit_forward_64:
-		Block
-loop_forward_64:
-			Loop
-				Get R2
-				I64Const $8
-				I64LtU
-				BrIf exit_forward_64
-
-				MOVD 0(R1), 0(R0)
-
-				Get R0
-				I64Const $8
-				I64Add
-				Set R0
-
-				Get R1
-				I64Const $8
-				I64Add
-				Set R1
-
-				Get R2
-				I64Const $8
-				I64Sub
-				Set R2
-
-				Br loop_forward_64
-			End
-		End
-
-loop_forward_8:
-		Loop
-			Get R2
-			I64Eqz
-			If
-				RET
-			End
-
-			Get R0
-			I32WrapI64
-			I64Load8U (R1)
-			I64Store8 $0
-
-			Get R0
-			I64Const $1
-			I64Add
-			Set R0
-
-			Get R1
-			I64Const $1
-			I64Add
-			Set R1
-
-			Get R2
-			I64Const $1
-			I64Sub
-			Set R2
-
-			Br loop_forward_8
-		End
-
-	Else
-		// backward
-		Get R0
-		Get R2
-		I64Add
-		Set R0
-
-		Get R1
-		Get R2
-		I64Add
-		Set R1
-
-exit_backward_64:
-		Block
-loop_backward_64:
-			Loop
-				Get R2
-				I64Const $8
-				I64LtU
-				BrIf exit_backward_64
-
-				Get R0
-				I64Const $8
-				I64Sub
-				Set R0
-
-				Get R1
-				I64Const $8
-				I64Sub
-				Set R1
-
-				Get R2
-				I64Const $8
-				I64Sub
-				Set R2
-
-				MOVD 0(R1), 0(R0)
-
-				Br loop_backward_64
-			End
-		End
-
-loop_backward_8:
-		Loop
-			Get R2
-			I64Eqz
-			If
-				RET
-			End
-
-			Get R0
-			I64Const $1
-			I64Sub
-			Set R0
-
-			Get R1
-			I64Const $1
-			I64Sub
-			Set R1
-
-			Get R2
-			I64Const $1
-			I64Sub
-			Set R2
-
-			Get R0
-			I32WrapI64
-			I64Load8U (R1)
-			I64Store8 $0
-
-			Br loop_backward_8
-		End
-	End
-
-	UNDEF
+	I32WrapI64
+	Get R2
+	I32WrapI64
+	MemoryCopy
+	RET
diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go
index 986121b..2061dc0 100644
--- a/src/runtime/metrics.go
+++ b/src/runtime/metrics.go
@@ -7,7 +7,6 @@
 // Metrics implementation exported to runtime/metrics.
 
 import (
-	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -41,7 +40,7 @@
 	// Acquire the metricsSema but with handoff. Operations are typically
 	// expensive enough that queueing up goroutines and handing off between
 	// them will be noticeably better-behaved.
-	semacquire1(&metricsSema, true, 0, 0)
+	semacquire1(&metricsSema, true, 0, 0, waitReasonSemacquire)
 	if raceenabled {
 		raceacquire(unsafe.Pointer(&metricsSema))
 	}
@@ -91,6 +90,83 @@
 				out.scalar = uint64(NumCgoCall())
 			},
 		},
+		"/cpu/classes/gc/mark/assist:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.gcAssistTime))
+			},
+		},
+		"/cpu/classes/gc/mark/dedicated:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.gcDedicatedTime))
+			},
+		},
+		"/cpu/classes/gc/mark/idle:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.gcIdleTime))
+			},
+		},
+		"/cpu/classes/gc/pause:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.gcPauseTime))
+			},
+		},
+		"/cpu/classes/gc/total:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.gcTotalTime))
+			},
+		},
+		"/cpu/classes/idle:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.idleTime))
+			},
+		},
+		"/cpu/classes/scavenge/assist:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.scavengeAssistTime))
+			},
+		},
+		"/cpu/classes/scavenge/background:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.scavengeBgTime))
+			},
+		},
+		"/cpu/classes/scavenge/total:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.scavengeTotalTime))
+			},
+		},
+		"/cpu/classes/total:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.totalTime))
+			},
+		},
+		"/cpu/classes/user:cpu-seconds": {
+			deps: makeStatDepSet(cpuStatsDep),
+			compute: func(in *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(in.cpuStats.userTime))
+			},
+		},
 		"/gc/cycles/automatic:gc-cycles": {
 			deps: makeStatDepSet(sysStatsDep),
 			compute: func(in *statAggregate, out *metricValue) {
@@ -197,10 +273,11 @@
 				// The bottom-most bucket, containing negative values, is tracked
 				// separately as underflow, so fill that in manually and then
 				// iterate over the rest.
-				hist.counts[0] = atomic.Load64(&memstats.gcPauseDist.underflow)
+				hist.counts[0] = memstats.gcPauseDist.underflow.Load()
 				for i := range memstats.gcPauseDist.counts {
-					hist.counts[i+1] = atomic.Load64(&memstats.gcPauseDist.counts[i])
+					hist.counts[i+1] = memstats.gcPauseDist.counts[i].Load()
 				}
+				hist.counts[len(hist.counts)-1] = memstats.gcPauseDist.overflow.Load()
 			},
 		},
 		"/gc/stack/starting-size:bytes": {
@@ -327,10 +404,17 @@
 		"/sched/latencies:seconds": {
 			compute: func(_ *statAggregate, out *metricValue) {
 				hist := out.float64HistOrInit(timeHistBuckets)
-				hist.counts[0] = atomic.Load64(&sched.timeToRun.underflow)
+				hist.counts[0] = sched.timeToRun.underflow.Load()
 				for i := range sched.timeToRun.counts {
-					hist.counts[i+1] = atomic.Load64(&sched.timeToRun.counts[i])
+					hist.counts[i+1] = sched.timeToRun.counts[i].Load()
 				}
+				hist.counts[len(hist.counts)-1] = sched.timeToRun.overflow.Load()
+			},
+		},
+		"/sync/mutex/wait/total:seconds": {
+			compute: func(_ *statAggregate, out *metricValue) {
+				out.kind = metricKindFloat64
+				out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
 			},
 		},
 	}
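
The histogram plumbing above maps the runtime's internal underflow and overflow counters onto the first and last Counts entries of the exported histogram. A minimal consumer-side sketch, assuming only the public runtime/metrics API, of how those open-ended buckets appear to callers:

    package main

    import (
    	"fmt"
    	"math"
    	"runtime/metrics"
    )

    func main() {
    	s := []metrics.Sample{{Name: "/sched/latencies:seconds"}}
    	metrics.Read(s)
    	h := s[0].Value.Float64Histogram()

    	// Counts[i] counts samples in [Buckets[i], Buckets[i+1]); len(Buckets)
    	// is len(Counts)+1. The underflow and overflow counters filled in above
    	// surface as the buckets bounded by -Inf and +Inf.
    	for i, c := range h.Counts {
    		lo, hi := h.Buckets[i], h.Buckets[i+1]
    		open := math.IsInf(lo, -1) || math.IsInf(hi, +1)
    		fmt.Printf("[%g, %g) open=%v count=%d\n", lo, hi, open, c)
    	}
    }
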
@@ -344,6 +428,7 @@
 const (
 	heapStatsDep statDep = iota // corresponds to heapStatsAggregate
 	sysStatsDep                 // corresponds to sysStatsAggregate
+	cpuStatsDep                 // corresponds to cpuStatsAggregate
 	numStatsDeps
 )
 
@@ -489,6 +574,23 @@
 	})
 }
 
+// cpuStatsAggregate represents CPU stats obtained from the runtime,
+// acquired together to avoid skew and inconsistencies.
+type cpuStatsAggregate struct {
+	cpuStats
+}
+
+// compute populates the cpuStatsAggregate with values from the runtime.
+func (a *cpuStatsAggregate) compute() {
+	a.cpuStats = work.cpuStats
+}
+
+// nsToSec takes a duration in nanoseconds and converts it to seconds as
+// a float64.
+func nsToSec(ns int64) float64 {
+	return float64(ns) / 1e9
+}
+
 // statAggregate is the main driver of the metrics implementation.
 //
 // It contains multiple aggregates of runtime statistics, as well
@@ -498,6 +600,7 @@
 	ensured   statDepSet
 	heapStats heapStatsAggregate
 	sysStats  sysStatsAggregate
+	cpuStats  cpuStatsAggregate
 }
 
 // ensure populates statistics aggregates determined by deps if they
@@ -516,12 +619,14 @@
 			a.heapStats.compute()
 		case sysStatsDep:
 			a.sysStats.compute()
+		case cpuStatsDep:
+			a.cpuStats.compute()
 		}
 	}
 	a.ensured = a.ensured.union(missing)
 }
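
The cpuStatsDep plumbing above follows the existing lazy-aggregation pattern: each metric entry declares its dependencies, and ensure computes each aggregate at most once per Read call. A stripped-down, standalone sketch of that pattern (simplified names, not the runtime's actual types):

    package main

    import "fmt"

    type statDep int

    const (
    	heapDep statDep = iota // stand-in for heapStatsDep
    	cpuDep                 // stand-in for cpuStatsDep
    	numDeps
    )

    type aggregate struct {
    	computed [numDeps]bool
    	heap     int // stand-in for heapStatsAggregate
    	cpu      int // stand-in for cpuStatsAggregate
    }

    // ensure computes each requested aggregate at most once per read pass,
    // mirroring statAggregate.ensure above.
    func (a *aggregate) ensure(deps ...statDep) {
    	for _, d := range deps {
    		if a.computed[d] {
    			continue
    		}
    		switch d {
    		case heapDep:
    			a.heap = 42 // would snapshot heap stats here
    		case cpuDep:
    			a.cpu = 7 // would snapshot work.cpuStats here
    		}
    		a.computed[d] = true
    	}
    }

    func main() {
    	var agg aggregate
    	agg.ensure(cpuDep)          // computes the CPU aggregate
    	agg.ensure(cpuDep, heapDep) // CPU aggregate is not recomputed
    	fmt.Println(agg.cpu, agg.heap)
    }
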
 
-// metricValidKind is a runtime copy of runtime/metrics.ValueKind and
+// metricKind is a runtime copy of runtime/metrics.ValueKind and
 // must be kept structurally identical to that type.
 type metricKind int
 
diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go
index ee99d39..dcfe01e 100644
--- a/src/runtime/metrics/description.go
+++ b/src/runtime/metrics/description.go
@@ -58,6 +58,122 @@
 		Cumulative:  true,
 	},
 	{
+		Name: "/cpu/classes/gc/mark/assist:cpu-seconds",
+		Description: "Estimated total CPU time goroutines spent performing GC tasks " +
+			"to assist the GC and prevent it from falling behind the application. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/gc/mark/dedicated:cpu-seconds",
+		Description: "Estimated total CPU time spent performing GC tasks on " +
+			"processors (as defined by GOMAXPROCS) dedicated to those tasks. " +
+			"This includes time spent with the world stopped due to the GC. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/gc/mark/idle:cpu-seconds",
+		Description: "Estimated total CPU time spent performing GC tasks on " +
+			"spare CPU resources that the Go scheduler could not otherwise find " +
+			"a use for. This should be subtracted from the total GC CPU time to " +
+			"obtain a measure of compulsory GC CPU time. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/gc/pause:cpu-seconds",
+		Description: "Estimated total CPU time spent with the application paused by " +
+			"the GC. Even if only one thread is running during the pause, this is " +
+			"computed as GOMAXPROCS times the pause latency because nothing else " +
+			"can be executing. This is the exact sum of samples in /gc/pause:seconds " +
+			"if each sample is multiplied by GOMAXPROCS at the time it is taken. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/gc/total:cpu-seconds",
+		Description: "Estimated total CPU time spent performing GC tasks. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics. Sum of all metrics in /cpu/classes/gc.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/idle:cpu-seconds",
+		Description: "Estimated total available CPU time not spent executing any Go or Go runtime code. " +
+			"In other words, the part of /cpu/classes/total:cpu-seconds that was unused. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/scavenge/assist:cpu-seconds",
+		Description: "Estimated total CPU time spent returning unused memory to the " +
+			"underlying platform eagerly in response to memory pressure. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/scavenge/background:cpu-seconds",
+		Description: "Estimated total CPU time spent performing background tasks " +
+			"to return unused memory to the underlying platform. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/scavenge/total:cpu-seconds",
+		Description: "Estimated total CPU time spent performing tasks that return " +
+			"unused memory to the underlying platform. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics. Sum of all metrics in /cpu/classes/scavenge.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/total:cpu-seconds",
+		Description: "Estimated total available CPU time for user Go code " +
+			"or the Go runtime, as defined by GOMAXPROCS. In other words, GOMAXPROCS " +
+			"integrated over the wall-clock duration this process has been executing for. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics. Sum of all metrics in /cpu/classes.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
+		Name: "/cpu/classes/user:cpu-seconds",
+		Description: "Estimated total CPU time spent running user Go code. This may " +
+			"also include some small amount of time spent in the Go runtime. " +
+			"This metric is an overestimate, and not directly comparable to " +
+			"system CPU time measurements. Compare only with other /cpu/classes " +
+			"metrics.",
+		Kind:       KindFloat64,
+		Cumulative: true,
+	},
+	{
 		Name:        "/gc/cycles/automatic:gc-cycles",
 		Description: "Count of completed GC cycles generated by the Go runtime.",
 		Kind:        KindUint64,
@@ -250,6 +366,12 @@
 		Description: "Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.",
 		Kind:        KindFloat64Histogram,
 	},
+	{
+		Name:        "/sync/mutex/wait/total:seconds",
+		Description: "Approximate cumulative time goroutines have spent blocked on a sync.Mutex or sync.RWMutex. This metric is useful for identifying global changes in lock contention. Collect a mutex or block profile using the runtime/pprof package for more detailed contention data.",
+		Kind:        KindFloat64,
+		Cumulative:  true,
+	},
 }
 
 // All returns a slice containing metric descriptions for all supported metrics.
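
For reference, the descriptions registered above are discoverable at run time. A small sketch, using only the public runtime/metrics API, that lists the new /cpu/classes metrics:

    package main

    import (
    	"fmt"
    	"runtime/metrics"
    	"strings"
    )

    func main() {
    	for _, d := range metrics.All() {
    		if !strings.HasPrefix(d.Name, "/cpu/classes/") {
    			continue
    		}
    		// All of the /cpu/classes metrics added above are cumulative
    		// float64 values, so deltas between two reads are meaningful.
    		fmt.Printf("%-48s float64=%v cumulative=%v\n",
    			d.Name, d.Kind == metrics.KindFloat64, d.Cumulative)
    	}
    }
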
diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go
index 28c9f6a..b593d8d 100644
--- a/src/runtime/metrics/doc.go
+++ b/src/runtime/metrics/doc.go
@@ -54,6 +54,90 @@
 	/cgo/go-to-c-calls:calls
 		Count of calls made from Go to C by the current process.
 
+	/cpu/classes/gc/mark/assist:cpu-seconds
+		Estimated total CPU time goroutines spent performing GC tasks
+		to assist the GC and prevent it from falling behind the application.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/gc/mark/dedicated:cpu-seconds
+		Estimated total CPU time spent performing GC tasks on
+		processors (as defined by GOMAXPROCS) dedicated to those tasks.
+		This includes time spent with the world stopped due to the GC.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/gc/mark/idle:cpu-seconds
+		Estimated total CPU time spent performing GC tasks on
+		spare CPU resources that the Go scheduler could not otherwise find
+		a use for. This should be subtracted from the total GC CPU time to
+		obtain a measure of compulsory GC CPU time.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/gc/pause:cpu-seconds
+		Estimated total CPU time spent with the application paused by
+		the GC. Even if only one thread is running during the pause, this is
+		computed as GOMAXPROCS times the pause latency because nothing else
+		can be executing. This is the exact sum of samples in /gc/pause:seconds
+		if each sample is multiplied by GOMAXPROCS at the time it is taken.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/gc/total:cpu-seconds
+		Estimated total CPU time spent performing GC tasks.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics. Sum of all metrics in /cpu/classes/gc.
+
+	/cpu/classes/idle:cpu-seconds
+		Estimated total available CPU time not spent executing any Go or Go
+		runtime code. In other words, the part of /cpu/classes/total:cpu-seconds
+		that was unused.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/scavenge/assist:cpu-seconds
+		Estimated total CPU time spent returning unused memory to the
+		underlying platform eagerly in response to memory pressure.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/scavenge/background:cpu-seconds
+		Estimated total CPU time spent performing background tasks
+		to return unused memory to the underlying platform.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
+	/cpu/classes/scavenge/total:cpu-seconds
+		Estimated total CPU time spent performing tasks that return
+		unused memory to the underlying platform.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics. Sum of all metrics in /cpu/classes/scavenge.
+
+	/cpu/classes/total:cpu-seconds
+		Estimated total available CPU time for user Go code or the Go runtime, as
+		defined by GOMAXPROCS. In other words, GOMAXPROCS integrated over the
+		wall-clock duration this process has been executing for.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics. Sum of all metrics in /cpu/classes.
+
+	/cpu/classes/user:cpu-seconds
+		Estimated total CPU time spent running user Go code. This may
+		also include some small amount of time spent in the Go runtime.
+		This metric is an overestimate, and not directly comparable to
+		system CPU time measurements. Compare only with other /cpu/classes
+		metrics.
+
 	/gc/cycles/automatic:gc-cycles
 		Count of completed GC cycles generated by the Go runtime.
 
@@ -188,5 +272,12 @@
 	/sched/latencies:seconds
 		Distribution of the time goroutines have spent in the scheduler
 		in a runnable state before actually running.
+
+	/sync/mutex/wait/total:seconds
+		Approximate cumulative time goroutines have spent blocked on a
+		sync.Mutex or sync.RWMutex. This metric is useful for identifying
+		global changes in lock contention. Collect a mutex or block
+		profile using the runtime/pprof package for more detailed
+		contention data.
 */
 package metrics
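
As a usage sketch for the documentation above (public API only; the idle-mark subtraction follows the guidance that idle mark time is "free"):

    package main

    import (
    	"fmt"
    	"runtime/metrics"
    )

    func main() {
    	samples := []metrics.Sample{
    		{Name: "/cpu/classes/gc/total:cpu-seconds"},
    		{Name: "/cpu/classes/gc/mark/idle:cpu-seconds"},
    		{Name: "/cpu/classes/total:cpu-seconds"},
    	}
    	metrics.Read(samples)

    	gcTotal := samples[0].Value.Float64()
    	gcIdle := samples[1].Value.Float64()
    	total := samples[2].Value.Float64()

    	// These values are only comparable with each other; subtracting idle
    	// mark time estimates the compulsory GC CPU fraction.
    	if total > 0 {
    		fmt.Printf("GC CPU fraction: %.4f\n", (gcTotal-gcIdle)/total)
    	}
    }
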
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
index 8baf020..d981c8e 100644
--- a/src/runtime/metrics_test.go
+++ b/src/runtime/metrics_test.go
@@ -5,6 +5,7 @@
 package runtime_test
 
 import (
+	"reflect"
 	"runtime"
 	"runtime/metrics"
 	"sort"
@@ -156,13 +157,19 @@
 	// Tests whether readMetrics produces consistent, sensible values.
 	// The values are read concurrently with the runtime doing other
 	// things (e.g. allocating) so what we read can't reasonably be compared
-	// to runtime values.
+	// to other runtime values (e.g. MemStats).
 
 	// Run a few GC cycles to get some of the stats to be non-zero.
 	runtime.GC()
 	runtime.GC()
 	runtime.GC()
 
+	// Set GOMAXPROCS high then sleep briefly to ensure we generate
+	// some idle time.
+	oldmaxprocs := runtime.GOMAXPROCS(10)
+	time.Sleep(time.Millisecond)
+	runtime.GOMAXPROCS(oldmaxprocs)
+
 	// Read all the supported metrics through the metrics package.
 	descs, samples := prepareAllMetricsSamples()
 	metrics.Read(samples)
@@ -181,6 +188,22 @@
 		numGC  uint64
 		pauses uint64
 	}
+	var cpu struct {
+		gcAssist    float64
+		gcDedicated float64
+		gcIdle      float64
+		gcPause     float64
+		gcTotal     float64
+
+		idle float64
+		user float64
+
+		scavengeAssist float64
+		scavengeBg     float64
+		scavengeTotal  float64
+
+		total float64
+	}
 	for i := range samples {
 		kind := samples[i].Value.Kind()
 		if want := descs[samples[i].Name].Kind; kind != want {
@@ -199,6 +222,28 @@
 			}
 		}
 		switch samples[i].Name {
+		case "/cpu/classes/gc/mark/assist:cpu-seconds":
+			cpu.gcAssist = samples[i].Value.Float64()
+		case "/cpu/classes/gc/mark/dedicated:cpu-seconds":
+			cpu.gcDedicated = samples[i].Value.Float64()
+		case "/cpu/classes/gc/mark/idle:cpu-seconds":
+			cpu.gcIdle = samples[i].Value.Float64()
+		case "/cpu/classes/gc/pause:cpu-seconds":
+			cpu.gcPause = samples[i].Value.Float64()
+		case "/cpu/classes/gc/total:cpu-seconds":
+			cpu.gcTotal = samples[i].Value.Float64()
+		case "/cpu/classes/idle:cpu-seconds":
+			cpu.idle = samples[i].Value.Float64()
+		case "/cpu/classes/scavenge/assist:cpu-seconds":
+			cpu.scavengeAssist = samples[i].Value.Float64()
+		case "/cpu/classes/scavenge/background:cpu-seconds":
+			cpu.scavengeBg = samples[i].Value.Float64()
+		case "/cpu/classes/scavenge/total:cpu-seconds":
+			cpu.scavengeTotal = samples[i].Value.Float64()
+		case "/cpu/classes/total:cpu-seconds":
+			cpu.total = samples[i].Value.Float64()
+		case "/cpu/classes/user:cpu-seconds":
+			cpu.user = samples[i].Value.Float64()
 		case "/memory/classes/total:bytes":
 			totalVirtual.got = samples[i].Value.Uint64()
 		case "/memory/classes/heap/objects:bytes":
@@ -235,6 +280,33 @@
 			}
 		}
 	}
+	// Only check this on Linux where we can be reasonably sure we have a high-resolution timer.
+	if runtime.GOOS == "linux" {
+		if cpu.gcDedicated <= 0 && cpu.gcAssist <= 0 && cpu.gcIdle <= 0 {
+			t.Errorf("found no time spent on GC work: %#v", cpu)
+		}
+		if cpu.gcPause <= 0 {
+			t.Errorf("found no GC pauses: %f", cpu.gcPause)
+		}
+		if cpu.idle <= 0 {
+			t.Errorf("found no idle time: %f", cpu.idle)
+		}
+		if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.01) {
+			t.Errorf("calculated total GC CPU not within 1%% of sampled total: %f vs. %f", total, cpu.gcTotal)
+		}
+		if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.01) {
+			t.Errorf("calculated total scavenge CPU not within 1%% of sampled total: %f vs. %f", total, cpu.scavengeTotal)
+		}
+		if cpu.total <= 0 {
+			t.Errorf("found no total CPU time passed")
+		}
+		if cpu.user <= 0 {
+			t.Errorf("found no user time passed")
+		}
+		if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.02) {
+			t.Errorf("calculated total CPU not within 2%% of sampled total: %f vs. %f", total, cpu.total)
+		}
+	}
 	if totalVirtual.got != totalVirtual.want {
 		t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
 	}
@@ -303,7 +375,7 @@
 	for i := 0; i < b.N; i++ {
 		start := time.Now()
 		metrics.Read(samples)
-		latencies = append(latencies, time.Now().Sub(start))
+		latencies = append(latencies, time.Since(start))
 	}
 	// Make sure to stop the timer before we wait! The load created above
 	// is very heavy-weight and not easy to stop, so we could end up
@@ -411,3 +483,131 @@
 
 	wg.Wait()
 }
+
+func withinEpsilon(v1, v2, e float64) bool {
+	return v2-v2*e <= v1 && v1 <= v2+v2*e
+}
+
+func TestMutexWaitTimeMetric(t *testing.T) {
+	var sample [1]metrics.Sample
+	sample[0].Name = "/sync/mutex/wait/total:seconds"
+
+	locks := []locker2{
+		new(mutex),
+		new(rwmutexWrite),
+		new(rwmutexReadWrite),
+		new(rwmutexWriteRead),
+	}
+	for _, lock := range locks {
+		t.Run(reflect.TypeOf(lock).Elem().Name(), func(t *testing.T) {
+			metrics.Read(sample[:])
+			before := time.Duration(sample[0].Value.Float64() * 1e9)
+
+			minMutexWaitTime := generateMutexWaitTime(lock)
+
+			metrics.Read(sample[:])
+			after := time.Duration(sample[0].Value.Float64() * 1e9)
+
+			if wt := after - before; wt < minMutexWaitTime {
+				t.Errorf("too little mutex wait time: got %s, want %s", wt, minMutexWaitTime)
+			}
+		})
+	}
+}
+
+// locker2 represents an API surface of two concurrent goroutines
+// locking the same resource, but through different APIs. It's intended
+// to abstract over the relationship of two Lock calls or an RLock
+// and a Lock call.
+type locker2 interface {
+	Lock1()
+	Unlock1()
+	Lock2()
+	Unlock2()
+}
+
+type mutex struct {
+	mu sync.Mutex
+}
+
+func (m *mutex) Lock1()   { m.mu.Lock() }
+func (m *mutex) Unlock1() { m.mu.Unlock() }
+func (m *mutex) Lock2()   { m.mu.Lock() }
+func (m *mutex) Unlock2() { m.mu.Unlock() }
+
+type rwmutexWrite struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexWrite) Lock1()   { m.mu.Lock() }
+func (m *rwmutexWrite) Unlock1() { m.mu.Unlock() }
+func (m *rwmutexWrite) Lock2()   { m.mu.Lock() }
+func (m *rwmutexWrite) Unlock2() { m.mu.Unlock() }
+
+type rwmutexReadWrite struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexReadWrite) Lock1()   { m.mu.RLock() }
+func (m *rwmutexReadWrite) Unlock1() { m.mu.RUnlock() }
+func (m *rwmutexReadWrite) Lock2()   { m.mu.Lock() }
+func (m *rwmutexReadWrite) Unlock2() { m.mu.Unlock() }
+
+type rwmutexWriteRead struct {
+	mu sync.RWMutex
+}
+
+func (m *rwmutexWriteRead) Lock1()   { m.mu.Lock() }
+func (m *rwmutexWriteRead) Unlock1() { m.mu.Unlock() }
+func (m *rwmutexWriteRead) Lock2()   { m.mu.RLock() }
+func (m *rwmutexWriteRead) Unlock2() { m.mu.RUnlock() }
+
+// generateMutexWaitTime causes a goroutine to block on a sync.Mutex or
+// sync.RWMutex (via the locker2 interface), returning the minimum amount
+// of wait time that should be visible in the
+// /sync/mutex/wait/total:seconds metric.
+func generateMutexWaitTime(mu locker2) time.Duration {
+	// Set up the runtime to always track casgstatus transitions for metrics.
+	*runtime.CasGStatusAlwaysTrack = true
+
+	mu.Lock1()
+
+	// Start up a goroutine to wait on the lock.
+	gc := make(chan *runtime.G)
+	done := make(chan bool)
+	go func() {
+		gc <- runtime.Getg()
+
+		for {
+			mu.Lock2()
+			mu.Unlock2()
+			if <-done {
+				return
+			}
+		}
+	}()
+	gp := <-gc
+
+	// Set the block time high enough so that it will always show up, even
+	// on systems with coarse timer granularity.
+	const blockTime = 100 * time.Millisecond
+
+	// Make sure the goroutine spawned above actually blocks on the lock.
+	for {
+		if runtime.GIsWaitingOnMutex(gp) {
+			break
+		}
+		runtime.Gosched()
+	}
+
+	// Let some amount of time pass.
+	time.Sleep(blockTime)
+
+	// Let the other goroutine acquire the lock.
+	mu.Unlock1()
+	done <- true
+
+	// Reset flag.
+	*runtime.CasGStatusAlwaysTrack = false
+	return blockTime
+}
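
The test above drives the metric synthetically; application code can use the same before/after pattern. A hedged sketch (the contended section is a stand-in for real work; because the metric is approximate and sampled, a single short wait may not register):

    package main

    import (
    	"fmt"
    	"runtime/metrics"
    	"sync"
    	"time"
    )

    func main() {
    	s := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}

    	metrics.Read(s)
    	before := s[0].Value.Float64()

    	// Stand-in for a contended section: one goroutine holds the lock
    	// while another blocks on it.
    	var mu sync.Mutex
    	mu.Lock()
    	done := make(chan struct{})
    	go func() {
    		mu.Lock() // blocks until the main goroutine unlocks
    		mu.Unlock()
    		close(done)
    	}()
    	time.Sleep(50 * time.Millisecond)
    	mu.Unlock()
    	<-done

    	metrics.Read(s)
    	after := s[0].Value.Float64()
    	fmt.Printf("mutex wait accumulated: %.3fs\n", after-before)
    }

For per-call-site detail, the metric's description points to the mutex profile (runtime.SetMutexProfileFraction plus runtime/pprof).
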
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index f3f3a79..d4d4f1f 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -10,6 +10,7 @@
 	"internal/abi"
 	"internal/goarch"
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -19,9 +20,8 @@
 // finblock is allocated from non-GC'd memory, so any heap pointers
 // must be specially handled. GC currently assumes that the finalizer
 // queue does not grow during marking (but it can shrink).
-//
-//go:notinheap
 type finblock struct {
+	_       sys.NotInHeap
 	alllink *finblock
 	next    *finblock
 	cnt     uint32
@@ -29,13 +29,23 @@
 	fin     [(_FinBlockSize - 2*goarch.PtrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
 }
 
+var fingStatus atomic.Uint32
+
+// finalizer goroutine status.
+const (
+	fingUninitialized uint32 = iota
+	fingCreated       uint32 = 1 << (iota - 1)
+	fingRunningFinalizer
+	fingWait
+	fingWake
+)
+
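The two booleans replaced above collapse into a single atomic bit set; wakefing below then becomes one CompareAndSwap from created|wait|wake back to created. A standalone sketch of that pattern using the public sync/atomic package (illustrative names, not the runtime's internal atomics):

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    const (
    	created uint32 = 1 << iota
    	waiting
    	wake
    )

    // tryWake hands off the worker only when it has been created, is parked
    // (waiting), and has pending work (wake), clearing the latter two flags
    // in a single atomic step.
    func tryWake(status *atomic.Uint32) bool {
    	return status.CompareAndSwap(created|waiting|wake, created)
    }

    func main() {
    	var status atomic.Uint32
    	status.Store(created | waiting)
    	fmt.Println(tryWake(&status)) // false: nothing queued yet

    	status.Store(created | waiting | wake)
    	fmt.Println(tryWake(&status)) // true: flags collapse back to created
    }
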
 var finlock mutex  // protects the following variables
 var fing *g        // goroutine that runs finalizers
 var finq *finblock // list of finalizers that are to be executed
 var finc *finblock // cache of free blocks
 var finptrmask [_FinBlockSize / goarch.PtrSize / 8]byte
-var fingwait bool
-var fingwake bool
+
 var allfin *finblock // list of all blocks
 
 // NOTE: Layout known to queuefinalizer.
@@ -75,6 +85,12 @@
 	0<<0 | 1<<1 | 1<<2 | 1<<3 | 1<<4 | 0<<5 | 1<<6 | 1<<7,
 }
 
+// lockRankMayQueueFinalizer records the lock ranking effects of a
+// function that may call queuefinalizer.
+func lockRankMayQueueFinalizer() {
+	lockWithRankMayAcquire(&finlock, getLockRank(&finlock))
+}
+
 func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) {
 	if gcphase != _GCoff {
 		// Currently we assume that the finalizer queue won't
@@ -120,8 +136,8 @@
 	f.fint = fint
 	f.ot = ot
 	f.arg = p
-	fingwake = true
 	unlock(&finlock)
+	fingStatus.Or(fingWake)
 }
 
 //go:nowritebarrier
@@ -135,30 +151,28 @@
 }
 
 func wakefing() *g {
-	var res *g
-	lock(&finlock)
-	if fingwait && fingwake {
-		fingwait = false
-		fingwake = false
-		res = fing
+	if ok := fingStatus.CompareAndSwap(fingCreated|fingWait|fingWake, fingCreated); ok {
+		return fing
 	}
-	unlock(&finlock)
-	return res
+	return nil
 }
 
-var (
-	fingCreate  uint32
-	fingRunning bool
-)
-
 func createfing() {
 	// start the finalizer goroutine exactly once
-	if fingCreate == 0 && atomic.Cas(&fingCreate, 0, 1) {
+	if fingStatus.Load() == fingUninitialized && fingStatus.CompareAndSwap(fingUninitialized, fingCreated) {
 		go runfinq()
 	}
 }
 
-// This is the goroutine that runs all of the finalizers
+func finalizercommit(gp *g, lock unsafe.Pointer) bool {
+	unlock((*mutex)(lock))
+	// fingStatus should be modified only after fing is put into a waiting
+	// state, to avoid waking fing while it is still running, even if it is
+	// about to be parked.
+	fingStatus.Or(fingWait)
+	return true
+}
+
+// This is the goroutine that runs all of the finalizers.
 func runfinq() {
 	var (
 		frame    unsafe.Pointer
@@ -176,8 +190,7 @@
 		fb := finq
 		finq = nil
 		if fb == nil {
-			fingwait = true
-			goparkunlock(&finlock, waitReasonFinalizerWait, traceEvGoBlock, 1)
+			gopark(finalizercommit, unsafe.Pointer(&finlock), waitReasonFinalizerWait, traceEvGoBlock, 1)
 			continue
 		}
 		argRegs = intArgRegs
@@ -238,9 +251,9 @@
 				default:
 					throw("bad kind in runfinq")
 				}
-				fingRunning = true
+				fingStatus.Or(fingRunningFinalizer)
 				reflectcall(nil, unsafe.Pointer(f.fn), frame, uint32(framesz), uint32(framesz), uint32(framesz), &regs)
-				fingRunning = false
+				fingStatus.And(^fingRunningFinalizer)
 
 				// Drop finalizer queue heap references
 				// before hiding them from markroot.
@@ -299,12 +312,21 @@
 // bufio.Writer, because the buffer would not be flushed at program exit.
 //
 // It is not guaranteed that a finalizer will run if the size of *obj is
-// zero bytes.
+// zero bytes, because it may share the same address with other zero-size
+// objects in memory. See https://go.dev/ref/spec#Size_and_alignment_guarantees.
 //
 // It is not guaranteed that a finalizer will run for objects allocated
 // in initializers for package-level variables. Such objects may be
 // linker-allocated, not heap-allocated.
 //
+// Note that because finalizers may execute arbitrarily far into the future
+// after an object is no longer referenced, the runtime is allowed to perform
+// a space-saving optimization that batches objects together in a single
+// allocation slot. The finalizer for an unreferenced object in such an
+// allocation may never run if it always exists in the same batch as a
+// referenced object. Typically, this batching only happens for tiny
+// (on the order of 16 bytes or less) and pointer-free objects.
+//
 // A finalizer may run as soon as an object becomes unreachable.
 // In order to use finalizers correctly, the program must ensure that
 // the object is reachable until it is no longer required.
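
A short illustration of the caveats documented above, assuming nothing beyond the public runtime API: the object carries a pointer field, so it is never batched by the tiny allocator, and runtime.KeepAlive marks the last point at which it must stay reachable. Even so, the finalizer is not guaranteed to run before the program exits.

    package main

    import (
    	"fmt"
    	"runtime"
    	"time"
    )

    type resource struct {
    	name string // pointer-bearing field, so never placed in a tiny-alloc batch
    	fd   int
    }

    func use(r *resource) { _ = r.fd }

    func main() {
    	r := &resource{name: "example", fd: 3}
    	runtime.SetFinalizer(r, func(r *resource) {
    		fmt.Println("finalizing", r.name)
    	})

    	use(r)
    	// KeepAlive pins reachability to this point, so the finalizer cannot
    	// run while use is still working with the object.
    	runtime.KeepAlive(r)

    	r = nil
    	runtime.GC()                       // make the object collectable
    	time.Sleep(100 * time.Millisecond) // give the finalizer goroutine a chance to run
    }
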
@@ -357,6 +379,11 @@
 		throw("nil elem type!")
 	}
 
+	if inUserArenaChunk(uintptr(e.data)) {
+		// Arena-allocated objects are not eligible for finalizers.
+		throw("runtime.SetFinalizer: first argument was allocated into an arena")
+	}
+
 	// find the containing object
 	base, _, _ := findObject(uintptr(e.data), 0, 0)
 
diff --git a/src/runtime/mfinal_test.go b/src/runtime/mfinal_test.go
index 902ccc5..61d625a 100644
--- a/src/runtime/mfinal_test.go
+++ b/src/runtime/mfinal_test.go
@@ -53,7 +53,7 @@
 		}},
 	}
 
-	for i, tt := range finalizerTests {
+	for _, tt := range finalizerTests {
 		done := make(chan bool, 1)
 		go func() {
 			// allocate struct with pointer to avoid hitting tinyalloc.
@@ -71,11 +71,7 @@
 		}()
 		<-done
 		runtime.GC()
-		select {
-		case <-ch:
-		case <-time.After(time.Second * 4):
-			t.Errorf("#%d: finalizer for type %T didn't run", i, tt.finalizer)
-		}
+		<-ch
 	}
 }
 
@@ -109,11 +105,7 @@
 	}()
 	<-done
 	runtime.GC()
-	select {
-	case <-ch:
-	case <-time.After(4 * time.Second):
-		t.Errorf("finalizer for type *bigValue didn't run")
-	}
+	<-ch
 }
 
 func fin(v *int) {
@@ -188,11 +180,7 @@
 	fin := make(chan bool, 1)
 	runtime.SetFinalizer(y, func(z *objtype) { fin <- true })
 	runtime.GC()
-	select {
-	case <-fin:
-	case <-time.After(4 * time.Second):
-		t.Errorf("finalizer of next object in memory didn't run")
-	}
+	<-fin
 	xsglobal = xs // keep empty slice alive until here
 }
 
@@ -220,11 +208,7 @@
 	// set finalizer on string contents of y
 	runtime.SetFinalizer(y, func(z *objtype) { fin <- true })
 	runtime.GC()
-	select {
-	case <-fin:
-	case <-time.After(4 * time.Second):
-		t.Errorf("finalizer of next string in memory didn't run")
-	}
+	<-fin
 	ssglobal = ss // keep 0-length string live until here
 }
 
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index b701a09..8788d95 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -8,7 +8,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // FixAlloc is a simple free-list allocator for fixed size objects.
 // Malloc uses a FixAlloc wrapped around sysAlloc to manage its
@@ -23,7 +26,8 @@
 // Callers can keep state in the object but the first word is
 // smashed by freeing and reallocating.
 //
-// Consider marking fixalloc'd types go:notinheap.
+// Consider marking fixalloc'd types not in heap by embedding
+// runtime/internal/sys.NotInHeap.
 type fixalloc struct {
 	size   uintptr
 	first  func(arg, p unsafe.Pointer) // called first time p is returned
@@ -42,9 +46,8 @@
 // this cannot be used by some of the internal GC structures. For example when
 // the sweeper is placing an unmarked object on the free list it does not want the
 // write barrier to be called since that could result in the object being reachable.
-//
-//go:notinheap
 type mlink struct {
+	_    sys.NotInHeap
 	next *mlink
 }
 
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 63e0463..1b05707 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -366,10 +366,6 @@
 	// explicit user call.
 	userForced bool
 
-	// totaltime is the CPU nanoseconds spent in GC since the
-	// program started if debug.gctrace > 0.
-	totaltime int64
-
 	// initialHeapLive is the value of gcController.heapLive at the
 	// beginning of this GC cycle.
 	initialHeapLive uint64
@@ -393,7 +389,7 @@
 	// cycle is sweep termination, mark, mark termination, and
 	// sweep. This differs from memstats.numgc, which is
 	// incremented at mark termination.
-	cycles uint32
+	cycles atomic.Uint32
 
 	// Timing/utilization stats for this cycle.
 	stwprocs, maxprocs                 int32
@@ -404,6 +400,9 @@
 
 	// debug.gctrace heap sizes for this cycle.
 	heap0, heap1, heap2 uint64
+
+	// Cumulative estimated CPU usage.
+	cpuStats
 }
 
 // GC runs a garbage collection and blocks the caller until the
@@ -436,7 +435,7 @@
 
 	// Wait until the current sweep termination, mark, and mark
 	// termination complete.
-	n := atomic.Load(&work.cycles)
+	n := work.cycles.Load()
 	gcWaitOnMark(n)
 
 	// We're now in sweep N or later. Trigger GC cycle N+1, which
@@ -451,7 +450,7 @@
 	// complete the cycle and because runtime.GC() is often used
 	// as part of tests and benchmarks to get the system into a
 	// relatively stable and isolated state.
-	for atomic.Load(&work.cycles) == n+1 && sweepone() != ^uintptr(0) {
+	for work.cycles.Load() == n+1 && sweepone() != ^uintptr(0) {
 		sweep.nbgsweep++
 		Gosched()
 	}
@@ -467,7 +466,7 @@
 	// First, wait for sweeping to finish. (We know there are no
 	// more spans on the sweep queue, but we may be concurrently
 	// sweeping spans, so we have to wait.)
-	for atomic.Load(&work.cycles) == n+1 && !isSweepDone() {
+	for work.cycles.Load() == n+1 && !isSweepDone() {
 		Gosched()
 	}
 
@@ -475,7 +474,7 @@
 	// stable heap profile. Only do this if we haven't already hit
 	// another mark termination.
 	mp := acquirem()
-	cycle := atomic.Load(&work.cycles)
+	cycle := work.cycles.Load()
 	if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
 		mProf_PostSweep()
 	}
@@ -488,7 +487,7 @@
 	for {
 		// Disable phase transitions.
 		lock(&work.sweepWaiters.lock)
-		nMarks := atomic.Load(&work.cycles)
+		nMarks := work.cycles.Load()
 		if gcphase != _GCmark {
 			// We've already completed this cycle's mark.
 			nMarks++
@@ -546,7 +545,7 @@
 // that the exit condition for the _GCoff phase has been met. The exit
 // condition should be tested when allocating.
 func (t gcTrigger) test() bool {
-	if !memstats.enablegc || panicking != 0 || gcphase != _GCoff {
+	if !memstats.enablegc || panicking.Load() != 0 || gcphase != _GCoff {
 		return false
 	}
 	switch t.kind {
@@ -556,7 +555,7 @@
 		// atomically wrote gcController.heapLive anyway and we'll see our
 		// own write.
 		trigger, _ := gcController.trigger()
-		return atomic.Load64(&gcController.heapLive) >= trigger
+		return gcController.heapLive.Load() >= trigger
 	case gcTriggerTime:
 		if gcController.gcPercent.Load() < 0 {
 			return false
@@ -565,7 +564,7 @@
 		return lastgc != 0 && t.now-lastgc > forcegcperiod
 	case gcTriggerCycle:
 		// t.n > work.cycles, but accounting for wraparound.
-		return int32(t.n-work.cycles) > 0
+		return int32(t.n-work.cycles.Load()) > 0
 	}
 	return true
 }
@@ -612,9 +611,6 @@
 		return
 	}
 
-	// For stats, check if this GC was forced by the user.
-	work.userForced = trigger.kind == gcTriggerCycle
-
 	// In gcstoptheworld debug mode, upgrade the mode accordingly.
 	// We do this after re-checking the transition condition so
 	// that multiple goroutines that detect the heap trigger don't
@@ -630,13 +626,17 @@
 	semacquire(&gcsema)
 	semacquire(&worldsema)
 
+	// For stats, check if this GC was forced by the user.
+	// Update it under gcsema to avoid gctrace getting wrong values.
+	work.userForced = trigger.kind == gcTriggerCycle
+
 	if trace.enabled {
 		traceGCStart()
 	}
 
 	// Check that all Ps have finished deferred mcache flushes.
 	for _, p := range allp {
-		if fg := atomic.Load(&p.mcache.flushGen); fg != mheap_.sweepgen {
+		if fg := p.mcache.flushGen.Load(); fg != mheap_.sweepgen {
 			println("runtime: p", p.id, "flushGen", fg, "!= sweepgen", mheap_.sweepgen)
 			throw("p mcache not flushed")
 		}
@@ -652,7 +652,7 @@
 		// so it can't be more than ncpu, even if GOMAXPROCS is.
 		work.stwprocs = ncpu
 	}
-	work.heap0 = atomic.Load64(&gcController.heapLive)
+	work.heap0 = gcController.heapLive.Load()
 	work.pauseNS = 0
 	work.mode = mode
 
@@ -672,7 +672,7 @@
 	// reclaimed until the next GC cycle.
 	clearpools()
 
-	work.cycles++
+	work.cycles.Add(1)
 
 	// Assists and workers can start the moment we start
 	// the world.
@@ -810,22 +810,22 @@
 		// Otherwise, our attempt to force all P's to a safepoint could
 		// result in a deadlock as we attempt to preempt a worker that's
 		// trying to preempt us (e.g. for a stack scan).
-		casgstatus(gp, _Grunning, _Gwaiting)
-		forEachP(func(_p_ *p) {
+		casGToWaiting(gp, _Grunning, waitReasonGCMarkTermination)
+		forEachP(func(pp *p) {
 			// Flush the write barrier buffer, since this may add
 			// work to the gcWork.
-			wbBufFlush1(_p_)
+			wbBufFlush1(pp)
 
 			// Flush the gcWork, since this may create global work
 			// and set the flushedWork flag.
 			//
 			// TODO(austin): Break up these workbufs to
 			// better distribute work.
-			_p_.gcw.dispose()
+			pp.gcw.dispose()
 			// Collect the flushedWork flag.
-			if _p_.gcw.flushedWork {
+			if pp.gcw.flushedWork {
 				atomic.Xadd(&gcMarkDoneFlushed, 1)
-				_p_.gcw.flushedWork = false
+				pp.gcw.flushedWork = false
 			}
 		})
 		casgstatus(gp, _Gwaiting, _Grunning)
@@ -879,7 +879,7 @@
 	if restart {
 		getg().m.preemptoff = ""
 		systemstack(func() {
-			now := startTheWorldWithSema(true)
+			now := startTheWorldWithSema(trace.enabled)
 			work.pauseNS += now - work.pauseStart
 			memstats.gcPauseDist.record(now - work.pauseStart)
 		})
@@ -924,16 +924,14 @@
 	// Start marktermination (write barrier remains enabled for now).
 	setGCPhase(_GCmarktermination)
 
-	work.heap1 = gcController.heapLive
+	work.heap1 = gcController.heapLive.Load()
 	startTime := nanotime()
 
 	mp := acquirem()
 	mp.preemptoff = "gcing"
-	_g_ := getg()
-	_g_.m.traceback = 2
-	gp := _g_.m.curg
-	casgstatus(gp, _Grunning, _Gwaiting)
-	gp.waitreason = waitReasonGarbageCollection
+	mp.traceback = 2
+	curgp := mp.curg
+	casGToWaiting(curgp, _Grunning, waitReasonGarbageCollection)
 
 	// Run gc on the g0 stack. We do this so that the g stack
 	// we're currently running on will no longer change. Cuts
@@ -972,8 +970,8 @@
 		gcSweep(work.mode)
 	})
 
-	_g_.m.traceback = 0
-	casgstatus(gp, _Gwaiting, _Grunning)
+	mp.traceback = 0
+	casgstatus(curgp, _Gwaiting, _Grunning)
 
 	if trace.enabled {
 		traceGCDone()
@@ -1006,24 +1004,57 @@
 	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
 	memstats.pause_total_ns += uint64(work.pauseNS)
 
-	// Update work.totaltime.
 	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
 	// We report idle marking time below, but omit it from the
 	// overall utilization here since it's "free".
-	markCpu := gcController.assistTime.Load() + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
+	markAssistCpu := gcController.assistTime.Load()
+	markDedicatedCpu := gcController.dedicatedMarkTime.Load()
+	markFractionalCpu := gcController.fractionalMarkTime.Load()
+	markIdleCpu := gcController.idleMarkTime.Load()
 	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
-	cycleCpu := sweepTermCpu + markCpu + markTermCpu
-	work.totaltime += cycleCpu
+	scavAssistCpu := scavenge.assistTime.Load()
+	scavBgCpu := scavenge.backgroundTime.Load()
+
+	// Update cumulative GC CPU stats.
+	work.cpuStats.gcAssistTime += markAssistCpu
+	work.cpuStats.gcDedicatedTime += markDedicatedCpu + markFractionalCpu
+	work.cpuStats.gcIdleTime += markIdleCpu
+	work.cpuStats.gcPauseTime += sweepTermCpu + markTermCpu
+	work.cpuStats.gcTotalTime += sweepTermCpu + markAssistCpu + markDedicatedCpu + markFractionalCpu + markIdleCpu + markTermCpu
+
+	// Update cumulative scavenge CPU stats.
+	work.cpuStats.scavengeAssistTime += scavAssistCpu
+	work.cpuStats.scavengeBgTime += scavBgCpu
+	work.cpuStats.scavengeTotalTime += scavAssistCpu + scavBgCpu
+
+	// Update total CPU.
+	work.cpuStats.totalTime = sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
+	work.cpuStats.idleTime += sched.idleTime.Load()
+
+	// Compute userTime. We compute this indirectly as everything that's not the above.
+	//
+	// Since time spent in _Pgcstop is covered by gcPauseTime, and time spent in _Pidle
+	// is covered by idleTime, what we're left with is time spent in _Prunning and _Psyscall,
+	// the latter of which is fine because the P will either go idle or get used for something
+	// else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
+	// _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
+	// but that's fine. The overwhelming majority of this time will be actual user time.
+	work.cpuStats.userTime = work.cpuStats.totalTime - (work.cpuStats.gcTotalTime +
+		work.cpuStats.scavengeTotalTime + work.cpuStats.idleTime)
 
 	// Compute overall GC CPU utilization.
-	totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
-	memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
+	// Omit idle marking time from the overall utilization here since it's "free".
+	memstats.gc_cpu_fraction = float64(work.cpuStats.gcTotalTime-work.cpuStats.gcIdleTime) / float64(work.cpuStats.totalTime)
 
-	// Reset assist time stat.
+	// Reset assist time and background time stats.
 	//
 	// Do this now, instead of at the start of the next GC cycle, because
 	// these two may keep accumulating even if the GC is not active.
-	mheap_.pages.scav.assistTime.Store(0)
+	scavenge.assistTime.Store(0)
+	scavenge.backgroundTime.Store(0)
+
+	// Reset idle time stat.
+	sched.idleTime.Store(0)
 
 	// Reset sweep state.
 	sweep.nbgsweep = 0
@@ -1056,7 +1087,7 @@
 		throw("failed to set sweep barrier")
 	}
 
-	systemstack(func() { startTheWorldWithSema(true) })
+	systemstack(func() { startTheWorldWithSema(trace.enabled) })
 
 	// Flush the heap profile so we can start a new cycle next GC.
 	// This is relatively expensive, so we don't do it with the
@@ -1075,8 +1106,8 @@
 	// is necessary to sweep all spans, we need to ensure all
 	// mcaches are flushed before we start the next GC cycle.
 	systemstack(func() {
-		forEachP(func(_p_ *p) {
-			_p_.mcache.prepareForSweep()
+		forEachP(func(pp *p) {
+			pp.mcache.prepareForSweep()
 		})
 	})
 	// Now that we've swept stale spans in mcaches, they don't
@@ -1106,8 +1137,8 @@
 		for i, ns := range []int64{
 			sweepTermCpu,
 			gcController.assistTime.Load(),
-			gcController.dedicatedMarkTime + gcController.fractionalMarkTime,
-			gcController.idleMarkTime,
+			gcController.dedicatedMarkTime.Load() + gcController.fractionalMarkTime.Load(),
+			gcController.idleMarkTime.Load(),
 			markTermCpu,
 		} {
 			if i == 2 || i == 3 {
@@ -1121,8 +1152,8 @@
 		print(" ms cpu, ",
 			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
 			gcController.lastHeapGoal>>20, " MB goal, ",
-			atomic.Load64(&gcController.maxStackScan)>>20, " MB stacks, ",
-			gcController.globalsScan>>20, " MB globals, ",
+			gcController.lastStackScan.Load()>>20, " MB stacks, ",
+			gcController.globalsScan.Load()>>20, " MB globals, ",
 			work.maxprocs, " P")
 		if work.userForced {
 			print(" (forced)")
@@ -1131,6 +1162,15 @@
 		printunlock()
 	}
 
+	// Set any arena chunks that were deferred to fault.
+	lock(&userArenaState.lock)
+	faultList := userArenaState.fault
+	userArenaState.fault = nil
+	unlock(&userArenaState.lock)
+	for _, lc := range faultList {
+		lc.mspan.setUserArenaChunkToFault()
+	}
+
 	semrelease(&worldsema)
 	semrelease(&gcsema)
 	// Careful: another GC cycle may start now.
@@ -1183,7 +1223,7 @@
 	work.nwait = ^uint32(0)
 }
 
-// gcBgMarkWorker is an entry in the gcBgMarkWorkerPool. It points to a single
+// gcBgMarkWorkerNode is an entry in the gcBgMarkWorkerPool. It points to a single
 // gcBgMarkWorker goroutine.
 type gcBgMarkWorkerNode struct {
 	// Unused workers are managed in a lock-free stack. This field must be first.
@@ -1300,7 +1340,7 @@
 			// the G stack. However, stack shrinking is
 			// disabled for mark workers, so it is safe to
 			// read from the G stack.
-			casgstatus(gp, _Grunning, _Gwaiting)
+			casGToWaiting(gp, _Grunning, waitReasonGCWorkerActive)
 			switch pp.gcMarkWorkerMode {
 			default:
 				throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
@@ -1566,7 +1606,7 @@
 	}
 
 	work.bytesMarked = 0
-	work.initialHeapLive = atomic.Load64(&gcController.heapLive)
+	work.initialHeapLive = gcController.heapLive.Load()
 }
 
 // Hooks for other packages
diff --git a/src/runtime/mgclimit.go b/src/runtime/mgclimit.go
index d94e471..bcbe7f8 100644
--- a/src/runtime/mgclimit.go
+++ b/src/runtime/mgclimit.go
@@ -55,8 +55,6 @@
 	// the mark and sweep phases.
 	transitioning bool
 
-	_ uint32 // Align assistTimePool and lastUpdate on 32-bit platforms.
-
 	// assistTimePool is the accumulated assist time since the last update.
 	assistTimePool atomic.Int64
 
@@ -339,7 +337,7 @@
 	l.unlock()
 }
 
-// limiterEventType indicates the type of an event occuring on some P.
+// limiterEventType indicates the type of an event occurring on some P.
 //
 // These events represent the full set of events that the GC CPU limiter tracks
 // to execute its function.
@@ -471,9 +469,10 @@
 	// Account for the event.
 	switch typ {
 	case limiterEventIdleMarkWork:
-		fallthrough
+		gcCPULimiter.addIdleTime(duration)
 	case limiterEventIdle:
 		gcCPULimiter.addIdleTime(duration)
+		sched.idleTime.Add(duration)
 	case limiterEventMarkAssist:
 		fallthrough
 	case limiterEventScavengeAssist:
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 7463707..cfda706 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -218,8 +218,7 @@
 			userG := getg().m.curg
 			selfScan := gp == userG && readgstatus(userG) == _Grunning
 			if selfScan {
-				casgstatus(userG, _Grunning, _Gwaiting)
-				userG.waitreason = waitReasonGarbageCollectionScan
+				casGToWaiting(userG, _Grunning, waitReasonGarbageCollectionScan)
 			}
 
 			// TODO: suspendG blocks (and spins) until gp
@@ -387,7 +386,9 @@
 				// Mark everything that can be reached from
 				// the object (but *not* the object itself or
 				// we'll never collect it).
-				scanobject(p, gcw)
+				if !s.spanclass.noscan() {
+					scanobject(p, gcw)
+				}
 
 				// The special itself is a root.
 				scanblock(uintptr(unsafe.Pointer(&spf.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
@@ -440,7 +441,7 @@
 	// will just cause steals to fail until credit is accumulated
 	// again, so in the long run it doesn't really matter, but we
 	// do have to handle the negative credit case.
-	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
+	bgScanCredit := gcController.bgScanCredit.Load()
 	stolen := int64(0)
 	if bgScanCredit > 0 {
 		if bgScanCredit < scanWork {
@@ -450,7 +451,7 @@
 			stolen = scanWork
 			gp.gcAssistBytes += debtBytes
 		}
-		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
+		gcController.bgScanCredit.Add(-stolen)
 
 		scanWork -= stolen
 
@@ -558,8 +559,7 @@
 	}
 
 	// gcDrainN requires the caller to be preemptible.
-	casgstatus(gp, _Grunning, _Gwaiting)
-	gp.waitreason = waitReasonGCAssistMarking
+	casGToWaiting(gp, _Grunning, waitReasonGCAssistMarking)
 
 	// drain own cached work first in the hopes that it
 	// will be more cache friendly.
@@ -595,15 +595,15 @@
 	}
 	now := nanotime()
 	duration := now - startTime
-	_p_ := gp.m.p.ptr()
-	_p_.gcAssistTime += duration
+	pp := gp.m.p.ptr()
+	pp.gcAssistTime += duration
 	if trackLimiterEvent {
-		_p_.limiterEvent.stop(limiterEventMarkAssist, now)
+		pp.limiterEvent.stop(limiterEventMarkAssist, now)
 	}
-	if _p_.gcAssistTime > gcAssistTimeSlack {
-		gcController.assistTime.Add(_p_.gcAssistTime)
+	if pp.gcAssistTime > gcAssistTimeSlack {
+		gcController.assistTime.Add(pp.gcAssistTime)
 		gcCPULimiter.update(now)
-		_p_.gcAssistTime = 0
+		pp.gcAssistTime = 0
 	}
 }
 
@@ -639,7 +639,7 @@
 	// the queue, but can still back out. This avoids a
 	// race in case background marking has flushed more
 	// credit since we checked above.
-	if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
+	if gcController.bgScanCredit.Load() > 0 {
 		work.assistQueue.q = oldList
 		if oldList.tail != 0 {
 			oldList.tail.ptr().schedlink.set(nil)
@@ -668,7 +668,7 @@
 		// small window here where an assist may add itself to
 		// the blocked queue and park. If that happens, we'll
 		// just get it on the next flush.
-		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+		gcController.bgScanCredit.Add(scanWork)
 		return
 	}
 
@@ -708,7 +708,7 @@
 		// Convert from scan bytes back to work.
 		assistWorkPerByte := gcController.assistWorkPerByte.Load()
 		scanWork = int64(float64(scanBytes) * assistWorkPerByte)
-		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+		gcController.bgScanCredit.Add(scanWork)
 	}
 	unlock(&work.assistQueue.lock)
 }
@@ -943,10 +943,10 @@
 		}
 
 		// Scan arguments to this frame.
-		if frame.arglen != 0 {
+		if n := frame.argBytes(); n != 0 {
 			// TODO: We could pass the entry argument map
 			// to narrow this down further.
-			scanConservative(frame.argp, frame.arglen, nil, gcw, state)
+			scanConservative(frame.argp, n, nil, gcw, state)
 		}
 
 		if isAsyncPreempt || isDebugCall {
@@ -964,7 +964,7 @@
 		return
 	}
 
-	locals, args, objs := getStackMap(frame, &state.cache, false)
+	locals, args, objs := frame.getStackMap(&state.cache, false)
 
 	// Scan local variables if stack frame has been allocated.
 	if locals.n > 0 {
@@ -1061,7 +1061,7 @@
 	// Drain root marking jobs.
 	if work.markrootNext < work.markrootJobs {
 		// Stop if we're preemptible or if someone wants to STW.
-		for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
+		for !(gp.preempt && (preemptible || sched.gcwaiting.Load())) {
 			job := atomic.Xadd(&work.markrootNext, +1) - 1
 			if job >= work.markrootJobs {
 				break
@@ -1075,7 +1075,7 @@
 
 	// Drain heap marking jobs.
 	// Stop if we're preemptible or if someone wants to STW.
-	for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
+	for !(gp.preempt && (preemptible || sched.gcwaiting.Load())) {
 		// Try to keep work available on the global queue. We used to
 		// check if there were waiting workers, but it's better to
 		// just keep work available than to make workers wait. In the
@@ -1265,28 +1265,21 @@
 	// b is either the beginning of an object, in which case this
 	// is the size of the object to scan, or it points to an
 	// oblet, in which case we compute the size to scan below.
-	hbits := heapBitsForAddr(b)
 	s := spanOfUnchecked(b)
 	n := s.elemsize
 	if n == 0 {
 		throw("scanobject n == 0")
 	}
+	if s.spanclass.noscan() {
+		// Correctness-wise this is ok, but it's inefficient
+		// if noscan objects reach here.
+		throw("scanobject of a noscan object")
+	}
 
 	if n > maxObletBytes {
 		// Large object. Break into oblets for better
 		// parallelism and lower latency.
 		if b == s.base() {
-			// It's possible this is a noscan object (not
-			// from greyobject, but from other code
-			// paths), in which case we must *not* enqueue
-			// oblets since their bitmaps will be
-			// uninitialized.
-			if s.spanclass.noscan() {
-				// Bypass the whole scan.
-				gcw.bytesMarked += uint64(n)
-				return
-			}
-
 			// Enqueue the other oblets to scan later.
 			// Some oblets may be in b's scalar tail, but
 			// these will be marked as "no more pointers",
@@ -1308,20 +1301,24 @@
 		}
 	}
 
-	var i uintptr
-	for i = 0; i < n; i, hbits = i+goarch.PtrSize, hbits.next() {
-		// Load bits once. See CL 22712 and issue 16973 for discussion.
-		bits := hbits.bits()
-		if bits&bitScan == 0 {
-			break // no more pointers in this object
+	hbits := heapBitsForAddr(b, n)
+	var scanSize uintptr
+	for {
+		var addr uintptr
+		if hbits, addr = hbits.nextFast(); addr == 0 {
+			if hbits, addr = hbits.next(); addr == 0 {
+				break
+			}
 		}
-		if bits&bitPointer == 0 {
-			continue // not a pointer
-		}
+
+		// Keep track of farthest pointer we found, so we can
+		// update heapScanWork. TODO: is there a better metric,
+		// now that we can skip scalar portions pretty efficiently?
+		scanSize = addr - b + goarch.PtrSize
 
 		// Work here is duplicated in scanblock and above.
 		// If you make changes here, make changes there too.
-		obj := *(*uintptr)(unsafe.Pointer(b + i))
+		obj := *(*uintptr)(unsafe.Pointer(addr))
 
 		// At this point we have extracted the next potential pointer.
 		// Quickly filter out nil and pointers back to the current object.
@@ -1335,13 +1332,13 @@
 			// heap. In this case, we know the object was
 			// just allocated and hence will be marked by
 			// allocation itself.
-			if obj, span, objIndex := findObject(obj, b, i); obj != 0 {
-				greyobject(obj, b, i, span, gcw, objIndex)
+			if obj, span, objIndex := findObject(obj, b, addr-b); obj != 0 {
+				greyobject(obj, b, addr-b, span, gcw, objIndex)
 			}
 		}
 	}
 	gcw.bytesMarked += uint64(n)
-	gcw.heapScanWork += int64(i)
+	gcw.heapScanWork += int64(scanSize)
 }
 
 // scanConservative scans block [b, b+n) conservatively, treating any
@@ -1564,7 +1561,7 @@
 //
 //go:nowritebarrier
 //go:nosplit
-func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {
+func gcmarknewobject(span *mspan, obj, size uintptr) {
 	if useCheckmark { // The world should be stopped so this should not happen.
 		throw("gcmarknewobject called while doing checkmark")
 	}
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index 2d9fd27..9d9840e 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -8,7 +8,7 @@
 	"internal/cpu"
 	"internal/goexperiment"
 	"runtime/internal/atomic"
-	"unsafe"
+	_ "unsafe" // for go:linkname
 )
 
 // go119MemoryLimitSupport is a feature flag for a number of changes
@@ -74,13 +74,6 @@
 	memoryLimitHeapGoalHeadroom = 1 << 20
 )
 
-func init() {
-	if offset := unsafe.Offsetof(gcController.heapLive); offset%8 != 0 {
-		println(offset)
-		throw("gcController.heapLive not aligned to 8 bytes")
-	}
-}
-
 // gcController implements the GC pacing controller that determines
 // when to trigger concurrent garbage collection and how much marking
 // work to do in mutator assists and background marking.
@@ -99,8 +92,6 @@
 	// Initialized from GOGC. GOGC=off means no GC.
 	gcPercent atomic.Int32
 
-	_ uint32 // padding so following 64-bit values are 8-byte aligned
-
 	// memoryLimit is the soft memory limit in bytes.
 	//
 	// Initialized from GOMEMLIMIT. GOMEMLIMIT=off is equivalent to MaxInt64
@@ -145,14 +136,10 @@
 	// Updated at the end of each GC cycle, in endCycle.
 	consMark float64
 
-	// consMarkController holds the state for the mark-cons ratio
-	// estimation over time.
-	//
-	// Its purpose is to smooth out noisiness in the computation of
-	// consMark; see consMark for details.
-	consMarkController piController
-
-	_ uint32 // Padding for atomics on 32-bit platforms.
+	// lastConsMark is the computed cons/mark value for the previous GC
+	// cycle. Note that this is *not* the last value of cons/mark, but the
+	// actual computed value. See endCycle for details.
+	lastConsMark float64
 
 	// gcPercentHeapGoal is the goal heapLive for when next GC ends derived
 	// from gcPercent.
@@ -193,32 +180,27 @@
 	// hence goes up as we allocate and down as we sweep) while heapLive
 	// excludes these objects (and hence only goes up between GCs).
 	//
-	// This is updated atomically without locking. To reduce
-	// contention, this is updated only when obtaining a span from
-	// an mcentral and at this point it counts all of the
-	// unallocated slots in that span (which will be allocated
-	// before that mcache obtains another span from that
-	// mcentral). Hence, it slightly overestimates the "true" live
-	// heap size. It's better to overestimate than to
-	// underestimate because 1) this triggers the GC earlier than
-	// necessary rather than potentially too late and 2) this
-	// leads to a conservative GC rate rather than a GC rate that
-	// is potentially too low.
-	//
-	// Reads should likewise be atomic (or during STW).
+	// To reduce contention, this is updated only when obtaining a span
+	// from an mcentral and at this point it counts all of the unallocated
+	// slots in that span (which will be allocated before that mcache
+	// obtains another span from that mcentral). Hence, it slightly
+	// overestimates the "true" live heap size. It's better to overestimate
+	// than to underestimate because 1) this triggers the GC earlier than
+	// necessary rather than potentially too late and 2) this leads to a
+	// conservative GC rate rather than a GC rate that is potentially too
+	// low.
 	//
 	// Whenever this is updated, call traceHeapAlloc() and
 	// this gcControllerState's revise() method.
-	heapLive uint64
+	heapLive atomic.Uint64
 
-	// heapScan is the number of bytes of "scannable" heap. This
-	// is the live heap (as counted by heapLive), but omitting
-	// no-scan objects and no-scan tails of objects.
+	// heapScan is the number of bytes of "scannable" heap. This is the
+	// live heap (as counted by heapLive), but omitting no-scan objects and
+	// no-scan tails of objects.
 	//
-	// This value is fixed at the start of a GC cycle, so during a
-	// GC cycle it is safe to read without atomics, and it represents
-	// the maximum scannable heap.
-	heapScan uint64
+	// This value is fixed at the start of a GC cycle. It represents the
+	// maximum scannable heap.
+	heapScan atomic.Uint64
 
 	// lastHeapScan is the number of bytes of heap that were scanned
 	// last GC cycle. It is the same as heapMarked, but only
@@ -229,7 +211,7 @@
 
 	// lastStackScan is the number of bytes of stack that were scanned
 	// last GC cycle.
-	lastStackScan uint64
+	lastStackScan atomic.Uint64
 
 	// maxStackScan is the amount of allocated goroutine stack space in
 	// use by goroutines.
@@ -239,15 +221,11 @@
 	// goroutine stack space is much harder to measure cheaply. By using
 	// allocated space, we make an overestimate; this is OK, it's better
 	// to conservatively overcount than undercount.
-	//
-	// Read and updated atomically.
-	maxStackScan uint64
+	maxStackScan atomic.Uint64
 
 	// globalsScan is the total amount of global variable space
 	// that is scannable.
-	//
-	// Read and updated atomically.
-	globalsScan uint64
+	globalsScan atomic.Uint64
 
 	// heapMarked is the number of bytes marked by the previous
 	// GC. After mark termination, heapLive == heapMarked, but
@@ -273,12 +251,11 @@
 	stackScanWork   atomic.Int64
 	globalsScanWork atomic.Int64
 
-	// bgScanCredit is the scan work credit accumulated by the
-	// concurrent background scan. This credit is accumulated by
-	// the background scan and stolen by mutator assists. This is
-	// updated atomically. Updates occur in bounded batches, since
-	// it is both written and read throughout the cycle.
-	bgScanCredit int64
+	// bgScanCredit is the scan work credit accumulated by the concurrent
+	// background scan. This credit is accumulated by the background scan
+	// and stolen by mutator assists.  Updates occur in bounded batches,
+	// since it is both written and read throughout the cycle.
+	bgScanCredit atomic.Int64
 
 	// assistTime is the nanoseconds spent in mutator assists
 	// during this cycle. This is updated atomically, and must also
@@ -287,31 +264,29 @@
 	// written and read throughout the cycle.
 	assistTime atomic.Int64
 
-	// dedicatedMarkTime is the nanoseconds spent in dedicated
-	// mark workers during this cycle. This is updated atomically
-	// at the end of the concurrent mark phase.
-	dedicatedMarkTime int64
+	// dedicatedMarkTime is the nanoseconds spent in dedicated mark workers
+	// during this cycle. This is updated at the end of the concurrent mark
+	// phase.
+	dedicatedMarkTime atomic.Int64
 
-	// fractionalMarkTime is the nanoseconds spent in the
-	// fractional mark worker during this cycle. This is updated
-	// atomically throughout the cycle and will be up-to-date if
-	// the fractional mark worker is not currently running.
-	fractionalMarkTime int64
+	// fractionalMarkTime is the nanoseconds spent in the fractional mark
+	// worker during this cycle. This is updated throughout the cycle and
+	// will be up-to-date if the fractional mark worker is not currently
+	// running.
+	fractionalMarkTime atomic.Int64
 
-	// idleMarkTime is the nanoseconds spent in idle marking
-	// during this cycle. This is updated atomically throughout
-	// the cycle.
-	idleMarkTime int64
+	// idleMarkTime is the nanoseconds spent in idle marking during this
+	// cycle. This is updated throughout the cycle.
+	idleMarkTime atomic.Int64
 
 	// markStartTime is the absolute start time in nanoseconds
 	// that assists and background mark workers started.
 	markStartTime int64
 
-	// dedicatedMarkWorkersNeeded is the number of dedicated mark
-	// workers that need to be started. This is computed at the
-	// beginning of each cycle and decremented atomically as
-	// dedicated mark workers get started.
-	dedicatedMarkWorkersNeeded int64
+	// dedicatedMarkWorkersNeeded is the number of dedicated mark workers
+	// that need to be started. This is computed at the beginning of each
+	// cycle and decremented as dedicated mark workers get started.
+	dedicatedMarkWorkersNeeded atomic.Int64
 
 	// idleMarkWorkers is two packed int32 values in a single uint64.
 	// These two values are always updated simultaneously.
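As a hedged aside on the packed representation described above: keeping two int32 values in one uint64 lets both be read and updated in a single atomic operation. A minimal standalone sketch of the pattern (package, variable, and function names here are illustrative, not the runtime's):

	package sketch

	import "sync/atomic"

	// packed holds two int32 counters in a single word so that both can be
	// observed and updated together atomically.
	var packed atomic.Uint64

	// pack combines two int32 values into one uint64 (high and low halves).
	func pack(hi, lo int32) uint64 { return uint64(uint32(hi))<<32 | uint64(uint32(lo)) }

	// unpack splits a packed word back into its two int32 halves.
	func unpack(v uint64) (hi, lo int32) { return int32(v >> 32), int32(uint32(v)) }

	// addLo adjusts the low counter while leaving the high counter intact,
	// retrying on contention.
	func addLo(delta int32) {
		for {
			old := packed.Load()
			hi, lo := unpack(old)
			if packed.CompareAndSwap(old, pack(hi, lo+delta)) {
				return
			}
		}
	}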
@@ -395,28 +370,6 @@
 func (c *gcControllerState) init(gcPercent int32, memoryLimit int64) {
 	c.heapMinimum = defaultHeapMinimum
 	c.triggered = ^uint64(0)
-
-	c.consMarkController = piController{
-		// Tuned first via the Ziegler-Nichols process in simulation,
-		// then the integral time was manually tuned against real-world
-		// applications to deal with noisiness in the measured cons/mark
-		// ratio.
-		kp: 0.9,
-		ti: 4.0,
-
-		// Set a high reset time in GC cycles.
-		// This is inversely proportional to the rate at which we
-		// accumulate error from clipping. By making this very high
-		// we make the accumulation slow. In general, clipping is
-		// OK in our situation, hence the choice.
-		//
-		// Tune this if we get unintended effects from clipping for
-		// a long time.
-		tt:  1000,
-		min: -1000,
-		max: 1000,
-	}
-
 	c.setGCPercent(gcPercent)
 	c.setMemoryLimit(memoryLimit)
 	c.commit(true) // No sweep phase in the first GC cycle.
@@ -433,32 +386,13 @@
 	c.heapScanWork.Store(0)
 	c.stackScanWork.Store(0)
 	c.globalsScanWork.Store(0)
-	c.bgScanCredit = 0
+	c.bgScanCredit.Store(0)
 	c.assistTime.Store(0)
-	c.dedicatedMarkTime = 0
-	c.fractionalMarkTime = 0
-	c.idleMarkTime = 0
+	c.dedicatedMarkTime.Store(0)
+	c.fractionalMarkTime.Store(0)
+	c.idleMarkTime.Store(0)
 	c.markStartTime = markStartTime
-
-	// TODO(mknyszek): This is supposed to be the actual trigger point for the heap, but
-	// causes regressions in memory use. The cause is that the PI controller used to smooth
-	// the cons/mark ratio measurements tends to flail when using the less accurate precomputed
-	// trigger for the cons/mark calculation, and this results in the controller being more
-	// conservative about steady-states it tries to find in the future.
-	//
-	// This conservatism is transient, but these transient states tend to matter for short-lived
-	// programs, especially because the PI controller is overdamped, partially because it is
-	// configured with a relatively large time constant.
-	//
-	// Ultimately, I think this is just two mistakes piled on one another: the choice of a swingy
-	// smoothing function that recalls a fairly long history (due to its overdamped time constant)
-	// coupled with an inaccurate cons/mark calculation. It just so happens this works better
-	// today, and it makes it harder to change things in the future.
-	//
-	// This is described in #53738. Fix this for #53892 by changing back to the actual trigger
-	// point and simplifying the smoothing function.
-	heapTrigger, heapGoal := c.trigger()
-	c.triggered = heapTrigger
+	c.triggered = c.heapLive.Load()
 
 	// Compute the background mark utilization goal. In general,
 	// this may not come out exactly. We round the number of
@@ -466,26 +400,26 @@
 	// 25%. For small GOMAXPROCS, this would introduce too much
 	// error, so we add fractional workers in that case.
 	totalUtilizationGoal := float64(procs) * gcBackgroundUtilization
-	c.dedicatedMarkWorkersNeeded = int64(totalUtilizationGoal + 0.5)
-	utilError := float64(c.dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
+	dedicatedMarkWorkersNeeded := int64(totalUtilizationGoal + 0.5)
+	utilError := float64(dedicatedMarkWorkersNeeded)/totalUtilizationGoal - 1
 	const maxUtilError = 0.3
 	if utilError < -maxUtilError || utilError > maxUtilError {
 		// Rounding put us more than 30% off our goal. With
 		// gcBackgroundUtilization of 25%, this happens for
 		// GOMAXPROCS<=3 or GOMAXPROCS=6. Enable fractional
 		// workers to compensate.
-		if float64(c.dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
+		if float64(dedicatedMarkWorkersNeeded) > totalUtilizationGoal {
 			// Too many dedicated workers.
-			c.dedicatedMarkWorkersNeeded--
+			dedicatedMarkWorkersNeeded--
 		}
-		c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(c.dedicatedMarkWorkersNeeded)) / float64(procs)
+		c.fractionalUtilizationGoal = (totalUtilizationGoal - float64(dedicatedMarkWorkersNeeded)) / float64(procs)
 	} else {
 		c.fractionalUtilizationGoal = 0
 	}
 
 	// In STW mode, we just want dedicated workers.
 	if debug.gcstoptheworld > 0 {
-		c.dedicatedMarkWorkersNeeded = int64(procs)
+		dedicatedMarkWorkersNeeded = int64(procs)
 		c.fractionalUtilizationGoal = 0
 	}
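As a worked example of the worker-count rounding above, using the 25% background utilization mentioned in the comments (illustrative arithmetic only): for procs = 6 the goal is 1.5 workers, which rounds to 2; the utilization error 2/1.5 - 1 ≈ 0.33 exceeds the 0.3 limit, so one dedicated worker is dropped and fractionalUtilizationGoal becomes (1.5 - 1)/6 ≈ 0.083 per P. For procs = 4 the goal is exactly 1.0, so one dedicated worker is used and no fractional worker is needed.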
 
@@ -500,7 +434,7 @@
 		// required. However, we need at least one dedicated mark worker or
 		// idle GC worker to ensure GC progress in some scenarios (see comment
 		// on maxIdleMarkWorkers).
-		if c.dedicatedMarkWorkersNeeded > 0 {
+		if dedicatedMarkWorkersNeeded > 0 {
 			c.setMaxIdleMarkWorkers(0)
 		} else {
 			// TODO(mknyszek): The fundamental reason why we need this is because
@@ -510,22 +444,24 @@
 			c.setMaxIdleMarkWorkers(1)
 		}
 	} else {
-		// N.B. gomaxprocs and dedicatedMarkWorkersNeeded is guaranteed not to
+		// N.B. gomaxprocs and dedicatedMarkWorkersNeeded are guaranteed not to
 		// change during a GC cycle.
-		c.setMaxIdleMarkWorkers(int32(procs) - int32(c.dedicatedMarkWorkersNeeded))
+		c.setMaxIdleMarkWorkers(int32(procs) - int32(dedicatedMarkWorkersNeeded))
 	}
 
 	// Compute initial values for controls that are updated
 	// throughout the cycle.
+	c.dedicatedMarkWorkersNeeded.Store(dedicatedMarkWorkersNeeded)
 	c.revise()
 
 	if debug.gcpacertrace > 0 {
+		heapGoal := c.heapGoal()
 		assistRatio := c.assistWorkPerByte.Load()
 		print("pacer: assist ratio=", assistRatio,
-			" (scan ", gcController.heapScan>>20, " MB in ",
+			" (scan ", gcController.heapScan.Load()>>20, " MB in ",
 			work.initialHeapLive>>20, "->",
 			heapGoal>>20, " MB)",
-			" workers=", c.dedicatedMarkWorkersNeeded,
+			" workers=", dedicatedMarkWorkersNeeded,
 			"+", c.fractionalUtilizationGoal, "\n")
 	}
 }
@@ -559,8 +495,8 @@
 		// act like GOGC is huge for the below calculations.
 		gcPercent = 100000
 	}
-	live := atomic.Load64(&c.heapLive)
-	scan := atomic.Load64(&c.heapScan)
+	live := c.heapLive.Load()
+	scan := c.heapScan.Load()
 	work := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load()
 
 	// Assume we're under the soft goal. Pace GC to complete at
@@ -569,14 +505,14 @@
 
 	// The expected scan work is computed as the amount of bytes scanned last
 	// GC cycle (both heap and stack), plus our estimate of globals work for this cycle.
-	scanWorkExpected := int64(c.lastHeapScan + c.lastStackScan + c.globalsScan)
+	scanWorkExpected := int64(c.lastHeapScan + c.lastStackScan.Load() + c.globalsScan.Load())
 
 	// maxScanWork is a worst-case estimate of the amount of scan work that
 	// needs to be performed in this GC cycle. Specifically, it represents
 	// the case where *all* scannable memory turns out to be live, and
 	// *all* allocated stack space is scannable.
-	maxStackScan := atomic.Load64(&c.maxStackScan)
-	maxScanWork := int64(scan + maxStackScan + c.globalsScan)
+	maxStackScan := c.maxStackScan.Load()
+	maxScanWork := int64(scan + maxStackScan + c.globalsScan.Load())
 	if work > scanWorkExpected {
 		// We've already done more scan work than expected. Because our expectation
 		// is based on a steady-state scannable heap size, we assume this means our
@@ -675,7 +611,7 @@
 		utilization += float64(c.assistTime.Load()) / float64(assistDuration*int64(procs))
 	}
 
-	if c.heapLive <= c.triggered {
+	if c.heapLive.Load() <= c.triggered {
 		// Shouldn't happen, but let's be very safe about this in case the
 		// GC is somehow extremely short.
 		//
@@ -688,7 +624,7 @@
 	}
 	idleUtilization := 0.0
 	if assistDuration > 0 {
-		idleUtilization = float64(c.idleMarkTime) / float64(assistDuration*int64(procs))
+		idleUtilization = float64(c.idleMarkTime.Load()) / float64(assistDuration*int64(procs))
 	}
 	// Determine the cons/mark ratio.
 	//
@@ -706,7 +642,7 @@
 	//
 	//    assistDuration * procs * (utilization + idleUtilization)
 	//
-	// In this case, we *include* idle utilization, because that is additional CPU time that the
+	// In this case, we *include* idle utilization, because that is additional CPU time that
 	// the GC had available to it.
 	//
 	// In effect, idle GC time is sort of double-counted here, but it's very weird compared
@@ -719,44 +655,23 @@
 	//
 	// Note that because we only care about the ratio, assistDuration and procs cancel out.
 	scanWork := c.heapScanWork.Load() + c.stackScanWork.Load() + c.globalsScanWork.Load()
-	currentConsMark := (float64(c.heapLive-c.triggered) * (utilization + idleUtilization)) /
+	currentConsMark := (float64(c.heapLive.Load()-c.triggered) * (utilization + idleUtilization)) /
 		(float64(scanWork) * (1 - utilization))
 
-	// Update cons/mark controller. The time period for this is 1 GC cycle.
-	//
-	// This use of a PI controller might seem strange. So, here's an explanation:
-	//
-	// currentConsMark represents the consMark we *should've* had to be perfectly
-	// on-target for this cycle. Given that we assume the next GC will be like this
-	// one in the steady-state, it stands to reason that we should just pick that
-	// as our next consMark. In practice, however, currentConsMark is too noisy:
-	// we're going to be wildly off-target in each GC cycle if we do that.
-	//
-	// What we do instead is make a long-term assumption: there is some steady-state
-	// consMark value, but it's obscured by noise. By constantly shooting for this
-	// noisy-but-perfect consMark value, the controller will bounce around a bit,
-	// but its average behavior, in aggregate, should be less noisy and closer to
-	// the true long-term consMark value, provided its tuned to be slightly overdamped.
-	var ok bool
+	// Update our cons/mark estimate. This is the raw value above, but averaged over 2 GC cycles
+	// because it tends to be jittery, even in the steady-state. The smoothing helps the GC to
+	// maintain much more stable cycle-by-cycle behavior.
 	oldConsMark := c.consMark
-	c.consMark, ok = c.consMarkController.next(c.consMark, currentConsMark, 1.0)
-	if !ok {
-		// The error spiraled out of control. This is incredibly unlikely seeing
-		// as this controller is essentially just a smoothing function, but it might
-		// mean that something went very wrong with how currentConsMark was calculated.
-		// Just reset consMark and keep going.
-		c.consMark = 0
-	}
+	c.consMark = (currentConsMark + c.lastConsMark) / 2
+	c.lastConsMark = currentConsMark
 
 	if debug.gcpacertrace > 0 {
 		printlock()
 		goal := gcGoalUtilization * 100
 		print("pacer: ", int(utilization*100), "% CPU (", int(goal), " exp.) for ")
-		print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.lastStackScan+c.globalsScan, " B exp.) ")
-		print("in ", c.triggered, " B -> ", c.heapLive, " B (∆goal ", int64(c.heapLive)-int64(c.lastHeapGoal), ", cons/mark ", oldConsMark, ")")
-		if !ok {
-			print("[controller reset]")
-		}
+		print(c.heapScanWork.Load(), "+", c.stackScanWork.Load(), "+", c.globalsScanWork.Load(), " B work (", c.lastHeapScan+c.lastStackScan.Load()+c.globalsScan.Load(), " B exp.) ")
+		live := c.heapLive.Load()
+		print("in ", c.triggered, " B -> ", live, " B (∆goal ", int64(live)-int64(c.lastHeapGoal), ", cons/mark ", oldConsMark, ")")
 		println()
 		printunlock()
 	}
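The smoothing applied to the cons/mark ratio above is a two-cycle moving average. A hedged, standalone sketch of the same idea (type and method names are illustrative, not the runtime's):

	package sketch

	// consMarkSmoother averages the current raw cons/mark measurement with the
	// previous cycle's raw measurement to damp cycle-to-cycle jitter.
	type consMarkSmoother struct {
		last float64 // raw measurement from the previous GC cycle
	}

	// update records the current raw measurement and returns the smoothed value.
	func (s *consMarkSmoother) update(current float64) float64 {
		smoothed := (current + s.last) / 2
		s.last = current
		return smoothed
	}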
@@ -771,14 +686,14 @@
 	// If there are idle Ps, wake one so it will run an idle worker.
 	// NOTE: This is suspected of causing deadlocks. See golang.org/issue/19112.
 	//
-	//	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+	//	if sched.npidle.Load() != 0 && sched.nmspinning.Load() == 0 {
 	//		wakep()
 	//		return
 	//	}
 
 	// There are no idle Ps. If we need more dedicated workers,
 	// try to preempt a running P so it will switch to a worker.
-	if c.dedicatedMarkWorkersNeeded <= 0 {
+	if c.dedicatedMarkWorkersNeeded.Load() <= 0 {
 		return
 	}
 	// Pick a random other P to preempt.
@@ -805,9 +720,9 @@
 	}
 }
 
-// findRunnableGCWorker returns a background mark worker for _p_ if it
+// findRunnableGCWorker returns a background mark worker for pp if it
 // should be run. This must only be called when gcBlackenEnabled != 0.
-func (c *gcControllerState) findRunnableGCWorker(_p_ *p, now int64) (*g, int64) {
+func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
 	if gcBlackenEnabled == 0 {
 		throw("gcControllerState.findRunnable: blackening not enabled")
 	}
@@ -823,7 +738,7 @@
 		gcCPULimiter.update(now)
 	}
 
-	if !gcMarkWorkAvailable(_p_) {
+	if !gcMarkWorkAvailable(pp) {
 		// No work to be done right now. This can happen at
 		// the end of the mark phase when there are still
 		// assists tapering off. Don't bother running a worker
@@ -848,14 +763,14 @@
 		return nil, now
 	}
 
-	decIfPositive := func(ptr *int64) bool {
+	decIfPositive := func(val *atomic.Int64) bool {
 		for {
-			v := atomic.Loadint64(ptr)
+			v := val.Load()
 			if v <= 0 {
 				return false
 			}
 
-			if atomic.Casint64(ptr, v, v-1) {
+			if val.CompareAndSwap(v, v-1) {
 				return true
 			}
 		}
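decIfPositive above is a standard decrement-only-if-positive CAS loop. The same pattern expressed against the public sync/atomic package, as a sketch (the runtime itself uses runtime/internal/atomic):

	package sketch

	import "sync/atomic"

	// decIfPositive atomically decrements v only if its current value is
	// positive, reporting whether the decrement happened.
	func decIfPositive(v *atomic.Int64) bool {
		for {
			n := v.Load()
			if n <= 0 {
				return false
			}
			if v.CompareAndSwap(n, n-1) {
				return true
			}
		}
	}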
@@ -864,7 +779,7 @@
 	if decIfPositive(&c.dedicatedMarkWorkersNeeded) {
 		// This P is now dedicated to marking until the end of
 		// the concurrent mark phase.
-		_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
+		pp.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
 	} else if c.fractionalUtilizationGoal == 0 {
 		// No need for fractional workers.
 		gcBgMarkWorkerPool.push(&node.node)
@@ -875,13 +790,13 @@
 		//
 		// This should be kept in sync with pollFractionalWorkerExit.
 		delta := now - c.markStartTime
-		if delta > 0 && float64(_p_.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
+		if delta > 0 && float64(pp.gcFractionalMarkTime)/float64(delta) > c.fractionalUtilizationGoal {
 			// Nope. No need to run a fractional worker.
 			gcBgMarkWorkerPool.push(&node.node)
 			return nil, now
 		}
 		// Run a fractional worker.
-		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
+		pp.gcMarkWorkerMode = gcMarkWorkerFractionalMode
 	}
 
 	// Run the background mark worker.
@@ -900,15 +815,15 @@
 // The world must be stopped.
 func (c *gcControllerState) resetLive(bytesMarked uint64) {
 	c.heapMarked = bytesMarked
-	c.heapLive = bytesMarked
-	c.heapScan = uint64(c.heapScanWork.Load())
+	c.heapLive.Store(bytesMarked)
+	c.heapScan.Store(uint64(c.heapScanWork.Load()))
 	c.lastHeapScan = uint64(c.heapScanWork.Load())
-	c.lastStackScan = uint64(c.stackScanWork.Load())
+	c.lastStackScan.Store(uint64(c.stackScanWork.Load()))
 	c.triggered = ^uint64(0) // Reset triggered.
 
 	// heapLive was updated, so emit a trace event.
 	if trace.enabled {
-		traceHeapAlloc()
+		traceHeapAlloc(bytesMarked)
 	}
 }
 
@@ -921,12 +836,12 @@
 func (c *gcControllerState) markWorkerStop(mode gcMarkWorkerMode, duration int64) {
 	switch mode {
 	case gcMarkWorkerDedicatedMode:
-		atomic.Xaddint64(&c.dedicatedMarkTime, duration)
-		atomic.Xaddint64(&c.dedicatedMarkWorkersNeeded, 1)
+		c.dedicatedMarkTime.Add(duration)
+		c.dedicatedMarkWorkersNeeded.Add(1)
 	case gcMarkWorkerFractionalMode:
-		atomic.Xaddint64(&c.fractionalMarkTime, duration)
+		c.fractionalMarkTime.Add(duration)
 	case gcMarkWorkerIdleMode:
-		atomic.Xaddint64(&c.idleMarkTime, duration)
+		c.idleMarkTime.Add(duration)
 		c.removeIdleMarkWorker()
 	default:
 		throw("markWorkerStop: unknown mark worker mode")
@@ -935,17 +850,17 @@
 
 func (c *gcControllerState) update(dHeapLive, dHeapScan int64) {
 	if dHeapLive != 0 {
-		atomic.Xadd64(&gcController.heapLive, dHeapLive)
+		live := gcController.heapLive.Add(dHeapLive)
 		if trace.enabled {
 			// gcController.heapLive changed.
-			traceHeapAlloc()
+			traceHeapAlloc(live)
 		}
 	}
 	if gcBlackenEnabled == 0 {
 		// Update heapScan when we're not in a current GC. It is fixed
 		// at the beginning of a cycle.
 		if dHeapScan != 0 {
-			atomic.Xadd64(&gcController.heapScan, dHeapScan)
+			gcController.heapScan.Add(dHeapScan)
 		}
 	} else {
 		// gcController.heapLive changed.
@@ -955,18 +870,18 @@
 
 func (c *gcControllerState) addScannableStack(pp *p, amount int64) {
 	if pp == nil {
-		atomic.Xadd64(&c.maxStackScan, amount)
+		c.maxStackScan.Add(amount)
 		return
 	}
 	pp.maxStackScanDelta += amount
 	if pp.maxStackScanDelta >= maxStackScanSlack || pp.maxStackScanDelta <= -maxStackScanSlack {
-		atomic.Xadd64(&c.maxStackScan, pp.maxStackScanDelta)
+		c.maxStackScan.Add(pp.maxStackScanDelta)
 		pp.maxStackScanDelta = 0
 	}
 }
 
 func (c *gcControllerState) addGlobals(amount int64) {
-	atomic.Xadd64(&c.globalsScan, amount)
+	c.globalsScan.Add(amount)
 }
 
 // heapGoal returns the current heap goal.
@@ -1260,7 +1175,7 @@
 		// Concurrent sweep happens in the heap growth
 		// from gcController.heapLive to trigger. Make sure we
 		// give the sweeper some runway if it doesn't have enough.
-		c.sweepDistMinTrigger.Store(atomic.Load64(&c.heapLive) + sweepMinHeapDistance)
+		c.sweepDistMinTrigger.Store(c.heapLive.Load() + sweepMinHeapDistance)
 	}
 
 	// Compute the next GC goal, which is when the allocated heap
@@ -1268,7 +1183,7 @@
 	// plus additional runway for non-heap sources of GC work.
 	gcPercentHeapGoal := ^uint64(0)
 	if gcPercent := c.gcPercent.Load(); gcPercent >= 0 {
-		gcPercentHeapGoal = c.heapMarked + (c.heapMarked+atomic.Load64(&c.lastStackScan)+atomic.Load64(&c.globalsScan))*uint64(gcPercent)/100
+		gcPercentHeapGoal = c.heapMarked + (c.heapMarked+c.lastStackScan.Load()+c.globalsScan.Load())*uint64(gcPercent)/100
 	}
 	// Apply the minimum heap size here. It's defined in terms of gcPercent
 	// and is only updated by functions that call commit.
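For illustration of the goal formula above (all numbers invented): with heapMarked = 100 MiB, lastStackScan = 4 MiB, globalsScan = 1 MiB, and gcPercent = 100, gcPercentHeapGoal = 100 + (100 + 4 + 1) * 100 / 100 = 205 MiB.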
@@ -1300,7 +1215,7 @@
 	// Furthermore, by setting the runway so that CPU resources are divided
 	// this way, assuming that the cons/mark ratio is correct, we make that
 	// division a reality.
-	c.runway.Store(uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.lastStackScan+c.globalsScan)))
+	c.runway.Store(uint64((c.consMark * (1 - gcGoalUtilization) / (gcGoalUtilization)) * float64(c.lastHeapScan+c.lastStackScan.Load()+c.globalsScan.Load())))
 }
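As a rough illustration of the runway formula above (values invented): with a smoothed cons/mark ratio of 2 and a goal utilization of 0.25, the runway is 2 * (1 - 0.25) / 0.25 = 6 times the expected scannable bytes (lastHeapScan + lastStackScan + globalsScan).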
 
 // setGCPercent updates gcPercent. commit must be called after.
@@ -1335,7 +1250,7 @@
 	// If we just disabled GC, wait for any concurrent GC mark to
 	// finish so we always return with no GC running.
 	if in < 0 {
-		gcWaitOnMark(atomic.Load(&work.cycles))
+		gcWaitOnMark(work.cycles.Load())
 	}
 
 	return out
@@ -1400,74 +1315,6 @@
 	return n
 }
 
-type piController struct {
-	kp float64 // Proportional constant.
-	ti float64 // Integral time constant.
-	tt float64 // Reset time.
-
-	min, max float64 // Output boundaries.
-
-	// PI controller state.
-
-	errIntegral float64 // Integral of the error from t=0 to now.
-
-	// Error flags.
-	errOverflow   bool // Set if errIntegral ever overflowed.
-	inputOverflow bool // Set if an operation with the input overflowed.
-}
-
-// next provides a new sample to the controller.
-//
-// input is the sample, setpoint is the desired point, and period is how much
-// time (in whatever unit makes the most sense) has passed since the last sample.
-//
-// Returns a new value for the variable it's controlling, and whether the operation
-// completed successfully. One reason this might fail is if error has been growing
-// in an unbounded manner, to the point of overflow.
-//
-// In the specific case of an error overflow occurs, the errOverflow field will be
-// set and the rest of the controller's internal state will be fully reset.
-func (c *piController) next(input, setpoint, period float64) (float64, bool) {
-	// Compute the raw output value.
-	prop := c.kp * (setpoint - input)
-	rawOutput := prop + c.errIntegral
-
-	// Clamp rawOutput into output.
-	output := rawOutput
-	if isInf(output) || isNaN(output) {
-		// The input had a large enough magnitude that either it was already
-		// overflowed, or some operation with it overflowed.
-		// Set a flag and reset. That's the safest thing to do.
-		c.reset()
-		c.inputOverflow = true
-		return c.min, false
-	}
-	if output < c.min {
-		output = c.min
-	} else if output > c.max {
-		output = c.max
-	}
-
-	// Update the controller's state.
-	if c.ti != 0 && c.tt != 0 {
-		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
-		if isInf(c.errIntegral) || isNaN(c.errIntegral) {
-			// So much error has accumulated that we managed to overflow.
-			// The assumptions around the controller have likely broken down.
-			// Set a flag and reset. That's the safest thing to do.
-			c.reset()
-			c.errOverflow = true
-			return c.min, false
-		}
-	}
-	return output, true
-}
-
-// reset resets the controller state, except for controller error flags.
-func (c *piController) reset() {
-	c.errIntegral = 0
-}
-
 // addIdleMarkWorker attempts to add a new idle mark worker.
 //
 // If this returns true, the caller must become an idle mark worker unless
diff --git a/src/runtime/mgcpacer_test.go b/src/runtime/mgcpacer_test.go
index 12d885d..e373e32 100644
--- a/src/runtime/mgcpacer_test.go
+++ b/src/runtime/mgcpacer_test.go
@@ -1019,51 +1019,6 @@
 	}
 }
 
-func FuzzPIController(f *testing.F) {
-	isNormal := func(x float64) bool {
-		return !math.IsInf(x, 0) && !math.IsNaN(x)
-	}
-	isPositive := func(x float64) bool {
-		return isNormal(x) && x > 0
-	}
-	// Seed with constants from controllers in the runtime.
-	// It's not critical that we keep these in sync, they're just
-	// reasonable seed inputs.
-	f.Add(0.3375, 3.2e6, 1e9, 0.001, 1000.0, 0.01)
-	f.Add(0.9, 4.0, 1000.0, -1000.0, 1000.0, 0.84)
-	f.Fuzz(func(t *testing.T, kp, ti, tt, min, max, setPoint float64) {
-		// Ignore uninteresting invalid parameters. These parameters
-		// are constant, so in practice surprising values will be documented
-		// or will be other otherwise immediately visible.
-		//
-		// We just want to make sure that given a non-Inf, non-NaN input,
-		// we always get a non-Inf, non-NaN output.
-		if !isPositive(kp) || !isPositive(ti) || !isPositive(tt) {
-			return
-		}
-		if !isNormal(min) || !isNormal(max) || min > max {
-			return
-		}
-		// Use a random source, but make it deterministic.
-		rs := rand.New(rand.NewSource(800))
-		randFloat64 := func() float64 {
-			return math.Float64frombits(rs.Uint64())
-		}
-		p := NewPIController(kp, ti, tt, min, max)
-		state := float64(0)
-		for i := 0; i < 100; i++ {
-			input := randFloat64()
-			// Ignore the "ok" parameter. We're just trying to break it.
-			// state is intentionally completely uncorrelated with the input.
-			var ok bool
-			state, ok = p.Next(input, setPoint, 1.0)
-			if !isNormal(state) {
-				t.Fatalf("got NaN or Inf result from controller: %f %v", state, ok)
-			}
-		}
-	})
-}
-
 func TestIdleMarkWorkerCount(t *testing.T) {
 	const workers = 10
 	c := NewGCController(100, math.MaxInt64)
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index bf38f87..e59340e 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -221,6 +221,16 @@
 	// gcController.memoryLimit by choosing to target the memory limit or
 	// some lower target to keep the scavenger working.
 	memoryLimitGoal atomic.Uint64
+
+	// assistTime is the time spent by the allocator scavenging in the last GC cycle.
+	//
+	// This is reset once a GC cycle ends.
+	assistTime atomic.Int64
+
+	// backgroundTime is the time spent by the background scavenger in the last GC cycle.
+	//
+	// This is reset once a GC cycle ends.
+	backgroundTime atomic.Int64
 }
 
 const (
@@ -361,6 +371,7 @@
 			if start >= end {
 				return r, 0
 			}
+			scavenge.backgroundTime.Add(end - start)
 			return r, end - start
 		}
 	}
@@ -718,7 +729,7 @@
 	if p.summary[len(p.summary)-1][ci].max() >= uint(minPages) {
 		// We only bother looking for a candidate if there are at least
 		// minPages free pages at all.
-		base, npages := p.chunkOf(ci).findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
+		base, npages := p.chunkOf(ci).findScavengeCandidate(searchIdx, minPages, maxPages)
 
 		// If we found something, scavenge it and return!
 		if npages != 0 {
@@ -736,6 +747,8 @@
 			unlock(p.mheapLock)
 
 			if !p.test {
+				pageTraceScav(getg().m.p.ptr(), 0, addr, uintptr(npages))
+
 				// Only perform the actual scavenging if we're not in a test.
 				// It's dangerous to do so otherwise.
 				sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
@@ -1103,3 +1116,71 @@
 func (s *scavengeIndex) clear(ci chunkIdx) {
 	s.chunks[ci/8].And(^uint8(1 << (ci % 8)))
 }
+
+type piController struct {
+	kp float64 // Proportional constant.
+	ti float64 // Integral time constant.
+	tt float64 // Reset time.
+
+	min, max float64 // Output boundaries.
+
+	// PI controller state.
+
+	errIntegral float64 // Integral of the error from t=0 to now.
+
+	// Error flags.
+	errOverflow   bool // Set if errIntegral ever overflowed.
+	inputOverflow bool // Set if an operation with the input overflowed.
+}
+
+// next provides a new sample to the controller.
+//
+// input is the sample, setpoint is the desired point, and period is how much
+// time (in whatever unit makes the most sense) has passed since the last sample.
+//
+// Returns a new value for the variable it's controlling, and whether the operation
+// completed successfully. One reason this might fail is if the error has been
+// growing in an unbounded manner, to the point of overflow.
+//
+// In the specific case where an error overflow occurs, the errOverflow field will
+// be set and the rest of the controller's internal state will be fully reset.
+func (c *piController) next(input, setpoint, period float64) (float64, bool) {
+	// Compute the raw output value.
+	prop := c.kp * (setpoint - input)
+	rawOutput := prop + c.errIntegral
+
+	// Clamp rawOutput into output.
+	output := rawOutput
+	if isInf(output) || isNaN(output) {
+		// The input had a large enough magnitude that either it was already
+		// overflowed, or some operation with it overflowed.
+		// Set a flag and reset. That's the safest thing to do.
+		c.reset()
+		c.inputOverflow = true
+		return c.min, false
+	}
+	if output < c.min {
+		output = c.min
+	} else if output > c.max {
+		output = c.max
+	}
+
+	// Update the controller's state.
+	if c.ti != 0 && c.tt != 0 {
+		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
+		if isInf(c.errIntegral) || isNaN(c.errIntegral) {
+			// So much error has accumulated that we managed to overflow.
+			// The assumptions around the controller have likely broken down.
+			// Set a flag and reset. That's the safest thing to do.
+			c.reset()
+			c.errOverflow = true
+			return c.min, false
+		}
+	}
+	return output, true
+}
+
+// reset resets the controller state, except for controller error flags.
+func (c *piController) reset() {
+	c.errIntegral = 0
+}
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
index 620392f..c436ff0 100644
--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@@ -7,6 +7,7 @@
 import (
 	"fmt"
 	"internal/goos"
+	"math"
 	"math/rand"
 	. "runtime"
 	"runtime/internal/atomic"
@@ -707,3 +708,48 @@
 		find(0, 0)
 	})
 }
+
+func FuzzPIController(f *testing.F) {
+	isNormal := func(x float64) bool {
+		return !math.IsInf(x, 0) && !math.IsNaN(x)
+	}
+	isPositive := func(x float64) bool {
+		return isNormal(x) && x > 0
+	}
+	// Seed with constants from controllers in the runtime.
+	// It's not critical that we keep these in sync, they're just
+	// reasonable seed inputs.
+	f.Add(0.3375, 3.2e6, 1e9, 0.001, 1000.0, 0.01)
+	f.Add(0.9, 4.0, 1000.0, -1000.0, 1000.0, 0.84)
+	f.Fuzz(func(t *testing.T, kp, ti, tt, min, max, setPoint float64) {
+		// Ignore uninteresting invalid parameters. These parameters
+		// are constant, so in practice surprising values will be documented
+		// or will be otherwise immediately visible.
+		//
+		// We just want to make sure that given a non-Inf, non-NaN input,
+		// we always get a non-Inf, non-NaN output.
+		if !isPositive(kp) || !isPositive(ti) || !isPositive(tt) {
+			return
+		}
+		if !isNormal(min) || !isNormal(max) || min > max {
+			return
+		}
+		// Use a random source, but make it deterministic.
+		rs := rand.New(rand.NewSource(800))
+		randFloat64 := func() float64 {
+			return math.Float64frombits(rs.Uint64())
+		}
+		p := NewPIController(kp, ti, tt, min, max)
+		state := float64(0)
+		for i := 0; i < 100; i++ {
+			input := randFloat64()
+			// Ignore the "ok" parameter. We're just trying to break it.
+			// state is intentionally completely uncorrelated with the input.
+			var ok bool
+			state, ok = p.Next(input, setPoint, 1.0)
+			if !isNormal(state) {
+				t.Fatalf("got NaN or Inf result from controller: %f %v", state, ok)
+			}
+		}
+	})
+}
diff --git a/src/runtime/mgcstack.go b/src/runtime/mgcstack.go
index 472c61a..6b55220 100644
--- a/src/runtime/mgcstack.go
+++ b/src/runtime/mgcstack.go
@@ -96,6 +96,7 @@
 
 import (
 	"internal/goarch"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -103,17 +104,15 @@
 
 // Buffer for pointers found during stack tracing.
 // Must be smaller than or equal to workbuf.
-//
-//go:notinheap
 type stackWorkBuf struct {
+	_ sys.NotInHeap
 	stackWorkBufHdr
 	obj [(_WorkbufSize - unsafe.Sizeof(stackWorkBufHdr{})) / goarch.PtrSize]uintptr
 }
 
 // Header declaration must come after the buf declaration above, because of issue #14620.
-//
-//go:notinheap
 type stackWorkBufHdr struct {
+	_ sys.NotInHeap
 	workbufhdr
 	next *stackWorkBuf // linked list of workbufs
 	// Note: we could theoretically repurpose lfnode.next as this next pointer.
@@ -123,15 +122,14 @@
 
 // Buffer for stack objects found on a goroutine stack.
 // Must be smaller than or equal to workbuf.
-//
-//go:notinheap
 type stackObjectBuf struct {
+	_ sys.NotInHeap
 	stackObjectBufHdr
 	obj [(_WorkbufSize - unsafe.Sizeof(stackObjectBufHdr{})) / unsafe.Sizeof(stackObject{})]stackObject
 }
 
-//go:notinheap
 type stackObjectBufHdr struct {
+	_ sys.NotInHeap
 	workbufhdr
 	next *stackObjectBuf
 }
@@ -147,9 +145,8 @@
 
 // A stackObject represents a variable on the stack that has had
 // its address taken.
-//
-//go:notinheap
 type stackObject struct {
+	_     sys.NotInHeap
 	off   uint32             // offset above stack.lo
 	size  uint32             // size of object
 	r     *stackObjectRecord // info of the object (for ptr/nonptr bits). nil if object has been scanned.
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index de57f18..6ccf090 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -33,10 +33,9 @@
 
 // State of background sweep.
 type sweepdata struct {
-	lock    mutex
-	g       *g
-	parked  bool
-	started bool
+	lock   mutex
+	g      *g
+	parked bool
 
 	nbgsweep    uint32
 	npausesweep uint32
@@ -177,7 +176,8 @@
 				return
 			}
 			if debug.gcpacertrace > 0 {
-				print("pacer: sweep done at heap size ", gcController.heapLive>>20, "MB; allocated ", (gcController.heapLive-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept.Load(), " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+				live := gcController.heapLive.Load()
+				print("pacer: sweep done at heap size ", live>>20, "MB; allocated ", (live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept.Load(), " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
 			}
 			return
 		}
@@ -278,12 +278,34 @@
 	goparkunlock(&sweep.lock, waitReasonGCSweepWait, traceEvGoBlock, 1)
 
 	for {
+		// bgsweep attempts to be a "low priority" goroutine by intentionally
+		// yielding time. It's OK if it doesn't run, because goroutines allocating
+		// memory will sweep and ensure that all spans are swept before the next
+		// GC cycle. We really only want to run when we're idle.
+		//
+		// However, calling Gosched after each span swept produces a tremendous
+		// amount of tracing events, sometimes up to 50% of events in a trace. It's
+		// also inefficient to call into the scheduler so much because sweeping a
+		// single span is in general a very fast operation, taking as little as 30 ns
+		// on modern hardware. (See #54767.)
+		//
+		// As a result, bgsweep sweeps in batches, and only calls into the scheduler
+		// at the end of every batch. Furthermore, it only yields its time if there
+		// isn't spare idle time available on other cores. If there's available idle
+		// time, helping to sweep can reduce allocation latencies by getting ahead of
+		// the proportional sweeper and having spans ready to go for allocation.
+		const sweepBatchSize = 10
+		nSwept := 0
 		for sweepone() != ^uintptr(0) {
 			sweep.nbgsweep++
-			Gosched()
+			nSwept++
+			if nSwept%sweepBatchSize == 0 {
+				goschedIfBusy()
+			}
 		}
 		for freeSomeWbufs(true) {
-			Gosched()
+			// N.B. freeSomeWbufs is already batched internally.
+			goschedIfBusy()
 		}
 		lock(&sweep.lock)
 		if !isSweepDone() {
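The batching described in the comment above is a general pattern: when each unit of work is very cheap, only call into the scheduler once per batch. A hedged sketch of that loop shape outside the runtime (names are hypothetical):

	package sketch

	// drain runs work() until it reports no more items, yielding to the
	// scheduler only once per batch so that the yields stay cheap relative
	// to the work itself.
	func drain(work func() bool, yield func(), batchSize int) {
		n := 0
		for work() {
			n++
			if n%batchSize == 0 {
				yield()
			}
		}
	}

For example, drain(sweepOneSpan, runtime.Gosched, 10) would mirror the sweepBatchSize of 10 used above (sweepOneSpan is a placeholder for a function that sweeps one span and reports whether any remain).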
@@ -431,8 +453,8 @@
 	// Caller must disable preemption.
 	// Otherwise when this function returns the span can become unswept again
 	// (if GC is triggered on another goroutine).
-	_g_ := getg()
-	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+	gp := getg()
+	if gp.m.locks == 0 && gp.m.mallocing == 0 && gp != gp.m.g0 {
 		throw("mspan.ensureSwept: m is not locked")
 	}
 
@@ -470,8 +492,8 @@
 func (sl *sweepLocked) sweep(preserve bool) bool {
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
-	_g_ := getg()
-	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+	gp := getg()
+	if gp.m.locks == 0 && gp.m.mallocing == 0 && gp != gp.m.g0 {
 		throw("mspan.sweep: m is not locked")
 	}
 
@@ -579,13 +601,14 @@
 				if debug.clobberfree != 0 {
 					clobberfree(unsafe.Pointer(x), size)
 				}
-				if raceenabled {
+				// User arenas are handled on explicit free.
+				if raceenabled && !s.isUserArenaChunk {
 					racefree(unsafe.Pointer(x), size)
 				}
-				if msanenabled {
+				if msanenabled && !s.isUserArenaChunk {
 					msanfree(unsafe.Pointer(x), size)
 				}
-				if asanenabled {
+				if asanenabled && !s.isUserArenaChunk {
 					asanpoison(unsafe.Pointer(x), size)
 				}
 			}
@@ -625,6 +648,7 @@
 
 	s.allocCount = nalloc
 	s.freeindex = 0 // reset allocation index to start of span.
+	s.freeIndexForScan = 0
 	if trace.enabled {
 		getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
 	}
@@ -659,6 +683,41 @@
 	// to go so release the span.
 	atomic.Store(&s.sweepgen, sweepgen)
 
+	if s.isUserArenaChunk {
+		if preserve {
+			// This is a case that should never be handled by a sweeper that
+			// preserves the span for reuse.
+			throw("sweep: tried to preserve a user arena span")
+		}
+		if nalloc > 0 {
+			// There still exist pointers into the span or the span hasn't been
+			// freed yet. It's not ready to be reused. Put it back on the
+			// full swept list for the next cycle.
+			mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s)
+			return false
+		}
+
+		// It's only at this point that the sweeper doesn't actually need to look
+		// at this arena anymore, so subtract from pagesInUse now.
+		mheap_.pagesInUse.Add(-s.npages)
+		s.state.set(mSpanDead)
+
+		// The arena is ready to be recycled. Remove it from the quarantine list
+		// and place it on the ready list. Don't add it back to any sweep lists.
+		systemstack(func() {
+			// It's the arena code's responsibility to get the chunk on the quarantine
+			// list by the time all references to the chunk are gone.
+			if s.list != &mheap_.userArena.quarantineList {
+				throw("user arena span is on the wrong list")
+			}
+			lock(&mheap_.lock)
+			mheap_.userArena.quarantineList.remove(s)
+			mheap_.userArena.readyList.insert(s)
+			unlock(&mheap_.lock)
+		})
+		return false
+	}
+
 	if spc.sizeclass() != 0 {
 		// Handle spans for small objects.
 		if nfreed > 0 {
@@ -814,11 +873,30 @@
 		traceGCSweepStart()
 	}
 
+	// Fix debt if necessary.
 retry:
 	sweptBasis := mheap_.pagesSweptBasis.Load()
-
-	// Fix debt if necessary.
-	newHeapLive := uintptr(atomic.Load64(&gcController.heapLive)-mheap_.sweepHeapLiveBasis) + spanBytes
+	live := gcController.heapLive.Load()
+	liveBasis := mheap_.sweepHeapLiveBasis
+	newHeapLive := spanBytes
+	if liveBasis < live {
+		// Only do this subtraction when we don't overflow. Otherwise, pagesTarget
+		// might be computed as something really huge, causing us to get stuck
+		// sweeping here until the next mark phase.
+		//
+		// Overflow can happen here if gcPaceSweeper is called concurrently with
+		// sweeping (i.e. not during a STW, like it usually is) because this code
+		// is intentionally racy. A concurrent call to gcPaceSweeper can happen
+		// if a GC tuning parameter is modified and we read an older value of
+		// heapLive than what was used to set the basis.
+		//
+		// This state should be transient, so it's fine to just let newHeapLive
+		// be a relatively small number. We'll probably just skip this attempt to
+		// sweep.
+		//
+		// See issue #57523.
+		newHeapLive += uintptr(live - liveBasis)
+	}
 	pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages)
 	for pagesTarget > int64(mheap_.pagesSwept.Load()-sweptBasis) {
 		if sweepone() == ^uintptr(0) {
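Concretely, the guard above avoids unsigned wraparound: if live were 10 MiB and liveBasis were 12 MiB, the old expression live - liveBasis on uint64 values would wrap to roughly 2^64 bytes, producing an enormous pagesTarget and keeping the caller sweeping far longer than intended. With the guard, newHeapLive simply stays at spanBytes for that attempt.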
@@ -862,7 +940,7 @@
 		// trigger. Compute the ratio of in-use pages to sweep
 		// per byte allocated, accounting for the fact that
 		// some might already be swept.
-		heapLiveBasis := atomic.Load64(&gcController.heapLive)
+		heapLiveBasis := gcController.heapLive.Load()
 		heapDistance := int64(trigger) - int64(heapLiveBasis)
 		// Add a little margin so rounding errors and
 		// concurrent sweep are less likely to leave pages
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 424de2f..7ab8975 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -7,6 +7,7 @@
 import (
 	"internal/goarch"
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -320,8 +321,8 @@
 	nobj int
 }
 
-//go:notinheap
 type workbuf struct {
+	_ sys.NotInHeap
 	workbufhdr
 	// account for the above fields
 	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / goarch.PtrSize]uintptr
@@ -420,7 +421,7 @@
 }
 
 // trygetfull tries to get a full or partially empty workbuffer.
-// If one is not immediately available return nil
+// If one is not immediately available, return nil.
 //
 //go:nowritebarrier
 func trygetfull() *workbuf {
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index b19a2ff..1401e92 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -12,6 +12,7 @@
 	"internal/cpu"
 	"internal/goarch"
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -57,15 +58,13 @@
 //
 // mheap must not be heap-allocated because it contains mSpanLists,
 // which must not be heap-allocated.
-//
-//go:notinheap
 type mheap struct {
+	_ sys.NotInHeap
+
 	// lock must only be acquired on the system stack, otherwise a g
 	// could self-deadlock if its stack grows with the lock held.
 	lock mutex
 
-	_ uint32 // 8-byte align pages so its alignment is consistent with tests.
-
 	pages pageAlloc // page allocation data structure
 
 	sweepgen uint32 // sweep generation, see comment in mspan; written during STW
@@ -83,8 +82,6 @@
 	// access (since that may free the backing store).
 	allspans []*mspan // all spans out there
 
-	// _ uint32 // align uint64 fields on 32-bit for atomics
-
 	// Proportional sweep
 	//
 	// These parameters represent a linear function from gcController.heapLive
@@ -103,13 +100,11 @@
 	// accounting for current progress. If we could only adjust
 	// the slope, it would create a discontinuity in debt if any
 	// progress has already been made.
-	pagesInUse         atomic.Uint64 // pages of spans in stats mSpanInUse
-	pagesSwept         atomic.Uint64 // pages swept this cycle
-	pagesSweptBasis    atomic.Uint64 // pagesSwept to use as the origin of the sweep ratio
-	sweepHeapLiveBasis uint64        // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without
-	sweepPagesPerByte  float64       // proportional sweep ratio; written with lock, read without
-	// TODO(austin): pagesInUse should be a uintptr, but the 386
-	// compiler can't 8-byte align fields.
+	pagesInUse         atomic.Uintptr // pages of spans in stats mSpanInUse
+	pagesSwept         atomic.Uint64  // pages swept this cycle
+	pagesSweptBasis    atomic.Uint64  // pagesSwept to use as the origin of the sweep ratio
+	sweepHeapLiveBasis uint64         // value of gcController.heapLive to use as the origin of sweep ratio; written with lock, read without
+	sweepPagesPerByte  float64        // proportional sweep ratio; written with lock, read without
 
 	// Page reclaimer state
 
@@ -190,8 +185,6 @@
 		base, end uintptr
 	}
 
-	_ uint32 // ensure 64-bit alignment of central
-
 	// central free lists for small size classes.
 	// the padding makes sure that the mcentrals are
 	// spaced CacheLinePadSize bytes apart, so that each mcentral.lock
@@ -199,7 +192,7 @@
 	// central is indexed by spanClass.
 	central [numSpanClasses]struct {
 		mcentral mcentral
-		pad      [cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize]byte
+		pad      [(cpu.CacheLinePadSize - unsafe.Sizeof(mcentral{})%cpu.CacheLinePadSize) % cpu.CacheLinePadSize]byte
 	}
 
 	spanalloc             fixalloc // allocator for span*
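The extra % cpu.CacheLinePadSize in the pad computation above only matters when unsafe.Sizeof(mcentral{}) is already a multiple of the pad size: in that case the old expression produced a full pad-size worth of unnecessary padding per entry, while the new one produces zero. For illustration (sizes invented), with a 64-byte pad size and a 128-byte mcentral, the old pad is 64 - 0 = 64 bytes and the new pad is (64 - 0) % 64 = 0 bytes.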
@@ -210,6 +203,25 @@
 	speciallock           mutex    // lock for special record allocators.
 	arenaHintAlloc        fixalloc // allocator for arenaHints
 
+	// User arena state.
+	//
+	// Protected by mheap_.lock.
+	userArena struct {
+		// arenaHints is a list of addresses at which to attempt to
+		// add more heap arenas for user arena chunks. This is initially
+		// populated with a set of general hint addresses, and grown with
+		// the bounds of actual heap arena ranges.
+		arenaHints *arenaHint
+
+		// quarantineList is a list of user arena spans that have been set to fault, but
+		// are waiting for all pointers into them to go away. Sweeping handles
+		// identifying when this is true, and moves the span to the ready list.
+		quarantineList mSpanList
+
+		// readyList is a list of empty user arena spans that are ready for reuse.
+		readyList mSpanList
+	}
+
 	unused *specialfinalizer // never set, just here to force the specialfinalizer type into DWARF
 }
 
@@ -217,13 +229,26 @@
 
 // A heapArena stores metadata for a heap arena. heapArenas are stored
 // outside of the Go heap and accessed via the mheap_.arenas index.
-//
-//go:notinheap
 type heapArena struct {
+	_ sys.NotInHeap
+
 	// bitmap stores the pointer/scalar bitmap for the words in
-	// this arena. See mbitmap.go for a description. Use the
-	// heapBits type to access this.
-	bitmap [heapArenaBitmapBytes]byte
+	// this arena. See mbitmap.go for a description.
+	// This array uses 1 bit per word of heap, or 1.6% of the heap size (for 64-bit).
+	bitmap [heapArenaBitmapWords]uintptr
+
+	// If the ith bit of noMorePtrs is true, then there are no more
+	// pointers for the object containing the word described by the
+	// high bit of bitmap[i].
+	// In that case, bitmap[i+1], ... must be zero until the start
+	// of the next object.
+	// We never operate on these entries using bit-parallel techniques,
+	// so it is ok if they are small. Also, they can't be bigger than
+	// uint16 because at that size a single noMorePtrs entry
+	// represents 8K of memory, the minimum size of a span. Any larger
+	// and we'd have to worry about concurrent updates.
+	// This array uses 1 bit per word of bitmap, or .024% of the heap size (for 64-bit).
+	noMorePtrs [heapArenaBitmapWords / 8]uint8
 
 	// spans maps from virtual address page ID within this arena to *mspan.
 	// For allocated spans, their pages map to the span itself.
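The overhead figures in the bitmap comments above follow from the layout: on a 64-bit system a heap word is 8 bytes, so one bitmap bit per heap word costs 1/64 of the heap, about 1.6%; noMorePtrs keeps one bit per bitmap word, i.e. 1/64 of the bitmap, or roughly 1.6% / 64 ≈ 0.024% of the heap.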
@@ -290,9 +315,8 @@
 
 // arenaHint is a hint for where to grow the heap arenas. See
 // mheap_.arenaHints.
-//
-//go:notinheap
 type arenaHint struct {
+	_    sys.NotInHeap
 	addr uintptr
 	down bool
 	next *arenaHint
@@ -347,34 +371,39 @@
 	"mSpanDead",
 	"mSpanInUse",
 	"mSpanManual",
-	"mSpanFree",
 }
 
-// mSpanStateBox holds an mSpanState and provides atomic operations on
-// it. This is a separate type to disallow accidental comparison or
-// assignment with mSpanState.
+// mSpanStateBox holds an atomic.Uint8 to provide atomic operations on
+// an mSpanState. This is a separate type to disallow accidental comparison
+// or assignment with mSpanState.
 type mSpanStateBox struct {
-	s mSpanState
+	s atomic.Uint8
 }
 
+// It is nosplit to match get, below.
+
+//go:nosplit
 func (b *mSpanStateBox) set(s mSpanState) {
-	atomic.Store8((*uint8)(&b.s), uint8(s))
+	b.s.Store(uint8(s))
 }
 
+// It is nosplit because it's called indirectly by typedmemclr,
+// which must not be preempted.
+
+//go:nosplit
 func (b *mSpanStateBox) get() mSpanState {
-	return mSpanState(atomic.Load8((*uint8)(&b.s)))
+	return mSpanState(b.s.Load())
 }
 
 // mSpanList heads a linked list of spans.
-//
-//go:notinheap
 type mSpanList struct {
+	_     sys.NotInHeap
 	first *mspan // first span in list, or nil if none
 	last  *mspan // last span in list, or nil if none
 }
 
-//go:notinheap
 type mspan struct {
+	_    sys.NotInHeap
 	next *mspan     // next span in list, or nil if none
 	prev *mspan     // previous span in list, or nil if none
 	list *mSpanList // For debugging. TODO: Remove.
@@ -451,11 +480,21 @@
 	spanclass             spanClass     // size class and noscan (uint8)
 	state                 mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
 	needzero              uint8         // needs to be zeroed before allocation
+	isUserArenaChunk      bool          // whether or not this span represents a user arena
 	allocCountBeforeCache uint16        // a copy of allocCount that is stored just before this span is cached
 	elemsize              uintptr       // computed from sizeclass or from npages
 	limit                 uintptr       // end of data in span
 	speciallock           mutex         // guards specials list
 	specials              *special      // linked list of special records sorted by offset.
+	userArenaChunkFree    addrRange     // interval for managing chunk allocation
+
+	// freeIndexForScan is like freeindex, except that freeindex is
+	// used by the allocator whereas freeIndexForScan is used by the
+	// GC scanner. They are two fields so that the GC sees the object
+	// is allocated only when the object and the heap bits are
+	// initialized (see also the assignment of freeIndexForScan in
+	// mallocgc, and issue 54596).
+	freeIndexForScan uintptr
 }
 
 func (s *mspan) base() uintptr {
@@ -565,6 +604,12 @@
 
 type arenaIdx uint
 
+// l1 returns the "l1" portion of an arenaIdx.
+//
+// Marked nosplit because it's called by spanOf and other nosplit
+// functions.
+//
+//go:nosplit
 func (i arenaIdx) l1() uint {
 	if arenaL1Bits == 0 {
 		// Let the compiler optimize this away if there's no
@@ -575,6 +620,12 @@
 	}
 }
 
+// l2 returns the "l2" portion of an arenaIdx.
+//
+// Marked nosplit because it's called by spanOf and other nosplit
+// functions.
+//
+//go:nosplit
 func (i arenaIdx) l2() uint {
 	if arenaL1Bits == 0 {
 		return uint(i)
@@ -1183,6 +1234,7 @@
 		base = alignUp(base, physPageSize)
 		scav = h.pages.allocRange(base, npages)
 	}
+
 	if base == 0 {
 		// Try to acquire a base address.
 		base, scav = h.pages.alloc(npages)
@@ -1207,56 +1259,6 @@
 	unlock(&h.lock)
 
 HaveSpan:
-	// At this point, both s != nil and base != 0, and the heap
-	// lock is no longer held. Initialize the span.
-	s.init(base, npages)
-	if h.allocNeedsZero(base, npages) {
-		s.needzero = 1
-	}
-	nbytes := npages * pageSize
-	if typ.manual() {
-		s.manualFreeList = 0
-		s.nelems = 0
-		s.limit = s.base() + s.npages*pageSize
-		s.state.set(mSpanManual)
-	} else {
-		// We must set span properties before the span is published anywhere
-		// since we're not holding the heap lock.
-		s.spanclass = spanclass
-		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
-			s.elemsize = nbytes
-			s.nelems = 1
-			s.divMul = 0
-		} else {
-			s.elemsize = uintptr(class_to_size[sizeclass])
-			s.nelems = nbytes / s.elemsize
-			s.divMul = class_to_divmagic[sizeclass]
-		}
-
-		// Initialize mark and allocation structures.
-		s.freeindex = 0
-		s.allocCache = ^uint64(0) // all 1s indicating all free.
-		s.gcmarkBits = newMarkBits(s.nelems)
-		s.allocBits = newAllocBits(s.nelems)
-
-		// It's safe to access h.sweepgen without the heap lock because it's
-		// only ever updated with the world stopped and we run on the
-		// systemstack which blocks a STW transition.
-		atomic.Store(&s.sweepgen, h.sweepgen)
-
-		// Now that the span is filled in, set its state. This
-		// is a publication barrier for the other fields in
-		// the span. While valid pointers into this span
-		// should never be visible until the span is returned,
-		// if the garbage collector finds an invalid pointer,
-		// access to the span may race with initialization of
-		// the span. We resolve this race by atomically
-		// setting the state after the span is fully
-		// initialized, and atomically checking the state in
-		// any situation where a pointer is suspect.
-		s.state.set(mSpanInUse)
-	}
-
 	// Decide if we need to scavenge in response to what we just allocated.
 	// Specifically, we track the maximum amount of memory to scavenge of all
 	// the alternatives below, assuming that the maximum satisfies *all*
@@ -1304,6 +1306,7 @@
 	// There are a few very limited circumstances where we won't have a P here.
 	// It's OK to simply skip scavenging in these cases. Something else will notice
 	// and pick up the tab.
+	var now int64
 	if pp != nil && bytesToScavenge > 0 {
 		// Measure how long we spent scavenging and add that measurement to the assist
 		// time so we can track it for the GC CPU limiter.
@@ -1319,14 +1322,18 @@
 		})
 
 		// Finish up accounting.
-		now := nanotime()
+		now = nanotime()
 		if track {
 			pp.limiterEvent.stop(limiterEventScavengeAssist, now)
 		}
-		h.pages.scav.assistTime.Add(now - start)
+		scavenge.assistTime.Add(now - start)
 	}
 
+	// Initialize the span.
+	h.initSpan(s, typ, spanclass, base, npages)
+
 	// Commit and account for any scavenged memory that the span now owns.
+	nbytes := npages * pageSize
 	if scav != 0 {
 		// sysUsed all the pages that are actually available
 		// in the span since some of them might be scavenged.
@@ -1354,6 +1361,64 @@
 	}
 	memstats.heapStats.release()
 
+	pageTraceAlloc(pp, now, base, npages)
+	return s
+}
+
+// initSpan initializes a blank span s which will represent the range
+// [base, base+npages*pageSize). typ is the type of span being allocated.
+func (h *mheap) initSpan(s *mspan, typ spanAllocType, spanclass spanClass, base, npages uintptr) {
+	// At this point, both s != nil and base != 0, and the heap
+	// lock is no longer held. Initialize the span.
+	s.init(base, npages)
+	if h.allocNeedsZero(base, npages) {
+		s.needzero = 1
+	}
+	nbytes := npages * pageSize
+	if typ.manual() {
+		s.manualFreeList = 0
+		s.nelems = 0
+		s.limit = s.base() + s.npages*pageSize
+		s.state.set(mSpanManual)
+	} else {
+		// We must set span properties before the span is published anywhere
+		// since we're not holding the heap lock.
+		s.spanclass = spanclass
+		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
+			s.elemsize = nbytes
+			s.nelems = 1
+			s.divMul = 0
+		} else {
+			s.elemsize = uintptr(class_to_size[sizeclass])
+			s.nelems = nbytes / s.elemsize
+			s.divMul = class_to_divmagic[sizeclass]
+		}
+
+		// Initialize mark and allocation structures.
+		s.freeindex = 0
+		s.freeIndexForScan = 0
+		s.allocCache = ^uint64(0) // all 1s indicating all free.
+		s.gcmarkBits = newMarkBits(s.nelems)
+		s.allocBits = newAllocBits(s.nelems)
+
+		// It's safe to access h.sweepgen without the heap lock because it's
+		// only ever updated with the world stopped and we run on the
+		// systemstack which blocks a STW transition.
+		atomic.Store(&s.sweepgen, h.sweepgen)
+
+		// Now that the span is filled in, set its state. This
+		// is a publication barrier for the other fields in
+		// the span. While valid pointers into this span
+		// should never be visible until the span is returned,
+		// if the garbage collector finds an invalid pointer,
+		// access to the span may race with initialization of
+		// the span. We resolve this race by atomically
+		// setting the state after the span is fully
+		// initialized, and atomically checking the state in
+		// any situation where a pointer is suspect.
+		s.state.set(mSpanInUse)
+	}
+
 	// Publish the span in various locations.
 
 	// This is safe to call without the lock held because the slots
@@ -1373,14 +1438,12 @@
 		atomic.Or8(&arena.pageInUse[pageIdx], pageMask)
 
 		// Update related page sweeper stats.
-		h.pagesInUse.Add(int64(npages))
+		h.pagesInUse.Add(npages)
 	}
 
 	// Make sure the newly allocated span will be observed
 	// by the GC before pointers into the span are published.
 	publicationBarrier()
-
-	return s
 }
 
 // Try to add at least npage pages of memory to the heap,
@@ -1406,7 +1469,7 @@
 		// Not enough room in the current arena. Allocate more
 		// arena space. This may not be contiguous with the
 		// current arena, so we have to request the full ask.
-		av, asize := h.sysAlloc(ask)
+		av, asize := h.sysAlloc(ask, &h.arenaHints, true)
 		if av == nil {
 			inUse := gcController.heapFree.load() + gcController.heapReleased.load() + gcController.heapInUse.load()
 			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", inUse, " in use)\n")
@@ -1474,6 +1537,8 @@
 // Free the span back into the heap.
 func (h *mheap) freeSpan(s *mspan) {
 	systemstack(func() {
+		pageTraceFree(getg().m.p.ptr(), 0, s.base(), s.npages)
+
 		lock(&h.lock)
 		if msanenabled {
 			// Tell msan that this entire span is no longer in use.
@@ -1504,6 +1569,8 @@
 //
 //go:systemstack
 func (h *mheap) freeManual(s *mspan, typ spanAllocType) {
+	pageTraceFree(getg().m.p.ptr(), 0, s.base(), s.npages)
+
 	s.needzero = 1
 	lock(&h.lock)
 	h.freeSpanLocked(s, typ)
@@ -1519,11 +1586,14 @@
 			throw("mheap.freeSpanLocked - invalid stack free")
 		}
 	case mSpanInUse:
+		if s.isUserArenaChunk {
+			throw("mheap.freeSpanLocked - invalid free of user arena chunk")
+		}
 		if s.allocCount != 0 || s.sweepgen != h.sweepgen {
 			print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
 			throw("mheap.freeSpanLocked - invalid free")
 		}
-		h.pagesInUse.Add(-int64(s.npages))
+		h.pagesInUse.Add(-s.npages)
 
 		// Clear in-use bit in arena page bitmap.
 		arena, pageIdx, pageMask := pageIndexOf(s.base())
@@ -1602,6 +1672,7 @@
 	span.specials = nil
 	span.needzero = 0
 	span.freeindex = 0
+	span.freeIndexForScan = 0
 	span.allocBits = nil
 	span.gcmarkBits = nil
 	span.state.set(mSpanDead)
@@ -1715,8 +1786,8 @@
 	// if that happens.
 )
 
-//go:notinheap
 type special struct {
+	_      sys.NotInHeap
 	next   *special // linked list in span
 	offset uint16   // span offset of object
 	kind   byte     // kind of special
@@ -1836,9 +1907,8 @@
 //
 // specialfinalizer is allocated from non-GC'd memory, so any heap
 // pointers must be specially handled.
-//
-//go:notinheap
 type specialfinalizer struct {
+	_       sys.NotInHeap
 	special special
 	fn      *funcval // May be a heap pointer.
 	nret    uintptr
@@ -1862,12 +1932,14 @@
 		// situation where it's possible that markrootSpans
 		// has already run but mark termination hasn't yet.
 		if gcphase != _GCoff {
-			base, _, _ := findObject(uintptr(p), 0, 0)
+			base, span, _ := findObject(uintptr(p), 0, 0)
 			mp := acquirem()
 			gcw := &mp.p.ptr().gcw
 			// Mark everything reachable from the object
 			// so it's retained for the finalizer.
-			scanobject(base, gcw)
+			if !span.spanclass.noscan() {
+				scanobject(base, gcw)
+			}
 			// Mark the finalizer itself, since the
 			// special isn't part of the GC'd heap.
 			scanblock(uintptr(unsafe.Pointer(&s.fn)), goarch.PtrSize, &oneptrmask[0], gcw, nil)
@@ -1895,9 +1967,8 @@
 }
 
 // The described object is being heap profiled.
-//
-//go:notinheap
 type specialprofile struct {
+	_       sys.NotInHeap
 	special special
 	b       *bucket
 }
@@ -1976,14 +2047,15 @@
 	}
 }
 
-// gcBits is an alloc/mark bitmap. This is always used as *gcBits.
-//
-//go:notinheap
-type gcBits uint8
+// gcBits is an alloc/mark bitmap. This is always used as gcBits.x.
+type gcBits struct {
+	_ sys.NotInHeap
+	x uint8
+}
 
 // bytep returns a pointer to the n'th byte of b.
 func (b *gcBits) bytep(n uintptr) *uint8 {
-	return addb((*uint8)(b), n)
+	return addb(&b.x, n)
 }
 
 // bitp returns a pointer to the byte containing bit n and a mask for
@@ -2000,8 +2072,8 @@
 	next uintptr // *gcBits triggers recursive type bug. (issue 14620)
 }
 
-//go:notinheap
 type gcBitsArena struct {
+	_ sys.NotInHeap
 	// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
 	free uintptr // free is the index into bits of the next free byte; read/write atomically
 	next *gcBitsArena
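
The hunks above replace the //go:notinheap pragma with an embedded runtime/internal/sys.NotInHeap marker field. That type is internal to the runtime and cannot be imported elsewhere, so the sketch below uses a hypothetical zero-sized stand-in (notInHeap) purely to show the shape of the change; it does not carry the compiler-enforced not-in-heap semantics.

// Schematic illustration of the marker-embedding pattern this diff applies.
// notInHeap is a stand-in for runtime/internal/sys.NotInHeap, used only to
// show the shape of the change.
package main

import "fmt"

type notInHeap struct{} // stand-in marker; zero-sized

// Before: //go:notinheap on the type declaration.
// After: embed the marker as a blank field; inside the runtime, any struct
// containing sys.NotInHeap is treated as not allocatable from the GC'd heap.
type special struct {
	_      notInHeap
	offset uint16
	kind   byte
}

func main() {
	s := special{offset: 8, kind: 1}
	fmt.Println(s.offset, s.kind)
}
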
diff --git a/src/runtime/mklockrank.go b/src/runtime/mklockrank.go
new file mode 100644
index 0000000..bc15e57
--- /dev/null
+++ b/src/runtime/mklockrank.go
@@ -0,0 +1,366 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build ignore
+
+// mklockrank records the static rank graph of the locks in the
+// runtime and generates the rank checking structures in lockrank.go.
+package main
+
+import (
+	"bytes"
+	"flag"
+	"fmt"
+	"go/format"
+	"internal/dag"
+	"io"
+	"log"
+	"os"
+	"strings"
+)
+
+// ranks describes the lock rank graph. See "go doc internal/dag" for
+// the syntax.
+//
+// "a < b" means a must be acquired before b if both are held
+// (or, if b is held, a cannot be acquired).
+//
+// "NONE < a" means no locks may be held when a is acquired.
+//
+// If a lock is not given a rank, then it is assumed to be a leaf
+// lock, which means no other lock can be acquired while it is held.
+// Therefore, leaf locks do not need to be given an explicit rank.
+//
+// Ranks in all caps are pseudo-nodes that help define order, but do
+// not actually define a rank.
+//
+// TODO: It's often hard to correlate rank names to locks. Change
+// these to be more consistent with the locks they label.
+const ranks = `
+# Sysmon
+NONE
+< sysmon
+< scavenge, forcegc;
+
+# Defer
+NONE < defer;
+
+# GC
+NONE <
+  sweepWaiters,
+  assistQueue,
+  sweep;
+
+# Scheduler, timers, netpoll
+NONE < pollDesc, cpuprof;
+assistQueue,
+  cpuprof,
+  forcegc,
+  pollDesc, # pollDesc can interact with timers, which can lock sched.
+  scavenge,
+  sweep,
+  sweepWaiters
+< sched;
+sched < allg, allp;
+allp < timers;
+timers < netpollInit;
+
+# Channels
+scavenge, sweep < hchan;
+NONE < notifyList;
+hchan, notifyList < sudog;
+
+# RWMutex
+NONE < rwmutexW;
+rwmutexW, sysmon < rwmutexR;
+
+# Semaphores
+NONE < root;
+
+# Itabs
+NONE
+< itab
+< reflectOffs;
+
+# User arena state
+NONE < userArenaState;
+
+# Tracing without a P uses a global trace buffer.
+scavenge
+# Above TRACEGLOBAL can emit a trace event without a P.
+< TRACEGLOBAL
+# Below TRACEGLOBAL manages the global tracing buffer.
+# Note that traceBuf eventually chains to MALLOC, but we never get that far
+# in the situation where there's no P.
+< traceBuf;
+# Starting/stopping tracing traces strings.
+traceBuf < traceStrings;
+
+# Malloc
+allg,
+  hchan,
+  notifyList,
+  reflectOffs,
+  timers,
+  traceStrings,
+  userArenaState
+# Above MALLOC are things that can allocate memory.
+< MALLOC
+# Below MALLOC is the malloc implementation.
+< fin,
+  gcBitsArenas,
+  mheapSpecial,
+  mspanSpecial,
+  spanSetSpine,
+  MPROF;
+
+# Memory profiling
+MPROF < profInsert, profBlock, profMemActive;
+profMemActive < profMemFuture;
+
+# Stack allocation and copying
+gcBitsArenas,
+  netpollInit,
+  profBlock,
+  profInsert,
+  profMemFuture,
+  spanSetSpine,
+  fin,
+  root
+# Anything that can grow the stack can acquire STACKGROW.
+# (Most higher layers imply STACKGROW, like MALLOC.)
+< STACKGROW
+# Below STACKGROW is the stack allocator/copying implementation.
+< gscan;
+gscan, rwmutexR < stackpool;
+gscan < stackLarge;
+# Generally, hchan must be acquired before gscan. But in one case,
+# where we suspend a G and then shrink its stack, syncadjustsudogs
+# can acquire hchan locks while holding gscan. To allow this case,
+# we use hchanLeaf instead of hchan.
+gscan < hchanLeaf;
+
+# Write barrier
+defer,
+  gscan,
+  mspanSpecial,
+  sudog
+# Anything that can have write barriers can acquire WB.
+# Above WB, we can have write barriers.
+< WB
+# Below WB is the write barrier implementation.
+< wbufSpans;
+
+# Span allocator
+stackLarge,
+  stackpool,
+  wbufSpans
+# Above mheap is anything that can call the span allocator.
+< mheap;
+# Below mheap is the span allocator implementation.
+mheap, mheapSpecial < globalAlloc;
+
+# Execution tracer events (with a P)
+hchan,
+  mheap,
+  root,
+  sched,
+  traceStrings,
+  notifyList,
+  fin
+# Above TRACE is anything that can create a trace event
+< TRACE
+< trace
+< traceStackTab;
+
+# panic is handled specially. It is implicitly below all other locks.
+NONE < panic;
+# deadlock is not acquired while holding panic, but it also needs to be
+# below all other locks.
+panic < deadlock;
+`
+
+// cyclicRanks lists lock ranks that allow multiple locks of the same
+// rank to be acquired simultaneously. The runtime enforces ordering
+// within these ranks using a separate mechanism.
+var cyclicRanks = map[string]bool{
+	// Multiple timers are locked simultaneously in destroy().
+	"timers": true,
+	// Multiple hchans are acquired in hchan.sortkey() order in
+	// select.
+	"hchan": true,
+	// Multiple hchanLeafs are acquired in hchan.sortkey() order in
+	// syncadjustsudogs().
+	"hchanLeaf": true,
+	// The point of the deadlock lock is to deadlock.
+	"deadlock": true,
+}
+
+func main() {
+	flagO := flag.String("o", "", "write to `file` instead of stdout")
+	flagDot := flag.Bool("dot", false, "emit graphviz output instead of Go")
+	flag.Parse()
+	if flag.NArg() != 0 {
+		fmt.Fprintf(os.Stderr, "too many arguments\n")
+		os.Exit(2)
+	}
+
+	g, err := dag.Parse(ranks)
+	if err != nil {
+		log.Fatal(err)
+	}
+
+	var out []byte
+	if *flagDot {
+		var b bytes.Buffer
+		g.TransitiveReduction()
+		// Add cyclic edges for visualization.
+		for k := range cyclicRanks {
+			g.AddEdge(k, k)
+		}
+		// Reverse the graph. It's much easier to read this as
+		// a "<" partial order than a ">" partial order. This
+		// way, locks are acquired from the top going down
+		// and time moves forward over the edges instead of
+		// backward.
+		g.Transpose()
+		generateDot(&b, g)
+		out = b.Bytes()
+	} else {
+		var b bytes.Buffer
+		generateGo(&b, g)
+		out, err = format.Source(b.Bytes())
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	if *flagO != "" {
+		err = os.WriteFile(*flagO, out, 0666)
+	} else {
+		_, err = os.Stdout.Write(out)
+	}
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+
+func generateGo(w io.Writer, g *dag.Graph) {
+	fmt.Fprintf(w, `// Code generated by mklockrank.go; DO NOT EDIT.
+
+package runtime
+
+type lockRank int
+
+`)
+
+	// Create numeric ranks.
+	topo := g.Topo()
+	for i, j := 0, len(topo)-1; i < j; i, j = i+1, j-1 {
+		topo[i], topo[j] = topo[j], topo[i]
+	}
+	fmt.Fprintf(w, `
+// Constants representing the ranks of all non-leaf runtime locks, in rank order.
+// Locks with lower rank must be taken before locks with higher rank,
+// in addition to satisfying the partial order in lockPartialOrder.
+// A few ranks allow self-cycles, which are specified in lockPartialOrder.
+const (
+	lockRankUnknown lockRank = iota
+
+`)
+	for _, rank := range topo {
+		if isPseudo(rank) {
+			fmt.Fprintf(w, "\t// %s\n", rank)
+		} else {
+			fmt.Fprintf(w, "\t%s\n", cname(rank))
+		}
+	}
+	fmt.Fprintf(w, `)
+
+// lockRankLeafRank is the rank of a lock that does not have a declared rank,
+// and hence is a leaf lock.
+const lockRankLeafRank lockRank = 1000
+`)
+
+	// Create string table.
+	fmt.Fprintf(w, `
+// lockNames gives the names associated with each of the above ranks.
+var lockNames = []string{
+`)
+	for _, rank := range topo {
+		if !isPseudo(rank) {
+			fmt.Fprintf(w, "\t%s: %q,\n", cname(rank), rank)
+		}
+	}
+	fmt.Fprintf(w, `}
+
+func (rank lockRank) String() string {
+	if rank == 0 {
+		return "UNKNOWN"
+	}
+	if rank == lockRankLeafRank {
+		return "LEAF"
+	}
+	if rank < 0 || int(rank) >= len(lockNames) {
+		return "BAD RANK"
+	}
+	return lockNames[rank]
+}
+`)
+
+	// Create partial order structure.
+	fmt.Fprintf(w, `
+// lockPartialOrder is the transitive closure of the lock rank graph.
+// An entry for rank X lists all of the ranks that can already be held
+// when rank X is acquired.
+//
+// Lock ranks that allow self-cycles list themselves.
+var lockPartialOrder [][]lockRank = [][]lockRank{
+`)
+	for _, rank := range topo {
+		if isPseudo(rank) {
+			continue
+		}
+		list := []string{}
+		for _, before := range g.Edges(rank) {
+			if !isPseudo(before) {
+				list = append(list, cname(before))
+			}
+		}
+		if cyclicRanks[rank] {
+			list = append(list, cname(rank))
+		}
+
+		fmt.Fprintf(w, "\t%s: {%s},\n", cname(rank), strings.Join(list, ", "))
+	}
+	fmt.Fprintf(w, "}\n")
+}
+
+// cname returns the Go const name for the given lock rank label.
+func cname(label string) string {
+	return "lockRank" + strings.ToUpper(label[:1]) + label[1:]
+}
+
+func isPseudo(label string) bool {
+	return strings.ToUpper(label) == label
+}
+
+// generateDot emits a Graphviz dot representation of g to w.
+func generateDot(w io.Writer, g *dag.Graph) {
+	fmt.Fprintf(w, "digraph g {\n")
+
+	// Define all nodes.
+	for _, node := range g.Nodes {
+		fmt.Fprintf(w, "%q;\n", node)
+	}
+
+	// Create edges.
+	for _, node := range g.Nodes {
+		for _, to := range g.Edges(node) {
+			fmt.Fprintf(w, "%q -> %q;\n", node, to)
+		}
+	}
+
+	fmt.Fprintf(w, "}\n")
+}
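
mklockrank.go generates a lockPartialOrder table in which the entry for rank X lists every rank that may already be held when X is acquired. The sketch below shows one way such a table can be consumed; the ranks and edges are invented for illustration and are not the runtime's real lock ranks.

// Minimal sketch of consuming a generated partial-order table like
// lockPartialOrder. Ranks and edges here are illustrative only.
package main

import "fmt"

type lockRank int

const (
	rankSched lockRank = iota
	rankAllg
	rankMheap
)

// partialOrder[r] lists the ranks that may already be held when r is acquired.
var partialOrder = [][]lockRank{
	rankSched: {},
	rankAllg:  {rankSched},
	rankMheap: {rankSched, rankAllg},
}

// checkAcquire reports whether acquiring rank next is allowed while the
// ranks in held are already held.
func checkAcquire(held []lockRank, next lockRank) bool {
	for _, h := range held {
		ok := false
		for _, allowed := range partialOrder[next] {
			if h == allowed {
				ok = true
				break
			}
		}
		if !ok {
			return false
		}
	}
	return true
}

func main() {
	fmt.Println(checkAcquire([]lockRank{rankSched}, rankMheap)) // true
	fmt.Println(checkAcquire([]lockRank{rankMheap}, rankSched)) // false
}
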
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index 28befcb..61d2d02 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -126,6 +126,9 @@
 		fmt.Fprintf(out, "//go:build %s || %sle\n\n", base, base)
 	}
 	fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+	if arch == "amd64" {
+		fmt.Fprintf(out, "#include \"asm_amd64.h\"\n")
+	}
 	fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
 	fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
 }
@@ -267,8 +270,10 @@
 	// Clear the upper bits to get to a clean state. See issue #37174.
 	// It is safe here as Go code don't use the upper bits of Y registers.
 	p("#ifdef GOOS_darwin")
+	p("#ifndef hasAVX")
 	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
 	p("JE 2(PC)")
+	p("#endif")
 	p("VZEROUPPER")
 	p("#endif")
 
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
index 3280a62..f0183f6 100644
--- a/src/runtime/mmap.go
+++ b/src/runtime/mmap.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !aix && !darwin && !js && (!linux || !amd64) && (!linux || !arm64) && !openbsd && !plan9 && !solaris && !windows
+//go:build !aix && !darwin && !js && (!linux || !amd64) && (!linux || !arm64) && (!freebsd || !amd64) && !openbsd && !plan9 && !solaris && !windows
 
 package runtime
 
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index 5de25cf..35b2a01 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -48,7 +48,6 @@
 package runtime
 
 import (
-	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -107,7 +106,7 @@
 	return chunkIdx((p - arenaBaseOffset) / pallocChunkBytes)
 }
 
-// chunkIndex returns the base address of the palloc chunk at index ci.
+// chunkBase returns the base address of the palloc chunk at index ci.
 func chunkBase(ci chunkIdx) uintptr {
 	return uintptr(ci)*pallocChunkBytes + arenaBaseOffset
 }
@@ -267,25 +266,16 @@
 	// All access is protected by the mheapLock.
 	inUse addrRanges
 
-	_ uint32 // Align scav so it's easier to reason about alignment within scav.
-
 	// scav stores the scavenger state.
 	scav struct {
 		// index is an efficient index of chunks that have pages available to
 		// scavenge.
 		index scavengeIndex
 
-		// released is the amount of memory released this generation.
+		// released is the amount of memory released this scavenge cycle.
 		//
 		// Updated atomically.
 		released uintptr
-
-		_ uint32 // Align assistTime for atomics on 32-bit platforms.
-
-		// scavengeAssistTime is the time spent scavenging in the last GC cycle.
-		//
-		// This is reset once a GC cycle ends.
-		assistTime atomic.Int64
 	}
 
 	// mheap_.lock. This level of indirection makes it possible
@@ -395,14 +385,13 @@
 	for c := chunkIndex(base); c < chunkIndex(limit); c++ {
 		if p.chunks[c.l1()] == nil {
 			// Create the necessary l2 entry.
-			//
-			// Store it atomically to avoid races with readers which
-			// don't acquire the heap lock.
 			r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat)
 			if r == nil {
 				throw("pageAlloc: out of memory")
 			}
-			atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r)
+			// Store the new chunk block but avoid a write barrier.
+			// grow is used in call chains that disallow write barriers.
+			*(*uintptr)(unsafe.Pointer(&p.chunks[c.l1()])) = uintptr(r)
 		}
 		p.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
 	}
@@ -678,7 +667,7 @@
 
 		// Determine j0, the first index we should start iterating from.
 		// The searchAddr may help us eliminate iterations if we followed the
-		// searchAddr on the previous level or we're on the root leve, in which
+		// searchAddr on the previous level or we're on the root level, in which
 		// case the searchAddr should be the same as i after levelShift.
 		j0 := 0
 		if searchIdx := offAddrToLevelIndex(l, p.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go
index 5bad4f7..5bc9c84 100644
--- a/src/runtime/mpagecache.go
+++ b/src/runtime/mpagecache.go
@@ -21,8 +21,7 @@
 	scav  uint64  // 64-bit bitmap representing scavenged pages (1 means scavenged)
 }
 
-// empty returns true if the pageCache has any free pages, and false
-// otherwise.
+// empty reports whether the page cache has no free pages.
 func (c *pageCache) empty() bool {
 	return c.cache == 0
 }
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 99a67b9..24f8889 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -10,6 +10,7 @@
 import (
 	"internal/abi"
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -57,9 +58,8 @@
 // creation, including its next and allnext links.
 //
 // No heap pointers.
-//
-//go:notinheap
 type bucket struct {
+	_       sys.NotInHeap
 	next    *bucket
 	allnext *bucket
 	typ     bucketType // memBucket or blockBucket (includes mutexProfile)
@@ -510,10 +510,18 @@
 	bp := b.bp()
 
 	lock(&profBlockLock)
+	// We want to up-scale the count and cycles according to the
+	// probability that the event was sampled. For block profile events,
+	// the sample probability is 1 if cycles >= rate, and cycles / rate
+	// otherwise. For mutex profile events, the sample probability is 1 / rate.
+	// We scale the events by 1 / (probability the event was sampled).
 	if which == blockProfile && cycles < rate {
 		// Remove sampling bias, see discussion on http://golang.org/cl/299991.
 		bp.count += float64(rate) / float64(cycles)
 		bp.cycles += rate
+	} else if which == mutexProfile {
+		bp.count += float64(rate)
+		bp.cycles += rate * cycles
 	} else {
 		bp.count++
 		bp.cycles += cycles
@@ -584,17 +592,7 @@
 // memory profiling rate should do so just once, as early as
 // possible in the execution of the program (for example,
 // at the beginning of main).
-var MemProfileRate int = defaultMemProfileRate(512 * 1024)
-
-// defaultMemProfileRate returns 0 if disableMemoryProfiling is set.
-// It exists primarily for the godoc rendering of MemProfileRate
-// above.
-func defaultMemProfileRate(v int) int {
-	if disableMemoryProfiling {
-		return 0
-	}
-	return v
-}
+var MemProfileRate int = 512 * 1024
 
 // disableMemoryProfiling is set by the linker if runtime.MemProfile
 // is not used and the link type guarantees nobody else could use it
@@ -917,7 +915,7 @@
 	// doesn't change during the collection. So, check the finalizer goroutine
 	// in particular.
 	n = int(gcount())
-	if fingRunning {
+	if fingStatus.Load()&fingRunningFinalizer != 0 {
 		n++
 	}
 
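
The mprof.go change above up-scales each sampled event by 1 / (probability the event was sampled): cycles/rate for block events with cycles < rate, and 1/rate for mutex events. The standalone sketch below just reproduces that arithmetic with illustrative numbers.

// Sketch of the up-scaling described in the mprof.go comment: each sampled
// event is weighted by 1 / (probability it was sampled).
package main

import "fmt"

// blockSample returns the count and cycles to add for a block profile event:
// the sample probability is cycles/rate when cycles < rate, and 1 otherwise.
func blockSample(cycles, rate int64) (count float64, totalCycles int64) {
	if cycles < rate {
		return float64(rate) / float64(cycles), rate
	}
	return 1, cycles
}

// mutexSample returns the count and cycles to add for a mutex profile event,
// whose sample probability is 1/rate.
func mutexSample(cycles, rate int64) (count float64, totalCycles int64) {
	return float64(rate), rate * cycles
}

func main() {
	c, cy := blockSample(100, 1000) // sampled with probability 100/1000
	fmt.Println(c, cy)              // 10 1000
	c, cy = mutexSample(100, 5) // sampled with probability 1/5
	fmt.Println(c, cy)          // 5 500
}
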
diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go
index 9cf83cc..4388d26 100644
--- a/src/runtime/mranges.go
+++ b/src/runtime/mranges.go
@@ -70,6 +70,30 @@
 	return a
 }
 
+// takeFromFront takes len bytes from the front of the address range, aligning
+// the base to align first. On success, returns the aligned start of the region
+// taken and true.
+func (a *addrRange) takeFromFront(len uintptr, align uint8) (uintptr, bool) {
+	base := alignUp(a.base.addr(), uintptr(align)) + len
+	if base > a.limit.addr() {
+		return 0, false
+	}
+	a.base = offAddr{base}
+	return base - len, true
+}
+
+// takeFromBack takes len bytes from the end of the address range, aligning
+// the limit to align after subtracting len. On success, returns the aligned
+// start of the region taken and true.
+func (a *addrRange) takeFromBack(len uintptr, align uint8) (uintptr, bool) {
+	limit := alignDown(a.limit.addr()-len, uintptr(align))
+	if a.base.addr() > limit {
+		return 0, false
+	}
+	a.limit = offAddr{limit}
+	return limit, true
+}
+
 // removeGreaterEqual removes all addresses in a greater than or equal
 // to addr and returns the new range.
 func (a addrRange) removeGreaterEqual(addr uintptr) addrRange {
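
takeFromFront aligns the base up, then checks that the aligned region of len bytes still fits below the limit. Below is a self-contained sketch of the same arithmetic on a plain [base, limit) pair; offAddr and the runtime's alignUp helper are replaced with local stand-ins.

// Self-contained sketch of the takeFromFront arithmetic added in mranges.go.
package main

import "fmt"

func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

type addrRange struct{ base, limit uintptr }

// takeFromFront takes length bytes from the front of the range, aligning the
// base to align first, and returns the aligned start of the taken region.
func (a *addrRange) takeFromFront(length, align uintptr) (uintptr, bool) {
	base := alignUp(a.base, align) + length
	if base > a.limit {
		return 0, false
	}
	a.base = base
	return base - length, true
}

func main() {
	r := addrRange{base: 0x1003, limit: 0x2000}
	start, ok := r.takeFromFront(0x100, 8)
	fmt.Printf("%#x %v new base %#x\n", start, ok, r.base) // 0x1008 true new base 0x1108
}
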
diff --git a/src/runtime/msan.go b/src/runtime/msan.go
index c485216..5e2aae1 100644
--- a/src/runtime/msan.go
+++ b/src/runtime/msan.go
@@ -31,8 +31,8 @@
 //
 //go:nosplit
 func msanread(addr unsafe.Pointer, sz uintptr) {
-	g := getg()
-	if g == nil || g.m == nil || g == g.m.g0 || g == g.m.gsignal {
+	gp := getg()
+	if gp == nil || gp.m == nil || gp == gp.m.g0 || gp == gp.m.gsignal {
 		return
 	}
 	domsanread(addr, sz)
diff --git a/src/runtime/msan/msan.go b/src/runtime/msan/msan.go
index f1bf4e1..4e41f85 100644
--- a/src/runtime/msan/msan.go
+++ b/src/runtime/msan/msan.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build msan && linux && (amd64 || arm64)
+//go:build msan && ((linux && (amd64 || arm64)) || (freebsd && amd64))
 
 package msan
 
diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go
index 4158495..abbd450 100644
--- a/src/runtime/mspanset.go
+++ b/src/runtime/mspanset.go
@@ -33,9 +33,9 @@
 	// anyway. (In principle, we could do this during STW.)
 
 	spineLock mutex
-	spine     unsafe.Pointer // *[N]*spanSetBlock, accessed atomically
-	spineLen  uintptr        // Spine array length, accessed atomically
-	spineCap  uintptr        // Spine array cap, accessed under lock
+	spine     atomicSpanSetSpinePointer // *[N]atomic.Pointer[spanSetBlock]
+	spineLen  atomic.Uintptr            // Spine array length
+	spineCap  uintptr                   // Spine array cap, accessed under spineLock
 
 	// index is the head and tail of the spanSet in a single field.
 	// The head and the tail both represent an index into the logical
@@ -48,7 +48,7 @@
 	// span in the heap were stored in this set, and each span were
 	// the minimum size (1 runtime page, 8 KiB), then roughly the
 	// smallest heap which would be unrepresentable is 32 TiB in size.
-	index headTailIndex
+	index atomicHeadTailIndex
 }
 
 const (
@@ -63,10 +63,10 @@
 	// popped is the number of pop operations that have occurred on
 	// this block. This number is used to help determine when a block
 	// may be safely recycled.
-	popped uint32
+	popped atomic.Uint32
 
 	// spans is the set of spans in this block.
-	spans [spanSetBlockEntries]*mspan
+	spans [spanSetBlockEntries]atomicMSpanPointer
 }
 
 // push adds span s to buffer b. push is safe to call concurrently
@@ -77,25 +77,24 @@
 	top, bottom := cursor/spanSetBlockEntries, cursor%spanSetBlockEntries
 
 	// Do we need to add a block?
-	spineLen := atomic.Loaduintptr(&b.spineLen)
+	spineLen := b.spineLen.Load()
 	var block *spanSetBlock
 retry:
 	if top < spineLen {
-		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
-		blockp := add(spine, goarch.PtrSize*top)
-		block = (*spanSetBlock)(atomic.Loadp(blockp))
+		block = b.spine.Load().lookup(top).Load()
 	} else {
 		// Add a new block to the spine, potentially growing
 		// the spine.
 		lock(&b.spineLock)
 		// spineLen cannot change until we release the lock,
 		// but may have changed while we were waiting.
-		spineLen = atomic.Loaduintptr(&b.spineLen)
+		spineLen = b.spineLen.Load()
 		if top < spineLen {
 			unlock(&b.spineLock)
 			goto retry
 		}
 
+		spine := b.spine.Load()
 		if spineLen == b.spineCap {
 			// Grow the spine.
 			newCap := b.spineCap * 2
@@ -106,10 +105,12 @@
 			if b.spineCap != 0 {
 				// Blocks are allocated off-heap, so
 				// no write barriers.
-				memmove(newSpine, b.spine, b.spineCap*goarch.PtrSize)
+				memmove(newSpine, spine.p, b.spineCap*goarch.PtrSize)
 			}
+			spine = spanSetSpinePointer{newSpine}
+
 			// Spine is allocated off-heap, so no write barrier.
-			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
+			b.spine.StoreNoWB(spine)
 			b.spineCap = newCap
 			// We can't immediately free the old spine
 			// since a concurrent push with a lower index
@@ -124,16 +125,15 @@
 		block = spanSetBlockPool.alloc()
 
 		// Add it to the spine.
-		blockp := add(b.spine, goarch.PtrSize*top)
 		// Blocks are allocated off-heap, so no write barrier.
-		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
-		atomic.Storeuintptr(&b.spineLen, spineLen+1)
+		spine.lookup(top).StoreNoWB(block)
+		b.spineLen.Store(spineLen + 1)
 		unlock(&b.spineLock)
 	}
 
 	// We have a block. Insert the span atomically, since there may be
 	// concurrent readers via the block API.
-	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s))
+	block.spans[bottom].StoreNoWB(s)
 }
 
 // pop removes and returns a span from buffer b, or nil if b is empty.
@@ -150,7 +150,7 @@
 		}
 		// Check if the head position we want to claim is actually
 		// backed by a block.
-		spineLen := atomic.Loaduintptr(&b.spineLen)
+		spineLen := b.spineLen.Load()
 		if spineLen <= uintptr(head)/spanSetBlockEntries {
 			// We're racing with a spine growth and the allocation of
 			// a new block (and maybe a new spine!), and trying to grab
@@ -180,24 +180,23 @@
 	// We may be reading a stale spine pointer, but because the length
 	// grows monotonically and we've already verified it, we'll definitely
 	// be reading from a valid block.
-	spine := atomic.Loadp(unsafe.Pointer(&b.spine))
-	blockp := add(spine, goarch.PtrSize*uintptr(top))
+	blockp := b.spine.Load().lookup(uintptr(top))
 
 	// Given that the spine length is correct, we know we will never
 	// see a nil block here, since the length is always updated after
 	// the block is set.
-	block := (*spanSetBlock)(atomic.Loadp(blockp))
-	s := (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom])))
+	block := blockp.Load()
+	s := block.spans[bottom].Load()
 	for s == nil {
 		// We raced with the span actually being set, but given that we
 		// know a block for this span exists, the race window here is
 		// extremely small. Try again.
-		s = (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom])))
+		s = block.spans[bottom].Load()
 	}
 	// Clear the pointer. This isn't strictly necessary, but defensively
 	// avoids accidentally re-using blocks which could lead to memory
 	// corruption. This way, we'll get a nil pointer access instead.
-	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), nil)
+	block.spans[bottom].StoreNoWB(nil)
 
 	// Increase the popped count. If we are the last possible popper
 	// in the block (note that bottom need not equal spanSetBlockEntries-1
@@ -211,9 +210,9 @@
 	// pushers (there can't be any). Note that we may not be the popper
 	// which claimed the last slot in the block, we're just the last one
 	// to finish popping.
-	if atomic.Xadd(&block.popped, 1) == spanSetBlockEntries {
+	if block.popped.Add(1) == spanSetBlockEntries {
 		// Clear the block's pointer.
-		atomic.StorepNoWB(blockp, nil)
+		blockp.StoreNoWB(nil)
 
 		// Return the block to the block pool.
 		spanSetBlockPool.free(block)
@@ -235,23 +234,23 @@
 		throw("attempt to clear non-empty span set")
 	}
 	top := head / spanSetBlockEntries
-	if uintptr(top) < b.spineLen {
+	if uintptr(top) < b.spineLen.Load() {
 		// If the head catches up to the tail and the set is empty,
 		// we may not clean up the block containing the head and tail
 		// since it may be pushed into again. In order to avoid leaking
 		// memory since we're going to reset the head and tail, clean
 		// up such a block now, if it exists.
-		blockp := (**spanSetBlock)(add(b.spine, goarch.PtrSize*uintptr(top)))
-		block := *blockp
+		blockp := b.spine.Load().lookup(uintptr(top))
+		block := blockp.Load()
 		if block != nil {
-			// Sanity check the popped value.
-			if block.popped == 0 {
+			// Check the popped value.
+			if block.popped.Load() == 0 {
 				// popped should never be zero because that means we have
 				// pushed at least one value but not yet popped if this
 				// block pointer is not nil.
 				throw("span set block with unpopped elements found in reset")
 			}
-			if block.popped == spanSetBlockEntries {
+			if block.popped.Load() == spanSetBlockEntries {
 				// popped should also never be equal to spanSetBlockEntries
 				// because the last popper should have made the block pointer
 				// in this slot nil.
@@ -259,14 +258,45 @@
 			}
 
 			// Clear the pointer to the block.
-			atomic.StorepNoWB(unsafe.Pointer(blockp), nil)
+			blockp.StoreNoWB(nil)
 
 			// Return the block to the block pool.
 			spanSetBlockPool.free(block)
 		}
 	}
 	b.index.reset()
-	atomic.Storeuintptr(&b.spineLen, 0)
+	b.spineLen.Store(0)
+}
+
+// atomicSpanSetSpinePointer is an atomically-accessed spanSetSpinePointer.
+//
+// It has the same semantics as atomic.UnsafePointer.
+type atomicSpanSetSpinePointer struct {
+	a atomic.UnsafePointer
+}
+
+// Loads the spanSetSpinePointer and returns it.
+//
+// It has the same semantics as atomic.UnsafePointer.
+func (s *atomicSpanSetSpinePointer) Load() spanSetSpinePointer {
+	return spanSetSpinePointer{s.a.Load()}
+}
+
+// Stores the spanSetSpinePointer.
+//
+// It has the same semantics as atomic.UnsafePointer.
+func (s *atomicSpanSetSpinePointer) StoreNoWB(p spanSetSpinePointer) {
+	s.a.StoreNoWB(p.p)
+}
+
+// spanSetSpinePointer represents a pointer to a contiguous block of atomic.Pointer[spanSetBlock].
+type spanSetSpinePointer struct {
+	p unsafe.Pointer
+}
+
+// lookup returns &s[idx].
+func (s spanSetSpinePointer) lookup(idx uintptr) *atomic.Pointer[spanSetBlock] {
+	return (*atomic.Pointer[spanSetBlock])(add(unsafe.Pointer(s.p), goarch.PtrSize*idx))
 }
 
 // spanSetBlockPool is a global pool of spanSetBlocks.
@@ -288,7 +318,7 @@
 
 // free returns a spanSetBlock back to the pool.
 func (p *spanSetBlockAlloc) free(block *spanSetBlock) {
-	atomic.Store(&block.popped, 0)
+	block.popped.Store(0)
 	p.stack.push(&block.lfnode)
 }
 
@@ -317,29 +347,34 @@
 	return h.head(), h.tail()
 }
 
+// atomicHeadTailIndex is an atomically-accessed headTailIndex.
+type atomicHeadTailIndex struct {
+	u atomic.Uint64
+}
+
 // load atomically reads a headTailIndex value.
-func (h *headTailIndex) load() headTailIndex {
-	return headTailIndex(atomic.Load64((*uint64)(h)))
+func (h *atomicHeadTailIndex) load() headTailIndex {
+	return headTailIndex(h.u.Load())
 }
 
 // cas atomically compares-and-swaps a headTailIndex value.
-func (h *headTailIndex) cas(old, new headTailIndex) bool {
-	return atomic.Cas64((*uint64)(h), uint64(old), uint64(new))
+func (h *atomicHeadTailIndex) cas(old, new headTailIndex) bool {
+	return h.u.CompareAndSwap(uint64(old), uint64(new))
 }
 
 // incHead atomically increments the head of a headTailIndex.
-func (h *headTailIndex) incHead() headTailIndex {
-	return headTailIndex(atomic.Xadd64((*uint64)(h), (1 << 32)))
+func (h *atomicHeadTailIndex) incHead() headTailIndex {
+	return headTailIndex(h.u.Add(1 << 32))
 }
 
 // decHead atomically decrements the head of a headTailIndex.
-func (h *headTailIndex) decHead() headTailIndex {
-	return headTailIndex(atomic.Xadd64((*uint64)(h), -(1 << 32)))
+func (h *atomicHeadTailIndex) decHead() headTailIndex {
+	return headTailIndex(h.u.Add(-(1 << 32)))
 }
 
 // incTail atomically increments the tail of a headTailIndex.
-func (h *headTailIndex) incTail() headTailIndex {
-	ht := headTailIndex(atomic.Xadd64((*uint64)(h), +1))
+func (h *atomicHeadTailIndex) incTail() headTailIndex {
+	ht := headTailIndex(h.u.Add(1))
 	// Check for overflow.
 	if ht.tail() == 0 {
 		print("runtime: head = ", ht.head(), ", tail = ", ht.tail(), "\n")
@@ -349,6 +384,21 @@
 }
 
 // reset clears the headTailIndex to (0, 0).
-func (h *headTailIndex) reset() {
-	atomic.Store64((*uint64)(h), 0)
+func (h *atomicHeadTailIndex) reset() {
+	h.u.Store(0)
+}
+
+// atomicMSpanPointer is an atomic.Pointer[mspan]. Can't use generics because it's NotInHeap.
+type atomicMSpanPointer struct {
+	p atomic.UnsafePointer
+}
+
+// Load returns the *mspan.
+func (p *atomicMSpanPointer) Load() *mspan {
+	return (*mspan)(p.p.Load())
+}
+
+// Store stores an *mspan.
+func (p *atomicMSpanPointer) StoreNoWB(s *mspan) {
+	p.p.StoreNoWB(unsafe.Pointer(s))
 }
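
The spanSet index packs head and tail into a single uint64, head in the upper 32 bits and tail in the lower 32, which is what lets incHead be a single Add(1 << 32). A minimal sketch of that packing using sync/atomic:

// Sketch of the head/tail packing that atomicHeadTailIndex wraps, so both
// halves can be read or advanced in one atomic operation.
package main

import (
	"fmt"
	"sync/atomic"
)

type headTailIndex uint64

func (h headTailIndex) head() uint32 { return uint32(h >> 32) }
func (h headTailIndex) tail() uint32 { return uint32(h) }

type atomicHeadTailIndex struct{ u atomic.Uint64 }

func (h *atomicHeadTailIndex) load() headTailIndex    { return headTailIndex(h.u.Load()) }
func (h *atomicHeadTailIndex) incHead() headTailIndex { return headTailIndex(h.u.Add(1 << 32)) }
func (h *atomicHeadTailIndex) incTail() headTailIndex { return headTailIndex(h.u.Add(1)) }

func main() {
	var idx atomicHeadTailIndex
	idx.incTail() // push
	idx.incTail() // push
	idx.incHead() // pop claims an element
	ht := idx.load()
	fmt.Println(ht.head(), ht.tail()) // 1 2
}
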
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 0029ea9..3a5273f 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -45,8 +45,6 @@
 
 	enablegc bool
 
-	_ uint32 // ensure gcPauseDist is aligned.
-
 	// gcPauseDist represents the distribution of all GC-related
 	// application pauses in the runtime.
 	//
@@ -334,10 +332,6 @@
 		println(offset)
 		throw("memstats.heapStats not aligned to 8 bytes")
 	}
-	if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 {
-		println(offset)
-		throw("memstats.gcPauseDist not aligned to 8 bytes")
-	}
 	// Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g.
 	// [3]heapStatsDelta) to be 8-byte aligned.
 	if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 {
@@ -733,8 +727,7 @@
 
 	// gen represents the current index into which writers
 	// are writing, and can take on the value of 0, 1, or 2.
-	// This value is updated atomically.
-	gen uint32
+	gen atomic.Uint32
 
 	// noPLock is intended to provide mutual exclusion for updating
 	// stats when no P is available. It does not block other writers
@@ -763,7 +756,7 @@
 //go:nosplit
 func (m *consistentHeapStats) acquire() *heapStatsDelta {
 	if pp := getg().m.p.ptr(); pp != nil {
-		seq := atomic.Xadd(&pp.statsSeq, 1)
+		seq := pp.statsSeq.Add(1)
 		if seq%2 == 0 {
 			// Should have been incremented to odd.
 			print("runtime: seq=", seq, "\n")
@@ -772,7 +765,7 @@
 	} else {
 		lock(&m.noPLock)
 	}
-	gen := atomic.Load(&m.gen) % 3
+	gen := m.gen.Load() % 3
 	return &m.stats[gen]
 }
 
@@ -792,7 +785,7 @@
 //go:nosplit
 func (m *consistentHeapStats) release() {
 	if pp := getg().m.p.ptr(); pp != nil {
-		seq := atomic.Xadd(&pp.statsSeq, 1)
+		seq := pp.statsSeq.Add(1)
 		if seq%2 != 0 {
 			// Should have been incremented to even.
 			print("runtime: seq=", seq, "\n")
@@ -843,7 +836,7 @@
 	// Get the current generation. We can be confident that this
 	// will not change since read is serialized and is the only
 	// one that modifies currGen.
-	currGen := atomic.Load(&m.gen)
+	currGen := m.gen.Load()
 	prevGen := currGen - 1
 	if currGen == 0 {
 		prevGen = 2
@@ -858,7 +851,7 @@
 	//
 	// This exchange is safe to do because we won't race
 	// with anyone else trying to update this value.
-	atomic.Xchg(&m.gen, (currGen+1)%3)
+	m.gen.Swap((currGen + 1) % 3)
 
 	// Allow P-less writers to continue. They'll be writing to the
 	// next generation now.
@@ -866,7 +859,7 @@
 
 	for _, p := range allp {
 		// Spin until there are no more writers.
-		for atomic.Load(&p.statsSeq)%2 != 0 {
+		for p.statsSeq.Load()%2 != 0 {
 		}
 	}
 
@@ -886,3 +879,25 @@
 
 	releasem(mp)
 }
+
+type cpuStats struct {
+	// All fields are CPU time in nanoseconds computed by comparing
+	// calls of nanotime. This means they're all overestimates, because
+	// they don't accurately compute on-CPU time (so some of the time
+	// could be spent scheduled away by the OS).
+
+	gcAssistTime    int64 // GC assists
+	gcDedicatedTime int64 // GC dedicated mark workers + pauses
+	gcIdleTime      int64 // GC idle mark workers
+	gcPauseTime     int64 // GC pauses (all GOMAXPROCS, even if just 1 is running)
+	gcTotalTime     int64
+
+	scavengeAssistTime int64 // background scavenger
+	scavengeBgTime     int64 // scavenge assists
+	scavengeTotalTime  int64
+
+	idleTime int64 // Time Ps spent in _Pidle.
+	userTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above.
+
+	totalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed)
+}
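
The statsSeq fields converted above implement a seqlock-style protocol: a writer's counter is odd while it is writing and even otherwise, and the reader spins until every counter is even before trusting the previous generation. A simplified, single-counter sketch follows; the runtime keeps one counter per P plus three stat generations, which are elided here.

// Simplified sketch of the even/odd sequence protocol used by
// consistentHeapStats, reduced to a single writer counter.
package main

import (
	"fmt"
	"sync/atomic"
)

type writerSeq struct{ seq atomic.Uint32 }

func (w *writerSeq) acquire() {
	if w.seq.Add(1)%2 == 0 {
		panic("expected odd sequence while writing")
	}
}

func (w *writerSeq) release() {
	if w.seq.Add(1)%2 != 0 {
		panic("expected even sequence after writing")
	}
}

// waitIdle spins until no write is in progress.
func (w *writerSeq) waitIdle() {
	for w.seq.Load()%2 != 0 {
	}
}

func main() {
	var w writerSeq
	w.acquire()
	// ... writer updates its generation's stats here ...
	w.release()
	w.waitIdle()
	fmt.Println("seq:", w.seq.Load()) // 2
}
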
diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go
index 39ce0b4..3b7cbf8 100644
--- a/src/runtime/mwbbuf.go
+++ b/src/runtime/mwbbuf.go
@@ -212,22 +212,22 @@
 //
 //go:nowritebarrierrec
 //go:systemstack
-func wbBufFlush1(_p_ *p) {
+func wbBufFlush1(pp *p) {
 	// Get the buffered pointers.
-	start := uintptr(unsafe.Pointer(&_p_.wbBuf.buf[0]))
-	n := (_p_.wbBuf.next - start) / unsafe.Sizeof(_p_.wbBuf.buf[0])
-	ptrs := _p_.wbBuf.buf[:n]
+	start := uintptr(unsafe.Pointer(&pp.wbBuf.buf[0]))
+	n := (pp.wbBuf.next - start) / unsafe.Sizeof(pp.wbBuf.buf[0])
+	ptrs := pp.wbBuf.buf[:n]
 
 	// Poison the buffer to make extra sure nothing is enqueued
 	// while we're processing the buffer.
-	_p_.wbBuf.next = 0
+	pp.wbBuf.next = 0
 
 	if useCheckmark {
 		// Slow path for checkmark mode.
 		for _, ptr := range ptrs {
 			shade(ptr)
 		}
-		_p_.wbBuf.reset()
+		pp.wbBuf.reset()
 		return
 	}
 
@@ -245,7 +245,7 @@
 	// could track whether any un-shaded goroutine has used the
 	// buffer, or just track globally whether there are any
 	// un-shaded stacks and flush after each stack scan.
-	gcw := &_p_.gcw
+	gcw := &pp.gcw
 	pos := 0
 	for _, ptr := range ptrs {
 		if ptr < minLegalPointer {
@@ -286,5 +286,5 @@
 	// Enqueue the greyed objects.
 	gcw.putBatch(ptrs[:pos])
 
-	_p_.wbBuf.reset()
+	pp.wbBuf.reset()
 }
diff --git a/src/runtime/nbpipe_fcntl_libc_test.go b/src/runtime/nbpipe_fcntl_libc_test.go
index a9c8987..170245d 100644
--- a/src/runtime/nbpipe_fcntl_libc_test.go
+++ b/src/runtime/nbpipe_fcntl_libc_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build aix || darwin || solaris
+//go:build aix || darwin || (openbsd && !mips64) || solaris
 
 package runtime_test
 
diff --git a/src/runtime/nbpipe_fcntl_unix_test.go b/src/runtime/nbpipe_fcntl_unix_test.go
index 97607fa..b7252ea 100644
--- a/src/runtime/nbpipe_fcntl_unix_test.go
+++ b/src/runtime/nbpipe_fcntl_unix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build dragonfly || freebsd || linux || netbsd || openbsd
+//go:build dragonfly || freebsd || linux || netbsd || (openbsd && mips64)
 
 package runtime_test
 
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index ac6bc89..5ac1f37 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -8,6 +8,7 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -49,16 +50,17 @@
 // goroutines respectively. The semaphore can be in the following states:
 //
 //	pdReady - io readiness notification is pending;
-//	          a goroutine consumes the notification by changing the state to nil.
+//	          a goroutine consumes the notification by changing the state to pdNil.
 //	pdWait - a goroutine prepares to park on the semaphore, but not yet parked;
 //	         the goroutine commits to park by changing the state to G pointer,
 //	         or, alternatively, concurrent io notification changes the state to pdReady,
-//	         or, alternatively, concurrent timeout/close changes the state to nil.
+//	         or, alternatively, concurrent timeout/close changes the state to pdNil.
 //	G pointer - the goroutine is blocked on the semaphore;
-//	            io notification or timeout/close changes the state to pdReady or nil respectively
+//	            io notification or timeout/close changes the state to pdReady or pdNil respectively
 //	            and unparks the goroutine.
-//	nil - none of the above.
+//	pdNil - none of the above.
 const (
+	pdNil   uintptr = 0
 	pdReady uintptr = 1
 	pdWait  uintptr = 2
 )
@@ -68,9 +70,8 @@
 // Network poller descriptor.
 //
 // No heap pointers.
-//
-//go:notinheap
 type pollDesc struct {
+	_    sys.NotInHeap
 	link *pollDesc // in pollcache, protected by pollcache.lock
 	fd   uintptr   // constant for pollDesc usage lifetime
 
@@ -93,8 +94,8 @@
 
 	// rg, wg are accessed atomically and hold g pointers.
 	// (Using atomic.Uintptr here is similar to using guintptr elsewhere.)
-	rg atomic.Uintptr // pdReady, pdWait, G waiting for read or nil
-	wg atomic.Uintptr // pdReady, pdWait, G waiting for write or nil
+	rg atomic.Uintptr // pdReady, pdWait, G waiting for read or pdNil
+	wg atomic.Uintptr // pdReady, pdWait, G waiting for write or pdNil
 
 	lock    mutex // protects the following fields
 	closing bool
@@ -177,10 +178,10 @@
 
 var (
 	netpollInitLock mutex
-	netpollInited   uint32
+	netpollInited   atomic.Uint32
 
 	pollcache      pollCache
-	netpollWaiters uint32
+	netpollWaiters atomic.Uint32
 )
 
 //go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit
@@ -189,19 +190,19 @@
 }
 
 func netpollGenericInit() {
-	if atomic.Load(&netpollInited) == 0 {
+	if netpollInited.Load() == 0 {
 		lockInit(&netpollInitLock, lockRankNetpollInit)
 		lock(&netpollInitLock)
-		if netpollInited == 0 {
+		if netpollInited.Load() == 0 {
 			netpollinit()
-			atomic.Store(&netpollInited, 1)
+			netpollInited.Store(1)
 		}
 		unlock(&netpollInitLock)
 	}
 }
 
 func netpollinited() bool {
-	return atomic.Load(&netpollInited) != 0
+	return netpollInited.Load() != 0
 }
 
 //go:linkname poll_runtime_isPollServerDescriptor internal/poll.runtime_isPollServerDescriptor
@@ -217,21 +218,21 @@
 	pd := pollcache.alloc()
 	lock(&pd.lock)
 	wg := pd.wg.Load()
-	if wg != 0 && wg != pdReady {
+	if wg != pdNil && wg != pdReady {
 		throw("runtime: blocked write on free polldesc")
 	}
 	rg := pd.rg.Load()
-	if rg != 0 && rg != pdReady {
+	if rg != pdNil && rg != pdReady {
 		throw("runtime: blocked read on free polldesc")
 	}
 	pd.fd = fd
 	pd.closing = false
 	pd.setEventErr(false)
 	pd.rseq++
-	pd.rg.Store(0)
+	pd.rg.Store(pdNil)
 	pd.rd = 0
 	pd.wseq++
-	pd.wg.Store(0)
+	pd.wg.Store(pdNil)
 	pd.wd = 0
 	pd.self = pd
 	pd.publishInfo()
@@ -251,11 +252,11 @@
 		throw("runtime: close polldesc w/o unblock")
 	}
 	wg := pd.wg.Load()
-	if wg != 0 && wg != pdReady {
+	if wg != pdNil && wg != pdReady {
 		throw("runtime: blocked write on closing polldesc")
 	}
 	rg := pd.rg.Load()
-	if rg != 0 && rg != pdReady {
+	if rg != pdNil && rg != pdReady {
 		throw("runtime: blocked read on closing polldesc")
 	}
 	netpollclose(pd.fd)
@@ -280,9 +281,9 @@
 		return errcode
 	}
 	if mode == 'r' {
-		pd.rg.Store(0)
+		pd.rg.Store(pdNil)
 	} else if mode == 'w' {
-		pd.wg.Store(0)
+		pd.wg.Store(pdNil)
 	}
 	return pollNoError
 }
@@ -482,17 +483,17 @@
 		// Bump the count of goroutines waiting for the poller.
 		// The scheduler uses this to decide whether to block
 		// waiting for the poller if there is nothing else to do.
-		atomic.Xadd(&netpollWaiters, 1)
+		netpollWaiters.Add(1)
 	}
 	return r
 }
 
 func netpollgoready(gp *g, traceskip int) {
-	atomic.Xadd(&netpollWaiters, -1)
+	netpollWaiters.Add(-1)
 	goready(gp, traceskip+1)
 }
 
-// returns true if IO is ready, or false if timedout or closed
+// returns true if IO is ready, or false if timed out or closed
 // waitio - wait only for completed IO, ignore errors
 // Concurrent calls to netpollblock in the same mode are forbidden, as pollDesc
 // can hold only a single waiting goroutine for each mode.
@@ -505,16 +506,16 @@
 	// set the gpp semaphore to pdWait
 	for {
 		// Consume notification if already ready.
-		if gpp.CompareAndSwap(pdReady, 0) {
+		if gpp.CompareAndSwap(pdReady, pdNil) {
 			return true
 		}
-		if gpp.CompareAndSwap(0, pdWait) {
+		if gpp.CompareAndSwap(pdNil, pdWait) {
 			break
 		}
 
 		// Double check that this isn't corrupt; otherwise we'd loop
 		// forever.
-		if v := gpp.Load(); v != pdReady && v != 0 {
+		if v := gpp.Load(); v != pdReady && v != pdNil {
 			throw("runtime: double wait")
 		}
 	}
@@ -526,7 +527,7 @@
 		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
 	}
 	// be careful to not lose concurrent pdReady notification
-	old := gpp.Swap(0)
+	old := gpp.Swap(pdNil)
 	if old > pdWait {
 		throw("runtime: corrupted polldesc")
 	}
@@ -544,7 +545,7 @@
 		if old == pdReady {
 			return nil
 		}
-		if old == 0 && !ioready {
+		if old == pdNil && !ioready {
 			// Only set pdReady for ioready. runtime_pollWait
 			// will check for timeout/cancel before waiting.
 			return nil
@@ -555,7 +556,7 @@
 		}
 		if gpp.CompareAndSwap(old, new) {
 			if old == pdWait {
-				old = 0
+				old = pdNil
 			}
 			return (*g)(unsafe.Pointer(old))
 		}
@@ -641,8 +642,8 @@
 // makeArg converts pd to an interface{}.
 // makeArg does not do any allocation. Normally, such
 // a conversion requires an allocation because pointers to
-// go:notinheap types (which pollDesc is) must be stored
-// in interfaces indirectly. See issue 42076.
+// types which embed runtime/internal/sys.NotInHeap (which pollDesc is)
+// must be stored in interfaces indirectly. See issue 42076.
 func (pd *pollDesc) makeArg() (i any) {
 	x := (*eface)(unsafe.Pointer(&i))
 	x._type = pdType
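
The rg/wg semaphores cycle through pdNil, pdReady, pdWait, or a G pointer. The sketch below reduces the consumer side of netpollblock to its CAS loop on a plain atomic.Uintptr; parking the goroutine and the G-pointer state are elided.

// Schematic of the pollDesc semaphore states named in the netpoll.go comment.
package main

import (
	"fmt"
	"sync/atomic"
)

const (
	pdNil   uintptr = 0
	pdReady uintptr = 1
	pdWait  uintptr = 2
)

// consumeOrCommit mirrors the first half of netpollblock: consume a pending
// readiness notification, or commit to waiting by moving pdNil -> pdWait.
func consumeOrCommit(gpp *atomic.Uintptr) (ready bool) {
	for {
		if gpp.CompareAndSwap(pdReady, pdNil) {
			return true // notification consumed
		}
		if gpp.CompareAndSwap(pdNil, pdWait) {
			return false // committed to wait; the caller would park here
		}
		if v := gpp.Load(); v != pdReady && v != pdNil {
			panic("double wait")
		}
	}
}

func main() {
	var sem atomic.Uintptr
	sem.Store(pdReady)
	fmt.Println(consumeOrCommit(&sem)) // true: notification was pending
	fmt.Println(consumeOrCommit(&sem)) // false: now waiting in pdWait
}
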
diff --git a/src/runtime/netpoll_aix.go b/src/runtime/netpoll_aix.go
index 22cc513..5184aad 100644
--- a/src/runtime/netpoll_aix.go
+++ b/src/runtime/netpoll_aix.go
@@ -45,7 +45,7 @@
 	wrwake         int32
 	pendingUpdates int32
 
-	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+	netpollWakeSig atomic.Uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
@@ -135,10 +135,13 @@
 
 // netpollBreak interrupts a poll.
 func netpollBreak() {
-	if atomic.Cas(&netpollWakeSig, 0, 1) {
-		b := [1]byte{0}
-		write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1)
+	// Failing to cas indicates there is an in-flight wakeup, so we're done here.
+	if !netpollWakeSig.CompareAndSwap(0, 1) {
+		return
 	}
+
+	b := [1]byte{0}
+	write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1)
 }
 
 // netpoll checks for ready network connections.
@@ -193,7 +196,7 @@
 			var b [1]byte
 			for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 {
 			}
-			atomic.Store(&netpollWakeSig, 0)
+			netpollWakeSig.Store(0)
 		}
 		// Still look at the other fds even if the mode may have
 		// changed, as netpollBreak might have been called.
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index b7d6199..7164a59 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -8,49 +8,37 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/syscall"
 	"unsafe"
 )
 
-func epollcreate(size int32) int32
-func epollcreate1(flags int32) int32
-
-//go:noescape
-func epollctl(epfd, op, fd int32, ev *epollevent) int32
-
-//go:noescape
-func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32
-func closeonexec(fd int32)
-
 var (
 	epfd int32 = -1 // epoll descriptor
 
 	netpollBreakRd, netpollBreakWr uintptr // for netpollBreak
 
-	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+	netpollWakeSig atomic.Uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
-	epfd = epollcreate1(_EPOLL_CLOEXEC)
-	if epfd < 0 {
-		epfd = epollcreate(1024)
-		if epfd < 0 {
-			println("runtime: epollcreate failed with", -epfd)
-			throw("runtime: netpollinit failed")
-		}
-		closeonexec(epfd)
-	}
-	r, w, errno := nonblockingPipe()
+	var errno uintptr
+	epfd, errno = syscall.EpollCreate1(syscall.EPOLL_CLOEXEC)
 	if errno != 0 {
-		println("runtime: pipe failed with", -errno)
+		println("runtime: epollcreate failed with", errno)
+		throw("runtime: netpollinit failed")
+	}
+	r, w, errpipe := nonblockingPipe()
+	if errpipe != 0 {
+		println("runtime: pipe failed with", -errpipe)
 		throw("runtime: pipe failed")
 	}
-	ev := epollevent{
-		events: _EPOLLIN,
+	ev := syscall.EpollEvent{
+		Events: syscall.EPOLLIN,
 	}
-	*(**uintptr)(unsafe.Pointer(&ev.data)) = &netpollBreakRd
-	errno = epollctl(epfd, _EPOLL_CTL_ADD, r, &ev)
+	*(**uintptr)(unsafe.Pointer(&ev.Data)) = &netpollBreakRd
+	errno = syscall.EpollCtl(epfd, syscall.EPOLL_CTL_ADD, r, &ev)
 	if errno != 0 {
-		println("runtime: epollctl failed with", -errno)
+		println("runtime: epollctl failed with", errno)
 		throw("runtime: epollctl failed")
 	}
 	netpollBreakRd = uintptr(r)
@@ -61,16 +49,16 @@
 	return fd == uintptr(epfd) || fd == netpollBreakRd || fd == netpollBreakWr
 }
 
-func netpollopen(fd uintptr, pd *pollDesc) int32 {
-	var ev epollevent
-	ev.events = _EPOLLIN | _EPOLLOUT | _EPOLLRDHUP | _EPOLLET
-	*(**pollDesc)(unsafe.Pointer(&ev.data)) = pd
-	return -epollctl(epfd, _EPOLL_CTL_ADD, int32(fd), &ev)
+func netpollopen(fd uintptr, pd *pollDesc) uintptr {
+	var ev syscall.EpollEvent
+	ev.Events = syscall.EPOLLIN | syscall.EPOLLOUT | syscall.EPOLLRDHUP | syscall.EPOLLET
+	*(**pollDesc)(unsafe.Pointer(&ev.Data)) = pd
+	return syscall.EpollCtl(epfd, syscall.EPOLL_CTL_ADD, int32(fd), &ev)
 }
 
-func netpollclose(fd uintptr) int32 {
-	var ev epollevent
-	return -epollctl(epfd, _EPOLL_CTL_DEL, int32(fd), &ev)
+func netpollclose(fd uintptr) uintptr {
+	var ev syscall.EpollEvent
+	return syscall.EpollCtl(epfd, syscall.EPOLL_CTL_DEL, int32(fd), &ev)
 }
 
 func netpollarm(pd *pollDesc, mode int) {
@@ -79,22 +67,25 @@
 
 // netpollBreak interrupts an epollwait.
 func netpollBreak() {
-	if atomic.Cas(&netpollWakeSig, 0, 1) {
-		for {
-			var b byte
-			n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
-			if n == 1 {
-				break
-			}
-			if n == -_EINTR {
-				continue
-			}
-			if n == -_EAGAIN {
-				return
-			}
-			println("runtime: netpollBreak write failed with", -n)
-			throw("runtime: netpollBreak write failed")
+	// Failing to cas indicates there is an in-flight wakeup, so we're done here.
+	if !netpollWakeSig.CompareAndSwap(0, 1) {
+		return
+	}
+
+	for {
+		var b byte
+		n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
+		if n == 1 {
+			break
 		}
+		if n == -_EINTR {
+			continue
+		}
+		if n == -_EAGAIN {
+			return
+		}
+		println("runtime: netpollBreak write failed with", -n)
+		throw("runtime: netpollBreak write failed")
 	}
 }
 
@@ -121,12 +112,12 @@
 		// 1e9 ms == ~11.5 days.
 		waitms = 1e9
 	}
-	var events [128]epollevent
+	var events [128]syscall.EpollEvent
 retry:
-	n := epollwait(epfd, &events[0], int32(len(events)), waitms)
-	if n < 0 {
-		if n != -_EINTR {
-			println("runtime: epollwait on fd", epfd, "failed with", -n)
+	n, errno := syscall.EpollWait(epfd, events[:], int32(len(events)), waitms)
+	if errno != 0 {
+		if errno != _EINTR {
+			println("runtime: epollwait on fd", epfd, "failed with", errno)
 			throw("runtime: netpoll failed")
 		}
 		// If a timed sleep was interrupted, just return to
@@ -138,14 +129,14 @@
 	}
 	var toRun gList
 	for i := int32(0); i < n; i++ {
-		ev := &events[i]
-		if ev.events == 0 {
+		ev := events[i]
+		if ev.Events == 0 {
 			continue
 		}
 
-		if *(**uintptr)(unsafe.Pointer(&ev.data)) == &netpollBreakRd {
-			if ev.events != _EPOLLIN {
-				println("runtime: netpoll: break fd ready for", ev.events)
+		if *(**uintptr)(unsafe.Pointer(&ev.Data)) == &netpollBreakRd {
+			if ev.Events != syscall.EPOLLIN {
+				println("runtime: netpoll: break fd ready for", ev.Events)
 				throw("runtime: netpoll: break fd ready for something unexpected")
 			}
 			if delay != 0 {
@@ -154,21 +145,21 @@
 				// if blocking.
 				var tmp [16]byte
 				read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp)))
-				atomic.Store(&netpollWakeSig, 0)
+				netpollWakeSig.Store(0)
 			}
 			continue
 		}
 
 		var mode int32
-		if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 {
+		if ev.Events&(syscall.EPOLLIN|syscall.EPOLLRDHUP|syscall.EPOLLHUP|syscall.EPOLLERR) != 0 {
 			mode += 'r'
 		}
-		if ev.events&(_EPOLLOUT|_EPOLLHUP|_EPOLLERR) != 0 {
+		if ev.Events&(syscall.EPOLLOUT|syscall.EPOLLHUP|syscall.EPOLLERR) != 0 {
 			mode += 'w'
 		}
 		if mode != 0 {
-			pd := *(**pollDesc)(unsafe.Pointer(&ev.data))
-			pd.setEventErr(ev.events == _EPOLLERR)
+			pd := *(**pollDesc)(unsafe.Pointer(&ev.Data))
+			pd.setEventErr(ev.Events == syscall.EPOLLERR)
 			netpollready(&toRun, pd, mode)
 		}
 	}
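
Each netpollBreak variant above now follows the same pattern: only the caller that wins the 0 -> 1 compare-and-swap on netpollWakeSig issues the platform wakeup, and the poller clears the flag after draining it. A schematic version, with a buffered channel standing in for the platform-specific wakeup (pipe write, port_alert, completion packet):

// Sketch of the wakeup-deduplication pattern shared by the netpoll_* hunks.
package main

import (
	"fmt"
	"sync/atomic"
)

var (
	wakeSig atomic.Uint32
	wakeCh  = make(chan struct{}, 1) // stand-in for the platform wakeup
)

func netpollBreak() {
	// Failing the CAS means a wakeup is already in flight; nothing to do.
	if !wakeSig.CompareAndSwap(0, 1) {
		return
	}
	wakeCh <- struct{}{}
}

func poller() {
	<-wakeCh         // consume the wakeup
	wakeSig.Store(0) // allow the next netpollBreak to wake us again
}

func main() {
	netpollBreak()
	netpollBreak() // deduplicated: no second wakeup is issued
	poller()
	fmt.Println("wakeups pending:", len(wakeCh)) // 0
}
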
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index 1694753..5ae77b5 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -18,7 +18,7 @@
 
 	netpollBreakRd, netpollBreakWr uintptr // for netpollBreak
 
-	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+	netpollWakeSig atomic.Uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
@@ -83,19 +83,22 @@
 
 // netpollBreak interrupts a kevent.
 func netpollBreak() {
-	if atomic.Cas(&netpollWakeSig, 0, 1) {
-		for {
-			var b byte
-			n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
-			if n == 1 || n == -_EAGAIN {
-				break
-			}
-			if n == -_EINTR {
-				continue
-			}
-			println("runtime: netpollBreak write failed with", -n)
-			throw("runtime: netpollBreak write failed")
+	// Failing to cas indicates there is an in-flight wakeup, so we're done here.
+	if !netpollWakeSig.CompareAndSwap(0, 1) {
+		return
+	}
+
+	for {
+		var b byte
+		n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
+		if n == 1 || n == -_EAGAIN {
+			break
 		}
+		if n == -_EINTR {
+			continue
+		}
+		println("runtime: netpollBreak write failed with", -n)
+		throw("runtime: netpollBreak write failed")
 	}
 }
 
@@ -152,7 +155,7 @@
 				// if blocking.
 				var tmp [16]byte
 				read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp)))
-				atomic.Store(&netpollWakeSig, 0)
+				netpollWakeSig.Store(0)
 			}
 			continue
 		}
diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go
index 6e545b3..d835cd9 100644
--- a/src/runtime/netpoll_solaris.go
+++ b/src/runtime/netpoll_solaris.go
@@ -88,7 +88,7 @@
 	libc_port_dissociate,
 	libc_port_getn,
 	libc_port_alert libcFunc
-	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+	netpollWakeSig atomic.Uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func errno() int32 {
@@ -191,17 +191,20 @@
 
 // netpollBreak interrupts a port_getn wait.
 func netpollBreak() {
-	if atomic.Cas(&netpollWakeSig, 0, 1) {
-		// Use port_alert to put portfd into alert mode.
-		// This will wake up all threads sleeping in port_getn on portfd,
-		// and cause their calls to port_getn to return immediately.
-		// Further, until portfd is taken out of alert mode,
-		// all calls to port_getn will return immediately.
-		if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 {
-			if e := errno(); e != _EBUSY {
-				println("runtime: port_alert failed with", e)
-				throw("runtime: netpoll: port_alert failed")
-			}
+	// Failing to cas indicates there is an in-flight wakeup, so we're done here.
+	if !netpollWakeSig.CompareAndSwap(0, 1) {
+		return
+	}
+
+	// Use port_alert to put portfd into alert mode.
+	// This will wake up all threads sleeping in port_getn on portfd,
+	// and cause their calls to port_getn to return immediately.
+	// Further, until portfd is taken out of alert mode,
+	// all calls to port_getn will return immediately.
+	if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 {
+		if e := errno(); e != _EBUSY {
+			println("runtime: port_alert failed with", e)
+			throw("runtime: netpoll: port_alert failed")
 		}
 	}
 }
@@ -274,7 +277,7 @@
 					println("runtime: port_alert failed with", e)
 					throw("runtime: netpoll: port_alert failed")
 				}
-				atomic.Store(&netpollWakeSig, 0)
+				netpollWakeSig.Store(0)
 			}
 			continue
 		}
diff --git a/src/runtime/netpoll_stub.go b/src/runtime/netpoll_stub.go
index d0a63bc..14cf0c3 100644
--- a/src/runtime/netpoll_stub.go
+++ b/src/runtime/netpoll_stub.go
@@ -8,8 +8,8 @@
 
 import "runtime/internal/atomic"
 
-var netpollInited uint32
-var netpollWaiters uint32
+var netpollInited atomic.Uint32
+var netpollWaiters atomic.Uint32
 
 var netpollStubLock mutex
 var netpollNote note
@@ -19,7 +19,7 @@
 var netpollBroken bool
 
 func netpollGenericInit() {
-	atomic.Store(&netpollInited, 1)
+	netpollInited.Store(1)
 }
 
 func netpollBreak() {
@@ -57,5 +57,5 @@
 }
 
 func netpollinited() bool {
-	return atomic.Load(&netpollInited) != 0
+	return netpollInited.Load() != 0
 }
diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go
index 4c1cd26..796bf1d 100644
--- a/src/runtime/netpoll_windows.go
+++ b/src/runtime/netpoll_windows.go
@@ -35,7 +35,7 @@
 var (
 	iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle
 
-	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+	netpollWakeSig atomic.Uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
@@ -67,11 +67,14 @@
 }
 
 func netpollBreak() {
-	if atomic.Cas(&netpollWakeSig, 0, 1) {
-		if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 {
-			println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")")
-			throw("runtime: netpoll: PostQueuedCompletionStatus failed")
-		}
+	// Failing to cas indicates there is an in-flight wakeup, so we're done here.
+	if !netpollWakeSig.CompareAndSwap(0, 1) {
+		return
+	}
+
+	if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 {
+		println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")")
+		throw("runtime: netpoll: PostQueuedCompletionStatus failed")
 	}
 }
 
@@ -133,7 +136,7 @@
 			}
 			handlecompletion(&toRun, op, errno, qty)
 		} else {
-			atomic.Store(&netpollWakeSig, 0)
+			netpollWakeSig.Store(0)
 			if delay == 0 {
 				// Forward the notification to the
 				// blocked poller.
diff --git a/src/runtime/os2_aix.go b/src/runtime/os2_aix.go
index 9ad1caa..2efc565 100644
--- a/src/runtime/os2_aix.go
+++ b/src/runtime/os2_aix.go
@@ -388,11 +388,11 @@
 
 //go:nosplit
 func exit(code int32) {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		syscall1(&libc_exit, uintptr(code))
 		return
 	}
@@ -403,11 +403,11 @@
 
 //go:nosplit
 func write1(fd uintptr, p unsafe.Pointer, n int32) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		r, errno := syscall3(&libc_write, uintptr(fd), uintptr(p), uintptr(n))
 		if int32(r) < 0 {
 			return -int32(errno)
@@ -493,11 +493,11 @@
 
 //go:nosplit
 func sigaction(sig uintptr, new, old *sigactiont) {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// runtime.libpreinit.
-	if _g_ != nil {
+	if gp != nil {
 		r, err := syscall3(&libc_sigaction, sig, uintptr(unsafe.Pointer(new)), uintptr(unsafe.Pointer(old)))
 		if int32(r) == -1 {
 			println("Sigaction failed for sig: ", sig, " with error:", hex(err))
@@ -645,11 +645,11 @@
 
 //go:nosplit
 func pthread_attr_init(attr *pthread_attr) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		r, _ := syscall1(&libpthread_attr_init, uintptr(unsafe.Pointer(attr)))
 		return int32(r)
 	}
@@ -661,11 +661,11 @@
 
 //go:nosplit
 func pthread_attr_setdetachstate(attr *pthread_attr, state int32) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		r, _ := syscall2(&libpthread_attr_setdetachstate, uintptr(unsafe.Pointer(attr)), uintptr(state))
 		return int32(r)
 	}
@@ -689,11 +689,11 @@
 
 //go:nosplit
 func pthread_attr_setstacksize(attr *pthread_attr, size uint64) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		r, _ := syscall2(&libpthread_attr_setstacksize, uintptr(unsafe.Pointer(attr)), uintptr(size))
 		return int32(r)
 	}
@@ -705,11 +705,11 @@
 
 //go:nosplit
 func pthread_create(tid *pthread, attr *pthread_attr, fn *funcDescriptor, arg unsafe.Pointer) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
-	if _g_ != nil {
+	if gp != nil {
 		r, _ := syscall4(&libpthread_create, uintptr(unsafe.Pointer(tid)), uintptr(unsafe.Pointer(attr)), uintptr(unsafe.Pointer(fn)), uintptr(arg))
 		return int32(r)
 	}
@@ -723,11 +723,11 @@
 
 //go:nosplit
 func sigprocmask(how int32, new, old *sigset) {
-	_g_ := getg()
+	gp := getg()
 
 	// Check the validity of m because it might be called during a cgo
 	// callback early enough where m isn't available yet.
-	if _g_ != nil && _g_.m != nil {
+	if gp != nil && gp.m != nil {
 		r, err := syscall3(&libpthread_sigthreadmask, uintptr(how), uintptr(unsafe.Pointer(new)), uintptr(unsafe.Pointer(old)))
 		if int32(r) != 0 {
 			println("syscall sigthreadmask failed: ", hex(err))
diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go
index e901b3e..8c9cbe2 100644
--- a/src/runtime/os3_plan9.go
+++ b/src/runtime/os3_plan9.go
@@ -14,7 +14,9 @@
 //
 //go:nowritebarrierrec
 func sighandler(_ureg *ureg, note *byte, gp *g) int {
-	_g_ := getg()
+	gsignal := getg()
+	mp := gsignal.m
+
 	var t sigTabT
 	var docrash bool
 	var sig int
@@ -61,7 +63,7 @@
 	if flags&_SigPanic != 0 {
 		// Copy the error string from sigtramp's stack into m->notesig so
 		// we can reliably access it from the panic routines.
-		memmove(unsafe.Pointer(_g_.m.notesig), unsafe.Pointer(note), uintptr(len(notestr)+1))
+		memmove(unsafe.Pointer(mp.notesig), unsafe.Pointer(note), uintptr(len(notestr)+1))
 		gp.sig = uint32(sig)
 		gp.sigpc = c.pc()
 
@@ -120,8 +122,8 @@
 		return _NCONT
 	}
 Throw:
-	_g_.m.throwing = throwTypeRuntime
-	_g_.m.caughtsig.set(gp)
+	mp.throwing = throwTypeRuntime
+	mp.caughtsig.set(gp)
 	startpanic_m()
 	print(notestr, "\n")
 	print("PC=", hex(c.pc()), "\n")
diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go
index 8c85b71..ffac4b6 100644
--- a/src/runtime/os3_solaris.go
+++ b/src/runtime/os3_solaris.go
@@ -7,6 +7,7 @@
 import (
 	"internal/abi"
 	"internal/goarch"
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -171,18 +172,20 @@
 	// Disable signals during create, so that the new thread starts
 	// with signals disabled. It will enable them in minit.
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	ret = pthread_create(&tid, &attr, abi.FuncPCABI0(tstart_sysvicall), unsafe.Pointer(mp))
+	ret = retryOnEAGAIN(func() int32 {
+		return pthread_create(&tid, &attr, abi.FuncPCABI0(tstart_sysvicall), unsafe.Pointer(mp))
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if ret != 0 {
 		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
-		if ret == -_EAGAIN {
+		if ret == _EAGAIN {
 			println("runtime: may need to increase max user processes (ulimit -u)")
 		}
 		throw("newosproc")
 	}
 }
 
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
 	// We should never reach exitThread on Solaris because we let
 	// libc clean up threads.
 	throw("exitThread")
@@ -267,7 +270,7 @@
 	return *((*uintptr)(unsafe.Pointer(&sa._funcptr)))
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
@@ -308,18 +311,17 @@
 	}
 
 	var sem *semt
-	_g_ := getg()
 
 	// Call libc's malloc rather than malloc. This will
 	// allocate space on the C heap. We can't call malloc
 	// here because it could cause a deadlock.
-	_g_.m.libcall.fn = uintptr(unsafe.Pointer(&libc_malloc))
-	_g_.m.libcall.n = 1
-	_g_.m.scratch = mscratch{}
-	_g_.m.scratch.v[0] = unsafe.Sizeof(*sem)
-	_g_.m.libcall.args = uintptr(unsafe.Pointer(&_g_.m.scratch))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6x), unsafe.Pointer(&_g_.m.libcall))
-	sem = (*semt)(unsafe.Pointer(_g_.m.libcall.r1))
+	mp.libcall.fn = uintptr(unsafe.Pointer(&libc_malloc))
+	mp.libcall.n = 1
+	mp.scratch = mscratch{}
+	mp.scratch.v[0] = unsafe.Sizeof(*sem)
+	mp.libcall.args = uintptr(unsafe.Pointer(&mp.scratch))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6x), unsafe.Pointer(&mp.libcall))
+	sem = (*semt)(unsafe.Pointer(mp.libcall.r1))
 	if sem_init(sem, 0, 0) != 0 {
 		throw("sem_init")
 	}
diff --git a/src/runtime/os_aix.go b/src/runtime/os_aix.go
index 15e4929..e07c7f1 100644
--- a/src/runtime/os_aix.go
+++ b/src/runtime/os_aix.go
@@ -8,6 +8,7 @@
 
 import (
 	"internal/abi"
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -110,17 +111,17 @@
 	)
 
 	if pthread_attr_init(&attr) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
 	if pthread_attr_setstacksize(&attr, threadStackSize) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
@@ -139,14 +140,12 @@
 	}
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if ret != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
 }
 
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
 // Called to do synchronous initialization of Go code built with
 // -buildmode=c-archive or -buildmode=c-shared.
 // None of the Go runtime is initialized.
@@ -164,7 +163,7 @@
 }
 
 // errno address must be retrieved by calling _Errno libc function.
-// This will return a pointer to errno
+// This will return a pointer to errno.
 func miniterrno() {
 	mp := getg().m
 	r, _ := syscall0(&libc__Errno)
@@ -212,16 +211,9 @@
 	// Disable signals during create, so that the new thread starts
 	// with signals disabled. It will enable them in minit.
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	var ret int32
-	for tries := 0; tries < 20; tries++ {
-		// pthread_create can fail with EAGAIN for no reasons
-		// but it will be ok if it retries.
-		ret = pthread_create(&tid, &attr, &tstart, unsafe.Pointer(mp))
-		if ret != _EAGAIN {
-			break
-		}
-		usleep(uint32(tries+1) * 1000) // Milliseconds.
-	}
+	ret := retryOnEAGAIN(func() int32 {
+		return pthread_create(&tid, &attr, &tstart, unsafe.Pointer(mp))
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if ret != 0 {
 		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
@@ -233,7 +225,7 @@
 
 }
 
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
 	// We should never reach exitThread on AIX because we let
 	// libc clean up threads.
 	throw("exitThread")
@@ -296,7 +288,7 @@
 	return sa.sa_handler
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
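
The hunk above swaps AIX's hand-rolled retry loop (20 attempts with a growing usleep) for the shared retryOnEAGAIN helper, whose definition is not part of this excerpt. Below is a hedged sketch of what such a helper plausibly looks like; the retry count and millisecond backoff are taken from the removed loop, and time.Sleep stands in for the runtime's usleep.

```go
package main

import (
	"fmt"
	"syscall"
	"time"
)

// retryOnEAGAIN retries fn until it returns an errno other than EAGAIN or the
// retry budget is exhausted. fn returns 0 on success, an errno value otherwise.
func retryOnEAGAIN(fn func() int32) int32 {
	const tries = 20
	var errno int32
	for i := 0; i < tries; i++ {
		errno = fn()
		if errno != int32(syscall.EAGAIN) {
			return errno
		}
		// Back off a little longer on each attempt (milliseconds).
		time.Sleep(time.Duration(i+1) * time.Millisecond)
	}
	return errno
}

func main() {
	attempts := 0
	errno := retryOnEAGAIN(func() int32 {
		attempts++
		if attempts < 3 {
			return int32(syscall.EAGAIN) // pretend thread creation is throttled
		}
		return 0
	})
	fmt.Println("errno:", errno, "attempts:", attempts) // errno: 0 attempts: 3
}
```

pthread_create and the clone/thr_new/tfork/lwp_create wrappers in this patch can all fail transiently with EAGAIN when the system is near its thread limit, which is why the retries are centralized in one helper.
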
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 8562d7d..c4f3bb6 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -136,6 +136,8 @@
 
 	ncpu = getncpu()
 	physPageSize = getPageSize()
+
+	osinit_hack()
 }
 
 func sysctlbynameInt32(name []byte) (int32, int32) {
@@ -208,21 +210,21 @@
 	var err int32
 	err = pthread_attr_init(&attr)
 	if err != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
 	// Find out OS stack size for our own stack guard.
 	var stacksize uintptr
 	if pthread_attr_getstacksize(&attr, &stacksize) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 	mp.g0.stack.hi = stacksize // for mstart
 
 	// Tell the pthread library we won't join with this thread.
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
@@ -230,10 +232,12 @@
 	// setup and then calls mstart.
 	var oset sigset
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	err = pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp))
+	err = retryOnEAGAIN(func() int32 {
+		return pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp))
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if err != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 }
@@ -253,7 +257,7 @@
 	var err int32
 	err = pthread_attr_init(&attr)
 	if err != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
@@ -263,7 +267,7 @@
 	// we use the OS default stack size instead of the suggestion.
 	// Find out that stack size for our own stack guard.
 	if pthread_attr_getstacksize(&attr, &stacksize) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 	g0.stack.hi = stacksize // for mstart
@@ -271,7 +275,7 @@
 
 	// Tell the pthread library we won't join with this thread.
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
@@ -282,14 +286,11 @@
 	err = pthread_create(&attr, fn, nil)
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if err != 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 }
 
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
 // Called to do synchronous initialization of Go code built with
 // -buildmode=c-archive or -buildmode=c-shared.
 // None of the Go runtime is initialized.
@@ -412,7 +413,7 @@
 	return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u))
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go
index 8347814..e467578 100644
--- a/src/runtime/os_dragonfly.go
+++ b/src/runtime/os_dragonfly.go
@@ -162,7 +162,10 @@
 	}
 
 	// TODO: Check for error.
-	lwp_create(&params)
+	retryOnEAGAIN(func() int32 {
+		lwp_create(&params)
+		return 0
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 }
 
@@ -248,7 +251,7 @@
 	return sa.sa_sigaction
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go
index 23efd1a..f53cb11 100644
--- a/src/runtime/os_freebsd.go
+++ b/src/runtime/os_freebsd.go
@@ -213,10 +213,14 @@
 
 	var oset sigset
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	ret := thr_new(&param, int32(unsafe.Sizeof(param)))
+	ret := retryOnEAGAIN(func() int32 {
+		errno := thr_new(&param, int32(unsafe.Sizeof(param)))
+		// thr_new returns negative errno
+		return -errno
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
+	if ret != 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
 		throw("newosproc")
 	}
 }
@@ -227,7 +231,7 @@
 func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
 	stack := sysAlloc(stacksize, &memstats.stacks_sys)
 	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		writeErrStr(failallocatestack)
 		exit(1)
 	}
 	// This code "knows" it's being called once from the library
@@ -252,14 +256,11 @@
 	ret := thr_new(&param, int32(unsafe.Sizeof(param)))
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if ret < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 }
 
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
 // Called to do synchronous initialization of Go code built with
 // -buildmode=c-archive or -buildmode=c-shared.
 // None of the Go runtime is initialized.
@@ -362,7 +363,7 @@
 	return sa.sa_handler
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
diff --git a/src/runtime/os_freebsd_riscv64.go b/src/runtime/os_freebsd_riscv64.go
new file mode 100644
index 0000000..0f2ed50
--- /dev/null
+++ b/src/runtime/os_freebsd_riscv64.go
@@ -0,0 +1,7 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+func osArchInit() {}
diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go
index 34cc027..7481fb9 100644
--- a/src/runtime/os_js.go
+++ b/src/runtime/os_js.go
@@ -7,6 +7,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -35,7 +36,7 @@
 	usleep(usec)
 }
 
-func exitThread(wait *uint32)
+func exitThread(wait *atomic.Uint32)
 
 type mOS struct{}
 
@@ -49,13 +50,13 @@
 const _SIGSEGV = 0xb
 
 func sigpanic() {
-	g := getg()
-	if !canpanic(g) {
+	gp := getg()
+	if !canpanic() {
 		throw("unexpected signal during runtime execution")
 	}
 
 	// js only invokes the exception handler for memory faults.
-	g.sig = _SIGSEGV
+	gp.sig = _SIGSEGV
 	panicmem()
 }
 
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index 25aea65..3ad1e3b 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -21,12 +21,12 @@
 	// profileTimer holds the ID of the POSIX interval timer for profiling CPU
 	// usage on this thread.
 	//
-	// It is valid when the profileTimerValid field is non-zero. A thread
+	// It is valid when the profileTimerValid field is true. A thread
 	// creates and manages its own timer, and these fields are read and written
 	// only by this thread. But because some of the reads on profileTimerValid
-	// are in signal handling code, access to that field uses atomic operations.
+	// are in signal handling code, this field uses an atomic type.
 	profileTimer      int32
-	profileTimerValid uint32
+	profileTimerValid atomic.Bool
 
 	// needPerThreadSyscall indicates that a per-thread syscall is required
 	// for doAllThreadsSyscall.
@@ -176,12 +176,20 @@
 	// with signals disabled. It will enable them in minit.
 	var oset sigset
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(abi.FuncPCABI0(mstart)))
+	ret := retryOnEAGAIN(func() int32 {
+		r := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(abi.FuncPCABI0(mstart)))
+		// clone returns positive TID, negative errno.
+		// We don't care about the TID.
+		if r >= 0 {
+			return 0
+		}
+		return -r
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
-		if ret == -_EAGAIN {
+	if ret != 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
+		if ret == _EAGAIN {
 			println("runtime: may need to increase max user processes (ulimit -u)")
 		}
 		throw("newosproc")
@@ -194,19 +202,16 @@
 func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
 	stack := sysAlloc(stacksize, &memstats.stacks_sys)
 	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		writeErrStr(failallocatestack)
 		exit(1)
 	}
 	ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
 	if ret < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 }
 
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
 const (
 	_AT_NULL   = 0  // End of vector
 	_AT_PAGESZ = 6  // System physical page size
@@ -504,7 +509,7 @@
 	return sa.sa_handler
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
@@ -593,7 +598,7 @@
 
 	// Having an M means the thread interacts with the Go scheduler, and we can
 	// check whether there's an active per-thread timer for this thread.
-	if atomic.Load(&mp.profileTimerValid) != 0 {
+	if mp.profileTimerValid.Load() {
 		// If this M has its own per-thread CPU profiling interval timer, we
 		// should track the SIGPROF signals that come from that timer (for
 		// accurate reporting of its CPU usage; see issue 35057) and ignore any
@@ -619,9 +624,9 @@
 	}
 
 	// destroy any active timer
-	if atomic.Load(&mp.profileTimerValid) != 0 {
+	if mp.profileTimerValid.Load() {
 		timerid := mp.profileTimer
-		atomic.Store(&mp.profileTimerValid, 0)
+		mp.profileTimerValid.Store(false)
 		mp.profileTimer = 0
 
 		ret := timer_delete(timerid)
@@ -681,7 +686,7 @@
 	}
 
 	mp.profileTimer = timerid
-	atomic.Store(&mp.profileTimerValid, 1)
+	mp.profileTimerValid.Store(true)
 }
 
 // perThreadSyscallArgs contains the system call number, arguments, and
@@ -880,9 +885,23 @@
 	}
 	if errno != 0 || r1 != args.r1 || r2 != args.r2 {
 		print("trap:", args.trap, ", a123456=[", args.a1, ",", args.a2, ",", args.a3, ",", args.a4, ",", args.a5, ",", args.a6, "]\n")
-		print("results: got {r1=", r1, ",r2=", r2, ",errno=", errno, "}, want {r1=", args.r1, ",r2=", args.r2, ",errno=0\n")
+		print("results: got {r1=", r1, ",r2=", r2, ",errno=", errno, "}, want {r1=", args.r1, ",r2=", args.r2, ",errno=0}\n")
 		fatal("AllThreadsSyscall6 results differ between threads; runtime corrupted")
 	}
 
 	gp.m.needPerThreadSyscall.Store(0)
 }
+
+const (
+	_SI_USER  = 0
+	_SI_TKILL = -6
+)
+
+// sigFromUser reports whether the signal was sent because of a call
+// to kill or tgkill.
+//
+//go:nosplit
+func (c *sigctxt) sigFromUser() bool {
+	code := int32(c.sigcode())
+	return code == _SI_USER || code == _SI_TKILL
+}
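
profileTimerValid above is one of many flags this patch migrates from a bare uint32 plus atomic.Load/Store/Cas calls to a typed atomic field. A before/after sketch using the exported sync/atomic package as a stand-in for runtime/internal/atomic; the timerState types are invented.

```go
package main

import (
	"fmt"
	"sync/atomic"
)

// Old style: a bare uint32 plus package-level atomic functions,
// with 0/1 standing in for false/true.
type timerStateOld struct {
	valid uint32
}

func (t *timerStateOld) isValid() bool { return atomic.LoadUint32(&t.valid) != 0 }
func (t *timerStateOld) setValid()     { atomic.StoreUint32(&t.valid, 1) }

// New style: the field's type says it is a boolean with atomic access.
type timerStateNew struct {
	valid atomic.Bool
}

func main() {
	var a timerStateOld
	a.setValid()
	var b timerStateNew
	b.valid.Store(true)
	fmt.Println(a.isValid(), b.valid.Load()) // true true
}
```

With the typed field, accidental plain (non-atomic) reads and writes are no longer possible; every access goes through Load and Store.
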
diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go
index b590da7..bd3ab44 100644
--- a/src/runtime/os_linux_arm.go
+++ b/src/runtime/os_linux_arm.go
@@ -11,6 +11,8 @@
 	_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
 )
 
+func vdsoCall()
+
 func checkgoarm() {
 	// On Android, /proc/self/auxv might be unreadable and hwcap won't
 	// reflect the CPU capabilities. Assume that every Android arm device
diff --git a/src/runtime/os_linux_be64.go b/src/runtime/os_linux_be64.go
index 537515f..d8d4ac2 100644
--- a/src/runtime/os_linux_be64.go
+++ b/src/runtime/os_linux_be64.go
@@ -11,7 +11,6 @@
 const (
 	_SS_DISABLE  = 2
 	_NSIG        = 65
-	_SI_USER     = 0
 	_SIG_BLOCK   = 0
 	_SIG_UNBLOCK = 1
 	_SIG_SETMASK = 2
diff --git a/src/runtime/os_linux_generic.go b/src/runtime/os_linux_generic.go
index bed9e66..15fafc1 100644
--- a/src/runtime/os_linux_generic.go
+++ b/src/runtime/os_linux_generic.go
@@ -9,7 +9,6 @@
 const (
 	_SS_DISABLE  = 2
 	_NSIG        = 65
-	_SI_USER     = 0
 	_SIG_BLOCK   = 0
 	_SIG_UNBLOCK = 1
 	_SIG_SETMASK = 2
diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go
index 188db01..11d35bc 100644
--- a/src/runtime/os_linux_mips64x.go
+++ b/src/runtime/os_linux_mips64x.go
@@ -27,7 +27,6 @@
 const (
 	_SS_DISABLE  = 2
 	_NSIG        = 129
-	_SI_USER     = 0
 	_SIG_BLOCK   = 1
 	_SIG_UNBLOCK = 2
 	_SIG_SETMASK = 3
diff --git a/src/runtime/os_linux_mipsx.go b/src/runtime/os_linux_mipsx.go
index 73016f8..cdf83ff 100644
--- a/src/runtime/os_linux_mipsx.go
+++ b/src/runtime/os_linux_mipsx.go
@@ -21,7 +21,6 @@
 const (
 	_SS_DISABLE  = 2
 	_NSIG        = 128 + 1
-	_SI_USER     = 0
 	_SIG_BLOCK   = 1
 	_SIG_UNBLOCK = 2
 	_SIG_SETMASK = 3
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index 3cbace3..ce59618 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -152,16 +152,16 @@
 
 //go:nosplit
 func semasleep(ns int64) int32 {
-	_g_ := getg()
+	gp := getg()
 	var deadline int64
 	if ns >= 0 {
 		deadline = nanotime() + ns
 	}
 
 	for {
-		v := atomic.Load(&_g_.m.waitsemacount)
+		v := atomic.Load(&gp.m.waitsemacount)
 		if v > 0 {
-			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
+			if atomic.Cas(&gp.m.waitsemacount, v, v-1) {
 				return 0 // semaphore acquired
 			}
 			continue
@@ -178,7 +178,7 @@
 			ts.setNsec(wait)
 			tsp = &ts
 		}
-		ret := lwp_park(_CLOCK_MONOTONIC, _TIMER_RELTIME, tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil)
+		ret := lwp_park(_CLOCK_MONOTONIC, _TIMER_RELTIME, tsp, 0, unsafe.Pointer(&gp.m.waitsemacount), nil)
 		if ret == _ETIMEDOUT {
 			return -1
 		}
@@ -227,11 +227,15 @@
 
 	lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, abi.FuncPCABI0(netbsdMstart))
 
-	ret := lwp_create(unsafe.Pointer(&uc), _LWP_DETACHED, unsafe.Pointer(&mp.procid))
+	ret := retryOnEAGAIN(func() int32 {
+		errno := lwp_create(unsafe.Pointer(&uc), _LWP_DETACHED, unsafe.Pointer(&mp.procid))
+		// lwp_create returns negative errno
+		return -errno
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
-		if ret == -_EAGAIN {
+	if ret != 0 {
+		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", ret, ")\n")
+		if ret == _EAGAIN {
 			println("runtime: may need to increase max user processes (ulimit -p)")
 		}
 		throw("runtime.newosproc")
@@ -289,8 +293,8 @@
 // Called to initialize a new m (including the bootstrap m).
 // Called on the new thread, cannot allocate memory.
 func minit() {
-	_g_ := getg()
-	_g_.m.procid = uint64(lwp_self())
+	gp := getg()
+	gp.m.procid = uint64(lwp_self())
 
 	// On NetBSD a thread created by pthread_create inherits the
 	// signal stack of the creating thread. We always create a
@@ -299,8 +303,8 @@
 	// created in C that calls sigaltstack and then calls a Go
 	// function, because we will lose track of the C code's
 	// sigaltstack, but it's the best we can do.
-	signalstack(&_g_.m.gsignal.stack)
-	_g_.m.newSigstack = true
+	signalstack(&gp.m.gsignal.stack)
+	gp.m.newSigstack = true
 
 	minitSignalMask()
 }
@@ -352,7 +356,7 @@
 	return sa.sa_sigaction
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go
index 2383dc8..500286a 100644
--- a/src/runtime/os_openbsd.go
+++ b/src/runtime/os_openbsd.go
@@ -51,6 +51,21 @@
 	return out, true
 }
 
+func sysctlUint64(mib []uint32) (uint64, bool) {
+	var out uint64
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], uint32(len(mib)), (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret < 0 {
+		return 0, false
+	}
+	return out, true
+}
+
+//go:linkname internal_cpu_sysctlUint64 internal/cpu.sysctlUint64
+func internal_cpu_sysctlUint64(mib []uint32) (uint64, bool) {
+	return sysctlUint64(mib)
+}
+
 func getncpu() int32 {
 	// Try hw.ncpuonline first because hw.ncpu would report a number twice as
 	// high as the actual CPUs running on OpenBSD 6.4 with hyperthreading
@@ -84,7 +99,7 @@
 
 //go:nosplit
 func semasleep(ns int64) int32 {
-	_g_ := getg()
+	gp := getg()
 
 	// Compute sleep deadline.
 	var tsp *timespec
@@ -95,9 +110,9 @@
 	}
 
 	for {
-		v := atomic.Load(&_g_.m.waitsemacount)
+		v := atomic.Load(&gp.m.waitsemacount)
 		if v > 0 {
-			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
+			if atomic.Cas(&gp.m.waitsemacount, v, v-1) {
 				return 0 // semaphore acquired
 			}
 			continue
@@ -110,7 +125,7 @@
 		// be examined [...] immediately before blocking. If that int
 		// is non-zero then __thrsleep() will immediately return EINTR
 		// without blocking."
-		ret := thrsleep(uintptr(unsafe.Pointer(&_g_.m.waitsemacount)), _CLOCK_MONOTONIC, tsp, 0, &_g_.m.waitsemacount)
+		ret := thrsleep(uintptr(unsafe.Pointer(&gp.m.waitsemacount)), _CLOCK_MONOTONIC, tsp, 0, &gp.m.waitsemacount)
 		if ret == _EWOULDBLOCK {
 			return -1
 		}
@@ -214,7 +229,7 @@
 	return sa.sa_sigaction
 }
 
-// setSignaltstackSP sets the ss_sp field of a stackt.
+// setSignalstackSP sets the ss_sp field of a stackt.
 //
 //go:nosplit
 func setSignalstackSP(s *stackt, sp uintptr) {
diff --git a/src/runtime/os_openbsd_libc.go b/src/runtime/os_openbsd_libc.go
index 4ad2a06..201f162 100644
--- a/src/runtime/os_openbsd_libc.go
+++ b/src/runtime/os_openbsd_libc.go
@@ -11,8 +11,6 @@
 	"unsafe"
 )
 
-var failThreadCreate = []byte("runtime: failed to create new OS thread\n")
-
 // mstart_stub provides glue code to call mstart from pthread_create.
 func mstart_stub()
 
@@ -27,21 +25,21 @@
 	// Initialize an attribute object.
 	var attr pthreadattr
 	if err := pthread_attr_init(&attr); err != 0 {
-		write(2, unsafe.Pointer(&failThreadCreate[0]), int32(len(failThreadCreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
 	// Find out OS stack size for our own stack guard.
 	var stacksize uintptr
 	if pthread_attr_getstacksize(&attr, &stacksize) != 0 {
-		write(2, unsafe.Pointer(&failThreadCreate[0]), int32(len(failThreadCreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 	mp.g0.stack.hi = stacksize // for mstart
 
 	// Tell the pthread library we won't join with this thread.
 	if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
-		write(2, unsafe.Pointer(&failThreadCreate[0]), int32(len(failThreadCreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
@@ -49,10 +47,12 @@
 	// setup and then calls mstart.
 	var oset sigset
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	err := pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp))
+	err := retryOnEAGAIN(func() int32 {
+		return pthread_create(&attr, abi.FuncPCABI0(mstart_stub), unsafe.Pointer(mp))
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if err != 0 {
-		write(2, unsafe.Pointer(&failThreadCreate[0]), int32(len(failThreadCreate)))
+		writeErrStr(failthreadcreate)
 		exit(1)
 	}
 
diff --git a/src/runtime/os_openbsd_syscall.go b/src/runtime/os_openbsd_syscall.go
index 9d67a7e..d784f76 100644
--- a/src/runtime/os_openbsd_syscall.go
+++ b/src/runtime/os_openbsd_syscall.go
@@ -34,12 +34,16 @@
 
 	var oset sigset
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	ret := tfork(&param, unsafe.Sizeof(param), mp, mp.g0, abi.FuncPCABI0(mstart))
+	ret := retryOnEAGAIN(func() int32 {
+		errno := tfork(&param, unsafe.Sizeof(param), mp, mp.g0, abi.FuncPCABI0(mstart))
+		// tfork returns negative errno
+		return -errno
+	})
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
-		if ret == -_EAGAIN {
+	if ret != 0 {
+		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", ret, ")\n")
+		if ret == _EAGAIN {
 			println("runtime: may need to increase max user processes (ulimit -p)")
 		}
 		throw("runtime.newosproc")
diff --git a/src/runtime/os_openbsd_syscall2.go b/src/runtime/os_openbsd_syscall2.go
index e4c9d2f..ebf478b 100644
--- a/src/runtime/os_openbsd_syscall2.go
+++ b/src/runtime/os_openbsd_syscall2.go
@@ -7,6 +7,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -37,7 +38,7 @@
 	usleep(usec)
 }
 
-// write calls the write system call.
+// write1 calls the write system call.
 // It returns a non-negative number of bytes written or a negative errno value.
 //
 //go:noescape
@@ -46,14 +47,14 @@
 //go:noescape
 func open(name *byte, mode, perm int32) int32
 
-// return value is only set on linux to be used in osinit()
+// The return value is only set on Linux, to be used in osinit().
 func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
 
-// exitThread terminates the current thread, writing *wait = 0 when
+// exitThread terminates the current thread, writing *wait = freeMStack when
 // the stack is safe to reclaim.
 //
 //go:noescape
-func exitThread(wait *uint32)
+func exitThread(wait *atomic.Uint32)
 
 //go:noescape
 func obsdsigprocmask(how int32, new sigset) sigset
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index f0e7c6a..5e5a63d 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -75,13 +75,13 @@
 }
 
 func sigpanic() {
-	g := getg()
-	if !canpanic(g) {
+	gp := getg()
+	if !canpanic() {
 		throw("unexpected signal during runtime execution")
 	}
 
-	note := gostringnocopy((*byte)(unsafe.Pointer(g.m.notesig)))
-	switch g.sig {
+	note := gostringnocopy((*byte)(unsafe.Pointer(gp.m.notesig)))
+	switch gp.sig {
 	case _SIGRFAULT, _SIGWFAULT:
 		i := indexNoFloat(note, "addr=")
 		if i >= 0 {
@@ -92,17 +92,24 @@
 			panicmem()
 		}
 		addr := note[i:]
-		g.sigcode1 = uintptr(atolwhex(addr))
-		if g.sigcode1 < 0x1000 {
+		gp.sigcode1 = uintptr(atolwhex(addr))
+		if gp.sigcode1 < 0x1000 {
 			panicmem()
 		}
-		if g.paniconfault {
-			panicmemAddr(g.sigcode1)
+		if gp.paniconfault {
+			panicmemAddr(gp.sigcode1)
 		}
-		print("unexpected fault address ", hex(g.sigcode1), "\n")
+		if inUserArenaChunk(gp.sigcode1) {
+			// We could check that the arena chunk is explicitly set to fault,
+			// but the fact that we faulted on accessing it is enough to prove
+			// that it is.
+			print("accessed data from freed user arena ", hex(gp.sigcode1), "\n")
+		} else {
+			print("unexpected fault address ", hex(gp.sigcode1), "\n")
+		}
 		throw("fault")
 	case _SIGTRAP:
-		if g.paniconfault {
+		if gp.paniconfault {
 			panicmem()
 		}
 		throw(note)
@@ -461,7 +468,7 @@
 	}
 }
 
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
 	// We should never reach exitThread on Plan 9 because we let
 	// the OS clean up threads.
 	throw("exitThread")
@@ -473,19 +480,19 @@
 
 //go:nosplit
 func semasleep(ns int64) int {
-	_g_ := getg()
+	gp := getg()
 	if ns >= 0 {
 		ms := timediv(ns, 1000000, nil)
 		if ms == 0 {
 			ms = 1
 		}
-		ret := plan9_tsemacquire(&_g_.m.waitsemacount, ms)
+		ret := plan9_tsemacquire(&gp.m.waitsemacount, ms)
 		if ret == 1 {
 			return 0 // success
 		}
 		return -1 // timeout or interrupted
 	}
-	for plan9_semacquire(&_g_.m.waitsemacount, 1) < 0 {
+	for plan9_semacquire(&gp.m.waitsemacount, 1) < 0 {
 		// interrupted; try again (c.f. lock_sema.go)
 	}
 	return 0 // success
diff --git a/src/runtime/os_unix_nonlinux.go b/src/runtime/os_unix_nonlinux.go
new file mode 100644
index 0000000..b98753b
--- /dev/null
+++ b/src/runtime/os_unix_nonlinux.go
@@ -0,0 +1,15 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix && !linux
+
+package runtime
+
+// sigFromUser reports whether the signal was sent because of a call
+// to kill.
+//
+//go:nosplit
+func (c *sigctxt) sigFromUser() bool {
+	return c.sigcode() == _SI_USER
+}
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 2f6ec75..44718f1 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -941,7 +941,7 @@
 	throw("bad newosproc0")
 }
 
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
 	// We should never reach exitThread on Windows because we let
 	// the OS clean up threads.
 	throw("exitThread")
@@ -1326,7 +1326,7 @@
 	if !atomic.Cas(&mp.preemptExtLock, 0, 1) {
 		// External code is running. Fail the preemption
 		// attempt.
-		atomic.Xadd(&mp.preemptGen, 1)
+		mp.preemptGen.Add(1)
 		return
 	}
 
@@ -1336,7 +1336,7 @@
 		// The M hasn't been minit'd yet (or was just unminit'd).
 		unlock(&mp.threadLock)
 		atomic.Store(&mp.preemptExtLock, 0)
-		atomic.Xadd(&mp.preemptGen, 1)
+		mp.preemptGen.Add(1)
 		return
 	}
 	var thread uintptr
@@ -1366,7 +1366,7 @@
 		atomic.Store(&mp.preemptExtLock, 0)
 		// The thread no longer exists. This shouldn't be
 		// possible, but just acknowledge the request.
-		atomic.Xadd(&mp.preemptGen, 1)
+		mp.preemptGen.Add(1)
 		return
 	}
 
@@ -1431,7 +1431,7 @@
 	atomic.Store(&mp.preemptExtLock, 0)
 
 	// Acknowledge the preemption.
-	atomic.Xadd(&mp.preemptGen, 1)
+	mp.preemptGen.Add(1)
 
 	stdcall1(_ResumeThread, thread)
 	stdcall1(_CloseHandle, thread)
diff --git a/src/runtime/pagetrace_off.go b/src/runtime/pagetrace_off.go
new file mode 100644
index 0000000..10b44d4
--- /dev/null
+++ b/src/runtime/pagetrace_off.go
@@ -0,0 +1,28 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !goexperiment.pagetrace
+
+package runtime
+
+//go:systemstack
+func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
+}
+
+//go:systemstack
+func pageTraceFree(pp *p, now int64, base, npages uintptr) {
+}
+
+//go:systemstack
+func pageTraceScav(pp *p, now int64, base, npages uintptr) {
+}
+
+type pageTraceBuf struct {
+}
+
+func initPageTrace(env string) {
+}
+
+func finishPageTrace() {
+}
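
pagetrace_off.go and pagetrace_on.go are selected by the goexperiment.pagetrace build constraint, so the rest of the runtime can call pageTraceAlloc and friends unconditionally and get no-op stubs when the experiment is disabled. A minimal sketch of the same stub-versus-real pattern, shown as two files in one listing; it uses an ordinary custom build tag selected with -tags rather than a toolchain-managed GOEXPERIMENT tag, and the package, file, and tag names are invented.

```go
// file trace_off.go
//go:build !pagetracedemo

package tracedemo

// With the tag off, the hook compiles to an empty function that callers
// can invoke unconditionally.
func traceAlloc(base, npages uintptr) {}

// file trace_on.go
//go:build pagetracedemo

package tracedemo

import "fmt"

// With the tag on, the same call site emits a trace line.
func traceAlloc(base, npages uintptr) {
	fmt.Printf("alloc base=%#x npages=%d\n", base, npages)
}
```

Building with `go build -tags pagetracedemo` selects the real implementation; a default build gets the empty stub, so the calls stay cheap no-ops.
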
diff --git a/src/runtime/pagetrace_on.go b/src/runtime/pagetrace_on.go
new file mode 100644
index 0000000..0e621cb
--- /dev/null
+++ b/src/runtime/pagetrace_on.go
@@ -0,0 +1,358 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build goexperiment.pagetrace
+
+// Page tracer.
+//
+// This file contains an implementation of page trace instrumentation for tracking
+// the way the Go runtime manages pages of memory. The trace may be enabled at program
+// startup with the GODEBUG option pagetrace.
+//
+// Each page trace event is either 8 or 16 bytes wide. The first
+// 8 bytes follow this format for non-sync events:
+//
+//     [16 timestamp delta][35 base address][10 npages][1 isLarge][2 pageTraceEventType]
+//
+// If the "large" bit is set then the event is 16 bytes wide with the second 8 byte word
+// containing the full npages value (the npages bitfield is 0).
+//
+// The base address's bottom pageShift bits are always zero, which is why we can pack
+// other data in there. We ignore the top 16 bits, assuming a 48-bit address space for the
+// heap.
+//
+// The timestamp delta is computed from the difference between the current nanotime
+// timestamp and the last sync event's timestamp. The bottom pageTraceTimeLostBits of
+// this delta is removed and only the next pageTraceTimeDeltaBits are kept.
+//
+// A sync event is emitted at the beginning of each trace buffer and whenever the
+// timestamp delta would not fit in an event.
+//
+// Sync events have the following structure:
+//
+//    [61 timestamp or P ID][1 isPID][2 pageTraceSyncEvent]
+//
+// In essence, the "large" bit is repurposed to indicate whether it's a timestamp or a P ID
+// (these are typically uint32). Note that we only have 61 bits for the 64-bit timestamp,
+// but like for the delta we drop the bottom pageTraceTimeLostBits here as well.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// pageTraceAlloc records a page trace allocation event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceAlloc(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceAllocEvent)
+	}
+}
+
+// pageTraceFree records a page trace free event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceFree(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceFreeEvent)
+	}
+}
+
+// pageTraceScav records a page trace scavenge event.
+// pp may be nil. Call only if debug.pagetracefd != 0.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceScav(pp *p, now int64, base, npages uintptr) {
+	if pageTrace.enabled {
+		if now == 0 {
+			now = nanotime()
+		}
+		pageTraceEmit(pp, now, base, npages, pageTraceScavEvent)
+	}
+}
+
+// pageTraceEventType is a page trace event type.
+type pageTraceEventType uint8
+
+const (
+	pageTraceSyncEvent  pageTraceEventType = iota // Timestamp emission.
+	pageTraceAllocEvent                           // Allocation of pages.
+	pageTraceFreeEvent                            // Freeing pages.
+	pageTraceScavEvent                            // Scavenging pages.
+)
+
+// pageTraceEmit emits a page trace event.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func pageTraceEmit(pp *p, now int64, base, npages uintptr, typ pageTraceEventType) {
+	// Get a buffer.
+	var tbp *pageTraceBuf
+	pid := int32(-1)
+	if pp == nil {
+		// We have no P, so take the global buffer.
+		lock(&pageTrace.lock)
+		tbp = &pageTrace.buf
+	} else {
+		tbp = &pp.pageTraceBuf
+		pid = pp.id
+	}
+
+	// Initialize the buffer if necessary.
+	tb := *tbp
+	if tb.buf == nil {
+		tb.buf = (*pageTraceEvents)(sysAlloc(pageTraceBufSize, &memstats.other_sys))
+		tb = tb.writePid(pid)
+	}
+
+	// Handle timestamp and emit a sync event if necessary.
+	if now < tb.timeBase {
+		now = tb.timeBase
+	}
+	if now-tb.timeBase >= pageTraceTimeMaxDelta {
+		tb.timeBase = now
+		tb = tb.writeSync(pid)
+	}
+
+	// Emit the event.
+	tb = tb.writeEvent(pid, now, base, npages, typ)
+
+	// Write back the buffer.
+	*tbp = tb
+	if pp == nil {
+		unlock(&pageTrace.lock)
+	}
+}
+
+const (
+	pageTraceBufSize = 32 << 10
+
+	// These constants describe the per-event timestamp delta encoding.
+	pageTraceTimeLostBits  = 7  // How many bits of precision we lose in the delta.
+	pageTraceTimeDeltaBits = 16 // Size of the delta in bits.
+	pageTraceTimeMaxDelta  = 1 << (pageTraceTimeLostBits + pageTraceTimeDeltaBits)
+)
+
+// pageTraceEvents is the low-level buffer containing the trace data.
+type pageTraceEvents struct {
+	_      sys.NotInHeap
+	events [pageTraceBufSize / 8]uint64
+}
+
+// pageTraceBuf is a wrapper around pageTraceEvents that knows how to write events
+// to the buffer. It tracks state necessary to do so.
+type pageTraceBuf struct {
+	buf      *pageTraceEvents
+	len      int   // How many events have been written so far.
+	timeBase int64 // The current timestamp base from which deltas are produced.
+	finished bool  // Whether this trace buf should no longer flush anything out.
+}
+
+// writePid writes a P ID event indicating which P we're running on.
+//
+// Assumes there's always space in the buffer since this is only called at the
+// beginning of a new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writePid(pid int32) pageTraceBuf {
+	e := uint64(int64(pid))<<3 | 0b100 | uint64(pageTraceSyncEvent)
+	tb.buf.events[tb.len] = e
+	tb.len++
+	return tb
+}
+
+// writeSync writes a sync event, which is just a timestamp. Handles flushing.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeSync(pid int32) pageTraceBuf {
+	if tb.len+1 > len(tb.buf.events) {
+		// N.B. flush will writeSync again.
+		return tb.flush(pid, tb.timeBase)
+	}
+	e := ((uint64(tb.timeBase) >> pageTraceTimeLostBits) << 3) | uint64(pageTraceSyncEvent)
+	tb.buf.events[tb.len] = e
+	tb.len++
+	return tb
+}
+
+// writeEvent handles writing all non-sync and non-pid events. Handles flushing if necessary.
+//
+// pid indicates the P we're currently running on. Necessary in case we need to flush.
+// now is the current nanotime timestamp.
+// base is the base address of whatever group of pages this event is happening to.
+// npages is the length of the group of pages this event is happening to.
+// typ is the event that's happening to these pages.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) writeEvent(pid int32, now int64, base, npages uintptr, typ pageTraceEventType) pageTraceBuf {
+	large := 0
+	np := npages
+	if npages >= 1024 {
+		large = 1
+		np = 0
+	}
+	if tb.len+1+large > len(tb.buf.events) {
+		tb = tb.flush(pid, now)
+	}
+	if base%pageSize != 0 {
+		throw("base address not page aligned")
+	}
+	e := uint64(base)
+	// The pageShift low-order bits are zero.
+	e |= uint64(typ)        // 2 bits
+	e |= uint64(large) << 2 // 1 bit
+	e |= uint64(np) << 3    // 10 bits
+	// Write the timestamp delta in the upper pageTraceTimeDeltaBits.
+	e |= uint64((now-tb.timeBase)>>pageTraceTimeLostBits) << (64 - pageTraceTimeDeltaBits)
+	tb.buf.events[tb.len] = e
+	if large != 0 {
+		// npages doesn't fit in 10 bits, so write an additional word with that data.
+		tb.buf.events[tb.len+1] = uint64(npages)
+	}
+	tb.len += 1 + large
+	return tb
+}
+
+// flush writes out the contents of the buffer to pageTrace.fd and resets the buffer.
+// It then writes out a P ID event and the first sync event for the new buffer.
+//
+// Must run on the system stack as a crude way to prevent preemption.
+//
+//go:systemstack
+func (tb pageTraceBuf) flush(pid int32, now int64) pageTraceBuf {
+	if !tb.finished {
+		lock(&pageTrace.fdLock)
+		writeFull(uintptr(pageTrace.fd), (*byte)(unsafe.Pointer(&tb.buf.events[0])), tb.len*8)
+		unlock(&pageTrace.fdLock)
+	}
+	tb.len = 0
+	tb.timeBase = now
+	return tb.writePid(pid).writeSync(pid)
+}
+
+var pageTrace struct {
+	// enabled indicates whether tracing is enabled. If true, fd >= 0.
+	//
+	// Safe to read without synchronization because it's only set once
+	// at program initialization.
+	enabled bool
+
+	// buf is the page trace buffer used if there is no P.
+	//
+	// lock protects buf.
+	lock mutex
+	buf  pageTraceBuf
+
+	// fdLock protects writing to fd.
+	//
+	// fd is the file to write the page trace to.
+	fdLock mutex
+	fd     int32
+}
+
+// initPageTrace initializes the page tracing infrastructure from GODEBUG.
+//
+// env must be the value of the GODEBUG environment variable.
+func initPageTrace(env string) {
+	var value string
+	for env != "" {
+		elt, rest := env, ""
+		for i := 0; i < len(env); i++ {
+			if env[i] == ',' {
+				elt, rest = env[:i], env[i+1:]
+				break
+			}
+		}
+		env = rest
+		if hasPrefix(elt, "pagetrace=") {
+			value = elt[len("pagetrace="):]
+			break
+		}
+	}
+	pageTrace.fd = -1
+	if canCreateFile && value != "" {
+		var tmp [4096]byte
+		if len(value) != 0 && len(value) < 4096 {
+			copy(tmp[:], value)
+			pageTrace.fd = create(&tmp[0], 0o664)
+		}
+	}
+	pageTrace.enabled = pageTrace.fd >= 0
+}
+
+// finishPageTrace flushes all P's trace buffers and disables page tracing.
+func finishPageTrace() {
+	if !pageTrace.enabled {
+		return
+	}
+	// Grab worldsema as we're about to execute a ragged barrier.
+	semacquire(&worldsema)
+	systemstack(func() {
+		// Disable tracing. This isn't strictly necessary and it's best-effort.
+		pageTrace.enabled = false
+
+		// Execute a ragged barrier, flushing each trace buffer.
+		forEachP(func(pp *p) {
+			if pp.pageTraceBuf.buf != nil {
+				pp.pageTraceBuf = pp.pageTraceBuf.flush(pp.id, nanotime())
+			}
+			pp.pageTraceBuf.finished = true
+		})
+
+		// Write the global have-no-P buffer.
+		lock(&pageTrace.lock)
+		if pageTrace.buf.buf != nil {
+			pageTrace.buf = pageTrace.buf.flush(-1, nanotime())
+		}
+		pageTrace.buf.finished = true
+		unlock(&pageTrace.lock)
+
+		// Safely close the file as nothing else should be allowed to write to the fd.
+		lock(&pageTrace.fdLock)
+		closefd(pageTrace.fd)
+		pageTrace.fd = -1
+		unlock(&pageTrace.fdLock)
+	})
+	semrelease(&worldsema)
+}
+
+// writeFull ensures that a complete write of bn bytes from b is made to fd.
+func writeFull(fd uintptr, b *byte, bn int) {
+	for bn > 0 {
+		n := write(fd, unsafe.Pointer(b), int32(bn))
+		if n == -_EINTR || n == -_EAGAIN {
+			continue
+		}
+		if n < 0 {
+			print("errno=", -n, "\n")
+			throw("writeFull: bad write")
+		}
+		bn -= int(n)
+		b = addb(b, uintptr(n))
+	}
+}
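
The event layout documented at the top of pagetrace_on.go packs the event type, large flag, npages, base address, and timestamp delta into one 64-bit word. A small decoder sketch for that layout; it assumes the runtime's usual 8 KiB page size (pageShift = 13), mirrors the constants from the file above, and the sample event in main is hand-built for illustration.

```go
package main

import "fmt"

const (
	pageShift     = 13 // assumed runtime page size of 8192 bytes
	timeLostBits  = 7  // mirrors pageTraceTimeLostBits above
	timeDeltaBits = 16 // mirrors pageTraceTimeDeltaBits above
)

type event struct {
	typ    uint64 // 2 bits: sync/alloc/free/scav
	large  bool   // set if npages lives in a second 8-byte word
	npages uint64 // 10 bits (0 when large is set)
	base   uint64 // 35 bits of page-aligned address
	delta  uint64 // coarse nanoseconds since the last sync event
}

func decode(e uint64) event {
	return event{
		typ:    e & 0b11,
		large:  (e>>2)&1 == 1,
		npages: (e >> 3) & 0x3FF,
		base:   e & (((1 << 35) - 1) << pageShift),
		delta:  (e >> (64 - timeDeltaBits)) << timeLostBits,
	}
}

func main() {
	// Hand-built "alloc 4 pages at 0xc000400000, delta 0" event (illustrative).
	e := uint64(0xc000400000) | 1 | 4<<3
	fmt.Printf("%+v\n", decode(e))
}
```

Per the build tag and GODEBUG parsing above, producing a real trace requires building with GOEXPERIMENT=pagetrace and running with GODEBUG=pagetrace=<file>.
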
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 121f202..26618db 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -197,9 +197,9 @@
 	panic(boundsError{x: int64(x), signed: false, y: y, code: boundsSlice3C})
 }
 
-// failures in the conversion (*[x]T)s, 0 <= x <= y, x == cap(s)
+// failures in the conversion ([x]T)(s) or (*[x]T)(s), 0 <= x <= y, y == len(s)
 func goPanicSliceConvert(x int, y int) {
-	panicCheck1(getcallerpc(), "slice length too short to convert to pointer to array")
+	panicCheck1(getcallerpc(), "slice length too short to convert to array or pointer to array")
 	panic(boundsError{x: int64(x), signed: true, y: y, code: boundsConvert})
 }
 
@@ -457,7 +457,7 @@
 			return
 		}
 		if d.openDefer {
-			done := runOpenDeferFrame(gp, d)
+			done := runOpenDeferFrame(d)
 			if !done {
 				throw("unfinished open-coded defers in deferreturn")
 			}
@@ -519,7 +519,7 @@
 		d.started = true
 		d._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
 		if d.openDefer {
-			done := runOpenDeferFrame(gp, d)
+			done := runOpenDeferFrame(d)
 			if !done {
 				// We should always run all defers in the frame,
 				// since there is no panic associated with this
@@ -744,7 +744,7 @@
 // d. It normally processes all active defers in the frame, but stops immediately
 // if a defer does a successful recover. It returns true if there are no
 // remaining defers to run in the frame.
-func runOpenDeferFrame(gp *g, d *_defer) bool {
+func runOpenDeferFrame(d *_defer) bool {
 	done := true
 	fd := d.fd
 
@@ -837,7 +837,7 @@
 	p.link = gp._panic
 	gp._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
 
-	atomic.Xadd(&runningPanicDefers, 1)
+	runningPanicDefers.Add(1)
 
 	// By calculating getcallerpc/getcallersp here, we avoid scanning the
 	// gopanic frame (stack scanning is slow...)
@@ -881,7 +881,7 @@
 
 		done := true
 		if d.openDefer {
-			done = runOpenDeferFrame(gp, d)
+			done = runOpenDeferFrame(d)
 			if done && !d._panic.recovered {
 				addOneOpenDeferFrame(gp, 0, nil)
 			}
@@ -917,7 +917,7 @@
 				mcall(recovery)
 				throw("bypassed recovery failed") // mcall should not return
 			}
-			atomic.Xadd(&runningPanicDefers, -1)
+			runningPanicDefers.Add(-1)
 
 			// After a recover, remove any remaining non-started,
 			// open-coded defer entries, since the corresponding defers
@@ -1067,13 +1067,11 @@
 }
 
 // runningPanicDefers is non-zero while running deferred functions for panic.
-// runningPanicDefers is incremented and decremented atomically.
 // This is used to try hard to get a panic stack trace out when exiting.
-var runningPanicDefers uint32
+var runningPanicDefers atomic.Uint32
 
 // panicking is non-zero when crashing the program for an unrecovered panic.
-// panicking is incremented and decremented atomically.
-var panicking uint32
+var panicking atomic.Uint32
 
 // paniclk is held while printing the panic information and stack trace,
 // so that two concurrent panics don't overlap their output.
@@ -1155,7 +1153,7 @@
 			// startpanic_m set panicking, which will
 			// block main from exiting, so now OK to
 			// decrement runningPanicDefers.
-			atomic.Xadd(&runningPanicDefers, -1)
+			runningPanicDefers.Add(-1)
 
 			printpanics(msgs)
 		}
@@ -1190,7 +1188,7 @@
 //
 //go:nowritebarrierrec
 func startpanic_m() bool {
-	_g_ := getg()
+	gp := getg()
 	if mheap_.cachealloc.size == 0 { // very early
 		print("runtime: panic before malloc heap initialized\n")
 	}
@@ -1198,19 +1196,19 @@
 	// could happen in a signal handler, or in a throw, or inside
 	// malloc itself. We want to catch if an allocation ever does
 	// happen (even if we're not in one of these situations).
-	_g_.m.mallocing++
+	gp.m.mallocing++
 
 	// If we're dying because of a bad lock count, set it to a
 	// good lock count so we don't recursively panic below.
-	if _g_.m.locks < 0 {
-		_g_.m.locks = 1
+	if gp.m.locks < 0 {
+		gp.m.locks = 1
 	}
 
-	switch _g_.m.dying {
+	switch gp.m.dying {
 	case 0:
 		// Setting dying >0 has the side-effect of disabling this G's writebuf.
-		_g_.m.dying = 1
-		atomic.Xadd(&panicking, 1)
+		gp.m.dying = 1
+		panicking.Add(1)
 		lock(&paniclk)
 		if debug.schedtrace > 0 || debug.scheddetail > 0 {
 			schedtrace(true)
@@ -1220,13 +1218,13 @@
 	case 1:
 		// Something failed while panicking.
 		// Just print a stack trace and exit.
-		_g_.m.dying = 2
+		gp.m.dying = 2
 		print("panic during panic\n")
 		return false
 	case 2:
 		// This is a genuine bug in the runtime, we couldn't even
 		// print the stack trace successfully.
-		_g_.m.dying = 3
+		gp.m.dying = 3
 		print("stack trace unavailable\n")
 		exit(4)
 		fallthrough
@@ -1240,6 +1238,8 @@
 var didothers bool
 var deadlock mutex
 
+// gp is the crashing g running on this M, but may be a user G, while getg() is
+// always g0.
 func dopanic_m(gp *g, pc, sp uintptr) bool {
 	if gp.sig != 0 {
 		signame := signame(gp.sig)
@@ -1252,7 +1252,6 @@
 	}
 
 	level, all, docrash := gotraceback()
-	_g_ := getg()
 	if level > 0 {
 		if gp != gp.m.curg {
 			all = true
@@ -1261,7 +1260,7 @@
 			print("\n")
 			goroutineheader(gp)
 			traceback(pc, sp, 0, gp)
-		} else if level >= 2 || _g_.m.throwing >= throwTypeRuntime {
+		} else if level >= 2 || gp.m.throwing >= throwTypeRuntime {
 			print("\nruntime stack:\n")
 			traceback(pc, sp, 0, gp)
 		}
@@ -1272,7 +1271,7 @@
 	}
 	unlock(&paniclk)
 
-	if atomic.Xadd(&panicking, -1) != 0 {
+	if panicking.Add(-1) != 0 {
 		// Some other m is panicking too.
 		// Let it print what it needs to print.
 		// Wait forever without chewing up cpu.
@@ -1290,29 +1289,32 @@
 // panicking.
 //
 //go:nosplit
-func canpanic(gp *g) bool {
-	// Note that g is m->gsignal, different from gp.
-	// Note also that g->m can change at preemption, so m can go stale
-	// if this function ever makes a function call.
-	_g_ := getg()
-	mp := _g_.m
+func canpanic() bool {
+	gp := getg()
+	mp := acquirem()
 
 	// Is it okay for gp to panic instead of crashing the program?
 	// Yes, as long as it is running Go code, not runtime code,
 	// and not stuck in a system call.
-	if gp == nil || gp != mp.curg {
+	if gp != mp.curg {
+		releasem(mp)
 		return false
 	}
-	if mp.locks != 0 || mp.mallocing != 0 || mp.throwing != throwTypeNone || mp.preemptoff != "" || mp.dying != 0 {
+	// N.B. mp.locks != 1 instead of 0 to account for acquirem.
+	if mp.locks != 1 || mp.mallocing != 0 || mp.throwing != throwTypeNone || mp.preemptoff != "" || mp.dying != 0 {
+		releasem(mp)
 		return false
 	}
 	status := readgstatus(gp)
 	if status&^_Gscan != _Grunning || gp.syscallsp != 0 {
+		releasem(mp)
 		return false
 	}
 	if GOOS == "windows" && mp.libcallsp != 0 {
+		releasem(mp)
 		return false
 	}
+	releasem(mp)
 	return true
 }
 
diff --git a/src/runtime/pprof/label.go b/src/runtime/pprof/label.go
index 0c58a7a..d39e0ad 100644
--- a/src/runtime/pprof/label.go
+++ b/src/runtime/pprof/label.go
@@ -57,8 +57,8 @@
 // WithLabels returns a new context.Context with the given labels added.
 // A label overwrites a prior label with the same key.
 func WithLabels(ctx context.Context, labels LabelSet) context.Context {
-	childLabels := make(labelMap)
 	parentLabels := labelValue(ctx)
+	childLabels := make(labelMap, len(parentLabels))
 	// TODO(matloob): replace the map implementation with something
 	// more efficient so creating a child context WithLabels doesn't need
 	// to clone the map.
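
The WithLabels change above sizes the child map from the parent before copying entries back in, which avoids repeated map growth while re-inserting. A standalone sketch of that presize-then-copy idiom; copyMap and the plain string map are illustrative, since labelMap itself is defined elsewhere in this package.

```go
package main

import "fmt"

func copyMap(parent map[string]string) map[string]string {
	// The size hint avoids incremental growth while re-inserting parent entries.
	child := make(map[string]string, len(parent))
	for k, v := range parent {
		child[k] = v
	}
	return child
}

func main() {
	parent := map[string]string{"region": "us", "service": "api"}
	child := copyMap(parent)
	child["handler"] = "/healthz" // new label only in the child
	fmt.Println(len(parent), len(child)) // 2 3
}
```
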
diff --git a/src/runtime/pprof/pe.go b/src/runtime/pprof/pe.go
new file mode 100644
index 0000000..4105458
--- /dev/null
+++ b/src/runtime/pprof/pe.go
@@ -0,0 +1,19 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import "os"
+
+// peBuildID returns a best effort unique ID for the named executable.
+//
+// It would be wasteful to calculate the hash of the whole file;
+// instead, use the binary name and the last modified time for the buildid.
+func peBuildID(file string) string {
+	s, err := os.Stat(file)
+	if err != nil {
+		return file
+	}
+	return file + s.ModTime().String()
+}
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index f0b25c1..17a490e 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -74,7 +74,6 @@
 
 import (
 	"bufio"
-	"bytes"
 	"fmt"
 	"internal/abi"
 	"io"
@@ -372,8 +371,7 @@
 // as the pprof-proto format output. Translations from cycle count to time duration
 // are done because The proto expects count and time (nanoseconds) instead of count
 // and the number of cycles for block, contention profiles.
-// Possible 'scaler' functions are scaleBlockProfile and scaleMutexProfile.
-func printCountCycleProfile(w io.Writer, countName, cycleName string, scaler func(int64, float64) (int64, float64), records []runtime.BlockProfileRecord) error {
+func printCountCycleProfile(w io.Writer, countName, cycleName string, records []runtime.BlockProfileRecord) error {
 	// Output profile in protobuf form.
 	b := newProfileBuilder(w)
 	b.pbValueType(tagProfile_PeriodType, countName, "count")
@@ -386,9 +384,8 @@
 	values := []int64{0, 0}
 	var locs []uint64
 	for _, r := range records {
-		count, nanosec := scaler(r.Count, float64(r.Cycles)/cpuGHz)
-		values[0] = count
-		values[1] = int64(nanosec)
+		values[0] = r.Count
+		values[1] = int64(float64(r.Cycles) / cpuGHz)
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
 		locs = b.appendLocsForStack(locs[:0], r.Stack())
@@ -402,7 +399,7 @@
 // The profile will be in compressed proto format unless debug is nonzero.
 func printCountProfile(w io.Writer, debug int, name string, p countProfile) error {
 	// Build count of each stack.
-	var buf bytes.Buffer
+	var buf strings.Builder
 	key := func(stk []uintptr, lbls *labelMap) string {
 		buf.Reset()
 		fmt.Fprintf(&buf, "@")
@@ -593,10 +590,24 @@
 	// Technically the rate is MemProfileRate not 2*MemProfileRate,
 	// but early versions of the C++ heap profiler reported 2*MemProfileRate,
 	// so that's what pprof has come to expect.
+	rate := 2 * runtime.MemProfileRate
+
+	// pprof reads a profile with alloc == inuse as being a "2-column" profile
+	// (objects and bytes, not distinguishing alloc from inuse),
+	// but then such a profile can't be merged using pprof *.prof with
+	// other 4-column profiles where alloc != inuse.
+	// The easiest way to avoid this bug is to adjust allocBytes so it's never == inuseBytes.
+	// pprof doesn't use these header values anymore except for checking equality.
+	inUseBytes := total.InUseBytes()
+	allocBytes := total.AllocBytes
+	if inUseBytes == allocBytes {
+		allocBytes++
+	}
+
 	fmt.Fprintf(w, "heap profile: %d: %d [%d: %d] @ heap/%d\n",
-		total.InUseObjects(), total.InUseBytes(),
-		total.AllocObjects, total.AllocBytes,
-		2*runtime.MemProfileRate)
+		total.InUseObjects(), inUseBytes,
+		total.AllocObjects, allocBytes,
+		rate)
 
 	for i := range p {
 		r := &p[i]
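
The nudge above is easy to see in isolation; a minimal sketch with made-up totals (only the equality check and the increment mirror the change):

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        // Hypothetical totals where nothing has been freed yet, so the
        // in-use and allocated byte counts coincide.
        inUseBytes := int64(1 << 20)
        allocBytes := int64(1 << 20)

        if inUseBytes == allocBytes {
            // Nudge allocBytes so pprof reads the header as a 4-column
            // (alloc vs. inuse) profile instead of a 2-column one.
            allocBytes++
        }

        rate := 2 * runtime.MemProfileRate
        fmt.Printf("heap profile: 1: %d [1: %d] @ heap/%d\n", inUseBytes, allocBytes, rate)
    }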
@@ -842,24 +853,16 @@
 
 // writeBlock writes the current blocking profile to w.
 func writeBlock(w io.Writer, debug int) error {
-	return writeProfileInternal(w, debug, "contention", runtime.BlockProfile, scaleBlockProfile)
-}
-
-func scaleBlockProfile(cnt int64, ns float64) (int64, float64) {
-	// Do nothing.
-	// The current way of block profile sampling makes it
-	// hard to compute the unsampled number. The legacy block
-	// profile parse doesn't attempt to scale or unsample.
-	return cnt, ns
+	return writeProfileInternal(w, debug, "contention", runtime.BlockProfile)
 }
 
 // writeMutex writes the current mutex profile to w.
 func writeMutex(w io.Writer, debug int) error {
-	return writeProfileInternal(w, debug, "mutex", runtime.MutexProfile, scaleMutexProfile)
+	return writeProfileInternal(w, debug, "mutex", runtime.MutexProfile)
 }
 
-// writeProfileInternal writes the current blocking or mutex profile depending on the passed parameters
-func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile func([]runtime.BlockProfileRecord) (int, bool), scaleProfile func(int64, float64) (int64, float64)) error {
+// writeProfileInternal writes the current blocking or mutex profile depending on the passed parameters.
+func writeProfileInternal(w io.Writer, debug int, name string, runtimeProfile func([]runtime.BlockProfileRecord) (int, bool)) error {
 	var p []runtime.BlockProfileRecord
 	n, ok := runtimeProfile(nil)
 	for {
@@ -874,7 +877,7 @@
 	sort.Slice(p, func(i, j int) bool { return p[i].Cycles > p[j].Cycles })
 
 	if debug <= 0 {
-		return printCountCycleProfile(w, "contentions", "delay", scaleProfile, p)
+		return printCountCycleProfile(w, "contentions", "delay", p)
 	}
 
 	b := bufio.NewWriter(w)
@@ -904,9 +907,4 @@
 	return b.Flush()
 }
 
-func scaleMutexProfile(cnt int64, ns float64) (int64, float64) {
-	period := runtime.SetMutexProfileFraction(-1)
-	return cnt * int64(period), ns * float64(period)
-}
-
 func runtime_cyclesPerSecond() int64
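
With the scaler hooks gone, printCountCycleProfile emits the raw count and converts cycles to nanoseconds directly. A pure-Go sketch of that conversion with an assumed 3 GHz clock (the runtime derives the real rate from runtime_cyclesPerSecond):

    package main

    import "fmt"

    func main() {
        const cpuGHz = 3.0 // assumed clock rate; not the runtime's measured value

        cycles := int64(1_500_000_000) // hypothetical contention delay in CPU cycles
        count := int64(42)             // hypothetical contention count

        values := []int64{count, int64(float64(cycles) / cpuGHz)}
        fmt.Println(values) // [42 500000000]: 42 contentions, 0.5s of delay
    }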
diff --git a/src/runtime/pprof/pprof_norusage.go b/src/runtime/pprof/pprof_norusage.go
index 3d60525..8de3808 100644
--- a/src/runtime/pprof/pprof_norusage.go
+++ b/src/runtime/pprof/pprof_norusage.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !aix && !darwin && !dragonfly && !freebsd && !linux && !netbsd && !openbsd && !solaris
+//go:build !aix && !darwin && !dragonfly && !freebsd && !linux && !netbsd && !openbsd && !solaris && !windows
 
 package pprof
 
diff --git a/src/runtime/pprof/pprof_rusage.go b/src/runtime/pprof/pprof_rusage.go
index 984a32e..aa429fb 100644
--- a/src/runtime/pprof/pprof_rusage.go
+++ b/src/runtime/pprof/pprof_rusage.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
+//go:build unix
 
 package pprof
 
@@ -28,6 +28,8 @@
 	}
 
 	var rusage syscall.Rusage
-	syscall.Getrusage(syscall.RUSAGE_SELF, &rusage)
-	fmt.Fprintf(w, "# MaxRSS = %d\n", uintptr(rusage.Maxrss)*rssToBytes)
+	err := syscall.Getrusage(syscall.RUSAGE_SELF, &rusage)
+	if err == nil {
+		fmt.Fprintf(w, "# MaxRSS = %d\n", uintptr(rusage.Maxrss)*rssToBytes)
+	}
 }
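
A minimal sketch of the same rusage lookup outside the runtime; note that Maxrss units differ by platform (kilobytes on Linux, bytes on macOS), which is what the rssToBytes constant accounts for:

    //go:build unix

    package main

    import (
        "fmt"
        "syscall"
    )

    func main() {
        var ru syscall.Rusage
        if err := syscall.Getrusage(syscall.RUSAGE_SELF, &ru); err == nil {
            // Maxrss is platform-dependent: kilobytes on Linux, bytes on macOS.
            fmt.Printf("# MaxRSS = %d\n", ru.Maxrss)
        }
    }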
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index aabc180..53688ad 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -12,6 +12,7 @@
 	"fmt"
 	"internal/abi"
 	"internal/profile"
+	"internal/syscall/unix"
 	"internal/testenv"
 	"io"
 	"math"
@@ -116,11 +117,8 @@
 
 	// Linux [5.9,5.16) has a kernel bug that can break CPU timers on newly
 	// created threads, breaking our CPU accounting.
-	major, minor, patch, err := linuxKernelVersion()
-	if err != nil {
-		t.Errorf("Error determining kernel version: %v", err)
-	}
-	t.Logf("Running on Linux %d.%d.%d", major, minor, patch)
+	major, minor := unix.KernelVersion()
+	t.Logf("Running on Linux %d.%d", major, minor)
 	defer func() {
 		if t.Failed() {
 			t.Logf("Failure of this test may indicate that your system suffers from a known Linux kernel bug fixed on newer kernels. See https://golang.org/issue/49065.")
@@ -530,7 +528,7 @@
 	ok = true
 
 	var samples uintptr
-	var buf bytes.Buffer
+	var buf strings.Builder
 	p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
 		fmt.Fprintf(&buf, "%d:", count)
 		fprintStack(&buf, stk)
@@ -609,7 +607,7 @@
 		var total uintptr
 		for i, name := range need {
 			total += have[i]
-			t.Logf("%s: %d\n", name, have[i])
+			t.Logf("found %d samples in expected function %s\n", have[i], name)
 		}
 		if total == 0 {
 			t.Logf("no samples in expected functions")
@@ -720,7 +718,7 @@
 			// The place we'd see it would be the inner most frame.
 			name := stk[0].Line[0].Function.Name
 			if name == "gogo" {
-				var buf bytes.Buffer
+				var buf strings.Builder
 				fprintStack(&buf, stk)
 				t.Fatalf("found profile entry for gogo:\n%s", buf.String())
 			}
@@ -729,6 +727,9 @@
 }
 
 func fprintStack(w io.Writer, stk []*profile.Location) {
+	if len(stk) == 0 {
+		fmt.Fprintf(w, " (stack empty)")
+	}
 	for _, loc := range stk {
 		fmt.Fprintf(w, " %#x", loc.Address)
 		fmt.Fprintf(w, " (")
@@ -924,7 +925,7 @@
 	}
 
 	t.Run("debug=1", func(t *testing.T) {
-		var w bytes.Buffer
+		var w strings.Builder
 		Lookup("block").WriteTo(&w, 1)
 		prof := w.String()
 
@@ -1091,7 +1092,7 @@
 	var mu sync.Mutex
 	mu.Lock()
 	go func() {
-		awaitBlockedGoroutine(t, "semacquire", "blockMutex")
+		awaitBlockedGoroutine(t, "sync.Mutex.Lock", "blockMutex")
 		mu.Unlock()
 	}()
 	// Note: Unlock releases mu before recording the mutex event,
@@ -1196,7 +1197,7 @@
 	blockMutex(t)
 
 	t.Run("debug=1", func(t *testing.T) {
-		var w bytes.Buffer
+		var w strings.Builder
 		Lookup("mutex").WriteTo(&w, 1)
 		prof := w.String()
 		t.Logf("received profile: %v", prof)
@@ -1248,6 +1249,50 @@
 	})
 }
 
+func TestMutexProfileRateAdjust(t *testing.T) {
+	old := runtime.SetMutexProfileFraction(1)
+	defer runtime.SetMutexProfileFraction(old)
+	if old != 0 {
+		t.Fatalf("need MutexProfileRate 0, got %d", old)
+	}
+
+	readProfile := func() (contentions int64, delay int64) {
+		var w bytes.Buffer
+		Lookup("mutex").WriteTo(&w, 0)
+		p, err := profile.Parse(&w)
+		if err != nil {
+			t.Fatalf("failed to parse profile: %v", err)
+		}
+		t.Logf("parsed proto: %s", p)
+		if err := p.CheckValid(); err != nil {
+			t.Fatalf("invalid profile: %v", err)
+		}
+
+		for _, s := range p.Sample {
+			for _, l := range s.Location {
+				for _, line := range l.Line {
+					if line.Function.Name == "runtime/pprof.blockMutex.func1" {
+						contentions += s.Value[0]
+						delay += s.Value[1]
+					}
+				}
+			}
+		}
+		return
+	}
+
+	blockMutex(t)
+	contentions, delay := readProfile()
+	if contentions == 0 || delay == 0 {
+		t.Fatal("did not see expected function in profile")
+	}
+	runtime.SetMutexProfileFraction(0)
+	newContentions, newDelay := readProfile()
+	if newContentions != contentions || newDelay != delay {
+		t.Fatalf("sample value changed: got [%d, %d], want [%d, %d]", newContentions, newDelay, contentions, delay)
+	}
+}
+
 func func1(c chan int) { <-c }
 func func2(c chan int) { <-c }
 func func3(c chan int) { <-c }
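
TestMutexProfileRateAdjust above checks that lowering the fraction after the fact does not rescale samples that were already recorded, since scaling now happens when the runtime records the event. A hedged sketch of how an application typically drives the same public API (the output path and sampling rate are illustrative):

    package main

    import (
        "os"
        "runtime"
        "runtime/pprof"
        "sync"
    )

    func main() {
        // Record roughly one in five contention events (illustrative rate).
        runtime.SetMutexProfileFraction(5)
        defer runtime.SetMutexProfileFraction(0)

        var mu sync.Mutex
        var wg sync.WaitGroup
        for i := 0; i < 8; i++ {
            wg.Add(1)
            go func() {
                defer wg.Done()
                mu.Lock()
                mu.Unlock()
            }()
        }
        wg.Wait()

        f, err := os.Create("mutex.prof") // hypothetical output path
        if err != nil {
            panic(err)
        }
        defer f.Close()
        pprof.Lookup("mutex").WriteTo(f, 0)
    }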
@@ -1319,13 +1364,13 @@
 		t.Errorf("protobuf profile is invalid: %v", err)
 	}
 	expectedLabels := map[int64]map[string]string{
-		50: map[string]string{},
-		44: map[string]string{"label": "value"},
-		40: map[string]string{},
-		36: map[string]string{"label": "value"},
-		10: map[string]string{},
-		9:  map[string]string{"label": "value"},
-		1:  map[string]string{},
+		50: {},
+		44: {"label": "value"},
+		40: {},
+		36: {"label": "value"},
+		10: {},
+		9:  {"label": "value"},
+		1:  {},
 	}
 	if !containsCountsLabels(p, expectedLabels) {
 		t.Errorf("expected count profile to contain goroutines with counts and labels %v, got %v",
@@ -1419,7 +1464,7 @@
 				go func() {
 					defer wg.Done()
 					for ctx.Err() == nil {
-						var w bytes.Buffer
+						var w strings.Builder
 						goroutineProf.WriteTo(&w, 1)
 						prof := w.String()
 						count := profilerCalls(prof)
@@ -1437,7 +1482,7 @@
 	// The finalizer goroutine should not show up in most profiles, since it's
 	// marked as a system goroutine when idle.
 	t.Run("finalizer not present", func(t *testing.T) {
-		var w bytes.Buffer
+		var w strings.Builder
 		goroutineProf.WriteTo(&w, 1)
 		prof := w.String()
 		if includesFinalizer(prof) {
@@ -1465,7 +1510,7 @@
 				runtime.GC()
 			}
 		}
-		var w bytes.Buffer
+		var w strings.Builder
 		goroutineProf.WriteTo(&w, 1)
 		prof := w.String()
 		if !includesFinalizer(prof) {
@@ -1679,7 +1724,7 @@
 	emptyCallStackTestRun++
 
 	t.Parallel()
-	var buf bytes.Buffer
+	var buf strings.Builder
 	p := NewProfile(name)
 
 	p.Add("foo", 47674)
@@ -1759,7 +1804,7 @@
 		go func() {
 			goroutineProf := Lookup("goroutine")
 			for ctx.Err() == nil {
-				var w bytes.Buffer
+				var w strings.Builder
 				goroutineProf.WriteTo(&w, 1)
 				prof := w.String()
 				if strings.Contains(prof, "loop-i") {
@@ -1825,14 +1870,14 @@
 		isLabeled := s.Label != nil && contains(s.Label["key"], "value")
 		var (
 			mayBeLabeled     bool
-			mustBeLabeled    bool
-			mustNotBeLabeled bool
+			mustBeLabeled    string
+			mustNotBeLabeled string
 		)
 		for _, loc := range s.Location {
 			for _, l := range loc.Line {
 				switch l.Function.Name {
 				case "runtime/pprof.labelHog", "runtime/pprof.parallelLabelHog", "runtime/pprof.parallelLabelHog.func1":
-					mustBeLabeled = true
+					mustBeLabeled = l.Function.Name
 				case "runtime/pprof.Do":
 					// Do sets the labels, so samples may
 					// or may not be labeled depending on
@@ -1844,7 +1889,7 @@
 					// (such as those identified by
 					// runtime.isSystemGoroutine). These
 					// should never be labeled.
-					mustNotBeLabeled = true
+					mustNotBeLabeled = l.Function.Name
 				case "gogo", "gosave_systemstack_switch", "racecall":
 					// These are context switch/race
 					// critical that we can't do a full
@@ -1866,25 +1911,28 @@
 				}
 			}
 		}
-		if mustNotBeLabeled {
-			// If this must not be labeled, then mayBeLabeled hints
-			// are not relevant.
+		errorStack := func(f string, args ...any) {
+			var buf strings.Builder
+			fprintStack(&buf, s.Location)
+			t.Errorf("%s: %s", fmt.Sprintf(f, args...), buf.String())
+		}
+		if mustBeLabeled != "" && mustNotBeLabeled != "" {
+			errorStack("sample contains both %s, which must be labeled, and %s, which must not be labeled", mustBeLabeled, mustNotBeLabeled)
+			continue
+		}
+		if mustBeLabeled != "" || mustNotBeLabeled != "" {
+			// We found a definitive frame, so mayBeLabeled hints are not relevant.
 			mayBeLabeled = false
 		}
-		if mustBeLabeled && !isLabeled {
-			var buf bytes.Buffer
-			fprintStack(&buf, s.Location)
-			t.Errorf("Sample labeled got false want true: %s", buf.String())
+		if mayBeLabeled {
+			// This sample may or may not be labeled, so there's nothing we can check.
+			continue
 		}
-		if mustNotBeLabeled && isLabeled {
-			var buf bytes.Buffer
-			fprintStack(&buf, s.Location)
-			t.Errorf("Sample labeled got true want false: %s", buf.String())
+		if mustBeLabeled != "" && !isLabeled {
+			errorStack("sample must be labeled because of %s, but is not", mustBeLabeled)
 		}
-		if isLabeled && !(mayBeLabeled || mustBeLabeled) {
-			var buf bytes.Buffer
-			fprintStack(&buf, s.Location)
-			t.Errorf("Sample labeled got true want false: %s", buf.String())
+		if mustNotBeLabeled != "" && isLabeled {
+			errorStack("sample must not be labeled because of %s, but is", mustNotBeLabeled)
 		}
 	}
 }
diff --git a/src/runtime/pprof/pprof_windows.go b/src/runtime/pprof/pprof_windows.go
new file mode 100644
index 0000000..23ef2f8
--- /dev/null
+++ b/src/runtime/pprof/pprof_windows.go
@@ -0,0 +1,22 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+	"fmt"
+	"internal/syscall/windows"
+	"io"
+	"syscall"
+	"unsafe"
+)
+
+func addMaxRSS(w io.Writer) {
+	var m windows.PROCESS_MEMORY_COUNTERS
+	p, _ := syscall.GetCurrentProcess()
+	err := windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m)))
+	if err == nil {
+		fmt.Fprintf(w, "# MaxRSS = %d\n", m.PeakWorkingSetSize)
+	}
+}
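
internal/syscall/windows is not importable outside the standard library; to the best of my knowledge the equivalent wrappers live in golang.org/x/sys/windows, so a user-level sketch would look roughly like this (treat the exact x/sys names as an assumption):

    //go:build windows

    package main

    import (
        "fmt"
        "unsafe"

        "golang.org/x/sys/windows" // assumed to provide the same psapi wrappers
    )

    func main() {
        var m windows.PROCESS_MEMORY_COUNTERS
        p, err := windows.GetCurrentProcess()
        if err != nil {
            return
        }
        if err := windows.GetProcessMemoryInfo(p, &m, uint32(unsafe.Sizeof(m))); err == nil {
            fmt.Printf("# MaxRSS = %d\n", m.PeakWorkingSetSize)
        }
    }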
diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go
index 085027c..b68f30d 100644
--- a/src/runtime/pprof/proto.go
+++ b/src/runtime/pprof/proto.go
@@ -10,7 +10,6 @@
 	"fmt"
 	"internal/abi"
 	"io"
-	"os"
 	"runtime"
 	"strconv"
 	"strings"
@@ -46,10 +45,11 @@
 
 type memMap struct {
 	// initialized as reading mapping
-	start         uintptr
-	end           uintptr
-	offset        uint64
-	file, buildID string
+	start   uintptr // Address at which the binary (or DLL) is loaded into memory.
+	end     uintptr // The limit of the address range occupied by this mapping.
+	offset  uint64  // Offset in the binary that corresponds to the first mapped address.
+	file    string  // The object this entry is loaded from.
+	buildID string  // A string that uniquely identifies a particular program version with high probability.
 
 	funcs symbolizeFlag
 	fake  bool // map entry was faked; /proc/self/maps wasn't available
@@ -230,7 +230,7 @@
 		frame.PC = addr - 1
 	}
 	ret := []runtime.Frame{frame}
-	for frame.Function != "runtime.goexit" && more == true {
+	for frame.Function != "runtime.goexit" && more {
 		frame, more = frames.Next()
 		ret = append(ret, frame)
 	}
@@ -395,6 +395,10 @@
 // location ID slice, locs. The addresses in the stack are return PCs or 1 + the PC of
 // an inline marker as the runtime traceback function returns.
 //
+// It may return an empty slice even if stk is non-empty, for example if stk consists
+// solely of runtime.goexit. We still count these empty stacks in profiles in order to
+// get the right cumulative sample count.
+//
 // It may emit to b.pb, so there must be no message encoding in progress.
 func (b *profileBuilder) appendLocsForStack(locs []uint64, stk []uintptr) (newLocs []uint64) {
 	b.deck.reset()
@@ -590,6 +594,7 @@
 	type newFunc struct {
 		id         uint64
 		name, file string
+		startLine  int64
 	}
 	newFuncs := make([]newFunc, 0, 8)
 
@@ -610,7 +615,12 @@
 		if funcID == 0 {
 			funcID = uint64(len(b.funcs)) + 1
 			b.funcs[frame.Function] = int(funcID)
-			newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
+			newFuncs = append(newFuncs, newFunc{
+				id:        funcID,
+				name:      frame.Function,
+				file:      frame.File,
+				startLine: int64(runtime_FrameStartLine(&frame)),
+			})
 		}
 		b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
 	}
@@ -633,6 +643,7 @@
 		b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
 		b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
 		b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
+		b.pb.int64Opt(tagFunction_StartLine, fn.startLine)
 		b.pb.endMessage(tagProfile_Function, start)
 	}
 
@@ -640,20 +651,6 @@
 	return id
 }
 
-// readMapping reads /proc/self/maps and writes mappings to b.pb.
-// It saves the address ranges of the mappings in b.mem for use
-// when emitting locations.
-func (b *profileBuilder) readMapping() {
-	data, _ := os.ReadFile("/proc/self/maps")
-	parseProcSelfMaps(data, b.addMapping)
-	if len(b.mem) == 0 { // pprof expects a map entry, so fake one.
-		b.addMappingEntry(0, 0, 0, "", "", true)
-		// TODO(hyangah): make addMapping return *memMap or
-		// take a memMap struct, and get rid of addMappingEntry
-		// that takes a bunch of positional arguments.
-	}
-}
-
 var space = []byte(" ")
 var newline = []byte("\n")
 
@@ -735,13 +732,12 @@
 			continue
 		}
 
-		// TODO: pprof's remapMappingIDs makes two adjustments:
+		// TODO: pprof's remapMappingIDs makes one adjustment:
 		// 1. If there is an /anon_hugepage mapping first and it is
 		// consecutive to a next mapping, drop the /anon_hugepage.
-		// 2. If start-offset = 0x400000, change start to 0x400000 and offset to 0.
-		// There's no indication why either of these is needed.
-		// Let's try not doing these and see what breaks.
-		// If we do need them, they would go here, before we
+		// There's no indication why this is needed.
+		// Let's try not doing this and see what breaks.
+		// If we do need it, it would go here, before we
 		// enter the mappings into b.mem in the first place.
 
 		buildID, _ := elfBuildID(file)
diff --git a/src/runtime/pprof/proto_other.go b/src/runtime/pprof/proto_other.go
new file mode 100644
index 0000000..4a7fe79
--- /dev/null
+++ b/src/runtime/pprof/proto_other.go
@@ -0,0 +1,30 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !windows
+
+package pprof
+
+import (
+	"errors"
+	"os"
+)
+
+// readMapping reads /proc/self/maps and writes mappings to b.pb.
+// It saves the address ranges of the mappings in b.mem for use
+// when emitting locations.
+func (b *profileBuilder) readMapping() {
+	data, _ := os.ReadFile("/proc/self/maps")
+	parseProcSelfMaps(data, b.addMapping)
+	if len(b.mem) == 0 { // pprof expects a map entry, so fake one.
+		b.addMappingEntry(0, 0, 0, "", "", true)
+		// TODO(hyangah): make addMapping return *memMap or
+		// take a memMap struct, and get rid of addMappingEntry
+		// that takes a bunch of positional arguments.
+	}
+}
+
+func readMainModuleMapping() (start, end uint64, err error) {
+	return 0, 0, errors.New("not implemented")
+}
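
parseProcSelfMaps (unchanged here) consumes lines of /proc/self/maps; a simplified, standalone parser for a single hypothetical line shows which fields the profiler cares about:

    package main

    import (
        "fmt"
        "strconv"
        "strings"
    )

    func main() {
        // One shortened, hypothetical line of /proc/self/maps:
        // address range, permissions, file offset, device, inode, pathname.
        line := "00400000-00452000 r-xp 00000000 08:02 173521 /usr/bin/example"

        f := strings.Fields(line)
        addrs := strings.SplitN(f[0], "-", 2)
        lo, _ := strconv.ParseUint(addrs[0], 16, 64)
        hi, _ := strconv.ParseUint(addrs[1], 16, 64)
        offset, _ := strconv.ParseUint(f[2], 16, 64)
        perms, file := f[1], f[5]

        // The profiler only keeps executable mappings backed by a file.
        if strings.Contains(perms, "x") && file != "" {
            fmt.Printf("%08x %08x %08x %s\n", lo, hi, offset, file)
        }
    }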
diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go
index 84a051a..780b481 100644
--- a/src/runtime/pprof/proto_test.go
+++ b/src/runtime/pprof/proto_test.go
@@ -101,6 +101,36 @@
 		addr2 = mprof.Mapping[1].Start
 		map2 = mprof.Mapping[1]
 		map2.BuildID, _ = elfBuildID(map2.File)
+	case "windows":
+		addr1 = uint64(abi.FuncPCABIInternal(f1))
+		addr2 = uint64(abi.FuncPCABIInternal(f2))
+
+		exe, err := os.Executable()
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		start, end, err := readMainModuleMapping()
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		map1 = &profile.Mapping{
+			ID:           1,
+			Start:        start,
+			Limit:        end,
+			File:         exe,
+			BuildID:      peBuildID(exe),
+			HasFunctions: true,
+		}
+		map2 = &profile.Mapping{
+			ID:           1,
+			Start:        start,
+			Limit:        end,
+			File:         exe,
+			BuildID:      peBuildID(exe),
+			HasFunctions: true,
+		}
 	case "js":
 		addr1 = uint64(abi.FuncPCABIInternal(f1))
 		addr2 = uint64(abi.FuncPCABIInternal(f2))
@@ -285,7 +315,7 @@
 			if len(out) > 0 && out[len(out)-1] != '\n' {
 				out += "\n"
 			}
-			var buf bytes.Buffer
+			var buf strings.Builder
 			parseProcSelfMaps([]byte(in), func(lo, hi, offset uint64, file, buildID string) {
 				fmt.Fprintf(&buf, "%08x %08x %08x %s\n", lo, hi, offset, file)
 			})
diff --git a/src/runtime/pprof/proto_windows.go b/src/runtime/pprof/proto_windows.go
new file mode 100644
index 0000000..d5ae4a5
--- /dev/null
+++ b/src/runtime/pprof/proto_windows.go
@@ -0,0 +1,73 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+	"errors"
+	"internal/syscall/windows"
+	"syscall"
+)
+
+// readMapping adds memory mapping information to the profile.
+func (b *profileBuilder) readMapping() {
+	snap, err := createModuleSnapshot()
+	if err != nil {
+		// pprof expects a map entry, so fake one when we haven't added anything yet.
+		b.addMappingEntry(0, 0, 0, "", "", true)
+		return
+	}
+	defer func() { _ = syscall.CloseHandle(snap) }()
+
+	var module windows.ModuleEntry32
+	module.Size = uint32(windows.SizeofModuleEntry32)
+	err = windows.Module32First(snap, &module)
+	if err != nil {
+		// pprof expects a map entry, so fake one when we haven't added anything yet.
+		b.addMappingEntry(0, 0, 0, "", "", true)
+		return
+	}
+	for err == nil {
+		exe := syscall.UTF16ToString(module.ExePath[:])
+		b.addMappingEntry(
+			uint64(module.ModBaseAddr),
+			uint64(module.ModBaseAddr)+uint64(module.ModBaseSize),
+			0,
+			exe,
+			peBuildID(exe),
+			false,
+		)
+		err = windows.Module32Next(snap, &module)
+	}
+}
+
+func readMainModuleMapping() (start, end uint64, err error) {
+	snap, err := createModuleSnapshot()
+	if err != nil {
+		return 0, 0, err
+	}
+	defer func() { _ = syscall.CloseHandle(snap) }()
+
+	var module windows.ModuleEntry32
+	module.Size = uint32(windows.SizeofModuleEntry32)
+	err = windows.Module32First(snap, &module)
+	if err != nil {
+		return 0, 0, err
+	}
+
+	return uint64(module.ModBaseAddr), uint64(module.ModBaseAddr) + uint64(module.ModBaseSize), nil
+}
+
+func createModuleSnapshot() (syscall.Handle, error) {
+	for {
+		snap, err := syscall.CreateToolhelp32Snapshot(windows.TH32CS_SNAPMODULE|windows.TH32CS_SNAPMODULE32, uint32(syscall.Getpid()))
+		var errno syscall.Errno
+		if err != nil && errors.As(err, &errno) && errno == windows.ERROR_BAD_LENGTH {
+			// When CreateToolhelp32Snapshot(SNAPMODULE|SNAPMODULE32, ...) fails
+			// with ERROR_BAD_LENGTH, it should be retried until it succeeds.
+			continue
+		}
+		return snap, err
+	}
+}
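
Outside the runtime, the same module walk can be reproduced with golang.org/x/sys/windows, which I believe exposes matching Toolhelp32 wrappers (the x/sys names below are an assumption; the ERROR_BAD_LENGTH retry and the TH32CS_SNAPMODULE32 flag used above are left out for brevity):

    //go:build windows

    package main

    import (
        "fmt"
        "unsafe"

        "golang.org/x/sys/windows" // assumed to provide the Toolhelp32 wrappers
    )

    func main() {
        // Production code should also pass TH32CS_SNAPMODULE32 and retry on
        // ERROR_BAD_LENGTH, as the runtime code above does.
        snap, err := windows.CreateToolhelp32Snapshot(windows.TH32CS_SNAPMODULE, windows.GetCurrentProcessId())
        if err != nil {
            return
        }
        defer windows.CloseHandle(snap)

        var mod windows.ModuleEntry32
        mod.Size = uint32(unsafe.Sizeof(mod))
        for err = windows.Module32First(snap, &mod); err == nil; err = windows.Module32Next(snap, &mod) {
            exe := windows.UTF16ToString(mod.ExePath[:])
            fmt.Printf("%#x-%#x %s\n", mod.ModBaseAddr, mod.ModBaseAddr+uintptr(mod.ModBaseSize), exe)
        }
    }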
diff --git a/src/runtime/pprof/protobuf.go b/src/runtime/pprof/protobuf.go
index 7b99095..f7ec1ac 100644
--- a/src/runtime/pprof/protobuf.go
+++ b/src/runtime/pprof/protobuf.go
@@ -116,7 +116,7 @@
 }
 
 func (b *protobuf) boolOpt(tag int, x bool) {
-	if x == false {
+	if !x {
 		return
 	}
 	b.bool(tag, x)
diff --git a/src/runtime/pprof/runtime.go b/src/runtime/pprof/runtime.go
index dd2545b..57e9ca4 100644
--- a/src/runtime/pprof/runtime.go
+++ b/src/runtime/pprof/runtime.go
@@ -6,9 +6,13 @@
 
 import (
 	"context"
+	"runtime"
 	"unsafe"
 )
 
+// runtime_FrameStartLine is defined in runtime/symtab.go.
+func runtime_FrameStartLine(f *runtime.Frame) int
+
 // runtime_expandFinalInlineFrame is defined in runtime/symtab.go.
 func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr
 
diff --git a/src/runtime/pprof/rusage_test.go b/src/runtime/pprof/rusage_test.go
index b82b1af..8039510 100644
--- a/src/runtime/pprof/rusage_test.go
+++ b/src/runtime/pprof/rusage_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris
+//go:build unix
 
 package pprof
 
diff --git a/src/runtime/pprof/uname_linux_test.go b/src/runtime/pprof/uname_linux_test.go
deleted file mode 100644
index 8374c83..0000000
--- a/src/runtime/pprof/uname_linux_test.go
+++ /dev/null
@@ -1,61 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build linux
-
-package pprof
-
-import (
-	"fmt"
-	"regexp"
-	"strconv"
-	"syscall"
-)
-
-var versionRe = regexp.MustCompile(`^(\d+)(?:\.(\d+)(?:\.(\d+))).*$`)
-
-func linuxKernelVersion() (major, minor, patch int, err error) {
-	var uname syscall.Utsname
-	if err := syscall.Uname(&uname); err != nil {
-		return 0, 0, 0, err
-	}
-
-	buf := make([]byte, 0, len(uname.Release))
-	for _, b := range uname.Release {
-		if b == 0 {
-			break
-		}
-		buf = append(buf, byte(b))
-	}
-	rl := string(buf)
-
-	m := versionRe.FindStringSubmatch(rl)
-	if m == nil {
-		return 0, 0, 0, fmt.Errorf("error matching version number in %q", rl)
-	}
-
-	v, err := strconv.ParseInt(m[1], 10, 64)
-	if err != nil {
-		return 0, 0, 0, fmt.Errorf("error parsing major version %q in %s: %w", m[1], rl, err)
-	}
-	major = int(v)
-
-	if len(m) >= 3 {
-		v, err := strconv.ParseInt(m[2], 10, 64)
-		if err != nil {
-			return 0, 0, 0, fmt.Errorf("error parsing minor version %q in %s: %w", m[2], rl, err)
-		}
-		minor = int(v)
-	}
-
-	if len(m) >= 4 {
-		v, err := strconv.ParseInt(m[3], 10, 64)
-		if err != nil {
-			return 0, 0, 0, fmt.Errorf("error parsing patch version %q in %s: %w", m[3], rl, err)
-		}
-		patch = int(v)
-	}
-
-	return
-}
diff --git a/src/runtime/pprof/uname_other_test.go b/src/runtime/pprof/uname_other_test.go
deleted file mode 100644
index 3276407..0000000
--- a/src/runtime/pprof/uname_other_test.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2021 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-//go:build !linux
-
-package pprof
-
-import (
-	"errors"
-)
-
-func linuxKernelVersion() (major, minor, patch int, err error) {
-	return 0, 0, 0, errors.New("not running on linux")
-}
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
index da24f50..4f62fc6 100644
--- a/src/runtime/preempt.go
+++ b/src/runtime/preempt.go
@@ -55,7 +55,6 @@
 import (
 	"internal/abi"
 	"internal/goarch"
-	"runtime/internal/atomic"
 )
 
 type suspendGState struct {
@@ -192,7 +191,7 @@
 		case _Grunning:
 			// Optimization: if there is already a pending preemption request
 			// (from the previous loop iteration), don't bother with the atomics.
-			if gp.preemptStop && gp.preempt && gp.stackguard0 == stackPreempt && asyncM == gp.m && atomic.Load(&asyncM.preemptGen) == asyncGen {
+			if gp.preemptStop && gp.preempt && gp.stackguard0 == stackPreempt && asyncM == gp.m && asyncM.preemptGen.Load() == asyncGen {
 				break
 			}
 
@@ -208,7 +207,7 @@
 
 			// Prepare for asynchronous preemption.
 			asyncM2 := gp.m
-			asyncGen2 := atomic.Load(&asyncM2.preemptGen)
+			asyncGen2 := asyncM2.preemptGen.Load()
 			needAsync := asyncM != asyncM2 || asyncGen != asyncGen2
 			asyncM = asyncM2
 			asyncGen = asyncGen2
@@ -419,7 +418,7 @@
 		inltree := (*[1 << 20]inlinedCall)(inldata)
 		ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil)
 		if ix >= 0 {
-			name = funcnameFromNameoff(f, inltree[ix].func_)
+			name = funcnameFromNameOff(f, inltree[ix].nameOff)
 		}
 	}
 	if hasPrefix(name, "runtime.") ||
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
index 31f7c8b..94a84fb 100644
--- a/src/runtime/preempt_amd64.s
+++ b/src/runtime/preempt_amd64.s
@@ -1,6 +1,7 @@
 // Code generated by mkpreempt.go; DO NOT EDIT.
 
 #include "go_asm.h"
+#include "asm_amd64.h"
 #include "textflag.h"
 
 TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
@@ -27,8 +28,10 @@
 	MOVQ R14, 96(SP)
 	MOVQ R15, 104(SP)
 	#ifdef GOOS_darwin
+	#ifndef hasAVX
 	CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
 	JE 2(PC)
+	#endif
 	VZEROUPPER
 	#endif
 	MOVUPS X0, 112(SP)
diff --git a/src/runtime/print.go b/src/runtime/print.go
index b2a642b..a1e0b8e 100644
--- a/src/runtime/print.go
+++ b/src/runtime/print.go
@@ -6,7 +6,6 @@
 
 import (
 	"internal/goarch"
-	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -40,7 +39,7 @@
 func recordForPanic(b []byte) {
 	printlock()
 
-	if atomic.Load(&panicking) == 0 {
+	if panicking.Load() == 0 {
 		// Not actively crashing: maintain circular buffer of print output.
 		for i := 0; i < len(b); {
 			n := copy(printBacklog[printBacklogIndex:], b[i:])
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 3991a48..554a60d 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -73,7 +73,7 @@
 // If there is at least one spinning thread (sched.nmspinning>1), we don't
 // unpark new threads when submitting work. To compensate for that, if the last
 // spinning thread finds work and stops spinning, it must unpark a new spinning
-// thread.  This approach smooths out unjustified spikes of thread unparking,
+// thread. This approach smooths out unjustified spikes of thread unparking,
 // but at the same time guarantees eventual maximal CPU parallelism
 // utilization.
 //
@@ -143,11 +143,11 @@
 
 // The main goroutine.
 func main() {
-	g := getg()
+	mp := getg().m
 
 	// Racectx of m0->g0 is used only as the parent of the main goroutine.
 	// It must not be used for anything else.
-	g.m.g0.racectx = 0
+	mp.g0.racectx = 0
 
 	// Max stack size is 1 GB on 64-bit, 250 MB on 32-bit.
 	// Using decimal instead of binary GB and MB because
@@ -180,7 +180,7 @@
 	// to preserve the lock.
 	lockOSThread()
 
-	if g.m != &m0 {
+	if mp != &m0 {
 		throw("runtime.main not on m0")
 	}
 
@@ -249,6 +249,7 @@
 	fn := main_main // make an indirect call, as the linker doesn't know the address of the main package when laying down the runtime
 	fn()
 	if raceenabled {
+		runExitHooks(0) // run hooks now, since racefini does not return
 		racefini()
 	}
 
@@ -256,18 +257,19 @@
 	// another goroutine at the same time as main returns,
 	// let the other goroutine finish printing the panic trace.
 	// Once it does, it will exit. See issues 3934 and 20018.
-	if atomic.Load(&runningPanicDefers) != 0 {
+	if runningPanicDefers.Load() != 0 {
 		// Running deferred functions should not take long.
 		for c := 0; c < 1000; c++ {
-			if atomic.Load(&runningPanicDefers) == 0 {
+			if runningPanicDefers.Load() == 0 {
 				break
 			}
 			Gosched()
 		}
 	}
-	if atomic.Load(&panicking) != 0 {
+	if panicking.Load() != 0 {
 		gopark(nil, nil, waitReasonPanicWait, traceEvGoStop, 1)
 	}
+	runExitHooks(0)
 
 	exit(0)
 	for {
@@ -279,8 +281,9 @@
 // os_beforeExit is called from os.Exit(0).
 //
 //go:linkname os_beforeExit os.runtime_beforeExit
-func os_beforeExit() {
-	if raceenabled {
+func os_beforeExit(exitCode int) {
+	runExitHooks(exitCode)
+	if exitCode == 0 && raceenabled {
 		racefini()
 	}
 }
@@ -295,10 +298,10 @@
 	lockInit(&forcegc.lock, lockRankForcegc)
 	for {
 		lock(&forcegc.lock)
-		if forcegc.idle != 0 {
+		if forcegc.idle.Load() {
 			throw("forcegc: phase error")
 		}
-		atomic.Store(&forcegc.idle, 1)
+		forcegc.idle.Store(true)
 		goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1)
 		// this goroutine is explicitly resumed by sysmon
 		if debug.gctrace > 0 {
@@ -326,6 +329,21 @@
 	mcall(goschedguarded_m)
 }
 
+// goschedIfBusy yields the processor like gosched, but only does so if
+// there are no idle Ps or if we're on the only P and there's nothing in
+// the run queue. In both cases, there is freely available idle time.
+//
+//go:nosplit
+func goschedIfBusy() {
+	gp := getg()
+	// Call gosched if gp.preempt is set; we may be in a tight loop that
+	// doesn't otherwise yield.
+	if !gp.preempt && sched.npidle.Load() > 0 {
+		return
+	}
+	mcall(gosched_m)
+}
+
 // Puts the current goroutine into a waiting state and calls unlockf on the
 // system stack.
 //
@@ -463,7 +481,7 @@
 	releasem(mp)
 }
 
-// called from assembly
+// called from assembly.
 func badmcall(fn func(*g)) {
 	throw("runtime: mcall called on m->g0 stack")
 }
@@ -476,22 +494,16 @@
 	panic(plainError("arg size to reflect.call more than 1GB"))
 }
 
-var badmorestackg0Msg = "fatal: morestack on g0\n"
-
 //go:nosplit
 //go:nowritebarrierrec
 func badmorestackg0() {
-	sp := stringStructOf(&badmorestackg0Msg)
-	write(2, sp.str, int32(sp.len))
+	writeErrStr("fatal: morestack on g0\n")
 }
 
-var badmorestackgsignalMsg = "fatal: morestack on gsignal\n"
-
 //go:nosplit
 //go:nowritebarrierrec
 func badmorestackgsignal() {
-	sp := stringStructOf(&badmorestackgsignalMsg)
-	write(2, sp.str, int32(sp.len))
+	writeErrStr("fatal: morestack on gsignal\n")
 }
 
 //go:nosplit
@@ -600,35 +612,13 @@
 	_GoidCacheBatch = 16
 )
 
-// cpuinit extracts the environment variable GODEBUG from the environment on
-// Unix-like operating systems and calls internal/cpu.Initialize.
-func cpuinit() {
-	const prefix = "GODEBUG="
-	var env string
-
+// cpuinit sets up CPU feature flags and calls internal/cpu.Initialize. env should be the complete
+// value of the GODEBUG environment variable.
+func cpuinit(env string) {
 	switch GOOS {
 	case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
 		cpu.DebugOptions = true
-
-		// Similar to goenv_unix but extracts the environment value for
-		// GODEBUG directly.
-		// TODO(moehrmann): remove when general goenvs() can be called before cpuinit()
-		n := int32(0)
-		for argv_index(argv, argc+1+n) != nil {
-			n++
-		}
-
-		for i := int32(0); i < n; i++ {
-			p := argv_index(argv, argc+1+i)
-			s := *(*string)(unsafe.Pointer(&stringStruct{unsafe.Pointer(p), findnull(p)}))
-
-			if hasPrefix(s, prefix) {
-				env = gostring(p)[len(prefix):]
-				break
-			}
-		}
 	}
-
 	cpu.Initialize(env)
 
 	// Support cpu feature variables are used in code generated by the compiler
@@ -647,6 +637,35 @@
 	}
 }
 
+// getGodebugEarly extracts the environment variable GODEBUG from the environment on
+// Unix-like operating systems and returns it. This function exists to extract GODEBUG
+// early before much of the runtime is initialized.
+func getGodebugEarly() string {
+	const prefix = "GODEBUG="
+	var env string
+	switch GOOS {
+	case "aix", "darwin", "ios", "dragonfly", "freebsd", "netbsd", "openbsd", "illumos", "solaris", "linux":
+		// Similar to goenv_unix but extracts the environment value for
+		// GODEBUG directly.
+		// TODO(moehrmann): remove when general goenvs() can be called before cpuinit()
+		n := int32(0)
+		for argv_index(argv, argc+1+n) != nil {
+			n++
+		}
+
+		for i := int32(0); i < n; i++ {
+			p := argv_index(argv, argc+1+i)
+			s := unsafe.String(p, findnull(p))
+
+			if hasPrefix(s, prefix) {
+				env = gostring(p)[len(prefix):]
+				break
+			}
+		}
+	}
+	return env
+}
+
 // The bootstrap sequence is:
 //
 //	call osinit
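
getGodebugEarly walks the raw envp block because os.Environ is not usable this early in the bootstrap. In ordinary Go code the same lookup is just a prefix scan; a minimal sketch (the helper name is hypothetical):

    package main

    import (
        "fmt"
        "os"
        "strings"
    )

    // godebugFromEnviron does over a []string what the runtime does over the
    // raw envp array: return the value of the first GODEBUG= entry.
    func godebugFromEnviron(environ []string) string {
        const prefix = "GODEBUG="
        for _, kv := range environ {
            if strings.HasPrefix(kv, prefix) {
                return kv[len(prefix):]
            }
        }
        return ""
    }

    func main() {
        fmt.Println(godebugFromEnviron(os.Environ()))
    }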
@@ -678,9 +697,9 @@
 
 	// raceinit must be the first call to race detector.
 	// In particular, it must be done before mallocinit below calls racemapshadow.
-	_g_ := getg()
+	gp := getg()
 	if raceenabled {
-		_g_.racectx, raceprocctx0 = raceinit()
+		gp.racectx, raceprocctx0 = raceinit()
 	}
 
 	sched.maxmcount = 10000
@@ -691,30 +710,35 @@
 	moduledataverify()
 	stackinit()
 	mallocinit()
-	cpuinit()      // must run before alginit
-	alginit()      // maps, hash, fastrand must not be used before this call
-	fastrandinit() // must run before mcommoninit
-	mcommoninit(_g_.m, -1)
+	godebug := getGodebugEarly()
+	initPageTrace(godebug) // must run after mallocinit but before anything allocates
+	cpuinit(godebug)       // must run before alginit
+	alginit()              // maps, hash, fastrand must not be used before this call
+	fastrandinit()         // must run before mcommoninit
+	mcommoninit(gp.m, -1)
 	modulesinit()   // provides activeModules
 	typelinksinit() // uses maps, activeModules
 	itabsinit()     // uses activeModules
 	stkobjinit()    // must run before GC starts
 
-	sigsave(&_g_.m.sigmask)
-	initSigmask = _g_.m.sigmask
-
-	if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 {
-		println(offset)
-		throw("sched.timeToRun not aligned to 8 bytes")
-	}
+	sigsave(&gp.m.sigmask)
+	initSigmask = gp.m.sigmask
 
 	goargs()
 	goenvs()
 	parsedebugvars()
 	gcinit()
 
+	// if disableMemoryProfiling is set, update MemProfileRate to 0 to turn off memprofile.
+	// Note: parsedebugvars may update MemProfileRate, but when disableMemoryProfiling is
+	// set to true by the linker, it means that nothing is consuming the profile, so it
+	// is safe to set MemProfileRate to 0.
+	if disableMemoryProfiling {
+		MemProfileRate = 0
+	}
+
 	lock(&sched.lock)
-	sched.lastpoll = uint64(nanotime())
+	sched.lastpoll.Store(nanotime())
 	procs := ncpu
 	if n, ok := atoi32(gogetenv("GOMAXPROCS")); ok && n > 0 {
 		procs = n
@@ -733,8 +757,8 @@
 	if debug.cgocheck > 1 {
 		writeBarrier.cgo = true
 		writeBarrier.enabled = true
-		for _, p := range allp {
-			p.wbBuf.reset()
+		for _, pp := range allp {
+			pp.wbBuf.reset()
 		}
 	}
 
@@ -751,9 +775,9 @@
 }
 
 func dumpgstatus(gp *g) {
-	_g_ := getg()
-	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
-	print("runtime:  g:  g=", _g_, ", goid=", _g_.goid, ",  g->atomicstatus=", readgstatus(_g_), "\n")
+	thisg := getg()
+	print("runtime:   gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+	print("runtime: getg:  g=", thisg, ", goid=", thisg.goid, ",  g->atomicstatus=", readgstatus(thisg), "\n")
 }
 
 // sched.lock must be held.
@@ -784,10 +808,10 @@
 
 // Pre-allocated ID may be passed as 'id', or omitted by passing -1.
 func mcommoninit(mp *m, id int64) {
-	_g_ := getg()
+	gp := getg()
 
 	// g0 stack won't make sense for the user (and is not necessarily unwindable).
-	if _g_ != _g_.m.g0 {
+	if gp != gp.m.g0 {
 		callers(1, mp.createstack[:])
 	}
 
@@ -832,6 +856,12 @@
 	}
 }
 
+func (mp *m) becomeSpinning() {
+	mp.spinning = true
+	sched.nmspinning.Add(1)
+	sched.needspinning.Store(0)
+}
+
 var fastrandseed uintptr
 
 func fastrandinit() {
@@ -848,7 +878,6 @@
 	status := readgstatus(gp)
 
 	// Mark runnable.
-	_g_ := getg()
 	mp := acquirem() // disable preemption because it can be holding p in a local var
 	if status&^_Gscan != _Gwaiting {
 		dumpgstatus(gp)
@@ -857,7 +886,7 @@
 
 	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
 	casgstatus(gp, _Gwaiting, _Grunnable)
-	runqput(_g_.m.p.ptr(), gp, next)
+	runqput(mp.p.ptr(), gp, next)
 	wakep()
 	releasem(mp)
 }
@@ -868,20 +897,20 @@
 
 // freezing is set to non-zero if the runtime is trying to freeze the
 // world.
-var freezing uint32
+var freezing atomic.Bool
 
 // Similar to stopTheWorld but best-effort and can be called several times.
 // There is no reverse operation, used during crashing.
 // This function must not lock any mutexes.
 func freezetheworld() {
-	atomic.Store(&freezing, 1)
+	freezing.Store(true)
 	// stopwait and preemption requests can be lost
 	// due to races with concurrently executing threads,
 	// so try several times
 	for i := 0; i < 5; i++ {
 		// this should tell the scheduler to not start any new goroutines
 		sched.stopwait = freezeStopWait
-		atomic.Store(&sched.gcwaiting, 1)
+		sched.gcwaiting.Store(true)
 		// this should stop running goroutines
 		if !preemptall() {
 			break // no running goroutines
@@ -899,7 +928,7 @@
 //
 //go:nosplit
 func readgstatus(gp *g) uint32 {
-	return atomic.Load(&gp.atomicstatus)
+	return gp.atomicstatus.Load()
 }
 
 // The Gscanstatuses are acting like locks and this releases them.
@@ -921,7 +950,7 @@
 		_Gscansyscall,
 		_Gscanpreempted:
 		if newval == oldval&^_Gscan {
-			success = atomic.Cas(&gp.atomicstatus, oldval, newval)
+			success = gp.atomicstatus.CompareAndSwap(oldval, newval)
 		}
 	}
 	if !success {
@@ -941,7 +970,7 @@
 		_Gwaiting,
 		_Gsyscall:
 		if newval == oldval|_Gscan {
-			r := atomic.Cas(&gp.atomicstatus, oldval, newval)
+			r := gp.atomicstatus.CompareAndSwap(oldval, newval)
 			if r {
 				acquireLockRank(lockRankGscan)
 			}
@@ -954,6 +983,10 @@
 	panic("not reached")
 }
 
+// casgstatusAlwaysTrack is a debug flag that causes casgstatus to always track
+// various latencies on every transition instead of sampling them.
+var casgstatusAlwaysTrack = false
+
 // If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
 // and casfrom_Gscanstatus instead.
 // casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
@@ -977,15 +1010,15 @@
 
 	// loop if gp->atomicstatus is in a scan state giving
 	// GC time to finish and change the state to oldval.
-	for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ {
-		if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
+	for i := 0; !gp.atomicstatus.CompareAndSwap(oldval, newval); i++ {
+		if oldval == _Gwaiting && gp.atomicstatus.Load() == _Grunnable {
 			throw("casgstatus: waiting for Gwaiting but is Grunnable")
 		}
 		if i == 0 {
 			nextYield = nanotime() + yieldDelay
 		}
 		if nanotime() < nextYield {
-			for x := 0; x < 10 && gp.atomicstatus != oldval; x++ {
+			for x := 0; x < 10 && gp.atomicstatus.Load() != oldval; x++ {
 				procyield(1)
 			}
 		} else {
@@ -994,37 +1027,75 @@
 		}
 	}
 
-	// Handle tracking for scheduling latencies.
 	if oldval == _Grunning {
-		// Track every 8th time a goroutine transitions out of running.
-		if gp.trackingSeq%gTrackingPeriod == 0 {
+		// Track every gTrackingPeriod'th time a goroutine transitions out of running.
+		if casgstatusAlwaysTrack || gp.trackingSeq%gTrackingPeriod == 0 {
 			gp.tracking = true
 		}
 		gp.trackingSeq++
 	}
-	if gp.tracking {
-		if oldval == _Grunnable {
-			// We transitioned out of runnable, so measure how much
-			// time we spent in this state and add it to
-			// runnableTime.
-			now := nanotime()
-			gp.runnableTime += now - gp.runnableStamp
-			gp.runnableStamp = 0
-		}
-		if newval == _Grunnable {
-			// We just transitioned into runnable, so record what
-			// time that happened.
-			now := nanotime()
-			gp.runnableStamp = now
-		} else if newval == _Grunning {
-			// We're transitioning into running, so turn off
-			// tracking and record how much time we spent in
-			// runnable.
-			gp.tracking = false
-			sched.timeToRun.record(gp.runnableTime)
-			gp.runnableTime = 0
-		}
+	if !gp.tracking {
+		return
 	}
+
+	// Handle various kinds of tracking.
+	//
+	// Currently:
+	// - Time spent in runnable.
+	// - Time spent blocked on a sync.Mutex or sync.RWMutex.
+	switch oldval {
+	case _Grunnable:
+		// We transitioned out of runnable, so measure how much
+		// time we spent in this state and add it to
+		// runnableTime.
+		now := nanotime()
+		gp.runnableTime += now - gp.trackingStamp
+		gp.trackingStamp = 0
+	case _Gwaiting:
+		if !gp.waitreason.isMutexWait() {
+			// Not blocking on a lock.
+			break
+		}
+		// Blocking on a lock, measure it. Note that because we're
+		// sampling, we have to multiply by our sampling period to get
+		// a more representative estimate of the absolute value.
+		// gTrackingPeriod also represents an accurate sampling period
+		// because we can only enter this state from _Grunning.
+		now := nanotime()
+		sched.totalMutexWaitTime.Add((now - gp.trackingStamp) * gTrackingPeriod)
+		gp.trackingStamp = 0
+	}
+	switch newval {
+	case _Gwaiting:
+		if !gp.waitreason.isMutexWait() {
+			// Not blocking on a lock.
+			break
+		}
+		// Blocking on a lock. Write down the timestamp.
+		now := nanotime()
+		gp.trackingStamp = now
+	case _Grunnable:
+		// We just transitioned into runnable, so record what
+		// time that happened.
+		now := nanotime()
+		gp.trackingStamp = now
+	case _Grunning:
+		// We're transitioning into running, so turn off
+		// tracking and record how much time we spent in
+		// runnable.
+		gp.tracking = false
+		sched.timeToRun.record(gp.runnableTime)
+		gp.runnableTime = 0
+	}
+}
+
+// casGToWaiting transitions gp from old to _Gwaiting, and sets the wait reason.
+//
+// Use this over casgstatus when possible to ensure that a waitreason is set.
+func casGToWaiting(gp *g, old uint32, reason waitReason) {
+	// Set the wait reason before calling casgstatus, because casgstatus will use it.
+	gp.waitreason = reason
+	casgstatus(gp, old, _Gwaiting)
 }
 
 // casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
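
The mutex-wait tracking above only records one of every gTrackingPeriod transitions, so each recorded interval is multiplied by the period to estimate the true total. A pure-Go sketch of that scale-up with made-up numbers:

    package main

    import "fmt"

    func main() {
        const trackingPeriod = 8 // stand-in for gTrackingPeriod

        // Hypothetical blocked intervals, one recorded per trackingPeriod
        // transitions into _Gwaiting (nanoseconds).
        sampled := []int64{1200, 950, 3100}

        var total int64
        for _, d := range sampled {
            // Scale each sample by the sampling period to estimate the
            // aggregate wait time across the unsampled transitions too.
            total += d * trackingPeriod
        }
        fmt.Println("estimated total mutex wait:", total, "ns")
    }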
@@ -1040,7 +1111,7 @@
 		if oldstatus != _Gwaiting && oldstatus != _Grunnable {
 			throw("copystack: bad status, not Gwaiting or Grunnable")
 		}
-		if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
+		if gp.atomicstatus.CompareAndSwap(oldstatus, _Gcopystack) {
 			return oldstatus
 		}
 	}
@@ -1055,7 +1126,7 @@
 		throw("bad g transition")
 	}
 	acquireLockRank(lockRankGscan)
-	for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) {
+	for !gp.atomicstatus.CompareAndSwap(_Grunning, _Gscan|_Gpreempted) {
 	}
 }
 
@@ -1066,7 +1137,8 @@
 	if old != _Gpreempted || new != _Gwaiting {
 		throw("bad g transition")
 	}
-	return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting)
+	gp.waitreason = waitReasonPreempted
+	return gp.atomicstatus.CompareAndSwap(_Gpreempted, _Gwaiting)
 }
 
 // stopTheWorld stops all P's from executing goroutines, interrupting
@@ -1098,7 +1170,8 @@
 		// must have preempted all goroutines, including any attempting
 		// to scan our stack, in which case, any stack shrinking will
 		// have already completed by the time we exit.
-		casgstatus(gp, _Grunning, _Gwaiting)
+		// Don't provide a wait reason because we're still executing.
+		casGToWaiting(gp, _Grunning, waitReasonStoppingTheWorld)
 		stopTheWorldWithSema()
 		casgstatus(gp, _Gwaiting, _Grunning)
 	})
@@ -1177,41 +1250,41 @@
 // Holding worldsema causes any other goroutines invoking
 // stopTheWorld to block.
 func stopTheWorldWithSema() {
-	_g_ := getg()
+	gp := getg()
 
 	// If we hold a lock, then we won't be able to stop another M
 	// that is blocked trying to acquire the lock.
-	if _g_.m.locks > 0 {
+	if gp.m.locks > 0 {
 		throw("stopTheWorld: holding locks")
 	}
 
 	lock(&sched.lock)
 	sched.stopwait = gomaxprocs
-	atomic.Store(&sched.gcwaiting, 1)
+	sched.gcwaiting.Store(true)
 	preemptall()
 	// stop current P
-	_g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
+	gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
 	sched.stopwait--
 	// try to retake all P's in Psyscall status
-	for _, p := range allp {
-		s := p.status
-		if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) {
+	for _, pp := range allp {
+		s := pp.status
+		if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
 			if trace.enabled {
-				traceGoSysBlock(p)
-				traceProcStop(p)
+				traceGoSysBlock(pp)
+				traceProcStop(pp)
 			}
-			p.syscalltick++
+			pp.syscalltick++
 			sched.stopwait--
 		}
 	}
 	// stop idle P's
 	now := nanotime()
 	for {
-		p, _ := pidleget(now)
-		if p == nil {
+		pp, _ := pidleget(now)
+		if pp == nil {
 			break
 		}
-		p.status = _Pgcstop
+		pp.status = _Pgcstop
 		sched.stopwait--
 	}
 	wait := sched.stopwait > 0
@@ -1234,13 +1307,13 @@
 	if sched.stopwait != 0 {
 		bad = "stopTheWorld: not stopped (stopwait != 0)"
 	} else {
-		for _, p := range allp {
-			if p.status != _Pgcstop {
+		for _, pp := range allp {
+			if pp.status != _Pgcstop {
 				bad = "stopTheWorld: not stopped (status != _Pgcstop)"
 			}
 		}
 	}
-	if atomic.Load(&freezing) != 0 {
+	if freezing.Load() {
 		// Some other thread is panicking. This can cause the
 		// sanity checks above to fail if the panic happens in
 		// the signal handler on a stopped thread. Either way,
@@ -1271,9 +1344,9 @@
 		newprocs = 0
 	}
 	p1 := procresize(procs)
-	sched.gcwaiting = 0
-	if sched.sysmonwait != 0 {
-		sched.sysmonwait = 0
+	sched.gcwaiting.Store(false)
+	if sched.sysmonwait.Load() {
+		sched.sysmonwait.Store(false)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
@@ -1354,9 +1427,9 @@
 //go:nosplit
 //go:nowritebarrierrec
 func mstart0() {
-	_g_ := getg()
+	gp := getg()
 
-	osStack := _g_.stack.lo == 0
+	osStack := gp.stack.lo == 0
 	if osStack {
 		// Initialize stack bounds from system stack.
 		// Cgo may have left stack size in stack.hi.
@@ -1366,25 +1439,25 @@
 		// We set hi to &size, but there are things above
 		// it. The 1024 is supposed to compensate this,
 		// but is somewhat arbitrary.
-		size := _g_.stack.hi
+		size := gp.stack.hi
 		if size == 0 {
 			size = 8192 * sys.StackGuardMultiplier
 		}
-		_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
-		_g_.stack.lo = _g_.stack.hi - size + 1024
+		gp.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
+		gp.stack.lo = gp.stack.hi - size + 1024
 	}
 	// Initialize stack guard so that we can start calling regular
 	// Go code.
-	_g_.stackguard0 = _g_.stack.lo + _StackGuard
+	gp.stackguard0 = gp.stack.lo + _StackGuard
 	// This is the g0, so we can also call go:systemstack
 	// functions, which check stackguard1.
-	_g_.stackguard1 = _g_.stackguard0
+	gp.stackguard1 = gp.stackguard0
 	mstart1()
 
 	// Exit this thread.
 	if mStackIsSystemAllocated() {
 		// Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate
-		// the stack, but put it in _g_.stack before mstart,
+		// the stack, but put it in gp.stack before mstart,
 		// so the logic above hasn't set osStack yet.
 		osStack = true
 	}
@@ -1396,9 +1469,9 @@
 //
 //go:noinline
 func mstart1() {
-	_g_ := getg()
+	gp := getg()
 
-	if _g_ != _g_.m.g0 {
+	if gp != gp.m.g0 {
 		throw("bad runtime·mstart")
 	}
 
@@ -1408,26 +1481,26 @@
 	// so other calls can reuse the current frame.
 	// And goexit0 does a gogo that needs to return from mstart1
 	// and let mstart0 exit the thread.
-	_g_.sched.g = guintptr(unsafe.Pointer(_g_))
-	_g_.sched.pc = getcallerpc()
-	_g_.sched.sp = getcallersp()
+	gp.sched.g = guintptr(unsafe.Pointer(gp))
+	gp.sched.pc = getcallerpc()
+	gp.sched.sp = getcallersp()
 
 	asminit()
 	minit()
 
 	// Install signal handlers; after minit so that minit can
 	// prepare the thread to be able to handle the signals.
-	if _g_.m == &m0 {
+	if gp.m == &m0 {
 		mstartm0()
 	}
 
-	if fn := _g_.m.mstartfn; fn != nil {
+	if fn := gp.m.mstartfn; fn != nil {
 		fn()
 	}
 
-	if _g_.m != &m0 {
-		acquirep(_g_.m.nextp.ptr())
-		_g_.m.nextp = 0
+	if gp.m != &m0 {
+		acquirep(gp.m.nextp.ptr())
+		gp.m.nextp = 0
 	}
 	schedule()
 }
@@ -1461,7 +1534,7 @@
 // mexit tears down and exits the current thread.
 //
 // Don't call this directly to exit the thread, since it must run at
-// the top of the thread stack. Instead, use gogo(&_g_.m.g0.sched) to
+// the top of the thread stack. Instead, use gogo(&gp.m.g0.sched) to
 // unwind the stack to the point that exits the thread.
 //
 // It is entered with m.p != nil, so write barriers are allowed. It
@@ -1469,10 +1542,9 @@
 //
 //go:yeswritebarrierrec
 func mexit(osStack bool) {
-	g := getg()
-	m := g.m
+	mp := getg().m
 
-	if m == &m0 {
+	if mp == &m0 {
 		// This is the main thread. Just wedge it.
 		//
 		// On Linux, exiting the main thread puts the process
@@ -1497,41 +1569,40 @@
 	unminit()
 
 	// Free the gsignal stack.
-	if m.gsignal != nil {
-		stackfree(m.gsignal.stack)
+	if mp.gsignal != nil {
+		stackfree(mp.gsignal.stack)
 		// On some platforms, when calling into VDSO (e.g. nanotime)
 		// we store our g on the gsignal stack, if there is one.
 		// Now the stack is freed, unlink it from the m, so we
 		// won't write to it when calling VDSO code.
-		m.gsignal = nil
+		mp.gsignal = nil
 	}
 
 	// Remove m from allm.
 	lock(&sched.lock)
 	for pprev := &allm; *pprev != nil; pprev = &(*pprev).alllink {
-		if *pprev == m {
-			*pprev = m.alllink
+		if *pprev == mp {
+			*pprev = mp.alllink
 			goto found
 		}
 	}
 	throw("m not found in allm")
 found:
-	if !osStack {
-		// Delay reaping m until it's done with the stack.
-		//
-		// If this is using an OS stack, the OS will free it
-		// so there's no need for reaping.
-		atomic.Store(&m.freeWait, 1)
-		// Put m on the free list, though it will not be reaped until
-		// freeWait is 0. Note that the free list must not be linked
-		// through alllink because some functions walk allm without
-		// locking, so may be using alllink.
-		m.freelink = sched.freem
-		sched.freem = m
-	}
+	// Delay reaping m until it's done with the stack.
+	//
+	// Put mp on the free list, though it will not be reaped while freeWait
+	// is freeMWait. mp is no longer reachable via allm, so even if it is
+	// on an OS stack, we must keep a reference to mp alive so that the GC
+	// doesn't free mp while we are still using it.
+	//
+	// Note that the free list must not be linked through alllink because
+	// some functions walk allm without locking, so may be using alllink.
+	mp.freeWait.Store(freeMWait)
+	mp.freelink = sched.freem
+	sched.freem = mp
 	unlock(&sched.lock)
 
-	atomic.Xadd64(&ncgocall, int64(m.ncgocall))
+	atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
 
 	// Release the P.
 	handoffp(releasep())
@@ -1548,16 +1619,19 @@
 	if GOOS == "darwin" || GOOS == "ios" {
 		// Make sure pendingPreemptSignals is correct when an M exits.
 		// For #41702.
-		if atomic.Load(&m.signalPending) != 0 {
-			atomic.Xadd(&pendingPreemptSignals, -1)
+		if mp.signalPending.Load() != 0 {
+			pendingPreemptSignals.Add(-1)
 		}
 	}
 
 	// Destroy all allocated resources. After this is called, we may no
 	// longer take any locks.
-	mdestroy(m)
+	mdestroy(mp)
 
 	if osStack {
+		// No more uses of mp, so it is safe to drop the reference.
+		mp.freeWait.Store(freeMRef)
+
 		// Return from mstart and let the system thread
 		// library free the g0 stack and terminate the thread.
 		return
@@ -1567,7 +1641,7 @@
 	// return to. Exit the thread directly. exitThread will clear
 	// m.freeWait when it's done with the stack and the m can be
 	// reaped.
-	exitThread(&m.freeWait)
+	exitThread(&mp.freeWait)
 }
 
 // forEachP calls fn(p) for every P p when p reaches a GC safe point.
@@ -1583,7 +1657,7 @@
 //go:systemstack
 func forEachP(fn func(*p)) {
 	mp := acquirem()
-	_p_ := getg().m.p.ptr()
+	pp := getg().m.p.ptr()
 
 	lock(&sched.lock)
 	if sched.safePointWait != 0 {
@@ -1593,9 +1667,9 @@
 	sched.safePointFn = fn
 
 	// Ask all Ps to run the safe point function.
-	for _, p := range allp {
-		if p != _p_ {
-			atomic.Store(&p.runSafePointFn, 1)
+	for _, p2 := range allp {
+		if p2 != pp {
+			atomic.Store(&p2.runSafePointFn, 1)
 		}
 	}
 	preemptall()
@@ -1617,19 +1691,19 @@
 	unlock(&sched.lock)
 
 	// Run fn for the current P.
-	fn(_p_)
+	fn(pp)
 
 	// Force Ps currently in _Psyscall into _Pidle and hand them
 	// off to induce safe point function execution.
-	for _, p := range allp {
-		s := p.status
-		if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) {
+	for _, p2 := range allp {
+		s := p2.status
+		if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
 			if trace.enabled {
-				traceGoSysBlock(p)
-				traceProcStop(p)
+				traceGoSysBlock(p2)
+				traceProcStop(p2)
 			}
-			p.syscalltick++
-			handoffp(p)
+			p2.syscalltick++
+			handoffp(p2)
 		}
 	}
 
@@ -1650,8 +1724,8 @@
 	if sched.safePointWait != 0 {
 		throw("forEachP: not done")
 	}
-	for _, p := range allp {
-		if p.runSafePointFn != 0 {
+	for _, p2 := range allp {
+		if p2.runSafePointFn != 0 {
 			throw("forEachP: P did not run fn")
 		}
 	}
@@ -1707,20 +1781,20 @@
 // id is optional pre-allocated m ID. Omit by passing -1.
 //
 // This function is allowed to have write barriers even if the caller
-// isn't because it borrows _p_.
+// isn't because it borrows pp.
 //
 //go:yeswritebarrierrec
-func allocm(_p_ *p, fn func(), id int64) *m {
+func allocm(pp *p, fn func(), id int64) *m {
 	allocmLock.rlock()
 
-	// The caller owns _p_, but we may borrow (i.e., acquirep) it. We must
+	// The caller owns pp, but we may borrow (i.e., acquirep) it. We must
 	// disable preemption to ensure it is not stolen, which would make the
 	// caller lose ownership.
 	acquirem()
 
-	_g_ := getg()
-	if _g_.m.p == 0 {
-		acquirep(_p_) // temporarily borrow p for mallocs in this function
+	gp := getg()
+	if gp.m.p == 0 {
+		acquirep(pp) // temporarily borrow p for mallocs in this function
 	}
 
 	// Release the free M list. We need to do this somewhere and
@@ -1729,19 +1803,25 @@
 		lock(&sched.lock)
 		var newList *m
 		for freem := sched.freem; freem != nil; {
-			if freem.freeWait != 0 {
+			wait := freem.freeWait.Load()
+			if wait == freeMWait {
 				next := freem.freelink
 				freem.freelink = newList
 				newList = freem
 				freem = next
 				continue
 			}
-			// stackfree must be on the system stack, but allocm is
-			// reachable off the system stack transitively from
-			// startm.
-			systemstack(func() {
-				stackfree(freem.g0.stack)
-			})
+			// Free the stack if needed. For freeMRef, there is
+			// nothing to do except drop freem from the sched.freem
+			// list.
+			if wait == freeMStack {
+				// stackfree must be on the system stack, but allocm is
+				// reachable off the system stack transitively from
+				// startm.
+				systemstack(func() {
+					stackfree(freem.g0.stack)
+				})
+			}
 			freem = freem.freelink
 		}
 		sched.freem = newList
@@ -1761,11 +1841,11 @@
 	}
 	mp.g0.m = mp
 
-	if _p_ == _g_.m.p.ptr() {
+	if pp == gp.m.p.ptr() {
 		releasep()
 	}
 
-	releasem(_g_.m)
+	releasem(gp.m)
 	allocmLock.runlock()
 	return mp
 }
@@ -1813,7 +1893,7 @@
 		// for details.
 		//
 		// Can not throw, because scheduler is not initialized yet.
-		write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback)))
+		writeErrStr("fatal error: cgo callback before cgo call\n")
 		exit(1)
 	}
 
@@ -1859,10 +1939,10 @@
 	// scheduling stack is, but we assume there's at least 32 kB,
 	// which is more than enough for us.
 	setg(mp.g0)
-	_g_ := getg()
-	_g_.stack.hi = getcallersp() + 1024
-	_g_.stack.lo = getcallersp() - 32*1024
-	_g_.stackguard0 = _g_.stack.lo + _StackGuard
+	gp := getg()
+	gp.stack.hi = getcallersp() + 1024
+	gp.stack.lo = getcallersp() - 32*1024
+	gp.stackguard0 = gp.stack.lo + _StackGuard
 
 	// Initialize this thread to use the m.
 	asminit()
@@ -1870,16 +1950,14 @@
 
 	// mp.curg is now a real goroutine.
 	casgstatus(mp.curg, _Gdead, _Gsyscall)
-	atomic.Xadd(&sched.ngsys, -1)
+	sched.ngsys.Add(-1)
 }
 
-var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
-
 // newextram allocates m's and puts them on the extra list.
 // It is called with a working local m, so that it can do things
 // like call schedlock and allocate.
 func newextram() {
-	c := atomic.Xchg(&extraMWaiters, 0)
+	c := extraMWaiters.Swap(0)
 	if c > 0 {
 		for i := uint32(0); i < c; i++ {
 			oneNewExtraM()
@@ -1918,13 +1996,23 @@
 	casgstatus(gp, _Gidle, _Gdead)
 	gp.m = mp
 	mp.curg = gp
+	mp.isextra = true
 	mp.lockedInt++
 	mp.lockedg.set(gp)
 	gp.lockedm.set(mp)
-	gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1))
+	gp.goid = sched.goidgen.Add(1)
+	gp.sysblocktraced = true
 	if raceenabled {
 		gp.racectx = racegostart(abi.FuncPCABIInternal(newextram) + sys.PCQuantum)
 	}
+	if trace.enabled {
+		// Trigger two trace events for the locked g in the extra m,
+		// since the next event of the g will be traceEvGoSysExit in exitsyscall,
+		// while calling from C thread to Go.
+		traceGoCreate(gp, 0) // no start pc
+		gp.traceseq++
+		traceEvent(traceEvGoInSyscall, -1, gp.goid)
+	}
 	// put on allg for garbage collector
 	allgadd(gp)
 
@@ -1932,7 +2020,7 @@
 	// counted by gcount. It would be more "proper" to increment
 	// sched.ngfree, but that requires locking. Incrementing ngsys
 	// has the same effect.
-	atomic.Xadd(&sched.ngsys, +1)
+	sched.ngsys.Add(1)
 
 	// Add m to the extra list.
 	mnext := lockextra(true)
@@ -1973,7 +2061,7 @@
 	// Return mp.curg to dead state.
 	casgstatus(mp.curg, _Gsyscall, _Gdead)
 	mp.curg.preemptStop = false
-	atomic.Xadd(&sched.ngsys, +1)
+	sched.ngsys.Add(1)
 
 	// Block signals before unminit.
 	// Unminit unregisters the signal handling stack (but needs g on some systems).
@@ -2000,9 +2088,9 @@
 	return uintptr(unsafe.Pointer(getg().m))
 }
 
-var extram uintptr
+var extram atomic.Uintptr
 var extraMCount uint32 // Protected by lockextra
-var extraMWaiters uint32
+var extraMWaiters atomic.Uint32
 
 // lockextra locks the extra list and returns the list head.
 // The caller must unlock the list by storing a new list head
@@ -2016,7 +2104,7 @@
 
 	incr := false
 	for {
-		old := atomic.Loaduintptr(&extram)
+		old := extram.Load()
 		if old == locked {
 			osyield_no_g()
 			continue
@@ -2026,13 +2114,13 @@
 				// Add 1 to the number of threads
 				// waiting for an M.
 				// This is cleared by newextram.
-				atomic.Xadd(&extraMWaiters, 1)
+				extraMWaiters.Add(1)
 				incr = true
 			}
 			usleep_no_g(1)
 			continue
 		}
-		if atomic.Casuintptr(&extram, old, locked) {
+		if extram.CompareAndSwap(old, locked) {
 			return (*m)(unsafe.Pointer(old))
 		}
 		osyield_no_g()
@@ -2042,7 +2130,7 @@
 
 //go:nosplit
 func unlockextra(mp *m) {
-	atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
+	extram.Store(uintptr(unsafe.Pointer(mp)))
 }
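
The conversions above (extram, extraMWaiters, sched.ngsys, and friends) replace the Xadd/Xchg/Cas helper calls with method calls on typed atomics from runtime/internal/atomic. Those wrapper types closely mirror the exported sync/atomic types, so the same migration pattern can be sketched with the public package; the variable name below is illustrative only, not runtime code.

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var waiters atomic.Uint32 // was: var waiters uint32, touched only via atomic helpers

	waiters.Add(1)                     // replaces atomic.Xadd(&waiters, 1)
	n := waiters.Swap(0)               // replaces atomic.Xchg(&waiters, 0)
	ok := waiters.CompareAndSwap(0, 7) // replaces atomic.Cas(&waiters, 0, 7)

	fmt.Println(n, ok, waiters.Load()) // 1 true 7
}
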
 
 var (
@@ -2057,6 +2145,13 @@
 	execLock rwmutex
 )
 
+// These errors are reported (via writeErrStr) by some OS-specific
+// versions of newosproc and newosproc0.
+const (
+	failthreadcreate  = "runtime: failed to create new OS thread\n"
+	failallocatestack = "runtime: failed to allocate stack for the new OS thread\n"
+)
+
 // newmHandoff contains a list of m structures that need new OS threads.
 // This is used by newm in situations where newm itself can't safely
 // start an OS thread.
@@ -2085,7 +2180,7 @@
 // id is optional pre-allocated m ID. Omit by passing -1.
 //
 //go:nowritebarrierrec
-func newm(fn func(), _p_ *p, id int64) {
+func newm(fn func(), pp *p, id int64) {
 	// allocm adds a new M to allm, but they do not start until created by
 	// the OS in newm1 or the template thread.
 	//
@@ -2098,8 +2193,8 @@
 	// start.
 	acquirem()
 
-	mp := allocm(_p_, fn, id)
-	mp.nextp.set(_p_)
+	mp := allocm(pp, fn, id)
+	mp.nextp.set(pp)
 	mp.sigmask = initSigmask
 	if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" {
 		// We're on a locked M or a thread that may have been
@@ -2221,24 +2316,24 @@
 // Stops execution of the current m until new work is available.
 // Returns with acquired P.
 func stopm() {
-	_g_ := getg()
+	gp := getg()
 
-	if _g_.m.locks != 0 {
+	if gp.m.locks != 0 {
 		throw("stopm holding locks")
 	}
-	if _g_.m.p != 0 {
+	if gp.m.p != 0 {
 		throw("stopm holding p")
 	}
-	if _g_.m.spinning {
+	if gp.m.spinning {
 		throw("stopm spinning")
 	}
 
 	lock(&sched.lock)
-	mput(_g_.m)
+	mput(gp.m)
 	unlock(&sched.lock)
 	mPark()
-	acquirep(_g_.m.nextp.ptr())
-	_g_.m.nextp = 0
+	acquirep(gp.m.nextp.ptr())
+	gp.m.nextp = 0
 }
 
 func mspinning() {
@@ -2249,8 +2344,8 @@
 // Schedules some M to run the p (creates an M if necessary).
 // If p==nil, tries to get an idle P, if no idle P's does nothing.
 // May run with m.p==nil, so write barriers are not allowed.
-// If spinning is set, the caller has incremented nmspinning and startm will
-// either decrement nmspinning or set m.spinning in the newly started M.
+// If spinning is set, the caller has incremented nmspinning and must provide a
+// P. startm will set m.spinning in the newly started M.
 //
 // Callers passing a non-nil P must call from a non-preemptible context. See
 // comment on acquirem below.
@@ -2258,7 +2353,7 @@
 // Must not have write barriers because this may be called without a P.
 //
 //go:nowritebarrierrec
-func startm(_p_ *p, spinning bool) {
+func startm(pp *p, spinning bool) {
 	// Disable preemption.
 	//
 	// Every owned P must have an owner that will eventually stop it in the
@@ -2277,17 +2372,16 @@
 	// disable preemption before acquiring a P from pidleget below.
 	mp := acquirem()
 	lock(&sched.lock)
-	if _p_ == nil {
-		_p_, _ = pidleget(0)
-		if _p_ == nil {
+	if pp == nil {
+		if spinning {
+			// TODO(prattmic): All remaining calls to this function
+			// with pp == nil could be cleaned up to find a P
+			// before calling startm.
+			throw("startm: P required for spinning=true")
+		}
+		pp, _ = pidleget(0)
+		if pp == nil {
 			unlock(&sched.lock)
-			if spinning {
-				// The caller incremented nmspinning, but there are no idle Ps,
-				// so it's okay to just undo the increment and give up.
-				if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
-					throw("startm: negative nmspinning")
-				}
-			}
 			releasem(mp)
 			return
 		}
@@ -2314,8 +2408,8 @@
 			// The caller incremented nmspinning, so set m.spinning in the new M.
 			fn = mspinning
 		}
-		newm(fn, _p_, id)
-		// Ownership transfer of _p_ committed by start in newm.
+		newm(fn, pp, id)
+		// Ownership transfer of pp committed by start in newm.
 		// Preemption is now safe.
 		releasem(mp)
 		return
@@ -2327,14 +2421,14 @@
 	if nmp.nextp != 0 {
 		throw("startm: m has p")
 	}
-	if spinning && !runqempty(_p_) {
+	if spinning && !runqempty(pp) {
 		throw("startm: p has runnable gs")
 	}
 	// The caller incremented nmspinning, so set m.spinning in the new M.
 	nmp.spinning = spinning
-	nmp.nextp.set(_p_)
+	nmp.nextp.set(pp)
 	notewakeup(&nmp.park)
-	// Ownership transfer of _p_ committed by wakeup. Preemption is now
+	// Ownership transfer of pp committed by wakeup. Preemption is now
 	// safe.
 	releasem(mp)
 }
@@ -2343,34 +2437,35 @@
 // Always runs without a P, so write barriers are not allowed.
 //
 //go:nowritebarrierrec
-func handoffp(_p_ *p) {
+func handoffp(pp *p) {
 	// handoffp must start an M in any situation where
-	// findrunnable would return a G to run on _p_.
+	// findrunnable would return a G to run on pp.
 
 	// if it has local work, start it straight away
-	if !runqempty(_p_) || sched.runqsize != 0 {
-		startm(_p_, false)
+	if !runqempty(pp) || sched.runqsize != 0 {
+		startm(pp, false)
 		return
 	}
 	// if there's trace work to do, start it straight away
-	if (trace.enabled || trace.shutdown) && traceReaderAvailable() {
-		startm(_p_, false)
+	if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
+		startm(pp, false)
 		return
 	}
 	// if it has GC work, start it straight away
-	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
-		startm(_p_, false)
+	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
+		startm(pp, false)
 		return
 	}
 	// no local work, check that there are no spinning/idle M's,
 	// otherwise our help is not required
-	if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
-		startm(_p_, true)
+	if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
+		sched.needspinning.Store(0)
+		startm(pp, true)
 		return
 	}
 	lock(&sched.lock)
-	if sched.gcwaiting != 0 {
-		_p_.status = _Pgcstop
+	if sched.gcwaiting.Load() {
+		pp.status = _Pgcstop
 		sched.stopwait--
 		if sched.stopwait == 0 {
 			notewakeup(&sched.stopnote)
@@ -2378,8 +2473,8 @@
 		unlock(&sched.lock)
 		return
 	}
-	if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) {
-		sched.safePointFn(_p_)
+	if pp.runSafePointFn != 0 && atomic.Cas(&pp.runSafePointFn, 1, 0) {
+		sched.safePointFn(pp)
 		sched.safePointWait--
 		if sched.safePointWait == 0 {
 			notewakeup(&sched.safePointNote)
@@ -2387,21 +2482,21 @@
 	}
 	if sched.runqsize != 0 {
 		unlock(&sched.lock)
-		startm(_p_, false)
+		startm(pp, false)
 		return
 	}
 	// If this is the last running P and nobody is polling network,
 	// need to wakeup another M to poll network.
-	if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 {
+	if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
 		unlock(&sched.lock)
-		startm(_p_, false)
+		startm(pp, false)
 		return
 	}
 
 	// The scheduler lock cannot be held when calling wakeNetPoller below
 	// because wakeNetPoller may call wakep which may call startm.
-	when := nobarrierWakeTime(_p_)
-	pidleput(_p_, 0)
+	when := nobarrierWakeTime(pp)
+	pidleput(pp, 0)
 	unlock(&sched.lock)
 
 	if when != 0 {
@@ -2411,41 +2506,67 @@
 
 // Tries to add one more P to execute G's.
 // Called when a G is made runnable (newproc, ready).
+// Must be called with a P.
 func wakep() {
-	if atomic.Load(&sched.npidle) == 0 {
+	// Be conservative about spinning threads, only start one if none exist
+	// already.
+	if sched.nmspinning.Load() != 0 || !sched.nmspinning.CompareAndSwap(0, 1) {
 		return
 	}
-	// be conservative about spinning threads
-	if atomic.Load(&sched.nmspinning) != 0 || !atomic.Cas(&sched.nmspinning, 0, 1) {
+
+	// Disable preemption until ownership of pp transfers to the next M in
+	// startm. Otherwise preemption here would leave pp stuck waiting to
+	// enter _Pgcstop.
+	//
+	// See preemption comment on acquirem in startm for more details.
+	mp := acquirem()
+
+	var pp *p
+	lock(&sched.lock)
+	pp, _ = pidlegetSpinning(0)
+	if pp == nil {
+		if sched.nmspinning.Add(-1) < 0 {
+			throw("wakep: negative nmspinning")
+		}
+		unlock(&sched.lock)
+		releasem(mp)
 		return
 	}
-	startm(nil, true)
+	// Since we always have a P, the race in the "No M is available"
+	// comment in startm doesn't apply during the small window between the
+	// unlock here and lock in startm. A checkdead in between will always
+	// see at least one running M (ours).
+	unlock(&sched.lock)
+
+	startm(pp, true)
+
+	releasem(mp)
 }
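
wakep's new fast path above hinges on the nmspinning CompareAndSwap gate: at most one caller may win the right to start a spinning M. A standalone sketch of that gate, using the public sync/atomic package rather than the runtime's internal one (names are illustrative):

package main

import (
	"fmt"
	"sync/atomic"
)

var nSpinning atomic.Int32 // stand-in for sched.nmspinning

// tryStartSpinner reports whether the caller won the right to start a
// spinning worker. The Load rejects the common case cheaply; the
// CompareAndSwap guarantees at most one winner while the count is zero.
func tryStartSpinner() bool {
	if nSpinning.Load() != 0 {
		return false // someone is already spinning; stay conservative
	}
	return nSpinning.CompareAndSwap(0, 1)
}

func main() {
	fmt.Println(tryStartSpinner()) // true: we became the spinner
	fmt.Println(tryStartSpinner()) // false: a spinner already exists
}
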
 
 // Stops execution of the current m that is locked to a g until the g is runnable again.
 // Returns with acquired P.
 func stoplockedm() {
-	_g_ := getg()
+	gp := getg()
 
-	if _g_.m.lockedg == 0 || _g_.m.lockedg.ptr().lockedm.ptr() != _g_.m {
+	if gp.m.lockedg == 0 || gp.m.lockedg.ptr().lockedm.ptr() != gp.m {
 		throw("stoplockedm: inconsistent locking")
 	}
-	if _g_.m.p != 0 {
+	if gp.m.p != 0 {
 		// Schedule another M to run this p.
-		_p_ := releasep()
-		handoffp(_p_)
+		pp := releasep()
+		handoffp(pp)
 	}
 	incidlelocked(1)
 	// Wait until another thread schedules lockedg again.
 	mPark()
-	status := readgstatus(_g_.m.lockedg.ptr())
+	status := readgstatus(gp.m.lockedg.ptr())
 	if status&^_Gscan != _Grunnable {
 		print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n")
-		dumpgstatus(_g_.m.lockedg.ptr())
+		dumpgstatus(gp.m.lockedg.ptr())
 		throw("stoplockedm: not runnable")
 	}
-	acquirep(_g_.m.nextp.ptr())
-	_g_.m.nextp = 0
+	acquirep(gp.m.nextp.ptr())
+	gp.m.nextp = 0
 }
 
 // Schedules the locked m to run the locked gp.
@@ -2453,10 +2574,8 @@
 //
 //go:nowritebarrierrec
 func startlockedm(gp *g) {
-	_g_ := getg()
-
 	mp := gp.lockedm.ptr()
-	if mp == _g_.m {
+	if mp == getg().m {
 		throw("startlockedm: locked to me")
 	}
 	if mp.nextp != 0 {
@@ -2464,8 +2583,8 @@
 	}
 	// directly handoff current P to the locked m
 	incidlelocked(-1)
-	_p_ := releasep()
-	mp.nextp.set(_p_)
+	pp := releasep()
+	mp.nextp.set(pp)
 	notewakeup(&mp.park)
 	stopm()
 }
@@ -2473,22 +2592,22 @@
 // Stops the current m for stopTheWorld.
 // Returns when the world is restarted.
 func gcstopm() {
-	_g_ := getg()
+	gp := getg()
 
-	if sched.gcwaiting == 0 {
+	if !sched.gcwaiting.Load() {
 		throw("gcstopm: not waiting for gc")
 	}
-	if _g_.m.spinning {
-		_g_.m.spinning = false
+	if gp.m.spinning {
+		gp.m.spinning = false
 		// OK to just drop nmspinning here,
 		// startTheWorld will unpark threads as necessary.
-		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+		if sched.nmspinning.Add(-1) < 0 {
 			throw("gcstopm: negative nmspinning")
 		}
 	}
-	_p_ := releasep()
+	pp := releasep()
 	lock(&sched.lock)
-	_p_.status = _Pgcstop
+	pp.status = _Pgcstop
 	sched.stopwait--
 	if sched.stopwait == 0 {
 		notewakeup(&sched.stopnote)
@@ -2507,7 +2626,7 @@
 //
 //go:yeswritebarrierrec
 func execute(gp *g, inheritTime bool) {
-	_g_ := getg()
+	mp := getg().m
 
 	if goroutineProfile.active {
 		// Make sure that gp has had its stack written out to the goroutine
@@ -2518,19 +2637,19 @@
 
 	// Assign gp.m before entering _Grunning so running Gs have an
 	// M.
-	_g_.m.curg = gp
-	gp.m = _g_.m
+	mp.curg = gp
+	gp.m = mp
 	casgstatus(gp, _Grunnable, _Grunning)
 	gp.waitsince = 0
 	gp.preempt = false
 	gp.stackguard0 = gp.stack.lo + _StackGuard
 	if !inheritTime {
-		_g_.m.p.ptr().schedtick++
+		mp.p.ptr().schedtick++
 	}
 
 	// Check whether the profiler needs to be turned on or off.
 	hz := sched.profilehz
-	if _g_.m.profilehz != hz {
+	if mp.profilehz != hz {
 		setThreadCPUProfiler(hz)
 	}
 
@@ -2551,19 +2670,19 @@
 // tryWakeP indicates that the returned goroutine is not normal (GC worker, trace
 // reader) so the caller should try to wake a P.
 func findRunnable() (gp *g, inheritTime, tryWakeP bool) {
-	_g_ := getg()
+	mp := getg().m
 
 	// The conditions here and in handoffp must agree: if
 	// findrunnable would return a G to run, handoffp must start
 	// an M.
 
 top:
-	_p_ := _g_.m.p.ptr()
-	if sched.gcwaiting != 0 {
+	pp := mp.p.ptr()
+	if sched.gcwaiting.Load() {
 		gcstopm()
 		goto top
 	}
-	if _p_.runSafePointFn != 0 {
+	if pp.runSafePointFn != 0 {
 		runSafePointFn()
 	}
 
@@ -2571,11 +2690,11 @@
 	// which may steal timers. It's important that between now
 	// and then, nothing blocks, so these numbers remain mostly
 	// relevant.
-	now, pollUntil, _ := checkTimers(_p_, 0)
+	now, pollUntil, _ := checkTimers(pp, 0)
 
 	// Try to schedule the trace reader.
 	if trace.enabled || trace.shutdown {
-		gp = traceReader()
+		gp := traceReader()
 		if gp != nil {
 			casgstatus(gp, _Gwaiting, _Grunnable)
 			traceGoUnpark(gp, 0)
@@ -2585,18 +2704,19 @@
 
 	// Try to schedule a GC worker.
 	if gcBlackenEnabled != 0 {
-		gp, now = gcController.findRunnableGCWorker(_p_, now)
+		gp, tnow := gcController.findRunnableGCWorker(pp, now)
 		if gp != nil {
 			return gp, false, true
 		}
+		now = tnow
 	}
 
 	// Check the global runnable queue once in a while to ensure fairness.
 	// Otherwise two goroutines can completely occupy the local runqueue
 	// by constantly respawning each other.
-	if _p_.schedtick%61 == 0 && sched.runqsize > 0 {
+	if pp.schedtick%61 == 0 && sched.runqsize > 0 {
 		lock(&sched.lock)
-		gp = globrunqget(_p_, 1)
+		gp := globrunqget(pp, 1)
 		unlock(&sched.lock)
 		if gp != nil {
 			return gp, false, false
@@ -2604,7 +2724,7 @@
 	}
 
 	// Wake up the finalizer G.
-	if fingwait && fingwake {
+	if fingStatus.Load()&(fingWait|fingWake) == fingWait|fingWake {
 		if gp := wakefing(); gp != nil {
 			ready(gp, 0, true)
 		}
@@ -2614,14 +2734,14 @@
 	}
 
 	// local runq
-	if gp, inheritTime := runqget(_p_); gp != nil {
+	if gp, inheritTime := runqget(pp); gp != nil {
 		return gp, inheritTime, false
 	}
 
 	// global runq
 	if sched.runqsize != 0 {
 		lock(&sched.lock)
-		gp := globrunqget(_p_, 0)
+		gp := globrunqget(pp, 0)
 		unlock(&sched.lock)
 		if gp != nil {
 			return gp, false, false
@@ -2635,7 +2755,7 @@
 	// blocked thread (e.g. it has already returned from netpoll, but has
 	// not set lastpoll yet), this thread will do blocking netpoll below
 	// anyway.
-	if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 {
+	if netpollinited() && netpollWaiters.Load() > 0 && sched.lastpoll.Load() != 0 {
 		if list := netpoll(0); !list.empty() { // non-blocking
 			gp := list.pop()
 			injectglist(&list)
@@ -2652,15 +2772,12 @@
 	// Limit the number of spinning Ms to half the number of busy Ps.
 	// This is necessary to prevent excessive CPU consumption when
 	// GOMAXPROCS>>1 but the program parallelism is low.
-	procs := uint32(gomaxprocs)
-	if _g_.m.spinning || 2*atomic.Load(&sched.nmspinning) < procs-atomic.Load(&sched.npidle) {
-		if !_g_.m.spinning {
-			_g_.m.spinning = true
-			atomic.Xadd(&sched.nmspinning, 1)
+	if mp.spinning || 2*sched.nmspinning.Load() < gomaxprocs-sched.npidle.Load() {
+		if !mp.spinning {
+			mp.becomeSpinning()
 		}
 
 		gp, inheritTime, tnow, w, newWork := stealWork(now)
-		now = tnow
 		if gp != nil {
 			// Successfully stole.
 			return gp, inheritTime, false
@@ -2670,6 +2787,8 @@
 			// discover.
 			goto top
 		}
+
+		now = tnow
 		if w != 0 && (pollUntil == 0 || w < pollUntil) {
 			// Earlier timer to wait for.
 			pollUntil = w
@@ -2680,10 +2799,10 @@
 	//
 	// If we're in the GC mark phase, can safely scan and blacken objects,
 	// and have work to do, run idle-time marking rather than give up the P.
-	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) && gcController.addIdleMarkWorker() {
+	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) && gcController.addIdleMarkWorker() {
 		node := (*gcBgMarkWorkerNode)(gcBgMarkWorkerPool.pop())
 		if node != nil {
-			_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+			pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
 			gp := node.gp.ptr()
 			casgstatus(gp, _Gwaiting, _Grunnable)
 			if trace.enabled {
@@ -2722,19 +2841,25 @@
 
 	// return P and block
 	lock(&sched.lock)
-	if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 {
+	if sched.gcwaiting.Load() || pp.runSafePointFn != 0 {
 		unlock(&sched.lock)
 		goto top
 	}
 	if sched.runqsize != 0 {
-		gp := globrunqget(_p_, 0)
+		gp := globrunqget(pp, 0)
 		unlock(&sched.lock)
 		return gp, false, false
 	}
-	if releasep() != _p_ {
+	if !mp.spinning && sched.needspinning.Load() == 1 {
+		// See "Delicate dance" comment below.
+		mp.becomeSpinning()
+		unlock(&sched.lock)
+		goto top
+	}
+	if releasep() != pp {
 		throw("findrunnable: wrong p")
 	}
-	now = pidleput(_p_, now)
+	now = pidleput(pp, now)
 	unlock(&sched.lock)
 
 	// Delicate dance: thread transitions from spinning to non-spinning
@@ -2751,43 +2876,60 @@
 	// * New/modified-earlier timers on a per-P timer heap.
 	// * Idle-priority GC work (barring golang.org/issue/19112).
 	//
-	// If we discover new work below, we need to restore m.spinning as a signal
-	// for resetspinning to unpark a new worker thread (because there can be more
-	// than one starving goroutine). However, if after discovering new work
-	// we also observe no idle Ps it is OK to skip unparking a new worker
-	// thread: the system is fully loaded so no spinning threads are required.
-	// Also see "Worker thread parking/unparking" comment at the top of the file.
-	wasSpinning := _g_.m.spinning
-	if _g_.m.spinning {
-		_g_.m.spinning = false
-		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+	// If we discover new work below, we need to restore m.spinning as a
+	// signal for resetspinning to unpark a new worker thread (because
+	// there can be more than one starving goroutine).
+	//
+	// However, if after discovering new work we also observe no idle Ps
+	// (either here or in resetspinning), we have a problem. We may be
+	// racing with a non-spinning M in the block above, having found no
+	// work and preparing to release its P and park. Allowing that P to go
+	// idle will result in loss of work conservation (idle P while there is
+	// runnable work). This could result in complete deadlock in the
+	// unlikely event that we discover new work (from netpoll) right as we
+	// are racing with _all_ other Ps going idle.
+	//
+	// We use sched.needspinning to synchronize with non-spinning Ms going
+	// idle. If needspinning is set when they are about to drop their P,
+	// they abort the drop and instead become a new spinning M on our
+	// behalf. If we are not racing and the system is truly fully loaded
+	// then no spinning threads are required, and the next thread to
+	// naturally become spinning will clear the flag.
+	//
+	// Also see "Worker thread parking/unparking" comment at the top of the
+	// file.
+	wasSpinning := mp.spinning
+	if mp.spinning {
+		mp.spinning = false
+		if sched.nmspinning.Add(-1) < 0 {
 			throw("findrunnable: negative nmspinning")
 		}
 
 		// Note that for correctness, only the last M transitioning from
 		// spinning to non-spinning must perform these rechecks to
-		// ensure no missed work. We are performing it on every M that
-		// transitions as a conservative change to monitor effects on
-		// latency. See golang.org/issue/43997.
+		// ensure no missed work. However, the runtime has some cases
+		// of transient increments of nmspinning that are decremented
+		// without going through this path, so we must be conservative
+		// and perform the check on all spinning Ms.
+		//
+		// See https://go.dev/issue/43997.
 
 		// Check all runqueues once again.
-		_p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
-		if _p_ != nil {
-			acquirep(_p_)
-			_g_.m.spinning = true
-			atomic.Xadd(&sched.nmspinning, 1)
+		pp := checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
+		if pp != nil {
+			acquirep(pp)
+			mp.becomeSpinning()
 			goto top
 		}
 
 		// Check for idle-priority GC work again.
-		_p_, gp = checkIdleGCNoP()
-		if _p_ != nil {
-			acquirep(_p_)
-			_g_.m.spinning = true
-			atomic.Xadd(&sched.nmspinning, 1)
+		pp, gp := checkIdleGCNoP()
+		if pp != nil {
+			acquirep(pp)
+			mp.becomeSpinning()
 
 			// Run the idle worker.
-			_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+			pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
 			casgstatus(gp, _Gwaiting, _Grunnable)
 			if trace.enabled {
 				traceGoUnpark(gp, 0)
@@ -2805,12 +2947,12 @@
 	}
 
 	// Poll network until next timer.
-	if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
-		atomic.Store64(&sched.pollUntil, uint64(pollUntil))
-		if _g_.m.p != 0 {
+	if netpollinited() && (netpollWaiters.Load() > 0 || pollUntil != 0) && sched.lastpoll.Swap(0) != 0 {
+		sched.pollUntil.Store(pollUntil)
+		if mp.p != 0 {
 			throw("findrunnable: netpoll with p")
 		}
-		if _g_.m.spinning {
+		if mp.spinning {
 			throw("findrunnable: netpoll with spinning")
 		}
 		// Refresh now.
@@ -2827,8 +2969,8 @@
 			delay = 0
 		}
 		list := netpoll(delay) // block until new work is available
-		atomic.Store64(&sched.pollUntil, 0)
-		atomic.Store64(&sched.lastpoll, uint64(now))
+		sched.pollUntil.Store(0)
+		sched.lastpoll.Store(now)
 		if faketime != 0 && list.empty() {
 			// Using fake time and nothing is ready; stop M.
 			// When all M's stop, checkdead will call timejump.
@@ -2836,12 +2978,12 @@
 			goto top
 		}
 		lock(&sched.lock)
-		_p_, _ = pidleget(now)
+		pp, _ := pidleget(now)
 		unlock(&sched.lock)
-		if _p_ == nil {
+		if pp == nil {
 			injectglist(&list)
 		} else {
-			acquirep(_p_)
+			acquirep(pp)
 			if !list.empty() {
 				gp := list.pop()
 				injectglist(&list)
@@ -2852,13 +2994,12 @@
 				return gp, false, false
 			}
 			if wasSpinning {
-				_g_.m.spinning = true
-				atomic.Xadd(&sched.nmspinning, 1)
+				mp.becomeSpinning()
 			}
 			goto top
 		}
 	} else if pollUntil != 0 && netpollinited() {
-		pollerPollUntil := int64(atomic.Load64(&sched.pollUntil))
+		pollerPollUntil := sched.pollUntil.Load()
 		if pollerPollUntil == 0 || pollerPollUntil > pollUntil {
 			netpollBreak()
 		}
@@ -2879,7 +3020,7 @@
 	if !runqempty(p) {
 		return true
 	}
-	if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 {
+	if netpollinited() && netpollWaiters.Load() > 0 && sched.lastpoll.Load() != 0 {
 		if list := netpoll(0); !list.empty() {
 			injectglist(&list)
 			return true
@@ -2904,7 +3045,7 @@
 		stealTimersOrRunNextG := i == stealTries-1
 
 		for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
-			if sched.gcwaiting != 0 {
+			if sched.gcwaiting.Load() {
 				// GC work may be available.
 				return nil, false, now, pollUntil, true
 			}
@@ -2972,17 +3113,18 @@
 	for id, p2 := range allpSnapshot {
 		if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) {
 			lock(&sched.lock)
-			pp, _ := pidleget(0)
-			unlock(&sched.lock)
-			if pp != nil {
-				return pp
+			pp, _ := pidlegetSpinning(0)
+			if pp == nil {
+				// Can't get a P, don't bother checking remaining Ps.
+				unlock(&sched.lock)
+				return nil
 			}
-
-			// Can't get a P, don't bother checking remaining Ps.
-			break
+			unlock(&sched.lock)
+			return pp
 		}
 	}
 
+	// No work available.
 	return nil
 }
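
The "Delicate dance" comment in findRunnable above motivates the new sched.needspinning flag: a waker that finds work but no idle P sets the flag, and an M that was about to drop its P and park notices the flag, clears it, and becomes the spinner instead. A simplified, standalone sketch of that handshake (not the scheduler's actual code; names are illustrative):

package main

import (
	"fmt"
	"sync/atomic"
)

var needSpinning atomic.Int32 // 1 means "work appeared but no idle worker took it"

// requestSpinner is the waker side: with no idle workers to hand the work
// to, ask the next parking worker to stay awake.
func requestSpinner(idleWorkers int) {
	if idleWorkers == 0 {
		needSpinning.Store(1)
	}
}

// parkOrSpin is the worker side: just before parking, honor a pending
// request by aborting the park and going back to look for work.
func parkOrSpin() string {
	if needSpinning.CompareAndSwap(1, 0) {
		return "spin"
	}
	return "park"
}

func main() {
	requestSpinner(0)
	fmt.Println(parkOrSpin()) // "spin": the pending request was consumed
	fmt.Println(parkOrSpin()) // "park": no outstanding request
}
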
 
@@ -3038,7 +3180,7 @@
 	// the assumption in gcControllerState.findRunnableGCWorker that an
 	// empty gcBgMarkWorkerPool is only possible if gcMarkDone is running.
 	lock(&sched.lock)
-	pp, now := pidleget(0)
+	pp, now := pidlegetSpinning(0)
 	if pp == nil {
 		unlock(&sched.lock)
 		return nil, nil
@@ -3068,12 +3210,12 @@
 // going to wake up before the when argument; or it wakes an idle P to service
 // timers and the network poller if there isn't one already.
 func wakeNetPoller(when int64) {
-	if atomic.Load64(&sched.lastpoll) == 0 {
+	if sched.lastpoll.Load() == 0 {
 		// In findrunnable we ensure that when polling the pollUntil
 		// field is either zero or the time to which the current
 		// poll is expected to run. This can have a spurious wakeup
 		// but should never miss a wakeup.
-		pollerPollUntil := int64(atomic.Load64(&sched.pollUntil))
+		pollerPollUntil := sched.pollUntil.Load()
 		if pollerPollUntil == 0 || pollerPollUntil > when {
 			netpollBreak()
 		}
@@ -3087,13 +3229,13 @@
 }
 
 func resetspinning() {
-	_g_ := getg()
-	if !_g_.m.spinning {
+	gp := getg()
+	if !gp.m.spinning {
 		throw("resetspinning: not a spinning m")
 	}
-	_g_.m.spinning = false
-	nmspinning := atomic.Xadd(&sched.nmspinning, -1)
-	if int32(nmspinning) < 0 {
+	gp.m.spinning = false
+	nmspinning := sched.nmspinning.Add(-1)
+	if nmspinning < 0 {
 		throw("findrunnable: negative nmspinning")
 	}
 	// M wakeup policy is deliberately somewhat conservative, so check if we
@@ -3138,8 +3280,20 @@
 	*glist = gList{}
 
 	startIdle := func(n int) {
-		for ; n != 0 && sched.npidle != 0; n-- {
-			startm(nil, false)
+		for i := 0; i < n; i++ {
+			mp := acquirem() // See comment in startm.
+			lock(&sched.lock)
+
+			pp, _ := pidlegetSpinning(0)
+			if pp == nil {
+				unlock(&sched.lock)
+				releasem(mp)
+				break
+			}
+
+			unlock(&sched.lock)
+			startm(pp, false)
+			releasem(mp)
 		}
 	}
 
@@ -3152,7 +3306,7 @@
 		return
 	}
 
-	npidle := int(atomic.Load(&sched.npidle))
+	npidle := int(sched.npidle.Load())
 	var globq gQueue
 	var n int
 	for n = 0; n < npidle && !q.empty(); n++ {
@@ -3175,31 +3329,31 @@
 // One round of scheduler: find a runnable goroutine and execute it.
 // Never returns.
 func schedule() {
-	_g_ := getg()
+	mp := getg().m
 
-	if _g_.m.locks != 0 {
+	if mp.locks != 0 {
 		throw("schedule: holding locks")
 	}
 
-	if _g_.m.lockedg != 0 {
+	if mp.lockedg != 0 {
 		stoplockedm()
-		execute(_g_.m.lockedg.ptr(), false) // Never returns.
+		execute(mp.lockedg.ptr(), false) // Never returns.
 	}
 
 	// We should not schedule away from a g that is executing a cgo call,
 	// since the cgo call is using the m's g0 stack.
-	if _g_.m.incgo {
+	if mp.incgo {
 		throw("schedule: in cgo")
 	}
 
 top:
-	pp := _g_.m.p.ptr()
+	pp := mp.p.ptr()
 	pp.preempt = false
 
 	// Safety check: if we are spinning, the run queue should be empty.
 	// Check this before calling checkTimers, as that might call
 	// goready to put a ready goroutine on the local run queue.
-	if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) {
+	if mp.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) {
 		throw("schedule: spinning with local work")
 	}
 
@@ -3208,7 +3362,7 @@
 	// This thread is going to run a goroutine and is not spinning anymore,
 	// so if it was marked as spinning we need to reset it now and potentially
 	// start a new spinning M.
-	if _g_.m.spinning {
+	if mp.spinning {
 		resetspinning()
 	}
 
@@ -3252,10 +3406,10 @@
 // readied later, the caller can do other work but eventually should
 // call schedule to restart the scheduling of goroutines on this m.
 func dropg() {
-	_g_ := getg()
+	gp := getg()
 
-	setMNoWB(&_g_.m.curg.m, nil)
-	setGNoWB(&_g_.m.curg, nil)
+	setMNoWB(&gp.m.curg.m, nil)
+	setGNoWB(&gp.m.curg, nil)
 }
 
 // checkTimers runs any timers for the P that are ready.
@@ -3271,8 +3425,8 @@
 func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
 	// If it's not yet time for the first timer, or the first adjusted
 	// timer, then there is nothing to do.
-	next := int64(atomic.Load64(&pp.timer0When))
-	nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+	next := pp.timer0When.Load()
+	nextAdj := pp.timerModifiedEarliest.Load()
 	if next == 0 || (nextAdj != 0 && nextAdj < next) {
 		next = nextAdj
 	}
@@ -3290,7 +3444,7 @@
 		// if we would clear deleted timers.
 		// This corresponds to the condition below where
 		// we decide whether to call clearDeletedTimers.
-		if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
+		if pp != getg().m.p.ptr() || int(pp.deletedTimers.Load()) <= int(pp.numTimers.Load()/4) {
 			return now, next, false
 		}
 	}
@@ -3315,7 +3469,7 @@
 	// If this is the local P, and there are a lot of deleted timers,
 	// clear them out. We only do this for the local P to reduce
 	// lock contention on timersLock.
-	if pp == getg().m.p.ptr() && int(atomic.Load(&pp.deletedTimers)) > len(pp.timers)/4 {
+	if pp == getg().m.p.ptr() && int(pp.deletedTimers.Load()) > len(pp.timers)/4 {
 		clearDeletedTimers(pp)
 	}
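
As a concrete reading of the threshold above: if the local P holds 40 timers of which 11 have been deleted, 11 > 40/4 = 10, so clearDeletedTimers runs; with only 10 deleted it would be skipped until more deletions accumulate.
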
 
@@ -3331,19 +3485,21 @@
 
 // park continuation on g0.
 func park_m(gp *g) {
-	_g_ := getg()
+	mp := getg().m
 
 	if trace.enabled {
-		traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip)
+		traceGoPark(mp.waittraceev, mp.waittraceskip)
 	}
 
+	// N.B. Not using casGToWaiting here because the waitreason is
+	// set by park_m's caller.
 	casgstatus(gp, _Grunning, _Gwaiting)
 	dropg()
 
-	if fn := _g_.m.waitunlockf; fn != nil {
-		ok := fn(gp, _g_.m.waitlock)
-		_g_.m.waitunlockf = nil
-		_g_.m.waitlock = nil
+	if fn := mp.waitunlockf; fn != nil {
+		ok := fn(gp, mp.waitlock)
+		mp.waitunlockf = nil
+		mp.waitlock = nil
 		if !ok {
 			if trace.enabled {
 				traceGoUnpark(gp, 2)
@@ -3378,7 +3534,7 @@
 	goschedImpl(gp)
 }
 
-// goschedguarded is a forbidden-states-avoided version of gosched_m
+// goschedguarded is a forbidden-states-avoided version of gosched_m.
 func goschedguarded_m(gp *g) {
 
 	if !canPreemptM(gp.m) {
@@ -3410,7 +3566,6 @@
 		dumpgstatus(gp)
 		throw("bad g status")
 	}
-	gp.waitreason = waitReasonPreempted
 
 	if gp.asyncSafePoint {
 		// Double-check that async preemption does not
@@ -3470,24 +3625,24 @@
 
 // goexit continuation on g0.
 func goexit0(gp *g) {
-	_g_ := getg()
-	_p_ := _g_.m.p.ptr()
+	mp := getg().m
+	pp := mp.p.ptr()
 
 	casgstatus(gp, _Grunning, _Gdead)
-	gcController.addScannableStack(_p_, -int64(gp.stack.hi-gp.stack.lo))
+	gcController.addScannableStack(pp, -int64(gp.stack.hi-gp.stack.lo))
 	if isSystemGoroutine(gp, false) {
-		atomic.Xadd(&sched.ngsys, -1)
+		sched.ngsys.Add(-1)
 	}
 	gp.m = nil
 	locked := gp.lockedm != 0
 	gp.lockedm = 0
-	_g_.m.lockedg = 0
+	mp.lockedg = 0
 	gp.preemptStop = false
 	gp.paniconfault = false
 	gp._defer = nil // should be true already but just in case.
 	gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
 	gp.writebuf = nil
-	gp.waitreason = 0
+	gp.waitreason = waitReasonZero
 	gp.param = nil
 	gp.labels = nil
 	gp.timer = nil
@@ -3498,22 +3653,22 @@
 		// rapidly creating and exiting goroutines.
 		assistWorkPerByte := gcController.assistWorkPerByte.Load()
 		scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes))
-		atomic.Xaddint64(&gcController.bgScanCredit, scanCredit)
+		gcController.bgScanCredit.Add(scanCredit)
 		gp.gcAssistBytes = 0
 	}
 
 	dropg()
 
 	if GOARCH == "wasm" { // no threads yet on wasm
-		gfput(_p_, gp)
+		gfput(pp, gp)
 		schedule() // never returns
 	}
 
-	if _g_.m.lockedInt != 0 {
-		print("invalid m->lockedInt = ", _g_.m.lockedInt, "\n")
+	if mp.lockedInt != 0 {
+		print("invalid m->lockedInt = ", mp.lockedInt, "\n")
 		throw("internal lockOSThread error")
 	}
-	gfput(_p_, gp)
+	gfput(pp, gp)
 	if locked {
 		// The goroutine may have locked this thread because
 		// it put it in an unusual kernel state. Kill it
@@ -3522,11 +3677,11 @@
 		// Return to mstart, which will release the P and exit
 		// the thread.
 		if GOOS != "plan9" { // See golang.org/issue/22227.
-			gogo(&_g_.m.g0.sched)
+			gogo(&mp.g0.sched)
 		} else {
 			// Clear lockedExt on plan9 since we may end up re-using
 			// this thread.
-			_g_.m.lockedExt = 0
+			mp.lockedExt = 0
 		}
 	}
 	schedule()
@@ -3541,9 +3696,9 @@
 //go:nosplit
 //go:nowritebarrierrec
 func save(pc, sp uintptr) {
-	_g_ := getg()
+	gp := getg()
 
-	if _g_ == _g_.m.g0 || _g_ == _g_.m.gsignal {
+	if gp == gp.m.g0 || gp == gp.m.gsignal {
 		// m.g0.sched is special and must describe the context
 		// for exiting the thread. mstart1 writes to it directly.
 		// m.gsignal.sched should not be used at all.
@@ -3552,14 +3707,14 @@
 		throw("save on system g not allowed")
 	}
 
-	_g_.sched.pc = pc
-	_g_.sched.sp = sp
-	_g_.sched.lr = 0
-	_g_.sched.ret = 0
+	gp.sched.pc = pc
+	gp.sched.sp = sp
+	gp.sched.lr = 0
+	gp.sched.ret = 0
 	// We need to ensure ctxt is zero, but can't have a write
 	// barrier here. However, it should always already be zero.
 	// Assert that.
-	if _g_.sched.ctxt != nil {
+	if gp.sched.ctxt != nil {
 		badctxt()
 	}
 }
@@ -3594,7 +3749,7 @@
 // when syscall returns we emit traceGoSysExit and when the goroutine starts running
 // (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
 // To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
-// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick),
+// we remember current value of syscalltick in m (gp.m.syscalltick = gp.m.p.ptr().syscalltick),
 // whoever emits traceGoSysBlock increments p.syscalltick afterwards;
 // and we wait for the increment before emitting traceGoSysExit.
 // Note that the increment is done even if tracing is not enabled,
@@ -3602,27 +3757,27 @@
 //
 //go:nosplit
 func reentersyscall(pc, sp uintptr) {
-	_g_ := getg()
+	gp := getg()
 
 	// Disable preemption because during this function g is in Gsyscall status,
 	// but can have inconsistent g->sched, do not let GC observe it.
-	_g_.m.locks++
+	gp.m.locks++
 
 	// Entersyscall must not call any function that might split/grow the stack.
 	// (See details in comment above.)
 	// Catch calls that might, by replacing the stack guard with something that
 	// will trip any stack check and leaving a flag to tell newstack to die.
-	_g_.stackguard0 = stackPreempt
-	_g_.throwsplit = true
+	gp.stackguard0 = stackPreempt
+	gp.throwsplit = true
 
 	// Leave SP around for GC and traceback.
 	save(pc, sp)
-	_g_.syscallsp = sp
-	_g_.syscallpc = pc
-	casgstatus(_g_, _Grunning, _Gsyscall)
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+	gp.syscallsp = sp
+	gp.syscallpc = pc
+	casgstatus(gp, _Grunning, _Gsyscall)
+	if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
 		systemstack(func() {
-			print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			print("entersyscall inconsistent ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
 			throw("entersyscall")
 		})
 	}
@@ -3635,30 +3790,30 @@
 		save(pc, sp)
 	}
 
-	if atomic.Load(&sched.sysmonwait) != 0 {
+	if sched.sysmonwait.Load() {
 		systemstack(entersyscall_sysmon)
 		save(pc, sp)
 	}
 
-	if _g_.m.p.ptr().runSafePointFn != 0 {
+	if gp.m.p.ptr().runSafePointFn != 0 {
 		// runSafePointFn may stack split if run on this stack
 		systemstack(runSafePointFn)
 		save(pc, sp)
 	}
 
-	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
-	_g_.sysblocktraced = true
-	pp := _g_.m.p.ptr()
+	gp.m.syscalltick = gp.m.p.ptr().syscalltick
+	gp.sysblocktraced = true
+	pp := gp.m.p.ptr()
 	pp.m = 0
-	_g_.m.oldp.set(pp)
-	_g_.m.p = 0
+	gp.m.oldp.set(pp)
+	gp.m.p = 0
 	atomic.Store(&pp.status, _Psyscall)
-	if sched.gcwaiting != 0 {
+	if sched.gcwaiting.Load() {
 		systemstack(entersyscall_gcwait)
 		save(pc, sp)
 	}
 
-	_g_.m.locks--
+	gp.m.locks--
 }
 
 // Standard syscall entry used by the go syscall library and normal cgo calls.
@@ -3673,24 +3828,24 @@
 
 func entersyscall_sysmon() {
 	lock(&sched.lock)
-	if atomic.Load(&sched.sysmonwait) != 0 {
-		atomic.Store(&sched.sysmonwait, 0)
+	if sched.sysmonwait.Load() {
+		sched.sysmonwait.Store(false)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
 }
 
 func entersyscall_gcwait() {
-	_g_ := getg()
-	_p_ := _g_.m.oldp.ptr()
+	gp := getg()
+	pp := gp.m.oldp.ptr()
 
 	lock(&sched.lock)
-	if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) {
+	if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
 		if trace.enabled {
-			traceGoSysBlock(_p_)
-			traceProcStop(_p_)
+			traceGoSysBlock(pp)
+			traceProcStop(pp)
 		}
-		_p_.syscalltick++
+		pp.syscalltick++
 		if sched.stopwait--; sched.stopwait == 0 {
 			notewakeup(&sched.stopnote)
 		}
@@ -3702,34 +3857,34 @@
 //
 //go:nosplit
 func entersyscallblock() {
-	_g_ := getg()
+	gp := getg()
 
-	_g_.m.locks++ // see comment in entersyscall
-	_g_.throwsplit = true
-	_g_.stackguard0 = stackPreempt // see comment in entersyscall
-	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
-	_g_.sysblocktraced = true
-	_g_.m.p.ptr().syscalltick++
+	gp.m.locks++ // see comment in entersyscall
+	gp.throwsplit = true
+	gp.stackguard0 = stackPreempt // see comment in entersyscall
+	gp.m.syscalltick = gp.m.p.ptr().syscalltick
+	gp.sysblocktraced = true
+	gp.m.p.ptr().syscalltick++
 
 	// Leave SP around for GC and traceback.
 	pc := getcallerpc()
 	sp := getcallersp()
 	save(pc, sp)
-	_g_.syscallsp = _g_.sched.sp
-	_g_.syscallpc = _g_.sched.pc
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+	gp.syscallsp = gp.sched.sp
+	gp.syscallpc = gp.sched.pc
+	if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
 		sp1 := sp
-		sp2 := _g_.sched.sp
-		sp3 := _g_.syscallsp
+		sp2 := gp.sched.sp
+		sp3 := gp.syscallsp
 		systemstack(func() {
-			print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
 			throw("entersyscallblock")
 		})
 	}
-	casgstatus(_g_, _Grunning, _Gsyscall)
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+	casgstatus(gp, _Grunning, _Gsyscall)
+	if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
 		systemstack(func() {
-			print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			print("entersyscallblock inconsistent ", hex(sp), " ", hex(gp.sched.sp), " ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
 			throw("entersyscallblock")
 		})
 	}
@@ -3739,7 +3894,7 @@
 	// Resave for traceback during blocked call.
 	save(getcallerpc(), getcallersp())
 
-	_g_.m.locks--
+	gp.m.locks--
 }
 
 func entersyscallblock_handoff() {
@@ -3763,16 +3918,16 @@
 //go:nowritebarrierrec
 //go:linkname exitsyscall
 func exitsyscall() {
-	_g_ := getg()
+	gp := getg()
 
-	_g_.m.locks++ // see comment in entersyscall
-	if getcallersp() > _g_.syscallsp {
+	gp.m.locks++ // see comment in entersyscall
+	if getcallersp() > gp.syscallsp {
 		throw("exitsyscall: syscall frame is no longer valid")
 	}
 
-	_g_.waitsince = 0
-	oldp := _g_.m.oldp.ptr()
-	_g_.m.oldp = 0
+	gp.waitsince = 0
+	oldp := gp.m.oldp.ptr()
+	gp.m.oldp = 0
 	if exitsyscallfast(oldp) {
 		// When exitsyscallfast returns success, we have a P so can now use
 		// write barriers
@@ -3781,33 +3936,33 @@
 			// profile, exactly as it was when the goroutine profiler first
 			// stopped the world.
 			systemstack(func() {
-				tryRecordGoroutineProfileWB(_g_)
+				tryRecordGoroutineProfileWB(gp)
 			})
 		}
 		if trace.enabled {
-			if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+			if oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick {
 				systemstack(traceGoStart)
 			}
 		}
 		// There's a cpu for us, so we can run.
-		_g_.m.p.ptr().syscalltick++
+		gp.m.p.ptr().syscalltick++
 		// We need to cas the status and scan before resuming...
-		casgstatus(_g_, _Gsyscall, _Grunning)
+		casgstatus(gp, _Gsyscall, _Grunning)
 
 		// Garbage collector isn't running (since we are),
 		// so okay to clear syscallsp.
-		_g_.syscallsp = 0
-		_g_.m.locks--
-		if _g_.preempt {
+		gp.syscallsp = 0
+		gp.m.locks--
+		if gp.preempt {
 			// restore the preemption request in case we've cleared it in newstack
-			_g_.stackguard0 = stackPreempt
+			gp.stackguard0 = stackPreempt
 		} else {
 			// otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
-			_g_.stackguard0 = _g_.stack.lo + _StackGuard
+			gp.stackguard0 = gp.stack.lo + _StackGuard
 		}
-		_g_.throwsplit = false
+		gp.throwsplit = false
 
-		if sched.disable.user && !schedEnabled(_g_) {
+		if sched.disable.user && !schedEnabled(gp) {
 			// Scheduling of this goroutine is disabled.
 			Gosched()
 		}
@@ -3815,21 +3970,21 @@
 		return
 	}
 
-	_g_.sysexitticks = 0
+	gp.sysexitticks = 0
 	if trace.enabled {
 		// Wait till traceGoSysBlock event is emitted.
 		// This ensures consistency of the trace (the goroutine is started after it is blocked).
-		for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
+		for oldp != nil && oldp.syscalltick == gp.m.syscalltick {
 			osyield()
 		}
 		// We can't trace syscall exit right now because we don't have a P.
 		// Tracing code can invoke write barriers that cannot run without a P.
 		// So instead we remember the syscall exit time and emit the event
 		// in execute when we have a P.
-		_g_.sysexitticks = cputicks()
+		gp.sysexitticks = cputicks()
 	}
 
-	_g_.m.locks--
+	gp.m.locks--
 
 	// Call the scheduler.
 	mcall(exitsyscall0)
@@ -3840,14 +3995,14 @@
 	// Must wait until now because until gosched returns
 	// we don't know for sure that the garbage collector
 	// is not running.
-	_g_.syscallsp = 0
-	_g_.m.p.ptr().syscalltick++
-	_g_.throwsplit = false
+	gp.syscallsp = 0
+	gp.m.p.ptr().syscalltick++
+	gp.throwsplit = false
 }
 
 //go:nosplit
 func exitsyscallfast(oldp *p) bool {
-	_g_ := getg()
+	gp := getg()
 
 	// Freezetheworld sets stopwait but does not retake P's.
 	if sched.stopwait == freezeStopWait {
@@ -3871,7 +4026,7 @@
 				if oldp != nil {
 					// Wait till traceGoSysBlock event is emitted.
 					// This ensures consistency of the trace (the goroutine is started after it is blocked).
-					for oldp.syscalltick == _g_.m.syscalltick {
+					for oldp.syscalltick == gp.m.syscalltick {
 						osyield()
 					}
 				}
@@ -3891,33 +4046,33 @@
 //
 //go:nosplit
 func exitsyscallfast_reacquired() {
-	_g_ := getg()
-	if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+	gp := getg()
+	if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
 		if trace.enabled {
-			// The p was retaken and then enter into syscall again (since _g_.m.syscalltick has changed).
+			// The p was retaken and then entered a syscall again (since gp.m.syscalltick has changed).
 			// traceGoSysBlock for this syscall was already emitted,
 			// but here we effectively retake the p from the new syscall running on the same p.
 			systemstack(func() {
 				// Denote blocking of the new syscall.
-				traceGoSysBlock(_g_.m.p.ptr())
+				traceGoSysBlock(gp.m.p.ptr())
 				// Denote completion of the current syscall.
 				traceGoSysExit(0)
 			})
 		}
-		_g_.m.p.ptr().syscalltick++
+		gp.m.p.ptr().syscalltick++
 	}
 }
 
 func exitsyscallfast_pidle() bool {
 	lock(&sched.lock)
-	_p_, _ := pidleget(0)
-	if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 {
-		atomic.Store(&sched.sysmonwait, 0)
+	pp, _ := pidleget(0)
+	if pp != nil && sched.sysmonwait.Load() {
+		sched.sysmonwait.Store(false)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
-	if _p_ != nil {
-		acquirep(_p_)
+	if pp != nil {
+		acquirep(pp)
 		return true
 	}
 	return false
@@ -3933,12 +4088,12 @@
 	casgstatus(gp, _Gsyscall, _Grunnable)
 	dropg()
 	lock(&sched.lock)
-	var _p_ *p
+	var pp *p
 	if schedEnabled(gp) {
-		_p_, _ = pidleget(0)
+		pp, _ = pidleget(0)
 	}
 	var locked bool
-	if _p_ == nil {
+	if pp == nil {
 		globrunqput(gp)
 
 		// Below, we stoplockedm if gp is locked. globrunqput releases
@@ -3947,13 +4102,13 @@
 		// could race with another M transitioning gp from unlocked to
 		// locked.
 		locked = gp.lockedm != 0
-	} else if atomic.Load(&sched.sysmonwait) != 0 {
-		atomic.Store(&sched.sysmonwait, 0)
+	} else if sched.sysmonwait.Load() {
+		sched.sysmonwait.Store(false)
 		notewakeup(&sched.sysmonnote)
 	}
 	unlock(&sched.lock)
-	if _p_ != nil {
-		acquirep(_p_)
+	if pp != nil {
+		acquirep(pp)
 		execute(gp, false) // Never returns.
 	}
 	if locked {
@@ -4038,7 +4193,7 @@
 // pendingPreemptSignals is the number of preemption signals
 // that have been sent but not received. This is only used on Darwin.
 // For #41702.
-var pendingPreemptSignals uint32
+var pendingPreemptSignals atomic.Int32
 
 // Called from syscall package before Exec.
 //
@@ -4050,7 +4205,7 @@
 	// On Darwin, wait for all pending preemption signals to
 	// be received. See issue #41702.
 	if GOOS == "darwin" || GOOS == "ios" {
-		for int32(atomic.Load(&pendingPreemptSignals)) > 0 {
+		for pendingPreemptSignals.Load() > 0 {
 			osyield()
 		}
 	}
@@ -4089,8 +4244,8 @@
 	systemstack(func() {
 		newg := newproc1(fn, gp, pc)
 
-		_p_ := getg().m.p.ptr()
-		runqput(_p_, newg, true)
+		pp := getg().m.p.ptr()
+		runqput(pp, newg, true)
 
 		if mainStarted {
 			wakep()
@@ -4102,15 +4257,13 @@
 // address of the go statement that created this. The caller is responsible
 // for adding the new g to the scheduler.
 func newproc1(fn *funcval, callergp *g, callerpc uintptr) *g {
-	_g_ := getg()
-
 	if fn == nil {
 		fatal("go of nil func value")
 	}
-	acquirem() // disable preemption because it can be holding p in a local var
 
-	_p_ := _g_.m.p.ptr()
-	newg := gfget(_p_)
+	mp := acquirem() // disable preemption because we hold M and P in local vars.
+	pp := mp.p.ptr()
+	newg := gfget(pp)
 	if newg == nil {
 		newg = malg(_StackMin)
 		casgstatus(newg, _Gidle, _Gdead)
@@ -4145,11 +4298,11 @@
 	newg.ancestors = saveAncestors(callergp)
 	newg.startpc = fn.fn
 	if isSystemGoroutine(newg, false) {
-		atomic.Xadd(&sched.ngsys, +1)
+		sched.ngsys.Add(1)
 	} else {
 		// Only user goroutines inherit pprof labels.
-		if _g_.m.curg != nil {
-			newg.labels = _g_.m.curg.labels
+		if mp.curg != nil {
+			newg.labels = mp.curg.labels
 		}
 		if goroutineProfile.active {
 			// A concurrent goroutine profile is running. It should include
@@ -4166,18 +4319,18 @@
 		newg.tracking = true
 	}
 	casgstatus(newg, _Gdead, _Grunnable)
-	gcController.addScannableStack(_p_, int64(newg.stack.hi-newg.stack.lo))
+	gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
 
-	if _p_.goidcache == _p_.goidcacheend {
+	if pp.goidcache == pp.goidcacheend {
 		// Sched.goidgen is the last allocated id,
 		// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
 		// At startup sched.goidgen=0, so main goroutine receives goid=1.
-		_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
-		_p_.goidcache -= _GoidCacheBatch - 1
-		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
+		pp.goidcache = sched.goidgen.Add(_GoidCacheBatch)
+		pp.goidcache -= _GoidCacheBatch - 1
+		pp.goidcacheend = pp.goidcache + _GoidCacheBatch
 	}
-	newg.goid = int64(_p_.goidcache)
-	_p_.goidcache++
+	newg.goid = pp.goidcache
+	pp.goidcache++
 	if raceenabled {
 		newg.racectx = racegostart(callerpc)
 		if newg.labels != nil {
@@ -4189,7 +4342,7 @@
 	if trace.enabled {
 		traceGoCreate(newg, newg.startpc)
 	}
-	releasem(_g_.m)
+	releasem(mp)
 
 	return newg
 }
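
Worked example for the goidcache refill above, assuming the batch constant _GoidCacheBatch is 16: the first P to refill calls sched.goidgen.Add(16) and receives 16, so goidcache becomes 16 - (16-1) = 1 and goidcacheend becomes 1 + 16 = 17; that P then hands out goids 1 through 16, which is why the comment notes that the main goroutine receives goid 1.
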
@@ -4230,7 +4383,7 @@
 
 // Put on gfree list.
 // If local list is too long, transfer a batch to the global list.
-func gfput(_p_ *p, gp *g) {
+func gfput(pp *p, gp *g) {
 	if readgstatus(gp) != _Gdead {
 		throw("gfput: bad status (not Gdead)")
 	}
@@ -4245,17 +4398,17 @@
 		gp.stackguard0 = 0
 	}
 
-	_p_.gFree.push(gp)
-	_p_.gFree.n++
-	if _p_.gFree.n >= 64 {
+	pp.gFree.push(gp)
+	pp.gFree.n++
+	if pp.gFree.n >= 64 {
 		var (
 			inc      int32
 			stackQ   gQueue
 			noStackQ gQueue
 		)
-		for _p_.gFree.n >= 32 {
-			gp = _p_.gFree.pop()
-			_p_.gFree.n--
+		for pp.gFree.n >= 32 {
+			gp := pp.gFree.pop()
+			pp.gFree.n--
 			if gp.stack.lo == 0 {
 				noStackQ.push(gp)
 			} else {
@@ -4273,12 +4426,12 @@
 
 // Get from gfree list.
 // If local list is empty, grab a batch from global list.
-func gfget(_p_ *p) *g {
+func gfget(pp *p) *g {
 retry:
-	if _p_.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) {
+	if pp.gFree.empty() && (!sched.gFree.stack.empty() || !sched.gFree.noStack.empty()) {
 		lock(&sched.gFree.lock)
 		// Move a batch of free Gs to the P.
-		for _p_.gFree.n < 32 {
+		for pp.gFree.n < 32 {
 			// Prefer Gs with stacks.
 			gp := sched.gFree.stack.pop()
 			if gp == nil {
@@ -4288,17 +4441,17 @@
 				}
 			}
 			sched.gFree.n--
-			_p_.gFree.push(gp)
-			_p_.gFree.n++
+			pp.gFree.push(gp)
+			pp.gFree.n++
 		}
 		unlock(&sched.gFree.lock)
 		goto retry
 	}
-	gp := _p_.gFree.pop()
+	gp := pp.gFree.pop()
 	if gp == nil {
 		return nil
 	}
-	_p_.gFree.n--
+	pp.gFree.n--
 	if gp.stack.lo != 0 && gp.stack.hi-gp.stack.lo != uintptr(startingStackSize) {
 		// Deallocate old stack. We kept it in gfput because it was the
 		// right size when the goroutine was put on the free list, but
@@ -4331,15 +4484,15 @@
 }
 
 // Purge all cached G's from gfree list to the global list.
-func gfpurge(_p_ *p) {
+func gfpurge(pp *p) {
 	var (
 		inc      int32
 		stackQ   gQueue
 		noStackQ gQueue
 	)
-	for !_p_.gFree.empty() {
-		gp := _p_.gFree.pop()
-		_p_.gFree.n--
+	for !pp.gFree.empty() {
+		gp := pp.gFree.pop()
+		pp.gFree.n--
 		if gp.stack.lo == 0 {
 			noStackQ.push(gp)
 		} else {
@@ -4368,9 +4521,9 @@
 	if GOARCH == "wasm" {
 		return // no threads on wasm yet
 	}
-	_g_ := getg()
-	_g_.m.lockedg.set(_g_)
-	_g_.lockedm.set(_g_.m)
+	gp := getg()
+	gp.m.lockedg.set(gp)
+	gp.lockedm.set(gp.m)
 }
 
 //go:nosplit
@@ -4396,10 +4549,10 @@
 		// while we're in a known-good state.
 		startTemplateThread()
 	}
-	_g_ := getg()
-	_g_.m.lockedExt++
-	if _g_.m.lockedExt == 0 {
-		_g_.m.lockedExt--
+	gp := getg()
+	gp.m.lockedExt++
+	if gp.m.lockedExt == 0 {
+		gp.m.lockedExt--
 		panic("LockOSThread nesting overflow")
 	}
 	dolockOSThread()
@@ -4420,12 +4573,12 @@
 	if GOARCH == "wasm" {
 		return // no threads on wasm yet
 	}
-	_g_ := getg()
-	if _g_.m.lockedInt != 0 || _g_.m.lockedExt != 0 {
+	gp := getg()
+	if gp.m.lockedInt != 0 || gp.m.lockedExt != 0 {
 		return
 	}
-	_g_.m.lockedg = 0
-	_g_.lockedm = 0
+	gp.m.lockedg = 0
+	gp.lockedm = 0
 }
 
 //go:nosplit
@@ -4443,21 +4596,21 @@
 // the goroutine locked to the OS thread until the goroutine (and
 // hence the thread) exits.
 func UnlockOSThread() {
-	_g_ := getg()
-	if _g_.m.lockedExt == 0 {
+	gp := getg()
+	if gp.m.lockedExt == 0 {
 		return
 	}
-	_g_.m.lockedExt--
+	gp.m.lockedExt--
 	dounlockOSThread()
 }
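
The lockedExt bookkeeping above backs the exported API, which nests: each LockOSThread call must be balanced by an UnlockOSThread before the goroutine may migrate between OS threads again. A minimal usage sketch:

package main

import "runtime"

func main() {
	runtime.LockOSThread()
	runtime.LockOSThread() // nested: lockedExt is now 2

	// ... work that must stay on this OS thread (e.g. thread-local C state) ...

	runtime.UnlockOSThread() // still locked: lockedExt is 1
	runtime.UnlockOSThread() // fully unlocked; the goroutine may migrate
}
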
 
 //go:nosplit
 func unlockOSThread() {
-	_g_ := getg()
-	if _g_.m.lockedInt == 0 {
+	gp := getg()
+	if gp.m.lockedInt == 0 {
 		systemstack(badunlockosthread)
 	}
-	_g_.m.lockedInt--
+	gp.m.lockedInt--
 	dounlockOSThread()
 }
 
@@ -4466,9 +4619,9 @@
 }
 
 func gcount() int32 {
-	n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - int32(atomic.Load(&sched.ngsys))
-	for _, _p_ := range allp {
-		n -= _p_.gFree.n
+	n := int32(atomic.Loaduintptr(&allglen)) - sched.gFree.n - sched.ngsys.Load()
+	for _, pp := range allp {
+		n -= pp.gFree.n
 	}
 
 	// All these variables can be changed concurrently, so the result can be inconsistent.
@@ -4484,8 +4637,11 @@
 }
 
 var prof struct {
-	signalLock uint32
-	hz         int32
+	signalLock atomic.Uint32
+
+	// Must hold signalLock to write. Reads may be lock-free, but
+	// signalLock should be taken to synchronize with changes.
+	hz atomic.Int32
 }
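
The signalLock/hz pair above is a tiny CompareAndSwap spin lock: writers take signalLock, while readers (the signal handler) simply Load hz. A minimal sketch of the same pattern with the public sync/atomic package, using runtime.Gosched as a stand-in for osyield:

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

var (
	signalLock atomic.Uint32
	hz         atomic.Int32 // written under signalLock, read lock-free
)

func setHz(v int32) {
	for !signalLock.CompareAndSwap(0, 1) {
		runtime.Gosched() // back off while another writer holds the lock
	}
	if hz.Load() != v {
		hz.Store(v)
	}
	signalLock.Store(0)
}

func main() {
	setHz(100)
	fmt.Println(hz.Load()) // 100, readable without taking signalLock
}
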
 
 func _System()                    { _System() }
@@ -4500,7 +4656,7 @@
 //
 //go:nowritebarrierrec
 func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
-	if prof.hz == 0 {
+	if prof.hz.Load() == 0 {
 		return
 	}
 
@@ -4550,7 +4706,7 @@
 		// cgoCallers.  We are running in a signal handler
 		// with all signals blocked, so we don't have to worry
 		// about any other code interrupting us.
-		if atomic.Load(&mp.cgoCallersUse) == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
+		if mp.cgoCallersUse.Load() == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
 			for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 {
 				cgoOff++
 			}
@@ -4563,41 +4719,37 @@
 		if n > 0 {
 			n += cgoOff
 		}
+	} else if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
+		// Libcall, i.e. runtime syscall on windows.
+		// Collect Go stack that leads to the call.
+		n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[n], len(stk[n:]), nil, nil, 0)
+	} else if mp != nil && mp.vdsoSP != 0 {
+		// VDSO call, e.g. nanotime1 on Linux.
+		// Collect Go stack that leads to the call.
+		n = gentraceback(mp.vdsoPC, mp.vdsoSP, 0, gp, 0, &stk[n], len(stk[n:]), nil, nil, _TraceJumpStack)
 	} else {
 		n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
 	}
 
 	if n <= 0 {
 		// Normal traceback is impossible or has failed.
-		// See if it falls into several common cases.
-		n = 0
-		if usesLibcall() && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
-			// Libcall, i.e. runtime syscall on windows.
-			// Collect Go stack that leads to the call.
-			n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
+		// Account it against abstract "System" or "GC".
+		n = 2
+		if inVDSOPage(pc) {
+			pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum
+		} else if pc > firstmoduledata.etext {
+			// "ExternalCode" is better than "etext".
+			pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum
 		}
-		if n == 0 && mp != nil && mp.vdsoSP != 0 {
-			n = gentraceback(mp.vdsoPC, mp.vdsoSP, 0, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
-		}
-		if n == 0 {
-			// If all of the above has failed, account it against abstract "System" or "GC".
-			n = 2
-			if inVDSOPage(pc) {
-				pc = abi.FuncPCABIInternal(_VDSO) + sys.PCQuantum
-			} else if pc > firstmoduledata.etext {
-				// "ExternalCode" is better than "etext".
-				pc = abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum
-			}
-			stk[0] = pc
-			if mp.preemptoff != "" {
-				stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum
-			} else {
-				stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum
-			}
+		stk[0] = pc
+		if mp.preemptoff != "" {
+			stk[1] = abi.FuncPCABIInternal(_GC) + sys.PCQuantum
+		} else {
+			stk[1] = abi.FuncPCABIInternal(_System) + sys.PCQuantum
 		}
 	}
 
-	if prof.hz != 0 {
+	if prof.hz.Load() != 0 {
 		// Note: it can happen on Windows that we interrupted a system thread
 		// with no g, so gp could be nil. The other nil checks are done out of
 		// caution, but not expected to be nil in practice.
@@ -4630,22 +4782,22 @@
 
 	// Disable preemption, otherwise we can be rescheduled to another thread
 	// that has profiling enabled.
-	_g_ := getg()
-	_g_.m.locks++
+	gp := getg()
+	gp.m.locks++
 
 	// Stop profiler on this thread so that it is safe to lock prof.
 	// if a profiling signal came in while we had prof locked,
 	// it would deadlock.
 	setThreadCPUProfiler(0)
 
-	for !atomic.Cas(&prof.signalLock, 0, 1) {
+	for !prof.signalLock.CompareAndSwap(0, 1) {
 		osyield()
 	}
-	if prof.hz != hz {
+	if prof.hz.Load() != hz {
 		setProcessCPUProfiler(hz)
-		prof.hz = hz
+		prof.hz.Store(hz)
 	}
-	atomic.Store(&prof.signalLock, 0)
+	prof.signalLock.Store(0)
 
 	lock(&sched.lock)
 	sched.profilehz = hz
@@ -4655,7 +4807,7 @@
 		setThreadCPUProfiler(hz)
 	}
 
-	_g_.m.locks--
+	gp.m.locks--
 }
 
 // init initializes pp, which may be a freshly allocated p or a
@@ -4726,9 +4878,9 @@
 		lock(&pp.timersLock)
 		moveTimers(plocal, pp.timers)
 		pp.timers = nil
-		pp.numTimers = 0
-		pp.deletedTimers = 0
-		atomic.Store64(&pp.timer0When, 0)
+		pp.numTimers.Store(0)
+		pp.deletedTimers.Store(0)
+		pp.timer0When.Store(0)
 		unlock(&pp.timersLock)
 		unlock(&plocal.timersLock)
 	}
@@ -4852,32 +5004,32 @@
 		atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
 	}
 
-	_g_ := getg()
-	if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs {
+	gp := getg()
+	if gp.m.p != 0 && gp.m.p.ptr().id < nprocs {
 		// continue to use the current P
-		_g_.m.p.ptr().status = _Prunning
-		_g_.m.p.ptr().mcache.prepareForSweep()
+		gp.m.p.ptr().status = _Prunning
+		gp.m.p.ptr().mcache.prepareForSweep()
 	} else {
 		// release the current P and acquire allp[0].
 		//
 		// We must do this before destroying our current P
 		// because p.destroy itself has write barriers, so we
 		// need to do that from a valid P.
-		if _g_.m.p != 0 {
+		if gp.m.p != 0 {
 			if trace.enabled {
 				// Pretend that we were descheduled
 				// and then scheduled again to keep
 				// the trace sane.
 				traceGoSched()
-				traceProcStop(_g_.m.p.ptr())
+				traceProcStop(gp.m.p.ptr())
 			}
-			_g_.m.p.ptr().m = 0
+			gp.m.p.ptr().m = 0
 		}
-		_g_.m.p = 0
-		p := allp[0]
-		p.m = 0
-		p.status = _Pidle
-		acquirep(p)
+		gp.m.p = 0
+		pp := allp[0]
+		pp.m = 0
+		pp.status = _Pidle
+		acquirep(pp)
 		if trace.enabled {
 			traceGoStart()
 		}
@@ -4888,8 +5040,8 @@
 
 	// release resources from unused P's
 	for i := nprocs; i < old; i++ {
-		p := allp[i]
-		p.destroy()
+		pp := allp[i]
+		pp.destroy()
 		// can't free P itself because it can be referenced by an M in syscall
 	}
 
@@ -4904,17 +5056,17 @@
 
 	var runnablePs *p
 	for i := nprocs - 1; i >= 0; i-- {
-		p := allp[i]
-		if _g_.m.p.ptr() == p {
+		pp := allp[i]
+		if gp.m.p.ptr() == pp {
 			continue
 		}
-		p.status = _Pidle
-		if runqempty(p) {
-			pidleput(p, now)
+		pp.status = _Pidle
+		if runqempty(pp) {
+			pidleput(pp, now)
 		} else {
-			p.m.set(mget())
-			p.link.set(runnablePs)
-			runnablePs = p
+			pp.m.set(mget())
+			pp.link.set(runnablePs)
+			runnablePs = pp
 		}
 	}
 	stealOrder.reset(uint32(nprocs))
@@ -4930,18 +5082,18 @@
 // Associate p and the current m.
 //
 // This function is allowed to have write barriers even if the caller
-// isn't because it immediately acquires _p_.
+// isn't because it immediately acquires pp.
 //
 //go:yeswritebarrierrec
-func acquirep(_p_ *p) {
+func acquirep(pp *p) {
 	// Do the part that isn't allowed to have write barriers.
-	wirep(_p_)
+	wirep(pp)
 
 	// Have p; write barriers now allowed.
 
 	// Perform deferred mcache flush before this P can allocate
 	// from a potentially stale mcache.
-	_p_.mcache.prepareForSweep()
+	pp.mcache.prepareForSweep()
 
 	if trace.enabled {
 		traceProcStart()
@@ -4949,49 +5101,49 @@
 }
 
 // wirep is the first step of acquirep, which actually associates the
-// current M to _p_. This is broken out so we can disallow write
+// current M to pp. This is broken out so we can disallow write
 // barriers for this part, since we don't yet have a P.
 //
 //go:nowritebarrierrec
 //go:nosplit
-func wirep(_p_ *p) {
-	_g_ := getg()
+func wirep(pp *p) {
+	gp := getg()
 
-	if _g_.m.p != 0 {
+	if gp.m.p != 0 {
 		throw("wirep: already in go")
 	}
-	if _p_.m != 0 || _p_.status != _Pidle {
+	if pp.m != 0 || pp.status != _Pidle {
 		id := int64(0)
-		if _p_.m != 0 {
-			id = _p_.m.ptr().id
+		if pp.m != 0 {
+			id = pp.m.ptr().id
 		}
-		print("wirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
+		print("wirep: p->m=", pp.m, "(", id, ") p->status=", pp.status, "\n")
 		throw("wirep: invalid p state")
 	}
-	_g_.m.p.set(_p_)
-	_p_.m.set(_g_.m)
-	_p_.status = _Prunning
+	gp.m.p.set(pp)
+	pp.m.set(gp.m)
+	pp.status = _Prunning
 }
 
 // Disassociate p and the current m.
 func releasep() *p {
-	_g_ := getg()
+	gp := getg()
 
-	if _g_.m.p == 0 {
+	if gp.m.p == 0 {
 		throw("releasep: invalid arg")
 	}
-	_p_ := _g_.m.p.ptr()
-	if _p_.m.ptr() != _g_.m || _p_.status != _Prunning {
-		print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " p->status=", _p_.status, "\n")
+	pp := gp.m.p.ptr()
+	if pp.m.ptr() != gp.m || pp.status != _Prunning {
+		print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
 		throw("releasep: invalid p state")
 	}
 	if trace.enabled {
-		traceProcStop(_g_.m.p.ptr())
+		traceProcStop(gp.m.p.ptr())
 	}
-	_g_.m.p = 0
-	_p_.m = 0
-	_p_.status = _Pidle
-	return _p_
+	gp.m.p = 0
+	pp.m = 0
+	pp.status = _Pidle
+	return pp
 }
 
 func incidlelocked(v int32) {
@@ -5020,7 +5172,7 @@
 	// freezetheworld will cause all running threads to block.
 	// And runtime will essentially enter into deadlock state,
 	// except that there is a thread that will call exit soon.
-	if panicking > 0 {
+	if panicking.Load() > 0 {
 		return
 	}
 
@@ -5090,7 +5242,7 @@
 			// M must be spinning to steal. We set this to be
 			// explicit, but since this is the only M it would
 			// become spinning on its own anyways.
-			atomic.Xadd(&sched.nmspinning, 1)
+			sched.nmspinning.Add(1)
 			mp.spinning = true
 			mp.nextp.set(pp)
 			notewakeup(&mp.park)
@@ -5099,8 +5251,8 @@
 	}
 
 	// There are no goroutines running, so we can look at the P's.
-	for _, _p_ := range allp {
-		if len(_p_.timers) > 0 {
+	for _, pp := range allp {
+		if len(pp.timers) > 0 {
 			return
 		}
 	}
@@ -5160,13 +5312,13 @@
 		// from a timer to avoid adding system load to applications that spend
 		// most of their time sleeping.
 		now := nanotime()
-		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
+		if debug.schedtrace <= 0 && (sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs) {
 			lock(&sched.lock)
-			if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+			if sched.gcwaiting.Load() || sched.npidle.Load() == gomaxprocs {
 				syscallWake := false
 				next := timeSleepUntil()
 				if next > now {
-					atomic.Store(&sched.sysmonwait, 1)
+					sched.sysmonwait.Store(true)
 					unlock(&sched.lock)
 					// Make wake-up period small enough
 					// for the sampling to be correct.
@@ -5183,7 +5335,7 @@
 						osRelax(false)
 					}
 					lock(&sched.lock)
-					atomic.Store(&sched.sysmonwait, 0)
+					sched.sysmonwait.Store(false)
 					noteclear(&sched.sysmonnote)
 				}
 				if syscallWake {
@@ -5204,9 +5356,9 @@
 			asmcgocall(*cgo_yield, nil)
 		}
 		// poll network if not polled for more than 10ms
-		lastpoll := int64(atomic.Load64(&sched.lastpoll))
+		lastpoll := sched.lastpoll.Load()
 		if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
-			atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
+			sched.lastpoll.CompareAndSwap(lastpoll, now)
 			list := netpoll(0) // non-blocking - returns list of goroutines
 			if !list.empty() {
 				// Need to decrement number of idle locked M's
@@ -5253,9 +5405,9 @@
 			idle++
 		}
 		// check if we need to force a GC
-		if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 {
+		if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && forcegc.idle.Load() {
 			lock(&forcegc.lock)
-			forcegc.idle = 0
+			forcegc.idle.Store(false)
 			var list gList
 			list.push(forcegc.g)
 			injectglist(&list)
@@ -5289,23 +5441,23 @@
 	// temporarily drop the allpLock. Hence, we need to re-fetch
 	// allp each time around the loop.
 	for i := 0; i < len(allp); i++ {
-		_p_ := allp[i]
-		if _p_ == nil {
+		pp := allp[i]
+		if pp == nil {
 			// This can happen if procresize has grown
 			// allp but not yet created new Ps.
 			continue
 		}
-		pd := &_p_.sysmontick
-		s := _p_.status
+		pd := &pp.sysmontick
+		s := pp.status
 		sysretake := false
 		if s == _Prunning || s == _Psyscall {
 			// Preempt G if it's running for too long.
-			t := int64(_p_.schedtick)
+			t := int64(pp.schedtick)
 			if int64(pd.schedtick) != t {
 				pd.schedtick = uint32(t)
 				pd.schedwhen = now
 			} else if pd.schedwhen+forcePreemptNS <= now {
-				preemptone(_p_)
+				preemptone(pp)
 				// In case of syscall, preemptone() doesn't
 				// work, because there is no M wired to P.
 				sysretake = true
@@ -5313,7 +5465,7 @@
 		}
 		if s == _Psyscall {
 			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
-			t := int64(_p_.syscalltick)
+			t := int64(pp.syscalltick)
 			if !sysretake && int64(pd.syscalltick) != t {
 				pd.syscalltick = uint32(t)
 				pd.syscallwhen = now
@@ -5322,7 +5474,7 @@
 			// On the one hand we don't want to retake Ps if there is no other work to do,
 			// but on the other hand we want to retake them eventually
 			// because they can prevent the sysmon thread from deep sleep.
-			if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
+			if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
 				continue
 			}
 			// Drop allpLock so we can take sched.lock.
@@ -5332,14 +5484,14 @@
 			// Otherwise the M from which we retake can exit the syscall,
 			// increment nmidle and report deadlock.
 			incidlelocked(-1)
-			if atomic.Cas(&_p_.status, s, _Pidle) {
+			if atomic.Cas(&pp.status, s, _Pidle) {
 				if trace.enabled {
-					traceGoSysBlock(_p_)
-					traceProcStop(_p_)
+					traceGoSysBlock(pp)
+					traceProcStop(pp)
 				}
 				n++
-				_p_.syscalltick++
-				handoffp(_p_)
+				pp.syscalltick++
+				handoffp(pp)
 			}
 			incidlelocked(1)
 			lock(&allpLock)
@@ -5356,11 +5508,11 @@
 // Returns true if preemption request was issued to at least one goroutine.
 func preemptall() bool {
 	res := false
-	for _, _p_ := range allp {
-		if _p_.status != _Prunning {
+	for _, pp := range allp {
+		if pp.status != _Prunning {
 			continue
 		}
-		if preemptone(_p_) {
+		if preemptone(pp) {
 			res = true
 		}
 	}
@@ -5377,8 +5529,8 @@
 // The actual preemption will happen at some point in the future
 // and will be indicated by the gp->status no longer being
 // Grunning
-func preemptone(_p_ *p) bool {
-	mp := _p_.m.ptr()
+func preemptone(pp *p) bool {
+	mp := pp.m.ptr()
 	if mp == nil || mp == getg().m {
 		return false
 	}
@@ -5397,7 +5549,7 @@
 
 	// Request an async preemption of this P.
 	if preemptMSupported && debug.asyncpreemptoff == 0 {
-		_p_.preempt = true
+		pp.preempt = true
 		preemptM(mp)
 	}
 
@@ -5413,23 +5565,25 @@
 	}
 
 	lock(&sched.lock)
-	print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", mcount(), " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
+	print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning.Load(), " needspinning=", sched.needspinning.Load(), " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
 	if detailed {
-		print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
+		print(" gcwaiting=", sched.gcwaiting.Load(), " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait.Load(), "\n")
 	}
 	// We must be careful while reading data from P's, M's and G's.
 	// Even if we hold schedlock, most data can be changed concurrently.
 	// E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
-	for i, _p_ := range allp {
-		mp := _p_.m.ptr()
-		h := atomic.Load(&_p_.runqhead)
-		t := atomic.Load(&_p_.runqtail)
+	for i, pp := range allp {
+		mp := pp.m.ptr()
+		h := atomic.Load(&pp.runqhead)
+		t := atomic.Load(&pp.runqtail)
 		if detailed {
-			id := int64(-1)
+			print("  P", i, ": status=", pp.status, " schedtick=", pp.schedtick, " syscalltick=", pp.syscalltick, " m=")
 			if mp != nil {
-				id = mp.id
+				print(mp.id)
+			} else {
+				print("nil")
 			}
-			print("  P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, " timerslen=", len(_p_.timers), "\n")
+			print(" runqsize=", t-h, " gfreecnt=", pp.gFree.n, " timerslen=", len(pp.timers), "\n")
 		} else {
 			// In non-detailed mode format lengths of per-P run queues as:
 			// [len1 len2 len3 len4]
@@ -5450,36 +5604,42 @@
 	}
 
 	for mp := allm; mp != nil; mp = mp.alllink {
-		_p_ := mp.p.ptr()
-		gp := mp.curg
-		lockedg := mp.lockedg.ptr()
-		id1 := int32(-1)
-		if _p_ != nil {
-			id1 = _p_.id
+		pp := mp.p.ptr()
+		print("  M", mp.id, ": p=")
+		if pp != nil {
+			print(pp.id)
+		} else {
+			print("nil")
 		}
-		id2 := int64(-1)
-		if gp != nil {
-			id2 = gp.goid
+		print(" curg=")
+		if mp.curg != nil {
+			print(mp.curg.goid)
+		} else {
+			print("nil")
 		}
-		id3 := int64(-1)
-		if lockedg != nil {
-			id3 = lockedg.goid
+		print(" mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, " locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=")
+		if lockedg := mp.lockedg.ptr(); lockedg != nil {
+			print(lockedg.goid)
+		} else {
+			print("nil")
 		}
-		print("  M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
+		print("\n")
 	}
 
 	forEachG(func(gp *g) {
-		mp := gp.m
-		lockedm := gp.lockedm.ptr()
-		id1 := int64(-1)
-		if mp != nil {
-			id1 = mp.id
+		print("  G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=")
+		if gp.m != nil {
+			print(gp.m.id)
+		} else {
+			print("nil")
 		}
-		id2 := int64(-1)
-		if lockedm != nil {
-			id2 = lockedm.id
+		print(" lockedm=")
+		if lockedm := gp.lockedm.ptr(); lockedm != nil {
+			print(lockedm.id)
+		} else {
+			print("nil")
 		}
-		print("  G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason.String(), ") m=", id1, " lockedm=", id2, "\n")
+		print("\n")
 	})
 	unlock(&sched.lock)
 }
@@ -5501,7 +5661,7 @@
 		sched.disable.n = 0
 		globrunqputbatch(&sched.disable.runnable, n)
 		unlock(&sched.lock)
-		for ; n != 0 && sched.npidle != 0; n-- {
+		for ; n != 0 && sched.npidle.Load() != 0; n-- {
 			startm(nil, false)
 		}
 	} else {
@@ -5592,7 +5752,7 @@
 
 // Try to get a batch of G's from the global runnable queue.
 // sched.lock must be held.
-func globrunqget(_p_ *p, max int32) *g {
+func globrunqget(pp *p, max int32) *g {
 	assertLockHeld(&sched.lock)
 
 	if sched.runqsize == 0 {
@@ -5606,8 +5766,8 @@
 	if max > 0 && n > max {
 		n = max
 	}
-	if n > int32(len(_p_.runq))/2 {
-		n = int32(len(_p_.runq)) / 2
+	if n > int32(len(pp.runq))/2 {
+		n = int32(len(pp.runq)) / 2
 	}
 
 	sched.runqsize -= n
@@ -5616,7 +5776,7 @@
 	n--
 	for ; n > 0; n-- {
 		gp1 := sched.runq.pop()
-		runqput(_p_, gp1, false)
+		runqput(pp, gp1, false)
 	}
 	return gp
 }
@@ -5671,7 +5831,7 @@
 // TODO(prattmic): Additional targeted updates may improve the above cases.
 // e.g., updating the mask when stealing a timer.
 func updateTimerPMask(pp *p) {
-	if atomic.Load(&pp.numTimers) > 0 {
+	if pp.numTimers.Load() > 0 {
 		return
 	}
 
@@ -5679,7 +5839,7 @@
 	// decrement numTimers when handling a timerModified timer in
 	// checkTimers. We must take timersLock to serialize with these changes.
 	lock(&pp.timersLock)
-	if atomic.Load(&pp.numTimers) == 0 {
+	if pp.numTimers.Load() == 0 {
 		timerpMask.clear(pp.id)
 	}
 	unlock(&pp.timersLock)
@@ -5696,21 +5856,21 @@
 // May run during STW, so write barriers are not allowed.
 //
 //go:nowritebarrierrec
-func pidleput(_p_ *p, now int64) int64 {
+func pidleput(pp *p, now int64) int64 {
 	assertLockHeld(&sched.lock)
 
-	if !runqempty(_p_) {
+	if !runqempty(pp) {
 		throw("pidleput: P has non-empty run queue")
 	}
 	if now == 0 {
 		now = nanotime()
 	}
-	updateTimerPMask(_p_) // clear if there are no timers.
-	idlepMask.set(_p_.id)
-	_p_.link = sched.pidle
-	sched.pidle.set(_p_)
-	atomic.Xadd(&sched.npidle, 1)
-	if !_p_.limiterEvent.start(limiterEventIdle, now) {
+	updateTimerPMask(pp) // clear if there are no timers.
+	idlepMask.set(pp.id)
+	pp.link = sched.pidle
+	sched.pidle.set(pp)
+	sched.npidle.Add(1)
+	if !pp.limiterEvent.start(limiterEventIdle, now) {
 		throw("must be able to track idle limiter event")
 	}
 	return now
@@ -5726,33 +5886,58 @@
 func pidleget(now int64) (*p, int64) {
 	assertLockHeld(&sched.lock)
 
-	_p_ := sched.pidle.ptr()
-	if _p_ != nil {
+	pp := sched.pidle.ptr()
+	if pp != nil {
 		// Timer may get added at any time now.
 		if now == 0 {
 			now = nanotime()
 		}
-		timerpMask.set(_p_.id)
-		idlepMask.clear(_p_.id)
-		sched.pidle = _p_.link
-		atomic.Xadd(&sched.npidle, -1)
-		_p_.limiterEvent.stop(limiterEventIdle, now)
+		timerpMask.set(pp.id)
+		idlepMask.clear(pp.id)
+		sched.pidle = pp.link
+		sched.npidle.Add(-1)
+		pp.limiterEvent.stop(limiterEventIdle, now)
 	}
-	return _p_, now
+	return pp, now
 }
 
-// runqempty reports whether _p_ has no Gs on its local run queue.
+// pidlegetSpinning tries to get a p from the _Pidle list, acquiring ownership.
+// This is called by spinning Ms (or callers that need a spinning M) that have
+// found work. If no P is available, this must be synchronized with non-spinning
+// Ms that may be preparing to drop their P without discovering this work.
+//
+// sched.lock must be held.
+//
+// May run during STW, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func pidlegetSpinning(now int64) (*p, int64) {
+	assertLockHeld(&sched.lock)
+
+	pp, now := pidleget(now)
+	if pp == nil {
+		// See "Delicate dance" comment in findrunnable. We found work
+		// that we cannot take, we must synchronize with non-spinning
+		// Ms that may be preparing to drop their P.
+		sched.needspinning.Store(1)
+		return nil, now
+	}
+
+	return pp, now
+}
+
+// runqempty reports whether pp has no Gs on its local run queue.
 // It never returns true spuriously.
-func runqempty(_p_ *p) bool {
-	// Defend against a race where 1) _p_ has G1 in runqnext but runqhead == runqtail,
-	// 2) runqput on _p_ kicks G1 to the runq, 3) runqget on _p_ empties runqnext.
+func runqempty(pp *p) bool {
+	// Defend against a race where 1) pp has G1 in runqnext but runqhead == runqtail,
+	// 2) runqput on pp kicks G1 to the runq, 3) runqget on pp empties runqnext.
 	// Simply observing that runqhead == runqtail and then observing that runqnext == nil
 	// does not mean the queue is empty.
 	for {
-		head := atomic.Load(&_p_.runqhead)
-		tail := atomic.Load(&_p_.runqtail)
-		runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&_p_.runnext)))
-		if tail == atomic.Load(&_p_.runqtail) {
+		head := atomic.Load(&pp.runqhead)
+		tail := atomic.Load(&pp.runqtail)
+		runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&pp.runnext)))
+		if tail == atomic.Load(&pp.runqtail) {
 			return head == tail && runnext == 0
 		}
 	}
@@ -5771,18 +5956,18 @@
 
 // runqput tries to put g on the local runnable queue.
 // If next is false, runqput adds g to the tail of the runnable queue.
-// If next is true, runqput puts g in the _p_.runnext slot.
+// If next is true, runqput puts g in the pp.runnext slot.
 // If the run queue is full, runqput puts g on the global queue.
 // Executed only by the owner P.
-func runqput(_p_ *p, gp *g, next bool) {
+func runqput(pp *p, gp *g, next bool) {
 	if randomizeScheduler && next && fastrandn(2) == 0 {
 		next = false
 	}
 
 	if next {
 	retryNext:
-		oldnext := _p_.runnext
-		if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
+		oldnext := pp.runnext
+		if !pp.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
 			goto retryNext
 		}
 		if oldnext == 0 {
@@ -5793,14 +5978,14 @@
 	}
 
 retry:
-	h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers
-	t := _p_.runqtail
-	if t-h < uint32(len(_p_.runq)) {
-		_p_.runq[t%uint32(len(_p_.runq))].set(gp)
-		atomic.StoreRel(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
+	h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with consumers
+	t := pp.runqtail
+	if t-h < uint32(len(pp.runq)) {
+		pp.runq[t%uint32(len(pp.runq))].set(gp)
+		atomic.StoreRel(&pp.runqtail, t+1) // store-release, makes the item available for consumption
 		return
 	}
-	if runqputslow(_p_, gp, h, t) {
+	if runqputslow(pp, gp, h, t) {
 		return
 	}
 	// the queue is not full, now the put above must succeed
@@ -5809,19 +5994,19 @@
 
 // Put g and a batch of work from local runnable queue on global queue.
 // Executed only by the owner P.
-func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
-	var batch [len(_p_.runq)/2 + 1]*g
+func runqputslow(pp *p, gp *g, h, t uint32) bool {
+	var batch [len(pp.runq)/2 + 1]*g
 
 	// First, grab a batch from local queue.
 	n := t - h
 	n = n / 2
-	if n != uint32(len(_p_.runq)/2) {
+	if n != uint32(len(pp.runq)/2) {
 		throw("runqputslow: queue is not full")
 	}
 	for i := uint32(0); i < n; i++ {
-		batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
+		batch[i] = pp.runq[(h+i)%uint32(len(pp.runq))].ptr()
 	}
-	if !atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+	if !atomic.CasRel(&pp.runqhead, h, h+n) { // cas-release, commits consume
 		return false
 	}
 	batch[n] = gp
@@ -5886,50 +6071,50 @@
 // If inheritTime is true, gp should inherit the remaining time in the
 // current time slice. Otherwise, it should start a new time slice.
 // Executed only by the owner P.
-func runqget(_p_ *p) (gp *g, inheritTime bool) {
+func runqget(pp *p) (gp *g, inheritTime bool) {
 	// If there's a runnext, it's the next G to run.
-	next := _p_.runnext
+	next := pp.runnext
 	// If the runnext is non-0 and the CAS fails, it could only have been stolen by another P,
 	// because other Ps can race to set runnext to 0, but only the current P can set it to non-0.
-	// Hence, there's no need to retry this CAS if it falls.
-	if next != 0 && _p_.runnext.cas(next, 0) {
+	// Hence, there's no need to retry this CAS if it fails.
+	if next != 0 && pp.runnext.cas(next, 0) {
 		return next.ptr(), true
 	}
 
 	for {
-		h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
-		t := _p_.runqtail
+		h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers
+		t := pp.runqtail
 		if t == h {
 			return nil, false
 		}
-		gp := _p_.runq[h%uint32(len(_p_.runq))].ptr()
-		if atomic.CasRel(&_p_.runqhead, h, h+1) { // cas-release, commits consume
+		gp := pp.runq[h%uint32(len(pp.runq))].ptr()
+		if atomic.CasRel(&pp.runqhead, h, h+1) { // cas-release, commits consume
 			return gp, false
 		}
 	}
 }
 
-// runqdrain drains the local runnable queue of _p_ and returns all goroutines in it.
+// runqdrain drains the local runnable queue of pp and returns all goroutines in it.
 // Executed only by the owner P.
-func runqdrain(_p_ *p) (drainQ gQueue, n uint32) {
-	oldNext := _p_.runnext
-	if oldNext != 0 && _p_.runnext.cas(oldNext, 0) {
+func runqdrain(pp *p) (drainQ gQueue, n uint32) {
+	oldNext := pp.runnext
+	if oldNext != 0 && pp.runnext.cas(oldNext, 0) {
 		drainQ.pushBack(oldNext.ptr())
 		n++
 	}
 
 retry:
-	h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
-	t := _p_.runqtail
+	h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers
+	t := pp.runqtail
 	qn := t - h
 	if qn == 0 {
 		return
 	}
-	if qn > uint32(len(_p_.runq)) { // read inconsistent h and t
+	if qn > uint32(len(pp.runq)) { // read inconsistent h and t
 		goto retry
 	}
 
-	if !atomic.CasRel(&_p_.runqhead, h, h+qn) { // cas-release, commits consume
+	if !atomic.CasRel(&pp.runqhead, h, h+qn) { // cas-release, commits consume
 		goto retry
 	}
 
@@ -5941,34 +6126,34 @@
 	// meanwhile, other P's can't access all G's in the local P's runnable queue and steal them.
 	// See https://groups.google.com/g/golang-dev/c/0pTKxEKhHSc/m/6Q85QjdVBQAJ for more details.
 	for i := uint32(0); i < qn; i++ {
-		gp := _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
+		gp := pp.runq[(h+i)%uint32(len(pp.runq))].ptr()
 		drainQ.pushBack(gp)
 		n++
 	}
 	return
 }
 
-// Grabs a batch of goroutines from _p_'s runnable queue into batch.
+// Grabs a batch of goroutines from pp's runnable queue into batch.
 // Batch is a ring buffer starting at batchHead.
 // Returns number of grabbed goroutines.
 // Can be executed by any P.
-func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
+func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
 	for {
-		h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with other consumers
-		t := atomic.LoadAcq(&_p_.runqtail) // load-acquire, synchronize with the producer
+		h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers
+		t := atomic.LoadAcq(&pp.runqtail) // load-acquire, synchronize with the producer
 		n := t - h
 		n = n - n/2
 		if n == 0 {
 			if stealRunNextG {
-				// Try to steal from _p_.runnext.
-				if next := _p_.runnext; next != 0 {
-					if _p_.status == _Prunning {
-						// Sleep to ensure that _p_ isn't about to run the g
+				// Try to steal from pp.runnext.
+				if next := pp.runnext; next != 0 {
+					if pp.status == _Prunning {
+						// Sleep to ensure that pp isn't about to run the g
 						// we are about to steal.
 						// The important use case here is when the g running
-						// on _p_ ready()s another g and then almost
+						// on pp ready()s another g and then almost
 						// immediately blocks. Instead of stealing runnext
-						// in this window, back off to give _p_ a chance to
+						// in this window, back off to give pp a chance to
 						// schedule runnext. This will avoid thrashing gs
 						// between different Ps.
 						// A sync chan send/recv takes ~50ns as of time of
@@ -5982,7 +6167,7 @@
 							osyield()
 						}
 					}
-					if !_p_.runnext.cas(next, 0) {
+					if !pp.runnext.cas(next, 0) {
 						continue
 					}
 					batch[batchHead%uint32(len(batch))] = next
@@ -5991,14 +6176,14 @@
 			}
 			return 0
 		}
-		if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
+		if n > uint32(len(pp.runq)/2) { // read inconsistent h and t
 			continue
 		}
 		for i := uint32(0); i < n; i++ {
-			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
+			g := pp.runq[(h+i)%uint32(len(pp.runq))]
 			batch[(batchHead+i)%uint32(len(batch))] = g
 		}
-		if atomic.CasRel(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+		if atomic.CasRel(&pp.runqhead, h, h+n) { // cas-release, commits consume
 			return n
 		}
 	}
@@ -6007,22 +6192,22 @@
 // Steal half of elements from local runnable queue of p2
 // and put onto local runnable queue of p.
 // Returns one of the stolen elements (or nil if failed).
-func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
-	t := _p_.runqtail
-	n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
+func runqsteal(pp, p2 *p, stealRunNextG bool) *g {
+	t := pp.runqtail
+	n := runqgrab(p2, &pp.runq, t, stealRunNextG)
 	if n == 0 {
 		return nil
 	}
 	n--
-	gp := _p_.runq[(t+n)%uint32(len(_p_.runq))].ptr()
+	gp := pp.runq[(t+n)%uint32(len(pp.runq))].ptr()
 	if n == 0 {
 		return gp
 	}
-	h := atomic.LoadAcq(&_p_.runqhead) // load-acquire, synchronize with consumers
-	if t-h+n >= uint32(len(_p_.runq)) {
+	h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with consumers
+	if t-h+n >= uint32(len(pp.runq)) {
 		throw("runqsteal: runq overflow")
 	}
-	atomic.StoreRel(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
+	atomic.StoreRel(&pp.runqtail, t+n) // store-release, makes the item available for consumption
 	return gp
 }
 
@@ -6143,8 +6328,8 @@
 
 //go:nosplit
 func procPin() int {
-	_g_ := getg()
-	mp := _g_.m
+	gp := getg()
+	mp := gp.m
 
 	mp.locks++
 	return int(mp.p.ptr().id)
@@ -6152,8 +6337,8 @@
 
 //go:nosplit
 func procUnpin() {
-	_g_ := getg()
-	_g_.m.locks--
+	gp := getg()
+	gp.m.locks--
 }
 
 //go:linkname sync_runtime_procPin sync.runtime_procPin
@@ -6190,7 +6375,7 @@
 	// GOMAXPROCS>1 and there is at least one other running P and local runq is empty.
 	// As opposed to runtime mutex we don't do passive spinning here,
 	// because there can be work on global runq or on other Ps.
-	if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 {
+	if i >= active_spin || ncpu <= 1 || gomaxprocs <= sched.npidle.Load()+sched.nmspinning.Load()+1 {
 		return false
 	}
 	if p := getg().m.p.ptr(); !runqempty(p) {
@@ -6278,7 +6463,7 @@
 
 type tracestat struct {
 	active bool   // init tracing activation status
-	id     int64  // init goroutine id
+	id     uint64 // init goroutine id
 	allocs uint64 // heap allocations
 	bytes  uint64 // heap allocated bytes
 }
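
For context, the proc.go hunks above replace ad-hoc atomic.Load/Store/Cas/Xadd calls on plain integer fields (prof.hz, sched.npidle, sched.nmspinning, ...) with typed atomics that carry Load/Store/Add/CompareAndSwap methods. The sketch below shows the same pattern with the public sync/atomic package (Go 1.19+); the runtime itself uses runtime/internal/atomic, and the profState type and setHz function here are illustrative only, not runtime code.

// Minimal sketch of the typed-atomic pattern, using only the standard library.
package main

import (
	"fmt"
	"sync/atomic"
)

type profState struct {
	signalLock atomic.Uint32 // spin lock guarding writes to hz
	hz         atomic.Int32  // profiling rate; reads may be lock-free
}

func setHz(p *profState, hz int32) {
	// Spin until we own signalLock, mirroring the CompareAndSwap loop
	// the patch uses around prof.hz.
	for !p.signalLock.CompareAndSwap(0, 1) {
	}
	if p.hz.Load() != hz {
		p.hz.Store(hz)
	}
	p.signalLock.Store(0)
}

func main() {
	var p profState
	setHz(&p, 100)
	fmt.Println(p.hz.Load()) // prints 100
}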
diff --git a/src/runtime/profbuf.go b/src/runtime/profbuf.go
index 3d907d5..c579f21 100644
--- a/src/runtime/profbuf.go
+++ b/src/runtime/profbuf.go
@@ -87,9 +87,9 @@
 type profBuf struct {
 	// accessed atomically
 	r, w         profAtomic
-	overflow     uint64
-	overflowTime uint64
-	eof          uint32
+	overflow     atomic.Uint64
+	overflowTime atomic.Uint64
+	eof          atomic.Uint32
 
 	// immutable (excluding slice content)
 	hdrsize uintptr
@@ -150,15 +150,15 @@
 
 // hasOverflow reports whether b has any overflow records pending.
 func (b *profBuf) hasOverflow() bool {
-	return uint32(atomic.Load64(&b.overflow)) > 0
+	return uint32(b.overflow.Load()) > 0
 }
 
 // takeOverflow consumes the pending overflow records, returning the overflow count
 // and the time of the first overflow.
 // When called by the reader, it is racing against incrementOverflow.
 func (b *profBuf) takeOverflow() (count uint32, time uint64) {
-	overflow := atomic.Load64(&b.overflow)
-	time = atomic.Load64(&b.overflowTime)
+	overflow := b.overflow.Load()
+	time = b.overflowTime.Load()
 	for {
 		count = uint32(overflow)
 		if count == 0 {
@@ -166,11 +166,11 @@
 			break
 		}
 		// Increment generation, clear overflow count in low bits.
-		if atomic.Cas64(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
+		if b.overflow.CompareAndSwap(overflow, ((overflow>>32)+1)<<32) {
 			break
 		}
-		overflow = atomic.Load64(&b.overflow)
-		time = atomic.Load64(&b.overflowTime)
+		overflow = b.overflow.Load()
+		time = b.overflowTime.Load()
 	}
 	return uint32(overflow), time
 }
@@ -179,14 +179,14 @@
 // It is racing against a possible takeOverflow in the reader.
 func (b *profBuf) incrementOverflow(now int64) {
 	for {
-		overflow := atomic.Load64(&b.overflow)
+		overflow := b.overflow.Load()
 
 		// Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot.
 		// We need to set overflowTime if we're incrementing b.overflow from 0.
 		if uint32(overflow) == 0 {
 			// Store overflowTime first so it's always available when overflow != 0.
-			atomic.Store64(&b.overflowTime, uint64(now))
-			atomic.Store64(&b.overflow, (((overflow>>32)+1)<<32)+1)
+			b.overflowTime.Store(uint64(now))
+			b.overflow.Store((((overflow >> 32) + 1) << 32) + 1)
 			break
 		}
 		// Otherwise we're racing to increment against reader
@@ -196,7 +196,7 @@
 		if int32(overflow) == -1 {
 			break
 		}
-		if atomic.Cas64(&b.overflow, overflow, overflow+1) {
+		if b.overflow.CompareAndSwap(overflow, overflow+1) {
 			break
 		}
 	}
@@ -394,10 +394,10 @@
 // close signals that there will be no more writes on the buffer.
 // Once all the data has been read from the buffer, reads will return eof=true.
 func (b *profBuf) close() {
-	if atomic.Load(&b.eof) > 0 {
+	if b.eof.Load() > 0 {
 		throw("runtime: profBuf already closed")
 	}
-	atomic.Store(&b.eof, 1)
+	b.eof.Store(1)
 	b.wakeupExtra()
 }
 
@@ -475,7 +475,7 @@
 			dst[2+b.hdrsize] = uint64(count)
 			return dst[:2+b.hdrsize+1], overflowTag[:1], false
 		}
-		if atomic.Load(&b.eof) > 0 {
+		if b.eof.Load() > 0 {
 			// No data, no overflow, EOF set: done.
 			return nil, nil, true
 		}
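
The profbuf.go hunks keep the same packed encoding as before: b.overflow holds a generation count in its high 32 bits and a pending-drop count in its low 32 bits, updated with compare-and-swap so the reader and the signal-handler writer can race safely. Below is a simplified sketch of that packing using the public sync/atomic package; the names are illustrative, and the overflowTime and saturation handling of the real code are omitted.

package main

import (
	"fmt"
	"sync/atomic"
)

var overflow atomic.Uint64 // high 32 bits: generation; low 32 bits: pending count

// addOverflow records one more dropped record (cf. incrementOverflow).
func addOverflow() {
	for {
		v := overflow.Load()
		if overflow.CompareAndSwap(v, v+1) {
			return
		}
	}
}

// takeOverflow consumes the pending count and bumps the generation,
// clearing the low bits (cf. the reader side above).
func takeOverflow() uint32 {
	for {
		v := overflow.Load()
		count := uint32(v)
		if count == 0 || overflow.CompareAndSwap(v, ((v>>32)+1)<<32) {
			return count
		}
	}
}

func main() {
	addOverflow()
	addOverflow()
	fmt.Println(takeOverflow()) // prints 2
	fmt.Println(takeOverflow()) // prints 0
}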
diff --git a/src/runtime/race.go b/src/runtime/race.go
index 4694288..f83a04d 100644
--- a/src/runtime/race.go
+++ b/src/runtime/race.go
@@ -67,21 +67,21 @@
 // Non-synchronization events (memory accesses, function entry/exit) still affect
 // the race detector.
 func RaceDisable() {
-	_g_ := getg()
-	if _g_.raceignore == 0 {
-		racecall(&__tsan_go_ignore_sync_begin, _g_.racectx, 0, 0, 0)
+	gp := getg()
+	if gp.raceignore == 0 {
+		racecall(&__tsan_go_ignore_sync_begin, gp.racectx, 0, 0, 0)
 	}
-	_g_.raceignore++
+	gp.raceignore++
 }
 
 //go:nosplit
 
 // RaceEnable re-enables handling of race events in the current goroutine.
 func RaceEnable() {
-	_g_ := getg()
-	_g_.raceignore--
-	if _g_.raceignore == 0 {
-		racecall(&__tsan_go_ignore_sync_end, _g_.racectx, 0, 0, 0)
+	gp := getg()
+	gp.raceignore--
+	if gp.raceignore == 0 {
+		racecall(&__tsan_go_ignore_sync_end, gp.racectx, 0, 0, 0)
 	}
 }
 
@@ -187,7 +187,7 @@
 							continue
 						}
 						ctx.pc = f.Entry() + uintptr(inltree[ix].parentPc) // "caller" pc
-						ctx.fn = cfuncnameFromNameoff(fi, inltree[ix].func_)
+						ctx.fn = cfuncnameFromNameOff(fi, inltree[ix].nameOff)
 						ctx.line = uintptr(line)
 						ctx.file = &bytes(file)[0] // assume NUL-terminated
 						ctx.off = pc - f.Entry()
@@ -350,7 +350,7 @@
 // with up to 4 uintptr arguments.
 func racecall(fn *byte, arg0, arg1, arg2, arg3 uintptr)
 
-// checks if the address has shadow (i.e. heap or data/bss)
+// checks if the address has shadow (i.e. heap or data/bss).
 //
 //go:nosplit
 func isvalidaddr(addr unsafe.Pointer) bool {
@@ -360,8 +360,8 @@
 
 //go:nosplit
 func raceinit() (gctx, pctx uintptr) {
-	// cgo is required to initialize libc, which is used by race runtime
-	if !iscgo {
+	// On most machines, cgo is required to initialize libc, which is used by race runtime.
+	if !iscgo && GOOS != "darwin" {
 		throw("raceinit: race build must use cgo")
 	}
 
@@ -453,12 +453,12 @@
 
 //go:nosplit
 func racegostart(pc uintptr) uintptr {
-	_g_ := getg()
+	gp := getg()
 	var spawng *g
-	if _g_.m.curg != nil {
-		spawng = _g_.m.curg
+	if gp.m.curg != nil {
+		spawng = gp.m.curg
 	} else {
-		spawng = _g_
+		spawng = gp
 	}
 
 	var racectx uintptr
@@ -478,8 +478,8 @@
 
 //go:nosplit
 func racewriterangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
-	_g_ := getg()
-	if _g_ != _g_.m.curg {
+	gp := getg()
+	if gp != gp.m.curg {
 		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
 		// Not interesting.
 		return
@@ -495,8 +495,8 @@
 
 //go:nosplit
 func racereadrangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
-	_g_ := getg()
-	if _g_ != _g_.m.curg {
+	gp := getg()
+	if gp != gp.m.curg {
 		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
 		// Not interesting.
 		return
diff --git a/src/runtime/race/README b/src/runtime/race/README
index ad8f55f..596700a 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -6,7 +6,6 @@
 
 race_darwin_amd64.syso built with LLVM 127e59048cd3d8dbb80c14b3036918c114089529 and Go 59ab6f351a370a27458755dc69f4a837e55a05a6.
 race_freebsd_amd64.syso built with LLVM 127e59048cd3d8dbb80c14b3036918c114089529 and Go 59ab6f351a370a27458755dc69f4a837e55a05a6.
-race_linux_amd64.syso built with LLVM 127e59048cd3d8dbb80c14b3036918c114089529 and Go 59ab6f351a370a27458755dc69f4a837e55a05a6.
 race_linux_ppc64le.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
 race_netbsd_amd64.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
 race_windows_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
@@ -14,3 +13,5 @@
 race_darwin_arm64.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
 race_openbsd_amd64.syso built with LLVM fcf6ae2f070eba73074b6ec8d8281e54d29dbeeb and Go 8f2db14cd35bbd674cb2988a508306de6655e425.
 race_linux_s390x.syso built with LLVM 41cb504b7c4b18ac15830107431a0c1eec73a6b2 and Go 851ecea4cc99ab276109493477b2c7e30c253ea8.
+internal/amd64v3/race_linux.syso built with LLVM 74c2d4f6024c8f160871a2baa928d0b42415f183 and Go c0f27eb3d580c8b9efd73802678eba4c6c9461be.
+internal/amd64v1/race_linux.syso built with LLVM 74c2d4f6024c8f160871a2baa928d0b42415f183 and Go c0f27eb3d580c8b9efd73802678eba4c6c9461be.
diff --git a/src/runtime/race/doc.go b/src/runtime/race/doc.go
index 9e93f66..60a20df 100644
--- a/src/runtime/race/doc.go
+++ b/src/runtime/race/doc.go
@@ -7,3 +7,5 @@
 // For details about the race detector see
 // https://golang.org/doc/articles/race_detector.html
 package race
+
+//go:generate ./mkcgo.sh
diff --git a/src/runtime/race/internal/amd64v1/doc.go b/src/runtime/race/internal/amd64v1/doc.go
new file mode 100644
index 0000000..ccb088c
--- /dev/null
+++ b/src/runtime/race/internal/amd64v1/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This package holds the race detector .syso for
+// amd64 architectures with GOAMD64<v3.
+
+//go:build amd64 && ((linux && !amd64.v3) || darwin || freebsd || netbsd || openbsd || windows)
+
+package amd64v1
diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/internal/amd64v1/race_darwin.syso
similarity index 100%
rename from src/runtime/race/race_darwin_amd64.syso
rename to src/runtime/race/internal/amd64v1/race_darwin.syso
Binary files differ
diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/internal/amd64v1/race_freebsd.syso
similarity index 100%
rename from src/runtime/race/race_freebsd_amd64.syso
rename to src/runtime/race/internal/amd64v1/race_freebsd.syso
Binary files differ
diff --git a/src/runtime/race/internal/amd64v1/race_linux.syso b/src/runtime/race/internal/amd64v1/race_linux.syso
new file mode 100644
index 0000000..68f1508
--- /dev/null
+++ b/src/runtime/race/internal/amd64v1/race_linux.syso
Binary files differ
diff --git a/src/runtime/race/race_netbsd_amd64.syso b/src/runtime/race/internal/amd64v1/race_netbsd.syso
similarity index 100%
rename from src/runtime/race/race_netbsd_amd64.syso
rename to src/runtime/race/internal/amd64v1/race_netbsd.syso
Binary files differ
diff --git a/src/runtime/race/race_openbsd_amd64.syso b/src/runtime/race/internal/amd64v1/race_openbsd.syso
similarity index 100%
rename from src/runtime/race/race_openbsd_amd64.syso
rename to src/runtime/race/internal/amd64v1/race_openbsd.syso
Binary files differ
diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/internal/amd64v1/race_windows.syso
similarity index 100%
rename from src/runtime/race/race_windows_amd64.syso
rename to src/runtime/race/internal/amd64v1/race_windows.syso
Binary files differ
diff --git a/src/runtime/race/internal/amd64v3/doc.go b/src/runtime/race/internal/amd64v3/doc.go
new file mode 100644
index 0000000..215998a
--- /dev/null
+++ b/src/runtime/race/internal/amd64v3/doc.go
@@ -0,0 +1,10 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This package holds the race detector .syso for
+// amd64 architectures with GOAMD64>=v3.
+
+//go:build amd64 && linux && amd64.v3
+
+package amd64v3
diff --git a/src/runtime/race/internal/amd64v3/race_linux.syso b/src/runtime/race/internal/amd64v3/race_linux.syso
new file mode 100644
index 0000000..33c3e76
--- /dev/null
+++ b/src/runtime/race/internal/amd64v3/race_linux.syso
Binary files differ
diff --git a/src/runtime/race/mkcgo.sh b/src/runtime/race/mkcgo.sh
new file mode 100755
index 0000000..6ebe5a4
--- /dev/null
+++ b/src/runtime/race/mkcgo.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+hdr='
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by mkcgo.sh. DO NOT EDIT.
+
+//go:build race
+
+'
+
+convert() {
+	(echo "$hdr"; go tool cgo -dynpackage race -dynimport $1) | gofmt
+}
+
+convert race_darwin_arm64.syso >race_darwin_arm64.go
+convert internal/amd64v1/race_darwin.syso >race_darwin_amd64.go
+
diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go
index 8692066..9c508eb 100644
--- a/src/runtime/race/race.go
+++ b/src/runtime/race/race.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (race && linux && amd64) || (race && freebsd && amd64) || (race && netbsd && amd64) || (race && darwin && amd64) || (race && windows && amd64) || (race && linux && ppc64le) || (race && linux && arm64) || (race && darwin && arm64) || (race && openbsd && amd64) || (race && linux && s390x)
+//go:build race && ((linux && (amd64 || arm64 || ppc64le || s390x)) || ((freebsd || netbsd || openbsd || windows) && amd64))
 
 package race
 
@@ -11,5 +11,10 @@
 // The prebuilt race runtime lives in race_GOOS_GOARCH.syso.
 // Calls to the runtime are done directly from src/runtime/race.go.
 
+// On darwin we always use system DLLs to create threads,
+// so we use race_darwin_$GOARCH.go to provide the syso-derived
+// symbol information without needing to invoke cgo.
+// This allows -race to be used on Mac systems without a C toolchain.
+
 // void __race_unused_func(void);
 import "C"
diff --git a/src/runtime/race/race_darwin_amd64.go b/src/runtime/race/race_darwin_amd64.go
new file mode 100644
index 0000000..fbb838a
--- /dev/null
+++ b/src/runtime/race/race_darwin_amd64.go
@@ -0,0 +1,101 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by mkcgo.sh. DO NOT EDIT.
+
+//go:build race
+
+package race
+
+//go:cgo_import_dynamic _Block_object_assign _Block_object_assign ""
+//go:cgo_import_dynamic _Block_object_dispose _Block_object_dispose ""
+//go:cgo_import_dynamic _NSConcreteStackBlock _NSConcreteStackBlock ""
+//go:cgo_import_dynamic _NSGetArgv _NSGetArgv ""
+//go:cgo_import_dynamic _NSGetEnviron _NSGetEnviron ""
+//go:cgo_import_dynamic _NSGetExecutablePath _NSGetExecutablePath ""
+//go:cgo_import_dynamic __bzero __bzero ""
+//go:cgo_import_dynamic __error __error ""
+//go:cgo_import_dynamic __fork __fork ""
+//go:cgo_import_dynamic __mmap __mmap ""
+//go:cgo_import_dynamic __munmap __munmap ""
+//go:cgo_import_dynamic __stack_chk_fail __stack_chk_fail ""
+//go:cgo_import_dynamic __stack_chk_guard __stack_chk_guard ""
+//go:cgo_import_dynamic _dyld_get_image_header _dyld_get_image_header ""
+//go:cgo_import_dynamic _dyld_get_image_name _dyld_get_image_name ""
+//go:cgo_import_dynamic _dyld_get_image_vmaddr_slide _dyld_get_image_vmaddr_slide ""
+//go:cgo_import_dynamic _dyld_get_shared_cache_range _dyld_get_shared_cache_range ""
+//go:cgo_import_dynamic _dyld_get_shared_cache_uuid _dyld_get_shared_cache_uuid ""
+//go:cgo_import_dynamic _dyld_image_count _dyld_image_count ""
+//go:cgo_import_dynamic _exit _exit ""
+//go:cgo_import_dynamic abort abort ""
+//go:cgo_import_dynamic arc4random_buf arc4random_buf ""
+//go:cgo_import_dynamic close close ""
+//go:cgo_import_dynamic dlsym dlsym ""
+//go:cgo_import_dynamic dup dup ""
+//go:cgo_import_dynamic dup2 dup2 ""
+//go:cgo_import_dynamic dyld_shared_cache_iterate_text dyld_shared_cache_iterate_text ""
+//go:cgo_import_dynamic execve execve ""
+//go:cgo_import_dynamic exit exit ""
+//go:cgo_import_dynamic fstat$INODE64 fstat$INODE64 ""
+//go:cgo_import_dynamic ftruncate ftruncate ""
+//go:cgo_import_dynamic getpid getpid ""
+//go:cgo_import_dynamic getrlimit getrlimit ""
+//go:cgo_import_dynamic gettimeofday gettimeofday ""
+//go:cgo_import_dynamic getuid getuid ""
+//go:cgo_import_dynamic grantpt grantpt ""
+//go:cgo_import_dynamic ioctl ioctl ""
+//go:cgo_import_dynamic isatty isatty ""
+//go:cgo_import_dynamic lstat$INODE64 lstat$INODE64 ""
+//go:cgo_import_dynamic mach_absolute_time mach_absolute_time ""
+//go:cgo_import_dynamic mach_task_self_ mach_task_self_ ""
+//go:cgo_import_dynamic mach_timebase_info mach_timebase_info ""
+//go:cgo_import_dynamic mach_vm_region_recurse mach_vm_region_recurse ""
+//go:cgo_import_dynamic madvise madvise ""
+//go:cgo_import_dynamic malloc_num_zones malloc_num_zones ""
+//go:cgo_import_dynamic malloc_zones malloc_zones ""
+//go:cgo_import_dynamic memcpy memcpy ""
+//go:cgo_import_dynamic memset_pattern16 memset_pattern16 ""
+//go:cgo_import_dynamic mkdir mkdir ""
+//go:cgo_import_dynamic mprotect mprotect ""
+//go:cgo_import_dynamic open open ""
+//go:cgo_import_dynamic pipe pipe ""
+//go:cgo_import_dynamic posix_openpt posix_openpt ""
+//go:cgo_import_dynamic posix_spawn posix_spawn ""
+//go:cgo_import_dynamic posix_spawn_file_actions_addclose posix_spawn_file_actions_addclose ""
+//go:cgo_import_dynamic posix_spawn_file_actions_adddup2 posix_spawn_file_actions_adddup2 ""
+//go:cgo_import_dynamic posix_spawn_file_actions_destroy posix_spawn_file_actions_destroy ""
+//go:cgo_import_dynamic posix_spawn_file_actions_init posix_spawn_file_actions_init ""
+//go:cgo_import_dynamic posix_spawnattr_destroy posix_spawnattr_destroy ""
+//go:cgo_import_dynamic posix_spawnattr_init posix_spawnattr_init ""
+//go:cgo_import_dynamic posix_spawnattr_setflags posix_spawnattr_setflags ""
+//go:cgo_import_dynamic pthread_attr_getstack pthread_attr_getstack ""
+//go:cgo_import_dynamic pthread_create pthread_create ""
+//go:cgo_import_dynamic pthread_get_stackaddr_np pthread_get_stackaddr_np ""
+//go:cgo_import_dynamic pthread_get_stacksize_np pthread_get_stacksize_np ""
+//go:cgo_import_dynamic pthread_getspecific pthread_getspecific ""
+//go:cgo_import_dynamic pthread_join pthread_join ""
+//go:cgo_import_dynamic pthread_self pthread_self ""
+//go:cgo_import_dynamic pthread_sigmask pthread_sigmask ""
+//go:cgo_import_dynamic pthread_threadid_np pthread_threadid_np ""
+//go:cgo_import_dynamic read read ""
+//go:cgo_import_dynamic readlink readlink ""
+//go:cgo_import_dynamic realpath$DARWIN_EXTSN realpath$DARWIN_EXTSN ""
+//go:cgo_import_dynamic rename rename ""
+//go:cgo_import_dynamic sched_yield sched_yield ""
+//go:cgo_import_dynamic setrlimit setrlimit ""
+//go:cgo_import_dynamic sigaction sigaction ""
+//go:cgo_import_dynamic stat$INODE64 stat$INODE64 ""
+//go:cgo_import_dynamic sysconf sysconf ""
+//go:cgo_import_dynamic sysctl sysctl ""
+//go:cgo_import_dynamic sysctlbyname sysctlbyname ""
+//go:cgo_import_dynamic task_info task_info ""
+//go:cgo_import_dynamic tcgetattr tcgetattr ""
+//go:cgo_import_dynamic tcsetattr tcsetattr ""
+//go:cgo_import_dynamic unlink unlink ""
+//go:cgo_import_dynamic unlockpt unlockpt ""
+//go:cgo_import_dynamic usleep usleep ""
+//go:cgo_import_dynamic vm_region_64 vm_region_64 ""
+//go:cgo_import_dynamic vm_region_recurse_64 vm_region_recurse_64 ""
+//go:cgo_import_dynamic waitpid waitpid ""
+//go:cgo_import_dynamic write write ""
diff --git a/src/runtime/race/race_darwin_arm64.go b/src/runtime/race/race_darwin_arm64.go
new file mode 100644
index 0000000..fe8584c
--- /dev/null
+++ b/src/runtime/race/race_darwin_arm64.go
@@ -0,0 +1,95 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code generated by mkcgo.sh. DO NOT EDIT.
+
+//go:build race
+
+package race
+
+//go:cgo_import_dynamic _NSGetArgv _NSGetArgv ""
+//go:cgo_import_dynamic _NSGetEnviron _NSGetEnviron ""
+//go:cgo_import_dynamic _NSGetExecutablePath _NSGetExecutablePath ""
+//go:cgo_import_dynamic __error __error ""
+//go:cgo_import_dynamic __fork __fork ""
+//go:cgo_import_dynamic __mmap __mmap ""
+//go:cgo_import_dynamic __munmap __munmap ""
+//go:cgo_import_dynamic __stack_chk_fail __stack_chk_fail ""
+//go:cgo_import_dynamic __stack_chk_guard __stack_chk_guard ""
+//go:cgo_import_dynamic _dyld_get_image_header _dyld_get_image_header ""
+//go:cgo_import_dynamic _dyld_get_image_name _dyld_get_image_name ""
+//go:cgo_import_dynamic _dyld_get_image_vmaddr_slide _dyld_get_image_vmaddr_slide ""
+//go:cgo_import_dynamic _dyld_image_count _dyld_image_count ""
+//go:cgo_import_dynamic _exit _exit ""
+//go:cgo_import_dynamic abort abort ""
+//go:cgo_import_dynamic arc4random_buf arc4random_buf ""
+//go:cgo_import_dynamic bzero bzero ""
+//go:cgo_import_dynamic close close ""
+//go:cgo_import_dynamic dlsym dlsym ""
+//go:cgo_import_dynamic dup dup ""
+//go:cgo_import_dynamic dup2 dup2 ""
+//go:cgo_import_dynamic execve execve ""
+//go:cgo_import_dynamic exit exit ""
+//go:cgo_import_dynamic fstat fstat ""
+//go:cgo_import_dynamic ftruncate ftruncate ""
+//go:cgo_import_dynamic getpid getpid ""
+//go:cgo_import_dynamic getrlimit getrlimit ""
+//go:cgo_import_dynamic gettimeofday gettimeofday ""
+//go:cgo_import_dynamic getuid getuid ""
+//go:cgo_import_dynamic grantpt grantpt ""
+//go:cgo_import_dynamic ioctl ioctl ""
+//go:cgo_import_dynamic isatty isatty ""
+//go:cgo_import_dynamic lstat lstat ""
+//go:cgo_import_dynamic mach_absolute_time mach_absolute_time ""
+//go:cgo_import_dynamic mach_task_self_ mach_task_self_ ""
+//go:cgo_import_dynamic mach_timebase_info mach_timebase_info ""
+//go:cgo_import_dynamic mach_vm_region_recurse mach_vm_region_recurse ""
+//go:cgo_import_dynamic madvise madvise ""
+//go:cgo_import_dynamic malloc_num_zones malloc_num_zones ""
+//go:cgo_import_dynamic malloc_zones malloc_zones ""
+//go:cgo_import_dynamic memcpy memcpy ""
+//go:cgo_import_dynamic memset_pattern16 memset_pattern16 ""
+//go:cgo_import_dynamic mkdir mkdir ""
+//go:cgo_import_dynamic mprotect mprotect ""
+//go:cgo_import_dynamic open open ""
+//go:cgo_import_dynamic pipe pipe ""
+//go:cgo_import_dynamic posix_openpt posix_openpt ""
+//go:cgo_import_dynamic posix_spawn posix_spawn ""
+//go:cgo_import_dynamic posix_spawn_file_actions_addclose posix_spawn_file_actions_addclose ""
+//go:cgo_import_dynamic posix_spawn_file_actions_adddup2 posix_spawn_file_actions_adddup2 ""
+//go:cgo_import_dynamic posix_spawn_file_actions_destroy posix_spawn_file_actions_destroy ""
+//go:cgo_import_dynamic posix_spawn_file_actions_init posix_spawn_file_actions_init ""
+//go:cgo_import_dynamic posix_spawnattr_destroy posix_spawnattr_destroy ""
+//go:cgo_import_dynamic posix_spawnattr_init posix_spawnattr_init ""
+//go:cgo_import_dynamic posix_spawnattr_setflags posix_spawnattr_setflags ""
+//go:cgo_import_dynamic pthread_attr_getstack pthread_attr_getstack ""
+//go:cgo_import_dynamic pthread_create pthread_create ""
+//go:cgo_import_dynamic pthread_get_stackaddr_np pthread_get_stackaddr_np ""
+//go:cgo_import_dynamic pthread_get_stacksize_np pthread_get_stacksize_np ""
+//go:cgo_import_dynamic pthread_getspecific pthread_getspecific ""
+//go:cgo_import_dynamic pthread_join pthread_join ""
+//go:cgo_import_dynamic pthread_self pthread_self ""
+//go:cgo_import_dynamic pthread_sigmask pthread_sigmask ""
+//go:cgo_import_dynamic pthread_threadid_np pthread_threadid_np ""
+//go:cgo_import_dynamic read read ""
+//go:cgo_import_dynamic readlink readlink ""
+//go:cgo_import_dynamic realpath$DARWIN_EXTSN realpath$DARWIN_EXTSN ""
+//go:cgo_import_dynamic rename rename ""
+//go:cgo_import_dynamic sched_yield sched_yield ""
+//go:cgo_import_dynamic setrlimit setrlimit ""
+//go:cgo_import_dynamic sigaction sigaction ""
+//go:cgo_import_dynamic stat stat ""
+//go:cgo_import_dynamic sysconf sysconf ""
+//go:cgo_import_dynamic sysctl sysctl ""
+//go:cgo_import_dynamic sysctlbyname sysctlbyname ""
+//go:cgo_import_dynamic task_info task_info ""
+//go:cgo_import_dynamic tcgetattr tcgetattr ""
+//go:cgo_import_dynamic tcsetattr tcsetattr ""
+//go:cgo_import_dynamic unlink unlink ""
+//go:cgo_import_dynamic unlockpt unlockpt ""
+//go:cgo_import_dynamic usleep usleep ""
+//go:cgo_import_dynamic vm_region_64 vm_region_64 ""
+//go:cgo_import_dynamic vm_region_recurse_64 vm_region_recurse_64 ""
+//go:cgo_import_dynamic waitpid waitpid ""
+//go:cgo_import_dynamic write write ""
diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso
deleted file mode 100644
index 6885610..0000000
--- a/src/runtime/race/race_linux_amd64.syso
+++ /dev/null
Binary files differ
diff --git a/src/runtime/race/race_unix_test.go b/src/runtime/race/race_unix_test.go
index 6cc0730..3cf53b0 100644
--- a/src/runtime/race/race_unix_test.go
+++ b/src/runtime/race/race_unix_test.go
@@ -19,11 +19,11 @@
 	if err != nil {
 		t.Fatalf("failed to mmap memory: %v", err)
 	}
+	defer syscall.Munmap(data)
 	p := (*uint32)(unsafe.Pointer(&data[0]))
 	atomic.AddUint32(p, 1)
 	(*p)++
 	if *p != 2 {
 		t.Fatalf("data[0] = %v, expect 2", *p)
 	}
-	syscall.Munmap(data)
 }
diff --git a/src/runtime/race/race_v1_amd64.go b/src/runtime/race/race_v1_amd64.go
new file mode 100644
index 0000000..7c40db1
--- /dev/null
+++ b/src/runtime/race/race_v1_amd64.go
@@ -0,0 +1,9 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build (linux && !amd64.v3) || darwin || freebsd || netbsd || openbsd || windows
+
+package race
+
+import _ "runtime/race/internal/amd64v1"
diff --git a/src/runtime/race/race_v3_amd64.go b/src/runtime/race/race_v3_amd64.go
new file mode 100644
index 0000000..80728d8
--- /dev/null
+++ b/src/runtime/race/race_v3_amd64.go
@@ -0,0 +1,9 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build linux && amd64.v3
+
+package race
+
+import _ "runtime/race/internal/amd64v3"
diff --git a/src/runtime/race/sched_test.go b/src/runtime/race/sched_test.go
index 9fe83ea..a66860c 100644
--- a/src/runtime/race/sched_test.go
+++ b/src/runtime/race/sched_test.go
@@ -7,10 +7,10 @@
 package race_test
 
 import (
-	"bytes"
 	"fmt"
 	"reflect"
 	"runtime"
+	"strings"
 	"testing"
 )
 
@@ -40,7 +40,7 @@
 		}
 	}
 
-	var buf bytes.Buffer
+	var buf strings.Builder
 	for i := 0; i < N; i++ {
 		fmt.Fprintf(&buf, "%v\n", out[i])
 	}
diff --git a/src/runtime/rdebug.go b/src/runtime/rdebug.go
index 1b213f1..7ecb2a5 100644
--- a/src/runtime/rdebug.go
+++ b/src/runtime/rdebug.go
@@ -15,8 +15,8 @@
 
 //go:linkname setPanicOnFault runtime/debug.setPanicOnFault
 func setPanicOnFault(new bool) (old bool) {
-	_g_ := getg()
-	old = _g_.paniconfault
-	_g_.paniconfault = new
+	gp := getg()
+	old = gp.paniconfault
+	gp.paniconfault = new
 	return old
 }
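
setPanicOnFault is the runtime side of the public runtime/debug.SetPanicOnFault API, which makes an unexpected memory fault on the calling goroutine panic instead of crashing the process. A minimal usage sketch follows; the fault itself is omitted, and only the save/restore idiom is shown.

package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	old := debug.SetPanicOnFault(true)
	defer debug.SetPanicOnFault(old) // restore the previous setting on return
	fmt.Println("panic-on-fault enabled; previous value:", old)
}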
diff --git a/src/runtime/retry.go b/src/runtime/retry.go
new file mode 100644
index 0000000..2e2f813
--- /dev/null
+++ b/src/runtime/retry.go
@@ -0,0 +1,23 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix
+
+package runtime
+
+// retryOnEAGAIN retries a function until it does not return EAGAIN.
+// It will use an increasing delay between calls, and retry up to 20 times.
+// The function argument is expected to return an errno value,
+// and retryOnEAGAIN will return any errno value other than EAGAIN.
+// If all retries return EAGAIN, then retryOnEAGAIN will return EAGAIN.
+func retryOnEAGAIN(fn func() int32) int32 {
+	for tries := 0; tries < 20; tries++ {
+		errno := fn()
+		if errno != _EAGAIN {
+			return errno
+		}
+		usleep_no_g(uint32(tries+1) * 1000) // milliseconds
+	}
+	return _EAGAIN
+}
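
retryOnEAGAIN is runtime-internal (it relies on usleep_no_g and the private
_EAGAIN constant), but the backoff pattern itself is easy to mirror in
ordinary Go. A minimal sketch, assuming a Unix-like system where
syscall.EAGAIN is the errno of interest; the linearly growing sleep bounds the
total wait at about 210ms over 20 attempts, matching the runtime helper:

    package main

    import (
    	"fmt"
    	"syscall"
    	"time"
    )

    // retryOnEAGAIN retries fn with an increasing delay, up to 20 times,
    // and returns the first result that is not EAGAIN.
    func retryOnEAGAIN(fn func() error) error {
    	for tries := 0; tries < 20; tries++ {
    		err := fn()
    		if err != syscall.EAGAIN { // use errors.Is for wrapped errors
    			return err
    		}
    		time.Sleep(time.Duration(tries+1) * time.Millisecond)
    	}
    	return syscall.EAGAIN
    }

    func main() {
    	attempts := 0
    	err := retryOnEAGAIN(func() error {
    		attempts++
    		if attempts < 3 {
    			return syscall.EAGAIN // simulate a transiently failing call
    		}
    		return nil
    	})
    	fmt.Println(attempts, err) // 3 <nil>
    }
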
diff --git a/src/runtime/rt0_freebsd_riscv64.s b/src/runtime/rt0_freebsd_riscv64.s
new file mode 100644
index 0000000..dc46b70
--- /dev/null
+++ b/src/runtime/rt0_freebsd_riscv64.s
@@ -0,0 +1,112 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// On FreeBSD argc/argv are passed in A0, not X2
+TEXT _rt0_riscv64_freebsd(SB),NOSPLIT|NOFRAME,$0
+	ADD	$8, A0, A1	// argv
+	MOV	0(A0), A0	// argc
+	JMP	main(SB)
+
+// When building with -buildmode=c-shared, this symbol is called when the shared
+// library is loaded.
+TEXT _rt0_riscv64_freebsd_lib(SB),NOSPLIT,$224
+	// Preserve callee-save registers, along with X1 (LR).
+	MOV	X1, (8*3)(X2)
+	MOV	X8, (8*4)(X2)
+	MOV	X9, (8*5)(X2)
+	MOV	X18, (8*6)(X2)
+	MOV	X19, (8*7)(X2)
+	MOV	X20, (8*8)(X2)
+	MOV	X21, (8*9)(X2)
+	MOV	X22, (8*10)(X2)
+	MOV	X23, (8*11)(X2)
+	MOV	X24, (8*12)(X2)
+	MOV	X25, (8*13)(X2)
+	MOV	X26, (8*14)(X2)
+	MOV	g, (8*15)(X2)
+	MOVD	F8, (8*16)(X2)
+	MOVD	F9, (8*17)(X2)
+	MOVD	F18, (8*18)(X2)
+	MOVD	F19, (8*19)(X2)
+	MOVD	F20, (8*20)(X2)
+	MOVD	F21, (8*21)(X2)
+	MOVD	F22, (8*22)(X2)
+	MOVD	F23, (8*23)(X2)
+	MOVD	F24, (8*24)(X2)
+	MOVD	F25, (8*25)(X2)
+	MOVD	F26, (8*26)(X2)
+	MOVD	F27, (8*27)(X2)
+
+	// Initialize g to nil in case it is used later, e.g. by sigaction in cgo_sigaction.go.
+	MOV	X0, g
+
+	MOV	A0, _rt0_riscv64_freebsd_lib_argc<>(SB)
+	MOV	A1, _rt0_riscv64_freebsd_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOV	$runtime·libpreinit(SB), T0
+	JALR	RA, T0
+
+	// Create a new thread to do the runtime initialization and return.
+	MOV	_cgo_sys_thread_create(SB), T0
+	BEQZ	T0, nocgo
+	MOV	$_rt0_riscv64_freebsd_lib_go(SB), A0
+	MOV	$0, A1
+	JALR	RA, T0
+	JMP	restore
+
+nocgo:
+	MOV	$0x800000, A0                     // stacksize = 8192KB
+	MOV	$_rt0_riscv64_freebsd_lib_go(SB), A1
+	MOV	A0, 8(X2)
+	MOV	A1, 16(X2)
+	MOV	$runtime·newosproc0(SB), T0
+	JALR	RA, T0
+
+restore:
+	// Restore callee-save registers, along with X1 (LR).
+	MOV	(8*3)(X2), X1
+	MOV	(8*4)(X2), X8
+	MOV	(8*5)(X2), X9
+	MOV	(8*6)(X2), X18
+	MOV	(8*7)(X2), X19
+	MOV	(8*8)(X2), X20
+	MOV	(8*9)(X2), X21
+	MOV	(8*10)(X2), X22
+	MOV	(8*11)(X2), X23
+	MOV	(8*12)(X2), X24
+	MOV	(8*13)(X2), X25
+	MOV	(8*14)(X2), X26
+	MOV	(8*15)(X2), g
+	MOVD	(8*16)(X2), F8
+	MOVD	(8*17)(X2), F9
+	MOVD	(8*18)(X2), F18
+	MOVD	(8*19)(X2), F19
+	MOVD	(8*20)(X2), F20
+	MOVD	(8*21)(X2), F21
+	MOVD	(8*22)(X2), F22
+	MOVD	(8*23)(X2), F23
+	MOVD	(8*24)(X2), F24
+	MOVD	(8*25)(X2), F25
+	MOVD	(8*26)(X2), F26
+	MOVD	(8*27)(X2), F27
+
+	RET
+
+TEXT _rt0_riscv64_freebsd_lib_go(SB),NOSPLIT,$0
+	MOV	_rt0_riscv64_freebsd_lib_argc<>(SB), A0
+	MOV	_rt0_riscv64_freebsd_lib_argv<>(SB), A1
+	MOV	$runtime·rt0_go(SB), T0
+	JALR	ZERO, T0
+
+DATA _rt0_riscv64_freebsd_lib_argc<>(SB)/8, $0
+GLOBL _rt0_riscv64_freebsd_lib_argc<>(SB),NOPTR, $8
+DATA _rt0_riscv64_freebsd_lib_argv<>(SB)/8, $0
+GLOBL _rt0_riscv64_freebsd_lib_argv<>(SB),NOPTR, $8
+
+TEXT main(SB),NOSPLIT|NOFRAME,$0
+	MOV	$runtime·rt0_go(SB), T0
+	JALR	ZERO, T0
diff --git a/src/runtime/rt0_linux_ppc64.s b/src/runtime/rt0_linux_ppc64.s
index 897d610..c9300a9 100644
--- a/src/runtime/rt0_linux_ppc64.s
+++ b/src/runtime/rt0_linux_ppc64.s
@@ -22,6 +22,7 @@
 	// There is no TLS base pointer.
 	//
 	// TODO(austin): Support ABI v1 dynamic linking entry point
+	XOR	R0, R0 // Note, newer kernels may not always set R0 to 0.
 	MOVD	$runtime·rt0_go(SB), R12
 	MOVD	R12, CTR
 	MOVBZ	runtime·iscgo(SB), R5
diff --git a/src/runtime/runtime-gdb.py b/src/runtime/runtime-gdb.py
index 5bb605c..c4462de 100644
--- a/src/runtime/runtime-gdb.py
+++ b/src/runtime/runtime-gdb.py
@@ -447,7 +447,7 @@
 		# args = gdb.string_to_argv(arg)
 		vp = gdb.lookup_type('void').pointer()
 		for ptr in SliceValue(gdb.parse_and_eval("'runtime.allgs'")):
-			if ptr['atomicstatus'] == G_DEAD:
+			if ptr['atomicstatus']['value'] == G_DEAD:
 				continue
 			s = ' '
 			if ptr['m']:
@@ -455,7 +455,7 @@
 			pc = ptr['sched']['pc'].cast(vp)
 			pc = pc_to_int(pc)
 			blk = gdb.block_for_pc(pc)
-			status = int(ptr['atomicstatus'])
+			status = int(ptr['atomicstatus']['value'])
 			st = sts.get(status, "unknown(%d)" % status)
 			print(s, ptr['goid'], "{0:8s}".format(st), blk.function)
 
@@ -472,7 +472,7 @@
 	"""
 	vp = gdb.lookup_type('void').pointer()
 	for ptr in SliceValue(gdb.parse_and_eval("'runtime.allgs'")):
-		if ptr['atomicstatus'] == G_DEAD:
+		if ptr['atomicstatus']['value'] == G_DEAD:
 			continue
 		if ptr['goid'] == goid:
 			break
@@ -480,7 +480,7 @@
 		return None, None
 	# Get the goroutine's saved state.
 	pc, sp = ptr['sched']['pc'], ptr['sched']['sp']
-	status = ptr['atomicstatus']&~G_SCAN
+	status = ptr['atomicstatus']['value']&~G_SCAN
 	# Goroutine is not running nor in syscall, so use the info in goroutine
 	if status != G_RUNNING and status != G_SYSCALL:
 		return pc.cast(vp), sp.cast(vp)
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index d97c2a2..4e7c227 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -6,6 +6,7 @@
 
 import (
 	"bytes"
+	"flag"
 	"fmt"
 	"internal/testenv"
 	"os"
@@ -16,6 +17,7 @@
 	"strconv"
 	"strings"
 	"testing"
+	"time"
 )
 
 // NOTE: In some configurations, GDB will segfault when sent a SIGWINCH signal.
@@ -40,6 +42,10 @@
 		if runtime.GOARCH == "mips" {
 			t.Skip("skipping gdb tests on linux/mips; see https://golang.org/issue/25939")
 		}
+		// Disable GDB tests on alpine until issue #54352 resolved.
+		if strings.HasSuffix(testenv.Builder(), "-alpine") {
+			t.Skip("skipping gdb tests on alpine; see https://golang.org/issue/54352")
+		}
 	case "freebsd":
 		t.Skip("skipping gdb tests on FreeBSD; see https://golang.org/issue/29508")
 	case "aix":
@@ -394,6 +400,15 @@
 	if runtime.GOOS == "netbsd" {
 		testenv.SkipFlaky(t, 15603)
 	}
+	if flag.Lookup("test.parallel").Value.(flag.Getter).Get().(int) < 2 {
+		// It is possible that this test will hang for a long time due to an
+		// apparent GDB bug reported in https://go.dev/issue/37405.
+		// If test parallelism is high enough, that might be ok: the other parallel
+		// tests will finish, and then this test will finish right before it would
+		// time out. However, if tests are running sequentially, a hang in this test
+		// would likely cause the remaining tests to run out of time.
+		testenv.SkipFlaky(t, 37405)
+	}
 
 	checkGdbEnvironment(t)
 	t.Parallel()
@@ -415,6 +430,7 @@
 	}
 
 	// Execute gdb commands.
+	start := time.Now()
 	args := []string{"-nx", "-batch",
 		"-iex", "add-auto-load-safe-path " + filepath.Join(testenv.GOROOT(t), "src", "runtime"),
 		"-ex", "set startup-with-shell off",
@@ -424,7 +440,32 @@
 		"-ex", "continue",
 		filepath.Join(dir, "a.exe"),
 	}
-	got, err := testenv.RunWithTimeout(t, exec.Command("gdb", args...))
+	cmd = testenv.Command(t, "gdb", args...)
+
+	// Work around the GDB hang reported in https://go.dev/issue/37405.
+	// Sometimes (rarely), the GDB process hangs completely when the Go program
+	// exits, and we suspect that the bug is on the GDB side.
+	//
+	// The default Cancel function added by testenv.Command will mark the test as
+	// failed if it is in danger of timing out, but we want to instead mark it as
+	// skipped. Change the Cancel function to kill the process and merely log
+	// instead of failing the test.
+	//
+	// (This approach does not scale: if the test parallelism is less than or
+	// equal to the number of tests that run right up to the deadline, then the
+	// remaining parallel tests are likely to time out. But as long as it's just
+	// this one flaky test, it's probably fine..?)
+	//
+	// If there is no deadline set on the test at all, relying on the timeout set
+	// by testenv.Command will cause the test to hang indefinitely, but that's
+	// what “no deadline” means, after all — and it's probably the right behavior
+	// anyway if someone is trying to investigate and fix the GDB bug.
+	cmd.Cancel = func() error {
+		t.Logf("GDB command timed out after %v: %v", time.Since(start), cmd)
+		return cmd.Process.Kill()
+	}
+
+	got, err := cmd.CombinedOutput()
 	t.Logf("gdb output:\n%s", got)
 	if err != nil {
 		if bytes.Contains(got, []byte("internal-error: wait returned unexpected status 0x0")) {
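
Replacing testenv.RunWithTimeout with testenv.Command plus a custom Cancel
relies on the exec.Cmd.Cancel hook added in Go 1.20, which runs once the
command's Context is done. A minimal stand-alone sketch of the same pattern;
the "sleep" command and the 2-second timeout are placeholders:

    package main

    import (
    	"context"
    	"log"
    	"os/exec"
    	"time"
    )

    func main() {
    	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    	defer cancel()

    	cmd := exec.CommandContext(ctx, "sleep", "10")

    	// Cancel is invoked when ctx expires. The default implementation
    	// kills the process; overriding it lets the caller also log, send a
    	// gentler signal, or (as the gdb test above does) avoid failing the
    	// test outright.
    	start := time.Now()
    	cmd.Cancel = func() error {
    		log.Printf("command timed out after %v: %v", time.Since(start), cmd)
    		return cmd.Process.Kill()
    	}

    	if err := cmd.Run(); err != nil {
    		log.Printf("run: %v", err)
    	}
    }
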
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go
index 2cf93ab..9f68738 100644
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -6,29 +6,29 @@
 
 import (
 	"runtime/internal/atomic"
-	_ "unsafe" // for go:linkname
+	"unsafe"
 )
 
 //go:generate go run wincallback.go
 //go:generate go run mkduff.go
 //go:generate go run mkfastlog2table.go
+//go:generate go run mklockrank.go -o lockrank.go
 
 var ticks ticksType
 
 type ticksType struct {
 	lock mutex
-	pad  uint32 // ensure 8-byte alignment of val on 386
-	val  uint64
+	val  atomic.Int64
 }
 
 // Note: Called by runtime/pprof in addition to runtime code.
 func tickspersecond() int64 {
-	r := int64(atomic.Load64(&ticks.val))
+	r := ticks.val.Load()
 	if r != 0 {
 		return r
 	}
 	lock(&ticks.lock)
-	r = int64(ticks.val)
+	r = ticks.val.Load()
 	if r == 0 {
 		t0 := nanotime()
 		c0 := cputicks()
@@ -42,7 +42,7 @@
 		if r == 0 {
 			r++
 		}
-		atomic.Store64(&ticks.val, uint64(r))
+		ticks.val.Store(r)
 	}
 	unlock(&ticks.lock)
 	return r
@@ -65,3 +65,52 @@
 func syscall_Exit(code int) {
 	exit(int32(code))
 }
+
+var godebugDefault string
+var godebugUpdate atomic.Pointer[func(string, string)]
+var godebugEnv atomic.Pointer[string] // set by parsedebugvars
+
+//go:linkname godebug_setUpdate internal/godebug.setUpdate
+func godebug_setUpdate(update func(string, string)) {
+	p := new(func(string, string))
+	*p = update
+	godebugUpdate.Store(p)
+	godebugNotify()
+}
+
+func godebugNotify() {
+	if update := godebugUpdate.Load(); update != nil {
+		var env string
+		if p := godebugEnv.Load(); p != nil {
+			env = *p
+		}
+		(*update)(godebugDefault, env)
+	}
+}
+
+//go:linkname syscall_runtimeSetenv syscall.runtimeSetenv
+func syscall_runtimeSetenv(key, value string) {
+	setenv_c(key, value)
+	if key == "GODEBUG" {
+		p := new(string)
+		*p = value
+		godebugEnv.Store(p)
+		godebugNotify()
+	}
+}
+
+//go:linkname syscall_runtimeUnsetenv syscall.runtimeUnsetenv
+func syscall_runtimeUnsetenv(key string) {
+	unsetenv_c(key)
+	if key == "GODEBUG" {
+		godebugEnv.Store(nil)
+		godebugNotify()
+	}
+}
+
+// writeErrStr writes a string to descriptor 2.
+//
+//go:nosplit
+func writeErrStr(s string) {
+	write(2, unsafe.Pointer(unsafe.StringData(s)), int32(len(s)))
+}
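
godebug_setUpdate and syscall_runtimeSetenv publish a callback and the latest
GODEBUG string through atomic pointers, so whichever side registers first, the
other observes it and godebugNotify delivers the current value. The same
pattern can be written against sync/atomic (the runtime uses its internal
atomic package); a minimal sketch with hypothetical names:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    var (
    	update atomic.Pointer[func(string)] // registered callback, if any
    	latest atomic.Pointer[string]       // most recent value, if any
    )

    func setUpdate(fn func(string)) {
    	update.Store(&fn)
    	notify()
    }

    func setValue(v string) {
    	latest.Store(&v)
    	notify()
    }

    // notify calls the callback with the latest value; it is safe to call
    // from either setter because both loads are atomic.
    func notify() {
    	if fn := update.Load(); fn != nil {
    		var v string
    		if p := latest.Load(); p != nil {
    			v = *p
    		}
    		(*fn)(v)
    	}
    }

    func main() {
    	setValue("gctrace=1")
    	setUpdate(func(v string) { fmt.Println("GODEBUG:", v) }) // prints gctrace=1
    }
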
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index e307901..277f18a 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -35,13 +35,13 @@
 //
 //go:nosplit
 func gotraceback() (level int32, all, crash bool) {
-	_g_ := getg()
+	gp := getg()
 	t := atomic.Load(&traceback_cache)
 	crash = t&tracebackCrash != 0
-	all = _g_.m.throwing >= throwTypeUser || t&tracebackAll != 0
-	if _g_.m.traceback != 0 {
-		level = int32(_g_.m.traceback)
-	} else if _g_.m.throwing >= throwTypeRuntime {
+	all = gp.m.throwing >= throwTypeUser || t&tracebackAll != 0
+	if gp.m.traceback != 0 {
+		level = int32(gp.m.traceback)
+	} else if gp.m.throwing >= throwTypeRuntime {
 		// Always include runtime frames in runtime throws unless
 		// otherwise overridden by m.traceback.
 		level = 2
@@ -56,7 +56,7 @@
 	argv **byte
 )
 
-// nosplit for use in linux startup sysargs
+// nosplit for use in linux startup sysargs.
 //
 //go:nosplit
 func argv_index(argv **byte, i int32) *byte {
@@ -355,6 +355,8 @@
 	{"adaptivestackstart", &debug.adaptivestackstart},
 }
 
+var globalGODEBUG string
+
 func parsedebugvars() {
 	// defaults
 	debug.cgocheck = 1
@@ -372,7 +374,9 @@
 		debug.madvdontneed = 1
 	}
 
-	for p := gogetenv("GODEBUG"); p != ""; {
+	globalGODEBUG = gogetenv("GODEBUG")
+	godebugEnv.StoreNoWB(&globalGODEBUG)
+	for p := globalGODEBUG; p != ""; {
 		field := ""
 		i := bytealg.IndexByteString(p, ',')
 		if i < 0 {
@@ -474,18 +478,18 @@
 
 //go:nosplit
 func acquirem() *m {
-	_g_ := getg()
-	_g_.m.locks++
-	return _g_.m
+	gp := getg()
+	gp.m.locks++
+	return gp.m
 }
 
 //go:nosplit
 func releasem(mp *m) {
-	_g_ := getg()
+	gp := getg()
 	mp.locks--
-	if mp.locks == 0 && _g_.preempt {
+	if mp.locks == 0 && gp.preempt {
 		// restore the preemption request in case we've cleared it in newstack
-		_g_.stackguard0 = stackPreempt
+		gp.stackguard0 = stackPreempt
 	}
 }
 
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index e178822..9381d1e 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -435,9 +435,9 @@
 	// 3. By debugCallWrap to pass parameters to a new goroutine because allocating a
 	//    closure in the runtime is forbidden.
 	param        unsafe.Pointer
-	atomicstatus uint32
+	atomicstatus atomic.Uint32
 	stackLock    uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
-	goid         int64
+	goid         uint64
 	schedlink    guintptr
 	waitsince    int64      // approx time when the g become blocked
 	waitreason   waitReason // if status==Gwaiting
@@ -461,14 +461,14 @@
 	activeStackChans bool
 	// parkingOnChan indicates that the goroutine is about to
 	// park on a chansend or chanrecv. Used to signal an unsafe point
-	// for stack shrinking. It's a boolean value, but is updated atomically.
-	parkingOnChan uint8
+	// for stack shrinking.
+	parkingOnChan atomic.Bool
 
 	raceignore     int8     // ignore race detection events
 	sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
 	tracking       bool     // whether we're tracking this G for sched latency statistics
 	trackingSeq    uint8    // used to decide whether to track this G
-	runnableStamp  int64    // timestamp of when the G last became runnable, only used when tracking
+	trackingStamp  int64    // timestamp of when the G last started being tracked
 	runnableTime   int64    // the amount of time spent runnable, cleared when running, only used when tracking
 	sysexitticks   int64    // cputicks when syscall has returned (for tracing)
 	traceseq       uint64   // trace event sequencer
@@ -487,7 +487,7 @@
 	cgoCtxt        []uintptr      // cgo traceback context
 	labels         unsafe.Pointer // profiler labels
 	timer          *timer         // cached timer for time.Sleep
-	selectDone     uint32         // are we participating in a select and did someone win the race?
+	selectDone     atomic.Uint32  // are we participating in a select and did someone win the race?
 
 	// goroutineProfiled indicates the status of this goroutine's stack for the
 	// current in-progress goroutine profile
@@ -516,6 +516,13 @@
 	tlsSize  = tlsSlots * goarch.PtrSize
 )
 
+// Values for m.freeWait.
+const (
+	freeMStack = 0 // M done, free stack and reference.
+	freeMRef   = 1 // M done, free reference.
+	freeMWait  = 2 // M still in use.
+)
+
 type m struct {
 	g0      *g     // goroutine with scheduling stack
 	morebuf gobuf  // gobuf arg to morestack
@@ -545,15 +552,16 @@
 	blocked       bool // m is blocked on a note
 	newSigstack   bool // minit on C thread called sigaltstack
 	printlock     int8
-	incgo         bool   // m is executing a cgo call
-	freeWait      uint32 // if == 0, safe to free g0 and delete m (atomic)
+	incgo         bool          // m is executing a cgo call
+	isextra       bool          // m is an extra m
+	freeWait      atomic.Uint32 // Whether it is safe to free g0 and delete m (one of freeMRef, freeMStack, freeMWait)
 	fastrand      uint64
 	needextram    bool
 	traceback     uint8
-	ncgocall      uint64      // number of cgo calls in total
-	ncgo          int32       // number of cgo calls currently in progress
-	cgoCallersUse uint32      // if non-zero, cgoCallers in use temporarily
-	cgoCallers    *cgoCallers // cgo traceback if crashing in cgo call
+	ncgocall      uint64        // number of cgo calls in total
+	ncgo          int32         // number of cgo calls currently in progress
+	cgoCallersUse atomic.Uint32 // if non-zero, cgoCallers in use temporarily
+	cgoCallers    *cgoCallers   // cgo traceback if crashing in cgo call
 	park          note
 	alllink       *m // on allm
 	schedlink     muintptr
@@ -583,12 +591,11 @@
 
 	// preemptGen counts the number of completed preemption
 	// signals. This is used to detect when a preemption is
-	// requested, but fails. Accessed atomically.
-	preemptGen uint32
+	// requested, but fails.
+	preemptGen atomic.Uint32
 
 	// Whether this is a pending preemption signal on this M.
-	// Accessed atomically.
-	signalPending uint32
+	signalPending atomic.Uint32
 
 	dlogPerM
 
@@ -668,19 +675,15 @@
 
 	palloc persistentAlloc // per-P to avoid mutex
 
-	_ uint32 // Alignment for atomic fields below
-
 	// The when field of the first entry on the timer heap.
-	// This is updated using atomic functions.
 	// This is 0 if the timer heap is empty.
-	timer0When uint64
+	timer0When atomic.Int64
 
 	// The earliest known nextwhen field of a timer with
 	// timerModifiedEarlier status. Because the timer may have been
 	// modified again, there need not be any timer with this value.
-	// This is updated using atomic functions.
 	// This is 0 if there are no timerModifiedEarlier timers.
-	timerModifiedEarliest uint64
+	timerModifiedEarliest atomic.Int64
 
 	// Per-P GC state
 	gcAssistTime         int64 // Nanoseconds in assistAlloc
@@ -713,7 +716,7 @@
 
 	// statsSeq is a counter indicating whether this P is currently
 	// writing any stats. Its value is even when not, odd when it is.
-	statsSeq uint32
+	statsSeq atomic.Uint32
 
 	// Lock for timers. We normally access the timers while running
 	// on this P, but the scheduler can also do it from a different P.
@@ -725,12 +728,10 @@
 	timers []*timer
 
 	// Number of timers in P's heap.
-	// Modified using atomic instructions.
-	numTimers uint32
+	numTimers atomic.Uint32
 
 	// Number of timerDeleted timers in P's heap.
-	// Modified using atomic instructions.
-	deletedTimers uint32
+	deletedTimers atomic.Uint32
 
 	// Race context used while executing timer functions.
 	timerRaceCtx uintptr
@@ -753,15 +754,19 @@
 	// scheduler ASAP (regardless of what G is running on it).
 	preempt bool
 
+	// pageTraceBuf is a buffer for writing out page allocation/free/scavenge traces.
+	//
+	// Used only if GOEXPERIMENT=pagetrace.
+	pageTraceBuf pageTraceBuf
+
 	// Padding is no longer needed. False sharing is now not a worry because p is large enough
 	// that its size class is an integer multiple of the cache line size (for any of our architectures).
 }
 
 type schedt struct {
-	// accessed atomically. keep at top to ensure alignment on 32-bit systems.
-	goidgen   uint64
-	lastpoll  uint64 // time of last network poll, 0 if currently polling
-	pollUntil uint64 // time to which current poll is sleeping
+	goidgen   atomic.Uint64
+	lastpoll  atomic.Int64 // time of last network poll, 0 if currently polling
+	pollUntil atomic.Int64 // time to which current poll is sleeping
 
 	lock mutex
 
@@ -776,11 +781,12 @@
 	nmsys        int32    // number of system m's not counted for deadlock
 	nmfreed      int64    // cumulative number of freed m's
 
-	ngsys uint32 // number of system goroutines; updated atomically
+	ngsys atomic.Int32 // number of system goroutines
 
-	pidle      puintptr // idle p's
-	npidle     uint32
-	nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go.
+	pidle        puintptr // idle p's
+	npidle       atomic.Int32
+	nmspinning   atomic.Int32  // See "Worker thread parking/unparking" comment in proc.go.
+	needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.
 
 	// Global runnable queue.
 	runq     gQueue
@@ -818,10 +824,10 @@
 	// m.exited is set. Linked through m.freelink.
 	freem *m
 
-	gcwaiting  uint32 // gc is waiting to run
+	gcwaiting  atomic.Bool // gc is waiting to run
 	stopwait   int32
 	stopnote   note
-	sysmonwait uint32
+	sysmonwait atomic.Bool
 	sysmonnote note
 
 	// safepointFn should be called on each P at the next GC
@@ -844,9 +850,16 @@
 	// timeToRun is a distribution of scheduling latencies, defined
 	// as the sum of time a G spends in the _Grunnable state before
 	// it transitions to _Grunning.
-	//
-	// timeToRun is protected by sched.lock.
 	timeToRun timeHistogram
+
+	// idleTime is the total CPU time Ps have "spent" idle.
+	//
+	// Reset on each GC cycle.
+	idleTime atomic.Int64
+
+	// totalMutexWaitTime is the sum of time goroutines have spent in _Gwaiting
+	// with a waitreason of the form waitReasonSync{RW,}Mutex{R,}Lock.
+	totalMutexWaitTime atomic.Int64
 }
 
 // Values for the flags field of a sigTabT.
@@ -867,8 +880,8 @@
 // Keep in sync with linker (../cmd/link/internal/ld/pcln.go:/pclntab)
 // and with package debug/gosym and with symtab.go in package runtime.
 type _func struct {
-	entryoff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart
-	nameoff  int32  // function name
+	entryOff uint32 // start pc, as offset from moduledata.text/pcHeader.textStart
+	nameOff  int32  // function name, as index into moduledata.funcnametab.
 
 	args        int32  // in/out args size
 	deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.
@@ -878,21 +891,45 @@
 	pcln      uint32
 	npcdata   uint32
 	cuOffset  uint32 // runtime.cutab offset of this function's CU
+	startLine int32  // line number of start of function (func keyword/TEXT directive)
 	funcID    funcID // set for certain special runtime functions
 	flag      funcFlag
 	_         [1]byte // pad
 	nfuncdata uint8   // must be last, must end on a uint32-aligned boundary
+
+	// The end of the struct is followed immediately by two variable-length
+	// arrays that reference the pcdata and funcdata locations for this
+	// function.
+
+	// pcdata contains the offset into moduledata.pctab for the start of
+	// that index's table. e.g.,
+	// &moduledata.pctab[_func.pcdata[_PCDATA_UnsafePoint]] is the start of
+	// the unsafe point table.
+	//
+	// An offset of 0 indicates that there is no table.
+	//
+	// pcdata [npcdata]uint32
+
+	// funcdata contains the offset past moduledata.gofunc which contains a
+	// pointer to that index's funcdata. e.g.,
+	// *(moduledata.gofunc +  _func.funcdata[_FUNCDATA_ArgsPointerMaps]) is
+	// the argument pointer map.
+	//
+	// An offset of ^uint32(0) indicates that there is no entry.
+	//
+	// funcdata [nfuncdata]uint32
 }
 
 // Pseudo-Func that is returned for PCs that occur in inlined code.
 // A *Func can be either a *_func or a *funcinl, and they are distinguished
 // by the first uintptr.
 type funcinl struct {
-	ones  uint32  // set to ^0 to distinguish from _func
-	entry uintptr // entry of the real (the "outermost") frame
-	name  string
-	file  string
-	line  int
+	ones      uint32  // set to ^0 to distinguish from _func
+	entry     uintptr // entry of the real (the "outermost") frame
+	name      string
+	file      string
+	line      int32
+	startLine int32
 }
 
 // layout of Itab known to compilers
@@ -917,7 +954,7 @@
 type forcegcstate struct {
 	lock mutex
 	g    *g
-	idle uint32
+	idle atomic.Bool
 }
 
 // extendRandom extends the random numbers in r[:n] to the whole slice r.
@@ -994,24 +1031,10 @@
 	goexit    bool
 }
 
-// stack traces
-type stkframe struct {
-	fn       funcInfo   // function being run
-	pc       uintptr    // program counter within fn
-	continpc uintptr    // program counter where execution can continue, or 0 if not
-	lr       uintptr    // program counter at caller aka link register
-	sp       uintptr    // stack pointer at pc
-	fp       uintptr    // stack pointer at caller aka frame pointer
-	varp     uintptr    // top of local variables
-	argp     uintptr    // pointer to function arguments
-	arglen   uintptr    // number of bytes at argp
-	argmap   *bitvector // force use of this argmap
-}
-
 // ancestorInfo records details of where a goroutine was started.
 type ancestorInfo struct {
 	pcs  []uintptr // pcs from the stack of this goroutine
-	goid int64     // goroutine id of this goroutine; original goroutine possibly dead
+	goid uint64    // goroutine id of this goroutine; original goroutine possibly dead
 	gopc uintptr   // pc of go statement that created this goroutine
 }
 
@@ -1050,12 +1073,17 @@
 	waitReasonSemacquire                              // "semacquire"
 	waitReasonSleep                                   // "sleep"
 	waitReasonSyncCondWait                            // "sync.Cond.Wait"
-	waitReasonTimerGoroutineIdle                      // "timer goroutine (idle)"
+	waitReasonSyncMutexLock                           // "sync.Mutex.Lock"
+	waitReasonSyncRWMutexRLock                        // "sync.RWMutex.RLock"
+	waitReasonSyncRWMutexLock                         // "sync.RWMutex.Lock"
 	waitReasonTraceReaderBlocked                      // "trace reader (blocked)"
 	waitReasonWaitForGCCycle                          // "wait for GC cycle"
 	waitReasonGCWorkerIdle                            // "GC worker (idle)"
+	waitReasonGCWorkerActive                          // "GC worker (active)"
 	waitReasonPreempted                               // "preempted"
 	waitReasonDebugCall                               // "debug call"
+	waitReasonGCMarkTermination                       // "GC mark termination"
+	waitReasonStoppingTheWorld                        // "stopping the world"
 )
 
 var waitReasonStrings = [...]string{
@@ -1080,12 +1108,17 @@
 	waitReasonSemacquire:            "semacquire",
 	waitReasonSleep:                 "sleep",
 	waitReasonSyncCondWait:          "sync.Cond.Wait",
-	waitReasonTimerGoroutineIdle:    "timer goroutine (idle)",
+	waitReasonSyncMutexLock:         "sync.Mutex.Lock",
+	waitReasonSyncRWMutexRLock:      "sync.RWMutex.RLock",
+	waitReasonSyncRWMutexLock:       "sync.RWMutex.Lock",
 	waitReasonTraceReaderBlocked:    "trace reader (blocked)",
 	waitReasonWaitForGCCycle:        "wait for GC cycle",
 	waitReasonGCWorkerIdle:          "GC worker (idle)",
+	waitReasonGCWorkerActive:        "GC worker (active)",
 	waitReasonPreempted:             "preempted",
 	waitReasonDebugCall:             "debug call",
+	waitReasonGCMarkTermination:     "GC mark termination",
+	waitReasonStoppingTheWorld:      "stopping the world",
 }
 
 func (w waitReason) String() string {
@@ -1095,6 +1128,12 @@
 	return waitReasonStrings[w]
 }
 
+func (w waitReason) isMutexWait() bool {
+	return w == waitReasonSyncMutexLock ||
+		w == waitReasonSyncRWMutexRLock ||
+		w == waitReasonSyncRWMutexLock
+}
+
 var (
 	allm       *m
 	gomaxprocs int32
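
Most of the runtime2.go churn converts fields that were "accessed atomically"
by convention into typed atomics, which also removes the manual alignment
padding once needed for 64-bit fields on 32-bit platforms. sync/atomic gained
the equivalent types in Go 1.19; a user-level sketch of the same migration:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    type sched struct {
    	// Before: ngsys uint32 // number of system goroutines; updated atomically
    	ngsys atomic.Int32 // number of system goroutines

    	// Before: gcwaiting uint32 // gc is waiting to run
    	gcwaiting atomic.Bool // gc is waiting to run

    	// atomic.Int64 is always 8-byte aligned, even on 32-bit targets,
    	// so no leading padding field is required.
    	lastpoll atomic.Int64
    }

    func main() {
    	var s sched
    	s.ngsys.Add(1)
    	s.gcwaiting.Store(true)
    	s.lastpoll.Store(42)
    	fmt.Println(s.ngsys.Load(), s.gcwaiting.Load(), s.lastpoll.Load()) // 1 true 42
    }
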
diff --git a/src/runtime/runtime_linux_test.go b/src/runtime/runtime_linux_test.go
index a753aee..6af5561 100644
--- a/src/runtime/runtime_linux_test.go
+++ b/src/runtime/runtime_linux_test.go
@@ -53,15 +53,6 @@
 	}
 }
 
-func TestEpollctlErrorSign(t *testing.T) {
-	v := Epollctl(-1, 1, -1, unsafe.Pointer(&EpollEvent{}))
-
-	const EBADF = 0x09
-	if v != -EBADF {
-		t.Errorf("epollctl = %v, want %v", v, -EBADF)
-	}
-}
-
 func TestKernelStructSize(t *testing.T) {
 	// Check that the Go definitions of structures exchanged with the kernel are
 	// the same size as what the kernel defines.
diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index 018a8db..2faf06e 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -377,7 +377,7 @@
 				if !ok {
 					b.Fatal("goroutine profile failed")
 				}
-				latencies = append(latencies, time.Now().Sub(start))
+				latencies = append(latencies, time.Since(start))
 			}
 			b.StopTimer()
 
diff --git a/src/runtime/rwmutex.go b/src/runtime/rwmutex.go
index 7713c3f..ede3d13 100644
--- a/src/runtime/rwmutex.go
+++ b/src/runtime/rwmutex.go
@@ -23,8 +23,8 @@
 	wLock  mutex    // serializes writers
 	writer muintptr // pending writer waiting for completing readers
 
-	readerCount uint32 // number of pending readers
-	readerWait  uint32 // number of departing readers
+	readerCount atomic.Int32 // number of pending readers
+	readerWait  atomic.Int32 // number of departing readers
 }
 
 const rwmutexMaxReaders = 1 << 30
@@ -36,7 +36,7 @@
 	// deadlock (issue #20903). Alternatively, we could drop the P
 	// while sleeping.
 	acquirem()
-	if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 {
+	if rw.readerCount.Add(1) < 0 {
 		// A writer is pending. Park on the reader queue.
 		systemstack(func() {
 			lockWithRank(&rw.rLock, lockRankRwmutexR)
@@ -60,12 +60,12 @@
 
 // runlock undoes a single rlock call on rw.
 func (rw *rwmutex) runlock() {
-	if r := int32(atomic.Xadd(&rw.readerCount, -1)); r < 0 {
+	if r := rw.readerCount.Add(-1); r < 0 {
 		if r+1 == 0 || r+1 == -rwmutexMaxReaders {
 			throw("runlock of unlocked rwmutex")
 		}
 		// A writer is pending.
-		if atomic.Xadd(&rw.readerWait, -1) == 0 {
+		if rw.readerWait.Add(-1) == 0 {
 			// The last reader unblocks the writer.
 			lockWithRank(&rw.rLock, lockRankRwmutexR)
 			w := rw.writer.ptr()
@@ -84,10 +84,10 @@
 	lockWithRank(&rw.wLock, lockRankRwmutexW)
 	m := getg().m
 	// Announce that there is a pending writer.
-	r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders
+	r := rw.readerCount.Add(-rwmutexMaxReaders) + rwmutexMaxReaders
 	// Wait for any active readers to complete.
 	lockWithRank(&rw.rLock, lockRankRwmutexR)
-	if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 {
+	if r != 0 && rw.readerWait.Add(r) != 0 {
 		// Wait for reader to wake us up.
 		systemstack(func() {
 			rw.writer.set(m)
@@ -103,7 +103,7 @@
 // unlock unlocks rw for writing.
 func (rw *rwmutex) unlock() {
 	// Announce to readers that there is no active writer.
-	r := int32(atomic.Xadd(&rw.readerCount, rwmutexMaxReaders))
+	r := rw.readerCount.Add(rwmutexMaxReaders)
 	if r >= rwmutexMaxReaders {
 		throw("unlock of unlocked rwmutex")
 	}
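
The readerCount and readerWait fields become typed atomics, but the locking
scheme is unchanged and mirrors sync.RWMutex: a writer subtracts
rwmutexMaxReaders so the count goes negative, which tells arriving readers to
park, and count+rwmutexMaxReaders recovers how many readers the writer still
has to wait out. A small arithmetic-only illustration of that bookkeeping:

    package main

    import "fmt"

    const rwmutexMaxReaders = 1 << 30

    func main() {
    	var readerCount int32

    	readerCount += 3 // three readers currently hold the lock

    	// Writer arrives: announce itself by making the count negative.
    	readerCount -= rwmutexMaxReaders
    	activeReaders := readerCount + rwmutexMaxReaders // 3 readers to wait for

    	// A reader arriving now performs Add(1), sees a negative result,
    	// and parks on the reader queue.
    	readerCount++

    	fmt.Println(readerCount < 0, activeReaders) // true 3
    }
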
diff --git a/src/runtime/rwmutex_test.go b/src/runtime/rwmutex_test.go
index f15d367..ddb16ae 100644
--- a/src/runtime/rwmutex_test.go
+++ b/src/runtime/rwmutex_test.go
@@ -17,10 +17,10 @@
 	"testing"
 )
 
-func parallelReader(m *RWMutex, clocked chan bool, cunlock *uint32, cdone chan bool) {
+func parallelReader(m *RWMutex, clocked chan bool, cunlock *atomic.Bool, cdone chan bool) {
 	m.RLock()
 	clocked <- true
-	for atomic.LoadUint32(cunlock) == 0 {
+	for !cunlock.Load() {
 	}
 	m.RUnlock()
 	cdone <- true
@@ -30,7 +30,7 @@
 	GOMAXPROCS(numReaders + 1)
 	var m RWMutex
 	clocked := make(chan bool, numReaders)
-	var cunlock uint32
+	var cunlock atomic.Bool
 	cdone := make(chan bool)
 	for i := 0; i < numReaders; i++ {
 		go parallelReader(&m, clocked, &cunlock, cdone)
@@ -39,7 +39,7 @@
 	for i := 0; i < numReaders; i++ {
 		<-clocked
 	}
-	atomic.StoreUint32(&cunlock, 1)
+	cunlock.Store(true)
 	// Wait for the goroutines to finish.
 	for i := 0; i < numReaders; i++ {
 		<-cdone
diff --git a/src/runtime/select.go b/src/runtime/select.go
index e18b2f1..1072465 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -8,7 +8,6 @@
 
 import (
 	"internal/abi"
-	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -70,7 +69,7 @@
 	// Mark that it's safe for stack shrinking to occur now,
 	// because any thread acquiring this G's stack for shrinking
 	// is guaranteed to observe activeStackChans after this store.
-	atomic.Store8(&gp.parkingOnChan, 0)
+	gp.parkingOnChan.Store(false)
 	// Make sure we unlock after setting activeStackChans and
 	// unsetting parkingOnChan. The moment we unlock any of the
 	// channel locks we risk gp getting readied by a channel operation
@@ -324,13 +323,13 @@
 	// to park on a channel. The window between when this G's status
 	// changes and when we set gp.activeStackChans is not safe for
 	// stack shrinking.
-	atomic.Store8(&gp.parkingOnChan, 1)
+	gp.parkingOnChan.Store(true)
 	gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1)
 	gp.activeStackChans = false
 
 	sellock(scases, lockorder)
 
-	gp.selectDone = 0
+	gp.selectDone.Store(0)
 	sg = (*sudog)(gp.param)
 	gp.param = nil
 
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 39935f7..bc23a85 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -39,8 +39,8 @@
 // BenchmarkSemTable/OneAddrCollision/* for a benchmark that exercises this.
 type semaRoot struct {
 	lock  mutex
-	treap *sudog // root of balanced tree of unique waiters.
-	nwait uint32 // Number of waiters. Read w/o the lock.
+	treap *sudog        // root of balanced tree of unique waiters.
+	nwait atomic.Uint32 // Number of waiters. Read w/o the lock.
 }
 
 var semtable semTable
@@ -59,12 +59,12 @@
 
 //go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
 func sync_runtime_Semacquire(addr *uint32) {
-	semacquire1(addr, false, semaBlockProfile, 0)
+	semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire)
 }
 
 //go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire
 func poll_runtime_Semacquire(addr *uint32) {
-	semacquire1(addr, false, semaBlockProfile, 0)
+	semacquire1(addr, false, semaBlockProfile, 0, waitReasonSemacquire)
 }
 
 //go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
@@ -74,7 +74,17 @@
 
 //go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
 func sync_runtime_SemacquireMutex(addr *uint32, lifo bool, skipframes int) {
-	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes)
+	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncMutexLock)
+}
+
+//go:linkname sync_runtime_SemacquireRWMutexR sync.runtime_SemacquireRWMutexR
+func sync_runtime_SemacquireRWMutexR(addr *uint32, lifo bool, skipframes int) {
+	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexRLock)
+}
+
+//go:linkname sync_runtime_SemacquireRWMutex sync.runtime_SemacquireRWMutex
+func sync_runtime_SemacquireRWMutex(addr *uint32, lifo bool, skipframes int) {
+	semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile, skipframes, waitReasonSyncRWMutexLock)
 }
 
 //go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease
@@ -98,10 +108,10 @@
 
 // Called from runtime.
 func semacquire(addr *uint32) {
-	semacquire1(addr, false, 0, 0)
+	semacquire1(addr, false, 0, 0, waitReasonSemacquire)
 }
 
-func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int) {
+func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags, skipframes int, reason waitReason) {
 	gp := getg()
 	if gp != gp.m.curg {
 		throw("semacquire not on the G stack")
@@ -137,17 +147,17 @@
 	for {
 		lockWithRank(&root.lock, lockRankRoot)
 		// Add ourselves to nwait to disable "easy case" in semrelease.
-		atomic.Xadd(&root.nwait, 1)
+		root.nwait.Add(1)
 		// Check cansemacquire to avoid missed wakeup.
 		if cansemacquire(addr) {
-			atomic.Xadd(&root.nwait, -1)
+			root.nwait.Add(-1)
 			unlock(&root.lock)
 			break
 		}
 		// Any semrelease after the cansemacquire knows we're waiting
 		// (we set nwait above), so go to sleep.
 		root.queue(addr, s, lifo)
-		goparkunlock(&root.lock, waitReasonSemacquire, traceEvGoBlockSync, 4+skipframes)
+		goparkunlock(&root.lock, reason, traceEvGoBlockSync, 4+skipframes)
 		if s.ticket != 0 || cansemacquire(addr) {
 			break
 		}
@@ -169,13 +179,13 @@
 	// Easy case: no waiters?
 	// This check must happen after the xadd, to avoid a missed wakeup
 	// (see loop in semacquire).
-	if atomic.Load(&root.nwait) == 0 {
+	if root.nwait.Load() == 0 {
 		return
 	}
 
 	// Harder case: search for a waiter and wake it.
 	lockWithRank(&root.lock, lockRankRoot)
-	if atomic.Load(&root.nwait) == 0 {
+	if root.nwait.Load() == 0 {
 		// The count is already consumed by another goroutine,
 		// so no need to wake up another goroutine.
 		unlock(&root.lock)
@@ -183,7 +193,7 @@
 	}
 	s, t0 := root.dequeue(addr)
 	if s != nil {
-		atomic.Xadd(&root.nwait, -1)
+		root.nwait.Add(-1)
 	}
 	unlock(&root.lock)
 	if s != nil { // May be slow or even yield, so unlock first
@@ -451,7 +461,7 @@
 type notifyList struct {
 	// wait is the ticket number of the next waiter. It is atomically
 	// incremented outside the lock.
-	wait uint32
+	wait atomic.Uint32
 
 	// notify is the ticket number of the next waiter to be notified. It can
 	// be read outside the lock, but is only written to with lock held.
@@ -482,7 +492,7 @@
 func notifyListAdd(l *notifyList) uint32 {
 	// This may be called concurrently, for example, when called from
 	// sync.Cond.Wait while holding a RWMutex in read mode.
-	return atomic.Xadd(&l.wait, 1) - 1
+	return l.wait.Add(1) - 1
 }
 
 // notifyListWait waits for a notification. If one has been sent since
@@ -527,7 +537,7 @@
 func notifyListNotifyAll(l *notifyList) {
 	// Fast-path: if there are no new waiters since the last notification
 	// we don't need to acquire the lock.
-	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
+	if l.wait.Load() == atomic.Load(&l.notify) {
 		return
 	}
 
@@ -542,7 +552,7 @@
 	// value of wait because any previous waiters are already in the list
 	// or will notice that they have already been notified when trying to
 	// add themselves to the list.
-	atomic.Store(&l.notify, atomic.Load(&l.wait))
+	atomic.Store(&l.notify, l.wait.Load())
 	unlock(&l.lock)
 
 	// Go through the local list and ready all waiters.
@@ -560,7 +570,7 @@
 func notifyListNotifyOne(l *notifyList) {
 	// Fast-path: if there are no new waiters since the last notification
 	// we don't need to acquire the lock at all.
-	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
+	if l.wait.Load() == atomic.Load(&l.notify) {
 		return
 	}
 
@@ -568,7 +578,7 @@
 
 	// Re-check under the lock if we need to do anything.
 	t := l.notify
-	if t == atomic.Load(&l.wait) {
+	if t == l.wait.Load() {
 		unlock(&l.lock)
 		return
 	}
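
Threading a waitReason through semacquire1 is what lets goroutines blocked on
sync.Mutex and sync.RWMutex report "sync.Mutex.Lock" and friends instead of
the generic "semacquire" (and, via isMutexWait, feed the new
totalMutexWaitTime accounting). The reported reason is visible in a full
goroutine dump; a small way to observe it, assuming Go 1.20 semantics:

    package main

    import (
    	"os"
    	"runtime/pprof"
    	"sync"
    	"time"
    )

    func main() {
    	var mu sync.Mutex
    	mu.Lock()
    	go func() {
    		mu.Lock() // blocks; its wait reason becomes "sync.Mutex.Lock"
    	}()
    	time.Sleep(100 * time.Millisecond)
    	// debug=2 prints one stanza per goroutine, including the wait reason
    	// in the header, e.g. "goroutine 18 [sync.Mutex.Lock]:".
    	pprof.Lookup("goroutine").WriteTo(os.Stdout, 2)
    }
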
diff --git a/src/runtime/semasleep_test.go b/src/runtime/semasleep_test.go
index d56733c..7262853 100644
--- a/src/runtime/semasleep_test.go
+++ b/src/runtime/semasleep_test.go
@@ -37,14 +37,16 @@
 	if err := cmd.Start(); err != nil {
 		t.Fatalf("Failed to start command: %v", err)
 	}
+
+	waiting := false
 	doneCh := make(chan error, 1)
-	go func() {
-		doneCh <- cmd.Wait()
-		close(doneCh)
-	}()
 	t.Cleanup(func() {
 		cmd.Process.Kill()
-		<-doneCh
+		if waiting {
+			<-doneCh
+		} else {
+			cmd.Wait()
+		}
 	})
 
 	// Wait for After1 to close its stdout so that we know the runtime's SIGIO
@@ -57,6 +59,19 @@
 		t.Fatalf("error reading from testprog: %v", err)
 	}
 
+	// Wait for child exit.
+	//
+	// Note that we must do this after waiting for the write/child end of
+	// stdout to close. Wait closes the read/parent end of stdout, so
+	// starting this goroutine prior to io.ReadAll introduces a race
+	// condition where ReadAll may get fs.ErrClosed if the child exits too
+	// quickly.
+	waiting = true
+	go func() {
+		doneCh <- cmd.Wait()
+		close(doneCh)
+	}()
+
 	// Wait for an arbitrary timeout longer than one second. The subprocess itself
 	// attempts to sleep for one second, but if the machine running the test is
 	// heavily loaded that subprocess may not schedule very quickly even if the
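
The reordering above matters because os/exec's Wait closes the parent end of
the pipes it created, so calling cmd.Wait concurrently with reading stdout can
make the read fail with fs.ErrClosed when the child exits quickly; reading to
EOF first and only then waiting avoids the race. A minimal sketch of the safe
order (the "echo" command assumes a Unix-like system):

    package main

    import (
    	"fmt"
    	"io"
    	"os/exec"
    )

    func run() error {
    	cmd := exec.Command("echo", "hello")
    	stdout, err := cmd.StdoutPipe()
    	if err != nil {
    		return err
    	}
    	if err := cmd.Start(); err != nil {
    		return err
    	}
    	out, err := io.ReadAll(stdout) // drain stdout to EOF first
    	if err != nil {
    		return err
    	}
    	fmt.Printf("%s", out)
    	return cmd.Wait() // only now let Wait close the pipe
    }

    func main() {
    	if err := run(); err != nil {
    		fmt.Println("error:", err)
    	}
    }
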
diff --git a/src/runtime/signal_darwin_amd64.go b/src/runtime/signal_darwin_amd64.go
index abc212a..20544d8 100644
--- a/src/runtime/signal_darwin_amd64.go
+++ b/src/runtime/signal_darwin_amd64.go
@@ -84,6 +84,10 @@
 		// in real life, people will probably search for it and find this code.
 		// There are no Google hits for b01dfacedebac1e or 0xb01dfacedebac1e
 		// as I type this comment.
+		//
+		// Note: if this code is removed, please consider
+		// enabling TestSignalForwardingGo for darwin-amd64 in
+		// misc/cgo/testcarchive/carchive_test.go.
 		if c.sigcode() == _SI_USER {
 			c.set_sigcode(_SI_USER + 1)
 			c.set_sigaddr(0xb01dfacedebac1e)
diff --git a/src/runtime/signal_freebsd_riscv64.go b/src/runtime/signal_freebsd_riscv64.go
new file mode 100644
index 0000000..fbf6c63
--- /dev/null
+++ b/src/runtime/signal_freebsd_riscv64.go
@@ -0,0 +1,63 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) regs() *mcontext { return &(*ucontext)(c.ctxt).uc_mcontext }
+
+func (c *sigctxt) ra() uint64  { return c.regs().mc_gpregs.gp_ra }
+func (c *sigctxt) sp() uint64  { return c.regs().mc_gpregs.gp_sp }
+func (c *sigctxt) gp() uint64  { return c.regs().mc_gpregs.gp_gp }
+func (c *sigctxt) tp() uint64  { return c.regs().mc_gpregs.gp_tp }
+func (c *sigctxt) t0() uint64  { return c.regs().mc_gpregs.gp_t[0] }
+func (c *sigctxt) t1() uint64  { return c.regs().mc_gpregs.gp_t[1] }
+func (c *sigctxt) t2() uint64  { return c.regs().mc_gpregs.gp_t[2] }
+func (c *sigctxt) s0() uint64  { return c.regs().mc_gpregs.gp_s[0] }
+func (c *sigctxt) s1() uint64  { return c.regs().mc_gpregs.gp_s[1] }
+func (c *sigctxt) a0() uint64  { return c.regs().mc_gpregs.gp_a[0] }
+func (c *sigctxt) a1() uint64  { return c.regs().mc_gpregs.gp_a[1] }
+func (c *sigctxt) a2() uint64  { return c.regs().mc_gpregs.gp_a[2] }
+func (c *sigctxt) a3() uint64  { return c.regs().mc_gpregs.gp_a[3] }
+func (c *sigctxt) a4() uint64  { return c.regs().mc_gpregs.gp_a[4] }
+func (c *sigctxt) a5() uint64  { return c.regs().mc_gpregs.gp_a[5] }
+func (c *sigctxt) a6() uint64  { return c.regs().mc_gpregs.gp_a[6] }
+func (c *sigctxt) a7() uint64  { return c.regs().mc_gpregs.gp_a[7] }
+func (c *sigctxt) s2() uint64  { return c.regs().mc_gpregs.gp_s[2] }
+func (c *sigctxt) s3() uint64  { return c.regs().mc_gpregs.gp_s[3] }
+func (c *sigctxt) s4() uint64  { return c.regs().mc_gpregs.gp_s[4] }
+func (c *sigctxt) s5() uint64  { return c.regs().mc_gpregs.gp_s[5] }
+func (c *sigctxt) s6() uint64  { return c.regs().mc_gpregs.gp_s[6] }
+func (c *sigctxt) s7() uint64  { return c.regs().mc_gpregs.gp_s[7] }
+func (c *sigctxt) s8() uint64  { return c.regs().mc_gpregs.gp_s[8] }
+func (c *sigctxt) s9() uint64  { return c.regs().mc_gpregs.gp_s[9] }
+func (c *sigctxt) s10() uint64 { return c.regs().mc_gpregs.gp_s[10] }
+func (c *sigctxt) s11() uint64 { return c.regs().mc_gpregs.gp_s[11] }
+func (c *sigctxt) t3() uint64  { return c.regs().mc_gpregs.gp_t[3] }
+func (c *sigctxt) t4() uint64  { return c.regs().mc_gpregs.gp_t[4] }
+func (c *sigctxt) t5() uint64  { return c.regs().mc_gpregs.gp_t[5] }
+func (c *sigctxt) t6() uint64  { return c.regs().mc_gpregs.gp_t[6] }
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) pc() uint64 { return c.regs().mc_gpregs.gp_sepc }
+
+func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_pc(x uint64) { c.regs().mc_gpregs.gp_sepc = x }
+func (c *sigctxt) set_ra(x uint64) { c.regs().mc_gpregs.gp_ra = x }
+func (c *sigctxt) set_sp(x uint64) { c.regs().mc_gpregs.gp_sp = x }
+func (c *sigctxt) set_gp(x uint64) { c.regs().mc_gpregs.gp_gp = x }
+
+func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) { c.info.si_addr = x }
diff --git a/src/runtime/signal_riscv64.go b/src/runtime/signal_riscv64.go
index 5eeb227..b8d7b97 100644
--- a/src/runtime/signal_riscv64.go
+++ b/src/runtime/signal_riscv64.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build linux && riscv64
+//go:build (linux || freebsd) && riscv64
 
 package runtime
 
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index 0be499b..c401fc1 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -349,11 +349,11 @@
 	}
 
 	// Acknowledge the preemption.
-	atomic.Xadd(&gp.m.preemptGen, 1)
-	atomic.Store(&gp.m.signalPending, 0)
+	gp.m.preemptGen.Add(1)
+	gp.m.signalPending.Store(0)
 
 	if GOOS == "darwin" || GOOS == "ios" {
-		atomic.Xadd(&pendingPreemptSignals, -1)
+		pendingPreemptSignals.Add(-1)
 	}
 }
 
@@ -372,9 +372,9 @@
 		execLock.rlock()
 	}
 
-	if atomic.Cas(&mp.signalPending, 0, 1) {
+	if mp.signalPending.CompareAndSwap(0, 1) {
 		if GOOS == "darwin" || GOOS == "ios" {
-			atomic.Xadd(&pendingPreemptSignals, 1)
+			pendingPreemptSignals.Add(1)
 		}
 
 		// If multiple threads are preempting the same M, it may send many
@@ -433,9 +433,9 @@
 		return
 	}
 	c := &sigctxt{info, ctx}
-	g := sigFetchG(c)
-	setg(g)
-	if g == nil {
+	gp := sigFetchG(c)
+	setg(gp)
+	if gp == nil {
 		if sig == _SIGPROF {
 			// Some platforms (Linux) have per-thread timers, which we use in
 			// combination with the process-wide timer. Avoid double-counting.
@@ -453,7 +453,7 @@
 			// The default behavior for sigPreempt is to ignore
 			// the signal, so badsignal will be a no-op anyway.
 			if GOOS == "darwin" || GOOS == "ios" {
-				atomic.Xadd(&pendingPreemptSignals, -1)
+				pendingPreemptSignals.Add(-1)
 			}
 			return
 		}
@@ -462,22 +462,22 @@
 		return
 	}
 
-	setg(g.m.gsignal)
+	setg(gp.m.gsignal)
 
 	// If some non-Go code called sigaltstack, adjust.
 	var gsignalStack gsignalStack
-	setStack := adjustSignalStack(sig, g.m, &gsignalStack)
+	setStack := adjustSignalStack(sig, gp.m, &gsignalStack)
 	if setStack {
-		g.m.gsignal.stktopsp = getcallersp()
+		gp.m.gsignal.stktopsp = getcallersp()
 	}
 
-	if g.stackguard0 == stackFork {
+	if gp.stackguard0 == stackFork {
 		signalDuringFork(sig)
 	}
 
 	c.fixsigcode(sig)
-	sighandler(sig, info, ctx, g)
-	setg(g)
+	sighandler(sig, info, ctx, gp)
+	setg(gp)
 	if setStack {
 		restoreGsignalStack(&gsignalStack)
 	}
@@ -502,7 +502,7 @@
 //go:nosplit
 //go:nowritebarrierrec
 func sigprofNonGo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
-	if prof.hz != 0 {
+	if prof.hz.Load() != 0 {
 		c := &sigctxt{info, ctx}
 		// Some platforms (Linux) have per-thread timers, which we use in
 		// combination with the process-wide timer. Avoid double-counting.
@@ -525,7 +525,7 @@
 //go:nosplit
 //go:nowritebarrierrec
 func sigprofNonGoPC(pc uintptr) {
-	if prof.hz != 0 {
+	if prof.hz.Load() != 0 {
 		stk := []uintptr{
 			pc,
 			abi.FuncPCABIInternal(_ExternalCode) + sys.PCQuantum,
@@ -596,7 +596,7 @@
 
 // sighandler is invoked when a signal occurs. The global g will be
 // set to a gsignal goroutine and we will be running on the alternate
-// signal stack. The parameter g will be the value of the global g
+// signal stack. The parameter gp will be the value of the global g
 // when the signal occurred. The sig, info, and ctxt parameters are
 // from the system signal handler: they are the parameters passed when
 // the SA is passed to the sigaction system call.
@@ -606,9 +606,11 @@
 //
 //go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
-	_g_ := getg()
+	// The g executing the signal handler. This is almost always
+	// mp.gsignal. See delayedSignal for an exception.
+	gsignal := getg()
+	mp := gsignal.m
 	c := &sigctxt{info, ctxt}
-	mp := _g_.m
 
 	// Cgo TSAN (not the Go race detector) intercepts signals and calls the
 	// signal handler at a later time. When the signal handler is called, the
@@ -620,7 +622,7 @@
 	// signal delivery. We use that as an indicator of delayed signals.
 	// For delayed signals, the handler is called on the g0 stack (see
 	// adjustSignalStack).
-	delayedSignal := *cgo_yield != nil && mp != nil && _g_.stack == mp.g0.stack
+	delayedSignal := *cgo_yield != nil && mp != nil && gsignal.stack == mp.g0.stack
 
 	if sig == _SIGPROF {
 		// Some platforms (Linux) have per-thread timers, which we use in
@@ -660,7 +662,7 @@
 	if sig < uint32(len(sigtable)) {
 		flags = sigtable[sig].flags
 	}
-	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 && gp.throwsplit {
+	if !c.sigFromUser() && flags&_SigPanic != 0 && gp.throwsplit {
 		// We can't safely sigpanic because it may grow the
 		// stack. Abort in the signal handler instead.
 		flags = _SigThrow
@@ -670,7 +672,7 @@
 		// causes a memory fault. Don't turn that into a panic.
 		flags = _SigThrow
 	}
-	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
+	if !c.sigFromUser() && flags&_SigPanic != 0 {
 		// The signal is going to cause a panic.
 		// Arrange the stack so that it looks like the point
 		// where the signal occurred made a call to the
@@ -688,13 +690,13 @@
 		return
 	}
 
-	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
+	if c.sigFromUser() || flags&_SigNotify != 0 {
 		if sigsend(sig) {
 			return
 		}
 	}
 
-	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+	if c.sigFromUser() && signal_ignored(sig) {
 		return
 	}
 
@@ -704,14 +706,14 @@
 
 	// _SigThrow means that we should exit now.
 	// If we get here with _SigPanic, it means that the signal
-	// was sent to us by a program (c.sigcode() == _SI_USER);
+	// was sent to us by a program (c.sigFromUser() is true);
 	// in that case, if we didn't handle it in sigsend, we exit now.
 	if flags&(_SigThrow|_SigPanic) == 0 {
 		return
 	}
 
-	_g_.m.throwing = throwTypeRuntime
-	_g_.m.caughtsig.set(gp)
+	mp.throwing = throwTypeRuntime
+	mp.caughtsig.set(gp)
 
 	if crashing == 0 {
 		startpanic_m()
@@ -723,12 +725,12 @@
 		print("Signal ", sig, "\n")
 	}
 
-	print("PC=", hex(c.sigpc()), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n")
-	if _g_.m.incgo && gp == _g_.m.g0 && _g_.m.curg != nil {
+	print("PC=", hex(c.sigpc()), " m=", mp.id, " sigcode=", c.sigcode(), "\n")
+	if mp.incgo && gp == mp.g0 && mp.curg != nil {
 		print("signal arrived during cgo execution\n")
 		// Switch to curg so that we get a traceback of the Go code
 		// leading up to the cgocall, which switched from curg to g0.
-		gp = _g_.m.curg
+		gp = mp.curg
 	}
 	if sig == _SIGILL || sig == _SIGFPE {
 		// It would be nice to know how long the instruction is.
@@ -760,10 +762,10 @@
 	if level > 0 {
 		goroutineheader(gp)
 		tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp)
-		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
+		if crashing > 0 && gp != mp.curg && mp.curg != nil && readgstatus(mp.curg)&^_Gscan == _Grunning {
 			// tracebackothers on original m skipped this one; trace it now.
-			goroutineheader(_g_.m.curg)
-			traceback(^uintptr(0), ^uintptr(0), 0, _g_.m.curg)
+			goroutineheader(mp.curg)
+			traceback(^uintptr(0), ^uintptr(0), 0, mp.curg)
 		} else if crashing == 0 {
 			tracebackothers(gp)
 			print("\n")
@@ -814,34 +816,41 @@
 //
 //go:linkname sigpanic
 func sigpanic() {
-	g := getg()
-	if !canpanic(g) {
+	gp := getg()
+	if !canpanic() {
 		throw("unexpected signal during runtime execution")
 	}
 
-	switch g.sig {
+	switch gp.sig {
 	case _SIGBUS:
-		if g.sigcode0 == _BUS_ADRERR && g.sigcode1 < 0x1000 {
+		if gp.sigcode0 == _BUS_ADRERR && gp.sigcode1 < 0x1000 {
 			panicmem()
 		}
 		// Support runtime/debug.SetPanicOnFault.
-		if g.paniconfault {
-			panicmemAddr(g.sigcode1)
+		if gp.paniconfault {
+			panicmemAddr(gp.sigcode1)
 		}
-		print("unexpected fault address ", hex(g.sigcode1), "\n")
+		print("unexpected fault address ", hex(gp.sigcode1), "\n")
 		throw("fault")
 	case _SIGSEGV:
-		if (g.sigcode0 == 0 || g.sigcode0 == _SEGV_MAPERR || g.sigcode0 == _SEGV_ACCERR) && g.sigcode1 < 0x1000 {
+		if (gp.sigcode0 == 0 || gp.sigcode0 == _SEGV_MAPERR || gp.sigcode0 == _SEGV_ACCERR) && gp.sigcode1 < 0x1000 {
 			panicmem()
 		}
 		// Support runtime/debug.SetPanicOnFault.
-		if g.paniconfault {
-			panicmemAddr(g.sigcode1)
+		if gp.paniconfault {
+			panicmemAddr(gp.sigcode1)
 		}
-		print("unexpected fault address ", hex(g.sigcode1), "\n")
+		if inUserArenaChunk(gp.sigcode1) {
+			// We could check that the arena chunk is explicitly set to fault,
+			// but the fact that we faulted on accessing it is enough to prove
+			// that it is.
+			print("accessed data from freed user arena ", hex(gp.sigcode1), "\n")
+		} else {
+			print("unexpected fault address ", hex(gp.sigcode1), "\n")
+		}
 		throw("fault")
 	case _SIGFPE:
-		switch g.sigcode0 {
+		switch gp.sigcode0 {
 		case _FPE_INTDIV:
 			panicdivide()
 		case _FPE_INTOVF:
@@ -850,11 +859,11 @@
 		panicfloat()
 	}
 
-	if g.sig >= uint32(len(sigtable)) {
-		// can't happen: we looked up g.sig in sigtable to decide to call sigpanic
+	if gp.sig >= uint32(len(sigtable)) {
+		// can't happen: we looked up gp.sig in sigtable to decide to call sigpanic
 		throw("unexpected signal value")
 	}
-	panic(errorString(sigtable[g.sig].name))
+	panic(errorString(sigtable[gp.sig].name))
 }
 
 // dieFromSignal kills the program with a signal.
@@ -927,7 +936,7 @@
 	//
 	// On FreeBSD, the libthr sigaction code prevents
 	// this from working so we fall through to raise.
-	if GOOS != "freebsd" && (isarchive || islibrary) && handler == _SIG_DFL && c.sigcode() != _SI_USER {
+	if GOOS != "freebsd" && (isarchive || islibrary) && handler == _SIG_DFL && !c.sigFromUser() {
 		return
 	}
 
@@ -1030,8 +1039,6 @@
 	throw("signal received during fork")
 }
 
-var badginsignalMsg = "fatal: bad g in signal handler\n"
-
 // This runs on a foreign stack, without an m or a g. No stack split.
 //
 //go:nosplit
@@ -1042,8 +1049,7 @@
 		// There is no extra M. needm will not be able to grab
 		// an M. Instead of hanging, just crash.
 		// Cannot call split-stack function as there is no G.
-		s := stringStructOf(&badginsignalMsg)
-		write(2, s.str, int32(s.len))
+		writeErrStr("fatal: bad g in signal handler\n")
 		exit(2)
 		*(*uintptr)(unsafe.Pointer(uintptr(123))) = 2
 	}
@@ -1108,15 +1114,15 @@
 	// Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code
 	// is set to _SI_USER even for a SIGPIPE raised from a write to a closed socket
 	// or pipe.
-	if (c.sigcode() == _SI_USER || flags&_SigPanic == 0) && sig != _SIGPIPE {
+	if (c.sigFromUser() || flags&_SigPanic == 0) && sig != _SIGPIPE {
 		return false
 	}
 	// Determine if the signal occurred inside Go code. We test that:
 	//   (1) we weren't in VDSO page,
 	//   (2) we were in a goroutine (i.e., m.curg != nil), and
 	//   (3) we weren't in CGO.
-	g := sigFetchG(c)
-	if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo {
+	gp := sigFetchG(c)
+	if gp != nil && gp.m != nil && gp.m.curg != nil && !gp.m.incgo {
 		return false
 	}
 
@@ -1207,15 +1213,15 @@
 // of whether it is already set). Record which choice was made in
 // newSigstack, so that it can be undone in unminit.
 func minitSignalStack() {
-	_g_ := getg()
+	mp := getg().m
 	var st stackt
 	sigaltstack(nil, &st)
 	if st.ss_flags&_SS_DISABLE != 0 || !iscgo {
-		signalstack(&_g_.m.gsignal.stack)
-		_g_.m.newSigstack = true
+		signalstack(&mp.gsignal.stack)
+		mp.newSigstack = true
 	} else {
-		setGsignalStack(&st, &_g_.m.goSigStack)
-		_g_.m.newSigstack = false
+		setGsignalStack(&st, &mp.goSigStack)
+		mp.newSigstack = false
 	}
 }
 
@@ -1297,18 +1303,18 @@
 //go:nosplit
 //go:nowritebarrierrec
 func setGsignalStack(st *stackt, old *gsignalStack) {
-	g := getg()
+	gp := getg()
 	if old != nil {
-		old.stack = g.m.gsignal.stack
-		old.stackguard0 = g.m.gsignal.stackguard0
-		old.stackguard1 = g.m.gsignal.stackguard1
-		old.stktopsp = g.m.gsignal.stktopsp
+		old.stack = gp.m.gsignal.stack
+		old.stackguard0 = gp.m.gsignal.stackguard0
+		old.stackguard1 = gp.m.gsignal.stackguard1
+		old.stktopsp = gp.m.gsignal.stktopsp
 	}
 	stsp := uintptr(unsafe.Pointer(st.ss_sp))
-	g.m.gsignal.stack.lo = stsp
-	g.m.gsignal.stack.hi = stsp + st.ss_size
-	g.m.gsignal.stackguard0 = stsp + _StackGuard
-	g.m.gsignal.stackguard1 = stsp + _StackGuard
+	gp.m.gsignal.stack.lo = stsp
+	gp.m.gsignal.stack.hi = stsp + st.ss_size
+	gp.m.gsignal.stackguard0 = stsp + _StackGuard
+	gp.m.gsignal.stackguard1 = stsp + _StackGuard
 }
 
 // restoreGsignalStack restores the gsignal stack to the value it had
@@ -1340,9 +1346,9 @@
 //go:nosplit
 //go:linkname setsigsegv
 func setsigsegv(pc uintptr) {
-	g := getg()
-	g.sig = _SIGSEGV
-	g.sigpc = pc
-	g.sigcode0 = _SEGV_MAPERR
-	g.sigcode1 = 0 // TODO: emulate si_addr
+	gp := getg()
+	gp.sig = _SIGSEGV
+	gp.sigpc = pc
+	gp.sigcode0 = _SEGV_MAPERR
+	gp.sigcode1 = 0 // TODO: emulate si_addr
 }
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index c5cf38c..37986cd 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -199,35 +199,37 @@
 	return 0 // not reached
 }
 
+// Always called on g0. gp is the G where the exception occurred.
+//
 //go:nosplit
 func winthrow(info *exceptionrecord, r *context, gp *g) {
-	_g_ := getg()
+	g0 := getg()
 
-	if panicking != 0 { // traceback already printed
+	if panicking.Load() != 0 { // traceback already printed
 		exit(2)
 	}
-	panicking = 1
+	panicking.Store(1)
 
 	// In case we're handling a g0 stack overflow, blow away the
 	// g0 stack bounds so we have room to print the traceback. If
 	// this somehow overflows the stack, the OS will trap it.
-	_g_.stack.lo = 0
-	_g_.stackguard0 = _g_.stack.lo + _StackGuard
-	_g_.stackguard1 = _g_.stackguard0
+	g0.stack.lo = 0
+	g0.stackguard0 = g0.stack.lo + _StackGuard
+	g0.stackguard1 = g0.stackguard0
 
 	print("Exception ", hex(info.exceptioncode), " ", hex(info.exceptioninformation[0]), " ", hex(info.exceptioninformation[1]), " ", hex(r.ip()), "\n")
 
 	print("PC=", hex(r.ip()), "\n")
-	if _g_.m.incgo && gp == _g_.m.g0 && _g_.m.curg != nil {
+	if g0.m.incgo && gp == g0.m.g0 && g0.m.curg != nil {
 		if iscgo {
 			print("signal arrived during external code execution\n")
 		}
-		gp = _g_.m.curg
+		gp = g0.m.curg
 	}
 	print("\n")
 
-	_g_.m.throwing = throwTypeRuntime
-	_g_.m.caughtsig.set(gp)
+	g0.m.throwing = throwTypeRuntime
+	g0.m.caughtsig.set(gp)
 
 	level, _, docrash := gotraceback()
 	if level > 0 {
@@ -244,20 +246,27 @@
 }
 
 func sigpanic() {
-	g := getg()
-	if !canpanic(g) {
+	gp := getg()
+	if !canpanic() {
 		throw("unexpected signal during runtime execution")
 	}
 
-	switch g.sig {
+	switch gp.sig {
 	case _EXCEPTION_ACCESS_VIOLATION:
-		if g.sigcode1 < 0x1000 {
+		if gp.sigcode1 < 0x1000 {
 			panicmem()
 		}
-		if g.paniconfault {
-			panicmemAddr(g.sigcode1)
+		if gp.paniconfault {
+			panicmemAddr(gp.sigcode1)
 		}
-		print("unexpected fault address ", hex(g.sigcode1), "\n")
+		if inUserArenaChunk(gp.sigcode1) {
+			// We could check that the arena chunk is explicitly set to fault,
+			// but the fact that we faulted on accessing it is enough to prove
+			// that it is.
+			print("accessed data from freed user arena ", hex(gp.sigcode1), "\n")
+		} else {
+			print("unexpected fault address ", hex(gp.sigcode1), "\n")
+		}
 		throw("fault")
 	case _EXCEPTION_INT_DIVIDE_BY_ZERO:
 		panicdivide()
diff --git a/src/runtime/signal_windows_test.go b/src/runtime/signal_windows_test.go
index add23cd..c9b8e90 100644
--- a/src/runtime/signal_windows_test.go
+++ b/src/runtime/signal_windows_test.go
@@ -1,4 +1,6 @@
-//go:build windows
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
 
 package runtime_test
 
@@ -15,6 +17,64 @@
 	"testing"
 )
 
+func TestVectoredHandlerExceptionInNonGoThread(t *testing.T) {
+	if *flagQuick {
+		t.Skip("-quick")
+	}
+	if strings.HasPrefix(testenv.Builder(), "windows-amd64-2012") {
+		testenv.SkipFlaky(t, 49681)
+	}
+	testenv.MustHaveGoBuild(t)
+	testenv.MustHaveCGO(t)
+	testenv.MustHaveExecPath(t, "gcc")
+	testprog.Lock()
+	defer testprog.Unlock()
+	dir := t.TempDir()
+
+	// build c program
+	dll := filepath.Join(dir, "veh.dll")
+	cmd := exec.Command("gcc", "-shared", "-o", dll, "testdata/testwinlibthrow/veh.c")
+	out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build c dll: %s\n%s", err, out)
+	}
+
+	// build go exe
+	exe := filepath.Join(dir, "test.exe")
+	cmd = exec.Command(testenv.GoToolPath(t), "build", "-o", exe, "testdata/testwinlibthrow/main.go")
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build go exe: %s\n%s", err, out)
+	}
+
+	// run test program in same thread
+	cmd = exec.Command(exe)
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err == nil {
+		t.Fatal("error expected")
+	}
+	if _, ok := err.(*exec.ExitError); ok && len(out) > 0 {
+		if !bytes.Contains(out, []byte("Exception 0x2a")) {
+			t.Fatalf("unexpected failure while running executable: %s\n%s", err, out)
+		}
+	} else {
+		t.Fatalf("unexpected error while running executable: %s\n%s", err, out)
+	}
+	// run test program in a new thread
+	cmd = exec.Command(exe, "thread")
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err == nil {
+		t.Fatal("error expected")
+	}
+	if err, ok := err.(*exec.ExitError); ok {
+		if err.ExitCode() != 42 {
+			t.Fatalf("unexpected failure while running executable: %s\n%s", err, out)
+		}
+	} else {
+		t.Fatalf("unexpected error while running executable: %s\n%s", err, out)
+	}
+}
+
 func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) {
 	if *flagQuick {
 		t.Skip("-quick")
@@ -91,8 +151,8 @@
 
 	// run test program
 	cmd = exec.Command(exe)
-	var stdout bytes.Buffer
-	var stderr bytes.Buffer
+	var stdout strings.Builder
+	var stderr strings.Builder
 	cmd.Stdout = &stdout
 	cmd.Stderr = &stderr
 	inPipe, err := cmd.StdinPipe()
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 49502cb..51e424d 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -54,8 +54,8 @@
 	wanted     [(_NSIG + 31) / 32]uint32
 	ignored    [(_NSIG + 31) / 32]uint32
 	recv       [(_NSIG + 31) / 32]uint32
-	state      uint32
-	delivering uint32
+	state      atomic.Uint32
+	delivering atomic.Uint32
 	inuse      bool
 }
 
@@ -74,11 +74,11 @@
 		return false
 	}
 
-	atomic.Xadd(&sig.delivering, 1)
+	sig.delivering.Add(1)
 	// We are running in the signal handler; defer is not available.
 
 	if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 {
-		atomic.Xadd(&sig.delivering, -1)
+		sig.delivering.Add(-1)
 		return false
 	}
 
@@ -86,7 +86,7 @@
 	for {
 		mask := sig.mask[s/32]
 		if mask&bit != 0 {
-			atomic.Xadd(&sig.delivering, -1)
+			sig.delivering.Add(-1)
 			return true // signal already in queue
 		}
 		if atomic.Cas(&sig.mask[s/32], mask, mask|bit) {
@@ -97,18 +97,18 @@
 	// Notify receiver that queue has new bit.
 Send:
 	for {
-		switch atomic.Load(&sig.state) {
+		switch sig.state.Load() {
 		default:
 			throw("sigsend: inconsistent state")
 		case sigIdle:
-			if atomic.Cas(&sig.state, sigIdle, sigSending) {
+			if sig.state.CompareAndSwap(sigIdle, sigSending) {
 				break Send
 			}
 		case sigSending:
 			// notification already pending
 			break Send
 		case sigReceiving:
-			if atomic.Cas(&sig.state, sigReceiving, sigIdle) {
+			if sig.state.CompareAndSwap(sigReceiving, sigIdle) {
 				if GOOS == "darwin" || GOOS == "ios" {
 					sigNoteWakeup(&sig.note)
 					break Send
@@ -119,7 +119,7 @@
 		}
 	}
 
-	atomic.Xadd(&sig.delivering, -1)
+	sig.delivering.Add(-1)
 	return true
 }
 
@@ -140,11 +140,11 @@
 		// Wait for updates to be available from signal sender.
 	Receive:
 		for {
-			switch atomic.Load(&sig.state) {
+			switch sig.state.Load() {
 			default:
 				throw("signal_recv: inconsistent state")
 			case sigIdle:
-				if atomic.Cas(&sig.state, sigIdle, sigReceiving) {
+				if sig.state.CompareAndSwap(sigIdle, sigReceiving) {
 					if GOOS == "darwin" || GOOS == "ios" {
 						sigNoteSleep(&sig.note)
 						break Receive
@@ -154,7 +154,7 @@
 					break Receive
 				}
 			case sigSending:
-				if atomic.Cas(&sig.state, sigSending, sigIdle) {
+				if sig.state.CompareAndSwap(sigSending, sigIdle) {
 					break Receive
 				}
 			}
@@ -182,14 +182,14 @@
 	// a signal, has read from sig.wanted, is now updating sig.mask,
 	// and has not yet woken up the processor thread. We need to wait
 	// until all current signal deliveries have completed.
-	for atomic.Load(&sig.delivering) != 0 {
+	for sig.delivering.Load() != 0 {
 		Gosched()
 	}
 
 	// Although WaitUntilIdle seems like the right name for this
 	// function, the state we are looking for is sigReceiving, not
 	// sigIdle.  The sigIdle state is really more like sigProcessing.
-	for atomic.Load(&sig.state) != sigReceiving {
+	for sig.state.Load() != sigReceiving {
 		Gosched()
 	}
 }
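
The sigqueue changes are a mechanical conversion from the free-function atomics (atomic.Load, atomic.Xadd, atomic.Cas) to the typed atomic.Uint32 wrappers and their Load/Add/CompareAndSwap methods. The same wrappers exist in the public sync/atomic package (Go 1.19+); the sketch below reproduces the sender side of the Idle/Sending/Receiving handshake with them, using illustrative constant values rather than the runtime's own:

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    // Illustrative states mirroring the sender/receiver handshake above
    // (the values are assumptions for this sketch, not the runtime's).
    const (
    	sigIdle uint32 = iota
    	sigSending
    	sigReceiving
    )

    var state atomic.Uint32 // zero value is sigIdle

    // trySend mirrors the Send loop above: flip Idle to Sending, or wake a
    // parked receiver. It reports whether a wakeup is needed.
    func trySend() bool {
    	for {
    		switch state.Load() {
    		case sigIdle:
    			if state.CompareAndSwap(sigIdle, sigSending) {
    				return false // notification now pending
    			}
    		case sigSending:
    			return false // already pending
    		case sigReceiving:
    			if state.CompareAndSwap(sigReceiving, sigIdle) {
    				return true // receiver is parked; caller must wake it
    			}
    		}
    	}
    }

    func main() {
    	fmt.Println(trySend()) // false: idle -> sending
    }
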
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 2413a46..459dc88 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -18,7 +18,7 @@
 	cap   int
 }
 
-// A notInHeapSlice is a slice backed by go:notinheap memory.
+// A notInHeapSlice is a slice backed by runtime/internal/sys.NotInHeap memory.
 type notInHeapSlice struct {
 	array *notInHeap
 	len   int
@@ -123,92 +123,72 @@
 	return math.MulUintptr(a, b)
 }
 
-// Keep this code in sync with cmd/compile/internal/walk/builtin.go:walkUnsafeSlice
-func unsafeslice(et *_type, ptr unsafe.Pointer, len int) {
-	if len < 0 {
-		panicunsafeslicelen()
-	}
-
-	mem, overflow := math.MulUintptr(et.size, uintptr(len))
-	if overflow || mem > -uintptr(ptr) {
-		if ptr == nil {
-			panicunsafeslicenilptr()
-		}
-		panicunsafeslicelen()
-	}
-}
-
-// Keep this code in sync with cmd/compile/internal/walk/builtin.go:walkUnsafeSlice
-func unsafeslice64(et *_type, ptr unsafe.Pointer, len64 int64) {
-	len := int(len64)
-	if int64(len) != len64 {
-		panicunsafeslicelen()
-	}
-	unsafeslice(et, ptr, len)
-}
-
-func unsafeslicecheckptr(et *_type, ptr unsafe.Pointer, len64 int64) {
-	unsafeslice64(et, ptr, len64)
-
-	// Check that underlying array doesn't straddle multiple heap objects.
-	// unsafeslice64 has already checked for overflow.
-	if checkptrStraddles(ptr, uintptr(len64)*et.size) {
-		throw("checkptr: unsafe.Slice result straddles multiple allocations")
-	}
-}
-
-func panicunsafeslicelen() {
-	panic(errorString("unsafe.Slice: len out of range"))
-}
-
-func panicunsafeslicenilptr() {
-	panic(errorString("unsafe.Slice: ptr is nil and len is not zero"))
-}
-
-// growslice handles slice growth during append.
-// It is passed the slice element type, the old slice, and the desired new minimum capacity,
-// and it returns a new slice with at least that capacity, with the old data
-// copied into it.
-// The new slice's length is set to the old slice's length,
-// NOT to the new requested capacity.
-// This is for codegen convenience. The old slice's length is used immediately
-// to calculate where to write new values during an append.
-// TODO: When the old backend is gone, reconsider this decision.
-// The SSA backend might prefer the new length or to return only ptr/cap and save stack space.
-func growslice(et *_type, old slice, cap int) slice {
+// growslice allocates new backing store for a slice.
+//
+// arguments:
+//
+//	oldPtr = pointer to the slice's backing array
+//	newLen = new length (= oldLen + num)
+//	oldCap = original slice's capacity.
+//	   num = number of elements being added
+//	    et = element type
+//
+// return values:
+//
+//	newPtr = pointer to the new backing store
+//	newLen = same value as the argument
+//	newCap = capacity of the new backing store
+//
+// Requires that uint(newLen) > uint(oldCap).
+// Assumes the original slice length is newLen - num
+//
+// A new backing store is allocated with space for at least newLen elements.
+// Existing entries [0, oldLen) are copied over to the new backing store.
+// Added entries [oldLen, newLen) are not initialized by growslice
+// (although for pointer-containing element types, they are zeroed). They
+// must be initialized by the caller.
+// Trailing entries [newLen, newCap) are zeroed.
+//
+// growslice's odd calling convention makes the generated code that calls
+// this function simpler. In particular, it accepts and returns the
+// new length so that the old length is not live (does not need to be
+// spilled/restored) and the new length is returned (also does not need
+// to be spilled/restored).
+func growslice(oldPtr unsafe.Pointer, newLen, oldCap, num int, et *_type) slice {
+	oldLen := newLen - num
 	if raceenabled {
 		callerpc := getcallerpc()
-		racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, abi.FuncPCABIInternal(growslice))
+		racereadrangepc(oldPtr, uintptr(oldLen*int(et.size)), callerpc, abi.FuncPCABIInternal(growslice))
 	}
 	if msanenabled {
-		msanread(old.array, uintptr(old.len*int(et.size)))
+		msanread(oldPtr, uintptr(oldLen*int(et.size)))
 	}
 	if asanenabled {
-		asanread(old.array, uintptr(old.len*int(et.size)))
+		asanread(oldPtr, uintptr(oldLen*int(et.size)))
 	}
 
-	if cap < old.cap {
-		panic(errorString("growslice: cap out of range"))
+	if newLen < 0 {
+		panic(errorString("growslice: len out of range"))
 	}
 
 	if et.size == 0 {
 		// append should not create a slice with nil pointer but non-zero len.
-		// We assume that append doesn't need to preserve old.array in this case.
-		return slice{unsafe.Pointer(&zerobase), old.len, cap}
+		// We assume that append doesn't need to preserve oldPtr in this case.
+		return slice{unsafe.Pointer(&zerobase), newLen, newLen}
 	}
 
-	newcap := old.cap
+	newcap := oldCap
 	doublecap := newcap + newcap
-	if cap > doublecap {
-		newcap = cap
+	if newLen > doublecap {
+		newcap = newLen
 	} else {
 		const threshold = 256
-		if old.cap < threshold {
+		if oldCap < threshold {
 			newcap = doublecap
 		} else {
 			// Check 0 < newcap to detect overflow
 			// and prevent an infinite loop.
-			for 0 < newcap && newcap < cap {
+			for 0 < newcap && newcap < newLen {
 				// Transition from growing 2x for small slices
 				// to growing 1.25x for large slices. This formula
 				// gives a smooth-ish transition between the two.
@@ -217,7 +197,7 @@
 			// Set newcap to the requested cap when
 			// the newcap calculation overflowed.
 			if newcap <= 0 {
-				newcap = cap
+				newcap = newLen
 			}
 		}
 	}
@@ -230,14 +210,14 @@
 	// For powers of 2, use a variable shift.
 	switch {
 	case et.size == 1:
-		lenmem = uintptr(old.len)
-		newlenmem = uintptr(cap)
+		lenmem = uintptr(oldLen)
+		newlenmem = uintptr(newLen)
 		capmem = roundupsize(uintptr(newcap))
 		overflow = uintptr(newcap) > maxAlloc
 		newcap = int(capmem)
 	case et.size == goarch.PtrSize:
-		lenmem = uintptr(old.len) * goarch.PtrSize
-		newlenmem = uintptr(cap) * goarch.PtrSize
+		lenmem = uintptr(oldLen) * goarch.PtrSize
+		newlenmem = uintptr(newLen) * goarch.PtrSize
 		capmem = roundupsize(uintptr(newcap) * goarch.PtrSize)
 		overflow = uintptr(newcap) > maxAlloc/goarch.PtrSize
 		newcap = int(capmem / goarch.PtrSize)
@@ -245,21 +225,23 @@
 		var shift uintptr
 		if goarch.PtrSize == 8 {
 			// Mask shift for better code generation.
-			shift = uintptr(sys.Ctz64(uint64(et.size))) & 63
+			shift = uintptr(sys.TrailingZeros64(uint64(et.size))) & 63
 		} else {
-			shift = uintptr(sys.Ctz32(uint32(et.size))) & 31
+			shift = uintptr(sys.TrailingZeros32(uint32(et.size))) & 31
 		}
-		lenmem = uintptr(old.len) << shift
-		newlenmem = uintptr(cap) << shift
+		lenmem = uintptr(oldLen) << shift
+		newlenmem = uintptr(newLen) << shift
 		capmem = roundupsize(uintptr(newcap) << shift)
 		overflow = uintptr(newcap) > (maxAlloc >> shift)
 		newcap = int(capmem >> shift)
+		capmem = uintptr(newcap) << shift
 	default:
-		lenmem = uintptr(old.len) * et.size
-		newlenmem = uintptr(cap) * et.size
+		lenmem = uintptr(oldLen) * et.size
+		newlenmem = uintptr(newLen) * et.size
 		capmem, overflow = math.MulUintptr(et.size, uintptr(newcap))
 		capmem = roundupsize(capmem)
 		newcap = int(capmem / et.size)
+		capmem = uintptr(newcap) * et.size
 	}
 
 	// The check of overflow in addition to capmem > maxAlloc is needed
@@ -276,27 +258,48 @@
 	//   print(len(s), "\n")
 	// }
 	if overflow || capmem > maxAlloc {
-		panic(errorString("growslice: cap out of range"))
+		panic(errorString("growslice: len out of range"))
 	}
 
 	var p unsafe.Pointer
 	if et.ptrdata == 0 {
 		p = mallocgc(capmem, nil, false)
-		// The append() that calls growslice is going to overwrite from old.len to cap (which will be the new length).
+		// The append() that calls growslice is going to overwrite from oldLen to newLen.
 		// Only clear the part that will not be overwritten.
+		// The reflect_growslice() that calls growslice will manually clear
+		// the region not cleared here.
 		memclrNoHeapPointers(add(p, newlenmem), capmem-newlenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
 		p = mallocgc(capmem, et, true)
 		if lenmem > 0 && writeBarrier.enabled {
-			// Only shade the pointers in old.array since we know the destination slice p
+			// Only shade the pointers in oldPtr since we know the destination slice p
 			// only contains nil pointers because it has been cleared during alloc.
-			bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem-et.size+et.ptrdata)
+			bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(oldPtr), lenmem-et.size+et.ptrdata)
 		}
 	}
-	memmove(p, old.array, lenmem)
+	memmove(p, oldPtr, lenmem)
 
-	return slice{p, old.len, newcap}
+	return slice{p, newLen, newcap}
+}
+
+//go:linkname reflect_growslice reflect.growslice
+func reflect_growslice(et *_type, old slice, num int) slice {
+	// Semantically equivalent to slices.Grow, except that the caller
+	// is responsible for ensuring that old.len+num > old.cap.
+	num -= old.cap - old.len // preserve memory of old[old.len:old.cap]
+	new := growslice(old.array, old.cap+num, old.cap, num, et)
+	// growslice does not zero out new[old.cap:new.len] since it assumes that
+	// the memory will be overwritten by an append() that called growslice.
+	// Since the caller of reflect_growslice is not append(),
+	// zero out this region before returning the slice to the reflect package.
+	if et.ptrdata == 0 {
+		oldcapmem := uintptr(old.cap) * et.size
+		newlenmem := uintptr(new.len) * et.size
+		memclrNoHeapPointers(add(new.array, oldcapmem), newlenmem-oldcapmem)
+	}
+	new.len = old.len // preserve the old length
+	return new
 }
 
 func isPowerOfTwo(x uintptr) bool {
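
The growslice rewrite above changes the calling convention, not the growth policy: capacity still doubles below a 256-element threshold and then tapers toward 1.25x. The sketch below isolates that capacity calculation; the 1.25x step uses the formula from recent Go releases (an assumption here, since the exact line falls outside this hunk), and roundupsize plus the overflow checks are omitted:

    package main

    import "fmt"

    // nextCap mirrors the capacity calculation in growslice above, minus
    // the size-class rounding (roundupsize) and overflow handling.
    func nextCap(oldCap, newLen int) int {
    	newcap := oldCap
    	doublecap := newcap + newcap
    	if newLen > doublecap {
    		return newLen
    	}
    	const threshold = 256
    	if oldCap < threshold {
    		return doublecap
    	}
    	for 0 < newcap && newcap < newLen {
    		// Transition from growing 2x for small slices to 1.25x for large ones.
    		newcap += (newcap + 3*threshold) / 4
    	}
    	if newcap <= 0 {
    		return newLen
    	}
    	return newcap
    }

    func main() {
    	for _, c := range []int{4, 256, 1024} {
    		fmt.Printf("cap %4d -> %4d\n", c, nextCap(c, c+1))
    	}
    }
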
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 2a7f0bd..d5e587a 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -98,6 +98,7 @@
 	// The guard leaves enough room for one _StackSmall frame plus
 	// a _StackLimit chain of NOSPLIT calls plus _StackSystem
 	// bytes for the OS.
+	// This arithmetic must match that in cmd/internal/objabi/stack.go:StackLimit.
 	_StackGuard = 928*sys.StackGuardMultiplier + _StackSystem
 
 	// After a stack split check the SP is allowed to be this
@@ -107,6 +108,7 @@
 
 	// The maximum number of bytes that a chain of NOSPLIT
 	// functions can use.
+	// This arithmetic must match that in cmd/internal/objabi/stack.go:StackLimit.
 	_StackLimit = _StackGuard - _StackSystem - _StackSmall
 )
 
@@ -157,11 +159,11 @@
 // There is a free list for each order.
 var stackpool [_NumStackOrders]struct {
 	item stackpoolItem
-	_    [cpu.CacheLinePadSize - unsafe.Sizeof(stackpoolItem{})%cpu.CacheLinePadSize]byte
+	_    [(cpu.CacheLinePadSize - unsafe.Sizeof(stackpoolItem{})%cpu.CacheLinePadSize) % cpu.CacheLinePadSize]byte
 }
 
-//go:notinheap
 type stackpoolItem struct {
+	_    sys.NotInHeap
 	mu   mutex
 	span mSpanList
 }
@@ -564,7 +566,7 @@
 	sghi uintptr
 }
 
-// Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
+// adjustpointer checks whether *vpp is in the old stack described by adjinfo.
 // If so, it rewrites *vpp to point into the new stack.
 func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
 	pp := (*uintptr)(vpp)
@@ -617,7 +619,7 @@
 		}
 		b := *(addb(bv.bytedata, i/8))
 		for b != 0 {
-			j := uintptr(sys.Ctz8(b))
+			j := uintptr(sys.TrailingZeros8(b))
 			b &= b - 1
 			pp := (*uintptr)(add(scanp, (i+j)*goarch.PtrSize))
 		retry:
@@ -664,7 +666,7 @@
 		return true
 	}
 
-	locals, args, objs := getStackMap(frame, &adjinfo.cache, true)
+	locals, args, objs := frame.getStackMap(&adjinfo.cache, true)
 
 	// Adjust local variables if stack frame has been allocated.
 	if locals.n > 0 {
@@ -886,7 +888,7 @@
 	// Adjust sudogs, synchronizing with channel ops if necessary.
 	ncopy := used
 	if !gp.activeStackChans {
-		if newsize < old.hi-old.lo && atomic.Load8(&gp.parkingOnChan) != 0 {
+		if newsize < old.hi-old.lo && gp.parkingOnChan.Load() {
 			// It's not safe for someone to shrink this stack while we're actively
 			// parking on a channel, but it is safe to grow since we do that
 			// ourselves and explicitly don't want to synchronize with channels
@@ -1150,7 +1152,7 @@
 	// We also can't *shrink* the stack in the window between the
 	// goroutine calling gopark to park on a channel and
 	// gp.activeStackChans being set.
-	return gp.syscallsp == 0 && !gp.asyncSafePoint && atomic.Load8(&gp.parkingOnChan) == 0
+	return gp.syscallsp == 0 && !gp.asyncSafePoint && !gp.parkingOnChan.Load()
 }
 
 // Maybe shrink the stack being used by gp.
@@ -1247,147 +1249,6 @@
 	unlock(&stackLarge.lock)
 }
 
-// getStackMap returns the locals and arguments live pointer maps, and
-// stack object list for frame.
-func getStackMap(frame *stkframe, cache *pcvalueCache, debug bool) (locals, args bitvector, objs []stackObjectRecord) {
-	targetpc := frame.continpc
-	if targetpc == 0 {
-		// Frame is dead. Return empty bitvectors.
-		return
-	}
-
-	f := frame.fn
-	pcdata := int32(-1)
-	if targetpc != f.entry() {
-		// Back up to the CALL. If we're at the function entry
-		// point, we want to use the entry map (-1), even if
-		// the first instruction of the function changes the
-		// stack map.
-		targetpc--
-		pcdata = pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
-	}
-	if pcdata == -1 {
-		// We do not have a valid pcdata value but there might be a
-		// stackmap for this function. It is likely that we are looking
-		// at the function prologue, assume so and hope for the best.
-		pcdata = 0
-	}
-
-	// Local variables.
-	size := frame.varp - frame.sp
-	var minsize uintptr
-	switch goarch.ArchFamily {
-	case goarch.ARM64:
-		minsize = sys.StackAlign
-	default:
-		minsize = sys.MinFrameSize
-	}
-	if size > minsize {
-		stackid := pcdata
-		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
-		if stkmap == nil || stkmap.n <= 0 {
-			print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
-			throw("missing stackmap")
-		}
-		// If nbit == 0, there's no work to do.
-		if stkmap.nbit > 0 {
-			if stackid < 0 || stackid >= stkmap.n {
-				// don't know where we are
-				print("runtime: pcdata is ", stackid, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n")
-				throw("bad symbol table")
-			}
-			locals = stackmapdata(stkmap, stackid)
-			if stackDebug >= 3 && debug {
-				print("      locals ", stackid, "/", stkmap.n, " ", locals.n, " words ", locals.bytedata, "\n")
-			}
-		} else if stackDebug >= 3 && debug {
-			print("      no locals to adjust\n")
-		}
-	}
-
-	// Arguments.
-	if frame.arglen > 0 {
-		if frame.argmap != nil {
-			// argmap is set when the function is reflect.makeFuncStub or reflect.methodValueCall.
-			// In this case, arglen specifies how much of the args section is actually live.
-			// (It could be either all the args + results, or just the args.)
-			args = *frame.argmap
-			n := int32(frame.arglen / goarch.PtrSize)
-			if n < args.n {
-				args.n = n // Don't use more of the arguments than arglen.
-			}
-		} else {
-			stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
-			if stackmap == nil || stackmap.n <= 0 {
-				print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(frame.arglen), "\n")
-				throw("missing stackmap")
-			}
-			if pcdata < 0 || pcdata >= stackmap.n {
-				// don't know where we are
-				print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " args stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n")
-				throw("bad symbol table")
-			}
-			if stackmap.nbit > 0 {
-				args = stackmapdata(stackmap, pcdata)
-			}
-		}
-	}
-
-	// stack objects.
-	if (GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "ppc64" || GOARCH == "ppc64le" || GOARCH == "riscv64") &&
-		unsafe.Sizeof(abi.RegArgs{}) > 0 && frame.argmap != nil {
-		// argmap is set when the function is reflect.makeFuncStub or reflect.methodValueCall.
-		// We don't actually use argmap in this case, but we need to fake the stack object
-		// record for these frames which contain an internal/abi.RegArgs at a hard-coded offset.
-		// This offset matches the assembly code on amd64 and arm64.
-		objs = methodValueCallFrameObjs[:]
-	} else {
-		p := funcdata(f, _FUNCDATA_StackObjects)
-		if p != nil {
-			n := *(*uintptr)(p)
-			p = add(p, goarch.PtrSize)
-			r0 := (*stackObjectRecord)(noescape(p))
-			objs = unsafe.Slice(r0, int(n))
-			// Note: the noescape above is needed to keep
-			// getStackMap from "leaking param content:
-			// frame".  That leak propagates up to getgcmask, then
-			// GCMask, then verifyGCInfo, which converts the stack
-			// gcinfo tests into heap gcinfo tests :(
-		}
-	}
-
-	return
-}
-
-var methodValueCallFrameObjs [1]stackObjectRecord // initialized in stackobjectinit
-
-func stkobjinit() {
-	var abiRegArgsEface any = abi.RegArgs{}
-	abiRegArgsType := efaceOf(&abiRegArgsEface)._type
-	if abiRegArgsType.kind&kindGCProg != 0 {
-		throw("abiRegArgsType needs GC Prog, update methodValueCallFrameObjs")
-	}
-	// Set methodValueCallFrameObjs[0].gcdataoff so that
-	// stackObjectRecord.gcdata() will work correctly with it.
-	ptr := uintptr(unsafe.Pointer(&methodValueCallFrameObjs[0]))
-	var mod *moduledata
-	for datap := &firstmoduledata; datap != nil; datap = datap.next {
-		if datap.gofunc <= ptr && ptr < datap.end {
-			mod = datap
-			break
-		}
-	}
-	if mod == nil {
-		throw("methodValueCallFrameObjs is not in a module")
-	}
-	methodValueCallFrameObjs[0] = stackObjectRecord{
-		off:       -int32(alignUp(abiRegArgsType.size, 8)), // It's always the highest address local.
-		size:      int32(abiRegArgsType.size),
-		_ptrdata:  int32(abiRegArgsType.ptrdata),
-		gcdataoff: uint32(uintptr(unsafe.Pointer(abiRegArgsType.gcdata)) - mod.rodata),
-	}
-}
-
 // A stackObjectRecord is generated by the compiler for each stack object in a stack frame.
 // This record must match the generator code in cmd/compile/internal/liveness/plive.go:emitStackObjects.
 type stackObjectRecord struct {
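
The padding tweak near the top of this file's hunks wraps the stackpool pad array length in an extra modulo by cpu.CacheLinePadSize, so an item whose size is already a multiple of the cache line gets zero padding bytes instead of a whole extra line. A quick way to see the difference, assuming a 64-byte cache line for the sake of the example:

    package main

    import "fmt"

    func main() {
    	const pad = 64 // assumed cache-line size for this sketch
    	for _, size := range []uintptr{40, 64, 72} {
    		oldPad := pad - size%pad         // old expression: 64 when size%pad == 0
    		newPad := (pad - size%pad) % pad // new expression: 0 when size%pad == 0
    		fmt.Printf("size %3d: old pad %2d, new pad %2d\n", size, oldPad, newPad)
    	}
    }
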
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index dfb29a9..92d5880 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -5,7 +5,6 @@
 package runtime_test
 
 import (
-	"bytes"
 	"fmt"
 	"reflect"
 	"regexp"
@@ -109,13 +108,14 @@
 
 	// in finalizer
 	var finalizerStart time.Time
-	var started, progress uint32
+	var started atomic.Bool
+	var progress atomic.Uint32
 	wg.Add(1)
 	s := new(string) // Must be of a type that avoids the tiny allocator, or else the finalizer might not run.
 	SetFinalizer(s, func(ss *string) {
 		defer wg.Done()
 		finalizerStart = time.Now()
-		atomic.StoreUint32(&started, 1)
+		started.Store(true)
 		growStack(&progress)
 	})
 	setFinalizerTime := time.Now()
@@ -128,10 +128,10 @@
 			// Panic — instead of calling t.Error and returning from the test — so
 			// that we get a useful goroutine dump if the test times out, especially
 			// if GOTRACEBACK=system or GOTRACEBACK=crash is set.
-			if atomic.LoadUint32(&started) == 0 {
+			if !started.Load() {
 				panic("finalizer did not start")
 			} else {
-				panic(fmt.Sprintf("finalizer started %s ago (%s after registration) and ran %d iterations, but did not return", time.Since(finalizerStart), finalizerStart.Sub(setFinalizerTime), atomic.LoadUint32(&progress)))
+				panic(fmt.Sprintf("finalizer started %s ago (%s after registration) and ran %d iterations, but did not return", time.Since(finalizerStart), finalizerStart.Sub(setFinalizerTime), progress.Load()))
 			}
 		})
 		defer timer.Stop()
@@ -139,7 +139,7 @@
 
 	GC()
 	wg.Wait()
-	t.Logf("finalizer started after %s and ran %d iterations in %v", finalizerStart.Sub(setFinalizerTime), atomic.LoadUint32(&progress), time.Since(finalizerStart))
+	t.Logf("finalizer started after %s and ran %d iterations in %v", finalizerStart.Sub(setFinalizerTime), progress.Load(), time.Since(finalizerStart))
 }
 
 // ... and in init
@@ -147,7 +147,7 @@
 //	growStack()
 //}
 
-func growStack(progress *uint32) {
+func growStack(progress *atomic.Uint32) {
 	n := 1 << 10
 	if testing.Short() {
 		n = 1 << 8
@@ -159,7 +159,7 @@
 			panic("stack is corrupted")
 		}
 		if progress != nil {
-			atomic.StoreUint32(progress, uint32(i))
+			progress.Store(uint32(i))
 		}
 	}
 	GC()
@@ -777,7 +777,7 @@
 	// and that we see TestTracebackSystemstack.
 	countIn, countOut := 0, 0
 	frames := CallersFrames(pcs)
-	var tb bytes.Buffer
+	var tb strings.Builder
 	for {
 		frame, more := frames.Next()
 		fmt.Fprintf(&tb, "\n%s+0x%x %s:%d", frame.Function, frame.PC-frame.Entry, frame.File, frame.Line)
diff --git a/src/runtime/start_line_amd64_test.go b/src/runtime/start_line_amd64_test.go
new file mode 100644
index 0000000..305ed0b
--- /dev/null
+++ b/src/runtime/start_line_amd64_test.go
@@ -0,0 +1,23 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"runtime/internal/startlinetest"
+	"testing"
+)
+
+// TestStartLineAsm tests the start line metadata of an assembly function. This
+// is only tested on amd64 to avoid the need for a proliferation of per-arch
+// copies of this function.
+func TestStartLineAsm(t *testing.T) {
+	startlinetest.CallerStartLine = callerStartLine
+
+	const wantLine = 23
+	got := startlinetest.AsmFunc()
+	if got != wantLine {
+		t.Errorf("start line got %d want %d", got, wantLine)
+	}
+}
diff --git a/src/runtime/start_line_test.go b/src/runtime/start_line_test.go
new file mode 100644
index 0000000..6c4faa8
--- /dev/null
+++ b/src/runtime/start_line_test.go
@@ -0,0 +1,138 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	"internal/testenv"
+	"runtime"
+	"testing"
+)
+
+// The tests in this file test the function start line metadata included in
+// _func and inlinedCall. TestStartLine hard-codes the start lines of functions
+// in this file. If code moves, the test will need to be updated.
+//
+// The "start line" of a function should be the line containing the func
+// keyword.
+
+func normalFunc() int {
+	return callerStartLine(false)
+}
+
+func multilineDeclarationFunc() int {
+	return multilineDeclarationFunc1(0, 0, 0)
+}
+
+//go:noinline
+func multilineDeclarationFunc1(
+	a, b, c int) int {
+	return callerStartLine(false)
+}
+
+func blankLinesFunc() int {
+
+	// Some
+	// lines
+	// without
+	// code
+
+	return callerStartLine(false)
+}
+
+func inlineFunc() int {
+	return inlineFunc1()
+}
+
+func inlineFunc1() int {
+	return callerStartLine(true)
+}
+
+var closureFn func() int
+
+func normalClosure() int {
+	// Assign to global to ensure this isn't inlined.
+	closureFn = func() int {
+		return callerStartLine(false)
+	}
+	return closureFn()
+}
+
+func inlineClosure() int {
+	return func() int {
+		return callerStartLine(true)
+	}()
+}
+
+func TestStartLine(t *testing.T) {
+	// We test inlined vs non-inlined variants. We can't do that if
+	// optimizations are disabled.
+	testenv.SkipIfOptimizationOff(t)
+
+	testCases := []struct {
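 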
+		name string
+		fn   func() int
+		want int
+	}{
+		{
+			name: "normal",
+			fn:   normalFunc,
+			want: 21,
+		},
+		{
+			name: "multiline-declaration",
+			fn:   multilineDeclarationFunc,
+			want: 30,
+		},
+		{
+			name: "blank-lines",
+			fn:   blankLinesFunc,
+			want: 35,
+		},
+		{
+			name: "inline",
+			fn:   inlineFunc,
+			want: 49,
+		},
+		{
+			name: "normal-closure",
+			fn:   normalClosure,
+			want: 57,
+		},
+		{
+			name: "inline-closure",
+			fn:   inlineClosure,
+			want: 64,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := tc.fn()
+			if got != tc.want {
+				t.Errorf("start line got %d want %d", got, tc.want)
+			}
+		})
+	}
+}
+
+//go:noinline
+func callerStartLine(wantInlined bool) int {
+	var pcs [1]uintptr
+	n := runtime.Callers(2, pcs[:])
+	if n != 1 {
+		panic(fmt.Sprintf("no caller of callerStartLine? n = %d", n))
+	}
+
+	frames := runtime.CallersFrames(pcs[:])
+	frame, _ := frames.Next()
+
+	inlined := frame.Func == nil // Func always set to nil for inlined frames
+	if wantInlined != inlined {
+		panic(fmt.Sprintf("caller %s inlined got %v want %v", frame.Function, inlined, wantInlined))
+	}
+
+	return runtime.FrameStartLine(&frame)
+}
diff --git a/src/runtime/stkframe.go b/src/runtime/stkframe.go
new file mode 100644
index 0000000..3ecf3a8
--- /dev/null
+++ b/src/runtime/stkframe.go
@@ -0,0 +1,289 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/abi"
+	"internal/goarch"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// A stkframe holds information about a single physical stack frame.
+type stkframe struct {
+	// fn is the function being run in this frame. If there is
+	// inlining, this is the outermost function.
+	fn funcInfo
+
+	// pc is the program counter within fn.
+	//
+	// The meaning of this is subtle:
+	//
+	// - Typically, this frame performed a regular function call
+	//   and this is the return PC (just after the CALL
+	//   instruction). In this case, pc-1 reflects the CALL
+	//   instruction itself and is the correct source of symbolic
+	//   information.
+	//
+	// - If this frame "called" sigpanic, then pc is the
+	//   instruction that panicked, and pc is the correct address
+	//   to use for symbolic information.
+	//
+	// - If this is the innermost frame, then PC is where
+	//   execution will continue, but it may not be the
+	//   instruction following a CALL. This may be from
+	//   cooperative preemption, in which case this is the
+	//   instruction after the call to morestack. Or this may be
+	//   from a signal or an un-started goroutine, in which case
+	//   PC could be any instruction, including the first
+	//   instruction in a function. Conventionally, we use pc-1
+	//   for symbolic information, unless pc == fn.entry(), in
+	//   which case we use pc.
+	pc uintptr
+
+	// continpc is the PC where execution will continue in fn, or
+	// 0 if execution will not continue in this frame.
+	//
+	// This is usually the same as pc, unless this frame "called"
+	// sigpanic, in which case it's either the address of
+	// deferreturn or 0 if this frame will never execute again.
+	//
+	// This is the PC to use to look up GC liveness for this frame.
+	continpc uintptr
+
+	lr   uintptr // program counter at caller aka link register
+	sp   uintptr // stack pointer at pc
+	fp   uintptr // stack pointer at caller aka frame pointer
+	varp uintptr // top of local variables
+	argp uintptr // pointer to function arguments
+}
+
+// reflectMethodValue is a partial duplicate of reflect.makeFuncImpl
+// and reflect.methodValue.
+type reflectMethodValue struct {
+	fn     uintptr
+	stack  *bitvector // ptrmap for both args and results
+	argLen uintptr    // just args
+}
+
+// argBytes returns the argument frame size for a call to frame.fn.
+func (frame *stkframe) argBytes() uintptr {
+	if frame.fn.args != _ArgsSizeUnknown {
+		return uintptr(frame.fn.args)
+	}
+	// This is an uncommon and complicated case. Fall back to fully
+	// fetching the argument map to compute its size.
+	argMap, _ := frame.argMapInternal()
+	return uintptr(argMap.n) * goarch.PtrSize
+}
+
+// argMapInternal is used internally by stkframe to fetch special
+// argument maps.
+//
+// argMap.n is always populated with the size of the argument map.
+//
+// argMap.bytedata is only populated for dynamic argument maps (used
+// by reflect). If the caller requires the argument map, it should use
+// this if non-nil, and otherwise fetch the argument map using the
+// current PC.
+//
+// hasReflectStackObj indicates that this frame also has a reflect
+// function stack object, which the caller must synthesize.
+func (frame *stkframe) argMapInternal() (argMap bitvector, hasReflectStackObj bool) {
+	f := frame.fn
+	if f.args != _ArgsSizeUnknown {
+		argMap.n = f.args / goarch.PtrSize
+		return
+	}
+	// Extract argument bitmaps for reflect stubs from the calls they made to reflect.
+	switch funcname(f) {
+	case "reflect.makeFuncStub", "reflect.methodValueCall":
+		// These take a *reflect.methodValue as their
+		// context register and immediately save it to 0(SP).
+		// Get the methodValue from 0(SP).
+		arg0 := frame.sp + sys.MinFrameSize
+
+		minSP := frame.fp
+		if !usesLR {
+			// The CALL itself pushes a word.
+			// Undo that adjustment.
+			minSP -= goarch.PtrSize
+		}
+		if arg0 >= minSP {
+			// The function hasn't started yet.
+			// This only happens if f was the
+			// start function of a new goroutine
+			// that hasn't run yet *and* f takes
+			// no arguments and has no results
+			// (otherwise it will get wrapped in a
+			// closure). In this case, we can't
+			// reach into its locals because it
+			// doesn't have locals yet, but we
+			// also know its argument map is
+			// empty.
+			if frame.pc != f.entry() {
+				print("runtime: confused by ", funcname(f), ": no frame (sp=", hex(frame.sp), " fp=", hex(frame.fp), ") at entry+", hex(frame.pc-f.entry()), "\n")
+				throw("reflect mismatch")
+			}
+			return bitvector{}, false // No locals, so also no stack objects
+		}
+		hasReflectStackObj = true
+		mv := *(**reflectMethodValue)(unsafe.Pointer(arg0))
+		// Figure out whether the return values are valid.
+		// Reflect will update this value after it copies
+		// in the return values.
+		retValid := *(*bool)(unsafe.Pointer(arg0 + 4*goarch.PtrSize))
+		if mv.fn != f.entry() {
+			print("runtime: confused by ", funcname(f), "\n")
+			throw("reflect mismatch")
+		}
+		argMap = *mv.stack
+		if !retValid {
+			// argMap.n includes the results, but
+			// those aren't valid, so drop them.
+			n := int32((uintptr(mv.argLen) &^ (goarch.PtrSize - 1)) / goarch.PtrSize)
+			if n < argMap.n {
+				argMap.n = n
+			}
+		}
+	}
+	return
+}
+
+// getStackMap returns the locals and arguments live pointer maps, and
+// stack object list for frame.
+func (frame *stkframe) getStackMap(cache *pcvalueCache, debug bool) (locals, args bitvector, objs []stackObjectRecord) {
+	targetpc := frame.continpc
+	if targetpc == 0 {
+		// Frame is dead. Return empty bitvectors.
+		return
+	}
+
+	f := frame.fn
+	pcdata := int32(-1)
+	if targetpc != f.entry() {
+		// Back up to the CALL. If we're at the function entry
+		// point, we want to use the entry map (-1), even if
+		// the first instruction of the function changes the
+		// stack map.
+		targetpc--
+		pcdata = pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
+	}
+	if pcdata == -1 {
+		// We do not have a valid pcdata value but there might be a
+		// stackmap for this function. It is likely that we are looking
+		// at the function prologue, assume so and hope for the best.
+		pcdata = 0
+	}
+
+	// Local variables.
+	size := frame.varp - frame.sp
+	var minsize uintptr
+	switch goarch.ArchFamily {
+	case goarch.ARM64:
+		minsize = sys.StackAlign
+	default:
+		minsize = sys.MinFrameSize
+	}
+	if size > minsize {
+		stackid := pcdata
+		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
+		if stkmap == nil || stkmap.n <= 0 {
+			print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
+			throw("missing stackmap")
+		}
+		// If nbit == 0, there's no work to do.
+		if stkmap.nbit > 0 {
+			if stackid < 0 || stackid >= stkmap.n {
+				// don't know where we are
+				print("runtime: pcdata is ", stackid, " and ", stkmap.n, " locals stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n")
+				throw("bad symbol table")
+			}
+			locals = stackmapdata(stkmap, stackid)
+			if stackDebug >= 3 && debug {
+				print("      locals ", stackid, "/", stkmap.n, " ", locals.n, " words ", locals.bytedata, "\n")
+			}
+		} else if stackDebug >= 3 && debug {
+			print("      no locals to adjust\n")
+		}
+	}
+
+	// Arguments. First fetch frame size and special-case argument maps.
+	var isReflect bool
+	args, isReflect = frame.argMapInternal()
+	if args.n > 0 && args.bytedata == nil {
+		// Non-empty argument frame, but not a special map.
+		// Fetch the argument map at pcdata.
+		stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
+		if stackmap == nil || stackmap.n <= 0 {
+			print("runtime: frame ", funcname(f), " untyped args ", hex(frame.argp), "+", hex(args.n*goarch.PtrSize), "\n")
+			throw("missing stackmap")
+		}
+		if pcdata < 0 || pcdata >= stackmap.n {
+			// don't know where we are
+			print("runtime: pcdata is ", pcdata, " and ", stackmap.n, " args stack map entries for ", funcname(f), " (targetpc=", hex(targetpc), ")\n")
+			throw("bad symbol table")
+		}
+		if stackmap.nbit == 0 {
+			args.n = 0
+		} else {
+			args = stackmapdata(stackmap, pcdata)
+		}
+	}
+
+	// stack objects.
+	if (GOARCH == "amd64" || GOARCH == "arm64" || GOARCH == "ppc64" || GOARCH == "ppc64le" || GOARCH == "riscv64") &&
+		unsafe.Sizeof(abi.RegArgs{}) > 0 && isReflect {
+		// For reflect.makeFuncStub and reflect.methodValueCall,
+		// we need to fake the stack object record.
+		// These frames contain an internal/abi.RegArgs at a hard-coded offset.
+		// This offset matches the assembly code on amd64 and arm64.
+		objs = methodValueCallFrameObjs[:]
+	} else {
+		p := funcdata(f, _FUNCDATA_StackObjects)
+		if p != nil {
+			n := *(*uintptr)(p)
+			p = add(p, goarch.PtrSize)
+			r0 := (*stackObjectRecord)(noescape(p))
+			objs = unsafe.Slice(r0, int(n))
+			// Note: the noescape above is needed to keep
+			// getStackMap from "leaking param content:
+			// frame".  That leak propagates up to getgcmask, then
+			// GCMask, then verifyGCInfo, which converts the stack
+			// gcinfo tests into heap gcinfo tests :(
+		}
+	}
+
+	return
+}
+
+var methodValueCallFrameObjs [1]stackObjectRecord // initialized in stkobjinit
+
+func stkobjinit() {
+	var abiRegArgsEface any = abi.RegArgs{}
+	abiRegArgsType := efaceOf(&abiRegArgsEface)._type
+	if abiRegArgsType.kind&kindGCProg != 0 {
+		throw("abiRegArgsType needs GC Prog, update methodValueCallFrameObjs")
+	}
+	// Set methodValueCallFrameObjs[0].gcdataoff so that
+	// stackObjectRecord.gcdata() will work correctly with it.
+	ptr := uintptr(unsafe.Pointer(&methodValueCallFrameObjs[0]))
+	var mod *moduledata
+	for datap := &firstmoduledata; datap != nil; datap = datap.next {
+		if datap.gofunc <= ptr && ptr < datap.end {
+			mod = datap
+			break
+		}
+	}
+	if mod == nil {
+		throw("methodValueCallFrameObjs is not in a module")
+	}
+	methodValueCallFrameObjs[0] = stackObjectRecord{
+		off:       -int32(alignUp(abiRegArgsType.size, 8)), // It's always the highest address local.
+		size:      int32(abiRegArgsType.size),
+		_ptrdata:  int32(abiRegArgsType.ptrdata),
+		gcdataoff: uint32(uintptr(unsafe.Pointer(abiRegArgsType.gcdata)) - mod.rodata),
+	}
+}
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 359a565..a00976b 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -78,7 +78,7 @@
 // n is the length of the slice.
 // Buf is a fixed-size buffer for the result,
 // it is not nil if the result does not escape.
-func slicebytetostring(buf *tmpBuf, ptr *byte, n int) (str string) {
+func slicebytetostring(buf *tmpBuf, ptr *byte, n int) string {
 	if n == 0 {
 		// Turns out to be a relatively common case.
 		// Consider that you want to parse out data between parens in "foo()bar",
@@ -102,9 +102,7 @@
 		if goarch.BigEndian {
 			p = add(p, 7)
 		}
-		stringStructOf(&str).str = p
-		stringStructOf(&str).len = 1
-		return
+		return unsafe.String((*byte)(p), 1)
 	}
 
 	var p unsafe.Pointer
@@ -113,16 +111,14 @@
 	} else {
 		p = mallocgc(uintptr(n), nil, false)
 	}
-	stringStructOf(&str).str = p
-	stringStructOf(&str).len = n
 	memmove(p, unsafe.Pointer(ptr), uintptr(n))
-	return
+	return unsafe.String((*byte)(p), n)
 }
 
 // stringDataOnStack reports whether the string's data is
 // stored on the current goroutine's stack.
 func stringDataOnStack(s string) bool {
-	ptr := uintptr(stringStructOf(&s).str)
+	ptr := uintptr(unsafe.Pointer(unsafe.StringData(s)))
 	stk := getg().stack
 	return stk.lo <= ptr && ptr < stk.hi
 }
@@ -151,7 +147,7 @@
 //     where k is []byte, T1 to Tn is a nesting of struct and array literals.
 //   - Used for "<"+string(b)+">" concatenation where b is []byte.
 //   - Used for string(b)=="foo" comparison where b is []byte.
-func slicebytetostringtmp(ptr *byte, n int) (str string) {
+func slicebytetostringtmp(ptr *byte, n int) string {
 	if raceenabled && n > 0 {
 		racereadrangepc(unsafe.Pointer(ptr),
 			uintptr(n),
@@ -164,9 +160,7 @@
 	if asanenabled && n > 0 {
 		asanread(unsafe.Pointer(ptr), uintptr(n))
 	}
-	stringStructOf(&str).str = unsafe.Pointer(ptr)
-	stringStructOf(&str).len = n
-	return
+	return unsafe.String(ptr, n)
 }
 
 func stringtoslicebyte(buf *tmpBuf, s string) []byte {
@@ -271,13 +265,7 @@
 // b to set the string contents and then drop b.
 func rawstring(size int) (s string, b []byte) {
 	p := mallocgc(uintptr(size), nil, false)
-
-	stringStructOf(&s).str = p
-	stringStructOf(&s).len = size
-
-	*(*slice)(unsafe.Pointer(&b)) = slice{p, size, size}
-
-	return
+	return unsafe.String((*byte)(p), size), unsafe.Slice((*byte)(p), size)
 }
 
 // rawbyteslice allocates a new byte slice. The byte slice is not zeroed.
@@ -337,6 +325,13 @@
 	return s
 }
 
+// internal_syscall_gostring is a version of gostring for internal/syscall/unix.
+//
+//go:linkname internal_syscall_gostring internal/syscall/unix.gostring
+func internal_syscall_gostring(p *byte) string {
+	return gostring(p)
+}
+
 func gostringn(p *byte, l int) string {
 	if l == 0 {
 		return ""
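
The string.go hunk swaps the stringStructOf header-poking for the unsafe.String and unsafe.Slice constructors. Outside the runtime the same functions are available from package unsafe (unsafe.Slice since Go 1.17, unsafe.String and unsafe.SliceData since Go 1.20); the sketch below mirrors the rawstring idiom (one allocation reachable both as a mutable []byte and as a string aliasing it), with the usual caveat that the bytes must not change once the string is treated as immutable:

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    // rawString mirrors the rawstring idiom above: one allocation that is
    // reachable both as a mutable []byte and as a string aliasing it.
    func rawString(size int) (string, []byte) {
    	b := make([]byte, size)
    	p := unsafe.SliceData(b) // *byte pointing at the backing array
    	return unsafe.String(p, size), b
    }

    func main() {
    	s, b := rawString(5)
    	copy(b, "hello") // fill via the byte slice...
    	fmt.Println(s)   // ...and observe it through the aliasing string
    }
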
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index 1ea7f5e..cfc0ad7 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -223,6 +223,19 @@
 	}
 }
 
+func TestConcatTempString(t *testing.T) {
+	s := "bytes"
+	b := []byte(s)
+	n := testing.AllocsPerRun(1000, func() {
+		if "prefix "+string(b)+" suffix" != "prefix bytes suffix" {
+			t.Fatalf("strings are not equal: '%v' and '%v'", "prefix "+string(b)+" suffix", "prefix bytes suffix")
+		}
+	})
+	if n != 0 {
+		t.Fatalf("want 0 allocs, got %v", n)
+	}
+}
+
 func TestCompareTempString(t *testing.T) {
 	s := strings.Repeat("x", sizeNoStack)
 	b := []byte(s)
@@ -230,10 +243,24 @@
 		if string(b) != s {
 			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
 		}
+		if string(b) < s {
+			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
+		}
+		if string(b) > s {
+			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
+		}
 		if string(b) == s {
 		} else {
 			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
 		}
+		if string(b) <= s {
+		} else {
+			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
+		}
+		if string(b) >= s {
+		} else {
+			t.Fatalf("strings are not equal: '%v' and '%v'", string(b), s)
+		}
 	})
 	if n != 0 {
 		t.Fatalf("want 0 allocs, got %v", n)
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 929f8fa..42c2612 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -59,13 +59,10 @@
 //go:noescape
 func systemstack(fn func())
 
-var badsystemstackMsg = "fatal: systemstack called from unexpected goroutine"
-
 //go:nosplit
 //go:nowritebarrierrec
 func badsystemstack() {
-	sp := stringStructOf(&badsystemstackMsg)
-	write(2, sp.str, int32(sp.len))
+	writeErrStr("fatal: systemstack called from unexpected goroutine")
 }
 
 // memclrNoHeapPointers clears n bytes starting at ptr.
@@ -131,7 +128,7 @@
 	// by the compiler should be in this list.
 	if goarch.IsAmd64|goarch.IsArm64|goarch.IsPpc64|
 		goarch.IsPpc64le|goarch.IsMips64|goarch.IsMips64le|
-		goarch.IsS390x|goarch.IsRiscv64 == 1 {
+		goarch.IsS390x|goarch.IsRiscv64|goarch.IsLoong64 == 1 {
 		mp.fastrand += 0xa0761d6478bd642f
 		hi, lo := math.Mul64(mp.fastrand, mp.fastrand^0xe7037ed1a0b428db)
 		return uint32(hi ^ lo)
@@ -196,6 +193,9 @@
 	return uint(fastrand64())
 }
 
+//go:linkname rand_fastrand64 math/rand.fastrand64
+func rand_fastrand64() uint64 { return fastrand64() }
+
 //go:linkname sync_fastrandn sync.fastrandn
 func sync_fastrandn(n uint32) uint32 { return fastrandn(n) }
 
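
The stubs.go change simply adds loong64 to the list of 64-bit ports that take the wyrand-style fastrand path; the mixing itself (add an odd constant, multiply the state by a xored copy, fold the 128-bit product) is untouched context here. A user-space sketch of one such step, with math/bits.Mul64 standing in for the runtime's math.Mul64 and the constants taken from the hunk:

    package main

    import (
    	"fmt"
    	"math/bits"
    )

    // wyrandStep performs one round of the mixing shown in fastrand above.
    func wyrandStep(state *uint64) uint32 {
    	*state += 0xa0761d6478bd642f
    	hi, lo := bits.Mul64(*state, *state^0xe7037ed1a0b428db)
    	return uint32(hi ^ lo)
    }

    func main() {
    	seed := uint64(1)
    	for i := 0; i < 3; i++ {
    		fmt.Printf("%#08x\n", wyrandStep(&seed))
    	}
    }
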
diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go
index 94a888d..0d83deb 100644
--- a/src/runtime/stubs2.go
+++ b/src/runtime/stubs2.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // read calls the read system call.
 // It returns a non-negative number of bytes written or a negative errno value.
@@ -22,7 +25,7 @@
 	usleep(usec)
 }
 
-// write calls the write system call.
+// write1 calls the write system call.
 // It returns a non-negative number of bytes written or a negative errno value.
 //
 //go:noescape
@@ -31,11 +34,11 @@
 //go:noescape
 func open(name *byte, mode, perm int32) int32
 
-// return value is only set on linux to be used in osinit()
+// return value is only set on linux to be used in osinit().
 func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
 
-// exitThread terminates the current thread, writing *wait = 0 when
+// exitThread terminates the current thread, writing *wait = freeMStack when
 // the stack is safe to reclaim.
 //
 //go:noescape
-func exitThread(wait *uint32)
+func exitThread(wait *atomic.Uint32)
diff --git a/src/runtime/stubs_ppc64.go b/src/runtime/stubs_ppc64.go
index 6919b74..e23e338 100644
--- a/src/runtime/stubs_ppc64.go
+++ b/src/runtime/stubs_ppc64.go
@@ -6,7 +6,7 @@
 
 package runtime
 
-// This is needed for vet
+// This is needed for vet.
 //
 //go:noescape
 func callCgoSigaction(sig uintptr, new, old *sigactiont) int32
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index ad34b68..dead27e 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -49,6 +49,15 @@
 	File string
 	Line int
 
+	// startLine is the line number of the beginning of the function in
+	// this frame. Specifically, it is the line number of the func keyword
+	// for Go functions. Note that //line directives can change the
+	// filename and/or line number arbitrarily within a function, meaning
+	// that the Line - startLine offset is not always meaningful.
+	//
+	// This may be zero if not known.
+	startLine int
+
 	// Entry point program counter for the function; may be zero
 	// if not known. If Func is not nil then Entry ==
 	// Func.Entry().
@@ -108,6 +117,7 @@
 			pc--
 		}
 		name := funcname(funcInfo)
+		startLine := f.startLine()
 		if inldata := funcdata(funcInfo, _FUNCDATA_InlTree); inldata != nil {
 			inltree := (*[1 << 20]inlinedCall)(inldata)
 			// Non-strict as cgoTraceback may have added bogus PCs
@@ -116,17 +126,19 @@
 			if ix >= 0 {
 				// Note: entry is not modified. It always refers to a real frame, not an inlined one.
 				f = nil
-				name = funcnameFromNameoff(funcInfo, inltree[ix].func_)
-				// File/line is already correct.
-				// TODO: remove file/line from InlinedCall?
+				ic := inltree[ix]
+				name = funcnameFromNameOff(funcInfo, ic.nameOff)
+				startLine = ic.startLine
+				// File/line from funcline1 below are already correct.
 			}
 		}
 		ci.frames = append(ci.frames, Frame{
-			PC:       pc,
-			Func:     f,
-			Function: name,
-			Entry:    entry,
-			funcInfo: funcInfo,
+			PC:        pc,
+			Func:      f,
+			Function:  name,
+			Entry:     entry,
+			startLine: int(startLine),
+			funcInfo:  funcInfo,
 			// Note: File,Line set below
 		})
 	}
@@ -158,6 +170,13 @@
 	return
 }
 
+// runtime_FrameStartLine returns the start line of the function in a Frame.
+//
+//go:linkname runtime_FrameStartLine runtime/pprof.runtime_FrameStartLine
+func runtime_FrameStartLine(f *Frame) int {
+	return f.startLine
+}
+
 // runtime_expandFinalInlineFrame expands the final pc in stk to include all
 // "callers" if pc is inline.
 //
@@ -393,7 +412,7 @@
 
 // pcHeader holds data used by the pclntab lookups.
 type pcHeader struct {
-	magic          uint32  // 0xFFFFFFF0
+	magic          uint32  // 0xFFFFFFF1
 	pad1, pad2     uint8   // 0,0
 	minLC          uint8   // min instruction size
 	ptrSize        uint8   // size of a ptr in bytes
@@ -428,6 +447,7 @@
 	data, edata           uintptr
 	bss, ebss             uintptr
 	noptrbss, enoptrbss   uintptr
+	covctrs, ecovctrs     uintptr
 	end, gcdata, gcbss    uintptr
 	types, etypes         uintptr
 	rodata                uintptr
@@ -575,7 +595,7 @@
 const minfunc = 16                 // minimum function size
 const pcbucketsize = 256 * minfunc // size of bucket in the pc->func lookup table
 
-// findfunctab is an array of these structures.
+// findfuncbucket is an array of these structures.
 // Each bucket represents 4096 bytes of the text segment.
 // Each subbucket represents 256 bytes of the text segment.
 // To find a function given a pc, locate the bucket and subbucket for
@@ -599,7 +619,7 @@
 func moduledataverify1(datap *moduledata) {
 	// Check that the pclntab's format is valid.
 	hdr := datap.pcHeader
-	if hdr.magic != 0xfffffff0 || hdr.pad1 != 0 || hdr.pad2 != 0 ||
+	if hdr.magic != 0xfffffff1 || hdr.pad1 != 0 || hdr.pad2 != 0 ||
 		hdr.minLC != sys.PCQuantum || hdr.ptrSize != goarch.PtrSize || hdr.textStart != datap.text {
 		println("runtime: pcHeader: magic=", hex(hdr.magic), "pad1=", hdr.pad1, "pad2=", hdr.pad2,
 			"minLC=", hdr.minLC, "ptrSize=", hdr.ptrSize, "pcHeader.textStart=", hex(hdr.textStart),
@@ -727,14 +747,16 @@
 		// The runtime currently doesn't have function end info, alas.
 		if ix := pcdatavalue1(f, _PCDATA_InlTreeIndex, pc, nil, false); ix >= 0 {
 			inltree := (*[1 << 20]inlinedCall)(inldata)
-			name := funcnameFromNameoff(f, inltree[ix].func_)
+			ic := inltree[ix]
+			name := funcnameFromNameOff(f, ic.nameOff)
 			file, line := funcline(f, pc)
 			fi := &funcinl{
-				ones:  ^uint32(0),
-				entry: f.entry(), // entry of the real (the outermost) function.
-				name:  name,
-				file:  file,
-				line:  int(line),
+				ones:      ^uint32(0),
+				entry:     f.entry(), // entry of the real (the outermost) function.
+				name:      name,
+				file:      file,
+				line:      line,
+				startLine: ic.startLine,
 			}
 			return (*Func)(unsafe.Pointer(fi))
 		}
@@ -773,7 +795,7 @@
 	fn := f.raw()
 	if fn.isInlined() { // inlined version
 		fi := (*funcinl)(unsafe.Pointer(fn))
-		return fi.file, fi.line
+		return fi.file, int(fi.line)
 	}
 	// Pass strict=false here, because anyone can call this function,
 	// and they might just be wrong about targetpc belonging to f.
@@ -781,6 +803,17 @@
 	return file, int(line32)
 }
 
+// startLine returns the starting line number of the function. i.e., the line
+// number of the func keyword.
+func (f *Func) startLine() int32 {
+	fn := f.raw()
+	if fn.isInlined() { // inlined version
+		fi := (*funcinl)(unsafe.Pointer(fn))
+		return fi.startLine
+	}
+	return fn.funcInfo().startLine
+}
+
 // findmoduledatap looks up the moduledata for a PC.
 //
 // It is nosplit because it's part of the isgoexception
@@ -811,12 +844,12 @@
 
 // isInlined reports whether f should be re-interpreted as a *funcinl.
 func (f *_func) isInlined() bool {
-	return f.entryoff == ^uint32(0) // see comment for funcinl.ones
+	return f.entryOff == ^uint32(0) // see comment for funcinl.ones
 }
 
 // entry returns the entry PC for f.
 func (f funcInfo) entry() uintptr {
-	return f.datap.textAddr(f.entryoff)
+	return f.datap.textAddr(f.entryOff)
 }
 
 // findfunc looks up function metadata for a PC.
@@ -902,7 +935,7 @@
 	}
 
 	if !f.valid() {
-		if strict && panicking == 0 {
+		if strict && panicking.Load() == 0 {
 			println("runtime: no module data for", hex(f.entry()))
 			throw("no module data")
 		}
@@ -945,7 +978,7 @@
 
 	// If there was a table, it should have covered all program counters.
 	// If not, something is wrong.
-	if panicking != 0 || !strict {
+	if panicking.Load() != 0 || !strict {
 		return -1, 0
 	}
 
@@ -968,10 +1001,10 @@
 }
 
 func cfuncname(f funcInfo) *byte {
-	if !f.valid() || f.nameoff == 0 {
+	if !f.valid() || f.nameOff == 0 {
 		return nil
 	}
-	return &f.datap.funcnametab[f.nameoff]
+	return &f.datap.funcnametab[f.nameOff]
 }
 
 func funcname(f funcInfo) string {
@@ -994,15 +1027,15 @@
 	return name[:i]
 }
 
-func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte {
+func cfuncnameFromNameOff(f funcInfo, nameOff int32) *byte {
 	if !f.valid() {
 		return nil
 	}
-	return &f.datap.funcnametab[nameoff]
+	return &f.datap.funcnametab[nameOff]
 }
 
-func funcnameFromNameoff(f funcInfo, nameoff int32) string {
-	return gostringnocopy(cfuncnameFromNameoff(f, nameoff))
+func funcnameFromNameOff(f funcInfo, nameOff int32) string {
+	return gostringnocopy(cfuncnameFromNameOff(f, nameOff))
 }
 
 func funcfile(f funcInfo, fileno int32) string {
@@ -1173,11 +1206,9 @@
 
 // inlinedCall is the encoding of entries in the FUNCDATA_InlTree table.
 type inlinedCall struct {
-	parent   int16  // index of parent in the inltree, or < 0
-	funcID   funcID // type of the called function
-	_        byte
-	file     int32 // perCU file index for inlined call. See cmd/link:pcln.go
-	line     int32 // line number of the call site
-	func_    int32 // offset into pclntab for name of called function
-	parentPc int32 // position of an instruction whose source position is the call site (offset from entry)
+	funcID    funcID // type of the called function
+	_         [3]byte
+	nameOff   int32 // offset into pclntab for name of called function
+	parentPc  int32 // position of an instruction whose source position is the call site (offset from entry)
+	startLine int32 // line number of start of function (func keyword/TEXT directive)
 }
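
For orientation: the symtab.go hunks above rename nameoff to nameOff and thread a startLine value from inlinedCall through funcinl. Those same tables back inline-aware frame expansion in the public API; a minimal sketch using only the standard library (not part of this patch):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	pcs := make([]uintptr, 16)
	n := runtime.Callers(1, pcs) // skip runtime.Callers itself
	frames := runtime.CallersFrames(pcs[:n])
	for {
		f, more := frames.Next()
		// Inlined calls resolved via the inltree/funcinl data above show up
		// here as their own frames, with the correct file and line.
		fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}
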
diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go
index 1547fdc..5ba697e 100644
--- a/src/runtime/sys_darwin.go
+++ b/src/runtime/sys_darwin.go
@@ -6,6 +6,7 @@
 
 import (
 	"internal/abi"
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -48,6 +49,17 @@
 }
 func syscall6()
 
+//go:linkname syscall_syscall9 syscall.syscall9
+//go:nosplit
+//go:cgo_unsafe_args
+func syscall_syscall9(fn, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr) {
+	entersyscall()
+	libcCall(unsafe.Pointer(abi.FuncPCABI0(syscall9)), unsafe.Pointer(&fn))
+	exitsyscall()
+	return
+}
+func syscall9()
+
 //go:linkname syscall_syscall6X syscall.syscall6X
 //go:nosplit
 func syscall_syscall6X(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
@@ -86,7 +98,7 @@
 	return args.r1, args.r2, args.err
 }
 
-// syscallNoErr is used in crypto/x509 to call into Security.framework and CF.
+// crypto_x509_syscall is used in crypto/x509/internal/macos to call into Security.framework and CF.
 
 //go:linkname crypto_x509_syscall crypto/x509/internal/macos.syscall
 //go:nosplit
@@ -167,6 +179,47 @@
 }
 func pthread_kill_trampoline()
 
+// osinit_hack is a clumsy hack to work around Apple libc bugs
+// causing fork+exec to hang in the child process intermittently.
+// See go.dev/issue/33565 and go.dev/issue/56784 for a few reports.
+//
+// The stacks obtained from the hung child processes are in
+// libSystem_atfork_child, which is supposed to reinitialize various
+// parts of the C library in the new process.
+//
+// One common stack dies in _notify_fork_child calling _notify_globals
+// (inlined) calling _os_alloc_once, because _os_alloc_once detects that
+// the once lock is held by the parent process and then calls
+// _os_once_gate_corruption_abort. The allocation is setting up the
+// globals for the notification subsystem. See the source code at [1].
+// To work around this, we can allocate the globals earlier in the Go
+// program's lifetime, before any execs are involved, by calling any
+// notify routine that is exported, calls _notify_globals, and doesn't do
+// anything too expensive otherwise. notify_is_valid_token(0) fits the bill.
+//
+// The other common stack dies in xpc_atfork_child calling
+// _objc_msgSend_uncached which ends up in
+// WAITING_FOR_ANOTHER_THREAD_TO_FINISH_CALLING_+initialize. Of course,
+// whatever thread the child is waiting for is in the parent process and
+// is not going to finish anything in the child process. There is no
+// public source code for these routines, so it is unclear exactly what
+// the problem is. An Apple engineer suggests using xpc_date_create_from_current,
+// which empirically does fix the problem.
+//
+// So osinit_hack_trampoline (in sys_darwin_$GOARCH.s) calls
+// notify_is_valid_token(0) and xpc_date_create_from_current(), which makes the
+// fork+exec hangs stop happening. If Apple fixes the libc bug in
+// some future version of macOS, then we can remove this awful code.
+//
+//go:nosplit
+func osinit_hack() {
+	if GOOS == "darwin" { // not ios
+		libcCall(unsafe.Pointer(abi.FuncPCABI0(osinit_hack_trampoline)), nil)
+	}
+	return
+}
+func osinit_hack_trampoline()
+
 // mmap is used to do low-level memory allocation via mmap. Don't allow stack
 // splits, since this function (used by sysAlloc) is called in a lot of low-level
 // parts of the runtime and callers often assume it won't acquire any locks.
@@ -474,7 +527,8 @@
 func pthread_cond_signal_trampoline()
 
 // Not used on Darwin, but must be defined.
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
+	throw("exitThread")
 }
 
 //go:nosplit
@@ -535,3 +589,6 @@
 //go:cgo_import_dynamic libc_pthread_cond_wait pthread_cond_wait "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_cond_timedwait_relative_np pthread_cond_timedwait_relative_np "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_cond_signal pthread_cond_signal "/usr/lib/libSystem.B.dylib"
+
+//go:cgo_import_dynamic libc_notify_is_valid_token notify_is_valid_token "/usr/lib/libSystem.B.dylib"
+//go:cgo_import_dynamic libc_xpc_date_create_from_current xpc_date_create_from_current "/usr/lib/libSystem.B.dylib"
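
The new syscall_syscall9 wrapper hands libcCall a single pointer, unsafe.Pointer(&fn); the trampolines added in the assembly below read the arguments and results through that pointer as one contiguous block of thirteen uintptrs. A sketch of the layout the assembly assumes (the struct and field names here are descriptive, not taken from the runtime source):

// Layout seen by runtime·syscall9 when handed unsafe.Pointer(&fn).
type syscall9Args struct {
	fn                                 uintptr // C function to call
	a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr // arguments
	r1, r2                             uintptr // raw results
	err                                uintptr // errno, filled only on failure
}
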
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index ba81fcc..6eaeeb9 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -597,6 +597,15 @@
 	POPQ	BP
 	RET
 
+TEXT runtime·osinit_hack_trampoline(SB),NOSPLIT,$0
+	PUSHQ	BP
+	MOVQ	SP, BP
+	MOVQ	$0, DI	// arg 1 val
+	CALL	libc_notify_is_valid_token(SB)
+	CALL	libc_xpc_date_create_from_current(SB)
+	POPQ	BP
+	RET
+
 // syscall calls a function in libc on behalf of the syscall package.
 // syscall takes a pointer to a struct like:
 // struct {
@@ -839,6 +848,65 @@
 	POPQ	BP
 	RET
 
+// syscall9 calls a function in libc on behalf of the syscall package.
+// syscall9 takes a pointer to a struct like:
+// struct {
+//	fn    uintptr
+//	a1    uintptr
+//	a2    uintptr
+//	a3    uintptr
+//	a4    uintptr
+//	a5    uintptr
+//	a6    uintptr
+//	a7    uintptr
+//	a8    uintptr
+//	a9    uintptr
+//	r1    uintptr
+//	r2    uintptr
+//	err   uintptr
+// }
+// syscall9 must be called on the g0 stack with the
+// C calling convention (use libcCall).
+//
+// syscall9 expects a 32-bit result and tests for 32-bit -1
+// to decide there was an error.
+TEXT runtime·syscall9(SB),NOSPLIT,$0
+	PUSHQ	BP
+	MOVQ	SP, BP
+	SUBQ	$16, SP
+	MOVQ	(0*8)(DI), R13// fn
+	MOVQ	(2*8)(DI), SI // a2
+	MOVQ	(3*8)(DI), DX // a3
+	MOVQ	(4*8)(DI), CX // a4
+	MOVQ	(5*8)(DI), R8 // a5
+	MOVQ	(6*8)(DI), R9 // a6
+	MOVQ	(7*8)(DI), R10 // a7
+	MOVQ	(8*8)(DI), R11 // a8
+	MOVQ	(9*8)(DI), R12 // a9
+	MOVQ	DI, (SP)
+	MOVQ	(1*8)(DI), DI // a1
+	XORL	AX, AX	      // vararg: say "no float args"
+
+	CALL	R13
+
+	MOVQ	(SP), DI
+	MOVQ	AX, (10*8)(DI) // r1
+	MOVQ	DX, (11*8)(DI) // r2
+
+	CMPL	AX, $-1
+	JNE	ok
+
+	CALL	libc_error(SB)
+	MOVLQSX	(AX), AX
+	MOVQ	(SP), DI
+	MOVQ	AX, (12*8)(DI) // err
+
+ok:
+	XORL	AX, AX        // no error (it's ignored anyway)
+	MOVQ	BP, SP
+	POPQ	BP
+	RET
+
 // syscall_x509 is for crypto/x509. It is like syscall6 but does not check for errors,
 // takes 5 uintptrs and 1 float64, and only returns one value,
 // for use with standard C ABI functions.
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index bf0dc9d..4a51fb3 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -458,6 +458,12 @@
 	BL	libc_pthread_setspecific(SB)
 	RET
 
+TEXT runtime·osinit_hack_trampoline(SB),NOSPLIT,$0
+	MOVD	$0, R0	// arg 1 val
+	BL	libc_notify_is_valid_token(SB)
+	BL	libc_xpc_date_create_from_current(SB)
+	RET
+
 // syscall calls a function in libc on behalf of the syscall package.
 // syscall takes a pointer to a struct like:
 // struct {
@@ -669,6 +675,63 @@
 ok:
 	RET
 
+// syscall9 calls a function in libc on behalf of the syscall package.
+// syscall9 takes a pointer to a struct like:
+// struct {
+//	fn    uintptr
+//	a1    uintptr
+//	a2    uintptr
+//	a3    uintptr
+//	a4    uintptr
+//	a5    uintptr
+//	a6    uintptr
+//	a7    uintptr
+//	a8    uintptr
+//	a9    uintptr
+//	r1    uintptr
+//	r2    uintptr
+//	err   uintptr
+// }
+// syscall9 must be called on the g0 stack with the
+// C calling convention (use libcCall).
+TEXT runtime·syscall9(SB),NOSPLIT,$0
+	SUB	$16, RSP	// push structure pointer
+	MOVD	R0, 8(RSP)
+
+	MOVD	0(R0), R12	// fn
+	MOVD	16(R0), R1	// a2
+	MOVD	24(R0), R2	// a3
+	MOVD	32(R0), R3	// a4
+	MOVD	40(R0), R4	// a5
+	MOVD	48(R0), R5	// a6
+	MOVD	56(R0), R6	// a7
+	MOVD	64(R0), R7	// a8
+	MOVD	72(R0), R8	// a9
+	MOVD	8(R0), R0	// a1
+
+	// If fn is declared as vararg, we have to pass the vararg arguments on the stack.
+	// See syscall above. The only function this applies to is openat, for which the 4th
+	// arg must be on the stack.
+	MOVD	R3, (RSP)
+
+	BL	(R12)
+
+	MOVD	8(RSP), R2	// pop structure pointer
+	ADD	$16, RSP
+	MOVD	R0, 80(R2)	// save r1
+	MOVD	R1, 88(R2)	// save r2
+	CMPW	$-1, R0
+	BNE	ok
+	SUB	$16, RSP	// push structure pointer
+	MOVD	R2, 8(RSP)
+	BL	libc_error(SB)
+	MOVW	(R0), R0
+	MOVD	8(RSP), R2	// pop structure pointer
+	ADD	$16, RSP
+	MOVD	R0, 96(R2)	// save err
+ok:
+	RET
+
 // syscall_x509 is for crypto/x509. It is like syscall6 but does not check for errors,
 // takes 5 uintptrs and 1 float64, and only returns one value,
 // for use with standard C ABI functions.
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index 602d5e9..0cf9821 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -65,7 +65,7 @@
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-8
 	MOVQ	wait+0(FP), AX
 	// We're done using the stack.
diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s
index 9e5210b..4e0bc9b 100644
--- a/src/runtime/sys_freebsd_386.s
+++ b/src/runtime/sys_freebsd_386.s
@@ -10,8 +10,44 @@
 #include "go_tls.h"
 #include "textflag.h"
 
+#define CLOCK_REALTIME		0
+#define CLOCK_MONOTONIC		4
+#define FD_CLOEXEC		1
+#define F_SETFD			2
+
+#define SYS_exit		1
+#define SYS_read		3
+#define SYS_write		4
+#define SYS_open		5
+#define SYS_close		6
+#define SYS_getpid		20
+#define SYS_kill		37
+#define SYS_sigaltstack		53
+#define SYS_munmap		73
+#define SYS_madvise		75
+#define SYS_setitimer		83
+#define SYS_fcntl		92
+#define SYS_sysarch		165
+#define SYS___sysctl		202
+#define SYS_clock_gettime	232
+#define SYS_nanosleep		240
+#define SYS_sched_yield		331
+#define SYS_sigprocmask		340
+#define SYS_kqueue		362
+#define SYS_sigaction		416
+#define SYS_sigreturn		417
+#define SYS_thr_exit		431
+#define SYS_thr_self		432
+#define SYS_thr_kill		433
+#define SYS__umtx_op		454
+#define SYS_thr_new		455
+#define SYS_mmap		477
+#define SYS_cpuset_getaffinity	487
+#define SYS_pipe2 		542
+#define SYS_kevent		560
+
 TEXT runtime·sys_umtx_op(SB),NOSPLIT,$-4
-	MOVL	$454, AX
+	MOVL	$SYS__umtx_op, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
@@ -19,7 +55,7 @@
 	RET
 
 TEXT runtime·thr_new(SB),NOSPLIT,$-4
-	MOVL	$455, AX
+	MOVL	$SYS_thr_new, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
@@ -54,7 +90,7 @@
 
 // Exit the entire program (like C exit)
 TEXT runtime·exit(SB),NOSPLIT,$-4
-	MOVL	$1, AX
+	MOVL	$SYS_exit, AX
 	INT	$0x80
 	MOVL	$0xf1, 0xf1  // crash
 	RET
@@ -63,7 +99,7 @@
 DATA exitStack<>+0x00(SB)/4, $0
 DATA exitStack<>+0x04(SB)/4, $0
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVL	wait+0(FP), AX
 	// We're done using the stack.
@@ -72,13 +108,13 @@
 	// on the stack. We want to pass 0, so switch over to a fake
 	// stack of 0s. It won't write to the stack.
 	MOVL	$exitStack<>(SB), SP
-	MOVL	$431, AX	// thr_exit
+	MOVL	$SYS_thr_exit, AX
 	INT	$0x80
 	MOVL	$0xf1, 0xf1  // crash
 	JMP	0(PC)
 
 TEXT runtime·open(SB),NOSPLIT,$-4
-	MOVL	$5, AX
+	MOVL	$SYS_open, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$-1, AX
@@ -86,7 +122,7 @@
 	RET
 
 TEXT runtime·closefd(SB),NOSPLIT,$-4
-	MOVL	$6, AX
+	MOVL	$SYS_close, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$-1, AX
@@ -94,7 +130,7 @@
 	RET
 
 TEXT runtime·read(SB),NOSPLIT,$-4
-	MOVL	$3, AX
+	MOVL	$SYS_read, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX			// caller expects negative errno
@@ -103,7 +139,7 @@
 
 // func pipe2(flags int32) (r, w int32, errno int32)
 TEXT runtime·pipe2(SB),NOSPLIT,$12-16
-	MOVL	$542, AX
+	MOVL	$SYS_pipe2, AX
 	LEAL	r+4(FP), BX
 	MOVL	BX, 4(SP)
 	MOVL	flags+0(FP), BX
@@ -115,7 +151,7 @@
 	RET
 
 TEXT runtime·write1(SB),NOSPLIT,$-4
-	MOVL	$4, AX
+	MOVL	$SYS_write, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX			// caller expects negative errno
@@ -126,25 +162,25 @@
 	// thr_self(&0(FP))
 	LEAL	ret+0(FP), AX
 	MOVL	AX, 4(SP)
-	MOVL	$432, AX
+	MOVL	$SYS_thr_self, AX
 	INT	$0x80
 	RET
 
 TEXT runtime·thr_kill(SB),NOSPLIT,$-4
 	// thr_kill(tid, sig)
-	MOVL	$433, AX
+	MOVL	$SYS_thr_kill, AX
 	INT	$0x80
 	RET
 
 TEXT runtime·raiseproc(SB),NOSPLIT,$16
 	// getpid
-	MOVL	$20, AX
+	MOVL	$SYS_getpid, AX
 	INT	$0x80
 	// kill(self, sig)
 	MOVL	AX, 4(SP)
 	MOVL	sig+0(FP), AX
 	MOVL	AX, 8(SP)
-	MOVL	$37, AX
+	MOVL	$SYS_kill, AX
 	INT	$0x80
 	RET
 
@@ -160,7 +196,7 @@
 	MOVSL
 	MOVL	$0, AX	// top 32 bits of file offset
 	STOSL
-	MOVL	$477, AX
+	MOVL	$SYS_mmap, AX
 	INT	$0x80
 	JAE	ok
 	MOVL	$0, p+24(FP)
@@ -172,14 +208,14 @@
 	RET
 
 TEXT runtime·munmap(SB),NOSPLIT,$-4
-	MOVL	$73, AX
+	MOVL	$SYS_munmap, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
 TEXT runtime·madvise(SB),NOSPLIT,$-4
-	MOVL	$75, AX	// madvise
+	MOVL	$SYS_madvise, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$-1, AX
@@ -187,15 +223,15 @@
 	RET
 
 TEXT runtime·setitimer(SB), NOSPLIT, $-4
-	MOVL	$83, AX
+	MOVL	$SYS_setitimer, AX
 	INT	$0x80
 	RET
 
 // func fallback_walltime() (sec int64, nsec int32)
 TEXT runtime·fallback_walltime(SB), NOSPLIT, $32-12
-	MOVL	$232, AX // clock_gettime
+	MOVL	$SYS_clock_gettime, AX
 	LEAL	12(SP), BX
-	MOVL	$0, 4(SP)	// CLOCK_REALTIME
+	MOVL	$CLOCK_REALTIME, 4(SP)
 	MOVL	BX, 8(SP)
 	INT	$0x80
 	MOVL	12(SP), AX	// sec
@@ -209,9 +245,9 @@
 
 // func fallback_nanotime() int64
 TEXT runtime·fallback_nanotime(SB), NOSPLIT, $32-8
-	MOVL	$232, AX
+	MOVL	$SYS_clock_gettime, AX
 	LEAL	12(SP), BX
-	MOVL	$4, 4(SP)	// CLOCK_MONOTONIC
+	MOVL	$CLOCK_MONOTONIC, 4(SP)
 	MOVL	BX, 8(SP)
 	INT	$0x80
 	MOVL	12(SP), AX	// sec
@@ -230,7 +266,7 @@
 
 
 TEXT runtime·asmSigaction(SB),NOSPLIT,$-4
-	MOVL	$416, AX
+	MOVL	$SYS_sigaction, AX
 	INT	$0x80
 	MOVL	AX, ret+12(FP)
 	RET
@@ -267,13 +303,13 @@
 	MOVL	24(SP), AX	// context
 	MOVL	$0, 0(SP)	// syscall gap
 	MOVL	AX, 4(SP)
-	MOVL	$417, AX	// sigreturn(ucontext)
+	MOVL	$SYS_sigreturn, AX
 	INT	$0x80
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
 TEXT runtime·sigaltstack(SB),NOSPLIT,$0
-	MOVL	$53, AX
+	MOVL	$SYS_sigaltstack, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
@@ -293,7 +329,7 @@
 	LEAL	12(SP), AX
 	MOVL	AX, 4(SP)		// arg 1 - rqtp
 	MOVL	$0, 8(SP)		// arg 2 - rmtp
-	MOVL	$240, AX		// sys_nanosleep
+	MOVL	$SYS_nanosleep, AX
 	INT	$0x80
 	RET
 
@@ -352,7 +388,7 @@
 	MOVL	$0, 0(SP)	// syscall gap
 	MOVL	$1, 4(SP)
 	MOVL	AX, 8(SP)
-	MOVL	$165, AX
+	MOVL	$SYS_sysarch, AX
 	INT	$0x80
 	JAE	2(PC)
 	INT	$3
@@ -368,7 +404,7 @@
 	MOVSL				// arg 4 - oldlenp
 	MOVSL				// arg 5 - newp
 	MOVSL				// arg 6 - newlen
-	MOVL	$202, AX		// sys___sysctl
+	MOVL	$SYS___sysctl, AX
 	INT	$0x80
 	JAE	4(PC)
 	NEGL	AX
@@ -379,7 +415,7 @@
 	RET
 
 TEXT runtime·osyield(SB),NOSPLIT,$-4
-	MOVL	$331, AX		// sys_sched_yield
+	MOVL	$SYS_sched_yield, AX
 	INT	$0x80
 	RET
 
@@ -391,7 +427,7 @@
 	MOVL	AX, 8(SP)		// arg 2 - set
 	MOVL	old+8(FP), AX
 	MOVL	AX, 12(SP)		// arg 3 - oset
-	MOVL	$340, AX		// sys_sigprocmask
+	MOVL	$SYS_sigprocmask, AX
 	INT	$0x80
 	JAE	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
@@ -399,7 +435,7 @@
 
 // int32 runtime·kqueue(void);
 TEXT runtime·kqueue(SB),NOSPLIT,$0
-	MOVL	$362, AX
+	MOVL	$SYS_kqueue, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
@@ -408,7 +444,7 @@
 
 // int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int nevents, Timespec *timeout);
 TEXT runtime·kevent(SB),NOSPLIT,$0
-	MOVL	$363, AX
+	MOVL	$SYS_kevent, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
@@ -417,12 +453,12 @@
 
 // int32 runtime·closeonexec(int32 fd);
 TEXT runtime·closeonexec(SB),NOSPLIT,$32
-	MOVL	$92, AX		// fcntl
+	MOVL	$SYS_fcntl, AX
 	// 0(SP) is where the caller PC would be; kernel skips it
 	MOVL	fd+0(FP), BX
 	MOVL	BX, 4(SP)	// fd
-	MOVL	$2, 8(SP)	// F_SETFD
-	MOVL	$1, 12(SP)	// FD_CLOEXEC
+	MOVL	$F_SETFD, 8(SP)
+	MOVL	$FD_CLOEXEC, 12(SP)
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
@@ -430,7 +466,7 @@
 
 // func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
 TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-28
-	MOVL	$487, AX
+	MOVL	$SYS_cpuset_getaffinity, AX
 	INT	$0x80
 	JAE	2(PC)
 	NEGL	AX
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index 94341f6..374e0ab 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -11,13 +11,49 @@
 #include "textflag.h"
 #include "cgo/abi_amd64.h"
 
+#define CLOCK_REALTIME		0
+#define CLOCK_MONOTONIC		4
+#define FD_CLOEXEC		1
+#define F_SETFD			2
+#define AMD64_SET_FSBASE	129
+
+#define SYS_exit		1
+#define SYS_read		3
+#define SYS_write		4
+#define SYS_open		5
+#define SYS_close		6
+#define SYS_getpid		20
+#define SYS_kill		37
+#define SYS_sigaltstack		53
+#define SYS_munmap		73
+#define SYS_madvise		75
+#define SYS_setitimer		83
+#define SYS_fcntl		92
+#define SYS_sysarch		165
+#define SYS___sysctl		202
+#define SYS_clock_gettime	232
+#define SYS_nanosleep		240
+#define SYS_sched_yield		331
+#define SYS_sigprocmask		340
+#define SYS_kqueue		362
+#define SYS_sigaction		416
+#define SYS_thr_exit		431
+#define SYS_thr_self		432
+#define SYS_thr_kill		433
+#define SYS__umtx_op		454
+#define SYS_thr_new		455
+#define SYS_mmap		477
+#define SYS_cpuset_getaffinity	487
+#define SYS_pipe2 		542
+#define SYS_kevent		560
+
 TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
 	MOVQ addr+0(FP), DI
 	MOVL mode+8(FP), SI
 	MOVL val+12(FP), DX
 	MOVQ uaddr1+16(FP), R10
 	MOVQ ut+24(FP), R8
-	MOVL $454, AX
+	MOVL $SYS__umtx_op, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
@@ -27,7 +63,7 @@
 TEXT runtime·thr_new(SB),NOSPLIT,$0
 	MOVQ param+0(FP), DI
 	MOVL size+8(FP), SI
-	MOVL $455, AX
+	MOVL $SYS_thr_new, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
@@ -55,18 +91,18 @@
 // Exit the entire program (like C exit)
 TEXT runtime·exit(SB),NOSPLIT,$-8
 	MOVL	code+0(FP), DI		// arg 1 exit status
-	MOVL	$1, AX
+	MOVL	$SYS_exit, AX
 	SYSCALL
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-8
 	MOVQ	wait+0(FP), AX
 	// We're done using the stack.
 	MOVL	$0, (AX)
 	MOVL	$0, DI		// arg 1 long *state
-	MOVL	$431, AX	// thr_exit
+	MOVL	$SYS_thr_exit, AX
 	SYSCALL
 	MOVL	$0xf1, 0xf1  // crash
 	JMP	0(PC)
@@ -75,7 +111,7 @@
 	MOVQ	name+0(FP), DI		// arg 1 pathname
 	MOVL	mode+8(FP), SI		// arg 2 flags
 	MOVL	perm+12(FP), DX		// arg 3 mode
-	MOVL	$5, AX
+	MOVL	$SYS_open, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$-1, AX
@@ -84,7 +120,7 @@
 
 TEXT runtime·closefd(SB),NOSPLIT,$-8
 	MOVL	fd+0(FP), DI		// arg 1 fd
-	MOVL	$6, AX
+	MOVL	$SYS_close, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$-1, AX
@@ -95,7 +131,7 @@
 	MOVL	fd+0(FP), DI		// arg 1 fd
 	MOVQ	p+8(FP), SI		// arg 2 buf
 	MOVL	n+16(FP), DX		// arg 3 count
-	MOVL	$3, AX
+	MOVL	$SYS_read, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX			// caller expects negative errno
@@ -106,7 +142,7 @@
 TEXT runtime·pipe2(SB),NOSPLIT,$0-20
 	LEAQ	r+8(FP), DI
 	MOVL	flags+0(FP), SI
-	MOVL	$542, AX
+	MOVL	$SYS_pipe2, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
@@ -117,7 +153,7 @@
 	MOVQ	fd+0(FP), DI		// arg 1 fd
 	MOVQ	p+8(FP), SI		// arg 2 buf
 	MOVL	n+16(FP), DX		// arg 3 count
-	MOVL	$4, AX
+	MOVL	$SYS_write, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX			// caller expects negative errno
@@ -127,7 +163,7 @@
 TEXT runtime·thr_self(SB),NOSPLIT,$0-8
 	// thr_self(&0(FP))
 	LEAQ	ret+0(FP), DI	// arg 1
-	MOVL	$432, AX
+	MOVL	$SYS_thr_self, AX
 	SYSCALL
 	RET
 
@@ -135,18 +171,18 @@
 	// thr_kill(tid, sig)
 	MOVQ	tid+0(FP), DI	// arg 1 id
 	MOVQ	sig+8(FP), SI	// arg 2 sig
-	MOVL	$433, AX
+	MOVL	$SYS_thr_kill, AX
 	SYSCALL
 	RET
 
 TEXT runtime·raiseproc(SB),NOSPLIT,$0
 	// getpid
-	MOVL	$20, AX
+	MOVL	$SYS_getpid, AX
 	SYSCALL
 	// kill(self, sig)
 	MOVQ	AX, DI		// arg 1 pid
 	MOVL	sig+0(FP), SI	// arg 2 sig
-	MOVL	$37, AX
+	MOVL	$SYS_kill, AX
 	SYSCALL
 	RET
 
@@ -154,14 +190,14 @@
 	MOVL	mode+0(FP), DI
 	MOVQ	new+8(FP), SI
 	MOVQ	old+16(FP), DX
-	MOVL	$83, AX
+	MOVL	$SYS_setitimer, AX
 	SYSCALL
 	RET
 
 // func fallback_walltime() (sec int64, nsec int32)
 TEXT runtime·fallback_walltime(SB), NOSPLIT, $32-12
-	MOVL	$232, AX	// clock_gettime
-	MOVQ	$0, DI		// CLOCK_REALTIME
+	MOVL	$SYS_clock_gettime, AX
+	MOVQ	$CLOCK_REALTIME, DI
 	LEAQ	8(SP), SI
 	SYSCALL
 	MOVQ	8(SP), AX	// sec
@@ -173,8 +209,8 @@
 	RET
 
 TEXT runtime·fallback_nanotime(SB), NOSPLIT, $32-8
-	MOVL	$232, AX
-	MOVQ	$4, DI		// CLOCK_MONOTONIC
+	MOVL	$SYS_clock_gettime, AX
+	MOVQ	$CLOCK_MONOTONIC, DI
 	LEAQ	8(SP), SI
 	SYSCALL
 	MOVQ	8(SP), AX	// sec
@@ -191,7 +227,7 @@
 	MOVQ	sig+0(FP), DI		// arg 1 sig
 	MOVQ	new+8(FP), SI		// arg 2 act
 	MOVQ	old+16(FP), DX		// arg 3 oact
-	MOVL	$416, AX
+	MOVL	$SYS_sigaction, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$-1, AX
@@ -349,14 +385,14 @@
 	MOVQ	_cgo_callers(SB), AX
 	JMP	AX
 
-TEXT runtime·mmap(SB),NOSPLIT,$0
+TEXT runtime·sysMmap(SB),NOSPLIT,$0
 	MOVQ	addr+0(FP), DI		// arg 1 addr
 	MOVQ	n+8(FP), SI		// arg 2 len
 	MOVL	prot+16(FP), DX		// arg 3 prot
 	MOVL	flags+20(FP), R10		// arg 4 flags
 	MOVL	fd+24(FP), R8		// arg 5 fid
 	MOVL	off+28(FP), R9		// arg 6 offset
-	MOVL	$477, AX
+	MOVL	$SYS_mmap, AX
 	SYSCALL
 	JCC	ok
 	MOVQ	$0, p+32(FP)
@@ -367,20 +403,51 @@
 	MOVQ	$0, err+40(FP)
 	RET
 
-TEXT runtime·munmap(SB),NOSPLIT,$0
+// Call the function stored in _cgo_mmap using the GCC calling convention.
+// This must be called on the system stack.
+TEXT runtime·callCgoMmap(SB),NOSPLIT,$16
+	MOVQ	addr+0(FP), DI
+	MOVQ	n+8(FP), SI
+	MOVL	prot+16(FP), DX
+	MOVL	flags+20(FP), CX
+	MOVL	fd+24(FP), R8
+	MOVL	off+28(FP), R9
+	MOVQ	_cgo_mmap(SB), AX
+	MOVQ	SP, BX
+	ANDQ	$~15, SP	// alignment as per amd64 psABI
+	MOVQ	BX, 0(SP)
+	CALL	AX
+	MOVQ	0(SP), SP
+	MOVQ	AX, ret+32(FP)
+	RET
+
+TEXT runtime·sysMunmap(SB),NOSPLIT,$0
 	MOVQ	addr+0(FP), DI		// arg 1 addr
 	MOVQ	n+8(FP), SI		// arg 2 len
-	MOVL	$73, AX
+	MOVL	$SYS_munmap, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
+// Call the function stored in _cgo_munmap using the GCC calling convention.
+// This must be called on the system stack.
+TEXT runtime·callCgoMunmap(SB),NOSPLIT,$16-16
+	MOVQ	addr+0(FP), DI
+	MOVQ	n+8(FP), SI
+	MOVQ	_cgo_munmap(SB), AX
+	MOVQ	SP, BX
+	ANDQ	$~15, SP	// alignment as per amd64 psABI
+	MOVQ	BX, 0(SP)
+	CALL	AX
+	MOVQ	0(SP), SP
+	RET
+
 TEXT runtime·madvise(SB),NOSPLIT,$0
 	MOVQ	addr+0(FP), DI
 	MOVQ	n+8(FP), SI
 	MOVL	flags+16(FP), DX
-	MOVQ	$75, AX	// madvise
+	MOVQ	$SYS_madvise, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$-1, AX
@@ -390,7 +457,7 @@
 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
 	MOVQ	new+0(FP), DI
 	MOVQ	old+8(FP), SI
-	MOVQ	$53, AX
+	MOVQ	$SYS_sigaltstack, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
@@ -408,7 +475,7 @@
 
 	MOVQ	SP, DI			// arg 1 - rqtp
 	MOVQ	$0, SI			// arg 2 - rmtp
-	MOVL	$240, AX		// sys_nanosleep
+	MOVL	$SYS_nanosleep, AX
 	SYSCALL
 	RET
 
@@ -417,8 +484,8 @@
 	ADDQ	$8, DI	// adjust for ELF: wants to use -8(FS) for g and m
 	MOVQ	DI, 0(SP)
 	MOVQ	SP, SI
-	MOVQ	$129, DI	// AMD64_SET_FSBASE
-	MOVQ	$165, AX	// sysarch
+	MOVQ	$AMD64_SET_FSBASE, DI
+	MOVQ	$SYS_sysarch, AX
 	SYSCALL
 	JCC	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
@@ -431,7 +498,7 @@
 	MOVQ	size+24(FP), R10		// arg 4 - oldlenp
 	MOVQ	dst+32(FP), R8		// arg 5 - newp
 	MOVQ	ndst+40(FP), R9		// arg 6 - newlen
-	MOVQ	$202, AX		// sys___sysctl
+	MOVQ	$SYS___sysctl, AX
 	SYSCALL
 	JCC 4(PC)
 	NEGQ	AX
@@ -442,7 +509,7 @@
 	RET
 
 TEXT runtime·osyield(SB),NOSPLIT,$-4
-	MOVL	$331, AX		// sys_sched_yield
+	MOVL	$SYS_sched_yield, AX
 	SYSCALL
 	RET
 
@@ -450,7 +517,7 @@
 	MOVL	how+0(FP), DI		// arg 1 - how
 	MOVQ	new+8(FP), SI		// arg 2 - set
 	MOVQ	old+16(FP), DX		// arg 3 - oset
-	MOVL	$340, AX		// sys_sigprocmask
+	MOVL	$SYS_sigprocmask, AX
 	SYSCALL
 	JAE	2(PC)
 	MOVL	$0xf1, 0xf1  // crash
@@ -461,7 +528,7 @@
 	MOVQ	$0, DI
 	MOVQ	$0, SI
 	MOVQ	$0, DX
-	MOVL	$362, AX
+	MOVL	$SYS_kqueue, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
@@ -476,7 +543,7 @@
 	MOVQ	ev+24(FP), R10
 	MOVL	nev+32(FP), R8
 	MOVQ	ts+40(FP), R9
-	MOVL	$363, AX
+	MOVL	$SYS_kevent, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
@@ -486,9 +553,9 @@
 // void runtime·closeonexec(int32 fd);
 TEXT runtime·closeonexec(SB),NOSPLIT,$0
 	MOVL	fd+0(FP), DI	// fd
-	MOVQ	$2, SI		// F_SETFD
-	MOVQ	$1, DX		// FD_CLOEXEC
-	MOVL	$92, AX		// fcntl
+	MOVQ	$F_SETFD, SI
+	MOVQ	$FD_CLOEXEC, DX
+	MOVL	$SYS_fcntl, AX
 	SYSCALL
 	RET
 
@@ -499,7 +566,7 @@
 	MOVQ	id+16(FP), DX
 	MOVQ	size+24(FP), R10
 	MOVQ	mask+32(FP), R8
-	MOVL	$487, AX
+	MOVL	$SYS_cpuset_getaffinity, AX
 	SYSCALL
 	JCC	2(PC)
 	NEGQ	AX
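
The mmap/munmap entry points are renamed to sysMmap/sysMunmap and paired with callCgoMmap/callCgoMunmap trampolines, so a C interceptor registered through _cgo_mmap (for example by the race runtime) can observe the runtime's mappings. A rough sketch of the Go-side dispatch this enables, modeled on the runtime's existing cgo mmap support; the exact wiring is not shown in this patch and may differ:

//go:nosplit
func mmapDispatchSketch(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) {
	if _cgo_mmap != nil {
		// Route through the C interceptor on the system stack.
		var ret uintptr
		systemstack(func() {
			ret = callCgoMmap(addr, n, prot, flags, fd, off)
		})
		if ret < 4096 {
			return nil, int(ret) // small values are an errno
		}
		return unsafe.Pointer(ret), 0
	}
	return sysMmap(addr, n, prot, flags, fd, off) // direct syscall, as above
}
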
diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s
index cbee34d..a3fee14 100644
--- a/src/runtime/sys_freebsd_arm.s
+++ b/src/runtime/sys_freebsd_arm.s
@@ -31,7 +31,6 @@
 #define SYS_sched_yield (SYS_BASE + 331)
 #define SYS_sigprocmask (SYS_BASE + 340)
 #define SYS_kqueue (SYS_BASE + 362)
-#define SYS_kevent (SYS_BASE + 363)
 #define SYS_sigaction (SYS_BASE + 416)
 #define SYS_thr_exit (SYS_BASE + 431)
 #define SYS_thr_self (SYS_BASE + 432)
@@ -41,6 +40,7 @@
 #define SYS_mmap (SYS_BASE + 477)
 #define SYS_cpuset_getaffinity (SYS_BASE + 487)
 #define SYS_pipe2 (SYS_BASE + 542)
+#define SYS_kevent (SYS_BASE + 560)
 
 TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
 	MOVW addr+0(FP), R0
@@ -85,7 +85,7 @@
 	MOVW.CS R8, (R8)
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVW	wait+0(FP), R0
 	// We're done using the stack.
diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s
index 5dcdf37..29866cb 100644
--- a/src/runtime/sys_freebsd_arm64.s
+++ b/src/runtime/sys_freebsd_arm64.s
@@ -38,7 +38,6 @@
 #define SYS_sched_yield		331
 #define SYS_sigprocmask		340
 #define SYS_kqueue		362
-#define SYS_kevent		363
 #define SYS_sigaction		416
 #define SYS_thr_exit		431
 #define SYS_thr_self		432
@@ -48,6 +47,7 @@
 #define SYS_mmap		477
 #define SYS_cpuset_getaffinity	487
 #define SYS_pipe2 		542
+#define SYS_kevent		560
 
 TEXT emptyfunc<>(SB),0,$0-0
 	RET
@@ -99,7 +99,7 @@
 	MOVD	$0, R0
 	MOVD	R0, (R0)
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVD	wait+0(FP), R0
 	// We're done using the stack.
@@ -460,7 +460,7 @@
 	BEQ	3(PC)
 
 	// get CNTPCT (Physical Count Register) into R0
-	MRS	CNTPCT_EL0, R0 // SIGILL
+	MRS	CNTPCT_EL0, R0
 	B	2(PC)
 
 	// get CNTVCT (Virtual Count Register) into R0
diff --git a/src/runtime/sys_freebsd_riscv64.s b/src/runtime/sys_freebsd_riscv64.s
new file mode 100644
index 0000000..30deed2
--- /dev/null
+++ b/src/runtime/sys_freebsd_riscv64.s
@@ -0,0 +1,436 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// System calls and other sys.stuff for riscv64, FreeBSD
+// /usr/src/sys/kern/syscalls.master for syscall numbers.
+//
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define CLOCK_REALTIME		0
+#define CLOCK_MONOTONIC		4
+#define FD_CLOEXEC		1
+#define F_SETFD			2
+#define F_GETFL			3
+#define F_SETFL			4
+#define O_NONBLOCK		4
+
+#define SYS_exit		1
+#define SYS_read		3
+#define SYS_write		4
+#define SYS_open		5
+#define SYS_close		6
+#define SYS_getpid		20
+#define SYS_kill		37
+#define SYS_sigaltstack		53
+#define SYS_munmap		73
+#define SYS_madvise		75
+#define SYS_setitimer		83
+#define SYS_fcntl		92
+#define SYS___sysctl		202
+#define SYS_nanosleep		240
+#define SYS_clock_gettime	232
+#define SYS_sched_yield		331
+#define SYS_sigprocmask		340
+#define SYS_kqueue		362
+#define SYS_sigaction		416
+#define SYS_thr_exit		431
+#define SYS_thr_self		432
+#define SYS_thr_kill		433
+#define SYS__umtx_op		454
+#define SYS_thr_new		455
+#define SYS_mmap		477
+#define SYS_cpuset_getaffinity	487
+#define SYS_pipe2 		542
+#define SYS_kevent		560
+
+TEXT emptyfunc<>(SB),0,$0-0
+	RET
+
+// func sys_umtx_op(addr *uint32, mode int32, val uint32, uaddr1 uintptr, ut *umtx_time) int32
+TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
+	MOV	addr+0(FP), A0
+	MOVW	mode+8(FP), A1
+	MOVW	val+12(FP), A2
+	MOV	uaddr1+16(FP), A3
+	MOV	ut+24(FP), A4
+	MOV	$SYS__umtx_op, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+32(FP)
+	RET
+
+// func thr_new(param *thrparam, size int32) int32
+TEXT runtime·thr_new(SB),NOSPLIT,$0
+	MOV	param+0(FP), A0
+	MOVW	size+8(FP), A1
+	MOV	$SYS_thr_new, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+16(FP)
+	RET
+
+// func thr_start()
+TEXT runtime·thr_start(SB),NOSPLIT,$0
+	// set up g
+	MOV	m_g0(A0), g
+	MOV	A0, g_m(g)
+	CALL	emptyfunc<>(SB)	 // fault if stack check is wrong
+	CALL	runtime·mstart(SB)
+
+	WORD	$0	// crash
+	RET
+
+// func exit(code int32)
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), A0
+	MOV	$SYS_exit, T0
+	ECALL
+	WORD	$0	// crash
+
+// func exitThread(wait *atomic.Uint32)
+TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
+	MOV	wait+0(FP), A0
+	// We're done using the stack.
+	FENCE
+	MOVW	ZERO, (A0)
+	FENCE
+	MOV	$0, A0	// exit code
+	MOV	$SYS_thr_exit, T0
+	ECALL
+	JMP	0(PC)
+
+// func open(name *byte, mode, perm int32) int32
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
+	MOV	name+0(FP), A0
+	MOVW	mode+8(FP), A1
+	MOVW	perm+12(FP), A2
+	MOV	$SYS_open, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+16(FP)
+	RET
+
+// func closefd(fd int32) int32
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	fd+0(FP), A0
+	MOV	$SYS_close, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOV	$r+8(FP), A0
+	MOVW	flags+0(FP), A1
+	MOV	$SYS_pipe2, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, errno+16(FP)
+	RET
+
+// func write1(fd uintptr, p unsafe.Pointer, n int32) int32
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
+	MOV	fd+0(FP), A0
+	MOV	p+8(FP), A1
+	MOVW	n+16(FP), A2
+	MOV	$SYS_write, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func read(fd int32, p unsafe.Pointer, n int32) int32
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	fd+0(FP), A0
+	MOV	p+8(FP), A1
+	MOVW	n+16(FP), A2
+	MOV	$SYS_read, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func usleep(usec uint32)
+TEXT runtime·usleep(SB),NOSPLIT,$24-4
+	MOVWU	usec+0(FP), A0
+	MOV	$1000, A1
+	MUL	A1, A0, A0
+	MOV	$1000000000, A1
+	DIV	A1, A0, A2
+	MOV	A2, 8(X2)
+	REM	A1, A0, A3
+	MOV	A3, 16(X2)
+	ADD	$8, X2, A0
+	MOV	ZERO, A1
+	MOV	$SYS_nanosleep, T0
+	ECALL
+	RET
+
+// func thr_self() thread
+TEXT runtime·thr_self(SB),NOSPLIT,$8-8
+	MOV	$ptr-8(SP), A0	// arg 1 &8(SP)
+	MOV	$SYS_thr_self, T0
+	ECALL
+	MOV	ptr-8(SP), A0
+	MOV	A0, ret+0(FP)
+	RET
+
+// func thr_kill(t thread, sig int)
+TEXT runtime·thr_kill(SB),NOSPLIT,$0-16
+	MOV	tid+0(FP), A0	// arg 1 pid
+	MOV	sig+8(FP), A1	// arg 2 sig
+	MOV	$SYS_thr_kill, T0
+	ECALL
+	RET
+
+// func raiseproc(sig uint32)
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
+	MOV	$SYS_getpid, T0
+	ECALL
+	// arg 1 pid - already in A0
+	MOVW	sig+0(FP), A1	// arg 2
+	MOV	$SYS_kill, T0
+	ECALL
+	RET
+
+// func setitimer(mode int32, new, old *itimerval)
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	mode+0(FP), A0
+	MOV	new+8(FP), A1
+	MOV	old+16(FP), A2
+	MOV	$SYS_setitimer, T0
+	ECALL
+	RET
+
+// func fallback_walltime() (sec int64, nsec int32)
+TEXT runtime·fallback_walltime(SB),NOSPLIT,$24-12
+	MOV	$CLOCK_REALTIME, A0
+	MOV	$8(X2), A1
+	MOV	$SYS_clock_gettime, T0
+	ECALL
+	MOV	8(X2), T0	// sec
+	MOVW	16(X2), T1	// nsec
+	MOV	T0, sec+0(FP)
+	MOVW	T1, nsec+8(FP)
+	RET
+
+// func fallback_nanotime() int64
+TEXT runtime·fallback_nanotime(SB),NOSPLIT,$24-8
+	MOV	$CLOCK_MONOTONIC, A0
+	MOV	$8(X2), A1
+	MOV	$SYS_clock_gettime, T0
+	ECALL
+	MOV	8(X2), T0	// sec
+	MOV	16(X2), T1	// nsec
+
+	// sec is in T0, nsec in T1
+	// return nsec in T0
+	MOV	$1000000000, T2
+	MUL	T2, T0
+	ADD	T1, T0
+
+	MOV	T0, ret+0(FP)
+	RET
+
+// func asmSigaction(sig uintptr, new, old *sigactiont) int32
+TEXT runtime·asmSigaction(SB),NOSPLIT|NOFRAME,$0
+	MOV	sig+0(FP), A0		// arg 1 sig
+	MOV	new+8(FP), A1		// arg 2 act
+	MOV	old+16(FP), A2		// arg 3 oact
+	MOV	$SYS_sigaction, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), A0
+	MOV	info+16(FP), A1
+	MOV	ctx+24(FP), A2
+	MOV	fn+0(FP), T1
+	JALR	RA, T1
+	RET
+
+// func sigtramp(signo, ureg, ctxt unsafe.Pointer)
+TEXT runtime·sigtramp(SB),NOSPLIT,$64
+	MOVW	A0, 8(X2)
+	MOV	A1, 16(X2)
+	MOV	A2, 24(X2)
+
+	// this might be called in external code context,
+	// where g is not set.
+	MOVBU	runtime·iscgo(SB), A0
+	BEQ	A0, ZERO, ok
+	CALL	runtime·load_g(SB)
+ok:
+	MOV	$runtime·sigtrampgo(SB), A0
+	JALR	RA, A0
+	RET
+
+// func mmap(addr uintptr, n uintptr, prot int, flags int, fd int, off int64) (ret uintptr, err error)
+TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOVW	prot+16(FP), A2
+	MOVW	flags+20(FP), A3
+	MOVW	fd+24(FP), A4
+	MOVW	off+28(FP), A5
+	MOV	$SYS_mmap, T0
+	ECALL
+	BNE	T0, ZERO, fail
+	MOV	A0, p+32(FP)
+	MOV	ZERO, err+40(FP)
+	RET
+fail:
+	MOV	ZERO, p+32(FP)
+	MOV	A0, err+40(FP)
+	RET
+
+// func munmap(addr uintptr, n uintptr) (err error)
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOV	$SYS_munmap, T0
+	ECALL
+	BNE	T0, ZERO, fail
+	RET
+fail:
+	WORD	$0	// crash
+
+// func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOVW	flags+16(FP), A2
+	MOV	$SYS_madvise, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
+TEXT runtime·sysctl(SB),NOSPLIT,$0
+	MOV	mib+0(FP), A0
+	MOV	miblen+8(FP), A1
+	MOV	out+16(FP), A2
+	MOV	size+24(FP), A3
+	MOV	dst+32(FP), A4
+	MOV	ndst+40(FP), A5
+	MOV	$SYS___sysctl, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+48(FP)
+	RET
+
+// func sigaltstack(new, old *stackt)
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
+	MOV	new+0(FP), A0
+	MOV	old+8(FP), A1
+	MOV	$SYS_sigaltstack, T0
+	ECALL
+	BNE	T0, ZERO, fail
+	RET
+fail:
+	WORD	$0	// crash
+
+// func osyield()
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
+	MOV	$SYS_sched_yield, T0
+	ECALL
+	RET
+
+// func sigprocmask(how int32, new, old *sigset)
+TEXT runtime·sigprocmask(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	how+0(FP), A0
+	MOV	new+8(FP), A1
+	MOV	old+16(FP), A2
+	MOV	$SYS_sigprocmask, T0
+	ECALL
+	BNE	T0, ZERO, fail
+	RET
+fail:
+	WORD	$0	// crash
+
+
+// func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+TEXT runtime·cpuset_getaffinity(SB),NOSPLIT|NOFRAME,$0-44
+	MOV	level+0(FP), A0
+	MOV	which+8(FP), A1
+	MOV	id+16(FP), A2
+	MOV	size+24(FP), A3
+	MOV	mask+32(FP), A4
+	MOV	$SYS_cpuset_getaffinity, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+40(FP)
+	RET
+
+// func kqueue() int32
+TEXT runtime·kqueue(SB),NOSPLIT|NOFRAME,$0
+	MOV $SYS_kqueue, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	MOV	$-1, A0
+ok:
+	MOVW	A0, ret+0(FP)
+	RET
+
+// func kevent(kq int, ch unsafe.Pointer, nch int, ev unsafe.Pointer, nev int, ts *Timespec) (n int, err error)
+TEXT runtime·kevent(SB),NOSPLIT,$0
+	MOVW	kq+0(FP), A0
+	MOV	ch+8(FP), A1
+	MOVW	nch+16(FP), A2
+	MOV	ev+24(FP), A3
+	MOVW	nev+32(FP), A4
+	MOV	ts+40(FP), A5
+	MOV	$SYS_kevent, T0
+	ECALL
+	BEQ	T0, ZERO, ok
+	NEG	A0, A0
+ok:
+	MOVW	A0, ret+48(FP)
+	RET
+
+// func closeonexec(fd int32)
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
+	MOVW	fd+0(FP), A0
+	MOV	$F_SETFD, A1
+	MOV	$FD_CLOEXEC, A2
+	MOV	$SYS_fcntl, T0
+	ECALL
+	RET
+
+// func getCntxct() uint32
+TEXT runtime·getCntxct(SB),NOSPLIT|NOFRAME,$0
+	RDTIME	A0
+	MOVW	A0, ret+0(FP)
+	RET
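
In the new riscv64 port, usleep converts a microsecond count into the timespec that nanosleep expects (whole seconds plus remaining nanoseconds). The same arithmetic in Go, for reference only:

// usec -> {sec, nsec}, mirroring the MUL/DIV/REM sequence in usleep above.
func usecToTimespec(usec uint32) (sec, nsec int64) {
	ns := int64(usec) * 1000
	return ns / 1_000_000_000, ns % 1_000_000_000
}
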
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 4942f21..12a2941 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -33,7 +33,6 @@
 #define SYS_access		33
 #define SYS_kill		37
 #define SYS_brk 		45
-#define SYS_fcntl		55
 #define SYS_munmap		91
 #define SYS_socketcall		102
 #define SYS_setittimer		104
@@ -52,15 +51,11 @@
 #define SYS_sched_getaffinity	242
 #define SYS_set_thread_area	243
 #define SYS_exit_group		252
-#define SYS_epoll_create	254
-#define SYS_epoll_ctl		255
-#define SYS_epoll_wait		256
 #define SYS_timer_create	259
 #define SYS_timer_settime	260
 #define SYS_timer_delete	263
 #define SYS_clock_gettime	265
 #define SYS_tgkill		270
-#define SYS_epoll_create1	329
 #define SYS_pipe2		331
 
 TEXT runtime·exit(SB),NOSPLIT,$0
@@ -77,7 +72,7 @@
 	INT $3	// not reached
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVL	wait+0(FP), AX
 	// We're done using the stack.
@@ -726,53 +721,6 @@
 	MOVL	AX, ret+12(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT,$0
-	MOVL    $SYS_epoll_create, AX
-	MOVL	size+0(FP), BX
-	INVOKE_SYSCALL
-	MOVL	AX, ret+4(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT,$0
-	MOVL    $SYS_epoll_create1, AX
-	MOVL	flags+0(FP), BX
-	INVOKE_SYSCALL
-	MOVL	AX, ret+4(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT,$0
-	MOVL	$SYS_epoll_ctl, AX
-	MOVL	epfd+0(FP), BX
-	MOVL	op+4(FP), CX
-	MOVL	fd+8(FP), DX
-	MOVL	ev+12(FP), SI
-	INVOKE_SYSCALL
-	MOVL	AX, ret+16(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT,$0
-	MOVL	$SYS_epoll_wait, AX
-	MOVL	epfd+0(FP), BX
-	MOVL	ev+4(FP), CX
-	MOVL	nev+8(FP), DX
-	MOVL	timeout+12(FP), SI
-	INVOKE_SYSCALL
-	MOVL	AX, ret+16(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT,$0
-	MOVL	$SYS_fcntl, AX
-	MOVL	fd+0(FP), BX  // fd
-	MOVL	$2, CX  // F_SETFD
-	MOVL	$1, DX  // FD_CLOEXEC
-	INVOKE_SYSCALL
-	RET
-
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0
 	MOVL	$SYS_access, AX
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index ca6ecb1..c7a89ba 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -33,24 +33,19 @@
 #define SYS_clone		56
 #define SYS_exit		60
 #define SYS_kill		62
-#define SYS_fcntl		72
 #define SYS_sigaltstack 	131
 #define SYS_arch_prctl		158
 #define SYS_gettid		186
 #define SYS_futex		202
 #define SYS_sched_getaffinity	204
-#define SYS_epoll_create	213
 #define SYS_timer_create	222
 #define SYS_timer_settime	223
 #define SYS_timer_delete	226
 #define SYS_clock_gettime	228
 #define SYS_exit_group		231
-#define SYS_epoll_ctl		233
 #define SYS_tgkill		234
 #define SYS_openat		257
 #define SYS_faccessat		269
-#define SYS_epoll_pwait		281
-#define SYS_epoll_create1	291
 #define SYS_pipe2		293
 
 TEXT runtime·exit(SB),NOSPLIT,$0-4
@@ -59,7 +54,7 @@
 	SYSCALL
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-8
 	MOVQ	wait+0(FP), AX
 	// We're done using the stack.
@@ -666,55 +661,6 @@
 	MOVL	AX, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT,$0
-	MOVL    size+0(FP), DI
-	MOVL    $SYS_epoll_create, AX
-	SYSCALL
-	MOVL	AX, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT,$0
-	MOVL	flags+0(FP), DI
-	MOVL	$SYS_epoll_create1, AX
-	SYSCALL
-	MOVL	AX, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT,$0
-	MOVL	epfd+0(FP), DI
-	MOVL	op+4(FP), SI
-	MOVL	fd+8(FP), DX
-	MOVQ	ev+16(FP), R10
-	MOVL	$SYS_epoll_ctl, AX
-	SYSCALL
-	MOVL	AX, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT,$0
-	// This uses pwait instead of wait, because Android O blocks wait.
-	MOVL	epfd+0(FP), DI
-	MOVQ	ev+8(FP), SI
-	MOVL	nev+16(FP), DX
-	MOVL	timeout+20(FP), R10
-	MOVQ	$0, R8
-	MOVL	$SYS_epoll_pwait, AX
-	SYSCALL
-	MOVL	AX, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT,$0
-	MOVL    fd+0(FP), DI  // fd
-	MOVQ    $2, SI  // F_SETFD
-	MOVQ    $1, DX  // FD_CLOEXEC
-	MOVL	$SYS_fcntl, AX
-	SYSCALL
-	RET
-
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0
 	// This uses faccessat instead of access, because Android O blocks access.
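
Across the Linux ports, the per-arch epollcreate/epollcreate1/epollctl/epollwait and closeonexec assembly stubs are deleted; the netpoller presumably reaches epoll through Go-level syscall helpers now, so the raw stubs are no longer needed (this patch only shows the removals). For reference, the same operations via the public syscall package:

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// Equivalent of the removed epollcreate1 stub; EPOLL_CLOEXEC replaces
	// the separate closeonexec fcntl call.
	epfd, err := syscall.EpollCreate1(syscall.EPOLL_CLOEXEC)
	if err != nil {
		panic(err)
	}
	defer syscall.Close(epfd)

	// Equivalent of epollctl: watch stdin (fd 0) for readability.
	ev := syscall.EpollEvent{Events: syscall.EPOLLIN, Fd: 0}
	if err := syscall.EpollCtl(epfd, syscall.EPOLL_CTL_ADD, 0, &ev); err != nil {
		panic(err)
	}

	// Equivalent of epollwait, with a 100ms timeout.
	events := make([]syscall.EpollEvent, 8)
	n, err := syscall.EpollWait(epfd, events, 100)
	fmt.Println("ready:", n, "err:", err)
}
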
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 66bf403..7b8c4f0 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -41,15 +41,10 @@
 #define SYS_nanosleep (SYS_BASE + 162)
 #define SYS_sched_getaffinity (SYS_BASE + 242)
 #define SYS_clock_gettime (SYS_BASE + 263)
-#define SYS_epoll_create (SYS_BASE + 250)
-#define SYS_epoll_ctl (SYS_BASE + 251)
-#define SYS_epoll_wait (SYS_BASE + 252)
 #define SYS_timer_create (SYS_BASE + 257)
 #define SYS_timer_settime (SYS_BASE + 258)
 #define SYS_timer_delete (SYS_BASE + 261)
-#define SYS_epoll_create1 (SYS_BASE + 357)
 #define SYS_pipe2 (SYS_BASE + 359)
-#define SYS_fcntl (SYS_BASE + 55)
 #define SYS_access (SYS_BASE + 33)
 #define SYS_connect (SYS_BASE + 283)
 #define SYS_socket (SYS_BASE + 281)
@@ -122,7 +117,7 @@
 	MOVW	$1003, R1
 	MOVW	R0, (R1)	// fail hard
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	wait+0(FP), R0
 	// We're done using the stack.
@@ -262,73 +257,105 @@
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·walltime(SB),NOSPLIT,$8-12
+// Call a VDSO function.
+//
+// R0-R3: arguments to VDSO function (C calling convention)
+// R4: uintptr function to call
+//
+// There is no return value.
+TEXT runtime·vdsoCall(SB),NOSPLIT,$8-0
+	// R0-R3 may be arguments to fn, do not touch.
+	// R4 is function to call.
+	// R5-R9 are available as locals. They are unchanged by the C call
+	// (callee-save).
+
 	// We don't know how much stack space the VDSO code will need,
 	// so switch to g0.
 
 	// Save old SP. Use R13 instead of SP to avoid linker rewriting the offsets.
-	MOVW	R13, R4	// R4 is unchanged by C code.
+	MOVW	R13, R5
 
-	MOVW	g_m(g), R5 // R5 is unchanged by C code.
+	MOVW	g_m(g), R6
 
 	// Set vdsoPC and vdsoSP for SIGPROF traceback.
 	// Save the old values on stack and restore them on exit,
 	// so this function is reentrant.
-	MOVW	m_vdsoPC(R5), R1
-	MOVW	m_vdsoSP(R5), R2
-	MOVW	R1, 4(R13)
-	MOVW	R2, 8(R13)
+	MOVW	m_vdsoPC(R6), R7
+	MOVW	m_vdsoSP(R6), R8
+	MOVW	R7, 4(R13)
+	MOVW	R8, 8(R13)
 
-	MOVW	$ret-4(FP), R2 // caller's SP
-	MOVW	LR, m_vdsoPC(R5)
-	MOVW	R2, m_vdsoSP(R5)
+	MOVW	$sp-4(FP), R7 // caller's SP
+	MOVW	LR, m_vdsoPC(R6)
+	MOVW	R7, m_vdsoSP(R6)
 
-	MOVW	m_curg(R5), R0
+	MOVW	m_curg(R6), R7
 
-	CMP	g, R0		// Only switch if on curg.
+	CMP	g, R7		// Only switch if on curg.
 	B.NE	noswitch
 
-	MOVW	m_g0(R5), R0
-	MOVW	(g_sched+gobuf_sp)(R0), R13	 // Set SP to g0 stack
+	MOVW	m_g0(R6), R7
+	MOVW	(g_sched+gobuf_sp)(R7), R13	 // Set SP to g0 stack
 
 noswitch:
-	SUB	$24, R13	// Space for results
 	BIC	$0x7, R13	// Align for C code
 
-	MOVW	$CLOCK_REALTIME, R0
-	MOVW	$8(R13), R1	// timespec
-	MOVW	runtime·vdsoClockgettimeSym(SB), R2
-	CMP	$0, R2
-	B.EQ	fallback
-
 	// Store g on gsignal's stack, so if we receive a signal
 	// during VDSO code we can find the g.
-	// If we don't have a signal stack, we won't receive signal,
-	// so don't bother saving g.
-	// When using cgo, we already saved g on TLS, also don't save
-	// g here.
-	// Also don't save g if we are already on the signal stack.
-	// We won't get a nested signal.
-	MOVB	runtime·iscgo(SB), R6
-	CMP	$0, R6
+
+	// When using cgo, we already saved g on TLS, also don't save g here.
+	MOVB	runtime·iscgo(SB), R7
+	CMP	$0, R7
 	BNE	nosaveg
-	MOVW	m_gsignal(R5), R6          // g.m.gsignal
-	CMP	$0, R6
+	// If we don't have a signal stack, we won't receive signal, so don't
+	// bother saving g.
+	MOVW	m_gsignal(R6), R7          // g.m.gsignal
+	CMP	$0, R7
 	BEQ	nosaveg
-	CMP	g, R6
+	// Don't save g if we are already on the signal stack, as we won't get
+	// a nested signal.
+	CMP	g, R7
 	BEQ	nosaveg
-	MOVW	(g_stack+stack_lo)(R6), R6 // g.m.gsignal.stack.lo
-	MOVW	g, (R6)
+	// If we don't have a signal stack, we won't receive signal, so don't
+	// bother saving g.
+	MOVW	(g_stack+stack_lo)(R7), R7 // g.m.gsignal.stack.lo
+	CMP	$0, R7
+	BEQ	nosaveg
+	MOVW	g, (R7)
 
-	BL	(R2)
+	BL	(R4)
 
-	MOVW	$0, R1
-	MOVW	R1, (R6) // clear g slot, R6 is unchanged by C code
+	MOVW	$0, R8
+	MOVW	R8, (R7) // clear g slot
 
 	JMP	finish
 
 nosaveg:
-	BL	(R2)
+	BL	(R4)
+
+finish:
+	MOVW	R5, R13		// Restore real SP
+	// Restore vdsoPC, vdsoSP
+	// We don't worry about being signaled between the two stores.
+	// If we are not in a signal handler, we'll restore vdsoSP to 0,
+	// and no one will care about vdsoPC. If we are in a signal handler,
+	// we cannot receive another signal.
+	MOVW	8(R13), R7
+	MOVW	R7, m_vdsoSP(R6)
+	MOVW	4(R13), R7
+	MOVW	R7, m_vdsoPC(R6)
+	RET
+
+TEXT runtime·walltime(SB),NOSPLIT,$12-12
+	MOVW	$CLOCK_REALTIME, R0
+	MOVW	$spec-12(SP), R1	// timespec
+
+	MOVW	runtime·vdsoClockgettimeSym(SB), R4
+	CMP	$0, R4
+	B.EQ	fallback
+
+	BL	runtime·vdsoCall(SB)
+
 	JMP	finish
 
 fallback:
@@ -336,19 +363,8 @@
 	SWI	$0
 
 finish:
-	MOVW	8(R13), R0  // sec
-	MOVW	12(R13), R2  // nsec
-
-	MOVW	R4, R13		// Restore real SP
-	// Restore vdsoPC, vdsoSP
-	// We don't worry about being signaled between the two stores.
-	// If we are not in a signal handler, we'll restore vdsoSP to 0,
-	// and no one will care about vdsoPC. If we are in a signal handler,
-	// we cannot receive another signal.
-	MOVW	8(R13), R1
-	MOVW	R1, m_vdsoSP(R5)
-	MOVW	4(R13), R1
-	MOVW	R1, m_vdsoPC(R5)
+	MOVW	sec-12(SP), R0  // sec
+	MOVW	nsec-8(SP), R2  // nsec
 
 	MOVW	R0, sec_lo+0(FP)
 	MOVW	$0, R1
@@ -356,73 +372,17 @@
 	MOVW	R2, nsec+8(FP)
 	RET
 
-// int64 nanotime1(void)
-TEXT runtime·nanotime1(SB),NOSPLIT,$8-8
-	// Switch to g0 stack. See comment above in runtime·walltime.
-
-	// Save old SP. Use R13 instead of SP to avoid linker rewriting the offsets.
-	MOVW	R13, R4	// R4 is unchanged by C code.
-
-	MOVW	g_m(g), R5 // R5 is unchanged by C code.
-
-	// Set vdsoPC and vdsoSP for SIGPROF traceback.
-	// Save the old values on stack and restore them on exit,
-	// so this function is reentrant.
-	MOVW	m_vdsoPC(R5), R1
-	MOVW	m_vdsoSP(R5), R2
-	MOVW	R1, 4(R13)
-	MOVW	R2, 8(R13)
-
-	MOVW	$ret-4(FP), R2 // caller's SP
-	MOVW	LR, m_vdsoPC(R5)
-	MOVW	R2, m_vdsoSP(R5)
-
-	MOVW	m_curg(R5), R0
-
-	CMP	g, R0		// Only switch if on curg.
-	B.NE	noswitch
-
-	MOVW	m_g0(R5), R0
-	MOVW	(g_sched+gobuf_sp)(R0), R13	// Set SP to g0 stack
-
-noswitch:
-	SUB	$24, R13	// Space for results
-	BIC	$0x7, R13	// Align for C code
-
+// func nanotime1() int64
+TEXT runtime·nanotime1(SB),NOSPLIT,$12-8
 	MOVW	$CLOCK_MONOTONIC, R0
-	MOVW	$8(R13), R1	// timespec
-	MOVW	runtime·vdsoClockgettimeSym(SB), R2
-	CMP	$0, R2
+	MOVW	$spec-12(SP), R1	// timespec
+
+	MOVW	runtime·vdsoClockgettimeSym(SB), R4
+	CMP	$0, R4
 	B.EQ	fallback
 
-	// Store g on gsignal's stack, so if we receive a signal
-	// during VDSO code we can find the g.
-	// If we don't have a signal stack, we won't receive signal,
-	// so don't bother saving g.
-	// When using cgo, we already saved g on TLS, also don't save
-	// g here.
-	// Also don't save g if we are already on the signal stack.
-	// We won't get a nested signal.
-	MOVB	runtime·iscgo(SB), R6
-	CMP	$0, R6
-	BNE	nosaveg
-	MOVW	m_gsignal(R5), R6          // g.m.gsignal
-	CMP	$0, R6
-	BEQ	nosaveg
-	CMP	g, R6
-	BEQ	nosaveg
-	MOVW	(g_stack+stack_lo)(R6), R6 // g.m.gsignal.stack.lo
-	MOVW	g, (R6)
+	BL	runtime·vdsoCall(SB)
 
-	BL	(R2)
-
-	MOVW	$0, R1
-	MOVW	R1, (R6) // clear g slot, R6 is unchanged by C code
-
-	JMP	finish
-
-nosaveg:
-	BL	(R2)
 	JMP	finish
 
 fallback:
@@ -430,19 +390,8 @@
 	SWI	$0
 
 finish:
-	MOVW	8(R13), R0	// sec
-	MOVW	12(R13), R2	// nsec
-
-	MOVW	R4, R13		// Restore real SP
-	// Restore vdsoPC, vdsoSP
-	// We don't worry about being signaled between the two stores.
-	// If we are not in a signal handler, we'll restore vdsoSP to 0,
-	// and no one will care about vdsoPC. If we are in a signal handler,
-	// we cannot receive another signal.
-	MOVW	8(R13), R4
-	MOVW	R4, m_vdsoSP(R5)
-	MOVW	4(R13), R4
-	MOVW	R4, m_vdsoPC(R5)
+	MOVW	sec-12(SP), R0  // sec
+	MOVW	nsec-8(SP), R2  // nsec
 
 	MOVW	$1000000000, R3
 	MULLU	R0, R3, (R1, R0)
@@ -451,6 +400,7 @@
 
 	MOVW	R0, ret_lo+0(FP)
 	MOVW	R1, ret_hi+4(FP)
+
 	RET
 
 // int32 futex(int32 *uaddr, int32 op, int32 val,
@@ -661,53 +611,6 @@
 	MOVW	R0, ret+12(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size)
-TEXT runtime·epollcreate(SB),NOSPLIT,$0
-	MOVW	size+0(FP), R0
-	MOVW	$SYS_epoll_create, R7
-	SWI	$0
-	MOVW	R0, ret+4(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags)
-TEXT runtime·epollcreate1(SB),NOSPLIT,$0
-	MOVW	flags+0(FP), R0
-	MOVW	$SYS_epoll_create1, R7
-	SWI	$0
-	MOVW	R0, ret+4(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT,$0
-	MOVW	epfd+0(FP), R0
-	MOVW	op+4(FP), R1
-	MOVW	fd+8(FP), R2
-	MOVW	ev+12(FP), R3
-	MOVW	$SYS_epoll_ctl, R7
-	SWI	$0
-	MOVW	R0, ret+16(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout)
-TEXT runtime·epollwait(SB),NOSPLIT,$0
-	MOVW	epfd+0(FP), R0
-	MOVW	ev+4(FP), R1
-	MOVW	nev+8(FP), R2
-	MOVW	timeout+12(FP), R3
-	MOVW	$SYS_epoll_wait, R7
-	SWI	$0
-	MOVW	R0, ret+16(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd)
-TEXT runtime·closeonexec(SB),NOSPLIT,$0
-	MOVW	fd+0(FP), R0	// fd
-	MOVW	$2, R1	// F_SETFD
-	MOVW	$1, R2	// FD_CLOEXEC
-	MOVW	$SYS_fcntl, R7
-	SWI	$0
-	RET
-
 // b __kuser_get_tls @ 0xffff0fe0
 TEXT runtime·read_tls_fallback(SB),NOSPLIT|NOFRAME,$0
 	MOVW	$0xffff0fe0, R0
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index b47b6fd..38ff6ac 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -22,7 +22,6 @@
 #define SYS_openat		56
 #define SYS_close		57
 #define SYS_pipe2		59
-#define SYS_fcntl		25
 #define SYS_nanosleep		101
 #define SYS_mmap		222
 #define SYS_munmap		215
@@ -42,9 +41,6 @@
 #define SYS_futex		98
 #define SYS_sched_getaffinity	123
 #define SYS_exit_group		94
-#define SYS_epoll_create1	20
-#define SYS_epoll_ctl		21
-#define SYS_epoll_pwait		22
 #define SYS_clock_gettime	113
 #define SYS_faccessat		48
 #define SYS_socket		198
@@ -60,7 +56,7 @@
 	SVC
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVD	wait+0(FP), R0
 	// We're done using the stack.
@@ -762,54 +758,6 @@
 	MOVW	R0, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOVW	$0, R0
-	MOVD	$SYS_epoll_create1, R8
-	SVC
-	MOVW	R0, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), R0
-	MOVD	$SYS_epoll_create1, R8
-	SVC
-	MOVW	R0, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R0
-	MOVW	op+4(FP), R1
-	MOVW	fd+8(FP), R2
-	MOVD	ev+16(FP), R3
-	MOVD	$SYS_epoll_ctl, R8
-	SVC
-	MOVW	R0, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R0
-	MOVD	ev+8(FP), R1
-	MOVW	nev+16(FP), R2
-	MOVW	timeout+20(FP), R3
-	MOVD	$0, R4
-	MOVD	$SYS_epoll_pwait, R8
-	SVC
-	MOVW	R0, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW	fd+0(FP), R0  // fd
-	MOVD	$2, R1	// F_SETFD
-	MOVD	$1, R2	// FD_CLOEXEC
-	MOVD	$SYS_fcntl, R8
-	SVC
-	RET
-
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0-20
 	MOVD	$AT_FDCWD, R0
diff --git a/src/runtime/sys_linux_loong64.s b/src/runtime/sys_linux_loong64.s
index 36a92df..9ce5e72 100644
--- a/src/runtime/sys_linux_loong64.s
+++ b/src/runtime/sys_linux_loong64.s
@@ -18,7 +18,6 @@
 #define SYS_close		57
 #define SYS_getpid		172
 #define SYS_kill		129
-#define SYS_fcntl		25
 #define SYS_mmap		222
 #define SYS_munmap		215
 #define SYS_setitimer		103
@@ -35,12 +34,9 @@
 #define SYS_futex		98
 #define SYS_sched_getaffinity	123
 #define SYS_exit_group		94
-#define SYS_epoll_ctl		21
 #define SYS_tgkill		131
 #define SYS_openat		56
-#define SYS_epoll_pwait		22
 #define SYS_clock_gettime	113
-#define SYS_epoll_create1	20
 #define SYS_brk			214
 #define SYS_pipe2		59
 #define SYS_timer_create	107
@@ -53,7 +49,7 @@
 	SYSCALL
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVV	wait+0(FP), R19
 	// We're done using the stack.
@@ -534,54 +530,6 @@
 	MOVW	R4, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOVW	size+0(FP), R4
-	MOVV	$SYS_epoll_create1, R11
-	SYSCALL
-	MOVW	R4, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), R4
-	MOVV	$SYS_epoll_create1, R11
-	SYSCALL
-	MOVW	R4, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R4
-	MOVW	op+4(FP), R5
-	MOVW	fd+8(FP), R6
-	MOVV	ev+16(FP), R7
-	MOVV	$SYS_epoll_ctl, R11
-	SYSCALL
-	MOVW	R4, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R4
-	MOVV	ev+8(FP), R5
-	MOVW	nev+16(FP), R6
-	MOVW	timeout+20(FP), R7
-	MOVV	$0, R8
-	MOVV	$SYS_epoll_pwait, R11
-	SYSCALL
-	MOVW	R4, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW	fd+0(FP), R4  // fd
-	MOVV	$2, R5	// F_SETFD
-	MOVV	$1, R6	// FD_CLOEXEC
-	MOVV	$SYS_fcntl, R11
-	SYSCALL
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
index 06d54df..47f2da5 100644
--- a/src/runtime/sys_linux_mips64x.s
+++ b/src/runtime/sys_linux_mips64x.s
@@ -20,7 +20,6 @@
 #define SYS_close		5003
 #define SYS_getpid		5038
 #define SYS_kill		5060
-#define SYS_fcntl		5070
 #define SYS_mmap		5009
 #define SYS_munmap		5011
 #define SYS_setitimer		5036
@@ -37,16 +36,12 @@
 #define SYS_futex		5194
 #define SYS_sched_getaffinity	5196
 #define SYS_exit_group		5205
-#define SYS_epoll_create	5207
-#define SYS_epoll_ctl		5208
 #define SYS_timer_create	5216
 #define SYS_timer_settime	5217
 #define SYS_timer_delete	5220
 #define SYS_tgkill		5225
 #define SYS_openat		5247
-#define SYS_epoll_pwait		5272
 #define SYS_clock_gettime	5222
-#define SYS_epoll_create1	5285
 #define SYS_brk			5012
 #define SYS_pipe2		5287
 
@@ -56,7 +51,7 @@
 	SYSCALL
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVV	wait+0(FP), R1
 	// We're done using the stack.
@@ -568,62 +563,6 @@
 	MOVW	R2, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOVW    size+0(FP), R4
-	MOVV	$SYS_epoll_create, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBVU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), R4
-	MOVV	$SYS_epoll_create1, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBVU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R4
-	MOVW	op+4(FP), R5
-	MOVW	fd+8(FP), R6
-	MOVV	ev+16(FP), R7
-	MOVV	$SYS_epoll_ctl, R2
-	SYSCALL
-	SUBVU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	// This uses pwait instead of wait, because Android O blocks wait.
-	MOVW	epfd+0(FP), R4
-	MOVV	ev+8(FP), R5
-	MOVW	nev+16(FP), R6
-	MOVW	timeout+20(FP), R7
-	MOVV	$0, R8
-	MOVV	$SYS_epoll_pwait, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBVU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW    fd+0(FP), R4  // fd
-	MOVV    $2, R5  // F_SETFD
-	MOVV    $1, R6  // FD_CLOEXEC
-	MOVV	$SYS_fcntl, R2
-	SYSCALL
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s
index e70edcc..5e6b6c1 100644
--- a/src/runtime/sys_linux_mipsx.s
+++ b/src/runtime/sys_linux_mipsx.s
@@ -20,7 +20,6 @@
 #define SYS_getpid		4020
 #define SYS_kill		4037
 #define SYS_brk			4045
-#define SYS_fcntl		4055
 #define SYS_mmap		4090
 #define SYS_munmap		4091
 #define SYS_setitimer		4104
@@ -37,15 +36,11 @@
 #define SYS_futex		4238
 #define SYS_sched_getaffinity	4240
 #define SYS_exit_group		4246
-#define SYS_epoll_create	4248
-#define SYS_epoll_ctl		4249
-#define SYS_epoll_wait		4250
 #define SYS_timer_create	4257
 #define SYS_timer_settime	4258
 #define SYS_timer_delete	4261
 #define SYS_clock_gettime	4263
 #define SYS_tgkill		4266
-#define SYS_epoll_create1	4326
 #define SYS_pipe2		4328
 
 TEXT runtime·exit(SB),NOSPLIT,$0-4
@@ -55,7 +50,7 @@
 	UNDEF
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVW	wait+0(FP), R1
 	// We're done using the stack.
@@ -487,60 +482,6 @@
 	MOVW	R2, ret+12(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT,$0-8
-	MOVW	size+0(FP), R4
-	MOVW	$SYS_epoll_create, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+4(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT,$0-8
-	MOVW	flags+0(FP), R4
-	MOVW	$SYS_epoll_create1, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+4(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT,$0-20
-	MOVW	epfd+0(FP), R4
-	MOVW	op+4(FP), R5
-	MOVW	fd+8(FP), R6
-	MOVW	ev+12(FP), R7
-	MOVW	$SYS_epoll_ctl, R2
-	SYSCALL
-	SUBU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+16(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT,$0-20
-	MOVW	epfd+0(FP), R4
-	MOVW	ev+4(FP), R5
-	MOVW	nev+8(FP), R6
-	MOVW	timeout+12(FP), R7
-	MOVW	$SYS_epoll_wait, R2
-	SYSCALL
-	BEQ	R7, 2(PC)
-	SUBU	R2, R0, R2	// caller expects negative errno
-	MOVW	R2, ret+16(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT,$0-4
-	MOVW	fd+0(FP), R4	// fd
-	MOVW	$2, R5	// F_SETFD
-	MOVW	$1, R6	// FD_CLOEXEC
-	MOVW	$SYS_fcntl, R2
-	SYSCALL
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index 2913a05..d0427a4 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -21,7 +21,6 @@
 #define SYS_getpid		 20
 #define SYS_kill		 37
 #define SYS_brk			 45
-#define SYS_fcntl		 55
 #define SYS_mmap		 90
 #define SYS_munmap		 91
 #define SYS_setitimer		104
@@ -38,15 +37,11 @@
 #define SYS_futex		221
 #define SYS_sched_getaffinity	223
 #define SYS_exit_group		234
-#define SYS_epoll_create	236
-#define SYS_epoll_ctl		237
-#define SYS_epoll_wait		238
 #define SYS_timer_create	240
 #define SYS_timer_settime	241
 #define SYS_timer_delete	244
 #define SYS_clock_gettime	246
 #define SYS_tgkill		250
-#define SYS_epoll_create1	315
 #define SYS_pipe2		317
 
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
@@ -54,7 +49,7 @@
 	SYSCALL	$SYS_exit_group
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVD	wait+0(FP), R1
 	// We're done using the stack.
@@ -111,16 +106,22 @@
 	MOVW	R3, errno+16(FP)
 	RET
 
+// func usleep(usec uint32)
 TEXT runtime·usleep(SB),NOSPLIT,$16-4
 	MOVW	usec+0(FP), R3
-	MOVD	R3, R5
-	MOVW	$1000000, R4
-	DIVD	R4, R3
-	MOVD	R3, 8(R1)
-	MOVW	$1000, R4
-	MULLD	R3, R4
-	SUB	R4, R5
-	MOVD	R5, 16(R1)
+
+	// Use magic constant 0x8637bd06 and shift right 51
+	// to perform usec/1000000.
+	MOVD	$0x8637bd06, R4
+	MULLD	R3, R4, R4	// Convert usec to S.
+	SRD	$51, R4, R4
+	MOVD	R4, 8(R1)	// Store to tv_sec
+
+	MOVD	$1000000, R5
+	MULLW	R4, R5, R5	// Convert tv_sec back into uS
+	SUB	R5, R3, R5	// Compute remainder uS.
+	MULLD	$1000, R5, R5	// Convert to nsec
+	MOVD	R5, 16(R1)	// Store to tv_nsec
 
 	// nanosleep(&ts, 0)
 	ADD	$8, R1, R3
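
The rewritten usleep above replaces the DIVD with a multiply-and-shift: 0x8637bd06 is ceil(2^51 / 10^6), so (usec * 0x8637bd06) >> 51 equals usec / 1000000 for every 32-bit usec, because the rounding error of the reciprocal never accumulates to a full unit over that range. A small stand-alone check of the identity, independent of the assembly:

package main

import (
	"fmt"
	"math"
	"math/rand"
)

func main() {
	// magic == ceil(2^51 / 1e6); multiplying and shifting right by 51
	// performs the division the old code did with DIVD.
	const magic = 0x8637bd06
	check := func(usec uint32) {
		got := (uint64(usec) * magic) >> 51
		want := uint64(usec) / 1000000
		if got != want {
			panic(fmt.Sprintf("usec=%d: got %d want %d", usec, got, want))
		}
	}
	for _, v := range []uint32{0, 1, 999999, 1000000, 1000001, math.MaxUint32} {
		check(v)
	}
	for i := 0; i < 1_000_000; i++ {
		check(rand.Uint32())
	}
	fmt.Println("reciprocal division matches for all sampled inputs")
}
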
@@ -876,55 +877,6 @@
 	MOVW	R3, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOVW    size+0(FP), R3
-	SYSCALL	$SYS_epoll_create
-	BVC	2(PC)
-	NEG	R3	// caller expects negative errno
-	MOVW	R3, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), R3
-	SYSCALL	$SYS_epoll_create1
-	BVC	2(PC)
-	NEG	R3	// caller expects negative errno
-	MOVW	R3, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R3
-	MOVW	op+4(FP), R4
-	MOVW	fd+8(FP), R5
-	MOVD	ev+16(FP), R6
-	SYSCALL	$SYS_epoll_ctl
-	NEG	R3	// caller expects negative errno
-	MOVW	R3, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R3
-	MOVD	ev+8(FP), R4
-	MOVW	nev+16(FP), R5
-	MOVW	timeout+20(FP), R6
-	SYSCALL	$SYS_epoll_wait
-	BVC	2(PC)
-	NEG	R3	// caller expects negative errno
-	MOVW	R3, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW    fd+0(FP), R3  // fd
-	MOVD    $2, R4  // F_SETFD
-	MOVD    $1, R5  // FD_CLOEXEC
-	SYSCALL	$SYS_fcntl
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_riscv64.s b/src/runtime/sys_linux_riscv64.s
index afb2d11..d1558fd 100644
--- a/src/runtime/sys_linux_riscv64.s
+++ b/src/runtime/sys_linux_riscv64.s
@@ -18,13 +18,9 @@
 #define SYS_clone		220
 #define SYS_close		57
 #define SYS_connect		203
-#define SYS_epoll_create1	20
-#define SYS_epoll_ctl		21
-#define SYS_epoll_pwait		22
 #define SYS_exit		93
 #define SYS_exit_group		94
 #define SYS_faccessat		48
-#define SYS_fcntl		25
 #define SYS_futex		98
 #define SYS_getpid		172
 #define SYS_gettid		178
@@ -61,7 +57,7 @@
 	ECALL
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOV	wait+0(FP), A0
 	// We're done using the stack.
@@ -578,54 +574,6 @@
 	MOV	A0, ret+24(FP)
 	RET
 
-// func epollcreate(size int32) int32
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOV	$0, A0
-	MOV	$SYS_epoll_create1, A7
-	ECALL
-	MOVW	A0, ret+8(FP)
-	RET
-
-// func epollcreate1(flags int32) int32
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), A0
-	MOV	$SYS_epoll_create1, A7
-	ECALL
-	MOVW	A0, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollevent) int32
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), A0
-	MOVW	op+4(FP), A1
-	MOVW	fd+8(FP), A2
-	MOV	ev+16(FP), A3
-	MOV	$SYS_epoll_ctl, A7
-	ECALL
-	MOVW	A0, ret+24(FP)
-	RET
-
-// func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), A0
-	MOV	ev+8(FP), A1
-	MOVW	nev+16(FP), A2
-	MOVW	timeout+20(FP), A3
-	MOV	$0, A4
-	MOV	$SYS_epoll_pwait, A7
-	ECALL
-	MOVW	A0, ret+24(FP)
-	RET
-
-// func closeonexec(int32)
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW	fd+0(FP), A0  // fd
-	MOV	$2, A1	// F_SETFD
-	MOV	$1, A2	// FD_CLOEXEC
-	MOV	$SYS_fcntl, A7
-	ECALL
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
index c82cb9b..1448670 100644
--- a/src/runtime/sys_linux_s390x.s
+++ b/src/runtime/sys_linux_s390x.s
@@ -17,7 +17,6 @@
 #define SYS_getpid               20
 #define SYS_kill                 37
 #define SYS_brk			 45
-#define SYS_fcntl                55
 #define SYS_mmap                 90
 #define SYS_munmap               91
 #define SYS_setitimer           104
@@ -35,15 +34,11 @@
 #define SYS_sched_getaffinity   240
 #define SYS_tgkill              241
 #define SYS_exit_group          248
-#define SYS_epoll_create        249
-#define SYS_epoll_ctl           250
-#define SYS_epoll_wait          251
 #define SYS_timer_create        254
 #define SYS_timer_settime       255
 #define SYS_timer_delete        258
 #define SYS_clock_gettime       260
 #define SYS_pipe2		325
-#define SYS_epoll_create1       327
 
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	code+0(FP), R2
@@ -51,7 +46,7 @@
 	SYSCALL
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
 	MOVD	wait+0(FP), R1
 	// We're done using the stack.
@@ -589,53 +584,6 @@
 	MOVW	R2, ret+24(FP)
 	RET
 
-// int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
-	MOVW    size+0(FP), R2
-	MOVW	$SYS_epoll_create, R1
-	SYSCALL
-	MOVW	R2, ret+8(FP)
-	RET
-
-// int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
-	MOVW	flags+0(FP), R2
-	MOVW	$SYS_epoll_create1, R1
-	SYSCALL
-	MOVW	R2, ret+8(FP)
-	RET
-
-// func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R2
-	MOVW	op+4(FP), R3
-	MOVW	fd+8(FP), R4
-	MOVD	ev+16(FP), R5
-	MOVW	$SYS_epoll_ctl, R1
-	SYSCALL
-	MOVW	R2, ret+24(FP)
-	RET
-
-// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
-	MOVW	epfd+0(FP), R2
-	MOVD	ev+8(FP), R3
-	MOVW	nev+16(FP), R4
-	MOVW	timeout+20(FP), R5
-	MOVW	$SYS_epoll_wait, R1
-	SYSCALL
-	MOVW	R2, ret+24(FP)
-	RET
-
-// void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
-	MOVW    fd+0(FP), R2  // fd
-	MOVD    $2, R3  // F_SETFD
-	MOVD    $1, R4  // FD_CLOEXEC
-	MOVW	$SYS_fcntl, R1
-	SYSCALL
-	RET
-
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
index 581b4fc..7be18c6 100644
--- a/src/runtime/sys_netbsd_386.s
+++ b/src/runtime/sys_netbsd_386.s
@@ -53,7 +53,7 @@
 	MOVL	$0xf1, 0xf1		// crash
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVL	wait+0(FP), AX
 	// We're done using the stack.
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index ab11f6f..30f3f38 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -122,7 +122,7 @@
 	MOVL	$0xf1, 0xf1		// crash
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-8
 	MOVQ	wait+0(FP), AX
 	// We're done using the stack.
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index dbe3dbc..62fa852 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -56,7 +56,7 @@
 	MOVW.CS R8, (R8)
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVW wait+0(FP), R0
 	// We're done using the stack.
diff --git a/src/runtime/sys_netbsd_arm64.s b/src/runtime/sys_netbsd_arm64.s
index fc126ca..d57959f 100644
--- a/src/runtime/sys_netbsd_arm64.s
+++ b/src/runtime/sys_netbsd_arm64.s
@@ -115,7 +115,7 @@
 	MOVD	$0, R0			// If we're still running,
 	MOVD	R0, (R0)		// crash
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0-8
 	MOVD	wait+0(FP), R0
 	// We're done using the stack.
diff --git a/src/runtime/sys_openbsd2.go b/src/runtime/sys_openbsd2.go
index f936e0c..49bad8e 100644
--- a/src/runtime/sys_openbsd2.go
+++ b/src/runtime/sys_openbsd2.go
@@ -8,6 +8,7 @@
 
 import (
 	"internal/abi"
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -248,7 +249,8 @@
 func sigaltstack_trampoline()
 
 // Not used on OpenBSD, but must be defined.
-func exitThread(wait *uint32) {
+func exitThread(wait *atomic.Uint32) {
+	throw("exitThread")
 }
 
 //go:nosplit
diff --git a/src/runtime/sys_openbsd_mips64.s b/src/runtime/sys_openbsd_mips64.s
index c2b2092..affd586 100644
--- a/src/runtime/sys_openbsd_mips64.s
+++ b/src/runtime/sys_openbsd_mips64.s
@@ -24,7 +24,7 @@
 	MOVV	R2, (R2)
 	RET
 
-// func exitThread(wait *uint32)
+// func exitThread(wait *atomic.Uint32)
 TEXT runtime·exitThread(SB),NOSPLIT,$0
 	MOVV	wait+0(FP), R4		// arg 1 - notdead
 	MOVV	$302, R2		// sys___threxit
@@ -277,7 +277,7 @@
 
 	// In parent, return.
 	BEQ	R2, 3(PC)
-	MOVW	R2, ret+40(FP)
+	MOVW	$0, ret+40(FP)
 	RET
 
 	// Initialise m, g.
diff --git a/src/runtime/sys_wasm.go b/src/runtime/sys_wasm.go
index e6e7f47..bf57569 100644
--- a/src/runtime/sys_wasm.go
+++ b/src/runtime/sys_wasm.go
@@ -16,10 +16,6 @@
 
 var wasmStack m0Stack
 
-func wasmMove()
-
-func wasmZero()
-
 func wasmDiv()
 
 func wasmTruncS()
diff --git a/src/runtime/sys_wasm.s b/src/runtime/sys_wasm.s
index 164dd16..f706e00 100644
--- a/src/runtime/sys_wasm.s
+++ b/src/runtime/sys_wasm.s
@@ -4,73 +4,6 @@
 
 #include "textflag.h"
 
-TEXT runtime·wasmMove(SB), NOSPLIT, $0-0
-loop:
-	Loop
-		// *dst = *src
-		Get R0
-		Get R1
-		I64Load $0
-		I64Store $0
-
-		// n--
-		Get R2
-		I32Const $1
-		I32Sub
-		Tee R2
-
-		// n == 0
-		I32Eqz
-		If
-			Return
-		End
-
-		// dst += 8
-		Get R0
-		I32Const $8
-		I32Add
-		Set R0
-
-		// src += 8
-		Get R1
-		I32Const $8
-		I32Add
-		Set R1
-
-		Br loop
-	End
-	UNDEF
-
-TEXT runtime·wasmZero(SB), NOSPLIT, $0-0
-loop:
-	Loop
-		// *dst = 0
-		Get R0
-		I64Const $0
-		I64Store $0
-
-		// n--
-		Get R1
-		I32Const $1
-		I32Sub
-		Tee R1
-
-		// n == 0
-		I32Eqz
-		If
-			Return
-		End
-
-		// dst += 8
-		Get R0
-		I32Const $8
-		I32Add
-		Set R0
-
-		Br loop
-	End
-	UNDEF
-
 TEXT runtime·wasmDiv(SB), NOSPLIT, $0-0
 	Get R0
 	I64Const $-0x8000000000000000
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index 1467b4d..777726f 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -8,6 +8,9 @@
 #include "time_windows.h"
 #include "cgo/abi_amd64.h"
 
+// Offsets into Thread Environment Block (pointer in GS)
+#define TEB_TlsSlots 0x1480
+
 // void runtime·asmstdcall(void *c);
 TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
 	// asmcgocall will put first argument into CX.
@@ -116,6 +119,7 @@
 	// Make stack space for the rest of the function.
 	ADJSP	$48
 
+	MOVQ	CX, R13	// save exception address
 	MOVQ	AX, R15	// save handler address
 
 	// find g
@@ -153,8 +157,8 @@
 	MOVQ	DI, SP
 
 g0:
-	MOVQ	0(CX), BX // ExceptionRecord*
-	MOVQ	8(CX), CX // Context*
+	MOVQ	0(R13), BX // ExceptionRecord*
+	MOVQ	8(R13), CX // Context*
 	MOVQ	BX, 0(SP)
 	MOVQ	CX, 8(SP)
 	MOVQ	DX, 16(SP)
@@ -162,6 +166,8 @@
 	// AX is set to report result back to Windows
 	MOVL	24(SP), AX
 
+	MOVQ	SP, DI // save g0 SP
+
 	// switch back to original stack and g
 	// no-op if we never left.
 	MOVQ	40(SP), SP
@@ -169,12 +175,54 @@
 	get_tls(BP)
 	MOVQ	DX, g(BP)
 
+	// if return value is CONTINUE_SEARCH, do not set up control
+	// flow guard workaround.
+	CMPQ	AX, $0
+	JEQ	done
+
+	// Check if we need to set up the control flow guard workaround.
+	// On Windows, the stack pointer in the context must lie within
+	// system stack limits when we resume from exception.
+	// Store the resume SP and PC in alternate registers
+	// and return to sigresume on the g0 stack.
+	// sigresume makes no use of the stack at all,
+	// loading SP from R8 and jumping to R9.
+	// Note that smashing R8 and R9 is only safe because we know sigpanic
+	// will not actually return to the original frame, so the registers
+	// are effectively dead. But this does mean we can't use the
+	// same mechanism for async preemption.
+	MOVQ	8(R13), CX
+	MOVQ	$sigresume<>(SB), BX
+	CMPQ	BX, context_rip(CX)
+	JEQ	done			// do not clobber saved SP/PC
+
+	// Save resume SP and PC into R8, R9.
+	MOVQ	context_rsp(CX), BX
+	MOVQ	BX, context_r8(CX)
+	MOVQ	context_rip(CX), BX
+	MOVQ	BX, context_r9(CX)
+
+	// Set up context record to return to sigresume on g0 stack
+	MOVD	DI, BX
+	MOVD	BX, context_rsp(CX)
+	MOVD	$sigresume<>(SB), BX
+	MOVD	BX, context_rip(CX)
+
 done:
 	ADJSP	$-48
 	POP_REGS_HOST_TO_ABI0()
 
 	RET
 
+// Trampoline to resume execution from exception handler.
+// This is part of the control flow guard workaround.
+// It switches stacks and jumps to the continuation address.
+// R8 and R9 are set above at the end of sigtramp<>
+// in the context that starts executing at sigresume<>.
+TEXT sigresume<>(SB),NOSPLIT|NOFRAME,$0
+	MOVQ	R8, SP
+	JMP	R9
+
 TEXT runtime·exceptiontramp(SB),NOSPLIT|NOFRAME,$0
 	MOVQ	$runtime·exceptionhandler(SB), AX
 	JMP	sigtramp<>(SB)
@@ -258,10 +306,10 @@
 	MOVQ	AX, g_stackguard1(DX)
 
 	// Set up tls.
-	LEAQ	m_tls(CX), SI
-	MOVQ	SI, 0x28(GS)
+	LEAQ	m_tls(CX), DI
 	MOVQ	CX, g_m(DX)
-	MOVQ	DX, g(SI)
+	MOVQ	DX, g(DI)
+	CALL	runtime·settls(SB) // clobbers CX
 
 	CALL	runtime·stackcheck(SB)	// clobbers AX,CX
 	CALL	runtime·mstart(SB)
@@ -273,7 +321,8 @@
 
 // set tls base to DI
 TEXT runtime·settls(SB),NOSPLIT,$0
-	MOVQ	DI, 0x28(GS)
+	MOVQ	runtime·tls_g(SB), CX
+	MOVQ	DI, 0(CX)(GS)
 	RET
 
 // Runs on OS stack.
@@ -359,3 +408,32 @@
 	LEAQ	m_tls(AX), DI
 	CALL	runtime·settls(SB)
 	RET
+
+// This is called from rt0_go, which runs on the system stack
+// using the initial stack allocated by the OS.
+TEXT runtime·wintls(SB),NOSPLIT|NOFRAME,$0
+	// Allocate a TLS slot to hold g across calls to external code
+	MOVQ	SP, AX
+	ANDQ	$~15, SP	// alignment as per Windows requirement
+	SUBQ	$48, SP	// room for SP and 4 args as per Windows requirement
+			// plus one extra word to keep stack 16 bytes aligned
+	MOVQ	AX, 32(SP)
+	MOVQ	runtime·_TlsAlloc(SB), AX
+	CALL	AX
+	MOVQ	32(SP), SP
+
+	MOVQ	AX, CX	// TLS index
+
+	// Assert that slot is less than 64 so we can use _TEB->TlsSlots
+	CMPQ	CX, $64
+	JB	ok
+	CALL	runtime·abort(SB)
+ok:
+	// Convert the TLS index at CX into
+	// an offset from TEB_TlsSlots.
+	SHLQ	$3, CX
+
+	// Save offset from TLS into tls_g.
+	ADDQ	$TEB_TlsSlots, CX
+	MOVQ	CX, runtime·tls_g(SB)
+	RET
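
runtime·wintls allocates a TLS slot with TlsAlloc and, after asserting that the index is below 64 so the value lives inline in the TEB's TlsSlots array (offset TEB_TlsSlots above), converts the index into a GS-relative offset stored in runtime·tls_g for settls and load_g to use. A minimal sketch of the same Win32 slot contract from Go, assuming only the documented kernel32 entry points (the direct TEB access itself stays in assembly):

//go:build windows

package main

import (
	"fmt"
	"syscall"
)

func main() {
	kernel32 := syscall.MustLoadDLL("kernel32.dll")
	tlsAlloc := kernel32.MustFindProc("TlsAlloc")
	tlsSetValue := kernel32.MustFindProc("TlsSetValue")
	tlsGetValue := kernel32.MustFindProc("TlsGetValue")

	// TlsAlloc hands back a slot index; indexes below 64 are stored
	// directly in the TEB's TlsSlots array, which is the property the
	// assembly asserts before computing its GS-relative offset.
	idx, _, _ := tlsAlloc.Call()
	fmt.Println("TLS slot index:", idx)
	if idx >= 64 {
		fmt.Println("index is outside the inline TlsSlots array; the runtime would abort here")
	}

	tlsSetValue.Call(idx, 0x1234)
	v, _, _ := tlsGetValue.Call(idx)
	fmt.Printf("slot %d holds %#x\n", idx, v)
}
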
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index 5dc576a..db6d8f1 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -123,8 +123,14 @@
 	MOVW	R1, R7			// Save param1
 
 	BL      runtime·load_g(SB)
-	CMP	$0, g			// is there a current g?
-	BL.EQ	runtime·badsignal2(SB)
+	CMP	$0,	g		// is there a current g?
+	BNE	g_ok
+	ADD	$(8+20), R13	// free locals
+	MOVM.IA.W (R13), [R3, R4-R11, R14]	// pop {r3, r4-r11, lr}
+	MOVW	$0, R0		// continue 
+	BEQ	return
+
+g_ok:
 
 	// save g and SP in case of stack switch
 	MOVW	R13, 24(R13)
diff --git a/src/runtime/sys_windows_arm64.s b/src/runtime/sys_windows_arm64.s
index 024625f..4702a4d 100644
--- a/src/runtime/sys_windows_arm64.s
+++ b/src/runtime/sys_windows_arm64.s
@@ -113,7 +113,8 @@
 	MOVD	$runtime·badsignalmsg(SB), R1	// lpBuffer
 	MOVD	$runtime·badsignallen(SB), R2	// lpNumberOfBytesToWrite
 	MOVD	(R2), R2
-	MOVD	R13, R3		// lpNumberOfBytesWritten
+	// point R3 to stack local that will receive number of bytes written
+	ADD	$16, RSP, R3		// lpNumberOfBytesWritten
 	MOVD	$0, R4			// lpOverlapped
 	MOVD	runtime·_WriteFile(SB), R12
 	SUB	$16, RSP	// skip over saved frame pointer below RSP
@@ -146,10 +147,15 @@
 	MOVD	g, R17 			// saved R28 (callee-save from Windows, not really g)
 
 	BL      runtime·load_g(SB)	// smashes R0, R27, R28 (g)
-	CMP	$0, g			// is there a current g?
-	BNE	2(PC)
-	BL	runtime·badsignal2(SB)
+	CMP	$0,	g		// is there a current g?
+	BNE	g_ok
+	MOVD	R7, LR
+	MOVD	R16, R27	// restore R27
+	MOVD	R17, g		// restore R28
+	MOVD	$0, R0		// continue 
+	RET
 
+g_ok:
 	// Do we need to switch to the g0 stack?
 	MOVD	g, R3			// R3 = oldg (for sigtramp_g0)
 	MOVD	g_m(g), R2		// R2 = m
diff --git a/src/runtime/syscall_aix.go b/src/runtime/syscall_aix.go
index f294922..cc9e912 100644
--- a/src/runtime/syscall_aix.go
+++ b/src/runtime/syscall_aix.go
@@ -127,9 +127,9 @@
 
 // like close, but must not split stack, for fork.
 //
-//go:linkname syscall_close syscall.close
+//go:linkname syscall_closeFD syscall.closeFD
 //go:nosplit
-func syscall_close(fd int32) int32 {
+func syscall_closeFD(fd int32) int32 {
 	_, err := syscall1(&libc_close, uintptr(fd))
 	return int32(err)
 }
diff --git a/src/runtime/syscall_unix_test.go b/src/runtime/syscall_unix_test.go
new file mode 100644
index 0000000..2a69c40
--- /dev/null
+++ b/src/runtime/syscall_unix_test.go
@@ -0,0 +1,25 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix
+
+package runtime_test
+
+import (
+	"runtime"
+	"syscall"
+	"testing"
+)
+
+func TestSyscallFlagAlignment(t *testing.T) {
+	// TODO(mknyszek): Check other flags.
+	check := func(name string, got, want int) {
+		if got != want {
+			t.Errorf("flag %s does not line up: got %d, want %d", name, got, want)
+		}
+	}
+	check("O_WRONLY", runtime.O_WRONLY, syscall.O_WRONLY)
+	check("O_CREAT", runtime.O_CREAT, syscall.O_CREAT)
+	check("O_TRUNC", runtime.O_TRUNC, syscall.O_TRUNC)
+}
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 37f8f40..abc2838 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -5,7 +5,6 @@
 package runtime_test
 
 import (
-	"bytes"
 	"fmt"
 	"internal/abi"
 	"internal/syscall/windows/sysdll"
@@ -629,7 +628,7 @@
 }
 
 func TestRaiseException(t *testing.T) {
-	if testenv.Builder() == "windows-amd64-2012" {
+	if strings.HasPrefix(testenv.Builder(), "windows-amd64-2012") {
 		testenv.SkipFlaky(t, 49681)
 	}
 	o := runTestProg(t, "testprog", "RaiseException")
@@ -1044,7 +1043,7 @@
 
 	cmd := exec.Command(os.Args[0], "-test.run=TestNumCPU")
 	cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
-	var buf bytes.Buffer
+	var buf strings.Builder
 	cmd.Stdout = &buf
 	cmd.Stderr = &buf
 	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: _CREATE_SUSPENDED}
@@ -1054,7 +1053,7 @@
 	}
 	defer func() {
 		err = cmd.Wait()
-		childOutput := string(buf.Bytes())
+		childOutput := buf.String()
 		if err != nil {
 			t.Fatalf("child failed: %v: %v", err, childOutput)
 		}
@@ -1216,7 +1215,7 @@
 
 // wantLoadLibraryEx reports whether we expect LoadLibraryEx to work for tests.
 func wantLoadLibraryEx() bool {
-	return testenv.Builder() == "windows-amd64-gce" || testenv.Builder() == "windows-386-gce"
+	return testenv.Builder() != "" && (runtime.GOARCH == "amd64" || runtime.GOARCH == "386")
 }
 
 func TestLoadLibraryEx(t *testing.T) {
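
The helper-process test now gathers child output in a strings.Builder rather than a bytes.Buffer: both satisfy io.Writer, and Builder.String() returns the accumulated text without the extra copy that string(buf.Bytes()) makes. A minimal stand-alone version of the pattern; the command here is only an example:

package main

import (
	"fmt"
	"os/exec"
	"strings"
)

func main() {
	var buf strings.Builder
	cmd := exec.Command("go", "version") // any command works here
	cmd.Stdout = &buf
	cmd.Stderr = &buf
	if err := cmd.Run(); err != nil {
		fmt.Println("command failed:", err)
	}
	fmt.Print(buf.String())
}
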
diff --git a/src/runtime/testdata/testexithooks/testexithooks.go b/src/runtime/testdata/testexithooks/testexithooks.go
new file mode 100644
index 0000000..ceb3326
--- /dev/null
+++ b/src/runtime/testdata/testexithooks/testexithooks.go
@@ -0,0 +1,85 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"flag"
+	"os"
+	_ "unsafe"
+)
+
+var modeflag = flag.String("mode", "", "mode to run in")
+
+func main() {
+	flag.Parse()
+	switch *modeflag {
+	case "simple":
+		testSimple()
+	case "goodexit":
+		testGoodExit()
+	case "badexit":
+		testBadExit()
+	case "panics":
+		testPanics()
+	case "callsexit":
+		testHookCallsExit()
+	default:
+		panic("unknown mode")
+	}
+}
+
+//go:linkname runtime_addExitHook runtime.addExitHook
+func runtime_addExitHook(f func(), runOnNonZeroExit bool)
+
+func testSimple() {
+	f1 := func() { println("foo") }
+	f2 := func() { println("bar") }
+	runtime_addExitHook(f1, false)
+	runtime_addExitHook(f2, false)
+	// no explicit call to os.Exit
+}
+
+func testGoodExit() {
+	f1 := func() { println("apple") }
+	f2 := func() { println("orange") }
+	runtime_addExitHook(f1, false)
+	runtime_addExitHook(f2, false)
+	// explicit call to os.Exit
+	os.Exit(0)
+}
+
+func testBadExit() {
+	f1 := func() { println("blog") }
+	f2 := func() { println("blix") }
+	f3 := func() { println("blek") }
+	f4 := func() { println("blub") }
+	f5 := func() { println("blat") }
+	runtime_addExitHook(f1, false)
+	runtime_addExitHook(f2, true)
+	runtime_addExitHook(f3, false)
+	runtime_addExitHook(f4, true)
+	runtime_addExitHook(f5, false)
+	os.Exit(1)
+}
+
+func testPanics() {
+	f1 := func() { println("ok") }
+	f2 := func() { panic("BADBADBAD") }
+	f3 := func() { println("good") }
+	runtime_addExitHook(f1, true)
+	runtime_addExitHook(f2, true)
+	runtime_addExitHook(f3, true)
+	os.Exit(0)
+}
+
+func testHookCallsExit() {
+	f1 := func() { println("ok") }
+	f2 := func() { os.Exit(1) }
+	f3 := func() { println("good") }
+	runtime_addExitHook(f1, true)
+	runtime_addExitHook(f2, true)
+	runtime_addExitHook(f3, true)
+	os.Exit(1)
+}
diff --git a/src/runtime/testdata/testprog/checkptr.go b/src/runtime/testdata/testprog/checkptr.go
index b27e5f7..60e71e6 100644
--- a/src/runtime/testdata/testprog/checkptr.go
+++ b/src/runtime/testdata/testprog/checkptr.go
@@ -20,6 +20,8 @@
 	register("CheckPtrSmall", CheckPtrSmall)
 	register("CheckPtrSliceOK", CheckPtrSliceOK)
 	register("CheckPtrSliceFail", CheckPtrSliceFail)
+	register("CheckPtrStringOK", CheckPtrStringOK)
+	register("CheckPtrStringFail", CheckPtrStringFail)
 	register("CheckPtrAlignmentNested", CheckPtrAlignmentNested)
 }
 
@@ -98,6 +100,17 @@
 	sink2 = unsafe.Slice(p, 100)
 }
 
+func CheckPtrStringOK() {
+	p := new([4]byte)
+	sink2 = unsafe.String(&p[1], 3)
+}
+
+func CheckPtrStringFail() {
+	p := new(byte)
+	sink2 = p
+	sink2 = unsafe.String(p, 100)
+}
+
 func CheckPtrAlignmentNested() {
 	s := make([]int8, 100)
 	p := unsafe.Pointer(&s[0])
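
CheckPtrStringOK and CheckPtrStringFail exercise checkptr instrumentation for unsafe.String, which builds a string header over existing bytes without copying; the length must stay within the pointed-to allocation, which is exactly what the Fail case violates. A small illustration of the valid use (checkptr is enabled automatically under -race):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	buf := []byte("hello, world")

	// Valid: the string covers exactly the bytes backing buf.
	s := unsafe.String(&buf[0], len(buf))
	fmt.Println(s)

	// unsafe.StringData goes the other way, recovering the byte pointer.
	p := unsafe.StringData(s)
	fmt.Println(*p == 'h')

	// Asking for more bytes than the allocation holds (as
	// CheckPtrStringFail does with length 100 over a single byte) is
	// what the checkptr instrumentation rejects at run time.
}
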
diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go
index 0f44575..5dc85fb 100644
--- a/src/runtime/testdata/testprog/gc.go
+++ b/src/runtime/testdata/testprog/gc.go
@@ -396,7 +396,7 @@
 		// should do considerably better than this bound.
 		bound := int64(myLimit + 16<<20)
 		start := time.Now()
-		for time.Now().Sub(start) < 200*time.Millisecond {
+		for time.Since(start) < 200*time.Millisecond {
 			metrics.Read(m[:])
 			retained := int64(m[0].Value.Uint64() - m[1].Value.Uint64())
 			if retained > bound {
diff --git a/src/runtime/testdata/testprog/numcpu_freebsd.go b/src/runtime/testdata/testprog/numcpu_freebsd.go
index 7209f67..310c212 100644
--- a/src/runtime/testdata/testprog/numcpu_freebsd.go
+++ b/src/runtime/testdata/testprog/numcpu_freebsd.go
@@ -48,7 +48,7 @@
 		fmt.Printf("fail to launch '%s', error: %s, output: %s\n", strings.Join(cmd.Args, " "), err, output)
 		return
 	}
-	if bytes.Equal(output, []byte("1\n")) == false {
+	if !bytes.Equal(output, []byte("1\n")) {
 		// SMP mode deactivated in kernel.
 		fmt.Println("OK")
 		return
diff --git a/src/runtime/testdata/testprog/traceback_ancestors.go b/src/runtime/testdata/testprog/traceback_ancestors.go
index 1d0d00b..8fc1aa7 100644
--- a/src/runtime/testdata/testprog/traceback_ancestors.go
+++ b/src/runtime/testdata/testprog/traceback_ancestors.go
@@ -87,9 +87,10 @@
 	buf := make([]byte, 128)
 	runtime.Stack(buf, false)
 	prefix := []byte("goroutine ")
-	if !bytes.HasPrefix(buf, prefix) {
+	var found bool
+	if buf, found = bytes.CutPrefix(buf, prefix); !found {
 		panic(fmt.Sprintf("expected %q at beginning of traceback:\n%s", prefix, buf))
 	}
-	id, _, _ := bytes.Cut(bytes.TrimPrefix(buf, prefix), []byte(" "))
+	id, _, _ := bytes.Cut(buf, []byte(" "))
 	return string(id)
 }
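
bytes.CutPrefix folds the HasPrefix test and TrimPrefix call into a single operation, which is why the helper above no longer needs both. Stand-alone, the same parsing looks like this; the traceback text is made up for the example:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	buf := []byte("goroutine 42 [running]:\n...")

	// Cut the "goroutine " prefix and take the ID up to the next space,
	// as the traceback helper does.
	rest, ok := bytes.CutPrefix(buf, []byte("goroutine "))
	if !ok {
		panic("unexpected traceback format")
	}
	id, _, _ := bytes.Cut(rest, []byte(" "))
	fmt.Println(string(id)) // prints 42
}
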
diff --git a/src/runtime/testdata/testprog/unsafe.go b/src/runtime/testdata/testprog/unsafe.go
new file mode 100644
index 0000000..021b08f
--- /dev/null
+++ b/src/runtime/testdata/testprog/unsafe.go
@@ -0,0 +1,12 @@
+package main
+
+import "unsafe"
+
+func init() {
+	register("panicOnNilAndEleSizeIsZero", panicOnNilAndEleSizeIsZero)
+}
+
+func panicOnNilAndEleSizeIsZero() {
+	var p *struct{}
+	_ = unsafe.Slice(p, 5)
+}
diff --git a/src/runtime/testdata/testprogcgo/issue29707.go b/src/runtime/testdata/testprogcgo/issue29707.go
new file mode 100644
index 0000000..7d9299f
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/issue29707.go
@@ -0,0 +1,60 @@
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !plan9 && !windows
+// +build !plan9,!windows
+
+// This is for issue #29707
+
+package main
+
+/*
+#include <pthread.h>
+
+extern void* callbackTraceParser(void*);
+typedef void* (*cbTraceParser)(void*);
+
+static void testCallbackTraceParser(cbTraceParser cb) {
+	pthread_t thread_id;
+	pthread_create(&thread_id, NULL, cb, NULL);
+	pthread_join(thread_id, NULL);
+}
+*/
+import "C"
+
+import (
+	"bytes"
+	"fmt"
+	traceparser "internal/trace"
+	"runtime/trace"
+	"time"
+	"unsafe"
+)
+
+func init() {
+	register("CgoTraceParser", CgoTraceParser)
+}
+
+//export callbackTraceParser
+func callbackTraceParser(unsafe.Pointer) unsafe.Pointer {
+	time.Sleep(time.Millisecond)
+	return nil
+}
+
+func CgoTraceParser() {
+	buf := new(bytes.Buffer)
+
+	trace.Start(buf)
+	C.testCallbackTraceParser(C.cbTraceParser(C.callbackTraceParser))
+	trace.Stop()
+
+	_, err := traceparser.Parse(buf, "")
+	if err == traceparser.ErrTimeOrder {
+		fmt.Println("ErrTimeOrder")
+	} else if err != nil {
+		fmt.Println("Parse error: ", err)
+	} else {
+		fmt.Println("OK")
+	}
+}
diff --git a/src/runtime/testdata/testprogcgo/segv.go b/src/runtime/testdata/testprogcgo/segv.go
index 0632475..bf5aa31 100644
--- a/src/runtime/testdata/testprogcgo/segv.go
+++ b/src/runtime/testdata/testprogcgo/segv.go
@@ -2,18 +2,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !plan9 && !windows
-// +build !plan9,!windows
+//go:build unix
+// +build unix
 
 package main
 
+// #include <unistd.h>
 // static void nop() {}
 import "C"
 
-import (
-	"syscall"
-	"time"
-)
+import "syscall"
 
 func init() {
 	register("Segv", Segv)
@@ -35,8 +33,8 @@
 
 	syscall.Kill(syscall.Getpid(), syscall.SIGSEGV)
 
-	// Give the OS time to deliver the signal.
-	time.Sleep(time.Second)
+	// Wait for the OS to deliver the signal.
+	C.pause()
 }
 
 func SegvInCgo() {
@@ -52,6 +50,6 @@
 
 	syscall.Kill(syscall.Getpid(), syscall.SIGSEGV)
 
-	// Give the OS time to deliver the signal.
-	time.Sleep(time.Second)
+	// Wait for the OS to deliver the signal.
+	C.pause()
 }
diff --git a/src/runtime/testdata/testprogcgo/segv_linux.go b/src/runtime/testdata/testprogcgo/segv_linux.go
new file mode 100644
index 0000000..fe93778
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/segv_linux.go
@@ -0,0 +1,51 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// #include <unistd.h>
+// static void nop() {}
+import "C"
+
+import "syscall"
+
+func init() {
+	register("TgkillSegv", TgkillSegv)
+	register("TgkillSegvInCgo", TgkillSegvInCgo)
+}
+
+func TgkillSegv() {
+	c := make(chan bool)
+	go func() {
+		close(c)
+		for i := 0; ; i++ {
+			// Sum defined in segv.go.
+			Sum += i
+		}
+	}()
+
+	<-c
+
+	syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.SIGSEGV)
+
+	// Wait for the OS to deliver the signal.
+	C.pause()
+}
+
+func TgkillSegvInCgo() {
+	c := make(chan bool)
+	go func() {
+		close(c)
+		for {
+			C.nop()
+		}
+	}()
+
+	<-c
+
+	syscall.Tgkill(syscall.Getpid(), syscall.Gettid(), syscall.SIGSEGV)
+
+	// Wait for the OS to deliver the signal.
+	C.pause()
+}
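
Both SEGV test programs now block in C.pause() until the kernel has actually delivered the signal, instead of sleeping for a fixed second and hoping delivery happened in time. A rough pure-Go analogue of waiting for delivery, with SIGUSR1 standing in for the SIGSEGV the tests use (which would be fatal without their special setup):

//go:build unix

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGUSR1)

	// Send ourselves the signal, then block until it is delivered
	// rather than sleeping for an arbitrary duration.
	syscall.Kill(syscall.Getpid(), syscall.SIGUSR1)
	fmt.Println("got", <-ch)
}
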
diff --git a/src/runtime/testdata/testprogcgo/sigfwd.go b/src/runtime/testdata/testprogcgo/sigfwd.go
new file mode 100644
index 0000000..f6a0c03
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/sigfwd.go
@@ -0,0 +1,87 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build unix
+
+package main
+
+import (
+	"fmt"
+	"os"
+)
+
+/*
+#include <signal.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+sig_atomic_t expectCSigsegv;
+int *sigfwdP;
+
+static void sigsegv() {
+	expectCSigsegv = 1;
+	*sigfwdP = 1;
+	fprintf(stderr, "ERROR: C SIGSEGV not thrown on caught?.\n");
+	exit(2);
+}
+
+static void segvhandler(int signum) {
+	if (signum == SIGSEGV) {
+		if (expectCSigsegv == 0) {
+			fprintf(stderr, "SIGSEGV caught in C unexpectedly\n");
+			exit(1);
+		}
+		fprintf(stdout, "OK\n");
+		exit(0);  // success
+	}
+}
+
+static void __attribute__ ((constructor)) sigsetup(void) {
+	if (getenv("GO_TEST_CGOSIGFWD") == NULL) {
+		return;
+	}
+
+	struct sigaction act;
+
+	memset(&act, 0, sizeof act);
+	act.sa_handler = segvhandler;
+	sigaction(SIGSEGV, &act, NULL);
+}
+*/
+import "C"
+
+func init() {
+	register("CgoSigfwd", CgoSigfwd)
+}
+
+var nilPtr *byte
+
+func f() (ret bool) {
+	defer func() {
+		if recover() == nil {
+			fmt.Fprintf(os.Stderr, "ERROR: couldn't raise SIGSEGV in Go\n")
+			C.exit(2)
+		}
+		ret = true
+	}()
+	*nilPtr = 1
+	return false
+}
+
+func CgoSigfwd() {
+	if os.Getenv("GO_TEST_CGOSIGFWD") == "" {
+		fmt.Fprintf(os.Stderr, "test must be run with GO_TEST_CGOSIGFWD set\n")
+		os.Exit(1)
+	}
+
+	// Test that the signal originating in Go is handled (and recovered) by Go.
+	if !f() {
+		fmt.Fprintf(os.Stderr, "couldn't recover from SIGSEGV in Go.\n")
+		C.exit(2)
+	}
+
+	// Test that the signal originating in C is handled by C.
+	C.sigsegv()
+}
diff --git a/src/runtime/testdata/testwinlibthrow/main.go b/src/runtime/testdata/testwinlibthrow/main.go
new file mode 100644
index 0000000..ce0c92f
--- /dev/null
+++ b/src/runtime/testdata/testwinlibthrow/main.go
@@ -0,0 +1,19 @@
+package main
+
+import (
+	"os"
+	"syscall"
+)
+
+func main() {
+	dll := syscall.MustLoadDLL("veh.dll")
+	RaiseNoExcept := dll.MustFindProc("RaiseNoExcept")
+	ThreadRaiseNoExcept := dll.MustFindProc("ThreadRaiseNoExcept")
+
+	thread := len(os.Args) > 1 && os.Args[1] == "thread"
+	if !thread {
+		RaiseNoExcept.Call()
+	} else {
+		ThreadRaiseNoExcept.Call()
+	}
+}
diff --git a/src/runtime/testdata/testwinlibthrow/veh.c b/src/runtime/testdata/testwinlibthrow/veh.c
new file mode 100644
index 0000000..08c1f9e
--- /dev/null
+++ b/src/runtime/testdata/testwinlibthrow/veh.c
@@ -0,0 +1,26 @@
+//go:build ignore
+
+#include <windows.h>
+
+__declspec(dllexport)
+void RaiseNoExcept(void)
+{
+    RaiseException(42, 0, 0, 0);
+}
+
+static DWORD WINAPI ThreadRaiser(void* Context)
+{
+    RaiseNoExcept();
+    return 0;
+}
+
+__declspec(dllexport)
+void ThreadRaiseNoExcept(void)
+{
+    HANDLE thread = CreateThread(0, 0, ThreadRaiser,  0, 0, 0);
+    if (0 != thread)
+    {
+        WaitForSingleObject(thread, INFINITE);
+        CloseHandle(thread);
+    }
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index 80b0bfb..6cd70b7 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -36,7 +36,7 @@
 	nextwhen int64
 
 	// The status field holds one of the values below.
-	status uint32
+	status atomic.Uint32
 }
 
 // Code outside this file has to be careful in using a timer value.
@@ -249,6 +249,7 @@
 	goready(arg.(*g), 0)
 }
 
+// Note: this changes some unsynchronized operations to synchronized operations
 // addtimer adds a timer to the current P.
 // This should only be called with a newly created timer.
 // That avoids the risk of changing the when field of a timer in some P's heap,
@@ -263,10 +264,10 @@
 	if t.period < 0 {
 		throw("timer period must be non-negative")
 	}
-	if t.status != timerNoStatus {
+	if t.status.Load() != timerNoStatus {
 		throw("addtimer called with initialized timer")
 	}
-	t.status = timerWaiting
+	t.status.Store(timerWaiting)
 
 	when := t.when
 
@@ -289,7 +290,7 @@
 func doaddtimer(pp *p, t *timer) {
 	// Timers rely on the network poller, so make sure the poller
 	// has started.
-	if netpollInited == 0 {
+	if netpollInited.Load() == 0 {
 		netpollGenericInit()
 	}
 
@@ -301,9 +302,9 @@
 	pp.timers = append(pp.timers, t)
 	siftupTimer(pp.timers, i)
 	if t == pp.timers[0] {
-		atomic.Store64(&pp.timer0When, uint64(t.when))
+		pp.timer0When.Store(t.when)
 	}
-	atomic.Xadd(&pp.numTimers, 1)
+	pp.numTimers.Add(1)
 }
 
 // deltimer deletes the timer t. It may be on some other P, so we can't
@@ -312,21 +313,21 @@
 // Reports whether the timer was removed before it was run.
 func deltimer(t *timer) bool {
 	for {
-		switch s := atomic.Load(&t.status); s {
+		switch s := t.status.Load(); s {
 		case timerWaiting, timerModifiedLater:
 			// Prevent preemption while the timer is in timerModifying.
 			// This could lead to a self-deadlock. See #38070.
 			mp := acquirem()
-			if atomic.Cas(&t.status, s, timerModifying) {
+			if t.status.CompareAndSwap(s, timerModifying) {
 				// Must fetch t.pp before changing status,
 				// as cleantimers in another goroutine
 				// can clear t.pp of a timerDeleted timer.
 				tpp := t.pp.ptr()
-				if !atomic.Cas(&t.status, timerModifying, timerDeleted) {
+				if !t.status.CompareAndSwap(timerModifying, timerDeleted) {
 					badTimer()
 				}
 				releasem(mp)
-				atomic.Xadd(&tpp.deletedTimers, 1)
+				tpp.deletedTimers.Add(1)
 				// Timer was not yet run.
 				return true
 			} else {
@@ -336,15 +337,15 @@
 			// Prevent preemption while the timer is in timerModifying.
 			// This could lead to a self-deadlock. See #38070.
 			mp := acquirem()
-			if atomic.Cas(&t.status, s, timerModifying) {
+			if t.status.CompareAndSwap(s, timerModifying) {
 				// Must fetch t.pp before setting status
 				// to timerDeleted.
 				tpp := t.pp.ptr()
-				if !atomic.Cas(&t.status, timerModifying, timerDeleted) {
+				if !t.status.CompareAndSwap(timerModifying, timerDeleted) {
 					badTimer()
 				}
 				releasem(mp)
-				atomic.Xadd(&tpp.deletedTimers, 1)
+				tpp.deletedTimers.Add(1)
 				// Timer was not yet run.
 				return true
 			} else {
@@ -397,10 +398,10 @@
 	if i == 0 {
 		updateTimer0When(pp)
 	}
-	n := atomic.Xadd(&pp.numTimers, -1)
+	n := pp.numTimers.Add(-1)
 	if n == 0 {
 		// If there are no timers, then clearly none are modified.
-		atomic.Store64(&pp.timerModifiedEarliest, 0)
+		pp.timerModifiedEarliest.Store(0)
 	}
 	return smallestChanged
 }
@@ -425,10 +426,10 @@
 		siftdownTimer(pp.timers, 0)
 	}
 	updateTimer0When(pp)
-	n := atomic.Xadd(&pp.numTimers, -1)
+	n := pp.numTimers.Add(-1)
 	if n == 0 {
 		// If there are no timers, then clearly none are modified.
-		atomic.Store64(&pp.timerModifiedEarliest, 0)
+		pp.timerModifiedEarliest.Store(0)
 	}
 }
 
@@ -449,12 +450,12 @@
 	var mp *m
 loop:
 	for {
-		switch status = atomic.Load(&t.status); status {
+		switch status = t.status.Load(); status {
 		case timerWaiting, timerModifiedEarlier, timerModifiedLater:
 			// Prevent preemption while the timer is in timerModifying.
 			// This could lead to a self-deadlock. See #38070.
 			mp = acquirem()
-			if atomic.Cas(&t.status, status, timerModifying) {
+			if t.status.CompareAndSwap(status, timerModifying) {
 				pending = true // timer not yet run
 				break loop
 			}
@@ -466,7 +467,7 @@
 
 			// Timer was already run and t is no longer in a heap.
 			// Act like addtimer.
-			if atomic.Cas(&t.status, status, timerModifying) {
+			if t.status.CompareAndSwap(status, timerModifying) {
 				wasRemoved = true
 				pending = false // timer already run or stopped
 				break loop
@@ -476,8 +477,8 @@
 			// Prevent preemption while the timer is in timerModifying.
 			// This could lead to a self-deadlock. See #38070.
 			mp = acquirem()
-			if atomic.Cas(&t.status, status, timerModifying) {
-				atomic.Xadd(&t.pp.ptr().deletedTimers, -1)
+			if t.status.CompareAndSwap(status, timerModifying) {
+				t.pp.ptr().deletedTimers.Add(-1)
 				pending = false // timer already stopped
 				break loop
 			}
@@ -506,7 +507,7 @@
 		lock(&pp.timersLock)
 		doaddtimer(pp, t)
 		unlock(&pp.timersLock)
-		if !atomic.Cas(&t.status, timerModifying, timerWaiting) {
+		if !t.status.CompareAndSwap(timerModifying, timerWaiting) {
 			badTimer()
 		}
 		releasem(mp)
@@ -531,7 +532,7 @@
 		}
 
 		// Set the new status of the timer.
-		if !atomic.Cas(&t.status, timerModifying, newStatus) {
+		if !t.status.CompareAndSwap(timerModifying, newStatus) {
 			badTimer()
 		}
 		releasem(mp)
@@ -577,18 +578,18 @@
 		if t.pp.ptr() != pp {
 			throw("cleantimers: bad p")
 		}
-		switch s := atomic.Load(&t.status); s {
+		switch s := t.status.Load(); s {
 		case timerDeleted:
-			if !atomic.Cas(&t.status, s, timerRemoving) {
+			if !t.status.CompareAndSwap(s, timerRemoving) {
 				continue
 			}
 			dodeltimer0(pp)
-			if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+			if !t.status.CompareAndSwap(timerRemoving, timerRemoved) {
 				badTimer()
 			}
-			atomic.Xadd(&pp.deletedTimers, -1)
+			pp.deletedTimers.Add(-1)
 		case timerModifiedEarlier, timerModifiedLater:
-			if !atomic.Cas(&t.status, s, timerMoving) {
+			if !t.status.CompareAndSwap(s, timerMoving) {
 				continue
 			}
 			// Now we can change the when field.
@@ -596,7 +597,7 @@
 			// Move t to the right position.
 			dodeltimer0(pp)
 			doaddtimer(pp, t)
-			if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+			if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 				badTimer()
 			}
 		default:
@@ -614,30 +615,30 @@
 	for _, t := range timers {
 	loop:
 		for {
-			switch s := atomic.Load(&t.status); s {
+			switch s := t.status.Load(); s {
 			case timerWaiting:
-				if !atomic.Cas(&t.status, s, timerMoving) {
+				if !t.status.CompareAndSwap(s, timerMoving) {
 					continue
 				}
 				t.pp = 0
 				doaddtimer(pp, t)
-				if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+				if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 					badTimer()
 				}
 				break loop
 			case timerModifiedEarlier, timerModifiedLater:
-				if !atomic.Cas(&t.status, s, timerMoving) {
+				if !t.status.CompareAndSwap(s, timerMoving) {
 					continue
 				}
 				t.when = t.nextwhen
 				t.pp = 0
 				doaddtimer(pp, t)
-				if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+				if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 					badTimer()
 				}
 				break loop
 			case timerDeleted:
-				if !atomic.Cas(&t.status, s, timerRemoved) {
+				if !t.status.CompareAndSwap(s, timerRemoved) {
 					continue
 				}
 				t.pp = 0
@@ -671,8 +672,8 @@
 	// a lot of timers back and forth if the timers rarely expire.
 	// We'll postpone looking through all the adjusted timers until
 	// one would actually expire.
-	first := atomic.Load64(&pp.timerModifiedEarliest)
-	if first == 0 || int64(first) > now {
+	first := pp.timerModifiedEarliest.Load()
+	if first == 0 || first > now {
 		if verifyTimers {
 			verifyTimerHeap(pp)
 		}
@@ -680,7 +681,7 @@
 	}
 
 	// We are going to clear all timerModifiedEarlier timers.
-	atomic.Store64(&pp.timerModifiedEarliest, 0)
+	pp.timerModifiedEarliest.Store(0)
 
 	var moved []*timer
 	for i := 0; i < len(pp.timers); i++ {
@@ -688,20 +689,20 @@
 		if t.pp.ptr() != pp {
 			throw("adjusttimers: bad p")
 		}
-		switch s := atomic.Load(&t.status); s {
+		switch s := t.status.Load(); s {
 		case timerDeleted:
-			if atomic.Cas(&t.status, s, timerRemoving) {
+			if t.status.CompareAndSwap(s, timerRemoving) {
 				changed := dodeltimer(pp, i)
-				if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+				if !t.status.CompareAndSwap(timerRemoving, timerRemoved) {
 					badTimer()
 				}
-				atomic.Xadd(&pp.deletedTimers, -1)
+				pp.deletedTimers.Add(-1)
 				// Go back to the earliest changed heap entry.
 				// "- 1" because the loop will add 1.
 				i = changed - 1
 			}
 		case timerModifiedEarlier, timerModifiedLater:
-			if atomic.Cas(&t.status, s, timerMoving) {
+			if t.status.CompareAndSwap(s, timerMoving) {
 				// Now we can change the when field.
 				t.when = t.nextwhen
 				// Take t off the heap, and hold onto it.
@@ -741,7 +742,7 @@
 func addAdjustedTimers(pp *p, moved []*timer) {
 	for _, t := range moved {
 		doaddtimer(pp, t)
-		if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+		if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 			badTimer()
 		}
 	}
@@ -754,8 +755,8 @@
 //
 //go:nowritebarrierrec
 func nobarrierWakeTime(pp *p) int64 {
-	next := int64(atomic.Load64(&pp.timer0When))
-	nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+	next := pp.timer0When.Load()
+	nextAdj := pp.timerModifiedEarliest.Load()
 	if next == 0 || (nextAdj != 0 && nextAdj < next) {
 		next = nextAdj
 	}
@@ -776,14 +777,14 @@
 		if t.pp.ptr() != pp {
 			throw("runtimer: bad p")
 		}
-		switch s := atomic.Load(&t.status); s {
+		switch s := t.status.Load(); s {
 		case timerWaiting:
 			if t.when > now {
 				// Not ready to run.
 				return t.when
 			}
 
-			if !atomic.Cas(&t.status, s, timerRunning) {
+			if !t.status.CompareAndSwap(s, timerRunning) {
 				continue
 			}
 			// Note that runOneTimer may temporarily unlock
@@ -792,26 +793,26 @@
 			return 0
 
 		case timerDeleted:
-			if !atomic.Cas(&t.status, s, timerRemoving) {
+			if !t.status.CompareAndSwap(s, timerRemoving) {
 				continue
 			}
 			dodeltimer0(pp)
-			if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+			if !t.status.CompareAndSwap(timerRemoving, timerRemoved) {
 				badTimer()
 			}
-			atomic.Xadd(&pp.deletedTimers, -1)
+			pp.deletedTimers.Add(-1)
 			if len(pp.timers) == 0 {
 				return -1
 			}
 
 		case timerModifiedEarlier, timerModifiedLater:
-			if !atomic.Cas(&t.status, s, timerMoving) {
+			if !t.status.CompareAndSwap(s, timerMoving) {
 				continue
 			}
 			t.when = t.nextwhen
 			dodeltimer0(pp)
 			doaddtimer(pp, t)
-			if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+			if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 				badTimer()
 			}
 
@@ -858,14 +859,14 @@
 			t.when = maxWhen
 		}
 		siftdownTimer(pp.timers, 0)
-		if !atomic.Cas(&t.status, timerRunning, timerWaiting) {
+		if !t.status.CompareAndSwap(timerRunning, timerWaiting) {
 			badTimer()
 		}
 		updateTimer0When(pp)
 	} else {
 		// Remove from heap.
 		dodeltimer0(pp)
-		if !atomic.Cas(&t.status, timerRunning, timerNoStatus) {
+		if !t.status.CompareAndSwap(timerRunning, timerNoStatus) {
 			badTimer()
 		}
 	}
@@ -903,7 +904,7 @@
 func clearDeletedTimers(pp *p) {
 	// We are going to clear all timerModifiedEarlier timers.
 	// Do this now in case new ones show up while we are looping.
-	atomic.Store64(&pp.timerModifiedEarliest, 0)
+	pp.timerModifiedEarliest.Store(0)
 
 	cdel := int32(0)
 	to := 0
@@ -912,7 +913,7 @@
 nextTimer:
 	for _, t := range timers {
 		for {
-			switch s := atomic.Load(&t.status); s {
+			switch s := t.status.Load(); s {
 			case timerWaiting:
 				if changedHeap {
 					timers[to] = t
@@ -921,22 +922,22 @@
 				to++
 				continue nextTimer
 			case timerModifiedEarlier, timerModifiedLater:
-				if atomic.Cas(&t.status, s, timerMoving) {
+				if t.status.CompareAndSwap(s, timerMoving) {
 					t.when = t.nextwhen
 					timers[to] = t
 					siftupTimer(timers, to)
 					to++
 					changedHeap = true
-					if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+					if !t.status.CompareAndSwap(timerMoving, timerWaiting) {
 						badTimer()
 					}
 					continue nextTimer
 				}
 			case timerDeleted:
-				if atomic.Cas(&t.status, s, timerRemoving) {
+				if t.status.CompareAndSwap(s, timerRemoving) {
 					t.pp = 0
 					cdel++
-					if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+					if !t.status.CompareAndSwap(timerRemoving, timerRemoved) {
 						badTimer()
 					}
 					changedHeap = true
@@ -964,8 +965,8 @@
 		timers[i] = nil
 	}
 
-	atomic.Xadd(&pp.deletedTimers, -cdel)
-	atomic.Xadd(&pp.numTimers, -cdel)
+	pp.deletedTimers.Add(-cdel)
+	pp.numTimers.Add(-cdel)
 
 	timers = timers[:to]
 	pp.timers = timers
@@ -993,7 +994,7 @@
 			throw("bad timer heap")
 		}
 	}
-	if numTimers := int(atomic.Load(&pp.numTimers)); len(pp.timers) != numTimers {
+	if numTimers := int(pp.numTimers.Load()); len(pp.timers) != numTimers {
 		println("timer heap len", len(pp.timers), "!= numTimers", numTimers)
 		throw("bad timer heap len")
 	}
@@ -1003,9 +1004,9 @@
 // The caller must have locked the timers for pp.
 func updateTimer0When(pp *p) {
 	if len(pp.timers) == 0 {
-		atomic.Store64(&pp.timer0When, 0)
+		pp.timer0When.Store(0)
 	} else {
-		atomic.Store64(&pp.timer0When, uint64(pp.timers[0].when))
+		pp.timer0When.Store(pp.timers[0].when)
 	}
 }
 
@@ -1014,11 +1015,12 @@
 // The timers for pp will not be locked.
 func updateTimerModifiedEarliest(pp *p, nextwhen int64) {
 	for {
-		old := atomic.Load64(&pp.timerModifiedEarliest)
+		old := pp.timerModifiedEarliest.Load()
 		if old != 0 && int64(old) < nextwhen {
 			return
 		}
-		if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) {
+
+		if pp.timerModifiedEarliest.CompareAndSwap(old, nextwhen) {
 			return
 		}
 	}
@@ -1039,12 +1041,12 @@
 			continue
 		}
 
-		w := int64(atomic.Load64(&pp.timer0When))
+		w := pp.timer0When.Load()
 		if w != 0 && w < next {
 			next = w
 		}
 
-		w = int64(atomic.Load64(&pp.timerModifiedEarliest))
+		w = pp.timerModifiedEarliest.Load()
 		if w != 0 && w < next {
 			next = w
 		}
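
The timer changes above move status, timer0When, timerModifiedEarliest, numTimers and deletedTimers onto typed atomics, so every access is spelled Load/Store/Add/CompareAndSwap rather than atomic.Xadd/Cas on plain integers. The exported sync/atomic types behave the same way; below is a much-simplified sketch of the status state machine, with an illustrative subset of the constants and transitions, not the runtime's full set:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	timerNoStatus uint32 = iota
	timerWaiting
	timerModifying
	timerDeleted
)

type timer struct {
	status atomic.Uint32
}

// deltimer marks a waiting timer deleted, mirroring the CAS-driven
// transitions in the runtime code above.
func deltimer(t *timer) bool {
	for {
		switch s := t.status.Load(); s {
		case timerWaiting:
			if t.status.CompareAndSwap(s, timerModifying) {
				t.status.Store(timerDeleted)
				return true // removed before it ran
			}
			// Lost a race; reload the status and retry.
		default:
			return false // already deleted, running, or never added
		}
	}
}

func main() {
	t := &timer{}
	t.status.Store(timerWaiting)
	fmt.Println(deltimer(t)) // true
	fmt.Println(deltimer(t)) // false
}
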
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 10436d8..e7dfab1 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -109,6 +109,8 @@
 
 // trace is global tracing context.
 var trace struct {
+	// trace.lock must only be acquired on the system stack where
+	// stack splits cannot happen while it is held.
 	lock          mutex       // protects the following members
 	lockOwner     *g          // to avoid deadlocks during recursive lock locks
 	enabled       bool        // when set runtime traces events
@@ -126,7 +128,6 @@
 	empty         traceBufPtr // stack of empty buffers
 	fullHead      traceBufPtr // queue of full buffers
 	fullTail      traceBufPtr
-	reader        guintptr        // goroutine that called ReadTrace, or nil
 	stackTab      traceStackTable // maps stack traces to unique ids
 	// cpuLogRead accepts CPU profile samples from the signal handler where
 	// they're generated. It uses a two-word header to hold the IDs of the P and
@@ -144,6 +145,8 @@
 	// specific P.
 	cpuLogBuf traceBufPtr
 
+	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil
+
 	signalLock  atomic.Uint32 // protects use of the following member, only usable in signal handlers
 	cpuLogWrite *profBuf      // copy of cpuLogRead for use in signal handlers, set without signalLock
 
@@ -173,9 +176,8 @@
 }
 
 // traceBuf is per-P tracing buffer.
-//
-//go:notinheap
 type traceBuf struct {
+	_ sys.NotInHeap
 	traceBufHeader
 	arr [64<<10 - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for traceBufHeader.buf
 }
@@ -186,7 +188,7 @@
 // manipulated in contexts where write barriers are not allowed, so
 // this is necessary.
 //
-// TODO: Since traceBuf is now go:notinheap, this isn't necessary.
+// TODO: Since traceBuf is now embedded runtime/internal/sys.NotInHeap, this isn't necessary.
 type traceBufPtr uintptr
 
 func (tp traceBufPtr) ptr() *traceBuf   { return (*traceBuf)(unsafe.Pointer(tp)) }
@@ -232,14 +234,12 @@
 	// - or GoSysExit appears for a goroutine for which we don't emit EvGoInSyscall below.
 	// To instruct traceEvent that it must not ignore events below, we set startingtrace.
 	// trace.enabled is set afterwards once we have emitted all preliminary events.
-	_g_ := getg()
-	_g_.m.startingtrace = true
+	mp := getg().m
+	mp.startingtrace = true
 
 	// Obtain current stack ID to use in all traceEvGoCreate events below.
-	mp := acquirem()
 	stkBuf := make([]uintptr, traceStackSize)
 	stackID := traceStackID(mp, stkBuf, 2)
-	releasem(mp)
 
 	profBuf := newProfBuf(2, profBufWordCount, profBufTagCount) // after the timestamp, header is [pp.id, gp.goid]
 	trace.cpuLogRead = profBuf
@@ -261,16 +261,27 @@
 			gp.tracelastp = getg().m.p
 			// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
 			id := trace.stackTab.put([]uintptr{startPCforTrace(gp.startpc) + sys.PCQuantum})
-			traceEvent(traceEvGoCreate, -1, uint64(gp.goid), uint64(id), stackID)
+			traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
 		}
 		if status == _Gwaiting {
 			// traceEvGoWaiting is implied to have seq=1.
 			gp.traceseq++
-			traceEvent(traceEvGoWaiting, -1, uint64(gp.goid))
+			traceEvent(traceEvGoWaiting, -1, gp.goid)
 		}
 		if status == _Gsyscall {
 			gp.traceseq++
-			traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid))
+			traceEvent(traceEvGoInSyscall, -1, gp.goid)
+		} else if status == _Gdead && gp.m != nil && gp.m.isextra {
+			// Trigger two trace events for the dead g in the extra m,
+			// since the next event of the g will be traceEvGoSysExit in exitsyscall,
+			// while calling from C thread to Go.
+			gp.traceseq = 0
+			gp.tracelastp = getg().m.p
+			// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+			id := trace.stackTab.put([]uintptr{startPCforTrace(0) + sys.PCQuantum}) // no start pc
+			traceEvent(traceEvGoCreate, -1, gp.goid, uint64(id), stackID)
+			gp.traceseq++
+			traceEvent(traceEvGoInSyscall, -1, gp.goid)
 		} else {
 			gp.sysblocktraced = false
 		}
@@ -293,7 +304,7 @@
 	trace.strings = make(map[string]uint64)
 
 	trace.seqGC = 0
-	_g_.m.startingtrace = false
+	mp.startingtrace = false
 	trace.enabled = true
 
 	// Register runtime goroutine labels.
@@ -386,31 +397,33 @@
 		raceacquire(unsafe.Pointer(&trace.shutdownSema))
 	}
 
-	// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
-	lock(&trace.lock)
-	for _, p := range allp[:cap(allp)] {
-		if p.tracebuf != 0 {
-			throw("trace: non-empty trace buffer in proc")
+	systemstack(func() {
+		// The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
+		lock(&trace.lock)
+		for _, p := range allp[:cap(allp)] {
+			if p.tracebuf != 0 {
+				throw("trace: non-empty trace buffer in proc")
+			}
 		}
-	}
-	if trace.buf != 0 {
-		throw("trace: non-empty global trace buffer")
-	}
-	if trace.fullHead != 0 || trace.fullTail != 0 {
-		throw("trace: non-empty full trace buffer")
-	}
-	if trace.reading != 0 || trace.reader != 0 {
-		throw("trace: reading after shutdown")
-	}
-	for trace.empty != 0 {
-		buf := trace.empty
-		trace.empty = buf.ptr().link
-		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
-	}
-	trace.strings = nil
-	trace.shutdown = false
-	trace.cpuLogRead = nil
-	unlock(&trace.lock)
+		if trace.buf != 0 {
+			throw("trace: non-empty global trace buffer")
+		}
+		if trace.fullHead != 0 || trace.fullTail != 0 {
+			throw("trace: non-empty full trace buffer")
+		}
+		if trace.reading != 0 || trace.reader.Load() != nil {
+			throw("trace: reading after shutdown")
+		}
+		for trace.empty != 0 {
+			buf := trace.empty
+			trace.empty = buf.ptr().link
+			sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
+		}
+		trace.strings = nil
+		trace.shutdown = false
+		trace.cpuLogRead = nil
+		unlock(&trace.lock)
+	})
 }
 
 // ReadTrace returns the next chunk of binary tracing data, blocking until data
@@ -419,6 +432,55 @@
 // returned data before calling ReadTrace again.
 // ReadTrace must be called from one goroutine at a time.
 func ReadTrace() []byte {
+top:
+	var buf []byte
+	var park bool
+	systemstack(func() {
+		buf, park = readTrace0()
+	})
+	if park {
+		gopark(func(gp *g, _ unsafe.Pointer) bool {
+			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
+				// We're racing with another reader.
+				// Wake up and handle this case.
+				return false
+			}
+
+			if g2 := traceReader(); gp == g2 {
+				// New data arrived between unlocking
+				// and the CAS and we won the wake-up
+				// race, so wake up directly.
+				return false
+			} else if g2 != nil {
+				printlock()
+				println("runtime: got trace reader", g2, g2.goid)
+				throw("unexpected trace reader")
+			}
+
+			return true
+		}, nil, waitReasonTraceReaderBlocked, traceEvGoBlock, 2)
+		goto top
+	}
+
+	return buf
+}
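
ReadTrace now parks by publishing the reader goroutine through trace.reader, an atomic.Pointer[g], winning a compare-and-swap from nil (CompareAndSwapNoWB is the runtime-internal variant that skips the write barrier). The exported generic sync/atomic.Pointer[T] offers the same typed operations; a small sketch of the single-reader hand-off idea:

package main

import (
	"fmt"
	"sync/atomic"
)

type reader struct{ id int }

func main() {
	var slot atomic.Pointer[reader]

	// The first reader installs itself; only one can win the CAS from nil.
	me := &reader{id: 1}
	fmt.Println("installed:", slot.CompareAndSwap(nil, me))

	// A concurrent second reader loses the race, the analogue of
	// "ReadTrace called from multiple goroutines simultaneously".
	fmt.Println("second install:", slot.CompareAndSwap(nil, &reader{id: 2}))

	// The waker takes the parked reader back out and clears the slot.
	if r := slot.Swap(nil); r != nil {
		fmt.Println("woke reader", r.id)
	}
}
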
+
+// readTrace0 is ReadTrace's continuation on g0. This must run on the
+// system stack because it acquires trace.lock.
+//
+//go:systemstack
+func readTrace0() (buf []byte, park bool) {
+	if raceenabled {
+		// g0 doesn't have a race context. Borrow the user G's.
+		if getg().racectx != 0 {
+			throw("expected racectx == 0")
+		}
+		getg().racectx = getg().m.curg.racectx
+		// (This defer should get open-coded, which is safe on
+		// the system stack.)
+		defer func() { getg().racectx = 0 }()
+	}
+
 	// This function may need to lock trace.lock recursively
 	// (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
 	// To allow this we use trace.lockOwner.
@@ -426,16 +488,16 @@
 	// allocation can call heap allocate, which will try to emit a trace
 	// event while holding heap lock.
 	lock(&trace.lock)
-	trace.lockOwner = getg()
+	trace.lockOwner = getg().m.curg
 
-	if trace.reader != 0 {
+	if trace.reader.Load() != nil {
 		// More than one goroutine reads trace. This is bad.
 		// But we would rather not crash the program because of tracing,
 		// since tracing can be enabled at runtime on prod servers.
 		trace.lockOwner = nil
 		unlock(&trace.lock)
 		println("runtime: ReadTrace called from multiple goroutines simultaneously")
-		return nil
+		return nil, false
 	}
 	// Recycle the old buffer.
 	if buf := trace.reading; buf != 0 {
@@ -448,7 +510,7 @@
 		trace.headerWritten = true
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return []byte("go 1.19 trace\x00\x00\x00")
+		return []byte("go 1.19 trace\x00\x00\x00"), false
 	}
 	// Optimistically look for CPU profile samples. This may write new stack
 	// records, and may write new tracing buffers.
@@ -457,17 +519,22 @@
 	}
 	// Wait for new data.
 	if trace.fullHead == 0 && !trace.shutdown {
-		trace.reader.set(getg())
-		goparkunlock(&trace.lock, waitReasonTraceReaderBlocked, traceEvGoBlock, 2)
-		lock(&trace.lock)
+		// We don't simply use a note because the scheduler
+		// executes this goroutine directly when it wakes up
+		// (also a note would consume an M).
+		trace.lockOwner = nil
+		unlock(&trace.lock)
+		return nil, true
 	}
+newFull:
+	assertLockHeld(&trace.lock)
 	// Write a buffer.
 	if trace.fullHead != 0 {
 		buf := traceFullDequeue()
 		trace.reading = buf
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return buf.ptr().arr[:buf.ptr().pos]
+		return buf.ptr().arr[:buf.ptr().pos], false
 	}
 
 	// Write footer with timer frequency.
@@ -480,13 +547,22 @@
 		}
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		var data []byte
-		data = append(data, traceEvFrequency|0<<traceArgCountShift)
-		data = traceAppend(data, uint64(freq))
+
+		// Write frequency event.
+		bufp := traceFlush(0, 0)
+		buf := bufp.ptr()
+		buf.byte(traceEvFrequency | 0<<traceArgCountShift)
+		buf.varint(uint64(freq))
+
+		// Dump stack table.
 		// This will emit a bunch of full buffers, we will pick them up
 		// on the next iteration.
-		trace.stackTab.dump()
-		return data
+		bufp = trace.stackTab.dump(bufp)
+
+		// Flush final buffer.
+		lock(&trace.lock)
+		traceFullQueue(bufp)
+		goto newFull // trace.lock should be held at newFull
 	}
 	// Done.
 	if trace.shutdown {
@@ -500,40 +576,51 @@
 		}
 		// trace.enabled is already reset, so can call traceable functions.
 		semrelease(&trace.shutdownSema)
-		return nil
+		return nil, false
 	}
 	// Also bad, but see the comment above.
 	trace.lockOwner = nil
 	unlock(&trace.lock)
 	println("runtime: spurious wakeup of trace reader")
-	return nil
+	return nil, false
 }
 
 // traceReader returns the trace reader that should be woken up, if any.
 // Callers should first check that trace.enabled or trace.shutdown is set.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceReader() *g {
-	if !traceReaderAvailable() {
+	// Optimistic check first
+	if traceReaderAvailable() == nil {
 		return nil
 	}
 	lock(&trace.lock)
-	if !traceReaderAvailable() {
+	gp := traceReaderAvailable()
+	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
 		unlock(&trace.lock)
 		return nil
 	}
-	gp := trace.reader.ptr()
-	trace.reader.set(nil)
 	unlock(&trace.lock)
 	return gp
 }
 
-// traceReaderAvailable returns true if the trace reader is not currently
+// traceReaderAvailable returns the trace reader if it is not currently
 // scheduled and should be. Callers should first check that trace.enabled
 // or trace.shutdown is set.
-func traceReaderAvailable() bool {
-	return trace.reader != 0 && (trace.fullHead != 0 || trace.shutdown)
+func traceReaderAvailable() *g {
+	if trace.fullHead != 0 || trace.shutdown {
+		return trace.reader.Load()
+	}
+	return nil
 }
 
 // traceProcFree frees trace buffer associated with pp.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceProcFree(pp *p) {
 	buf := pp.tracebuf
 	pp.tracebuf = 0
@@ -624,7 +711,9 @@
 	// TODO: test on non-zero extraBytes param.
 	maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params
 	if buf == nil || len(buf.arr)-buf.pos < maxSize {
-		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+		systemstack(func() {
+			buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+		})
 		bufp.set(buf)
 	}
 
@@ -701,7 +790,7 @@
 		hdr[0] = 0b10
 	}
 	if gp != nil {
-		hdr[1] = uint64(gp.goid)
+		hdr[1] = gp.goid
 	}
 
 	// Allow only one writer at a time
@@ -765,7 +854,9 @@
 
 			buf := bufp.ptr()
 			if buf == nil {
-				*bufp = traceFlush(*bufp, 0)
+				systemstack(func() {
+					*bufp = traceFlush(*bufp, 0)
+				})
 				buf = bufp.ptr()
 			}
 			for i := range stk {
@@ -782,19 +873,18 @@
 }
 
 func traceStackID(mp *m, buf []uintptr, skip int) uint64 {
-	_g_ := getg()
-	gp := mp.curg
+	gp := getg()
+	curgp := mp.curg
 	var nstk int
-	if gp == _g_ {
+	if curgp == gp {
 		nstk = callers(skip+1, buf)
-	} else if gp != nil {
-		gp = mp.curg
-		nstk = gcallers(gp, skip, buf)
+	} else if curgp != nil {
+		nstk = gcallers(curgp, skip, buf)
 	}
 	if nstk > 0 {
 		nstk-- // skip runtime.goexit
 	}
-	if nstk > 0 && gp.goid == 1 {
+	if nstk > 0 && curgp.goid == 1 {
 		nstk-- // skip runtime.main
 	}
 	id := trace.stackTab.put(buf[:nstk])
@@ -803,6 +893,11 @@
 
 // traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
 func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
+	// Any time we acquire a buffer, we may end up flushing it,
+	// but flushes are rare. Record the lock edge even if it
+	// doesn't happen this time.
+	lockRankMayTraceFlush()
+
 	mp = acquirem()
 	if p := mp.p.ptr(); p != nil {
 		return mp, p.id, &p.tracebuf
@@ -819,7 +914,21 @@
 	releasem(getg().m)
 }
 
+// lockRankMayTraceFlush records the lock ranking effects of a
+// potential call to traceFlush.
+func lockRankMayTraceFlush() {
+	owner := trace.lockOwner
+	dolock := owner == nil || owner != getg().m.curg
+	if dolock {
+		lockWithRankMayAcquire(&trace.lock, getLockRank(&trace.lock))
+	}
+}
+
 // traceFlush puts buf onto stack of full buffers and returns an empty buffer.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
 func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
 	owner := trace.lockOwner
 	dolock := owner == nil || owner != getg().m.curg
@@ -897,8 +1006,10 @@
 	buf := bufp.ptr()
 	size := 1 + 2*traceBytesPerNumber + len(s)
 	if buf == nil || len(buf.arr)-buf.pos < size {
-		buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
-		bufp.set(buf)
+		systemstack(func() {
+			buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+			bufp.set(buf)
+		})
 	}
 	buf.byte(traceEvString)
 	buf.varint(id)
@@ -917,15 +1028,6 @@
 	return id, bufp
 }
 
-// traceAppend appends v to buf in little-endian-base-128 encoding.
-func traceAppend(buf []byte, v uint64) []byte {
-	for ; v >= 0x80; v >>= 7 {
-		buf = append(buf, 0x80|byte(v))
-	}
-	buf = append(buf, byte(v))
-	return buf
-}
-
 // varint appends v to buf in little-endian-base-128 encoding.
 func (buf *traceBuf) varint(v uint64) {
 	pos := buf.pos
@@ -938,6 +1040,22 @@
 	buf.pos = pos
 }
 
+// varintAt writes varint v at byte position pos in buf. This always
+// consumes traceBytesPerNumber bytes. This is intended for when the
+// caller needs to reserve space for a varint but can't populate it
+// until later.
+func (buf *traceBuf) varintAt(pos int, v uint64) {
+	for i := 0; i < traceBytesPerNumber; i++ {
+		if i < traceBytesPerNumber-1 {
+			buf.arr[pos] = 0x80 | byte(v)
+		} else {
+			buf.arr[pos] = byte(v)
+		}
+		v >>= 7
+		pos++
+	}
+}
+
 // byte appends v to buf.
 func (buf *traceBuf) byte(v byte) {
 	buf.arr[buf.pos] = v
@@ -947,7 +1065,7 @@
 // traceStackTable maps stack traces (arrays of PC's) to unique uint32 ids.
 // It is lock-free for reading.
 type traceStackTable struct {
-	lock mutex
+	lock mutex // Must be acquired on the system stack
 	seq  uint32
 	mem  traceAlloc
 	tab  [1 << 13]traceStackPtr
@@ -983,26 +1101,31 @@
 		return id
 	}
 	// Now, double check under the mutex.
-	lock(&tab.lock)
-	if id := tab.find(pcs, hash); id != 0 {
+	// Switch to the system stack so we can acquire tab.lock
+	var id uint32
+	systemstack(func() {
+		lock(&tab.lock)
+		if id = tab.find(pcs, hash); id != 0 {
+			unlock(&tab.lock)
+			return
+		}
+		// Create new record.
+		tab.seq++
+		stk := tab.newStack(len(pcs))
+		stk.hash = hash
+		stk.id = tab.seq
+		id = stk.id
+		stk.n = len(pcs)
+		stkpc := stk.stack()
+		for i, pc := range pcs {
+			stkpc[i] = pc
+		}
+		part := int(hash % uintptr(len(tab.tab)))
+		stk.link = tab.tab[part]
+		atomicstorep(unsafe.Pointer(&tab.tab[part]), unsafe.Pointer(stk))
 		unlock(&tab.lock)
-		return id
-	}
-	// Create new record.
-	tab.seq++
-	stk := tab.newStack(len(pcs))
-	stk.hash = hash
-	stk.id = tab.seq
-	stk.n = len(pcs)
-	stkpc := stk.stack()
-	for i, pc := range pcs {
-		stkpc[i] = pc
-	}
-	part := int(hash % uintptr(len(tab.tab)))
-	stk.link = tab.tab[part]
-	atomicstorep(unsafe.Pointer(&tab.tab[part]), unsafe.Pointer(stk))
-	unlock(&tab.lock)
-	return stk.id
+	})
+	return id
 }
 
 // find checks if the stack trace pcs is already present in the table.
@@ -1027,61 +1150,75 @@
 	return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*goarch.PtrSize))
 }
 
-// allFrames returns all of the Frames corresponding to pcs.
-func allFrames(pcs []uintptr) []Frame {
-	frames := make([]Frame, 0, len(pcs))
+// traceFrames returns the frames corresponding to pcs. It may
+// allocate and may emit trace events.
+func traceFrames(bufp traceBufPtr, pcs []uintptr) ([]traceFrame, traceBufPtr) {
+	frames := make([]traceFrame, 0, len(pcs))
 	ci := CallersFrames(pcs)
 	for {
+		var frame traceFrame
 		f, more := ci.Next()
-		frames = append(frames, f)
+		frame, bufp = traceFrameForPC(bufp, 0, f)
+		frames = append(frames, frame)
 		if !more {
-			return frames
+			return frames, bufp
 		}
 	}
 }
 
 // dump writes all previously cached stacks to trace buffers,
 // releases all memory and resets state.
-func (tab *traceStackTable) dump() {
-	var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte
-	bufp := traceFlush(0, 0)
-	for _, stk := range tab.tab {
-		stk := stk.ptr()
+//
+// This must run on the system stack because it calls traceFlush.
+//
+//go:systemstack
+func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
+	for i := range tab.tab {
+		stk := tab.tab[i].ptr()
 		for ; stk != nil; stk = stk.link.ptr() {
-			tmpbuf := tmp[:0]
-			tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
-			frames := allFrames(stk.stack())
-			tmpbuf = traceAppend(tmpbuf, uint64(len(frames)))
-			for _, f := range frames {
-				var frame traceFrame
-				frame, bufp = traceFrameForPC(bufp, 0, f)
-				tmpbuf = traceAppend(tmpbuf, uint64(f.PC))
-				tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
-				tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
-				tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
-			}
-			// Now copy to the buffer.
-			size := 1 + traceBytesPerNumber + len(tmpbuf)
-			if buf := bufp.ptr(); len(buf.arr)-buf.pos < size {
+			var frames []traceFrame
+			frames, bufp = traceFrames(bufp, stk.stack())
+
+			// Estimate the size of this record. This
+			// bound is pretty loose, but avoids counting
+			// lots of varint sizes.
+			maxSize := 1 + traceBytesPerNumber + (2+4*len(frames))*traceBytesPerNumber
+			// Make sure we have enough buffer space.
+			if buf := bufp.ptr(); len(buf.arr)-buf.pos < maxSize {
 				bufp = traceFlush(bufp, 0)
 			}
+
+			// Emit header, with space reserved for length.
 			buf := bufp.ptr()
 			buf.byte(traceEvStack | 3<<traceArgCountShift)
-			buf.varint(uint64(len(tmpbuf)))
-			buf.pos += copy(buf.arr[buf.pos:], tmpbuf)
+			lenPos := buf.pos
+			buf.pos += traceBytesPerNumber
+
+			// Emit body.
+			recPos := buf.pos
+			buf.varint(uint64(stk.id))
+			buf.varint(uint64(len(frames)))
+			for _, frame := range frames {
+				buf.varint(uint64(frame.PC))
+				buf.varint(frame.funcID)
+				buf.varint(frame.fileID)
+				buf.varint(frame.line)
+			}
+
+			// Fill in size header.
+			buf.varintAt(lenPos, uint64(buf.pos-recPos))
 		}
 	}
 
-	lock(&trace.lock)
-	traceFullQueue(bufp)
-	unlock(&trace.lock)
-
 	tab.mem.drop()
 	*tab = traceStackTable{}
 	lockInit(&((*tab).lock), lockRankTraceStackTab)
+
+	return bufp
 }
 
 type traceFrame struct {
+	PC     uintptr
 	funcID uint64
 	fileID uint64
 	line   uint64
@@ -1092,6 +1229,7 @@
 func traceFrameForPC(buf traceBufPtr, pid int32, f Frame) (traceFrame, traceBufPtr) {
 	bufp := &buf
 	var frame traceFrame
+	frame.PC = f.PC
 
 	fn := f.Function
 	const maxLen = 1 << 10
@@ -1120,14 +1258,13 @@
 // traceAllocBlock is allocated from non-GC'd memory, so it must not
 // contain heap pointers. Writes to pointers to traceAllocBlocks do
 // not need write barriers.
-//
-//go:notinheap
 type traceAllocBlock struct {
+	_    sys.NotInHeap
 	next traceAllocBlockPtr
 	data [64<<10 - goarch.PtrSize]byte
 }
 
-// TODO: Since traceAllocBlock is now go:notinheap, this isn't necessary.
+// TODO: Since traceAllocBlock now embeds runtime/internal/sys.NotInHeap, this isn't necessary.
 type traceAllocBlockPtr uintptr
 
 func (p traceAllocBlockPtr) ptr() *traceAllocBlock   { return (*traceAllocBlock)(unsafe.Pointer(p)) }
@@ -1208,11 +1345,11 @@
 func traceGCSweepStart() {
 	// Delay the actual GCSweepStart event until the first span
 	// sweep. If we don't sweep anything, don't emit any events.
-	_p_ := getg().m.p.ptr()
-	if _p_.traceSweep {
+	pp := getg().m.p.ptr()
+	if pp.traceSweep {
 		throw("double traceGCSweepStart")
 	}
-	_p_.traceSweep, _p_.traceSwept, _p_.traceReclaimed = true, 0, 0
+	pp.traceSweep, pp.traceSwept, pp.traceReclaimed = true, 0, 0
 }
 
 // traceGCSweepSpan traces the sweep of a single page.
@@ -1220,24 +1357,24 @@
 // This may be called outside a traceGCSweepStart/traceGCSweepDone
 // pair; however, it will not emit any trace events in this case.
 func traceGCSweepSpan(bytesSwept uintptr) {
-	_p_ := getg().m.p.ptr()
-	if _p_.traceSweep {
-		if _p_.traceSwept == 0 {
+	pp := getg().m.p.ptr()
+	if pp.traceSweep {
+		if pp.traceSwept == 0 {
 			traceEvent(traceEvGCSweepStart, 1)
 		}
-		_p_.traceSwept += bytesSwept
+		pp.traceSwept += bytesSwept
 	}
 }
 
 func traceGCSweepDone() {
-	_p_ := getg().m.p.ptr()
-	if !_p_.traceSweep {
+	pp := getg().m.p.ptr()
+	if !pp.traceSweep {
 		throw("missing traceGCSweepStart")
 	}
-	if _p_.traceSwept != 0 {
-		traceEvent(traceEvGCSweepDone, -1, uint64(_p_.traceSwept), uint64(_p_.traceReclaimed))
+	if pp.traceSwept != 0 {
+		traceEvent(traceEvGCSweepDone, -1, uint64(pp.traceSwept), uint64(pp.traceReclaimed))
 	}
-	_p_.traceSweep = false
+	pp.traceSweep = false
 }
 
 func traceGCMarkAssistStart() {
@@ -1253,20 +1390,20 @@
 	newg.tracelastp = getg().m.p
 	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
 	id := trace.stackTab.put([]uintptr{startPCforTrace(pc) + sys.PCQuantum})
-	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id))
+	traceEvent(traceEvGoCreate, 2, newg.goid, uint64(id))
 }
 
 func traceGoStart() {
-	_g_ := getg().m.curg
-	_p_ := _g_.m.p
-	_g_.traceseq++
-	if _p_.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker {
-		traceEvent(traceEvGoStartLabel, -1, uint64(_g_.goid), _g_.traceseq, trace.markWorkerLabels[_p_.ptr().gcMarkWorkerMode])
-	} else if _g_.tracelastp == _p_ {
-		traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid))
+	gp := getg().m.curg
+	pp := gp.m.p
+	gp.traceseq++
+	if pp.ptr().gcMarkWorkerMode != gcMarkWorkerNotWorker {
+		traceEvent(traceEvGoStartLabel, -1, gp.goid, gp.traceseq, trace.markWorkerLabels[pp.ptr().gcMarkWorkerMode])
+	} else if gp.tracelastp == pp {
+		traceEvent(traceEvGoStartLocal, -1, gp.goid)
 	} else {
-		_g_.tracelastp = _p_
-		traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq)
+		gp.tracelastp = pp
+		traceEvent(traceEvGoStart, -1, gp.goid, gp.traceseq)
 	}
 }
 
@@ -1275,14 +1412,14 @@
 }
 
 func traceGoSched() {
-	_g_ := getg()
-	_g_.tracelastp = _g_.m.p
+	gp := getg()
+	gp.tracelastp = gp.m.p
 	traceEvent(traceEvGoSched, 1)
 }
 
 func traceGoPreempt() {
-	_g_ := getg()
-	_g_.tracelastp = _g_.m.p
+	gp := getg()
+	gp.tracelastp = gp.m.p
 	traceEvent(traceEvGoPreempt, 1)
 }
 
@@ -1294,13 +1431,13 @@
 }
 
 func traceGoUnpark(gp *g, skip int) {
-	_p_ := getg().m.p
+	pp := getg().m.p
 	gp.traceseq++
-	if gp.tracelastp == _p_ {
-		traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid))
+	if gp.tracelastp == pp {
+		traceEvent(traceEvGoUnblockLocal, skip, gp.goid)
 	} else {
-		gp.tracelastp = _p_
-		traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq)
+		gp.tracelastp = pp
+		traceEvent(traceEvGoUnblock, skip, gp.goid, gp.traceseq)
 	}
 }
 
@@ -1321,10 +1458,10 @@
 		// aka right now), and assign a fresh time stamp to keep the log consistent.
 		ts = 0
 	}
-	_g_ := getg().m.curg
-	_g_.traceseq++
-	_g_.tracelastp = _g_.m.p
-	traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv)
+	gp := getg().m.curg
+	gp.traceseq++
+	gp.tracelastp = gp.m.p
+	traceEvent(traceEvGoSysExit, -1, gp.goid, gp.traceseq, uint64(ts)/traceTickDiv)
 }
 
 func traceGoSysBlock(pp *p) {
@@ -1338,8 +1475,8 @@
 	releasem(mp)
 }
 
-func traceHeapAlloc() {
-	traceEvent(traceEvHeapAlloc, -1, gcController.heapLive)
+func traceHeapAlloc(live uint64) {
+	traceEvent(traceEvHeapAlloc, -1, live)
 }
 
 func traceHeapGoal() {
@@ -1432,7 +1569,7 @@
 func startPCforTrace(pc uintptr) uintptr {
 	f := findfunc(pc)
 	if !f.valid() {
-		return pc // should not happen, but don't care
+		return pc // may happen for a locked g in an extra M, since its pc is 0.
 	}
 	w := funcdata(f, _FUNCDATA_WrapInfo)
 	if w == nil {
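
The trace.go changes above drop the old traceAppend helper in favor of traceBuf.varint, and add varintAt, which always consumes traceBytesPerNumber bytes so a length field can be reserved up front and patched once the record size is known. A minimal standalone sketch of that little-endian base-128 encoding (the function names below are illustrative, not the runtime's):

package main

import "fmt"

// appendVarint encodes v in little-endian base-128: seven bits per byte,
// continuation bit set on every byte except the last.
func appendVarint(buf []byte, v uint64) []byte {
	for ; v >= 0x80; v >>= 7 {
		buf = append(buf, 0x80|byte(v))
	}
	return append(buf, byte(v))
}

// putVarintAt writes v at buf[pos:pos+width] using exactly width bytes,
// padding with continuation bytes, so space can be reserved before the
// value is known (mirroring varintAt's fixed traceBytesPerNumber width).
func putVarintAt(buf []byte, pos, width int, v uint64) {
	for i := 0; i < width; i++ {
		if i < width-1 {
			buf[pos+i] = 0x80 | byte(v)
		} else {
			buf[pos+i] = byte(v)
		}
		v >>= 7
	}
}

func main() {
	fmt.Printf("%x\n", appendVarint(nil, 300)) // ac02
	fixed := make([]byte, 3)
	putVarintAt(fixed, 0, 3, 300)
	fmt.Printf("%x\n", fixed) // ac8200, decodes to the same 300
}
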
diff --git a/src/runtime/trace/annotation.go b/src/runtime/trace/annotation.go
index 9171633..d47cb85 100644
--- a/src/runtime/trace/annotation.go
+++ b/src/runtime/trace/annotation.go
@@ -178,8 +178,7 @@
 // The information is advisory only. The tracing status
 // may have changed by the time this function returns.
 func IsEnabled() bool {
-	enabled := atomic.LoadInt32(&tracing.enabled)
-	return enabled == 1
+	return tracing.enabled.Load()
 }
 
 //
diff --git a/src/runtime/trace/annotation_test.go b/src/runtime/trace/annotation_test.go
index 31fccef..69ea8f2 100644
--- a/src/runtime/trace/annotation_test.go
+++ b/src/runtime/trace/annotation_test.go
@@ -147,7 +147,7 @@
 		pretty := func(data []testData) string {
 			var s strings.Builder
 			for _, d := range data {
-				s.WriteString(fmt.Sprintf("\t%+v\n", d))
+				fmt.Fprintf(&s, "\t%+v\n", d)
 			}
 			return s.String()
 		}
diff --git a/src/runtime/trace/trace.go b/src/runtime/trace/trace.go
index cf2b644..86c97e2 100644
--- a/src/runtime/trace/trace.go
+++ b/src/runtime/trace/trace.go
@@ -134,7 +134,7 @@
 			w.Write(data)
 		}
 	}()
-	atomic.StoreInt32(&tracing.enabled, 1)
+	tracing.enabled.Store(true)
 	return nil
 }
 
@@ -143,12 +143,12 @@
 func Stop() {
 	tracing.Lock()
 	defer tracing.Unlock()
-	atomic.StoreInt32(&tracing.enabled, 0)
+	tracing.enabled.Store(false)
 
 	runtime.StopTrace()
 }
 
 var tracing struct {
-	sync.Mutex       // gate mutators (Start, Stop)
-	enabled    int32 // accessed via atomic
+	sync.Mutex // gate mutators (Start, Stop)
+	enabled    atomic.Bool
 }
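
The trace-package change above swaps the hand-rolled int32 flag plus atomic.LoadInt32/StoreInt32 calls for the typed sync/atomic.Bool added in Go 1.19. A minimal sketch of the same pattern (the flag name is illustrative):

package main

import (
	"fmt"
	"sync/atomic"
)

var enabled atomic.Bool // zero value is false, ready to use

func main() {
	enabled.Store(true)         // replaces atomic.StoreInt32(&flag, 1)
	fmt.Println(enabled.Load()) // replaces atomic.LoadInt32(&flag) == 1
	enabled.Store(false)
	fmt.Println(enabled.Load())
}
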
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 49147ff..37f35d5 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -7,7 +7,6 @@
 import (
 	"internal/bytealg"
 	"internal/goarch"
-	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -54,8 +53,6 @@
 	}
 	level, _, _ := gotraceback()
 
-	var ctxt *funcval // Context pointer for unstarted goroutines. See issue #25897.
-
 	if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp.
 		if gp.syscallsp != 0 {
 			pc0 = gp.syscallpc
@@ -69,7 +66,6 @@
 			if usesLR {
 				lr0 = gp.sched.lr
 			}
-			ctxt = (*funcval)(gp.sched.ctxt)
 		}
 	}
 
@@ -163,7 +159,10 @@
 		if frame.fp == 0 {
 			// Jump over system stack transitions. If we're on g0 and there's a user
 			// goroutine, try to jump. Otherwise this is a regular call.
-			if flags&_TraceJumpStack != 0 && gp == gp.m.g0 && gp.m.curg != nil {
+			// We also defensively check that this won't switch M's on us,
+			// which could happen at critical points in the scheduler.
+			// This ensures gp.m doesn't change from a stack jump.
+			if flags&_TraceJumpStack != 0 && gp == gp.m.g0 && gp.m.curg != nil && gp.m.curg.m == gp.m {
 				switch f.funcID {
 				case funcID_morestack:
 					// morestack does not return normally -- newstack()
@@ -171,20 +170,33 @@
 					// This keeps morestack() from showing up in the backtrace,
 					// but that makes some sense since it'll never be returned
 					// to.
-					frame.pc = gp.m.curg.sched.pc
+					gp = gp.m.curg
+					frame.pc = gp.sched.pc
 					frame.fn = findfunc(frame.pc)
 					f = frame.fn
 					flag = f.flag
-					frame.lr = gp.m.curg.sched.lr
-					frame.sp = gp.m.curg.sched.sp
-					stack = gp.m.curg.stack
-					cgoCtxt = gp.m.curg.cgoCtxt
+					frame.lr = gp.sched.lr
+					frame.sp = gp.sched.sp
+					stack = gp.stack
+					cgoCtxt = gp.cgoCtxt
 				case funcID_systemstack:
 					// systemstack returns normally, so just follow the
 					// stack transition.
-					frame.sp = gp.m.curg.sched.sp
-					stack = gp.m.curg.stack
-					cgoCtxt = gp.m.curg.cgoCtxt
+					if usesLR && funcspdelta(f, frame.pc, &cache) == 0 {
+						// We're at the function prologue, where the stack
+						// switch hasn't happened yet, or at the epilogue,
+						// where we're about to return. Just unwind normally.
+						// Do this only on LR machines because on x86
+						// systemstack has no SP delta (the CALL instruction
+						// opens the frame), so there is no way to check.
+						flag &^= funcFlag_SPWRITE
+						break
+					}
+					gp = gp.m.curg
+					frame.sp = gp.sched.sp
+					stack = gp.stack
+					cgoCtxt = gp.cgoCtxt
 					flag &^= funcFlag_SPWRITE
 				}
 			}
@@ -287,21 +299,7 @@
 			frame.varp -= goarch.PtrSize
 		}
 
-		// Derive size of arguments.
-		// Most functions have a fixed-size argument block,
-		// so we can use metadata about the function f.
-		// Not all, though: there are some variadic functions
-		// in package runtime and reflect, and for those we use call-specific
-		// metadata recorded by f's caller.
-		if callback != nil || printing {
-			frame.argp = frame.fp + sys.MinFrameSize
-			var ok bool
-			frame.arglen, frame.argmap, ok = getArgInfoFast(f, callback != nil)
-			if !ok {
-				frame.arglen, frame.argmap = getArgInfo(&frame, f, callback != nil, ctxt)
-			}
-		}
-		ctxt = nil // ctxt is only needed to get arg maps for the topmost frame
+		frame.argp = frame.fp + sys.MinFrameSize
 
 		// Determine frame's 'continuation PC', where it can continue.
 		// Normally this is the return address on the stack, but if sigpanic
@@ -418,8 +416,9 @@
 
 					// Create a fake _func for the
 					// inlined function.
-					inlFunc.nameoff = inltree[ix].func_
+					inlFunc.nameOff = inltree[ix].nameOff
 					inlFunc.funcID = inltree[ix].funcID
+					inlFunc.startLine = inltree[ix].startLine
 
 					if (flags&_TraceRuntimeFrames) != 0 || showframe(inlFuncInfo, gp, nprint == 0, inlFuncInfo.funcID, lastFuncID) {
 						name := funcname(inlFuncInfo)
@@ -494,7 +493,6 @@
 		frame.lr = 0
 		frame.sp = frame.fp
 		frame.fp = 0
-		frame.argmap = nil
 
 		// On link register architectures, sighandler saves the LR on stack
 		// before faking a call.
@@ -665,74 +663,6 @@
 	}
 }
 
-// reflectMethodValue is a partial duplicate of reflect.makeFuncImpl
-// and reflect.methodValue.
-type reflectMethodValue struct {
-	fn     uintptr
-	stack  *bitvector // ptrmap for both args and results
-	argLen uintptr    // just args
-}
-
-// getArgInfoFast returns the argument frame information for a call to f.
-// It is short and inlineable. However, it does not handle all functions.
-// If ok reports false, you must call getArgInfo instead.
-// TODO(josharian): once we do mid-stack inlining,
-// call getArgInfo directly from getArgInfoFast and stop returning an ok bool.
-func getArgInfoFast(f funcInfo, needArgMap bool) (arglen uintptr, argmap *bitvector, ok bool) {
-	return uintptr(f.args), nil, !(needArgMap && f.args == _ArgsSizeUnknown)
-}
-
-// getArgInfo returns the argument frame information for a call to f
-// with call frame frame.
-//
-// This is used for both actual calls with active stack frames and for
-// deferred calls or goroutines that are not yet executing. If this is an actual
-// call, ctxt must be nil (getArgInfo will retrieve what it needs from
-// the active stack frame). If this is a deferred call or unstarted goroutine,
-// ctxt must be the function object that was deferred or go'd.
-func getArgInfo(frame *stkframe, f funcInfo, needArgMap bool, ctxt *funcval) (arglen uintptr, argmap *bitvector) {
-	arglen = uintptr(f.args)
-	if needArgMap && f.args == _ArgsSizeUnknown {
-		// Extract argument bitmaps for reflect stubs from the calls they made to reflect.
-		switch funcname(f) {
-		case "reflect.makeFuncStub", "reflect.methodValueCall":
-			// These take a *reflect.methodValue as their
-			// context register.
-			var mv *reflectMethodValue
-			var retValid bool
-			if ctxt != nil {
-				// This is not an actual call, but a
-				// deferred call or an unstarted goroutine.
-				// The function value is itself the *reflect.methodValue.
-				mv = (*reflectMethodValue)(unsafe.Pointer(ctxt))
-			} else {
-				// This is a real call that took the
-				// *reflect.methodValue as its context
-				// register and immediately saved it
-				// to 0(SP). Get the methodValue from
-				// 0(SP).
-				arg0 := frame.sp + sys.MinFrameSize
-				mv = *(**reflectMethodValue)(unsafe.Pointer(arg0))
-				// Figure out whether the return values are valid.
-				// Reflect will update this value after it copies
-				// in the return values.
-				retValid = *(*bool)(unsafe.Pointer(arg0 + 4*goarch.PtrSize))
-			}
-			if mv.fn != f.entry() {
-				print("runtime: confused by ", funcname(f), "\n")
-				throw("reflect mismatch")
-			}
-			bv := mv.stack
-			arglen = uintptr(bv.n * goarch.PtrSize)
-			if !retValid {
-				arglen = uintptr(mv.argLen) &^ (goarch.PtrSize - 1)
-			}
-			argmap = bv
-		}
-	}
-	return
-}
-
 // tracebackCgoContext handles tracing back a cgo context value, from
 // the context argument to setCgoTraceback, for the gentraceback
 // function. It returns the new value of n.
@@ -819,10 +749,10 @@
 		// concurrently with a signal handler.
 		// We just have to stop a signal handler from interrupting
 		// in the middle of our copy.
-		atomic.Store(&gp.m.cgoCallersUse, 1)
+		gp.m.cgoCallersUse.Store(1)
 		cgoCallers := *gp.m.cgoCallers
 		gp.m.cgoCallers[0] = 0
-		atomic.Store(&gp.m.cgoCallersUse, 0)
+		gp.m.cgoCallersUse.Store(0)
 
 		printCgoTraceback(&cgoCallers)
 	}
@@ -880,7 +810,7 @@
 	}
 }
 
-// printAncestorTraceback prints the given function info at a given pc
+// printAncestorTracebackFuncInfo prints the given function info at a given pc
 // within an ancestor traceback. The precision of this info is reduced
 // due to only have access to the pcs at the time of the caller
 // goroutine being created.
@@ -890,7 +820,7 @@
 		inltree := (*[1 << 20]inlinedCall)(inldata)
 		ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil)
 		if ix >= 0 {
-			name = funcnameFromNameoff(f, inltree[ix].func_)
+			name = funcnameFromNameOff(f, inltree[ix].nameOff)
 		}
 	}
 	file, line := funcline(f, pc)
@@ -923,8 +853,8 @@
 // showframe reports whether the frame with the given characteristics should
 // be printed during a traceback.
 func showframe(f funcInfo, gp *g, firstFrame bool, funcID, childID funcID) bool {
-	g := getg()
-	if g.m.throwing >= throwTypeRuntime && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) {
+	mp := getg().m
+	if mp.throwing >= throwTypeRuntime && gp != nil && (gp == mp.curg || gp == mp.caughtsig.ptr()) {
 		return true
 	}
 	return showfuncinfo(f, firstFrame, funcID, childID)
@@ -934,7 +864,7 @@
 // be printed during a traceback.
 func showfuncinfo(f funcInfo, firstFrame bool, funcID, childID funcID) bool {
 	// Note that f may be a synthesized funcInfo for an inlined
-	// function, in which case only nameoff and funcID are set.
+	// function, in which case only nameOff and funcID are set.
 
 	level, _, _ := gotraceback()
 	if level > 1 {
@@ -1051,10 +981,10 @@
 		}
 		print("\n")
 		goroutineheader(gp)
-		// Note: gp.m == g.m occurs when tracebackothers is
-		// called from a signal handler initiated during a
-		// systemstack call. The original G is still in the
-		// running state, and we want to print its stack.
+		// Note: gp.m == getg().m occurs when tracebackothers is called
+		// from a signal handler initiated during a systemstack call.
+		// The original G is still in the running state, and we want to
+		// print its stack.
 		if gp.m != getg().m && readgstatus(gp)&^_Gscan == _Grunning {
 			print("\tgoroutine running on other thread; stack unavailable\n")
 			printcreatedby(gp)
@@ -1136,7 +1066,7 @@
 			// always consider it a user goroutine.
 			return false
 		}
-		return !fingRunning
+		return fingStatus.Load()&fingRunningFinalizer == 0
 	}
 	return hasPrefix(funcname(f), "runtime.")
 }
@@ -1352,7 +1282,7 @@
 	data     uintptr
 }
 
-// cgoTraceback prints a traceback of callers.
+// printCgoTraceback prints a traceback of callers.
 func printCgoTraceback(callers *cgoCallers) {
 	if cgoSymbolizer == nil {
 		for _, c := range callers {
@@ -1407,7 +1337,7 @@
 // callCgoSymbolizer calls the cgoSymbolizer function.
 func callCgoSymbolizer(arg *cgoSymbolizerArg) {
 	call := cgocall
-	if panicking > 0 || getg().m.curg != getg() {
+	if panicking.Load() > 0 || getg().m.curg != getg() {
 		// We do not want to call into the scheduler when panicking
 		// or when on the system stack.
 		call = asmcgocall
@@ -1427,7 +1357,7 @@
 		return
 	}
 	call := cgocall
-	if panicking > 0 || getg().m.curg != getg() {
+	if panicking.Load() > 0 || getg().m.curg != getg() {
 		// We do not want to call into the scheduler when panicking
 		// or when on the system stack.
 		call = asmcgocall
diff --git a/src/runtime/traceback_test.go b/src/runtime/traceback_test.go
index e50bd95..97eb921 100644
--- a/src/runtime/traceback_test.go
+++ b/src/runtime/traceback_test.go
@@ -9,7 +9,6 @@
 	"internal/abi"
 	"internal/testenv"
 	"runtime"
-	"strings"
 	"testing"
 )
 
@@ -19,7 +18,7 @@
 	if *flagQuick {
 		t.Skip("-quick")
 	}
-	optimized := !strings.HasSuffix(testenv.Builder(), "-noopt")
+	optimized := !testenv.OptimizationOff()
 	abiSel := func(x, y string) string {
 		// select expected output based on ABI
 		// In noopt build we always spill arguments so the output is the same as stack ABI.
diff --git a/src/runtime/type.go b/src/runtime/type.go
index e8e7819..1c6103e 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -454,7 +454,7 @@
 	}
 }
 
-func (n name) name() (s string) {
+func (n name) name() string {
 	if n.bytes == nil {
 		return ""
 	}
@@ -462,22 +462,16 @@
 	if l == 0 {
 		return ""
 	}
-	hdr := (*stringStruct)(unsafe.Pointer(&s))
-	hdr.str = unsafe.Pointer(n.data(1 + i))
-	hdr.len = l
-	return
+	return unsafe.String(n.data(1+i), l)
 }
 
-func (n name) tag() (s string) {
+func (n name) tag() string {
 	if *n.data(0)&(1<<1) == 0 {
 		return ""
 	}
 	i, l := n.readvarint(1)
 	i2, l2 := n.readvarint(1 + i + l)
-	hdr := (*stringStruct)(unsafe.Pointer(&s))
-	hdr.str = unsafe.Pointer(n.data(1 + i + l + i2))
-	hdr.len = l2
-	return
+	return unsafe.String(n.data(1+i+l+i2), l2)
 }
 
 func (n name) pkgPath() string {
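
name.name and name.tag above now build their results with unsafe.String (new in Go 1.20) instead of filling in a stringStruct header by hand. A minimal sketch of that conversion, assuming as usual that the backing bytes are not modified while the string is in use (variable names are illustrative):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	raw := []byte("hello, runtime")
	// Build a string that aliases raw[0:5] without copying, replacing the
	// old pattern of writing a string header's data pointer and length.
	s := unsafe.String(&raw[0], 5)
	fmt.Println(s) // "hello"
}
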
diff --git a/src/runtime/unsafe.go b/src/runtime/unsafe.go
new file mode 100644
index 0000000..54649e8
--- /dev/null
+++ b/src/runtime/unsafe.go
@@ -0,0 +1,98 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/math"
+	"unsafe"
+)
+
+func unsafestring(ptr unsafe.Pointer, len int) {
+	if len < 0 {
+		panicunsafestringlen()
+	}
+
+	if uintptr(len) > -uintptr(ptr) {
+		if ptr == nil {
+			panicunsafestringnilptr()
+		}
+		panicunsafestringlen()
+	}
+}
+
+// Keep this code in sync with cmd/compile/internal/walk/builtin.go:walkUnsafeString
+func unsafestring64(ptr unsafe.Pointer, len64 int64) {
+	len := int(len64)
+	if int64(len) != len64 {
+		panicunsafestringlen()
+	}
+	unsafestring(ptr, len)
+}
+
+func unsafestringcheckptr(ptr unsafe.Pointer, len64 int64) {
+	unsafestring64(ptr, len64)
+
+	// Check that underlying array doesn't straddle multiple heap objects.
+	// unsafestring64 has already checked for overflow.
+	if checkptrStraddles(ptr, uintptr(len64)) {
+		throw("checkptr: unsafe.String result straddles multiple allocations")
+	}
+}
+
+func panicunsafestringlen() {
+	panic(errorString("unsafe.String: len out of range"))
+}
+
+func panicunsafestringnilptr() {
+	panic(errorString("unsafe.String: ptr is nil and len is not zero"))
+}
+
+// Keep this code in sync with cmd/compile/internal/walk/builtin.go:walkUnsafeSlice
+func unsafeslice(et *_type, ptr unsafe.Pointer, len int) {
+	if len < 0 {
+		panicunsafeslicelen()
+	}
+
+	if et.size == 0 {
+		if ptr == nil && len > 0 {
+			panicunsafeslicenilptr()
+		}
+	}
+
+	mem, overflow := math.MulUintptr(et.size, uintptr(len))
+	if overflow || mem > -uintptr(ptr) {
+		if ptr == nil {
+			panicunsafeslicenilptr()
+		}
+		panicunsafeslicelen()
+	}
+}
+
+// Keep this code in sync with cmd/compile/internal/walk/builtin.go:walkUnsafeSlice
+func unsafeslice64(et *_type, ptr unsafe.Pointer, len64 int64) {
+	len := int(len64)
+	if int64(len) != len64 {
+		panicunsafeslicelen()
+	}
+	unsafeslice(et, ptr, len)
+}
+
+func unsafeslicecheckptr(et *_type, ptr unsafe.Pointer, len64 int64) {
+	unsafeslice64(et, ptr, len64)
+
+	// Check that underlying array doesn't straddle multiple heap objects.
+	// unsafeslice64 has already checked for overflow.
+	if checkptrStraddles(ptr, uintptr(len64)*et.size) {
+		throw("checkptr: unsafe.Slice result straddles multiple allocations")
+	}
+}
+
+func panicunsafeslicelen() {
+	panic(errorString("unsafe.Slice: len out of range"))
+}
+
+func panicunsafeslicenilptr() {
+	panic(errorString("unsafe.Slice: ptr is nil and len is not zero"))
+}
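
unsafestring and unsafeslice above bound the requested span with uintptr(len) > -uintptr(ptr): in unsigned arithmetic, -uintptr(p) wraps to the number of bytes between p and the top of the address space, so the comparison rejects a span that would overflow without ever computing p+len (and a nil ptr, where -uintptr(ptr) is 0, fails for any positive length). A small sketch of that check in isolation (the helper name is illustrative):

package main

import (
	"fmt"
	"unsafe"
)

// spanFits reports whether n bytes starting at p stay within the address
// space. -uintptr(p) wraps to 2^N - uintptr(p), i.e. the bytes remaining
// above p, so the comparison itself can never overflow.
func spanFits(p unsafe.Pointer, n uintptr) bool {
	return n <= -uintptr(p)
}

func main() {
	var x [16]byte
	p := unsafe.Pointer(&x[0])
	fmt.Println(spanFits(p, 16))          // true: well within range
	fmt.Println(spanFits(p, ^uintptr(0))) // false: would wrap past the top
}
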
diff --git a/src/runtime/vdso_freebsd_arm64.go b/src/runtime/vdso_freebsd_arm64.go
index 7d9f62d..37b26d7 100644
--- a/src/runtime/vdso_freebsd_arm64.go
+++ b/src/runtime/vdso_freebsd_arm64.go
@@ -14,7 +14,7 @@
 func (th *vdsoTimehands) getTimecounter() (uint32, bool) {
 	switch th.algo {
 	case _VDSO_TH_ALGO_ARM_GENTIM:
-		return getCntxct(false), true
+		return getCntxct(th.physical != 0), true
 	default:
 		return 0, false
 	}
diff --git a/src/runtime/vdso_freebsd_riscv64.go b/src/runtime/vdso_freebsd_riscv64.go
new file mode 100644
index 0000000..a4fff4b
--- /dev/null
+++ b/src/runtime/vdso_freebsd_riscv64.go
@@ -0,0 +1,21 @@
+// Copyright 2022 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_VDSO_TH_ALGO_RISCV_RDTIME = 1
+)
+
+func getCntxct() uint32
+
+//go:nosplit
+func (th *vdsoTimehands) getTimecounter() (uint32, bool) {
+	switch th.algo {
+	case _VDSO_TH_ALGO_RISCV_RDTIME:
+		return getCntxct(), true
+	default:
+		return 0, false
+	}
+}
diff --git a/src/runtime/vdso_freebsd_x86.go b/src/runtime/vdso_freebsd_x86.go
index 5324a3d..66d1c65 100644
--- a/src/runtime/vdso_freebsd_x86.go
+++ b/src/runtime/vdso_freebsd_x86.go
@@ -34,10 +34,8 @@
 	return uint32(tsc)
 }
 
-//go:systemstack
+//go:nosplit
 func (th *vdsoTimehands) getHPETTimecounter() (uint32, bool) {
-	const digits = "0123456789"
-
 	idx := int(th.x86_hpet_idx)
 	if idx >= len(hpetDevMap) {
 		return 0, false
@@ -45,25 +43,7 @@
 
 	p := atomic.Loaduintptr(&hpetDevMap[idx])
 	if p == 0 {
-		var devPath [len(hpetDevPath)]byte
-		copy(devPath[:], hpetDevPath)
-		devPath[9] = digits[idx]
-
-		fd := open(&devPath[0], 0 /* O_RDONLY */, 0)
-		if fd < 0 {
-			atomic.Casuintptr(&hpetDevMap[idx], 0, ^uintptr(0))
-			return 0, false
-		}
-
-		addr, mmapErr := mmap(nil, physPageSize, _PROT_READ, _MAP_SHARED, fd, 0)
-		closefd(fd)
-		newP := uintptr(addr)
-		if mmapErr != 0 {
-			newP = ^uintptr(0)
-		}
-		if !atomic.Casuintptr(&hpetDevMap[idx], 0, newP) && mmapErr == 0 {
-			munmap(addr, physPageSize)
-		}
+		systemstack(func() { initHPETTimecounter(idx) })
 		p = atomic.Loaduintptr(&hpetDevMap[idx])
 	}
 	if p == ^uintptr(0) {
@@ -72,20 +52,38 @@
 	return *(*uint32)(unsafe.Pointer(p + _HPET_MAIN_COUNTER)), true
 }
 
+//go:systemstack
+func initHPETTimecounter(idx int) {
+	const digits = "0123456789"
+
+	var devPath [len(hpetDevPath)]byte
+	copy(devPath[:], hpetDevPath)
+	devPath[9] = digits[idx]
+
+	fd := open(&devPath[0], 0 /* O_RDONLY */ |_O_CLOEXEC, 0)
+	if fd < 0 {
+		atomic.Casuintptr(&hpetDevMap[idx], 0, ^uintptr(0))
+		return
+	}
+
+	addr, mmapErr := mmap(nil, physPageSize, _PROT_READ, _MAP_SHARED, fd, 0)
+	closefd(fd)
+	newP := uintptr(addr)
+	if mmapErr != 0 {
+		newP = ^uintptr(0)
+	}
+	if !atomic.Casuintptr(&hpetDevMap[idx], 0, newP) && mmapErr == 0 {
+		munmap(addr, physPageSize)
+	}
+}
+
 //go:nosplit
 func (th *vdsoTimehands) getTimecounter() (uint32, bool) {
 	switch th.algo {
 	case _VDSO_TH_ALGO_X86_TSC:
 		return th.getTSCTimecounter(), true
 	case _VDSO_TH_ALGO_X86_HPET:
-		var (
-			tc uint32
-			ok bool
-		)
-		systemstack(func() {
-			tc, ok = th.getHPETTimecounter()
-		})
-		return tc, ok
+		return th.getHPETTimecounter()
 	default:
 		return 0, false
 	}
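
initHPETTimecounter above publishes the mapped HPET page with a compare-and-swap, using ^uintptr(0) as a "tried and failed" sentinel and unmapping the loser's page if two threads race. A minimal sketch of that publish-once pattern using sync/atomic (the names and the init callback are illustrative):

package main

import (
	"fmt"
	"sync/atomic"
)

// failed marks a slot whose one-time initialization was attempted and
// did not succeed, so later callers fail fast instead of retrying.
const failed = ^uintptr(0)

var slot atomic.Uintptr // 0 = not initialized yet

// get returns the cached value, running init at most once per outcome,
// mirroring the hpetDevMap publish pattern.
func get(init func() (uintptr, bool)) (uintptr, bool) {
	v := slot.Load()
	if v == 0 {
		nv, ok := init()
		if !ok {
			nv = failed
		}
		if !slot.CompareAndSwap(0, nv) {
			// Lost the race: keep the winner's value. (The real code
			// also munmaps the loser's freshly created mapping here.)
			nv = slot.Load()
		}
		v = nv
	}
	if v == failed {
		return 0, false
	}
	return v, true
}

func main() {
	v, ok := get(func() (uintptr, bool) { return 0x1000, true })
	fmt.Println(v, ok) // 4096 true
}
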
diff --git a/src/runtime/wincallback.go b/src/runtime/wincallback.go
index 442a984..9ec2027 100644
--- a/src/runtime/wincallback.go
+++ b/src/runtime/wincallback.go
@@ -62,7 +62,7 @@
 TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
 `)
 	for i := 0; i < maxCallback; i++ {
-		buf.WriteString(fmt.Sprintf("\tMOVW\t$%d, R12\n", i))
+		fmt.Fprintf(&buf, "\tMOVW\t$%d, R12\n", i)
 		buf.WriteString("\tB\truntime·callbackasm1(SB)\n")
 	}
 
@@ -90,7 +90,7 @@
 TEXT runtime·callbackasm(SB),NOSPLIT|NOFRAME,$0
 `)
 	for i := 0; i < maxCallback; i++ {
-		buf.WriteString(fmt.Sprintf("\tMOVD\t$%d, R12\n", i))
+		fmt.Fprintf(&buf, "\tMOVD\t$%d, R12\n", i)
 		buf.WriteString("\tB\truntime·callbackasm1(SB)\n")
 	}
 
@@ -104,12 +104,12 @@
 func gengo() {
 	var buf bytes.Buffer
 
-	buf.WriteString(fmt.Sprintf(`// Code generated by wincallback.go using 'go generate'. DO NOT EDIT.
+	fmt.Fprintf(&buf, `// Code generated by wincallback.go using 'go generate'. DO NOT EDIT.
 
 package runtime
 
 const cb_max = %d // maximum number of windows callbacks allowed
-`, maxCallback))
+`, maxCallback)
 	err := os.WriteFile("zcallback_windows.go", buf.Bytes(), 0666)
 	if err != nil {
 		fmt.Fprintf(os.Stderr, "wincallback: %s\n", err)