Update linux go to 1.15beta1

From https://ci.android.com/builds/submitted/6626886/linux/latest/go.zip

Test: m blueprint_tools
Change-Id: Ib0d1176e769611b25554177aef209bc7e6456694
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index 732d32b..0af48ab 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -34,17 +34,6 @@
 	alg_max
 )
 
-// typeAlg is also copied/used in reflect/type.go.
-// keep them in sync.
-type typeAlg struct {
-	// function for hashing objects of this type
-	// (ptr to object, seed) -> hash
-	hash func(unsafe.Pointer, uintptr) uintptr
-	// function for comparing objects of this type
-	// (ptr to object A, ptr to object B) -> ==?
-	equal func(unsafe.Pointer, unsafe.Pointer) bool
-}
-
 func memhash0(p unsafe.Pointer, h uintptr) uintptr {
 	return h
 }
@@ -68,34 +57,20 @@
 	return memhash(p, h, size)
 }
 
-var algarray = [alg_max]typeAlg{
-	alg_NOEQ:     {nil, nil},
-	alg_MEM0:     {memhash0, memequal0},
-	alg_MEM8:     {memhash8, memequal8},
-	alg_MEM16:    {memhash16, memequal16},
-	alg_MEM32:    {memhash32, memequal32},
-	alg_MEM64:    {memhash64, memequal64},
-	alg_MEM128:   {memhash128, memequal128},
-	alg_STRING:   {strhash, strequal},
-	alg_INTER:    {interhash, interequal},
-	alg_NILINTER: {nilinterhash, nilinterequal},
-	alg_FLOAT32:  {f32hash, f32equal},
-	alg_FLOAT64:  {f64hash, f64equal},
-	alg_CPLX64:   {c64hash, c64equal},
-	alg_CPLX128:  {c128hash, c128equal},
-}
-
+// runtime variable to check if the processor we're running on
+// actually supports the instructions used by the AES-based
+// hash implementation.
 var useAeshash bool
 
 // in asm_*.s
-func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
-func aeshash32(p unsafe.Pointer, h uintptr) uintptr
-func aeshash64(p unsafe.Pointer, h uintptr) uintptr
-func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
+func memhash(p unsafe.Pointer, h, s uintptr) uintptr
+func memhash32(p unsafe.Pointer, h uintptr) uintptr
+func memhash64(p unsafe.Pointer, h uintptr) uintptr
+func strhash(p unsafe.Pointer, h uintptr) uintptr
 
-func strhash(a unsafe.Pointer, h uintptr) uintptr {
+func strhashFallback(a unsafe.Pointer, h uintptr) uintptr {
 	x := (*stringStruct)(a)
-	return memhash(x.str, h, uintptr(x.len))
+	return memhashFallback(x.str, h, uintptr(x.len))
 }
 
 // NOTE: Because NaN != NaN, a map can contain any
@@ -144,14 +119,17 @@
 		return h
 	}
 	t := tab._type
-	fn := t.alg.hash
-	if fn == nil {
+	if t.equal == nil {
+		// Check hashability here. We could do this check inside
+		// typehash, but we want to report the topmost type in
+		// the error text (e.g. in a struct with a field of slice type
+		// we want to report the struct, not the slice).
 		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
-		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
+		return c1 * typehash(t, unsafe.Pointer(&a.data), h^c0)
 	} else {
-		return c1 * fn(a.data, h^c0)
+		return c1 * typehash(t, a.data, h^c0)
 	}
 }
 
@@ -161,17 +139,100 @@
 	if t == nil {
 		return h
 	}
-	fn := t.alg.hash
-	if fn == nil {
+	if t.equal == nil {
+		// See comment in interhash above.
 		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
-		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
+		return c1 * typehash(t, unsafe.Pointer(&a.data), h^c0)
 	} else {
-		return c1 * fn(a.data, h^c0)
+		return c1 * typehash(t, a.data, h^c0)
 	}
 }
 
+// typehash computes the hash of the object of type t at address p.
+// h is the seed.
+// This function is seldom used. Most maps use for hashing either
+// fixed functions (e.g. f32hash) or compiler-generated functions
+// (e.g. for a type like struct { x, y string }). This implementation
+// is slower but more general and is used for hashing interface types
+// (called from interhash or nilinterhash, above) or for hashing in
+// maps generated by reflect.MapOf (reflect_typehash, below).
+// Note: this function must match the compiler generated
+// functions exactly. See issue 37716.
+func typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr {
+	if t.tflag&tflagRegularMemory != 0 {
+		// Handle ptr sizes specially, see issue 37086.
+		switch t.size {
+		case 4:
+			return memhash32(p, h)
+		case 8:
+			return memhash64(p, h)
+		default:
+			return memhash(p, h, t.size)
+		}
+	}
+	switch t.kind & kindMask {
+	case kindFloat32:
+		return f32hash(p, h)
+	case kindFloat64:
+		return f64hash(p, h)
+	case kindComplex64:
+		return c64hash(p, h)
+	case kindComplex128:
+		return c128hash(p, h)
+	case kindString:
+		return strhash(p, h)
+	case kindInterface:
+		i := (*interfacetype)(unsafe.Pointer(t))
+		if len(i.mhdr) == 0 {
+			return nilinterhash(p, h)
+		}
+		return interhash(p, h)
+	case kindArray:
+		a := (*arraytype)(unsafe.Pointer(t))
+		for i := uintptr(0); i < a.len; i++ {
+			h = typehash(a.elem, add(p, i*a.elem.size), h)
+		}
+		return h
+	case kindStruct:
+		s := (*structtype)(unsafe.Pointer(t))
+		memStart := uintptr(0)
+		memEnd := uintptr(0)
+		for _, f := range s.fields {
+			if memEnd > memStart && (f.name.isBlank() || f.offset() != memEnd || f.typ.tflag&tflagRegularMemory == 0) {
+				// flush any pending regular memory hashing
+				h = memhash(add(p, memStart), h, memEnd-memStart)
+				memStart = memEnd
+			}
+			if f.name.isBlank() {
+				continue
+			}
+			if f.typ.tflag&tflagRegularMemory == 0 {
+				h = typehash(f.typ, add(p, f.offset()), h)
+				continue
+			}
+			if memStart == memEnd {
+				memStart = f.offset()
+			}
+			memEnd = f.offset() + f.typ.size
+		}
+		if memEnd > memStart {
+			h = memhash(add(p, memStart), h, memEnd-memStart)
+		}
+		return h
+	default:
+		// Should never happen, as typehash should only be called
+		// with comparable types.
+		panic(errorString("hash of unhashable type " + t.string()))
+	}
+}
+
+//go:linkname reflect_typehash reflect.typehash
+func reflect_typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr {
+	return typehash(t, p, h)
+}
+
 func memequal0(p, q unsafe.Pointer) bool {
 	return true
 }
@@ -219,7 +280,7 @@
 	if t == nil {
 		return true
 	}
-	eq := t.alg.equal
+	eq := t.equal
 	if eq == nil {
 		panic(errorString("comparing uncomparable type " + t.string()))
 	}
@@ -236,7 +297,7 @@
 		return true
 	}
 	t := tab._type
-	eq := t.alg.equal
+	eq := t.equal
 	if eq == nil {
 		panic(errorString("comparing uncomparable type " + t.string()))
 	}
@@ -249,7 +310,7 @@
 
 // Testing adapters for hash quality tests (see hash_test.go)
 func stringHash(s string, seed uintptr) uintptr {
-	return algarray[alg_STRING].hash(noescape(unsafe.Pointer(&s)), seed)
+	return strhash(noescape(unsafe.Pointer(&s)), seed)
 }
 
 func bytesHash(b []byte, seed uintptr) uintptr {
@@ -258,21 +319,21 @@
 }
 
 func int32Hash(i uint32, seed uintptr) uintptr {
-	return algarray[alg_MEM32].hash(noescape(unsafe.Pointer(&i)), seed)
+	return memhash32(noescape(unsafe.Pointer(&i)), seed)
 }
 
 func int64Hash(i uint64, seed uintptr) uintptr {
-	return algarray[alg_MEM64].hash(noescape(unsafe.Pointer(&i)), seed)
+	return memhash64(noescape(unsafe.Pointer(&i)), seed)
 }
 
 func efaceHash(i interface{}, seed uintptr) uintptr {
-	return algarray[alg_NILINTER].hash(noescape(unsafe.Pointer(&i)), seed)
+	return nilinterhash(noescape(unsafe.Pointer(&i)), seed)
 }
 
 func ifaceHash(i interface {
 	F()
 }, seed uintptr) uintptr {
-	return algarray[alg_INTER].hash(noescape(unsafe.Pointer(&i)), seed)
+	return interhash(noescape(unsafe.Pointer(&i)), seed)
 }
 
 const hashRandomBytes = sys.PtrSize / 4 * 64
@@ -286,7 +347,6 @@
 func alginit() {
 	// Install AES hash algorithms if the instructions needed are present.
 	if (GOARCH == "386" || GOARCH == "amd64") &&
-		GOOS != "nacl" &&
 		cpu.X86.HasAES && // AESENC
 		cpu.X86.HasSSSE3 && // PSHUFB
 		cpu.X86.HasSSE41 { // PINSR{D,Q}
@@ -305,19 +365,12 @@
 }
 
 func initAlgAES() {
-	if GOOS == "aix" {
-		// runtime.algarray is immutable on AIX: see cmd/link/internal/ld/xcoff.go
-		return
-	}
 	useAeshash = true
-	algarray[alg_MEM32].hash = aeshash32
-	algarray[alg_MEM64].hash = aeshash64
-	algarray[alg_STRING].hash = aeshashstr
 	// Initialize with random data so hash collisions will be hard to engineer.
 	getRandomData(aeskeysched[:])
 }
 
-// Note: These routines perform the read with an native endianness.
+// Note: These routines perform the read with a native endianness.
 func readUnaligned32(p unsafe.Pointer) uint32 {
 	q := (*[4]byte)(p)
 	if sys.BigEndian {
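
The typehash function added above hashes by dispatching on the type's kind: regular-memory types hash their raw bytes, floats/complex/strings use the fixed helpers, and interfaces, arrays, and structs recurse. As a rough user-level illustration of the same dispatch shape (this sketch uses the public reflect and hash/maphash packages and a hypothetical sketchHash name; it is not the runtime's code):

package main

import (
	"encoding/binary"
	"fmt"
	"hash/maphash"
	"reflect"
)

// sketchHash mixes a value into h the way typehash walks a type:
// fixed-size kinds hash their raw bytes, strings hash their contents,
// and composite kinds recurse element by element / field by field.
func sketchHash(v reflect.Value, h *maphash.Hash) {
	switch v.Kind() {
	case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
		var buf [8]byte
		binary.LittleEndian.PutUint64(buf[:], uint64(v.Int()))
		h.Write(buf[:])
	case reflect.String:
		h.WriteString(v.String())
	case reflect.Array:
		for i := 0; i < v.Len(); i++ {
			sketchHash(v.Index(i), h)
		}
	case reflect.Struct:
		for i := 0; i < v.NumField(); i++ {
			sketchHash(v.Field(i), h)
		}
	default:
		// Mirrors the runtime's panic for unhashable kinds.
		panic("hash of unhashable kind " + v.Kind().String())
	}
}

func main() {
	var h maphash.Hash
	v := struct {
		X, Y string
		N    [2]int
	}{"a", "b", [2]int{1, 2}}
	sketchHash(reflect.ValueOf(v), &h)
	fmt.Printf("%#x\n", h.Sum64())
}

Unlike this sketch, the real kindStruct case coalesces consecutive "regular memory" fields into single memhash calls and skips blank fields, which is what the memStart/memEnd bookkeeping implements.
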
diff --git a/src/runtime/asm.s b/src/runtime/asm.s
index 6b209b2..95a3424 100644
--- a/src/runtime/asm.s
+++ b/src/runtime/asm.s
@@ -12,9 +12,6 @@
 DATA runtime·no_pointers_stackmap+0x04(SB)/4, $0
 GLOBL runtime·no_pointers_stackmap(SB),RODATA, $8
 
-GLOBL runtime·mheap_(SB), NOPTR, $0
-GLOBL runtime·memstats(SB), NOPTR, $0
-
 // NaCl requires that these skips be verifiable machine code.
 #ifdef GOARCH_amd64
 #define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
@@ -22,9 +19,6 @@
 #ifdef GOARCH_386
 #define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
 #endif
-#ifdef GOARCH_amd64p32
-#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
-#endif
 #ifdef GOARCH_wasm
 #define SKIP4 UNDEF; UNDEF; UNDEF; UNDEF
 #endif
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index a01841d..11863fb 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -109,9 +109,6 @@
 	MOVL	SP, (g_stack+stack_hi)(BP)
 
 	// find out information about the processor we're on
-#ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
-	JMP 	has_cpuid
-#else
 	// first see if CPUID instruction is supported.
 	PUSHFL
 	PUSHFL
@@ -123,7 +120,6 @@
 	POPFL	// restore EFLAGS
 	TESTL	$(1<<21), AX
 	JNE 	has_cpuid
-#endif
 
 bad_proc: // show that the program requires MMX.
 	MOVL	$2, 0(SP)
@@ -203,10 +199,6 @@
 	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
 	JMP	ok
 #endif
-#ifdef GOOS_darwin
-	// skip runtime·ldt0setup(SB) on Darwin
-	JMP	ok
-#endif
 
 	// set up %gs
 	CALL	ldt0setup<>(SB)
@@ -911,18 +903,26 @@
 	RET
 
 // hash function using AES hardware instructions
-TEXT runtime·aeshash(SB),NOSPLIT,$0-16
+TEXT runtime·memhash(SB),NOSPLIT,$0-16
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVL	p+0(FP), AX	// ptr to data
 	MOVL	s+8(FP), BX	// size
 	LEAL	ret+12(FP), DX
 	JMP	aeshashbody<>(SB)
+noaes:
+	JMP	runtime·memhashFallback(SB)
 
-TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
+TEXT runtime·strhash(SB),NOSPLIT,$0-12
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVL	p+0(FP), AX	// ptr to string object
 	MOVL	4(AX), BX	// length of string
 	MOVL	(AX), AX	// string data
 	LEAL	ret+8(FP), DX
 	JMP	aeshashbody<>(SB)
+noaes:
+	JMP	runtime·strhashFallback(SB)
 
 // AX: data
 // BX: length
@@ -1108,7 +1108,9 @@
 	MOVL	X4, (DX)
 	RET
 
-TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
+TEXT runtime·memhash32(SB),NOSPLIT,$0-12
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVL	p+0(FP), AX	// ptr to data
 	MOVL	h+4(FP), X0	// seed
 	PINSRD	$1, (AX), X0	// data
@@ -1117,8 +1119,12 @@
 	AESENC	runtime·aeskeysched+32(SB), X0
 	MOVL	X0, ret+8(FP)
 	RET
+noaes:
+	JMP	runtime·memhash32Fallback(SB)
 
-TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
+TEXT runtime·memhash64(SB),NOSPLIT,$0-12
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVL	p+0(FP), AX	// ptr to data
 	MOVQ	(AX), X0	// data
 	PINSRD	$2, h+4(FP), X0	// seed
@@ -1127,6 +1133,8 @@
 	AESENC	runtime·aeskeysched+32(SB), X0
 	MOVL	X0, ret+8(FP)
 	RET
+noaes:
+	JMP	runtime·memhash64Fallback(SB)
 
 // simple mask to get rid of data in the high part of the register.
 DATA masks<>+0x00(SB)/4, $0x00000000
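
The renamed memhash/strhash/memhash32/memhash64 entry points above all follow the same pattern: test runtime·useAeshash, then either run the AES body or tail-jump to the portable Go fallback. A minimal Go sketch of that "feature flag, else fallback" shape (hasAES, hashAES, and hashFallback are hypothetical stand-ins, not runtime functions):

package main

import "fmt"

// hasAES plays the role of runtime·useAeshash; in the runtime it is set
// once in alginit from internal/cpu feature bits (HasAES, HasSSSE3, HasSSE41).
var hasAES = detectAES()

func detectAES() bool {
	return false // assume no AES support for this sketch
}

func hashAES(p []byte, seed uint64) uint64 {
	// Placeholder for the hardware-accelerated path (aeshashbody above).
	return seed ^ 0xa5a5a5a5
}

func hashFallback(p []byte, seed uint64) uint64 {
	// Portable path, analogous to memhashFallback: simple FNV-style mixing.
	h := seed
	for _, b := range p {
		h = h*1099511628211 + uint64(b)
	}
	return h
}

func memhash(p []byte, seed uint64) uint64 {
	if hasAES {
		return hashAES(p, seed)
	}
	return hashFallback(p, seed)
}

func main() {
	fmt.Println(memhash([]byte("hello"), 42))
}

In the patch itself the choice is made once, in alginit, and each assembly entry point then only reads the useAeshash byte before branching.
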
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index fd3a9c3..fa25c55 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -510,7 +510,8 @@
 	/* call function */			\
 	MOVQ	f+8(FP), DX;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
-	CALL	(DX);				\
+	MOVQ	(DX), AX;			\
+	CALL	AX;				\
 	/* copy return values back */		\
 	MOVQ	argtype+0(FP), DX;		\
 	MOVQ	argptr+16(FP), DI;		\
@@ -885,21 +886,29 @@
 	MOVQ	AX, ret+0(FP)
 	RET
 
-// func aeshash(p unsafe.Pointer, h, s uintptr) uintptr
+// func memhash(p unsafe.Pointer, h, s uintptr) uintptr
 // hash function using AES hardware instructions
-TEXT runtime·aeshash(SB),NOSPLIT,$0-32
+TEXT runtime·memhash(SB),NOSPLIT,$0-32
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	s+16(FP), CX	// size
 	LEAQ	ret+24(FP), DX
 	JMP	aeshashbody<>(SB)
+noaes:
+	JMP	runtime·memhashFallback(SB)
 
-// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshashstr(SB),NOSPLIT,$0-24
+// func strhash(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·strhash(SB),NOSPLIT,$0-24
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVQ	p+0(FP), AX	// ptr to string struct
 	MOVQ	8(AX), CX	// length of string
 	MOVQ	(AX), AX	// string data
 	LEAQ	ret+16(FP), DX
 	JMP	aeshashbody<>(SB)
+noaes:
+	JMP	runtime·strhashFallback(SB)
 
 // AX: data
 // CX: length
@@ -1232,8 +1241,10 @@
 	MOVQ	X8, (DX)
 	RET
 
-// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
+// func memhash32(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·memhash32(SB),NOSPLIT,$0-24
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
 	PINSRD	$2, (AX), X0	// data
@@ -1242,9 +1253,13 @@
 	AESENC	runtime·aeskeysched+32(SB), X0
 	MOVQ	X0, ret+16(FP)
 	RET
+noaes:
+	JMP	runtime·memhash32Fallback(SB)
 
-// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshash64(SB),NOSPLIT,$0-24
+// func memhash64(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·memhash64(SB),NOSPLIT,$0-24
+	CMPB	runtime·useAeshash(SB), $0
+	JEQ	noaes
 	MOVQ	p+0(FP), AX	// ptr to data
 	MOVQ	h+8(FP), X0	// seed
 	PINSRQ	$1, (AX), X0	// data
@@ -1253,6 +1268,8 @@
 	AESENC	runtime·aeskeysched+32(SB), X0
 	MOVQ	X0, ret+16(FP)
 	RET
+noaes:
+	JMP	runtime·memhash64Fallback(SB)
 
 // simple mask to get rid of data in the high part of the register.
 DATA masks<>+0x00(SB)/8, $0x0000000000000000
@@ -1458,6 +1475,55 @@
 	MOVQ	96(SP), R15
 	JMP	ret
 
+// gcWriteBarrierCX is gcWriteBarrier, but with args in DI and CX.
+TEXT runtime·gcWriteBarrierCX(SB),NOSPLIT,$0
+	XCHGQ CX, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ CX, AX
+	RET
+
+// gcWriteBarrierDX is gcWriteBarrier, but with args in DI and DX.
+TEXT runtime·gcWriteBarrierDX(SB),NOSPLIT,$0
+	XCHGQ DX, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ DX, AX
+	RET
+
+// gcWriteBarrierBX is gcWriteBarrier, but with args in DI and BX.
+TEXT runtime·gcWriteBarrierBX(SB),NOSPLIT,$0
+	XCHGQ BX, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ BX, AX
+	RET
+
+// gcWriteBarrierBP is gcWriteBarrier, but with args in DI and BP.
+TEXT runtime·gcWriteBarrierBP(SB),NOSPLIT,$0
+	XCHGQ BP, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ BP, AX
+	RET
+
+// gcWriteBarrierSI is gcWriteBarrier, but with args in DI and SI.
+TEXT runtime·gcWriteBarrierSI(SB),NOSPLIT,$0
+	XCHGQ SI, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ SI, AX
+	RET
+
+// gcWriteBarrierR8 is gcWriteBarrier, but with args in DI and R8.
+TEXT runtime·gcWriteBarrierR8(SB),NOSPLIT,$0
+	XCHGQ R8, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ R8, AX
+	RET
+
+// gcWriteBarrierR9 is gcWriteBarrier, but with args in DI and R9.
+TEXT runtime·gcWriteBarrierR9(SB),NOSPLIT,$0
+	XCHGQ R9, AX
+	CALL runtime·gcWriteBarrier(SB)
+	XCHGQ R9, AX
+	RET
+
 DATA	debugCallFrameTooLarge<>+0x00(SB)/20, $"call frame too large"
 GLOBL	debugCallFrameTooLarge<>(SB), RODATA, $20	// Size duplicated below
 
@@ -1494,10 +1560,8 @@
 // a stack pointer to an escaping argument. debugCallV1 cannot check
 // this invariant.
 TEXT runtime·debugCallV1(SB),NOSPLIT,$152-0
-	// Save all registers that may contain pointers in GC register
-	// map order (see ssa.registersAMD64). This makes it possible
-	// to copy the stack while updating pointers currently held in
-	// registers, and for the GC to find roots in registers.
+	// Save all registers that may contain pointers so they can be
+	// conservatively scanned.
 	//
 	// We can't do anything that might clobber any of these
 	// registers before this.
@@ -1727,3 +1791,34 @@
 DATA runtime·tls_g+0(SB)/8, $16
 GLOBL runtime·tls_g+0(SB), NOPTR, $8
 #endif
+
+// The compiler and assembler's -spectre=ret mode rewrites
+// all indirect CALL AX / JMP AX instructions to be
+// CALL retpolineAX / JMP retpolineAX.
+// See https://support.google.com/faqs/answer/7625886.
+#define RETPOLINE(reg) \
+	/*   CALL setup */     BYTE $0xE8; BYTE $(2+2); BYTE $0; BYTE $0; BYTE $0;	\
+	/* nospec: */									\
+	/*   PAUSE */           BYTE $0xF3; BYTE $0x90;					\
+	/*   JMP nospec */      BYTE $0xEB; BYTE $-(2+2);				\
+	/* setup: */									\
+	/*   MOVQ AX, 0(SP) */  BYTE $0x48|((reg&8)>>1); BYTE $0x89;			\
+	                        BYTE $0x04|((reg&7)<<3); BYTE $0x24;			\
+	/*   RET */             BYTE $0xC3
+
+TEXT runtime·retpolineAX(SB),NOSPLIT,$0; RETPOLINE(0)
+TEXT runtime·retpolineCX(SB),NOSPLIT,$0; RETPOLINE(1)
+TEXT runtime·retpolineDX(SB),NOSPLIT,$0; RETPOLINE(2)
+TEXT runtime·retpolineBX(SB),NOSPLIT,$0; RETPOLINE(3)
+/* SP is 4, can't happen / magic encodings */
+TEXT runtime·retpolineBP(SB),NOSPLIT,$0; RETPOLINE(5)
+TEXT runtime·retpolineSI(SB),NOSPLIT,$0; RETPOLINE(6)
+TEXT runtime·retpolineDI(SB),NOSPLIT,$0; RETPOLINE(7)
+TEXT runtime·retpolineR8(SB),NOSPLIT,$0; RETPOLINE(8)
+TEXT runtime·retpolineR9(SB),NOSPLIT,$0; RETPOLINE(9)
+TEXT runtime·retpolineR10(SB),NOSPLIT,$0; RETPOLINE(10)
+TEXT runtime·retpolineR11(SB),NOSPLIT,$0; RETPOLINE(11)
+TEXT runtime·retpolineR12(SB),NOSPLIT,$0; RETPOLINE(12)
+TEXT runtime·retpolineR13(SB),NOSPLIT,$0; RETPOLINE(13)
+TEXT runtime·retpolineR14(SB),NOSPLIT,$0; RETPOLINE(14)
+TEXT runtime·retpolineR15(SB),NOSPLIT,$0; RETPOLINE(15)
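
The retpoline thunks above exist so that, when code is built with the -spectre=ret compiler/assembler mode, every register-indirect CALL or JMP can be redirected to a speculation-safe trampoline; presumably this is also why the CALLFN macro earlier in this file now loads the function pointer into AX and does CALL AX instead of the memory-indirect CALL (DX). A hedged illustration of the kind of call site this affects, written as ordinary Go (nothing here is runtime API):

package main

import (
	"fmt"
	"os"
)

func add(a, b int) int { return a + b }
func sub(a, b int) int { return a - b }

func main() {
	// Pick the function at run time so the call below stays indirect.
	op := add
	if len(os.Args) > 1 {
		op = sub
	}
	// Under -spectre=ret the compiler emits this indirect call through a
	// register and rewrites it to go via a retpoline thunk such as
	// runtime·retpolineAX.
	fmt.Println(op(40, 2))
}

Assuming the flags keep their documented spelling, such a build is typically invoked with something like: go build -gcflags=-spectre=ret -asmflags=-spectre=ret
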
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
deleted file mode 100644
index 48f3711..0000000
--- a/src/runtime/asm_amd64p32.s
+++ /dev/null
@@ -1,763 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "funcdata.h"
-#include "textflag.h"
-
-TEXT runtime·rt0_go(SB),NOSPLIT,$0
-	// copy arguments forward on an even stack
-	MOVL	SP, CX
-	MOVL	8(CX), AX	// argc
-	MOVL	12(CX), BX	// argv
-	SUBL	$128, CX		// plenty of scratch
-	ANDL	$~15, CX
-	MOVL	CX, SP
-
-	MOVL	AX, 16(SP)
-	MOVL	BX, 24(SP)
-
-	// create istack out of the given (operating system) stack.
-	MOVL	$runtime·g0(SB), DI
-	LEAL	(-64*1024+104)(SP), BX
-	MOVL	BX, g_stackguard0(DI)
-	MOVL	BX, g_stackguard1(DI)
-	MOVL	BX, (g_stack+stack_lo)(DI)
-	MOVL	SP, (g_stack+stack_hi)(DI)
-
-	// find out information about the processor we're on
-	MOVL	$0, AX
-	CPUID
-	CMPL	AX, $0
-	JE	nocpuinfo
-
-	CMPL	BX, $0x756E6547  // "Genu"
-	JNE	notintel
-	CMPL	DX, $0x49656E69  // "ineI"
-	JNE	notintel
-	CMPL	CX, $0x6C65746E  // "ntel"
-	JNE	notintel
-	MOVB	$1, runtime·isIntel(SB)
-notintel:
-
-	// Load EAX=1 cpuid flags
-	MOVL	$1, AX
-	CPUID
-	MOVL	AX, runtime·processorVersionInfo(SB)
-
-nocpuinfo:
-	LEAL	runtime·m0+m_tls(SB), DI
-	CALL	runtime·settls(SB)
-
-	// store through it, to make sure it works
-	get_tls(BX)
-	MOVQ	$0x123, g(BX)
-	MOVQ	runtime·m0+m_tls(SB), AX
-	CMPQ	AX, $0x123
-	JEQ 2(PC)
-	CALL	runtime·abort(SB)
-ok:
-	// set the per-goroutine and per-mach "registers"
-	get_tls(BX)
-	LEAL	runtime·g0(SB), CX
-	MOVL	CX, g(BX)
-	LEAL	runtime·m0(SB), AX
-
-	// save m->g0 = g0
-	MOVL	CX, m_g0(AX)
-	// save m0 to g0->m
-	MOVL	AX, g_m(CX)
-
-	CLD				// convention is D is always left cleared
-	CALL	runtime·check(SB)
-
-	MOVL	16(SP), AX		// copy argc
-	MOVL	AX, 0(SP)
-	MOVL	24(SP), AX		// copy argv
-	MOVL	AX, 4(SP)
-	CALL	runtime·args(SB)
-	CALL	runtime·osinit(SB)
-	CALL	runtime·schedinit(SB)
-
-	// create a new goroutine to start program
-	MOVL	$runtime·mainPC(SB), AX	// entry
-	MOVL	$0, 0(SP)
-	MOVL	AX, 4(SP)
-	CALL	runtime·newproc(SB)
-
-	// start this M
-	CALL	runtime·mstart(SB)
-
-	MOVL	$0xf1, 0xf1  // crash
-	RET
-
-DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
-GLOBL	runtime·mainPC(SB),RODATA,$4
-
-TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
-	INT $3
-	RET
-
-TEXT runtime·asminit(SB),NOSPLIT,$0-0
-	// No per-thread init.
-	RET
-
-/*
- *  go-routine
- */
-
-// void gosave(Gobuf*)
-// save state in Gobuf; setjmp
-TEXT runtime·gosave(SB), NOSPLIT, $0-4
-	MOVL	buf+0(FP), AX	// gobuf
-	LEAL	buf+0(FP), BX	// caller's SP
-	MOVL	BX, gobuf_sp(AX)
-	MOVL	0(SP), BX		// caller's PC
-	MOVL	BX, gobuf_pc(AX)
-	MOVQ	$0, gobuf_ret(AX)
-	// Assert ctxt is zero. See func save.
-	MOVL	gobuf_ctxt(AX), BX
-	TESTL	BX, BX
-	JZ	2(PC)
-	CALL	runtime·badctxt(SB)
-	get_tls(CX)
-	MOVL	g(CX), BX
-	MOVL	BX, gobuf_g(AX)
-	RET
-
-// void gogo(Gobuf*)
-// restore state from Gobuf; longjmp
-TEXT runtime·gogo(SB), NOSPLIT, $8-4
-	MOVL	buf+0(FP), BX		// gobuf
-	MOVL	gobuf_g(BX), DX
-	MOVL	0(DX), CX		// make sure g != nil
-	get_tls(CX)
-	MOVL	DX, g(CX)
-	MOVL	gobuf_sp(BX), SP	// restore SP
-	MOVL	gobuf_ctxt(BX), DX
-	MOVQ	gobuf_ret(BX), AX
-	MOVL	$0, gobuf_sp(BX)	// clear to help garbage collector
-	MOVQ	$0, gobuf_ret(BX)
-	MOVL	$0, gobuf_ctxt(BX)
-	MOVL	gobuf_pc(BX), BX
-	JMP	BX
-
-// func mcall(fn func(*g))
-// Switch to m->g0's stack, call fn(g).
-// Fn must never return. It should gogo(&g->sched)
-// to keep running g.
-TEXT runtime·mcall(SB), NOSPLIT, $0-4
-	MOVL	fn+0(FP), DI
-
-	get_tls(CX)
-	MOVL	g(CX), AX	// save state in g->sched
-	MOVL	0(SP), BX	// caller's PC
-	MOVL	BX, (g_sched+gobuf_pc)(AX)
-	LEAL	fn+0(FP), BX	// caller's SP
-	MOVL	BX, (g_sched+gobuf_sp)(AX)
-	MOVL	AX, (g_sched+gobuf_g)(AX)
-
-	// switch to m->g0 & its stack, call fn
-	MOVL	g(CX), BX
-	MOVL	g_m(BX), BX
-	MOVL	m_g0(BX), SI
-	CMPL	SI, AX	// if g == m->g0 call badmcall
-	JNE	3(PC)
-	MOVL	$runtime·badmcall(SB), AX
-	JMP	AX
-	MOVL	SI, g(CX)	// g = m->g0
-	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
-	PUSHQ	AX
-	MOVL	DI, DX
-	MOVL	0(DI), DI
-	CALL	DI
-	POPQ	AX
-	MOVL	$runtime·badmcall2(SB), AX
-	JMP	AX
-	RET
-
-// systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack. We need to distinguish the routine that
-// lives at the bottom of the G stack from the one that lives
-// at the top of the system stack because the one at the top of
-// the system stack terminates the stack walk (see topofstack()).
-TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
-	RET
-
-// func systemstack(fn func())
-TEXT runtime·systemstack(SB), NOSPLIT, $0-4
-	MOVL	fn+0(FP), DI	// DI = fn
-	get_tls(CX)
-	MOVL	g(CX), AX	// AX = g
-	MOVL	g_m(AX), BX	// BX = m
-
-	CMPL	AX, m_gsignal(BX)
-	JEQ	noswitch
-
-	MOVL	m_g0(BX), DX	// DX = g0
-	CMPL	AX, DX
-	JEQ	noswitch
-
-	CMPL	AX, m_curg(BX)
-	JNE	bad
-
-	// switch stacks
-	// save our state in g->sched. Pretend to
-	// be systemstack_switch if the G stack is scanned.
-	MOVL	$runtime·systemstack_switch(SB), SI
-	MOVL	SI, (g_sched+gobuf_pc)(AX)
-	MOVL	SP, (g_sched+gobuf_sp)(AX)
-	MOVL	AX, (g_sched+gobuf_g)(AX)
-
-	// switch to g0
-	MOVL	DX, g(CX)
-	MOVL	(g_sched+gobuf_sp)(DX), SP
-
-	// call target function
-	MOVL	DI, DX
-	MOVL	0(DI), DI
-	CALL	DI
-
-	// switch back to g
-	get_tls(CX)
-	MOVL	g(CX), AX
-	MOVL	g_m(AX), BX
-	MOVL	m_curg(BX), AX
-	MOVL	AX, g(CX)
-	MOVL	(g_sched+gobuf_sp)(AX), SP
-	MOVL	$0, (g_sched+gobuf_sp)(AX)
-	RET
-
-noswitch:
-	// already on m stack, just call directly
-	// Using a tail call here cleans up tracebacks since we won't stop
-	// at an intermediate systemstack.
-	MOVL	DI, DX
-	MOVL	0(DI), DI
-	JMP	DI
-
-bad:
-	// Not g0, not curg. Must be gsignal, but that's not allowed.
-	// Hide call from linker nosplit analysis.
-	MOVL	$runtime·badsystemstack(SB), AX
-	CALL	AX
-	INT	$3
-
-/*
- * support for morestack
- */
-
-// Called during function prolog when more stack is needed.
-//
-// The traceback routines see morestack on a g0 as being
-// the top of a stack (for example, morestack calling newstack
-// calling the scheduler calling newm calling gc), so we must
-// record an argument size. For that purpose, it has no arguments.
-TEXT runtime·morestack(SB),NOSPLIT,$0-0
-	get_tls(CX)
-	MOVL	g(CX), BX
-	MOVL	g_m(BX), BX
-
-	// Cannot grow scheduler stack (m->g0).
-	MOVL	m_g0(BX), SI
-	CMPL	g(CX), SI
-	JNE	3(PC)
-	CALL	runtime·badmorestackg0(SB)
-	MOVL	0, AX
-
-	// Cannot grow signal stack (m->gsignal).
-	MOVL	m_gsignal(BX), SI
-	CMPL	g(CX), SI
-	JNE	3(PC)
-	CALL	runtime·badmorestackgsignal(SB)
-	MOVL	0, AX
-
-	// Called from f.
-	// Set m->morebuf to f's caller.
-	NOP	SP	// tell vet SP changed - stop checking offsets
-	MOVL	8(SP), AX	// f's caller's PC
-	MOVL	AX, (m_morebuf+gobuf_pc)(BX)
-	LEAL	16(SP), AX	// f's caller's SP
-	MOVL	AX, (m_morebuf+gobuf_sp)(BX)
-	get_tls(CX)
-	MOVL	g(CX), SI
-	MOVL	SI, (m_morebuf+gobuf_g)(BX)
-
-	// Set g->sched to context in f.
-	MOVL	0(SP), AX // f's PC
-	MOVL	AX, (g_sched+gobuf_pc)(SI)
-	MOVL	SI, (g_sched+gobuf_g)(SI)
-	LEAL	8(SP), AX // f's SP
-	MOVL	AX, (g_sched+gobuf_sp)(SI)
-	MOVL	DX, (g_sched+gobuf_ctxt)(SI)
-
-	// Call newstack on m->g0's stack.
-	MOVL	m_g0(BX), BX
-	MOVL	BX, g(CX)
-	MOVL	(g_sched+gobuf_sp)(BX), SP
-	CALL	runtime·newstack(SB)
-	MOVL	$0, 0x1003	// crash if newstack returns
-	RET
-
-// morestack trampolines
-TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
-	MOVL	$0, DX
-	JMP	runtime·morestack(SB)
-
-// reflectcall: call a function with the given argument list
-// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
-// we don't have variable-sized frames, so we use a small number
-// of constant-sized-frame functions to encode a few bits of size in the pc.
-// Caution: ugly multiline assembly macros in your future!
-
-#define DISPATCH(NAME,MAXSIZE)		\
-	CMPL	CX, $MAXSIZE;		\
-	JA	3(PC);			\
-	MOVL	$NAME(SB), AX;		\
-	JMP	AX
-// Note: can't just "JMP NAME(SB)" - bad inlining results.
-
-TEXT ·reflectcall(SB), NOSPLIT, $0-20
-	MOVLQZX argsize+12(FP), CX
-	DISPATCH(runtime·call16, 16)
-	DISPATCH(runtime·call32, 32)
-	DISPATCH(runtime·call64, 64)
-	DISPATCH(runtime·call128, 128)
-	DISPATCH(runtime·call256, 256)
-	DISPATCH(runtime·call512, 512)
-	DISPATCH(runtime·call1024, 1024)
-	DISPATCH(runtime·call2048, 2048)
-	DISPATCH(runtime·call4096, 4096)
-	DISPATCH(runtime·call8192, 8192)
-	DISPATCH(runtime·call16384, 16384)
-	DISPATCH(runtime·call32768, 32768)
-	DISPATCH(runtime·call65536, 65536)
-	DISPATCH(runtime·call131072, 131072)
-	DISPATCH(runtime·call262144, 262144)
-	DISPATCH(runtime·call524288, 524288)
-	DISPATCH(runtime·call1048576, 1048576)
-	DISPATCH(runtime·call2097152, 2097152)
-	DISPATCH(runtime·call4194304, 4194304)
-	DISPATCH(runtime·call8388608, 8388608)
-	DISPATCH(runtime·call16777216, 16777216)
-	DISPATCH(runtime·call33554432, 33554432)
-	DISPATCH(runtime·call67108864, 67108864)
-	DISPATCH(runtime·call134217728, 134217728)
-	DISPATCH(runtime·call268435456, 268435456)
-	DISPATCH(runtime·call536870912, 536870912)
-	DISPATCH(runtime·call1073741824, 1073741824)
-	MOVL	$runtime·badreflectcall(SB), AX
-	JMP	AX
-
-#define CALLFN(NAME,MAXSIZE)			\
-TEXT NAME(SB), WRAPPER, $MAXSIZE-20;		\
-	NO_LOCAL_POINTERS;			\
-	/* copy arguments to stack */		\
-	MOVL	argptr+8(FP), SI;		\
-	MOVL	argsize+12(FP), CX;		\
-	MOVL	SP, DI;				\
-	REP;MOVSB;				\
-	/* call function */			\
-	MOVL	f+4(FP), DX;			\
-	MOVL	(DX), AX;			\
-	CALL	AX;				\
-	/* copy return values back */		\
-	MOVL	argtype+0(FP), DX;		\
-	MOVL	argptr+8(FP), DI;		\
-	MOVL	argsize+12(FP), CX;		\
-	MOVL	retoffset+16(FP), BX;		\
-	MOVL	SP, SI;				\
-	ADDL	BX, DI;				\
-	ADDL	BX, SI;				\
-	SUBL	BX, CX;				\
-	CALL	callRet<>(SB);			\
-	RET
-
-// callRet copies return values back at the end of call*. This is a
-// separate function so it can allocate stack space for the arguments
-// to reflectcallmove. It does not follow the Go ABI; it expects its
-// arguments in registers.
-TEXT callRet<>(SB), NOSPLIT, $16-0
-	MOVL	DX, 0(SP)
-	MOVL	DI, 4(SP)
-	MOVL	SI, 8(SP)
-	MOVL	CX, 12(SP)
-	CALL	runtime·reflectcallmove(SB)
-	RET
-
-CALLFN(·call16, 16)
-CALLFN(·call32, 32)
-CALLFN(·call64, 64)
-CALLFN(·call128, 128)
-CALLFN(·call256, 256)
-CALLFN(·call512, 512)
-CALLFN(·call1024, 1024)
-CALLFN(·call2048, 2048)
-CALLFN(·call4096, 4096)
-CALLFN(·call8192, 8192)
-CALLFN(·call16384, 16384)
-CALLFN(·call32768, 32768)
-CALLFN(·call65536, 65536)
-CALLFN(·call131072, 131072)
-CALLFN(·call262144, 262144)
-CALLFN(·call524288, 524288)
-CALLFN(·call1048576, 1048576)
-CALLFN(·call2097152, 2097152)
-CALLFN(·call4194304, 4194304)
-CALLFN(·call8388608, 8388608)
-CALLFN(·call16777216, 16777216)
-CALLFN(·call33554432, 33554432)
-CALLFN(·call67108864, 67108864)
-CALLFN(·call134217728, 134217728)
-CALLFN(·call268435456, 268435456)
-CALLFN(·call536870912, 536870912)
-CALLFN(·call1073741824, 1073741824)
-
-TEXT runtime·procyield(SB),NOSPLIT,$0-0
-	MOVL	cycles+0(FP), AX
-again:
-	PAUSE
-	SUBL	$1, AX
-	JNZ	again
-	RET
-
-TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
-	// Stores are already ordered on x86, so this is just a
-	// compile barrier.
-	RET
-
-// void jmpdefer(fn, sp);
-// called from deferreturn.
-// 1. pop the caller
-// 2. sub 5 bytes from the callers return
-// 3. jmp to the argument
-TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
-	MOVL	fv+0(FP), DX
-	MOVL	argp+4(FP), BX
-	LEAL	-8(BX), SP	// caller sp after CALL
-	SUBL	$5, (SP)	// return to CALL again
-	MOVL	0(DX), BX
-	JMP	BX	// but first run the deferred function
-
-// func asmcgocall(fn, arg unsafe.Pointer) int32
-// Not implemented.
-TEXT runtime·asmcgocall(SB),NOSPLIT,$0-12
-	MOVL	0, AX // crash
-	MOVL	$0, ret+8(FP) // for vet
-	RET
-
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
-// Not implemented.
-TEXT runtime·cgocallback(SB),NOSPLIT,$0-16
-	MOVL	0, AX
-	RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
-// Not implemented.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$0-16
-	MOVL	0, AX
-	RET
-
-// void setg(G*); set g. for use by needm.
-// Not implemented.
-TEXT runtime·setg(SB), NOSPLIT, $0-4
-	MOVL	0, AX
-	RET
-
-TEXT runtime·abort(SB),NOSPLIT,$0-0
-	INT	$3
-loop:
-	JMP	loop
-
-// check that SP is in range [g->stack.lo, g->stack.hi)
-TEXT runtime·stackcheck(SB), NOSPLIT, $0-0
-	get_tls(CX)
-	MOVL	g(CX), AX
-	CMPL	(g_stack+stack_hi)(AX), SP
-	JHI	2(PC)
-	MOVL	0, AX
-	CMPL	SP, (g_stack+stack_lo)(AX)
-	JHI	2(PC)
-	MOVL	0, AX
-	RET
-
-// int64 runtime·cputicks(void)
-TEXT runtime·cputicks(SB),NOSPLIT,$0-0
-	RDTSC
-	SHLQ	$32, DX
-	ADDQ	DX, AX
-	MOVQ	AX, ret+0(FP)
-	RET
-
-// hash function using AES hardware instructions
-// For now, our one amd64p32 system (NaCl) does not
-// support using AES instructions, so have not bothered to
-// write the implementations. Can copy and adjust the ones
-// in asm_amd64.s when the time comes.
-
-TEXT runtime·aeshash(SB),NOSPLIT,$0-20
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·aeshash64(SB),NOSPLIT,$0-12
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·return0(SB), NOSPLIT, $0
-	MOVL	$0, AX
-	RET
-
-// The top-most function running on a goroutine
-// returns to goexit+PCQuantum.
-TEXT runtime·goexit(SB),NOSPLIT,$0-0
-	BYTE	$0x90	// NOP
-	CALL	runtime·goexit1(SB)	// does not return
-	// traceback from goexit1 must hit code range of goexit
-	BYTE	$0x90	// NOP
-
-TEXT ·checkASM(SB),NOSPLIT,$0-1
-	MOVB	$1, ret+0(FP)
-	RET
-
-// gcWriteBarrier performs a heap pointer write and informs the GC.
-//
-// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
-// - DI is the destination of the write
-// - AX is the value being written at DI
-// It clobbers FLAGS and SI. It does not clobber any other general-purpose registers,
-// but may clobber others (e.g., SSE registers).
-TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$88
-	// Save the registers clobbered by the fast path. This is slightly
-	// faster than having the caller spill these.
-	MOVQ	R14, 72(SP)
-	MOVQ	R13, 80(SP)
-	// TODO: Consider passing g.m.p in as an argument so they can be shared
-	// across a sequence of write barriers.
-	get_tls(R13)
-	MOVL	g(R13), R13
-	MOVL	g_m(R13), R13
-	MOVL	m_p(R13), R13
-	MOVL	(p_wbBuf+wbBuf_next)(R13), R14
-	// Increment wbBuf.next position.
-	LEAL	8(R14), R14
-	MOVL	R14, (p_wbBuf+wbBuf_next)(R13)
-	CMPL	R14, (p_wbBuf+wbBuf_end)(R13)
-	// Record the write.
-	MOVL	AX, -8(R14)	// Record value
-	MOVL	(DI), R13	// TODO: This turns bad writes into bad reads.
-	MOVL	R13, -4(R14)	// Record *slot
-	// Is the buffer full? (flags set in CMPL above)
-	JEQ	flush
-ret:
-	MOVQ	72(SP), R14
-	MOVQ	80(SP), R13
-	// Do the write.
-	MOVL	AX, (DI)
-	RET			// Clobbers SI on NaCl
-
-flush:
-	// Save all general purpose registers since these could be
-	// clobbered by wbBufFlush and were not saved by the caller.
-	// It is possible for wbBufFlush to clobber other registers
-	// (e.g., SSE registers), but the compiler takes care of saving
-	// those in the caller if necessary. This strikes a balance
-	// with registers that are likely to be used.
-	//
-	// We don't have type information for these, but all code under
-	// here is NOSPLIT, so nothing will observe these.
-	//
-	// TODO: We could strike a different balance; e.g., saving X0
-	// and not saving GP registers that are less likely to be used.
-	MOVL	DI, 0(SP)	// Also first argument to wbBufFlush
-	MOVL	AX, 4(SP)	// Also second argument to wbBufFlush
-	MOVQ	BX, 8(SP)
-	MOVQ	CX, 16(SP)
-	MOVQ	DX, 24(SP)
-	// DI already saved
-	// SI is always clobbered on nacl
-	// BP is reserved on nacl
-	MOVQ	R8, 32(SP)
-	MOVQ	R9, 40(SP)
-	MOVQ	R10, 48(SP)
-	MOVQ	R11, 56(SP)
-	MOVQ	R12, 64(SP)
-	// R13 already saved
-	// R14 already saved
-	// R15 is reserved on nacl
-
-	// This takes arguments DI and AX
-	CALL	runtime·wbBufFlush(SB)
-
-	MOVL	0(SP), DI
-	MOVL	4(SP), AX
-	MOVQ	8(SP), BX
-	MOVQ	16(SP), CX
-	MOVQ	24(SP), DX
-	MOVQ	32(SP), R8
-	MOVQ	40(SP), R9
-	MOVQ	48(SP), R10
-	MOVQ	56(SP), R11
-	MOVQ	64(SP), R12
-	JMP	ret
-
-// Note: these functions use a special calling convention to save generated code space.
-// Arguments are passed in registers, but the space for those arguments are allocated
-// in the caller's stack frame. These stubs write the args into that stack space and
-// then tail call to the corresponding runtime handler.
-// The tail call makes these stubs disappear in backtraces.
-TEXT runtime·panicIndex(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicIndex(SB)
-TEXT runtime·panicIndexU(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicIndexU(SB)
-TEXT runtime·panicSliceAlen(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSliceAlen(SB)
-TEXT runtime·panicSliceAlenU(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSliceAlenU(SB)
-TEXT runtime·panicSliceAcap(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSliceAcap(SB)
-TEXT runtime·panicSliceAcapU(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSliceAcapU(SB)
-TEXT runtime·panicSliceB(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicSliceB(SB)
-TEXT runtime·panicSliceBU(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicSliceBU(SB)
-TEXT runtime·panicSlice3Alen(SB),NOSPLIT,$0-8
-	MOVL	DX, x+0(FP)
-	MOVL	BX, y+4(FP)
-	JMP	runtime·goPanicSlice3Alen(SB)
-TEXT runtime·panicSlice3AlenU(SB),NOSPLIT,$0-8
-	MOVL	DX, x+0(FP)
-	MOVL	BX, y+4(FP)
-	JMP	runtime·goPanicSlice3AlenU(SB)
-TEXT runtime·panicSlice3Acap(SB),NOSPLIT,$0-8
-	MOVL	DX, x+0(FP)
-	MOVL	BX, y+4(FP)
-	JMP	runtime·goPanicSlice3Acap(SB)
-TEXT runtime·panicSlice3AcapU(SB),NOSPLIT,$0-8
-	MOVL	DX, x+0(FP)
-	MOVL	BX, y+4(FP)
-	JMP	runtime·goPanicSlice3AcapU(SB)
-TEXT runtime·panicSlice3B(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSlice3B(SB)
-TEXT runtime·panicSlice3BU(SB),NOSPLIT,$0-8
-	MOVL	CX, x+0(FP)
-	MOVL	DX, y+4(FP)
-	JMP	runtime·goPanicSlice3BU(SB)
-TEXT runtime·panicSlice3C(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicSlice3C(SB)
-TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-8
-	MOVL	AX, x+0(FP)
-	MOVL	CX, y+4(FP)
-	JMP	runtime·goPanicSlice3CU(SB)
-
-// Extended versions for 64-bit indexes.
-TEXT runtime·panicExtendIndex(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendIndex(SB)
-TEXT runtime·panicExtendIndexU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendIndexU(SB)
-TEXT runtime·panicExtendSliceAlen(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceAlen(SB)
-TEXT runtime·panicExtendSliceAlenU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceAlenU(SB)
-TEXT runtime·panicExtendSliceAcap(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceAcap(SB)
-TEXT runtime·panicExtendSliceAcapU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceAcapU(SB)
-TEXT runtime·panicExtendSliceB(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceB(SB)
-TEXT runtime·panicExtendSliceBU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendSliceBU(SB)
-TEXT runtime·panicExtendSlice3Alen(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	DX, lo+4(FP)
-	MOVL	BX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3Alen(SB)
-TEXT runtime·panicExtendSlice3AlenU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	DX, lo+4(FP)
-	MOVL	BX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3AlenU(SB)
-TEXT runtime·panicExtendSlice3Acap(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	DX, lo+4(FP)
-	MOVL	BX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3Acap(SB)
-TEXT runtime·panicExtendSlice3AcapU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	DX, lo+4(FP)
-	MOVL	BX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3AcapU(SB)
-TEXT runtime·panicExtendSlice3B(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3B(SB)
-TEXT runtime·panicExtendSlice3BU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	CX, lo+4(FP)
-	MOVL	DX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3BU(SB)
-TEXT runtime·panicExtendSlice3C(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3C(SB)
-TEXT runtime·panicExtendSlice3CU(SB),NOSPLIT,$0-12
-	MOVL	SI, hi+0(FP)
-	MOVL	AX, lo+4(FP)
-	MOVL	CX, y+8(FP)
-	JMP	runtime·goPanicExtendSlice3CU(SB)
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 5a7e3b7..51a50c6 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -185,15 +185,11 @@
 TEXT runtime·breakpoint(SB),NOSPLIT,$0-0
 	// gdb won't skip this breakpoint instruction automatically,
 	// so you must manually "set $pc+=4" to skip it and continue.
-#ifdef GOOS_nacl
-	WORD	$0xe125be7f	// BKPT 0x5bef, NACL_INSTR_ARM_BREAKPOINT
-#else
 #ifdef GOOS_plan9
 	WORD	$0xD1200070	// undefined instruction used as armv5 breakpoint in Plan 9
 #else
 	WORD	$0xe7f001f0	// undefined instruction that gdb understands is a software breakpoint
 #endif
-#endif
 	RET
 
 TEXT runtime·asminit(SB),NOSPLIT,$0-0
@@ -327,9 +323,6 @@
 	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVW	$runtime·systemstack_switch(SB), R3
-#ifdef GOOS_nacl
-	ADD	$4, R3, R3 // get past nacl-insert bic instruction
-#endif
 	ADD	$4, R3, R3 // get past push {lr}
 	MOVW	R3, (g_sched+gobuf_pc)(g)
 	MOVW	R13, (g_sched+gobuf_sp)(g)
@@ -817,18 +810,14 @@
 	RET
 
 // AES hashing not implemented for ARM
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	$0, R0
-	MOVW	(R0), R1
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	$0, R0
-	MOVW	(R0), R1
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	$0, R0
-	MOVW	(R0), R1
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	$0, R0
-	MOVW	(R0), R1
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·memhash64Fallback(SB)
 
 TEXT runtime·return0(SB),NOSPLIT,$0
 	MOVW	$0, R0
@@ -891,7 +880,6 @@
 	SUB	R1, R3, R1
 	RET
 
-#ifndef GOOS_nacl
 // This is called from .init_array and follows the platform, not Go, ABI.
 TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
 	MOVW	R9, saver9-4(SP) // The access to global variables below implicitly uses R9, which is callee-save
@@ -902,7 +890,6 @@
 	MOVW	saver11-8(SP), R11
 	MOVW	saver9-4(SP), R9
 	RET
-#endif
 
 TEXT ·checkASM(SB),NOSPLIT,$0-1
 	MOVW	$1, R3
@@ -939,8 +926,6 @@
 	MOVM.IA.W	(R13), [R0,R1]
 	// Do the write.
 	MOVW	R3, (R2)
-	// Normally RET on nacl clobbers R12, but because this
-	// function has no frame it doesn't have to usual epilogue.
 	RET
 
 flush:
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 6e3b1b1..6b3d1e7 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -27,8 +27,7 @@
 
 	// if there is a _cgo_init, call it using the gcc ABI.
 	MOVD	_cgo_init(SB), R12
-	CMP	$0, R12
-	BEQ	nocgo
+	CBZ	R12, nocgo
 
 	MRS_TPIDR_R0			// load TLS base pointer
 	MOVD	R0, R3			// arg 3: TLS base pointer
@@ -114,8 +113,7 @@
 	MOVD	ZR, gobuf_ret(R3)
 	// Assert ctxt is zero. See func save.
 	MOVD	gobuf_ctxt(R3), R0
-	CMP	$0, R0
-	BEQ	2(PC)
+	CBZ	R0, 2(PC)
 	CALL	runtime·badctxt(SB)
 	RET
 
@@ -445,8 +443,10 @@
 CALLFN(·call536870912, 536870920 )
 CALLFN(·call1073741824, 1073741832 )
 
-// func aeshash32(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-24
+// func memhash32(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	MOVB	runtime·useAeshash(SB), R0
+	CBZ	R0, noaes
 	MOVD	p+0(FP), R0
 	MOVD	h+8(FP), R1
 	MOVD	$ret+16(FP), R2
@@ -465,9 +465,13 @@
 
 	VST1	[V0.D1], (R2)
 	RET
+noaes:
+	B	runtime·memhash32Fallback(SB)
 
-// func aeshash64(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-24
+// func memhash64(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	MOVB	runtime·useAeshash(SB), R0
+	CBZ	R0, noaes
 	MOVD	p+0(FP), R0
 	MOVD	h+8(FP), R1
 	MOVD	$ret+16(FP), R2
@@ -486,31 +490,41 @@
 
 	VST1	[V0.D1], (R2)
 	RET
+noaes:
+	B	runtime·memhash64Fallback(SB)
 
-// func aeshash(p unsafe.Pointer, h, size uintptr) uintptr
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-32
+// func memhash(p unsafe.Pointer, h, size uintptr) uintptr
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	MOVB	runtime·useAeshash(SB), R0
+	CBZ	R0, noaes
 	MOVD	p+0(FP), R0
 	MOVD	s+16(FP), R1
-	MOVWU	h+8(FP), R3
+	MOVD	h+8(FP), R3
 	MOVD	$ret+24(FP), R2
 	B	aeshashbody<>(SB)
+noaes:
+	B	runtime·memhashFallback(SB)
 
-// func aeshashstr(p unsafe.Pointer, h uintptr) uintptr
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-24
+// func strhash(p unsafe.Pointer, h uintptr) uintptr
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	MOVB	runtime·useAeshash(SB), R0
+	CBZ	R0, noaes
 	MOVD	p+0(FP), R10 // string pointer
 	LDP	(R10), (R0, R1) // string data / length
-	MOVWU	h+8(FP), R3
+	MOVD	h+8(FP), R3
 	MOVD	$ret+16(FP), R2 // return address
 	B	aeshashbody<>(SB)
+noaes:
+	B	runtime·strhashFallback(SB)
 
 // R0: data
-// R1: length (maximum 32 bits)
+// R1: length
 // R2: address to put return value
 // R3: seed data
 TEXT aeshashbody<>(SB),NOSPLIT|NOFRAME,$0
 	VEOR	V30.B16, V30.B16, V30.B16
-	VMOV	R3, V30.S[0]
-	VMOV	R1, V30.S[1] // load length into seed
+	VMOV	R3, V30.D[0]
+	VMOV	R1, V30.D[1] // load length into seed
 
 	MOVD	$runtime·aeskeysched+0(SB), R4
 	VLD1.P	16(R4), [V0.B16]
@@ -528,8 +542,7 @@
 	B	aes129plus
 
 aes0to15:
-	CMP	$0, R1
-	BEQ	aes0
+	CBZ	R1, aes0
 	VEOR	V2.B16, V2.B16, V2.B16
 	TBZ	$3, R1, less_than_8
 	VLD1.P	8(R0), V2.D[0]
@@ -859,8 +872,7 @@
 	MOVD	$0, (g_sched+gobuf_ret)(g)
 	// Assert ctxt is zero. See func save.
 	MOVD	(g_sched+gobuf_ctxt)(g), R0
-	CMP	$0, R0
-	BEQ	2(PC)
+	CBZ	R0, 2(PC)
 	CALL	runtime·badctxt(SB)
 	RET
 
@@ -873,8 +885,7 @@
 	MOVD	arg+8(FP), R0
 
 	MOVD	RSP, R2		// save original stack pointer
-	CMP	$0, g
-	BEQ	nosave
+	CBZ	g, nosave
 	MOVD	g, R4
 
 	// Figure out if we need to switch to m->g0 stack.
@@ -970,8 +981,7 @@
 
 	// Load g from thread-local storage.
 	MOVB	runtime·iscgo(SB), R3
-	CMP	$0, R3
-	BEQ	nocgo
+	CBZ	R3, nocgo
 	BL	runtime·load_g(SB)
 nocgo:
 
@@ -980,8 +990,7 @@
 	// In this case, we're running on the thread stack, so there's
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
-	CMP	$0, g
-	BEQ	needm
+	CBZ	g, needm
 
 	MOVD	g_m(g), R8
 	MOVD	R8, savedm-8(SP)
@@ -1072,8 +1081,7 @@
 	// If the m on entry was nil, we called needm above to borrow an m
 	// for the duration of the call. Since the call is over, return it with dropm.
 	MOVD	savedm-8(SP), R6
-	CMP	$0, R6
-	BNE	droppedm
+	CBNZ	R6, droppedm
 	MOVD	$runtime·dropm(SB), R0
 	BL	(R0)
 droppedm:
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index 8e59140..7330f40 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -610,14 +610,14 @@
 	UNDEF
 
 // AES hashing not implemented for mips64
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback(SB)
 
 TEXT runtime·return0(SB), NOSPLIT, $0
 	MOVW	$0, R1
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 971dc37..aca0510 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -611,21 +611,15 @@
 TEXT runtime·abort(SB),NOSPLIT,$0-0
 	UNDEF
 
-// Not implemented.
-TEXT runtime·aeshash(SB),NOSPLIT,$0
-	UNDEF
-
-// Not implemented.
-TEXT runtime·aeshash32(SB),NOSPLIT,$0
-	UNDEF
-
-// Not implemented.
-TEXT runtime·aeshash64(SB),NOSPLIT,$0
-	UNDEF
-
-// Not implemented.
-TEXT runtime·aeshashstr(SB),NOSPLIT,$0
-	UNDEF
+// AES hashing not implemented for mips
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-16
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	runtime·memhash64Fallback(SB)
 
 TEXT runtime·return0(SB),NOSPLIT,$0
 	MOVW	$0, R1
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 441042c..11d2f2f 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -833,14 +833,14 @@
 	RET
 
 // AES hashing not implemented for ppc64
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R1
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback(SB)
 
 TEXT runtime·return0(SB), NOSPLIT, $0
 	MOVW	$0, R3
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
new file mode 100644
index 0000000..d7c45a1
--- /dev/null
+++ b/src/runtime/asm_riscv64.s
@@ -0,0 +1,669 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// func rt0_go()
+TEXT runtime·rt0_go(SB),NOSPLIT,$0
+	// X2 = stack; A0 = argc; A1 = argv
+
+	ADD	$-24, X2
+	MOV	A0, 8(X2) // argc
+	MOV	A1, 16(X2) // argv
+
+	// create istack out of the given (operating system) stack.
+	// _cgo_init may update stackguard.
+	MOV	$runtime·g0(SB), g
+	MOV	$(-64*1024), T0
+	ADD	T0, X2, T1
+	MOV	T1, g_stackguard0(g)
+	MOV	T1, g_stackguard1(g)
+	MOV	T1, (g_stack+stack_lo)(g)
+	MOV	X2, (g_stack+stack_hi)(g)
+
+	// if there is a _cgo_init, call it using the gcc ABI.
+	MOV	_cgo_init(SB), T0
+	BEQ	T0, ZERO, nocgo
+
+	MOV	ZERO, A3	// arg 3: not used
+	MOV	ZERO, A2	// arg 2: not used
+	MOV	$setg_gcc<>(SB), A1	// arg 1: setg
+	MOV	g, A0	// arg 0: G
+	JALR	RA, T0
+
+nocgo:
+	// update stackguard after _cgo_init
+	MOV	(g_stack+stack_lo)(g), T0
+	ADD	$const__StackGuard, T0
+	MOV	T0, g_stackguard0(g)
+	MOV	T0, g_stackguard1(g)
+
+	// set the per-goroutine and per-mach "registers"
+	MOV	$runtime·m0(SB), T0
+
+	// save m->g0 = g0
+	MOV	g, m_g0(T0)
+	// save m0 to g0->m
+	MOV	T0, g_m(g)
+
+	CALL	runtime·check(SB)
+
+	// args are already prepared
+	CALL	runtime·args(SB)
+	CALL	runtime·osinit(SB)
+	CALL	runtime·schedinit(SB)
+
+	// create a new goroutine to start program
+	MOV	$runtime·mainPC(SB), T0		// entry
+	ADD	$-24, X2
+	MOV	T0, 16(X2)
+	MOV	ZERO, 8(X2)
+	MOV	ZERO, 0(X2)
+	CALL	runtime·newproc(SB)
+	ADD	$24, X2
+
+	// start this M
+	CALL	runtime·mstart(SB)
+
+	WORD $0 // crash if reached
+	RET
+
+// void setg_gcc(G*); set g called from gcc with g in A0
+TEXT setg_gcc<>(SB),NOSPLIT,$0-0
+	MOV	A0, g
+	CALL	runtime·save_g(SB)
+	RET
+
+// func cputicks() int64
+TEXT runtime·cputicks(SB),NOSPLIT,$0-8
+	WORD	$0xc0102573	// rdtime a0
+	MOV	A0, ret+0(FP)
+	RET
+
+// systemstack_switch is a dummy routine that systemstack leaves at the bottom
+// of the G stack. We need to distinguish the routine that
+// lives at the bottom of the G stack from the one that lives
+// at the top of the system stack because the one at the top of
+// the system stack terminates the stack walk (see topofstack()).
+TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
+	UNDEF
+	JALR	RA, ZERO	// make sure this function is not leaf
+	RET
+
+// func systemstack(fn func())
+TEXT runtime·systemstack(SB), NOSPLIT, $0-8
+	MOV	fn+0(FP), CTXT	// CTXT = fn
+	MOV	g_m(g), T0	// T0 = m
+
+	MOV	m_gsignal(T0), T1	// T1 = gsignal
+	BEQ	g, T1, noswitch
+
+	MOV	m_g0(T0), T1	// T1 = g0
+	BEQ	g, T1, noswitch
+
+	MOV	m_curg(T0), T2
+	BEQ	g, T2, switch
+
+	// Bad: g is not gsignal, not g0, not curg. What is it?
+	// Hide call from linker nosplit analysis.
+	MOV	$runtime·badsystemstack(SB), T1
+	JALR	RA, T1
+
+switch:
+	// save our state in g->sched. Pretend to
+	// be systemstack_switch if the G stack is scanned.
+	MOV	$runtime·systemstack_switch(SB), T2
+	ADD	$8, T2	// get past prologue
+	MOV	T2, (g_sched+gobuf_pc)(g)
+	MOV	X2, (g_sched+gobuf_sp)(g)
+	MOV	ZERO, (g_sched+gobuf_lr)(g)
+	MOV	g, (g_sched+gobuf_g)(g)
+
+	// switch to g0
+	MOV	T1, g
+	CALL	runtime·save_g(SB)
+	MOV	(g_sched+gobuf_sp)(g), T0
+	// make it look like mstart called systemstack on g0, to stop traceback
+	ADD	$-8, T0
+	MOV	$runtime·mstart(SB), T1
+	MOV	T1, 0(T0)
+	MOV	T0, X2
+
+	// call target function
+	MOV	0(CTXT), T1	// code pointer
+	JALR	RA, T1
+
+	// switch back to g
+	MOV	g_m(g), T0
+	MOV	m_curg(T0), g
+	CALL	runtime·save_g(SB)
+	MOV	(g_sched+gobuf_sp)(g), X2
+	MOV	ZERO, (g_sched+gobuf_sp)(g)
+	RET
+
+noswitch:
+	// already on m stack, just call directly
+	// Using a tail call here cleans up tracebacks since we won't stop
+	// at an intermediate systemstack.
+	MOV	0(CTXT), T1	// code pointer
+	ADD	$8, X2
+	JMP	(T1)
+
+TEXT runtime·getcallerpc(SB),NOSPLIT|NOFRAME,$0-8
+	MOV	0(X2), T0		// LR saved by caller
+	MOV	T0, ret+0(FP)
+	RET
+
+/*
+ * support for morestack
+ */
+
+// Called during function prolog when more stack is needed.
+// Caller has already loaded:
+// R1: framesize, R2: argsize, R3: LR
+//
+// The traceback routines see morestack on a g0 as being
+// the top of a stack (for example, morestack calling newstack
+// calling the scheduler calling newm calling gc), so we must
+// record an argument size. For that purpose, it has no arguments.
+
+// func morestack()
+TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
+	// Cannot grow scheduler stack (m->g0).
+	MOV	g_m(g), A0
+	MOV	m_g0(A0), A1
+	BNE	g, A1, 3(PC)
+	CALL	runtime·badmorestackg0(SB)
+	CALL	runtime·abort(SB)
+
+	// Cannot grow signal stack (m->gsignal).
+	MOV	m_gsignal(A0), A1
+	BNE	g, A1, 3(PC)
+	CALL	runtime·badmorestackgsignal(SB)
+	CALL	runtime·abort(SB)
+
+	// Called from f.
+	// Set g->sched to context in f.
+	MOV	X2, (g_sched+gobuf_sp)(g)
+	MOV	T0, (g_sched+gobuf_pc)(g)
+	MOV	RA, (g_sched+gobuf_lr)(g)
+	MOV	CTXT, (g_sched+gobuf_ctxt)(g)
+
+	// Called from f.
+	// Set m->morebuf to f's caller.
+	MOV	RA, (m_morebuf+gobuf_pc)(A0)	// f's caller's PC
+	MOV	X2, (m_morebuf+gobuf_sp)(A0)	// f's caller's SP
+	MOV	g, (m_morebuf+gobuf_g)(A0)
+
+	// Call newstack on m->g0's stack.
+	MOV	m_g0(A0), g
+	CALL	runtime·save_g(SB)
+	MOV	(g_sched+gobuf_sp)(g), X2
+	// Create a stack frame on g0 to call newstack.
+	MOV	ZERO, -8(X2)	// Zero saved LR in frame
+	ADD	$-8, X2
+	CALL	runtime·newstack(SB)
+
+	// Not reached, but make sure the return PC from the call to newstack
+	// is still in this function, and not the beginning of the next.
+	UNDEF
+
+// func morestack_noctxt()
+TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	MOV	ZERO, CTXT
+	JMP	runtime·morestack(SB)
+
+// AES hashing not implemented for riscv64
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback(SB)
+
+// func return0()
+TEXT runtime·return0(SB), NOSPLIT, $0
+	MOV	$0, A0
+	RET
+
+// restore state from Gobuf; longjmp
+
+// func gogo(buf *gobuf)
+TEXT runtime·gogo(SB), NOSPLIT, $16-8
+	MOV	buf+0(FP), T0
+	MOV	gobuf_g(T0), g	// make sure g is not nil
+	CALL	runtime·save_g(SB)
+
+	MOV	(g), ZERO // make sure g is not nil
+	MOV	gobuf_sp(T0), X2
+	MOV	gobuf_lr(T0), RA
+	MOV	gobuf_ret(T0), A0
+	MOV	gobuf_ctxt(T0), CTXT
+	MOV	ZERO, gobuf_sp(T0)
+	MOV	ZERO, gobuf_ret(T0)
+	MOV	ZERO, gobuf_lr(T0)
+	MOV	ZERO, gobuf_ctxt(T0)
+	MOV	gobuf_pc(T0), T0
+	JALR	ZERO, T0
+
+// func jmpdefer(fv *funcval, argp uintptr)
+// called from deferreturn
+// 1. grab stored return address from the caller's frame
+// 2. sub 8 bytes to get back to JAL deferreturn
+// 3. JMP to fn
+TEXT runtime·jmpdefer(SB), NOSPLIT|NOFRAME, $0-16
+	MOV	0(X2), RA
+	ADD	$-8, RA
+
+	MOV	fv+0(FP), CTXT
+	MOV	argp+8(FP), X2
+	ADD	$-8, X2
+	MOV	0(CTXT), T0
+	JALR	ZERO, T0
+
+// func procyield(cycles uint32)
+TEXT runtime·procyield(SB),NOSPLIT,$0-0
+	RET
+
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return. It should gogo(&g->sched)
+// to keep running g.
+
+// func mcall(fn func(*g))
+TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8
+	// Save caller state in g->sched
+	MOV	X2, (g_sched+gobuf_sp)(g)
+	MOV	RA, (g_sched+gobuf_pc)(g)
+	MOV	ZERO, (g_sched+gobuf_lr)(g)
+	MOV	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOV	g, T0
+	MOV	g_m(g), T1
+	MOV	m_g0(T1), g
+	CALL	runtime·save_g(SB)
+	BNE	g, T0, 2(PC)
+	JMP	runtime·badmcall(SB)
+	MOV	fn+0(FP), CTXT			// context
+	MOV	0(CTXT), T1			// code pointer
+	MOV	(g_sched+gobuf_sp)(g), X2	// sp = m->g0->sched.sp
+	ADD	$-16, X2
+	MOV	T0, 8(X2)
+	MOV	ZERO, 0(X2)
+	JALR	RA, T1
+	JMP	runtime·badmcall2(SB)
+
+// func gosave(buf *gobuf)
+// save state in Gobuf; setjmp
+TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8
+	MOV	buf+0(FP), T1
+	MOV	X2, gobuf_sp(T1)
+	MOV	RA, gobuf_pc(T1)
+	MOV	g, gobuf_g(T1)
+	MOV	ZERO, gobuf_lr(T1)
+	MOV	ZERO, gobuf_ret(T1)
+	// Assert ctxt is zero. See func save.
+	MOV	gobuf_ctxt(T1), T1
+	BEQ	T1, ZERO, 2(PC)
+	CALL	runtime·badctxt(SB)
+	RET
+
+// func asmcgocall(fn, arg unsafe.Pointer) int32
+TEXT ·asmcgocall(SB),NOSPLIT,$0-20
+	// TODO(jsing): Add support for cgo - issue #36641.
+	WORD $0		// crash
+
+// func asminit()
+TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0
+	RET
+
+// reflectcall: call a function with the given argument list
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
+// we don't have variable-sized frames, so we use a small number
+// of constant-sized-frame functions to encode a few bits of size in the pc.
+// Caution: ugly multiline assembly macros in your future!
+
+#define DISPATCH(NAME,MAXSIZE)	\
+	MOV	$MAXSIZE, T1	\
+	BLTU	T1, T0, 3(PC)	\
+	MOV	$NAME(SB), T2;	\
+	JALR	ZERO, T2
+// Note: can't just "BR NAME(SB)" - bad inlining results.
+
+// func call(argtype *rtype, fn, arg unsafe.Pointer, n uint32, retoffset uint32)
+TEXT reflect·call(SB), NOSPLIT, $0-0
+	JMP	·reflectcall(SB)
+
+// func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32)
+TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
+	MOVWU argsize+24(FP), T0
+	DISPATCH(runtime·call32, 32)
+	DISPATCH(runtime·call64, 64)
+	DISPATCH(runtime·call128, 128)
+	DISPATCH(runtime·call256, 256)
+	DISPATCH(runtime·call512, 512)
+	DISPATCH(runtime·call1024, 1024)
+	DISPATCH(runtime·call2048, 2048)
+	DISPATCH(runtime·call4096, 4096)
+	DISPATCH(runtime·call8192, 8192)
+	DISPATCH(runtime·call16384, 16384)
+	DISPATCH(runtime·call32768, 32768)
+	DISPATCH(runtime·call65536, 65536)
+	DISPATCH(runtime·call131072, 131072)
+	DISPATCH(runtime·call262144, 262144)
+	DISPATCH(runtime·call524288, 524288)
+	DISPATCH(runtime·call1048576, 1048576)
+	DISPATCH(runtime·call2097152, 2097152)
+	DISPATCH(runtime·call4194304, 4194304)
+	DISPATCH(runtime·call8388608, 8388608)
+	DISPATCH(runtime·call16777216, 16777216)
+	DISPATCH(runtime·call33554432, 33554432)
+	DISPATCH(runtime·call67108864, 67108864)
+	DISPATCH(runtime·call134217728, 134217728)
+	DISPATCH(runtime·call268435456, 268435456)
+	DISPATCH(runtime·call536870912, 536870912)
+	DISPATCH(runtime·call1073741824, 1073741824)
+	MOV	$runtime·badreflectcall(SB), T2
+	JALR	ZERO, T2
+
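
The DISPATCH chain above selects the smallest fixed-frame call* helper whose MAXSIZE can hold the argument block, falling through to the next size class otherwise. A minimal Go sketch of that selection logic, with stub names standing in for the assembly symbols (illustrative only, not part of the patch):

    package main

    import "fmt"

    // Stand-ins for the fixed-frame helpers generated by CALLFN below.
    func call32()  { fmt.Println("call32") }
    func call64()  { fmt.Println("call64") }
    func call128() { fmt.Println("call128") }

    func badreflectcall() { panic("argument frame too large") }

    // dispatch mirrors DISPATCH: try each size class in increasing order
    // and pick the first helper whose frame fits argsize.
    func dispatch(argsize uint32) func() {
        classes := []struct {
            max uint32
            fn  func()
        }{
            {32, call32}, {64, call64}, {128, call128},
        }
        for _, c := range classes {
            if argsize <= c.max {
                return c.fn
            }
        }
        return badreflectcall
    }

    func main() {
        dispatch(48)() // picks call64
    }
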
+#define CALLFN(NAME,MAXSIZE)			\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+	NO_LOCAL_POINTERS;			\
+	/* copy arguments to stack */		\
+	MOV	arg+16(FP), A1;			\
+	MOVWU	argsize+24(FP), A2;		\
+	MOV	X2, A3;				\
+	ADD	$8, A3;				\
+	ADD	A3, A2;				\
+	BEQ	A3, A2, 6(PC);			\
+	MOVBU	(A1), A4;			\
+	ADD	$1, A1;				\
+	MOVB	A4, (A3);			\
+	ADD	$1, A3;				\
+	JMP	-5(PC);				\
+	/* call function */			\
+	MOV	f+8(FP), CTXT;			\
+	MOV	(CTXT), A4;			\
+	PCDATA  $PCDATA_StackMapIndex, $0;	\
+	JALR	RA, A4;				\
+	/* copy return values back */		\
+	MOV	argtype+0(FP), A5;		\
+	MOV	arg+16(FP), A1;			\
+	MOVWU	n+24(FP), A2;			\
+	MOVWU	retoffset+28(FP), A4;		\
+	ADD	$8, X2, A3;			\
+	ADD	A4, A3; 			\
+	ADD	A4, A1;				\
+	SUB	A4, A2;				\
+	CALL	callRet<>(SB);			\
+	RET
+
+// callRet copies return values back at the end of call*. This is a
+// separate function so it can allocate stack space for the arguments
+// to reflectcallmove. It does not follow the Go ABI; it expects its
+// arguments in registers.
+TEXT callRet<>(SB), NOSPLIT, $32-0
+	MOV	A5, 8(X2)
+	MOV	A1, 16(X2)
+	MOV	A3, 24(X2)
+	MOV	A2, 32(X2)
+	CALL	runtime·reflectcallmove(SB)
+	RET
+
+CALLFN(·call16, 16)
+CALLFN(·call32, 32)
+CALLFN(·call64, 64)
+CALLFN(·call128, 128)
+CALLFN(·call256, 256)
+CALLFN(·call512, 512)
+CALLFN(·call1024, 1024)
+CALLFN(·call2048, 2048)
+CALLFN(·call4096, 4096)
+CALLFN(·call8192, 8192)
+CALLFN(·call16384, 16384)
+CALLFN(·call32768, 32768)
+CALLFN(·call65536, 65536)
+CALLFN(·call131072, 131072)
+CALLFN(·call262144, 262144)
+CALLFN(·call524288, 524288)
+CALLFN(·call1048576, 1048576)
+CALLFN(·call2097152, 2097152)
+CALLFN(·call4194304, 4194304)
+CALLFN(·call8388608, 8388608)
+CALLFN(·call16777216, 16777216)
+CALLFN(·call33554432, 33554432)
+CALLFN(·call67108864, 67108864)
+CALLFN(·call134217728, 134217728)
+CALLFN(·call268435456, 268435456)
+CALLFN(·call536870912, 536870912)
+CALLFN(·call1073741824, 1073741824)
+
+// func goexit(neverCallThisFunction)
+// The top-most function running on a goroutine
+// returns to goexit+PCQuantum.
+TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0
+	MOV	ZERO, ZERO	// NOP
+	JMP	runtime·goexit1(SB)	// does not return
+	// traceback from goexit1 must hit code range of goexit
+	MOV	ZERO, ZERO	// NOP
+
+// func cgocallback_gofunc(fv uintptr, frame uintptr, framesize, ctxt uintptr)
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32
+	// TODO(jsing): Add support for cgo - issue #36641.
+	WORD $0		// crash
+
+TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0
+	EBREAK
+	RET
+
+TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
+	EBREAK
+	RET
+
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOV	gg+0(FP), g
+	// This only happens if iscgo, so jump straight to save_g
+	CALL	runtime·save_g(SB)
+	RET
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOV	$1, T0
+	MOV	T0, ret+0(FP)
+	RET
+
+// gcWriteBarrier performs a heap pointer write and informs the GC.
+//
+// gcWriteBarrier does NOT follow the Go ABI. It takes two arguments:
+// - T0 is the destination of the write
+// - T1 is the value being written at T0.
+// It clobbers R30 (the linker temp register - REG_TMP).
+// The act of CALLing gcWriteBarrier will clobber RA (LR).
+// It does not clobber any other general-purpose registers,
+// but may clobber others (e.g., floating point registers).
+TEXT runtime·gcWriteBarrier(SB),NOSPLIT,$296
+	// Save the registers clobbered by the fast path.
+	MOV	A0, 280(X2)
+	MOV	A1, 288(X2)
+	MOV	g_m(g), A0
+	MOV	m_p(A0), A0
+	MOV	(p_wbBuf+wbBuf_next)(A0), A1
+	// Increment wbBuf.next position.
+	ADD	$16, A1
+	MOV	A1, (p_wbBuf+wbBuf_next)(A0)
+	MOV	(p_wbBuf+wbBuf_end)(A0), A0
+	MOV	A0, T6		// T6 is linker temp register (REG_TMP)
+	// Record the write.
+	MOV	T1, -16(A1)	// Record value
+	MOV	(T0), A0	// TODO: This turns bad writes into bad reads.
+	MOV	A0, -8(A1)	// Record *slot
+	// Is the buffer full?
+	BEQ	A1, T6, flush
+ret:
+	MOV	280(X2), A0
+	MOV	288(X2), A1
+	// Do the write.
+	MOV	T1, (T0)
+	RET
+
+flush:
+	// Save all general purpose registers since these could be
+	// clobbered by wbBufFlush and were not saved by the caller.
+	MOV	T0, 8(X2)	// Also first argument to wbBufFlush
+	MOV	T1, 16(X2)	// Also second argument to wbBufFlush
+
+	// TODO: Optimise
+	// R3 is g.
+	// R4 already saved (T0)
+	// R5 already saved (T1)
+	// R9 already saved (A0)
+	// R10 already saved (A1)
+	// R30 is tmp register.
+	MOV	X0, 24(X2)
+	MOV	X1, 32(X2)
+	MOV	X2, 40(X2)
+	MOV	X3, 48(X2)
+	MOV	X4, 56(X2)
+	MOV	X5, 64(X2)
+	MOV	X6, 72(X2)
+	MOV	X7, 80(X2)
+	MOV	X8, 88(X2)
+	MOV	X9, 96(X2)
+	MOV	X10, 104(X2)
+	MOV	X11, 112(X2)
+	MOV	X12, 120(X2)
+	MOV	X13, 128(X2)
+	MOV	X14, 136(X2)
+	MOV	X15, 144(X2)
+	MOV	X16, 152(X2)
+	MOV	X17, 160(X2)
+	MOV	X18, 168(X2)
+	MOV	X19, 176(X2)
+	MOV	X20, 184(X2)
+	MOV	X21, 192(X2)
+	MOV	X22, 200(X2)
+	MOV	X23, 208(X2)
+	MOV	X24, 216(X2)
+	MOV	X25, 224(X2)
+	MOV	X26, 232(X2)
+	MOV	X27, 240(X2)
+	MOV	X28, 248(X2)
+	MOV	X29, 256(X2)
+	MOV	X30, 264(X2)
+	MOV	X31, 272(X2)
+
+	// This takes arguments T0 and T1.
+	CALL	runtime·wbBufFlush(SB)
+
+	MOV	24(X2), X0
+	MOV	32(X2), X1
+	MOV	40(X2), X2
+	MOV	48(X2), X3
+	MOV	56(X2), X4
+	MOV	64(X2), X5
+	MOV	72(X2), X6
+	MOV	80(X2), X7
+	MOV	88(X2), X8
+	MOV	96(X2), X9
+	MOV	104(X2), X10
+	MOV	112(X2), X11
+	MOV	120(X2), X12
+	MOV	128(X2), X13
+	MOV	136(X2), X14
+	MOV	144(X2), X15
+	MOV	152(X2), X16
+	MOV	160(X2), X17
+	MOV	168(X2), X18
+	MOV	176(X2), X19
+	MOV	184(X2), X20
+	MOV	192(X2), X21
+	MOV	200(X2), X22
+	MOV	208(X2), X23
+	MOV	216(X2), X24
+	MOV	224(X2), X25
+	MOV	232(X2), X26
+	MOV	240(X2), X27
+	MOV	248(X2), X28
+	MOV	256(X2), X29
+	MOV	264(X2), X30
+	MOV	272(X2), X31
+
+	JMP	ret
+
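
The fast path above appends a (value, old contents of the slot) pair to the per-P write-barrier buffer, calls wbBufFlush when the buffer fills, and only then performs the actual store. A toy, self-contained Go model of that buffering scheme (names and types are illustrative, not the runtime's):

    package main

    // wbBuf is a toy stand-in for the per-P write-barrier buffer.
    type wbBuf struct {
        entries []uintptr
        next    int
    }

    // write models one buffered pointer store: record (new value, old value),
    // flush if the buffer is full, then do the write itself.
    func (b *wbBuf) write(slot *uintptr, val uintptr) {
        b.entries[b.next] = val     // record the value being written
        b.entries[b.next+1] = *slot // record the old contents of the slot
        b.next += 2
        if b.next == len(b.entries) {
            b.flush() // the real runtime calls runtime.wbBufFlush here
        }
        *slot = val
    }

    func (b *wbBuf) flush() {
        // The real flush hands the recorded pointers to the garbage
        // collector; the toy just resets the buffer.
        b.next = 0
    }

    func main() {
        var slots [4]uintptr
        b := &wbBuf{entries: make([]uintptr, 8)}
        for i := range slots {
            b.write(&slots[i], uintptr(i+1))
        }
    }
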
+// Note: these functions use a special calling convention to save generated code space.
+// Arguments are passed in registers, but the space for those arguments is allocated
+// in the caller's stack frame. These stubs write the args into that stack space and
+// then tail call to the corresponding runtime handler.
+// The tail call makes these stubs disappear in backtraces.
+TEXT runtime·panicIndex(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicIndex(SB)
+TEXT runtime·panicIndexU(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicIndexU(SB)
+TEXT runtime·panicSliceAlen(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSliceAlen(SB)
+TEXT runtime·panicSliceAlenU(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSliceAlenU(SB)
+TEXT runtime·panicSliceAcap(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSliceAcap(SB)
+TEXT runtime·panicSliceAcapU(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSliceAcapU(SB)
+TEXT runtime·panicSliceB(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicSliceB(SB)
+TEXT runtime·panicSliceBU(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicSliceBU(SB)
+TEXT runtime·panicSlice3Alen(SB),NOSPLIT,$0-16
+	MOV	T2, x+0(FP)
+	MOV	T3, y+8(FP)
+	JMP	runtime·goPanicSlice3Alen(SB)
+TEXT runtime·panicSlice3AlenU(SB),NOSPLIT,$0-16
+	MOV	T2, x+0(FP)
+	MOV	T3, y+8(FP)
+	JMP	runtime·goPanicSlice3AlenU(SB)
+TEXT runtime·panicSlice3Acap(SB),NOSPLIT,$0-16
+	MOV	T2, x+0(FP)
+	MOV	T3, y+8(FP)
+	JMP	runtime·goPanicSlice3Acap(SB)
+TEXT runtime·panicSlice3AcapU(SB),NOSPLIT,$0-16
+	MOV	T2, x+0(FP)
+	MOV	T3, y+8(FP)
+	JMP	runtime·goPanicSlice3AcapU(SB)
+TEXT runtime·panicSlice3B(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSlice3B(SB)
+TEXT runtime·panicSlice3BU(SB),NOSPLIT,$0-16
+	MOV	T1, x+0(FP)
+	MOV	T2, y+8(FP)
+	JMP	runtime·goPanicSlice3BU(SB)
+TEXT runtime·panicSlice3C(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicSlice3C(SB)
+TEXT runtime·panicSlice3CU(SB),NOSPLIT,$0-16
+	MOV	T0, x+0(FP)
+	MOV	T1, y+8(FP)
+	JMP	runtime·goPanicSlice3CU(SB)
+
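
These stubs let the compiler's inline bounds checks hand the offending index and length (already sitting in registers) to the Go-level goPanic* helpers without building an argument frame at every check site. An ordinary out-of-range access such as the one below ends up in the panicIndex stub with the index and length as its two values (illustrative program, not part of the patch):

    package main

    func main() {
        s := []int{1, 2, 3}
        i := 5
        // The failed bounds check calls the panicIndex stub, which forwards
        // i and len(s) to goPanicIndex, producing:
        //   panic: runtime error: index out of range [5] with length 3
        _ = s[i]
    }
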
+DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
+GLOBL	runtime·mainPC(SB),RODATA,$8
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index ff3caf7..cb39451 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -740,14 +740,14 @@
 	RET
 
 // AES hashing not implemented for s390x
-TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R15
-TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R15
-TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R15
-TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
-	MOVW	(R0), R15
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback(SB)
 
 TEXT runtime·return0(SB), NOSPLIT, $0
 	MOVW	$0, R3
diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s
index 8f3964f..7d88beb 100644
--- a/src/runtime/asm_wasm.s
+++ b/src/runtime/asm_wasm.s
@@ -176,6 +176,16 @@
 TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
 	RET
 
+// AES hashing not implemented for wasm
+TEXT runtime·memhash(SB),NOSPLIT|NOFRAME,$0-32
+	JMP	runtime·memhashFallback(SB)
+TEXT runtime·strhash(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·strhashFallback(SB)
+TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash32Fallback(SB)
+TEXT runtime·memhash64(SB),NOSPLIT|NOFRAME,$0-24
+	JMP	runtime·memhash64Fallback(SB)
+
 TEXT runtime·return0(SB), NOSPLIT, $0-0
 	MOVD $0, RET0
 	RET
diff --git a/src/runtime/atomic_riscv64.s b/src/runtime/atomic_riscv64.s
new file mode 100644
index 0000000..544a7c5
--- /dev/null
+++ b/src/runtime/atomic_riscv64.s
@@ -0,0 +1,10 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func publicationBarrier()
+TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
+	FENCE
+	RET
diff --git a/src/runtime/callers_test.go b/src/runtime/callers_test.go
index ad83f99..3cf3fbe 100644
--- a/src/runtime/callers_test.go
+++ b/src/runtime/callers_test.go
@@ -5,25 +5,26 @@
 package runtime_test
 
 import (
+	"reflect"
 	"runtime"
 	"strings"
 	"testing"
 )
 
 func f1(pan bool) []uintptr {
-	return f2(pan) // line 14
+	return f2(pan) // line 15
 }
 
 func f2(pan bool) []uintptr {
-	return f3(pan) // line 18
+	return f3(pan) // line 19
 }
 
 func f3(pan bool) []uintptr {
 	if pan {
-		panic("f3") // line 23
+		panic("f3") // line 24
 	}
 	ret := make([]uintptr, 20)
-	return ret[:runtime.Callers(0, ret)] // line 26
+	return ret[:runtime.Callers(0, ret)] // line 27
 }
 
 func testCallers(t *testing.T, pcs []uintptr, pan bool) {
@@ -47,16 +48,16 @@
 
 	var f3Line int
 	if pan {
-		f3Line = 23
+		f3Line = 24
 	} else {
-		f3Line = 26
+		f3Line = 27
 	}
 	want := []struct {
 		name string
 		line int
 	}{
-		{"f1", 14},
-		{"f2", 18},
+		{"f1", 15},
+		{"f2", 19},
 		{"f3", f3Line},
 	}
 	for _, w := range want {
@@ -66,11 +67,35 @@
 	}
 }
 
+func testCallersEqual(t *testing.T, pcs []uintptr, want []string) {
+	t.Helper()
+
+	got := make([]string, 0, len(want))
+
+	frames := runtime.CallersFrames(pcs)
+	for {
+		frame, more := frames.Next()
+		if !more || len(got) >= len(want) {
+			break
+		}
+		got = append(got, frame.Function)
+	}
+	if !reflect.DeepEqual(want, got) {
+		t.Fatalf("wanted %v, got %v", want, got)
+	}
+}
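
testCallersEqual walks the PC slice with runtime.CallersFrames, the supported way to turn runtime.Callers output into function names (it expands inlined frames, which resolving each PC individually with FuncForPC would not). A minimal standalone use of the same API:

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        pcs := make([]uintptr, 16)
        n := runtime.Callers(0, pcs) // skip=0 starts at runtime.Callers itself
        frames := runtime.CallersFrames(pcs[:n])
        for {
            frame, more := frames.Next()
            fmt.Println(frame.Function)
            if !more {
                break
            }
        }
    }
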
+
 func TestCallers(t *testing.T) {
 	testCallers(t, f1(false), false)
 }
 
 func TestCallersPanic(t *testing.T) {
+	// Make sure we don't have any extra frames on the stack (due to
+	// open-coded defer processing)
+	want := []string{"runtime.Callers", "runtime_test.TestCallersPanic.func1",
+		"runtime.gopanic", "runtime_test.f3", "runtime_test.f2", "runtime_test.f1",
+		"runtime_test.TestCallersPanic"}
+
 	defer func() {
 		if r := recover(); r == nil {
 			t.Fatal("did not panic")
@@ -78,6 +103,209 @@
 		pcs := make([]uintptr, 20)
 		pcs = pcs[:runtime.Callers(0, pcs)]
 		testCallers(t, pcs, true)
+		testCallersEqual(t, pcs, want)
 	}()
 	f1(true)
 }
+
+func TestCallersDoublePanic(t *testing.T) {
+	// Make sure we don't have any extra frames on the stack (due to
+	// open-coded defer processing)
+	want := []string{"runtime.Callers", "runtime_test.TestCallersDoublePanic.func1.1",
+		"runtime.gopanic", "runtime_test.TestCallersDoublePanic.func1", "runtime.gopanic", "runtime_test.TestCallersDoublePanic"}
+
+	defer func() {
+		defer func() {
+			pcs := make([]uintptr, 20)
+			pcs = pcs[:runtime.Callers(0, pcs)]
+			if recover() == nil {
+				t.Fatal("did not panic")
+			}
+			testCallersEqual(t, pcs, want)
+		}()
+		if recover() == nil {
+			t.Fatal("did not panic")
+		}
+		panic(2)
+	}()
+	panic(1)
+}
+
+// Test that a defer after a successful recovery looks like it is called directly
+// from the function with the defers.
+func TestCallersAfterRecovery(t *testing.T) {
+	want := []string{"runtime.Callers", "runtime_test.TestCallersAfterRecovery.func1", "runtime_test.TestCallersAfterRecovery"}
+
+	defer func() {
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+	}()
+	defer func() {
+		if recover() == nil {
+			t.Fatal("did not recover from panic")
+		}
+	}()
+	panic(1)
+}
+
+func TestCallersAbortedPanic(t *testing.T) {
+	want := []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic.func2", "runtime_test.TestCallersAbortedPanic"}
+
+	defer func() {
+		r := recover()
+		if r != nil {
+			t.Fatalf("should be no panic remaining to recover")
+		}
+	}()
+
+	defer func() {
+		// panic2 was aborted/replaced by panic1, so when panic2 was
+		// recovered, there is no remaining panic on the stack.
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+	}()
+	defer func() {
+		r := recover()
+		if r != "panic2" {
+			t.Fatalf("got %v, wanted %v", r, "panic2")
+		}
+	}()
+	defer func() {
+		// panic2 aborts/replaces panic1, because it is a recursive panic
+		// that is not recovered within the defer function called by
+		// the panic1 panicking sequence.
+		panic("panic2")
+	}()
+	panic("panic1")
+}
+
+func TestCallersAbortedPanic2(t *testing.T) {
+	want := []string{"runtime.Callers", "runtime_test.TestCallersAbortedPanic2.func2", "runtime_test.TestCallersAbortedPanic2"}
+	defer func() {
+		r := recover()
+		if r != nil {
+			t.Fatalf("should be no panic remaining to recover")
+		}
+	}()
+	defer func() {
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+	}()
+	func() {
+		defer func() {
+			r := recover()
+			if r != "panic2" {
+				t.Fatalf("got %v, wanted %v", r, "panic2")
+			}
+		}()
+		func() {
+			defer func() {
+				// Again, panic2 aborts/replaces panic1
+				panic("panic2")
+			}()
+			panic("panic1")
+		}()
+	}()
+}
+
+func TestCallersNilPointerPanic(t *testing.T) {
+	// Make sure we don't have any extra frames on the stack (due to
+	// open-coded defer processing)
+	want := []string{"runtime.Callers", "runtime_test.TestCallersNilPointerPanic.func1",
+		"runtime.gopanic", "runtime.panicmem", "runtime.sigpanic",
+		"runtime_test.TestCallersNilPointerPanic"}
+
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("did not panic")
+		}
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+	}()
+	var p *int
+	if *p == 3 {
+		t.Fatal("did not see nil pointer panic")
+	}
+}
+
+func TestCallersDivZeroPanic(t *testing.T) {
+	// Make sure we don't have any extra frames on the stack (due to
+	// open-coded defer processing)
+	want := []string{"runtime.Callers", "runtime_test.TestCallersDivZeroPanic.func1",
+		"runtime.gopanic", "runtime.panicdivide",
+		"runtime_test.TestCallersDivZeroPanic"}
+
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("did not panic")
+		}
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+	}()
+	var n int
+	if 5/n == 1 {
+		t.Fatal("did not see divide-by-zero panic")
+	}
+}
+
+func TestCallersDeferNilFuncPanic(t *testing.T) {
+	// Make sure we don't have any extra frames on the stack. We cut off the check
+	// at runtime.sigpanic, because non-open-coded defers (which may be used in
+	// non-opt or race checker mode) include an extra 'deferreturn' frame (which is
+	// where the nil pointer deref happens).
+	state := 1
+	want := []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanic.func1",
+		"runtime.gopanic", "runtime.panicmem", "runtime.sigpanic"}
+
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("did not panic")
+		}
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+		if state == 1 {
+			t.Fatal("nil defer func panicked at defer time rather than function exit time")
+		}
+
+	}()
+	var f func()
+	defer f()
+	// Use the value of 'state' to make sure nil defer func f causes panic at
+	// function exit, rather than at the defer statement.
+	state = 2
+}
+
+// Same test, but forcing non-open-coded defer by putting the defer in a loop.  See
+// issue #36050
+func TestCallersDeferNilFuncPanicWithLoop(t *testing.T) {
+	state := 1
+	want := []string{"runtime.Callers", "runtime_test.TestCallersDeferNilFuncPanicWithLoop.func1",
+		"runtime.gopanic", "runtime.panicmem", "runtime.sigpanic", "runtime.deferreturn", "runtime_test.TestCallersDeferNilFuncPanicWithLoop"}
+
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("did not panic")
+		}
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallersEqual(t, pcs, want)
+		if state == 1 {
+			t.Fatal("nil defer func panicked at defer time rather than function exit time")
+		}
+
+	}()
+
+	for i := 0; i < 1; i++ {
+		var f func()
+		defer f()
+	}
+	// Use the value of 'state' to make sure nil defer func f causes panic at
+	// function exit, rather than at the defer statement.
+	state = 2
+}
diff --git a/src/runtime/cgo/asm_nacl_amd64p32.s b/src/runtime/cgo/asm_nacl_amd64p32.s
deleted file mode 100644
index 82aaecd..0000000
--- a/src/runtime/cgo/asm_nacl_amd64p32.s
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- */
-TEXT crosscall2(SB),NOSPLIT,$0
-	INT $3
-	RET
diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go
index 5a2b5e4..c02b837 100644
--- a/src/runtime/cgo/cgo.go
+++ b/src/runtime/cgo/cgo.go
@@ -11,8 +11,7 @@
 
 /*
 
-#cgo darwin,!arm,!arm64 LDFLAGS: -lpthread
-#cgo darwin,arm LDFLAGS: -framework CoreFoundation
+#cgo darwin,!arm64 LDFLAGS: -lpthread
 #cgo darwin,arm64 LDFLAGS: -framework CoreFoundation
 #cgo dragonfly LDFLAGS: -lpthread
 #cgo freebsd LDFLAGS: -lpthread
@@ -23,6 +22,9 @@
 #cgo aix LDFLAGS: -Wl,-berok
 #cgo solaris LDFLAGS: -lxnet
 
+// Issue 35247.
+#cgo darwin CFLAGS: -Wno-nullability-completeness
+
 #cgo CFLAGS: -Wall -Werror
 
 #cgo solaris CPPFLAGS: -D_POSIX_PTHREAD_SEMANTICS
diff --git a/src/runtime/cgo/gcc_android.c b/src/runtime/cgo/gcc_android.c
index 321a515..7ea2135 100644
--- a/src/runtime/cgo/gcc_android.c
+++ b/src/runtime/cgo/gcc_android.c
@@ -35,7 +35,7 @@
 // Truncated to a different magic value on 32-bit; that's ok.
 #define magic1 (0x23581321345589ULL)
 
-// From https://android.googlesource.com/platform/bionic/+/refs/heads/master/libc/private/bionic_asm_tls.h#69.
+// From https://android.googlesource.com/platform/bionic/+/refs/heads/android10-tests-release/libc/private/bionic_asm_tls.h#69.
 #define TLS_SLOT_APP 2
 
 // inittls allocates a thread-local storage slot for g.
diff --git a/src/runtime/cgo/gcc_arm64.S b/src/runtime/cgo/gcc_arm64.S
index 59dce08..9154d2a 100644
--- a/src/runtime/cgo/gcc_arm64.S
+++ b/src/runtime/cgo/gcc_arm64.S
@@ -24,13 +24,28 @@
  */
 .globl EXT(crosscall1)
 EXT(crosscall1):
-	stp x19, x20, [sp, #-16]!
-	stp x21, x22, [sp, #-16]!
-	stp x23, x24, [sp, #-16]!
-	stp x25, x26, [sp, #-16]!
-	stp x27, x28, [sp, #-16]!
-	stp x29, x30, [sp, #-16]!
+	.cfi_startproc
+	stp x29, x30, [sp, #-96]!
+	.cfi_def_cfa_offset 96
+	.cfi_offset 29, -96
+	.cfi_offset 30, -88
 	mov x29, sp
+	.cfi_def_cfa_register 29
+	stp x19, x20, [sp, #80]
+	.cfi_offset 19, -16
+	.cfi_offset 20, -8
+	stp x21, x22, [sp, #64]
+	.cfi_offset 21, -32
+	.cfi_offset 22, -24
+	stp x23, x24, [sp, #48]
+	.cfi_offset 23, -48
+	.cfi_offset 24, -40
+	stp x25, x26, [sp, #32]
+	.cfi_offset 25, -64
+	.cfi_offset 26, -56
+	stp x27, x28, [sp, #16]
+	.cfi_offset 27, -80
+	.cfi_offset 28, -72
 
 	mov x19, x0
 	mov x20, x1
@@ -39,13 +54,27 @@
 	blr x20
 	blr x19
 
-	ldp x29, x30, [sp], #16
-	ldp x27, x28, [sp], #16
-	ldp x25, x26, [sp], #16
-	ldp x23, x24, [sp], #16
-	ldp x21, x22, [sp], #16
-	ldp x19, x20, [sp], #16
+	ldp x27, x28, [sp, #16]
+	.cfi_restore 27
+	.cfi_restore 28
+	ldp x25, x26, [sp, #32]
+	.cfi_restore 25
+	.cfi_restore 26
+	ldp x23, x24, [sp, #48]
+	.cfi_restore 23
+	.cfi_restore 24
+	ldp x21, x22, [sp, #64]
+	.cfi_restore 21
+	.cfi_restore 22
+	ldp x19, x20, [sp, #80]
+	.cfi_restore 19
+	.cfi_restore 20
+	ldp x29, x30, [sp], #96
+	.cfi_restore 29
+	.cfi_restore 30
+	.cfi_def_cfa 31, 0
 	ret
+	.cfi_endproc
 
 
 #ifdef __ELF__
diff --git a/src/runtime/cgo/gcc_darwin_arm.c b/src/runtime/cgo/gcc_darwin_arm.c
deleted file mode 100644
index 205977c..0000000
--- a/src/runtime/cgo/gcc_darwin_arm.c
+++ /dev/null
@@ -1,164 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <limits.h>
-#include <pthread.h>
-#include <signal.h>
-#include <string.h> /* for strerror */
-#include <sys/param.h>
-#include <unistd.h>
-
-#include <CoreFoundation/CFBundle.h>
-#include <CoreFoundation/CFString.h>
-
-#include "libcgo.h"
-#include "libcgo_unix.h"
-
-#define magic (0xe696c4f4U)
-
-// inittls allocates a thread-local storage slot for g.
-//
-// It finds the first available slot using pthread_key_create and uses
-// it as the offset value for runtime.tlsg.
-static void
-inittls(void **tlsg, void **tlsbase)
-{
-	pthread_key_t k;
-	int i, err;
-
-	err = pthread_key_create(&k, nil);
-	if(err != 0) {
-		fprintf(stderr, "runtime/cgo: pthread_key_create failed: %d\n", err);
-		abort();
-	}
-	//fprintf(stderr, "runtime/cgo: k = %d, tlsbase = %p\n", (int)k, tlsbase); // debug
-	pthread_setspecific(k, (void*)magic);
-	// The first key should be at 258.
-	for (i=0; i<PTHREAD_KEYS_MAX; i++) {
-		if (*(tlsbase+i) == (void*)magic) {
-			*tlsg = (void*)(i*sizeof(void *));
-			pthread_setspecific(k, 0);
-			return;
-		}
-	}
-	fprintf(stderr, "runtime/cgo: could not find pthread key.\n");
-	abort();
-}
-
-static void *threadentry(void*);
-static void (*setg_gcc)(void*);
-
-void
-_cgo_sys_thread_start(ThreadStart *ts)
-{
-	pthread_attr_t attr;
-	sigset_t ign, oset;
-	pthread_t p;
-	size_t size;
-	int err;
-
-	sigfillset(&ign);
-	pthread_sigmask(SIG_SETMASK, &ign, &oset);
-
-	pthread_attr_init(&attr);
-	size = 0;
-	pthread_attr_getstacksize(&attr, &size);
-	// Leave stacklo=0 and set stackhi=size; mstart will do the rest.
-	ts->g->stackhi = size;
-	err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
-
-	pthread_sigmask(SIG_SETMASK, &oset, nil);
-
-	if (err != 0) {
-		fprintf(stderr, "runtime/cgo: pthread_create failed: %s\n", strerror(err));
-		abort();
-	}
-}
-
-extern void crosscall_arm1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
-static void*
-threadentry(void *v)
-{
-	ThreadStart ts;
-
-	ts = *(ThreadStart*)v;
-	free(v);
-
-	darwin_arm_init_thread_exception_port();
-
-	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
-	return nil;
-}
-
-// init_working_dir sets the current working directory to the app root.
-// By default darwin/arm processes start in "/".
-static void
-init_working_dir()
-{
-	CFBundleRef bundle = CFBundleGetMainBundle();
-	if (bundle == NULL) {
-		fprintf(stderr, "runtime/cgo: no main bundle\n");
-		return;
-	}
-	CFURLRef url_ref = CFBundleCopyResourceURL(bundle, CFSTR("Info"), CFSTR("plist"), NULL);
-	if (url_ref == NULL) {
-		// No Info.plist found. It can happen on Corellium virtual devices.
-		return;
-	}
-	CFStringRef url_str_ref = CFURLGetString(url_ref);
-	char buf[MAXPATHLEN];
-	Boolean res = CFStringGetCString(url_str_ref, buf, sizeof(buf), kCFStringEncodingUTF8);
-	CFRelease(url_ref);
-	if (!res) {
-		fprintf(stderr, "runtime/cgo: cannot get URL string\n");
-		return;
-	}
-
-	// url is of the form "file:///path/to/Info.plist".
-	// strip it down to the working directory "/path/to".
-	int url_len = strlen(buf);
-	if (url_len < sizeof("file://")+sizeof("/Info.plist")) {
-		fprintf(stderr, "runtime/cgo: bad URL: %s\n", buf);
-		return;
-	}
-	buf[url_len-sizeof("/Info.plist")+1] = 0;
-	char *dir = &buf[0] + sizeof("file://")-1;
-
-	if (chdir(dir) != 0) {
-		fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir);
-	}
-
-	// The test harness in go_darwin_arm_exec passes the relative working directory
-	// in the GoExecWrapperWorkingDirectory property of the app bundle.
-	CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory"));
-	if (wd_ref != NULL) {
-		if (!CFStringGetCString(wd_ref, buf, sizeof(buf), kCFStringEncodingUTF8)) {
-			fprintf(stderr, "runtime/cgo: cannot get GoExecWrapperWorkingDirectory string\n");
-			return;
-		}
-		if (chdir(buf) != 0) {
-			fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", buf);
-		}
-	}
-}
-
-void
-x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
-{
-	pthread_attr_t attr;
-	size_t size;
-
-	setg_gcc = setg;
-	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
-	g->stacklo = (uintptr)&attr - size + 4096;
-	pthread_attr_destroy(&attr);
-
-	// yes, tlsbase from mrc might not be correctly aligned.
-	inittls(tlsg, (void**)((uintptr)tlsbase & ~3));
-
-	darwin_arm_init_mach_exception_handler();
-	darwin_arm_init_thread_exception_port();
-	init_working_dir();
-}
diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c
index e6e3057..fd7d408 100644
--- a/src/runtime/cgo/gcc_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_darwin_arm64.c
@@ -94,7 +94,7 @@
 }
 
 // init_working_dir sets the current working directory to the app root.
-// By default darwin/arm processes start in "/".
+// By default darwin/arm64 processes start in "/".
 static void
 init_working_dir()
 {
diff --git a/src/runtime/cgo/gcc_darwin_386.c b/src/runtime/cgo/gcc_freebsd_arm64.c
similarity index 74%
rename from src/runtime/cgo/gcc_darwin_386.c
rename to src/runtime/cgo/gcc_freebsd_arm64.c
index 501457a..dd8f888 100644
--- a/src/runtime/cgo/gcc_darwin_386.c
+++ b/src/runtime/cgo/gcc_freebsd_arm64.c
@@ -1,28 +1,32 @@
-// Copyright 2009 The Go Authors. All rights reserved.
+// Copyright 2019 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include <string.h> /* for strerror */
+#include <sys/types.h>
+#include <errno.h>
+#include <sys/signalvar.h>
 #include <pthread.h>
 #include <signal.h>
+#include <string.h>
 #include "libcgo.h"
 #include "libcgo_unix.h"
 
 static void* threadentry(void*);
+static void (*setg_gcc)(void*);
 
 void
-x_cgo_init(G *g)
+x_cgo_init(G *g, void (*setg)(void*))
 {
 	pthread_attr_t attr;
 	size_t size;
 
+	setg_gcc = setg;
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
 	g->stacklo = (uintptr)&attr - size + 4096;
 	pthread_attr_destroy(&attr);
 }
 
-
 void
 _cgo_sys_thread_start(ThreadStart *ts)
 {
@@ -32,7 +36,7 @@
 	size_t size;
 	int err;
 
-	sigfillset(&ign);
+	SIGFILLSET(ign);
 	pthread_sigmask(SIG_SETMASK, &ign, &oset);
 
 	pthread_attr_init(&attr);
@@ -49,6 +53,8 @@
 	}
 }
 
+extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
+
 static void*
 threadentry(void *v)
 {
@@ -57,10 +63,6 @@
 	ts = *(ThreadStart*)v;
 	free(v);
 
-	// Move the g pointer into the slot reserved in thread local storage.
-	// Constant must match the one in cmd/link/internal/ld/sym.go.
-	asm volatile("movl %0, %%gs:0x18" :: "r"(ts.g));
-
-	crosscall_386(ts.fn);
+	crosscall1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/runtime/cgo/gcc_setenv.c b/src/runtime/cgo/gcc_setenv.c
index 88e92bf..d4f7983 100644
--- a/src/runtime/cgo/gcc_setenv.c
+++ b/src/runtime/cgo/gcc_setenv.c
@@ -20,9 +20,9 @@
 
 /* Stub for calling unsetenv */
 void
-x_cgo_unsetenv(char *arg)
+x_cgo_unsetenv(char **arg)
 {
 	_cgo_tsan_acquire();
-	unsetenv(arg);
+	unsetenv(arg[0]);
 	_cgo_tsan_release();
 }
diff --git a/src/runtime/cgo/gcc_signal2_darwin_armx.c b/src/runtime/cgo/gcc_signal2_darwin_arm64.c
similarity index 79%
rename from src/runtime/cgo/gcc_signal2_darwin_armx.c
rename to src/runtime/cgo/gcc_signal2_darwin_arm64.c
index 54b7e32..5b8a18f 100644
--- a/src/runtime/cgo/gcc_signal2_darwin_armx.c
+++ b/src/runtime/cgo/gcc_signal2_darwin_arm64.c
@@ -3,10 +3,8 @@
 // license that can be found in the LICENSE file.
 
 // +build lldb
-// +build darwin
-// +build arm arm64
 
-// Used by gcc_signal_darwin_armx.c when doing the test build during cgo.
+// Used by gcc_signal_darwin_arm64.c when doing the test build during cgo.
 // We hope that for real binaries the definition provided by Go will take precedence
 // and the linker will drop this .o file altogether, which is why this definition
 // is all by itself in its own file.
diff --git a/src/runtime/cgo/gcc_signal_darwin_armx.c b/src/runtime/cgo/gcc_signal_darwin_arm64.c
similarity index 98%
rename from src/runtime/cgo/gcc_signal_darwin_armx.c
rename to src/runtime/cgo/gcc_signal_darwin_arm64.c
index 3ab1d8b..6519edd 100644
--- a/src/runtime/cgo/gcc_signal_darwin_armx.c
+++ b/src/runtime/cgo/gcc_signal_darwin_arm64.c
@@ -18,8 +18,6 @@
 // The dist tool enables this by build flag when testing.
 
 // +build lldb
-// +build darwin
-// +build arm arm64
 
 #include <limits.h>
 #include <pthread.h>
diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_darwin_lldb.c
index 54d91f6..0ccdae3 100644
--- a/src/runtime/cgo/gcc_signal_darwin_lldb.c
+++ b/src/runtime/cgo/gcc_signal_darwin_lldb.c
@@ -4,7 +4,7 @@
 
 // +build !lldb
 // +build darwin
-// +build arm arm64
+// +build arm64
 
 #include <stdint.h>
 
diff --git a/src/runtime/cgo/signal_darwin_arm.s b/src/runtime/cgo/signal_darwin_arm.s
deleted file mode 100644
index 0be10c0..0000000
--- a/src/runtime/cgo/signal_darwin_arm.s
+++ /dev/null
@@ -1,49 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// xx_cgo_panicmem is the entrypoint for SIGSEGV as intercepted via a
-// mach thread port as EXC_BAD_ACCESS. As the segfault may have happened
-// in C code, we first need to load_g then call xx_cgo_panicmem.
-//
-//	R1 - LR at moment of fault
-//	R2 - PC at moment of fault
-TEXT xx_cgo_panicmem(SB),NOSPLIT|NOFRAME,$0
-	// If in external C code, we need to load the g register.
-	BL  runtime·load_g(SB)
-	CMP $0, g
-	BNE ongothread
-
-	// On a foreign thread. We call badsignal, which will, if all
-	// goes according to plan, not return.
-	SUB  $4, R13
-	MOVW $11, R1
-	MOVW $11, R2
-	MOVM.DB.W [R1,R2], (R13)
-	// TODO: badsignal should not return, but it does. Issue #10139.
-	//BL runtime·badsignal(SB)
-	MOVW $139, R1
-	MOVW R1, 4(R13)
-	B    runtime·exit(SB)
-
-ongothread:
-	// Trigger a SIGSEGV panic.
-	//
-	// The goal is to arrange the stack so it looks like the runtime
-	// function sigpanic was called from the PC that faulted. It has
-	// to be sigpanic, as the stack unwinding code in traceback.go
-	// looks explicitly for it.
-	//
-	// To do this we call into runtime·setsigsegv, which sets the
-	// appropriate state inside the g object. We give it the faulting
-	// PC on the stack, then put it in the LR before calling sigpanic.
-	MOVM.DB.W [R1,R2], (R13)
-	BL runtime·setsigsegv(SB)
-	MOVM.IA.W (R13), [R1,R2]
-
-	SUB $4, R13
-	MOVW R1, 0(R13)
-	MOVW R2, R14
-	B runtime·sigpanic(SB)
diff --git a/src/runtime/cgo/signal_darwin_armx.go b/src/runtime/cgo/signal_darwin_arm64.go
similarity index 87%
rename from src/runtime/cgo/signal_darwin_armx.go
rename to src/runtime/cgo/signal_darwin_arm64.go
index 9f4b462..3425c44 100644
--- a/src/runtime/cgo/signal_darwin_armx.go
+++ b/src/runtime/cgo/signal_darwin_arm64.go
@@ -2,9 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin
-// +build arm arm64
-
 package cgo
 
 import _ "unsafe"
diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go
index 048621f..d5e0cc1 100644
--- a/src/runtime/cgo_mmap.go
+++ b/src/runtime/cgo_mmap.go
@@ -20,6 +20,11 @@
 //go:linkname _cgo_munmap _cgo_munmap
 var _cgo_munmap unsafe.Pointer
 
+// mmap is used to route the mmap system call through C code when using cgo, to
+// support sanitizer interceptors. Don't allow stack splits, since this function
+// (used by sysAlloc) is called in a lot of low-level parts of the runtime and
+// callers often assume it won't acquire any locks.
+//go:nosplit
 func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) {
 	if _cgo_mmap != nil {
 		// Make ret a uintptr so that writing to it in the
diff --git a/src/runtime/cgo_sigaction.go b/src/runtime/cgo_sigaction.go
index bc5e078..de634dc 100644
--- a/src/runtime/cgo_sigaction.go
+++ b/src/runtime/cgo_sigaction.go
@@ -18,12 +18,12 @@
 //go:nosplit
 //go:nowritebarrierrec
 func sigaction(sig uint32, new, old *sigactiont) {
-	// The runtime package is explicitly blacklisted from sanitizer
-	// instrumentation in racewalk.go, but we might be calling into instrumented C
-	// functions here — so we need the pointer parameters to be properly marked.
+	// racewalk.go avoids adding sanitizing instrumentation to package runtime,
+	// but we might be calling into instrumented C functions here,
+	// so we need the pointer parameters to be properly marked.
 	//
-	// Mark the input as having been written before the call and the output as
-	// read after.
+	// Mark the input as having been written before the call
+	// and the output as read after.
 	if msanenabled && new != nil {
 		msanwrite(unsafe.Pointer(new), unsafe.Sizeof(*new))
 	}
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index a881ae1..a4e64b0 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -90,6 +90,11 @@
 type cgoCallers [32]uintptr
 
 // Call from Go to C.
+//
+// This must be nosplit because it's used for syscalls on some
+// platforms. Syscalls may have untyped arguments on the stack, so
+// it's not safe to grow or scan the stack.
+//
 //go:nosplit
 func cgocall(fn, arg unsafe.Pointer) int32 {
 	if !iscgo && GOOS != "solaris" && GOOS != "illumos" && GOOS != "windows" {
@@ -127,6 +132,13 @@
 	// saved by entersyscall here.
 	entersyscall()
 
+	// Tell asynchronous preemption that we're entering external
+	// code. We do this after entersyscall because this may block
+	// and cause an async preemption to fail, but at this point a
+	// sync preemption will succeed (though this is not a matter
+	// of correctness).
+	osPreemptExtEnter(mp)
+
 	mp.incgo = true
 	errno := asmcgocall(fn, arg)
 
@@ -135,6 +147,8 @@
 	mp.incgo = false
 	mp.ncgo--
 
+	osPreemptExtExit(mp)
+
 	exitsyscall()
 
 	// Note that raceacquire must be called only after exitsyscall has
@@ -188,12 +202,16 @@
 	exitsyscall() // coming out of cgo call
 	gp.m.incgo = false
 
+	osPreemptExtExit(gp.m)
+
 	cgocallbackg1(ctxt)
 
 	// At this point unlockOSThread has been called.
 	// The following code must not change to a different m.
 	// This is enforced by checking incgo in the schedule function.
 
+	osPreemptExtEnter(gp.m)
+
 	gp.m.incgo = true
 	// going back to cgo call
 	reentersyscall(savedpc, uintptr(savedsp))
@@ -352,6 +370,7 @@
 		if mp.ncgo > 0 {
 			mp.incgo = false
 			mp.ncgo--
+			osPreemptExtExit(mp)
 		}
 
 		releasem(mp)
@@ -406,24 +425,24 @@
 
 // cgoCheckPointer checks if the argument contains a Go pointer that
 // points to a Go pointer, and panics if it does.
-func cgoCheckPointer(ptr interface{}, args ...interface{}) {
+func cgoCheckPointer(ptr interface{}, arg interface{}) {
 	if debug.cgocheck == 0 {
 		return
 	}
 
-	ep := (*eface)(unsafe.Pointer(&ptr))
+	ep := efaceOf(&ptr)
 	t := ep._type
 
 	top := true
-	if len(args) > 0 && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) {
+	if arg != nil && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) {
 		p := ep.data
 		if t.kind&kindDirectIface == 0 {
 			p = *(*unsafe.Pointer)(p)
 		}
-		if !cgoIsGoPointer(p) {
+		if p == nil || !cgoIsGoPointer(p) {
 			return
 		}
-		aep := (*eface)(unsafe.Pointer(&args[0]))
+		aep := efaceOf(&arg)
 		switch aep._type.kind & kindMask {
 		case kindBool:
 			if t.kind&kindMask == kindUnsafePointer {
@@ -460,7 +479,7 @@
 // depending on indir. The top parameter is whether we are at the top
 // level, where Go pointers are allowed.
 func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) {
-	if t.ptrdata == 0 {
+	if t.ptrdata == 0 || p == nil {
 		// If the type has no pointers there is nothing to do.
 		return
 	}
@@ -517,7 +536,7 @@
 		st := (*slicetype)(unsafe.Pointer(t))
 		s := (*slice)(p)
 		p = s.array
-		if !cgoIsGoPointer(p) {
+		if p == nil || !cgoIsGoPointer(p) {
 			return
 		}
 		if !top {
@@ -548,11 +567,17 @@
 			return
 		}
 		for _, f := range st.fields {
+			if f.typ.ptrdata == 0 {
+				continue
+			}
 			cgoCheckArg(f.typ, add(p, f.offset()), true, top, msg)
 		}
 	case kindPtr, kindUnsafePointer:
 		if indir {
 			p = *(*unsafe.Pointer)(p)
+			if p == nil {
+				return
+			}
 		}
 
 		if !cgoIsGoPointer(p) {
@@ -644,7 +669,7 @@
 		return
 	}
 
-	ep := (*eface)(unsafe.Pointer(&val))
+	ep := efaceOf(&val)
 	t := ep._type
 	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail)
 }
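
cgoCheckPointer is what the cgo-generated stubs call (with the default GODEBUG=cgocheck=1) to enforce the pointer-passing rules: a Go pointer passed to C must not point at memory that itself contains Go pointers. A small illustrative cgo program that trips the check (the C function f is a placeholder, not part of the patch):

    package main

    /*
    static void f(void *p) {}
    */
    import "C"

    import "unsafe"

    type holder struct {
        p *int
    }

    func main() {
        n := 1
        h := &holder{p: &n}
        // h is a Go pointer to memory containing another Go pointer, so the
        // generated stub's cgoCheckPointer call panics with
        //   panic: runtime error: cgo argument has Go pointer to Go pointer
        C.f(unsafe.Pointer(h))
    }
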
diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go
index ed854e5..516045c 100644
--- a/src/runtime/cgocheck.go
+++ b/src/runtime/cgocheck.go
@@ -76,23 +76,24 @@
 	cgoCheckTypedBlock(typ, src, off, size)
 }
 
-// cgoCheckSliceCopy is called when copying n elements of a slice from
-// src to dst.  typ is the element type of the slice.
+// cgoCheckSliceCopy is called when copying n elements of a slice.
+// src and dst are pointers to the first element of the slice.
+// typ is the element type of the slice.
 // It throws if the program is copying slice elements that contain Go pointers
 // into non-Go memory.
 //go:nosplit
 //go:nowritebarrier
-func cgoCheckSliceCopy(typ *_type, dst, src slice, n int) {
+func cgoCheckSliceCopy(typ *_type, dst, src unsafe.Pointer, n int) {
 	if typ.ptrdata == 0 {
 		return
 	}
-	if !cgoIsGoPointer(src.array) {
+	if !cgoIsGoPointer(src) {
 		return
 	}
-	if cgoIsGoPointer(dst.array) {
+	if cgoIsGoPointer(dst) {
 		return
 	}
-	p := src.array
+	p := src
 	for i := 0; i < n; i++ {
 		cgoCheckTypedBlock(typ, p, 0, typ.size)
 		p = add(p, typ.size)
@@ -133,7 +134,7 @@
 	}
 
 	s := spanOfUnchecked(uintptr(src))
-	if s.state == mSpanManual {
+	if s.state.get() == mSpanManual {
 		// There are no heap bits for value stored on the stack.
 		// For a channel receive src might be on the stack of some
 		// other goroutine, so we can't unwind the stack even if
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index 8334c1e..f6f4ffd 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -109,9 +109,10 @@
 	c.elemsize = uint16(elem.size)
 	c.elemtype = elem
 	c.dataqsiz = uint(size)
+	lockInit(&c.lock, lockRankHchan)
 
 	if debugChan {
-		print("makechan: chan=", c, "; elemsize=", elem.size, "; elemalg=", elem.alg, "; dataqsiz=", size, "\n")
+		print("makechan: chan=", c, "; elemsize=", elem.size, "; dataqsiz=", size, "\n")
 	}
 	return c
 }
@@ -121,6 +122,21 @@
 	return add(c.buf, uintptr(i)*uintptr(c.elemsize))
 }
 
+// full reports whether a send on c would block (that is, the channel is full).
+// It uses a single word-sized read of mutable state, so although
+// the answer is instantaneously true, the correct answer may have changed
+// by the time the calling function receives the return value.
+func full(c *hchan) bool {
+	// c.dataqsiz is immutable (never written after the channel is created)
+	// so it is safe to read at any time during channel operation.
+	if c.dataqsiz == 0 {
+		// Assumes that a pointer read is relaxed-atomic.
+		return c.recvq.first == nil
+	}
+	// Assumes that a uint read is relaxed-atomic.
+	return c.qcount == c.dataqsiz
+}
+
 // entry point for c <- x from compiled code
 //go:nosplit
 func chansend1(c *hchan, elem unsafe.Pointer) {
@@ -160,7 +176,7 @@
 	//
 	// After observing that the channel is not closed, we observe that the channel is
 	// not ready for sending. Each of these observations is a single word-sized read
-	// (first c.closed and second c.recvq.first or c.qcount depending on kind of channel).
+	// (first c.closed and second full()).
 	// Because a closed channel cannot transition from 'ready for sending' to
 	// 'not ready for sending', even if the channel is closed between the two observations,
 	// they imply a moment between the two when the channel was both not yet closed
@@ -169,9 +185,10 @@
 	//
 	// It is okay if the reads are reordered here: if we observe that the channel is not
 	// ready for sending and then observe that it is not closed, that implies that the
-	// channel wasn't closed during the first observation.
-	if !block && c.closed == 0 && ((c.dataqsiz == 0 && c.recvq.first == nil) ||
-		(c.dataqsiz > 0 && c.qcount == c.dataqsiz)) {
+	// channel wasn't closed during the first observation. However, nothing here
+	// guarantees forward progress. We rely on the side effects of lock release in
+	// chanrecv() and closechan() to update this thread's view of c.closed and full().
+	if !block && c.closed == 0 && full(c) {
 		return false
 	}
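
In practice this fast path is what makes a non-blocking send on a full channel cheap: the select/default form below compiles into a chansend call with block=false, which now bails out as soon as full(c) reports true, without taking the channel lock (illustrative program, not part of the patch):

    package main

    import "fmt"

    func main() {
        ch := make(chan int, 1)
        ch <- 1 // buffer is now full
        select {
        case ch <- 2:
            fmt.Println("sent")
        default:
            // chansend(block=false) sees c.closed == 0 && full(c)
            // and returns false immediately.
            fmt.Println("would block")
        }
    }
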
 
@@ -233,7 +250,7 @@
 	gp.waiting = mysg
 	gp.param = nil
 	c.sendq.enqueue(mysg)
-	goparkunlock(&c.lock, waitReasonChanSend, traceEvGoBlockSend, 3)
+	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanSend, traceEvGoBlockSend, 2)
 	// Ensure the value being sent is kept alive until the
 	// receiver copies it out. The sudog has a pointer to the
 	// stack object, but sudogs aren't considered as roots of the
@@ -245,6 +262,7 @@
 		throw("G waiting list is corrupted")
 	}
 	gp.waiting = nil
+	gp.activeStackChans = false
 	if gp.param == nil {
 		if c.closed == 0 {
 			throw("chansend: spurious wakeup")
@@ -400,6 +418,16 @@
 	}
 }
 
+// empty reports whether a read from c would block (that is, the channel is
+// empty).  It uses a single atomic read of mutable state.
+func empty(c *hchan) bool {
+	// c.dataqsiz is immutable.
+	if c.dataqsiz == 0 {
+		return atomic.Loadp(unsafe.Pointer(&c.sendq.first)) == nil
+	}
+	return atomic.Loaduint(&c.qcount) == 0
+}
+
 // entry points for <- c from compiled code
 //go:nosplit
 func chanrecv1(c *hchan, elem unsafe.Pointer) {
@@ -435,21 +463,36 @@
 	}
 
 	// Fast path: check for failed non-blocking operation without acquiring the lock.
-	//
-	// After observing that the channel is not ready for receiving, we observe that the
-	// channel is not closed. Each of these observations is a single word-sized read
-	// (first c.sendq.first or c.qcount, and second c.closed).
-	// Because a channel cannot be reopened, the later observation of the channel
-	// being not closed implies that it was also not closed at the moment of the
-	// first observation. We behave as if we observed the channel at that moment
-	// and report that the receive cannot proceed.
-	//
-	// The order of operations is important here: reversing the operations can lead to
-	// incorrect behavior when racing with a close.
-	if !block && (c.dataqsiz == 0 && c.sendq.first == nil ||
-		c.dataqsiz > 0 && atomic.Loaduint(&c.qcount) == 0) &&
-		atomic.Load(&c.closed) == 0 {
-		return
+	if !block && empty(c) {
+		// After observing that the channel is not ready for receiving, we observe whether the
+		// channel is closed.
+		//
+		// Reordering of these checks could lead to incorrect behavior when racing with a close.
+		// For example, if the channel was open and not empty, was closed, and then drained,
+		// reordered reads could incorrectly indicate "open and empty". To prevent reordering,
+		// we use atomic loads for both checks, and rely on emptying and closing to happen in
+		// separate critical sections under the same lock.  This assumption fails when closing
+		// an unbuffered channel with a blocked send, but that is an error condition anyway.
+		if atomic.Load(&c.closed) == 0 {
+			// Because a channel cannot be reopened, the later observation of the channel
+			// being not closed implies that it was also not closed at the moment of the
+			// first observation. We behave as if we observed the channel at that moment
+			// and report that the receive cannot proceed.
+			return
+		}
+		// The channel is irreversibly closed. Re-check whether the channel has any pending data
+		// to receive, which could have arrived between the empty and closed checks above.
+		// Sequential consistency is also required here, when racing with such a send.
+		if empty(c) {
+			// The channel is irreversibly closed and empty.
+			if raceenabled {
+				raceacquire(c.raceaddr())
+			}
+			if ep != nil {
+				typedmemclr(c.elemtype, ep)
+			}
+			return true, false
+		}
 	}
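
This closed-and-empty branch lets a non-blocking receive on a closed, drained channel complete without acquiring the lock, returning the element's zero value and received == false. The observable semantics it preserves (illustrative program, not part of the patch):

    package main

    import "fmt"

    func main() {
        ch := make(chan int, 1)
        ch <- 42
        close(ch)

        fmt.Println(<-ch) // 42: the buffered value is still delivered

        select {
        case v, ok := <-ch:
            // chanrecv(block=false) takes the fast path above: the channel
            // is closed and empty, so it clears the destination and returns
            // (true, false) -- printed as 0 false.
            fmt.Println(v, ok)
        default:
            fmt.Println("not reached: receives on a closed channel never block")
        }
    }
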
 
 	var t0 int64
@@ -521,13 +564,14 @@
 	mysg.c = c
 	gp.param = nil
 	c.recvq.enqueue(mysg)
-	goparkunlock(&c.lock, waitReasonChanReceive, traceEvGoBlockRecv, 3)
+	gopark(chanparkcommit, unsafe.Pointer(&c.lock), waitReasonChanReceive, traceEvGoBlockRecv, 2)
 
 	// someone woke us up
 	if mysg != gp.waiting {
 		throw("G waiting list is corrupted")
 	}
 	gp.waiting = nil
+	gp.activeStackChans = false
 	if mysg.releasetime > 0 {
 		blockevent(mysg.releasetime-t0, 2)
 	}
@@ -594,6 +638,14 @@
 	goready(gp, skip+1)
 }
 
+func chanparkcommit(gp *g, chanLock unsafe.Pointer) bool {
+	// There are unlocked sudogs that point into gp's stack. Stack
+	// copying must lock the channels of those sudogs.
+	gp.activeStackChans = true
+	unlock((*mutex)(chanLock))
+	return true
+}
+
 // compiler implements
 //
 //	select {
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index b6188f5..039a086 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -480,11 +480,11 @@
 	// If the select in the goroutine is fair,
 	// cnt1 and cnt2 should be about the same value.
 	// With 10,000 trials, the expected margin of error at
-	// a confidence level of five nines is 4.4172 / (2 * Sqrt(10000)).
+	// a confidence level of six nines is 4.891676 / (2 * Sqrt(10000)).
 	r := float64(cnt1) / trials
 	e := math.Abs(r - 0.5)
 	t.Log(cnt1, cnt2, r, e)
-	if e > 4.4172/(2*math.Sqrt(trials)) {
+	if e > 4.891676/(2*math.Sqrt(trials)) {
 		t.Errorf("unfair select: in %d trials, results were %d, %d", trials, cnt1, cnt2)
 	}
 	close(done)
@@ -719,6 +719,7 @@
 		if after.NumGC-before.NumGC >= 2 {
 			goto done
 		}
+		runtime.Gosched()
 	}
 	t.Fatal("failed to trigger concurrent GC")
 done:
@@ -1126,6 +1127,20 @@
 	wg.Wait()
 }
 
+func BenchmarkChanClosed(b *testing.B) {
+	c := make(chan struct{})
+	close(c)
+	b.RunParallel(func(pb *testing.PB) {
+		for pb.Next() {
+			select {
+			case <-c:
+			default:
+				b.Error("Unreachable")
+			}
+		}
+	})
+}
+
 var (
 	alwaysFalse = false
 	workSink    = 0
diff --git a/src/runtime/checkptr.go b/src/runtime/checkptr.go
new file mode 100644
index 0000000..59891a0
--- /dev/null
+++ b/src/runtime/checkptr.go
@@ -0,0 +1,83 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+func checkptrAlignment(p unsafe.Pointer, elem *_type, n uintptr) {
+	// Check that (*[n]elem)(p) is appropriately aligned.
+	// Note that we allow unaligned pointers if the types they point to contain
+	// no pointers themselves. See issue 37298.
+	// TODO(mdempsky): What about fieldAlign?
+	if elem.ptrdata != 0 && uintptr(p)&(uintptr(elem.align)-1) != 0 {
+		throw("checkptr: misaligned pointer conversion")
+	}
+
+	// Check that (*[n]elem)(p) doesn't straddle multiple heap objects.
+	if size := n * elem.size; size > 1 && checkptrBase(p) != checkptrBase(add(p, size-1)) {
+		throw("checkptr: converted pointer straddles multiple allocations")
+	}
+}
+
+func checkptrArithmetic(p unsafe.Pointer, originals []unsafe.Pointer) {
+	if 0 < uintptr(p) && uintptr(p) < minLegalPointer {
+		throw("checkptr: pointer arithmetic computed bad pointer value")
+	}
+
+	// Check that if the computed pointer p points into a heap
+	// object, then one of the original pointers must have pointed
+	// into the same object.
+	base := checkptrBase(p)
+	if base == 0 {
+		return
+	}
+
+	for _, original := range originals {
+		if base == checkptrBase(original) {
+			return
+		}
+	}
+
+	throw("checkptr: pointer arithmetic result points to invalid allocation")
+}
+
+// checkptrBase returns the base address for the allocation containing
+// the address p.
+//
+// Importantly, if p1 and p2 point into the same variable, then
+// checkptrBase(p1) == checkptrBase(p2). However, the converse/inverse
+// is not necessarily true as allocations can have trailing padding,
+// and multiple variables may be packed into a single allocation.
+func checkptrBase(p unsafe.Pointer) uintptr {
+	// stack
+	if gp := getg(); gp.stack.lo <= uintptr(p) && uintptr(p) < gp.stack.hi {
+		// TODO(mdempsky): Walk the stack to identify the
+		// specific stack frame or even stack object that p
+		// points into.
+		//
+		// In the mean time, use "1" as a pseudo-address to
+		// represent the stack. This is an invalid address on
+		// all platforms, so it's guaranteed to be distinct
+		// from any of the addresses we might return below.
+		return 1
+	}
+
+	// heap (must check after stack because of #35068)
+	if base, _, _ := findObject(uintptr(p), 0, 0); base != 0 {
+		return base
+	}
+
+	// data or bss
+	for _, datap := range activeModules() {
+		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
+			return datap.data
+		}
+		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
+			return datap.bss
+		}
+	}
+
+	return 0
+}
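
checkptrBase underpins both tests in checkptrAlignment: the alignment check (applied only to types that contain pointers, per issue 37298) and the check that a converted pointer stays within a single allocation. A small program that trips the second check when built with the flag the test below uses, -gcflags=all=-d=checkptr=1 (illustrative, not part of the patch):

    package main

    import "unsafe"

    var sink interface{}

    func main() {
        p := new(int64) // an 8-byte heap allocation
        // Reinterpreting p as a pointer to an 800-byte array makes the
        // converted value span past its allocation, so the runtime reports:
        //   fatal error: checkptr: converted pointer straddles multiple allocations
        sink = (*[100]int64)(unsafe.Pointer(p))
    }
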
diff --git a/src/runtime/checkptr_test.go b/src/runtime/checkptr_test.go
new file mode 100644
index 0000000..8ab8a49
--- /dev/null
+++ b/src/runtime/checkptr_test.go
@@ -0,0 +1,53 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"internal/testenv"
+	"os/exec"
+	"strings"
+	"testing"
+)
+
+func TestCheckPtr(t *testing.T) {
+	t.Parallel()
+	testenv.MustHaveGoRun(t)
+
+	exe, err := buildTestProg(t, "testprog", "-gcflags=all=-d=checkptr=1")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	testCases := []struct {
+		cmd  string
+		want string
+	}{
+		{"CheckPtrAlignmentPtr", "fatal error: checkptr: misaligned pointer conversion\n"},
+		{"CheckPtrAlignmentNoPtr", ""},
+		{"CheckPtrArithmetic", "fatal error: checkptr: pointer arithmetic result points to invalid allocation\n"},
+		{"CheckPtrSize", "fatal error: checkptr: converted pointer straddles multiple allocations\n"},
+		{"CheckPtrSmall", "fatal error: checkptr: pointer arithmetic computed bad pointer value\n"},
+	}
+
+	for _, tc := range testCases {
+		tc := tc
+		t.Run(tc.cmd, func(t *testing.T) {
+			t.Parallel()
+			got, err := testenv.CleanCmdEnv(exec.Command(exe, tc.cmd)).CombinedOutput()
+			if err != nil {
+				t.Log(err)
+			}
+			if tc.want == "" {
+				if len(got) > 0 {
+					t.Errorf("output:\n%s\nwant no output", got)
+				}
+				return
+			}
+			if !strings.HasPrefix(string(got), tc.want) {
+				t.Errorf("output:\n%s\n\nwant output starting with: %s", got, tc.want)
+			}
+		})
+	}
+}
diff --git a/src/runtime/conv_wasm_test.go b/src/runtime/conv_wasm_test.go
new file mode 100644
index 0000000..5054fca
--- /dev/null
+++ b/src/runtime/conv_wasm_test.go
@@ -0,0 +1,128 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"testing"
+)
+
+var res int64
+var ures uint64
+
+func TestFloatTruncation(t *testing.T) {
+	testdata := []struct {
+		input      float64
+		convInt64  int64
+		convUInt64 uint64
+		overflow   bool
+	}{
+		// max +- 1
+		{
+			input:      0x7fffffffffffffff,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		// For out-of-bounds conversion, the result is implementation-dependent.
+		// This test verifies the implementation of the wasm architecture.
+		{
+			input:      0x8000000000000000,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      0x7ffffffffffffffe,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		// neg max +- 1
+		{
+			input:      -0x8000000000000000,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      -0x8000000000000001,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      -0x7fffffffffffffff,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		// trunc point +- 1
+		{
+			input:      0x7ffffffffffffdff,
+			convInt64:  0x7ffffffffffffc00,
+			convUInt64: 0x7ffffffffffffc00,
+		},
+		{
+			input:      0x7ffffffffffffe00,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      0x7ffffffffffffdfe,
+			convInt64:  0x7ffffffffffffc00,
+			convUInt64: 0x7ffffffffffffc00,
+		},
+		// neg trunc point +- 1
+		{
+			input:      -0x7ffffffffffffdff,
+			convInt64:  -0x7ffffffffffffc00,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      -0x7ffffffffffffe00,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      -0x7ffffffffffffdfe,
+			convInt64:  -0x7ffffffffffffc00,
+			convUInt64: 0x8000000000000000,
+		},
+		// umax +- 1
+		{
+			input:      0xffffffffffffffff,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      0x10000000000000000,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      0xfffffffffffffffe,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		// umax trunc +- 1
+		{
+			input:      0xfffffffffffffbff,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0xfffffffffffff800,
+		},
+		{
+			input:      0xfffffffffffffc00,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0x8000000000000000,
+		},
+		{
+			input:      0xfffffffffffffbfe,
+			convInt64:  -0x8000000000000000,
+			convUInt64: 0xfffffffffffff800,
+		},
+	}
+	for _, item := range testdata {
+		if got, want := int64(item.input), item.convInt64; got != want {
+			t.Errorf("int64(%f): got %x, want %x", item.input, got, want)
+		}
+		if got, want := uint64(item.input), item.convUInt64; got != want {
+			t.Errorf("uint64(%f): got %x, want %x", item.input, got, want)
+		}
+	}
+}
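
As a quick illustration of why this table is architecture-specific (a sketch, not part of the patch): the language spec leaves out-of-range float-to-integer conversions implementation-dependent, so the same conversion can legally produce different values on different GOARCH targets.

package main

import (
	"fmt"
	"math"
)

func main() {
	f := math.MaxFloat64 // far outside the int64 and uint64 ranges
	// The printed values depend on the architecture; the table above pins
	// down what the wasm port produces for such out-of-range inputs.
	fmt.Println(int64(f), uint64(f))
}
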
diff --git a/src/runtime/cpuflags.go b/src/runtime/cpuflags.go
index 1565afb..5104650 100644
--- a/src/runtime/cpuflags.go
+++ b/src/runtime/cpuflags.go
@@ -11,11 +11,14 @@
 
 // Offsets into internal/cpu records for use in assembly.
 const (
+	offsetX86HasAVX  = unsafe.Offsetof(cpu.X86.HasAVX)
 	offsetX86HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
 	offsetX86HasERMS = unsafe.Offsetof(cpu.X86.HasERMS)
 	offsetX86HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
 
 	offsetARMHasIDIVA = unsafe.Offsetof(cpu.ARM.HasIDIVA)
+
+	offsetMIPS64XHasMSA = unsafe.Offsetof(cpu.MIPS64X.HasMSA)
 )
 
 var (
@@ -23,6 +26,9 @@
 	// TODO: deprecate these; use internal/cpu directly.
 	x86HasPOPCNT bool
 	x86HasSSE41  bool
+	x86HasFMA    bool
+
+	armHasVFPv4 bool
 
 	arm64HasATOMICS bool
 )
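
For context, a hedged sketch of the pattern these offset constants follow: unsafe.Offsetof evaluates to a compile-time constant, so assembly can address a single feature flag by symbol plus offset. The struct and symbol names below are invented for illustration; the real offsets come from internal/cpu.

package main

import (
	"fmt"
	"unsafe"
)

// cpuFeatures stands in for an internal/cpu feature record (hypothetical).
type cpuFeatures struct {
	HasAVX  bool
	HasAVX2 bool
	HasFMA  bool
}

// A constant byte offset that assembly could use, e.g.
//	CMPB somepkg·features+offsetHasFMA(SB), $1
const offsetHasFMA = unsafe.Offsetof(cpuFeatures{}.HasFMA)

func main() {
	fmt.Println("HasFMA lives at byte offset", offsetHasFMA)
}
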
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 56cfb08..4872189 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -275,7 +275,13 @@
 		t.Fatal(err)
 	}
 
-	got, err := testenv.CleanCmdEnv(exec.Command(exe, runArg)).CombinedOutput()
+	// pprofCgoTraceback is called whenever CGO code is executing and a signal
+	// is received. Disable signal preemption to increase the likelihood that
+	// at least one SIGPROF signal fires to capture a sample. See issue #37201.
+	cmd := testenv.CleanCmdEnv(exec.Command(exe, runArg))
+	cmd.Env = append(cmd.Env, "GODEBUG=asyncpreemptoff=1")
+
+	got, err := cmd.CombinedOutput()
 	if err != nil {
 		if testenv.Builder() == "linux-amd64-alpine" {
 			// See Issue 18243 and Issue 19938.
@@ -549,3 +555,48 @@
 	}
 	return nil
 }
+
+func TestSegv(t *testing.T) {
+	switch runtime.GOOS {
+	case "plan9", "windows":
+		t.Skipf("no signals on %s", runtime.GOOS)
+	}
+
+	for _, test := range []string{"Segv", "SegvInCgo"} {
+		t.Run(test, func(t *testing.T) {
+			t.Parallel()
+			got := runTestProg(t, "testprogcgo", test)
+			t.Log(got)
+			if !strings.Contains(got, "SIGSEGV") {
+				t.Errorf("expected crash from signal")
+			}
+		})
+	}
+}
+
+// TestEINTR tests that we handle EINTR correctly.
+// See issue #20400 and friends.
+func TestEINTR(t *testing.T) {
+	switch runtime.GOOS {
+	case "plan9", "windows":
+		t.Skipf("no EINTR on %s", runtime.GOOS)
+	case "linux":
+		if runtime.GOARCH == "386" {
+			// On linux-386 the Go signal handler sets
+			// a restorer function that is not preserved
+			// by the C sigaction call in the test,
+			// causing the signal handler to crash when
+			// returning to the normal code. The test is not
+			// architecture-specific, so just skip on 386
+			// rather than doing a complicated workaround.
+			t.Skip("skipping on linux-386; C sigaction does not preserve Go restorer")
+		}
+	}
+
+	t.Parallel()
+	output := runTestProg(t, "testprogcgo", "EINTR")
+	want := "OK\n"
+	if output != want {
+		t.Fatalf("want %s, got %s\n", want, output)
+	}
+}
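
A small sketch of the environment-tweaking pattern used in the pprofCgoTraceback change above (the binary path and subcommand are placeholders): run an already-built test program with asynchronous preemption disabled via GODEBUG.

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	exe := "./testprogcgo.exe" // placeholder for the built test binary
	cmd := exec.Command(exe, "CgoPprof")
	// Start from the parent environment and disable async preemption so a
	// SIGPROF is more likely to arrive while the traceback hook can run.
	cmd.Env = append(os.Environ(), "GODEBUG=asyncpreemptoff=1")
	out, err := cmd.CombinedOutput()
	fmt.Printf("err=%v\noutput:\n%s", err, out)
}
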
diff --git a/src/runtime/crash_nonunix_test.go b/src/runtime/crash_nonunix_test.go
index bf349a5..06c197e 100644
--- a/src/runtime/crash_nonunix_test.go
+++ b/src/runtime/crash_nonunix_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build windows plan9 nacl js,wasm
+// +build windows plan9 js,wasm
 
 package runtime_test
 
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index c54bb57..34f30c9 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -55,6 +55,16 @@
 		t.Fatal(err)
 	}
 
+	return runBuiltTestProg(t, exe, name, env...)
+}
+
+func runBuiltTestProg(t *testing.T, exe, name string, env ...string) string {
+	if *flagQuick {
+		t.Skip("-quick")
+	}
+
+	testenv.MustHaveGoBuild(t)
+
 	cmd := testenv.CleanCmdEnv(exec.Command(exe, name))
 	cmd.Env = append(cmd.Env, env...)
 	if testing.Short() {
@@ -64,7 +74,7 @@
 	cmd.Stdout = &b
 	cmd.Stderr = &b
 	if err := cmd.Start(); err != nil {
-		t.Fatalf("starting %s %s: %v", binary, name, err)
+		t.Fatalf("starting %s %s: %v", exe, name, err)
 	}
 
 	// If the process doesn't complete within 1 minute,
@@ -92,7 +102,7 @@
 	}()
 
 	if err := cmd.Wait(); err != nil {
-		t.Logf("%s %s exit status: %v", binary, name, err)
+		t.Logf("%s %s exit status: %v", exe, name, err)
 	}
 	close(done)
 
@@ -104,8 +114,6 @@
 		t.Skip("-quick")
 	}
 
-	checkStaleRuntime(t)
-
 	testprog.Lock()
 	defer testprog.Unlock()
 	if testprog.dir == "" {
@@ -143,31 +151,12 @@
 	return exe, nil
 }
 
-var (
-	staleRuntimeOnce sync.Once // guards init of staleRuntimeErr
-	staleRuntimeErr  error
-)
-
-func checkStaleRuntime(t *testing.T) {
-	staleRuntimeOnce.Do(func() {
-		// 'go run' uses the installed copy of runtime.a, which may be out of date.
-		out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.Stale}}", "runtime")).CombinedOutput()
-		if err != nil {
-			staleRuntimeErr = fmt.Errorf("failed to execute 'go list': %v\n%v", err, string(out))
-			return
-		}
-		if string(out) != "false\n" {
-			t.Logf("go list -f {{.Stale}} runtime:\n%s", out)
-			out, err := testenv.CleanCmdEnv(exec.Command(testenv.GoToolPath(t), "list", "-gcflags=all="+os.Getenv("GO_GCFLAGS"), "-f", "{{.StaleReason}}", "runtime")).CombinedOutput()
-			if err != nil {
-				t.Logf("go list -f {{.StaleReason}} failed: %v", err)
-			}
-			t.Logf("go list -f {{.StaleReason}} runtime:\n%s", out)
-			staleRuntimeErr = fmt.Errorf("Stale runtime.a. Run 'go install runtime'.")
-		}
-	})
-	if staleRuntimeErr != nil {
-		t.Fatal(staleRuntimeErr)
+func TestVDSO(t *testing.T) {
+	t.Parallel()
+	output := runTestProg(t, "testprog", "SignalInVDSO")
+	want := "success\n"
+	if output != want {
+		t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
 	}
 }
 
@@ -225,9 +214,23 @@
 
 func TestStackOverflow(t *testing.T) {
 	output := runTestProg(t, "testprog", "StackOverflow")
-	want := "runtime: goroutine stack exceeds 1474560-byte limit\nfatal error: stack overflow"
-	if !strings.HasPrefix(output, want) {
-		t.Fatalf("output does not start with %q:\n%s", want, output)
+	want := []string{
+		"runtime: goroutine stack exceeds 1474560-byte limit\n",
+		"fatal error: stack overflow",
+		// information about the current SP and stack bounds
+		"runtime: sp=",
+		"stack=[",
+	}
+	if !strings.HasPrefix(output, want[0]) {
+		t.Errorf("output does not start with %q", want[0])
+	}
+	for _, s := range want[1:] {
+		if !strings.Contains(output, s) {
+			t.Errorf("output does not contain %q", s)
+		}
+	}
+	if t.Failed() {
+		t.Logf("output:\n%s", output)
 	}
 }
 
@@ -251,6 +254,41 @@
 
 }
 
+func TestRecursivePanic2(t *testing.T) {
+	output := runTestProg(t, "testprog", "RecursivePanic2")
+	want := `first panic
+second panic
+panic: third panic
+
+`
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
+
+}
+
+func TestRecursivePanic3(t *testing.T) {
+	output := runTestProg(t, "testprog", "RecursivePanic3")
+	want := `panic: first panic
+
+`
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
+
+}
+
+func TestRecursivePanic4(t *testing.T) {
+	output := runTestProg(t, "testprog", "RecursivePanic4")
+	want := `panic: first panic [recovered]
+	panic: second panic
+`
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
+
+}
+
 func TestGoexitCrash(t *testing.T) {
 	output := runTestProg(t, "testprog", "GoexitExit")
 	want := "no goroutines (main called runtime.Goexit) - deadlock!"
@@ -382,26 +420,32 @@
 }
 
 func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
-	// 1. defer a function that recovers
-	// 2. defer a function that panics
-	// 3. call goexit
-	// Goexit should run the #2 defer. Its panic
-	// should be caught by the #1 defer, and execution
-	// should resume in the caller. Like the Goexit
-	// never happened!
-	defer func() {
-		r := recover()
-		if r == nil {
-			panic("bad recover")
-		}
-	}()
-	defer func() {
-		panic("hello")
-	}()
-	runtime.Goexit()
+	t.Parallel()
+	output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit")
+	want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
+}
+
+func TestRecoverBeforePanicAfterGoexit2(t *testing.T) {
+	t.Parallel()
+	output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2")
+	want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
 }
 
 func TestNetpollDeadlock(t *testing.T) {
+	if os.Getenv("GO_BUILDER_NAME") == "darwin-amd64-10_12" {
+		// A suspected kernel bug in macOS 10.12 occasionally results in
+		// an apparent deadlock when dialing localhost. The errors have not
+		// been observed on newer versions of the OS, so we don't plan to work
+		// around them. See https://golang.org/issue/22019.
+		testenv.SkipFlaky(t, 22019)
+	}
+
 	t.Parallel()
 	output := runTestProg(t, "testprognet", "NetpollDeadlock")
 	want := "done\n"
@@ -413,7 +457,7 @@
 func TestPanicTraceback(t *testing.T) {
 	t.Parallel()
 	output := runTestProg(t, "testprog", "PanicTraceback")
-	want := "panic: hello"
+	want := "panic: hello\n\tpanic: panic pt2\n\tpanic: panic pt1\n"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
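
The RecursivePanic want strings are easiest to read against a tiny standalone program that recovers and then re-panics from a deferred function; this is only a sketch of roughly what a testprog entry such as RecursivePanic4 is expected to do, not the testprog source itself.

package main

func main() {
	defer func() {
		_ = recover()         // the first panic is recovered here...
		panic("second panic") // ...and a new panic is raised from the defer
	}()
	panic("first panic")
}

Running it prints a traceback that begins with "panic: first panic [recovered]" followed by the indented "panic: second panic" line, matching the prefix checked above.
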
diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go
index ce227fe..8ef52ab 100644
--- a/src/runtime/crash_unix_test.go
+++ b/src/runtime/crash_unix_test.go
@@ -16,8 +16,11 @@
 	"path/filepath"
 	"runtime"
 	"strings"
+	"sync"
 	"syscall"
 	"testing"
+	"time"
+	"unsafe"
 )
 
 // sigquit is the signal to send to kill a hanging testdata program.
@@ -33,6 +36,29 @@
 	}
 }
 
+func TestBadOpen(t *testing.T) {
+	// make sure we get the correct error code if open fails. Same for
+	// read/write/close on the resulting -1 fd. See issue 10052.
+	nonfile := []byte("/notreallyafile")
+	fd := runtime.Open(&nonfile[0], 0, 0)
+	if fd != -1 {
+		t.Errorf("open(%q)=%d, want -1", nonfile, fd)
+	}
+	var buf [32]byte
+	r := runtime.Read(-1, unsafe.Pointer(&buf[0]), int32(len(buf)))
+	if got, want := r, -int32(syscall.EBADF); got != want {
+		t.Errorf("read()=%d, want %d", got, want)
+	}
+	w := runtime.Write(^uintptr(0), unsafe.Pointer(&buf[0]), int32(len(buf)))
+	if got, want := w, -int32(syscall.EBADF); got != want {
+		t.Errorf("write()=%d, want %d", got, want)
+	}
+	c := runtime.Close(-1)
+	if c != -1 {
+		t.Errorf("close()=%d, want -1", c)
+	}
+}
+
 func TestCrashDumpsAllThreads(t *testing.T) {
 	if *flagQuick {
 		t.Skip("-quick")
@@ -53,8 +79,6 @@
 
 	testenv.MustHaveGoBuild(t)
 
-	checkStaleRuntime(t)
-
 	t.Parallel()
 
 	dir, err := ioutil.TempDir("", "go-build")
@@ -76,18 +100,17 @@
 
 	cmd = exec.Command(filepath.Join(dir, "a.exe"))
 	cmd = testenv.CleanCmdEnv(cmd)
-	cmd.Env = append(cmd.Env, "GOTRACEBACK=crash")
-
-	// Set GOGC=off. Because of golang.org/issue/10958, the tight
-	// loops in the test program are not preemptible. If GC kicks
-	// in, it may lock up and prevent main from saying it's ready.
-	newEnv := []string{}
-	for _, s := range cmd.Env {
-		if !strings.HasPrefix(s, "GOGC=") {
-			newEnv = append(newEnv, s)
-		}
-	}
-	cmd.Env = append(newEnv, "GOGC=off")
+	cmd.Env = append(cmd.Env,
+		"GOTRACEBACK=crash",
+		// Set GOGC=off. Because of golang.org/issue/10958, the tight
+		// loops in the test program are not preemptible. If GC kicks
+		// in, it may lock up and prevent main from saying it's ready.
+		"GOGC=off",
+		// Set GODEBUG=asyncpreemptoff=1. If a thread is preempted
+		// when it receives SIGQUIT, it won't show the expected
+		// stack trace. See issue 35356.
+		"GODEBUG=asyncpreemptoff=1",
+	)
 
 	var outbuf bytes.Buffer
 	cmd.Stdout = &outbuf
@@ -266,6 +289,12 @@
 }
 
 func TestSignalIgnoreSIGTRAP(t *testing.T) {
+	if runtime.GOOS == "openbsd" {
+		if bn := testenv.Builder(); strings.HasSuffix(bn, "-62") || strings.HasSuffix(bn, "-64") {
+			testenv.SkipFlaky(t, 17496)
+		}
+	}
+
 	output := runTestProg(t, "testprognet", "SignalIgnoreSIGTRAP")
 	want := "OK\n"
 	if output != want {
@@ -285,3 +314,47 @@
 		t.Fatalf("want %s, got %s\n", want, output)
 	}
 }
+
+func TestSignalM(t *testing.T) {
+	r, w, errno := runtime.Pipe()
+	if errno != 0 {
+		t.Fatal(syscall.Errno(errno))
+	}
+	defer func() {
+		runtime.Close(r)
+		runtime.Close(w)
+	}()
+	runtime.Closeonexec(r)
+	runtime.Closeonexec(w)
+
+	var want, got int64
+	var wg sync.WaitGroup
+	ready := make(chan *runtime.M)
+	wg.Add(1)
+	go func() {
+		runtime.LockOSThread()
+		want, got = runtime.WaitForSigusr1(r, w, func(mp *runtime.M) {
+			ready <- mp
+		})
+		runtime.UnlockOSThread()
+		wg.Done()
+	}()
+	waitingM := <-ready
+	runtime.SendSigusr1(waitingM)
+
+	timer := time.AfterFunc(time.Second, func() {
+		// Write 1 to tell WaitForSigusr1 that we timed out.
+		bw := byte(1)
+		if n := runtime.Write(uintptr(w), unsafe.Pointer(&bw), 1); n != 1 {
+			t.Errorf("pipe write failed: %d", n)
+		}
+	})
+	defer timer.Stop()
+
+	wg.Wait()
+	if got == -1 {
+		t.Fatal("signalM signal not received")
+	} else if want != got {
+		t.Fatalf("signal sent to M %d, but received on M %d", want, got)
+	}
+}
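
TestSignalM depends on the callback goroutine staying on one OS thread. Outside the runtime's test-only exports, the user-level building block for that is runtime.LockOSThread; a minimal sketch:

package main

import (
	"fmt"
	"runtime"
	"sync"
)

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// Pin this goroutine to its current OS thread so thread-directed
		// work (such as waiting for a signal sent to a specific M) sees a
		// stable thread identity.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()
		fmt.Println("running on a locked OS thread")
	}()
	wg.Wait()
}
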
diff --git a/src/runtime/debug.go b/src/runtime/debug.go
index af5c3a1..76eeb2e 100644
--- a/src/runtime/debug.go
+++ b/src/runtime/debug.go
@@ -26,12 +26,12 @@
 		return ret
 	}
 
-	stopTheWorld("GOMAXPROCS")
+	stopTheWorldGC("GOMAXPROCS")
 
 	// newprocs will be processed by startTheWorld
 	newprocs = int32(n)
 
-	startTheWorld()
+	startTheWorldGC()
 	return ret
 }
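
For completeness, the user-visible behavior this hunk touches, as a short usage sketch: GOMAXPROCS with a non-positive argument only reports the current setting, while changing it goes through the stop/start pair above and so is best kept out of hot paths.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// A value <= 0 queries the current setting without changing it.
	prev := runtime.GOMAXPROCS(0)
	fmt.Println("GOMAXPROCS is", prev)

	// Changing the value stops and restarts (part of) the world, so do it
	// once at startup rather than repeatedly.
	runtime.GOMAXPROCS(prev)
}
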
 
diff --git a/src/runtime/debug/heapdump_test.go b/src/runtime/debug/heapdump_test.go
index c986efc..de1ec27 100644
--- a/src/runtime/debug/heapdump_test.go
+++ b/src/runtime/debug/heapdump_test.go
@@ -13,7 +13,7 @@
 )
 
 func TestWriteHeapDumpNonempty(t *testing.T) {
-	if runtime.GOOS == "nacl" || runtime.GOOS == "js" {
+	if runtime.GOOS == "js" {
 		t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS)
 	}
 	f, err := ioutil.TempFile("", "heapdumptest")
@@ -42,7 +42,7 @@
 }
 
 func TestWriteHeapDumpFinalizers(t *testing.T) {
-	if runtime.GOOS == "nacl" || runtime.GOOS == "js" {
+	if runtime.GOOS == "js" {
 		t.Skipf("WriteHeapDump is not available on %s.", runtime.GOOS)
 	}
 	f, err := ioutil.TempFile("", "heapdumptest")
diff --git a/src/runtime/debug/mod.go b/src/runtime/debug/mod.go
index e3b929a..0381bdc 100644
--- a/src/runtime/debug/mod.go
+++ b/src/runtime/debug/mod.go
@@ -22,7 +22,7 @@
 // the running binary.
 type BuildInfo struct {
 	Path string    // The main package path
-	Main Module    // The main module information
+	Main Module    // The module containing the main package
 	Deps []*Module // Module dependencies
 }
 
@@ -47,9 +47,27 @@
 		repLine  = "=>\t"
 	)
 
-	info := &BuildInfo{}
+	readEntryFirstLine := func(elem []string) (Module, bool) {
+		if len(elem) != 2 && len(elem) != 3 {
+			return Module{}, false
+		}
+		sum := ""
+		if len(elem) == 3 {
+			sum = elem[2]
+		}
+		return Module{
+			Path:    elem[0],
+			Version: elem[1],
+			Sum:     sum,
+		}, true
+	}
 
-	var line string
+	var (
+		info = &BuildInfo{}
+		last *Module
+		line string
+		ok   bool
+	)
 	// Reverse of cmd/go/internal/modload.PackageBuildInfo
 	for len(data) > 0 {
 		i := strings.IndexByte(data, '\n')
@@ -63,42 +81,33 @@
 			info.Path = elem
 		case strings.HasPrefix(line, modLine):
 			elem := strings.Split(line[len(modLine):], "\t")
-			if len(elem) != 3 {
+			last = &info.Main
+			*last, ok = readEntryFirstLine(elem)
+			if !ok {
 				return nil, false
 			}
-			info.Main = Module{
-				Path:    elem[0],
-				Version: elem[1],
-				Sum:     elem[2],
-			}
 		case strings.HasPrefix(line, depLine):
 			elem := strings.Split(line[len(depLine):], "\t")
-			if len(elem) != 2 && len(elem) != 3 {
+			last = new(Module)
+			info.Deps = append(info.Deps, last)
+			*last, ok = readEntryFirstLine(elem)
+			if !ok {
 				return nil, false
 			}
-			sum := ""
-			if len(elem) == 3 {
-				sum = elem[2]
-			}
-			info.Deps = append(info.Deps, &Module{
-				Path:    elem[0],
-				Version: elem[1],
-				Sum:     sum,
-			})
 		case strings.HasPrefix(line, repLine):
 			elem := strings.Split(line[len(repLine):], "\t")
 			if len(elem) != 3 {
 				return nil, false
 			}
-			last := len(info.Deps) - 1
-			if last < 0 {
+			if last == nil {
 				return nil, false
 			}
-			info.Deps[last].Replace = &Module{
+			last.Replace = &Module{
 				Path:    elem[0],
 				Version: elem[1],
 				Sum:     elem[2],
 			}
+			last = nil
 		}
 	}
 	return info, true
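
This parser backs the public runtime/debug.ReadBuildInfo API; a brief consumption sketch, including the Replace field that the repLine branch can now attach to either the main module or a dependency:

package main

import (
	"fmt"
	"runtime/debug"
)

func main() {
	info, ok := debug.ReadBuildInfo()
	if !ok {
		fmt.Println("no module build info (binary not built with module support)")
		return
	}
	fmt.Println("main:", info.Main.Path, info.Main.Version)
	for _, dep := range info.Deps {
		line := dep.Path + " " + dep.Version
		if dep.Replace != nil {
			line += " => " + dep.Replace.Path + " " + dep.Replace.Version
		}
		fmt.Println(line)
	}
}
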
diff --git a/src/runtime/debug_test.go b/src/runtime/debug_test.go
index f77a373..722e811 100644
--- a/src/runtime/debug_test.go
+++ b/src/runtime/debug_test.go
@@ -126,7 +126,7 @@
 		return x + 1
 	}
 	args.x = 42
-	if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill); err != nil {
+	if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
 		t.Fatal(err)
 	}
 	if args.yRet != 43 {
@@ -155,7 +155,7 @@
 		args.in[i] = i
 		want[i] = i + 1
 	}
-	if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill); err != nil {
+	if _, err := runtime.InjectDebugCall(g, fn, &args, debugCallTKill, false); err != nil {
 		t.Fatal(err)
 	}
 	if want != args.out {
@@ -168,7 +168,7 @@
 	defer after()
 
 	// Inject a call that performs a GC.
-	if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill); err != nil {
+	if _, err := runtime.InjectDebugCall(g, runtime.GC, nil, debugCallTKill, false); err != nil {
 		t.Fatal(err)
 	}
 }
@@ -179,7 +179,7 @@
 
 	// Inject a call that grows the stack. debugCallWorker checks
 	// for stack pointer breakage.
-	if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill); err != nil {
+	if _, err := runtime.InjectDebugCall(g, func() { growStack(nil) }, nil, debugCallTKill, false); err != nil {
 		t.Fatal(err)
 	}
 }
@@ -215,7 +215,7 @@
 		runtime.Gosched()
 	}
 
-	_, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill)
+	_, err := runtime.InjectDebugCall(g, func() {}, nil, debugCallTKill, true)
 	if msg := "call not at safe point"; err == nil || err.Error() != msg {
 		t.Fatalf("want %q, got %s", msg, err)
 	}
@@ -239,7 +239,7 @@
 	}()
 	g := <-ready
 
-	p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill)
+	p, err := runtime.InjectDebugCall(g, func() { panic("test") }, nil, debugCallTKill, false)
 	if err != nil {
 		t.Fatal(err)
 	}
diff --git a/src/runtime/debugcall.go b/src/runtime/debugcall.go
index f03d235..6c285ec 100644
--- a/src/runtime/debugcall.go
+++ b/src/runtime/debugcall.go
@@ -61,7 +61,7 @@
 			"debugCall16384",
 			"debugCall32768",
 			"debugCall65536":
-			// These functions are whitelisted so that the debugger can initiate multiple function calls.
+			// These functions are allowed so that the debugger can initiate multiple function calls.
 			// See: https://golang.org/cl/161137/
 			return
 		}
@@ -76,28 +76,173 @@
 			return
 		}
 
-		// Look up PC's register map.
-		pcdata := int32(-1)
-		if pc != f.entry {
-			pc--
-			pcdata = pcdatavalue(f, _PCDATA_RegMapIndex, pc, nil)
-		}
-		if pcdata == -1 {
-			pcdata = 0 // in prologue
-		}
-		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_RegPointerMaps))
-		if pcdata == -2 || stkmap == nil {
-			// Not at a safe point.
-			ret = debugCallUnsafePoint
-			return
+		if !go115ReduceLiveness {
+			// Look up PC's register map.
+			pcdata := int32(-1)
+			if pc != f.entry {
+				pc--
+				pcdata = pcdatavalue(f, _PCDATA_RegMapIndex, pc, nil)
+			}
+			if pcdata == -1 {
+				pcdata = 0 // in prologue
+			}
+			stkmap := (*stackmap)(funcdata(f, _FUNCDATA_RegPointerMaps))
+			if pcdata == -2 || stkmap == nil {
+				// Not at a safe point.
+				ret = debugCallUnsafePoint
+				return
+			}
+		} else {
+			// Check that this isn't an unsafe-point.
+			if pc != f.entry {
+				pc--
+			}
+			up := pcdatavalue(f, _PCDATA_UnsafePoint, pc, nil)
+			if up != _PCDATA_UnsafePointSafe {
+				// Not at a safe point.
+				ret = debugCallUnsafePoint
+			}
 		}
 	})
 	return ret
 }
 
-// debugCallWrap pushes a defer to recover from panics in debug calls
-// and then calls the dispatching function at PC dispatch.
+// debugCallWrap starts a new goroutine to run a debug call and blocks
+// the calling goroutine. On the goroutine, it prepares to recover
+// panics from the debug call, and then calls the call dispatching
+// function at PC dispatch.
+//
+// This must be deeply nosplit because there are untyped values on the
+// stack from debugCallV1.
+//
+//go:nosplit
 func debugCallWrap(dispatch uintptr) {
+	var lockedm bool
+	var lockedExt uint32
+	callerpc := getcallerpc()
+	gp := getg()
+
+	// Create a new goroutine to execute the call on. Run this on
+	// the system stack to avoid growing our stack.
+	systemstack(func() {
+		var args struct {
+			dispatch uintptr
+			callingG *g
+		}
+		args.dispatch = dispatch
+		args.callingG = gp
+		fn := debugCallWrap1
+		newg := newproc1(*(**funcval)(unsafe.Pointer(&fn)), unsafe.Pointer(&args), int32(unsafe.Sizeof(args)), gp, callerpc)
+
+		// If the current G is locked, then transfer that
+		// locked-ness to the new goroutine.
+		if gp.lockedm != 0 {
+			// Save lock state to restore later.
+			mp := gp.m
+			if mp != gp.lockedm.ptr() {
+				throw("inconsistent lockedm")
+			}
+
+			lockedm = true
+			lockedExt = mp.lockedExt
+
+			// Transfer external lock count to internal so
+			// it can't be unlocked from the debug call.
+			mp.lockedInt++
+			mp.lockedExt = 0
+
+			mp.lockedg.set(newg)
+			newg.lockedm.set(mp)
+			gp.lockedm = 0
+		}
+
+		// Mark the calling goroutine as being at an async
+		// safe-point, since it has a few conservative frames
+		// at the bottom of the stack. This also prevents
+		// stack shrinks.
+		gp.asyncSafePoint = true
+
+		// Stash newg away so we can execute it below (mcall's
+		// closure can't capture anything).
+		gp.schedlink.set(newg)
+	})
+
+	// Switch to the new goroutine.
+	mcall(func(gp *g) {
+		// Get newg.
+		newg := gp.schedlink.ptr()
+		gp.schedlink = 0
+
+		// Park the calling goroutine.
+		gp.waitreason = waitReasonDebugCall
+		if trace.enabled {
+			traceGoPark(traceEvGoBlock, 1)
+		}
+		casgstatus(gp, _Grunning, _Gwaiting)
+		dropg()
+
+		// Directly execute the new goroutine. The debug
+		// protocol will continue on the new goroutine, so
+		// it's important we not just let the scheduler do
+		// this or it may resume a different goroutine.
+		execute(newg, true)
+	})
+
+	// We'll resume here when the call returns.
+
+	// Restore locked state.
+	if lockedm {
+		mp := gp.m
+		mp.lockedExt = lockedExt
+		mp.lockedInt--
+		mp.lockedg.set(gp)
+		gp.lockedm.set(mp)
+	}
+
+	gp.asyncSafePoint = false
+}
+
+// debugCallWrap1 is the continuation of debugCallWrap on the callee
+// goroutine.
+func debugCallWrap1(dispatch uintptr, callingG *g) {
+	// Dispatch call and trap panics.
+	debugCallWrap2(dispatch)
+
+	// Resume the caller goroutine.
+	getg().schedlink.set(callingG)
+	mcall(func(gp *g) {
+		callingG := gp.schedlink.ptr()
+		gp.schedlink = 0
+
+		// Unlock this goroutine from the M if necessary. The
+		// calling G will relock.
+		if gp.lockedm != 0 {
+			gp.lockedm = 0
+			gp.m.lockedg = 0
+		}
+
+		// Switch back to the calling goroutine. At some point
+		// the scheduler will schedule us again and we'll
+		// finish exiting.
+		if trace.enabled {
+			traceGoSched()
+		}
+		casgstatus(gp, _Grunning, _Grunnable)
+		dropg()
+		lock(&sched.lock)
+		globrunqput(gp)
+		unlock(&sched.lock)
+
+		if trace.enabled {
+			traceGoUnpark(callingG, 0)
+		}
+		casgstatus(callingG, _Gwaiting, _Grunnable)
+		execute(callingG, true)
+	})
+}
+
+func debugCallWrap2(dispatch uintptr) {
+	// Call the dispatch function and trap panics.
 	var dispatchF func()
 	dispatchFV := funcval{dispatch}
 	*(*unsafe.Pointer)(unsafe.Pointer(&dispatchF)) = noescape(unsafe.Pointer(&dispatchFV))
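
The "trap panics" half of debugCallWrap2 continues past this hunk. At user level, the same dispatch-and-recover shape looks roughly like the sketch below; this illustrates the pattern only and is not the runtime's own code.

package main

import "fmt"

// dispatchAndTrap runs fn and converts any panic it raises into an error,
// the same general shape as trapping panics around a dispatched debug call.
func dispatchAndTrap(fn func()) (err error) {
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("dispatched call panicked: %v", r)
		}
	}()
	fn()
	return nil
}

func main() {
	err := dispatchAndTrap(func() { panic("boom") })
	fmt.Println(err)
}
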
diff --git a/src/runtime/debuglog.go b/src/runtime/debuglog.go
index 100f2d3..3ce3273 100644
--- a/src/runtime/debuglog.go
+++ b/src/runtime/debuglog.go
@@ -665,13 +665,17 @@
 		print("..(", r.uvarint(), " more bytes)..")
 
 	case debugLogPC:
-		printDebugLogPC(uintptr(r.uvarint()))
+		printDebugLogPC(uintptr(r.uvarint()), false)
 
 	case debugLogTraceback:
 		n := int(r.uvarint())
 		for i := 0; i < n; i++ {
 			print("\n\t")
-			printDebugLogPC(uintptr(r.uvarint()))
+			// gentraceback PCs are always return PCs.
+			// Convert them to call PCs.
+			//
+			// TODO(austin): Expand inlined frames.
+			printDebugLogPC(uintptr(r.uvarint()), true)
 		}
 	}
 
@@ -794,9 +798,17 @@
 	printunlock()
 }
 
-func printDebugLogPC(pc uintptr) {
-	print(hex(pc))
+// printDebugLogPC prints a single symbolized PC. If returnPC is true,
+// pc is a return PC that must first be converted to a call PC.
+func printDebugLogPC(pc uintptr, returnPC bool) {
 	fn := findfunc(pc)
+	if returnPC && (!fn.valid() || pc > fn.entry) {
+		// TODO(austin): Don't back up if the previous frame
+		// was a sigpanic.
+		pc--
+	}
+
+	print(hex(pc))
 	if !fn.valid() {
 		print(" [unknown PC]")
 	} else {
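
The by-hand return-PC adjustment here mirrors what the public runtime.CallersFrames API does when symbolizing PCs collected by runtime.Callers; a short sketch:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	pcs := make([]uintptr, 16)
	// runtime.Callers returns return PCs, just like the traceback entries
	// the debug log stores.
	n := runtime.Callers(1, pcs)
	// CallersFrames performs the return-PC to call-PC adjustment when
	// symbolizing, which is what printDebugLogPC now does by hand.
	frames := runtime.CallersFrames(pcs[:n])
	for {
		frame, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
		if !more {
			break
		}
	}
}
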
diff --git a/src/runtime/defer_test.go b/src/runtime/defer_test.go
new file mode 100644
index 0000000..5ac0814
--- /dev/null
+++ b/src/runtime/defer_test.go
@@ -0,0 +1,412 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	"reflect"
+	"runtime"
+	"testing"
+)
+
+// Make sure open-coded defer exit code is not lost, even when there is an
+// unconditional panic (hence no return from the function)
+func TestUnconditionalPanic(t *testing.T) {
+	defer func() {
+		if recover() != "testUnconditional" {
+			t.Fatal("expected unconditional panic")
+		}
+	}()
+	panic("testUnconditional")
+}
+
+var glob int = 3
+
+// Test an open-coded defer and non-open-coded defer - make sure both defers run
+// and call recover()
+func TestOpenAndNonOpenDefers(t *testing.T) {
+	for {
+		// Non-open defer because in a loop
+		defer func(n int) {
+			if recover() != "testNonOpenDefer" {
+				t.Fatal("expected testNonOpen panic")
+			}
+		}(3)
+		if glob > 2 {
+			break
+		}
+	}
+	testOpen(t, 47)
+	panic("testNonOpenDefer")
+}
+
+//go:noinline
+func testOpen(t *testing.T, arg int) {
+	defer func(n int) {
+		if recover() != "testOpenDefer" {
+			t.Fatal("expected testOpen panic")
+		}
+	}(4)
+	if arg > 2 {
+		panic("testOpenDefer")
+	}
+}
+
+// Test a non-open-coded defer and an open-coded defer - make sure both defers run
+// and call recover()
+func TestNonOpenAndOpenDefers(t *testing.T) {
+	testOpen(t, 47)
+	for {
+		// Non-open defer because in a loop
+		defer func(n int) {
+			if recover() != "testNonOpenDefer" {
+				t.Fatal("expected testNonOpen panic")
+			}
+		}(3)
+		if glob > 2 {
+			break
+		}
+	}
+	panic("testNonOpenDefer")
+}
+
+var list []int
+
+// Make sure that conditional open-coded defers are activated correctly and run in
+// the correct order.
+func TestConditionalDefers(t *testing.T) {
+	list = make([]int, 0, 10)
+
+	defer func() {
+		if recover() != "testConditional" {
+			t.Fatal("expected panic")
+		}
+		want := []int{4, 2, 1}
+		if !reflect.DeepEqual(want, list) {
+			t.Fatal(fmt.Sprintf("wanted %v, got %v", want, list))
+		}
+
+	}()
+	testConditionalDefers(8)
+}
+
+func testConditionalDefers(n int) {
+	doappend := func(i int) {
+		list = append(list, i)
+	}
+
+	defer doappend(1)
+	if n > 5 {
+		defer doappend(2)
+		if n > 8 {
+			defer doappend(3)
+		} else {
+			defer doappend(4)
+		}
+	}
+	panic("testConditional")
+}
+
+// Test that there is no compile-time or run-time error if an open-coded defer
+// call is removed by constant propagation and dead-code elimination.
+func TestDisappearingDefer(t *testing.T) {
+	switch runtime.GOOS {
+	case "invalidOS":
+		defer func() {
+			t.Fatal("Defer shouldn't run")
+		}()
+	}
+}
+
+// This tests an extra recursive panic behavior that is only specified in the
+// code. Suppose a first panic P1 happens and starts processing defer calls. If a
+// second panic P2 happens while processing defer call D in frame F, then defer
+// call processing is restarted (with some potentially new defer calls created by
+// D or its callees). If the defer processing reaches the started defer call D
+// again in the defer stack, then the original panic P1 is aborted and cannot
+// continue panic processing or be recovered. If the panic P2 does a recover at
+// some point, it will naturally remove the original panic P1 from the stack
+// (since the original panic had to be in frame F or a descendant of F).
+func TestAbortedPanic(t *testing.T) {
+	defer func() {
+		r := recover()
+		if r != nil {
+			t.Fatal(fmt.Sprintf("wanted nil recover, got %v", r))
+		}
+	}()
+	defer func() {
+		r := recover()
+		if r != "panic2" {
+			t.Fatal(fmt.Sprintf("wanted %v, got %v", "panic2", r))
+		}
+	}()
+	defer func() {
+		panic("panic2")
+	}()
+	panic("panic1")
+}
+
+// This tests that recover() does not succeed unless it is called directly from a
+// defer function that is directly called by the panic.  Here, we first call it
+// from a defer function that is created by the defer function called directly by
+// the panic.  In that case, recover() should fail and return nil.
+func TestRecoverMatching(t *testing.T) {
+	defer func() {
+		r := recover()
+		if r != "panic1" {
+			t.Fatal(fmt.Sprintf("wanted %v, got %v", "panic1", r))
+		}
+	}()
+	defer func() {
+		defer func() {
+			// Shouldn't succeed, even though it is called directly
+			// from a defer function, since this defer function was
+			// not directly called by the panic.
+			r := recover()
+			if r != nil {
+				t.Fatal(fmt.Sprintf("wanted nil recover, got %v", r))
+			}
+		}()
+	}()
+	panic("panic1")
+}
+
+type nonSSAable [128]byte
+
+type bigStruct struct {
+	x, y, z, w, p, q int64
+}
+
+type containsBigStruct struct {
+	element bigStruct
+}
+
+func mknonSSAable() nonSSAable {
+	globint1++
+	return nonSSAable{0, 0, 0, 0, 5}
+}
+
+var globint1, globint2, globint3 int
+
+//go:noinline
+func sideeffect(n int64) int64 {
+	globint2++
+	return n
+}
+
+func sideeffect2(in containsBigStruct) containsBigStruct {
+	globint3++
+	return in
+}
+
+// Test that nonSSAable arguments to defer are handled correctly and only evaluated once.
+func TestNonSSAableArgs(t *testing.T) {
+	globint1 = 0
+	globint2 = 0
+	globint3 = 0
+	var save1 byte
+	var save2 int64
+	var save3 int64
+	var save4 int64
+
+	defer func() {
+		if globint1 != 1 {
+			t.Fatal(fmt.Sprintf("globint1:  wanted: 1, got %v", globint1))
+		}
+		if save1 != 5 {
+			t.Fatal(fmt.Sprintf("save1:  wanted: 5, got %v", save1))
+		}
+		if globint2 != 1 {
+			t.Fatal(fmt.Sprintf("globint2:  wanted: 1, got %v", globint2))
+		}
+		if save2 != 2 {
+			t.Fatal(fmt.Sprintf("save2:  wanted: 2, got %v", save2))
+		}
+		if save3 != 4 {
+			t.Fatal(fmt.Sprintf("save3:  wanted: 4, got %v", save3))
+		}
+		if globint3 != 1 {
+			t.Fatal(fmt.Sprintf("globint3:  wanted: 1, got %v", globint3))
+		}
+		if save4 != 4 {
+			t.Fatal(fmt.Sprintf("save4:  wanted: 4, got %v", save4))
+		}
+	}()
+
+	// Test function returning a non-SSAable arg
+	defer func(n nonSSAable) {
+		save1 = n[4]
+	}(mknonSSAable())
+	// Test composite literal that is not SSAable
+	defer func(b bigStruct) {
+		save2 = b.y
+	}(bigStruct{1, 2, 3, 4, 5, sideeffect(6)})
+
+	// Test struct field reference that is non-SSAable
+	foo := containsBigStruct{}
+	foo.element.z = 4
+	defer func(element bigStruct) {
+		save3 = element.z
+	}(foo.element)
+	defer func(element bigStruct) {
+		save4 = element.z
+	}(sideeffect2(foo).element)
+}
+
+//go:noinline
+func doPanic() {
+	panic("Test panic")
+}
+
+func TestDeferForFuncWithNoExit(t *testing.T) {
+	cond := 1
+	defer func() {
+		if cond != 2 {
+			t.Fatal(fmt.Sprintf("cond: wanted 2, got %v", cond))
+		}
+		if recover() != "Test panic" {
+			t.Fatal("Didn't find expected panic")
+		}
+	}()
+	x := 0
+	// Force a stack copy, to make sure that the &cond pointer passed to defer
+	// function is properly updated.
+	growStackIter(&x, 1000)
+	cond = 2
+	doPanic()
+
+	// This function has no exit/return, since it ends with an infinite loop
+	for {
+	}
+}
+
+// Test case approximating issue #37664, where a recursive function (interpreter)
+// may do repeated recovers/re-panics until it reaches the frame where the panic
+// can actually be handled. The recurseFnPanicRec() function is testing that there
+// are no stale defer structs on the defer chain after the interpreter() sequence,
+// by writing a bunch of 0xffffffffs into several recursive stack frames, and then
+// doing a single panic-recover which would invoke any such stale defer structs.
+func TestDeferWithRepeatedRepanics(t *testing.T) {
+	interpreter(0, 6, 2)
+	recurseFnPanicRec(0, 10)
+	interpreter(0, 5, 1)
+	recurseFnPanicRec(0, 10)
+	interpreter(0, 6, 3)
+	recurseFnPanicRec(0, 10)
+}
+
+func interpreter(level int, maxlevel int, rec int) {
+	defer func() {
+		e := recover()
+		if e == nil {
+			return
+		}
+		if level != e.(int) {
+			//fmt.Fprintln(os.Stderr, "re-panicing, level", level)
+			panic(e)
+		}
+		//fmt.Fprintln(os.Stderr, "Recovered, level", level)
+	}()
+	if level+1 < maxlevel {
+		interpreter(level+1, maxlevel, rec)
+	} else {
+		//fmt.Fprintln(os.Stderr, "Initiating panic")
+		panic(rec)
+	}
+}
+
+func recurseFnPanicRec(level int, maxlevel int) {
+	defer func() {
+		recover()
+	}()
+	recurseFn(level, maxlevel)
+}
+
+var saveInt uint32
+
+func recurseFn(level int, maxlevel int) {
+	a := [40]uint32{0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}
+	if level+1 < maxlevel {
+		// Make sure the array a is referenced, so it is not optimized away
+		saveInt = a[4]
+		recurseFn(level+1, maxlevel)
+	} else {
+		panic("recurseFn panic")
+	}
+}
+
+// Try to reproduce issue #37688, where a pointer to an open-coded defer struct is
+// mistakenly held, and that struct keeps a pointer to a stack-allocated defer
+// struct, and that stack-allocated struct gets overwritten or the stack gets
+// moved, so a memory error happens on GC.
+func TestIssue37688(t *testing.T) {
+	for j := 0; j < 10; j++ {
+		g2()
+		g3()
+	}
+}
+
+type foo struct {
+}
+
+//go:noinline
+func (f *foo) method1() {
+}
+
+//go:noinline
+func (f *foo) method2() {
+}
+
+func g2() {
+	var a foo
+	ap := &a
+	// The loop forces this defer to be heap-allocated and the remaining two
+	// to be stack-allocated.
+	for i := 0; i < 1; i++ {
+		defer ap.method1()
+	}
+	defer ap.method2()
+	defer ap.method1()
+	ff1(ap, 1, 2, 3, 4, 5, 6, 7, 8, 9)
+	// Try to get the stack to be moved by growing it too large, so the
+	// existing stack-allocated defers become invalid.
+	rec1(2000)
+}
+
+func g3() {
+	// Mix up the stack layout by adding in an extra function frame
+	g2()
+}
+
+var globstruct struct {
+	a, b, c, d, e, f, g, h, i int
+}
+
+func ff1(ap *foo, a, b, c, d, e, f, g, h, i int) {
+	defer ap.method1()
+
+	// Make a defer that has a very large set of args, hence big size for the
+	// defer record for the open-coded frame (which means it won't use the
+	// defer pool)
+	defer func(ap *foo, a, b, c, d, e, f, g, h, i int) {
+		if v := recover(); v != nil {
+		}
+		globstruct.a = a
+		globstruct.b = b
+		globstruct.c = c
+		globstruct.d = d
+		globstruct.e = e
+		globstruct.f = f
+		globstruct.g = g
+		globstruct.h = h
+	}(ap, a, b, c, d, e, f, g, h, i)
+	panic("ff1 panic")
+}
+
+func rec1(max int) {
+	if max > 0 {
+		rec1(max - 1)
+	}
+}
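
A compact sketch of the distinction these tests exercise (illustrative only): defers at fixed sites can be open-coded into the function's exit paths, while a defer inside a loop must create a defer record per iteration because the pending count is unknown at compile time.

package main

import "fmt"

func work() {
	// A fixed defer site like this one is a candidate for open-coding:
	// the compiler can emit the deferred call inline on every exit path.
	defer fmt.Println("done")

	// A defer in a loop cannot be open-coded, since the number of pending
	// calls is only known at run time; each iteration records a defer.
	for i := 0; i < 3; i++ {
		defer fmt.Println("iteration", i)
	}
}

func main() {
	work()
}
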
diff --git a/src/runtime/defs1_linux.go b/src/runtime/defs1_linux.go
index e136d96..4085d6f 100644
--- a/src/runtime/defs1_linux.go
+++ b/src/runtime/defs1_linux.go
@@ -21,6 +21,7 @@
 
 const (
 	O_RDONLY    = C.O_RDONLY
+	O_NONBLOCK  = C.O_NONBLOCK
 	O_CLOEXEC   = C.O_CLOEXEC
 	SA_RESTORER = C.SA_RESTORER
 )
diff --git a/src/runtime/defs1_netbsd_386.go b/src/runtime/defs1_netbsd_386.go
index 3eae12e..a4548e6 100644
--- a/src/runtime/defs1_netbsd_386.go
+++ b/src/runtime/defs1_netbsd_386.go
@@ -6,6 +6,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs1_netbsd_amd64.go b/src/runtime/defs1_netbsd_amd64.go
index 51d55c9..4b0e79e 100644
--- a/src/runtime/defs1_netbsd_amd64.go
+++ b/src/runtime/defs1_netbsd_amd64.go
@@ -6,6 +6,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs1_netbsd_arm.go b/src/runtime/defs1_netbsd_arm.go
index fadb341..2b5d599 100644
--- a/src/runtime/defs1_netbsd_arm.go
+++ b/src/runtime/defs1_netbsd_arm.go
@@ -6,6 +6,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs1_netbsd_arm64.go b/src/runtime/defs1_netbsd_arm64.go
index 41b7aac..740dc77 100644
--- a/src/runtime/defs1_netbsd_arm64.go
+++ b/src/runtime/defs1_netbsd_arm64.go
@@ -6,6 +6,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x400000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs1_solaris_amd64.go b/src/runtime/defs1_solaris_amd64.go
index 64d51a7..19e8a25 100644
--- a/src/runtime/defs1_solaris_amd64.go
+++ b/src/runtime/defs1_solaris_amd64.go
@@ -8,9 +8,12 @@
 	_EBADF       = 0x9
 	_EFAULT      = 0xe
 	_EAGAIN      = 0xb
+	_EBUSY       = 0x10
+	_ETIME       = 0x3e
 	_ETIMEDOUT   = 0x91
 	_EWOULDBLOCK = 0xb
 	_EINPROGRESS = 0x96
+	_ENOSYS      = 0x59
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -89,6 +92,7 @@
 	_MAXHOSTNAMELEN = 0x100
 
 	_O_NONBLOCK = 0x80
+	_O_CLOEXEC  = 0x800000
 	_FD_CLOEXEC = 0x1
 	_F_GETFL    = 0x3
 	_F_SETFL    = 0x4
@@ -99,7 +103,9 @@
 	_POLLHUP = 0x10
 	_POLLERR = 0x8
 
-	_PORT_SOURCE_FD = 0x4
+	_PORT_SOURCE_FD    = 0x4
+	_PORT_SOURCE_ALERT = 0x5
+	_PORT_ALERT_UPDATE = 0x2
 )
 
 type semt struct {
diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go
index b08c0da..87e19c1 100644
--- a/src/runtime/defs2_linux.go
+++ b/src/runtime/defs2_linux.go
@@ -61,7 +61,7 @@
 	MADV_DONTNEED   = C.MADV_DONTNEED
 	MADV_FREE       = C.MADV_FREE
 	MADV_HUGEPAGE   = C.MADV_HUGEPAGE
-	MADV_NOHUGEPAGE = C.MADV_HNOUGEPAGE
+	MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
 
 	SA_RESTART  = C.SA_RESTART
 	SA_ONSTACK  = C.SA_ONSTACK
diff --git a/src/runtime/defs_aix.go b/src/runtime/defs_aix.go
index bc5101f..23a6cac 100644
--- a/src/runtime/defs_aix.go
+++ b/src/runtime/defs_aix.go
@@ -8,7 +8,7 @@
 Input to cgo -godefs
 GOARCH=ppc64 go tool cgo -godefs defs_aix.go > defs_aix_ppc64_tmp.go
 
-This is only an helper to create defs_aix_ppc64.go
+This is only a helper to create defs_aix_ppc64.go
 Go runtime functions require the "linux" name of fields (ss_sp, si_addr, etc)
 However, AIX structures don't provide such names and must be modified.
 
@@ -123,7 +123,8 @@
 	_ITIMER_VIRTUAL = C.ITIMER_VIRTUAL
 	_ITIMER_PROF    = C.ITIMER_PROF
 
-	_O_RDONLY = C.O_RDONLY
+	_O_RDONLY   = C.O_RDONLY
+	_O_NONBLOCK = C.O_NONBLOCK
 
 	_SS_DISABLE  = C.SS_DISABLE
 	_SI_USER     = C.SI_USER
diff --git a/src/runtime/defs_aix_ppc64.go b/src/runtime/defs_aix_ppc64.go
index dccc3a5..a53fcc5 100644
--- a/src/runtime/defs_aix_ppc64.go
+++ b/src/runtime/defs_aix_ppc64.go
@@ -80,7 +80,8 @@
 	_ITIMER_VIRTUAL = 0x1
 	_ITIMER_PROF    = 0x2
 
-	_O_RDONLY = 0x0
+	_O_RDONLY   = 0x0
+	_O_NONBLOCK = 0x4
 
 	_SS_DISABLE  = 0x2
 	_SI_USER     = 0x0
diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go
index 0cd133f..cc8c475 100644
--- a/src/runtime/defs_darwin.go
+++ b/src/runtime/defs_darwin.go
@@ -8,7 +8,6 @@
 Input to cgo.
 
 GOARCH=amd64 go tool cgo -cdefs defs_darwin.go >defs_darwin_amd64.h
-GOARCH=386 go tool cgo -cdefs defs_darwin.go >defs_darwin_386.h
 */
 
 package runtime
@@ -30,6 +29,7 @@
 const (
 	EINTR     = C.EINTR
 	EFAULT    = C.EFAULT
+	EAGAIN    = C.EAGAIN
 	ETIMEDOUT = C.ETIMEDOUT
 
 	PROT_NONE  = C.PROT_NONE
diff --git a/src/runtime/defs_darwin_386.go b/src/runtime/defs_darwin_386.go
deleted file mode 100644
index 83928e7..0000000
--- a/src/runtime/defs_darwin_386.go
+++ /dev/null
@@ -1,366 +0,0 @@
-// created by cgo -cdefs and then converted to Go
-// cgo -cdefs defs_darwin.go
-
-package runtime
-
-import "unsafe"
-
-const (
-	_EINTR     = 0x4
-	_EFAULT    = 0xe
-	_ETIMEDOUT = 0x3c
-
-	_PROT_NONE  = 0x0
-	_PROT_READ  = 0x1
-	_PROT_WRITE = 0x2
-	_PROT_EXEC  = 0x4
-
-	_MAP_ANON    = 0x1000
-	_MAP_PRIVATE = 0x2
-	_MAP_FIXED   = 0x10
-
-	_MADV_DONTNEED      = 0x4
-	_MADV_FREE          = 0x5
-	_MADV_FREE_REUSABLE = 0x7
-	_MADV_FREE_REUSE    = 0x8
-
-	_SA_SIGINFO   = 0x40
-	_SA_RESTART   = 0x2
-	_SA_ONSTACK   = 0x1
-	_SA_USERTRAMP = 0x100
-	_SA_64REGSET  = 0x200
-
-	_SIGHUP    = 0x1
-	_SIGINT    = 0x2
-	_SIGQUIT   = 0x3
-	_SIGILL    = 0x4
-	_SIGTRAP   = 0x5
-	_SIGABRT   = 0x6
-	_SIGEMT    = 0x7
-	_SIGFPE    = 0x8
-	_SIGKILL   = 0x9
-	_SIGBUS    = 0xa
-	_SIGSEGV   = 0xb
-	_SIGSYS    = 0xc
-	_SIGPIPE   = 0xd
-	_SIGALRM   = 0xe
-	_SIGTERM   = 0xf
-	_SIGURG    = 0x10
-	_SIGSTOP   = 0x11
-	_SIGTSTP   = 0x12
-	_SIGCONT   = 0x13
-	_SIGCHLD   = 0x14
-	_SIGTTIN   = 0x15
-	_SIGTTOU   = 0x16
-	_SIGIO     = 0x17
-	_SIGXCPU   = 0x18
-	_SIGXFSZ   = 0x19
-	_SIGVTALRM = 0x1a
-	_SIGPROF   = 0x1b
-	_SIGWINCH  = 0x1c
-	_SIGINFO   = 0x1d
-	_SIGUSR1   = 0x1e
-	_SIGUSR2   = 0x1f
-
-	_FPE_INTDIV = 0x7
-	_FPE_INTOVF = 0x8
-	_FPE_FLTDIV = 0x1
-	_FPE_FLTOVF = 0x2
-	_FPE_FLTUND = 0x3
-	_FPE_FLTRES = 0x4
-	_FPE_FLTINV = 0x5
-	_FPE_FLTSUB = 0x6
-
-	_BUS_ADRALN = 0x1
-	_BUS_ADRERR = 0x2
-	_BUS_OBJERR = 0x3
-
-	_SEGV_MAPERR = 0x1
-	_SEGV_ACCERR = 0x2
-
-	_ITIMER_REAL    = 0x0
-	_ITIMER_VIRTUAL = 0x1
-	_ITIMER_PROF    = 0x2
-
-	_EV_ADD       = 0x1
-	_EV_DELETE    = 0x2
-	_EV_CLEAR     = 0x20
-	_EV_RECEIPT   = 0x40
-	_EV_ERROR     = 0x4000
-	_EV_EOF       = 0x8000
-	_EVFILT_READ  = -0x1
-	_EVFILT_WRITE = -0x2
-
-	_PTHREAD_CREATE_DETACHED = 0x2
-
-	_F_SETFD    = 0x2
-	_F_GETFL    = 0x3
-	_F_SETFL    = 0x4
-	_FD_CLOEXEC = 0x1
-
-	_O_NONBLOCK = 4
-)
-
-type stackt struct {
-	ss_sp    *byte
-	ss_size  uintptr
-	ss_flags int32
-}
-
-type sigactiont struct {
-	__sigaction_u [4]byte
-	sa_tramp      unsafe.Pointer
-	sa_mask       uint32
-	sa_flags      int32
-}
-
-type usigactiont struct {
-	__sigaction_u [4]byte
-	sa_mask       uint32
-	sa_flags      int32
-}
-
-type siginfo struct {
-	si_signo  int32
-	si_errno  int32
-	si_code   int32
-	si_pid    int32
-	si_uid    uint32
-	si_status int32
-	si_addr   uint32
-	si_value  [4]byte
-	si_band   int32
-	__pad     [7]uint32
-}
-
-type timeval struct {
-	tv_sec  int32
-	tv_usec int32
-}
-
-func (tv *timeval) set_usec(x int32) {
-	tv.tv_usec = x
-}
-
-type itimerval struct {
-	it_interval timeval
-	it_value    timeval
-}
-
-type timespec struct {
-	tv_sec  int32
-	tv_nsec int32
-}
-
-//go:nosplit
-func (ts *timespec) setNsec(ns int64) {
-	ts.tv_sec = timediv(ns, 1e9, &ts.tv_nsec)
-}
-
-type fpcontrol struct {
-	pad_cgo_0 [2]byte
-}
-
-type fpstatus struct {
-	pad_cgo_0 [2]byte
-}
-
-type regmmst struct {
-	mmst_reg  [10]int8
-	mmst_rsrv [6]int8
-}
-
-type regxmm struct {
-	xmm_reg [16]int8
-}
-
-type regs64 struct {
-	rax    uint64
-	rbx    uint64
-	rcx    uint64
-	rdx    uint64
-	rdi    uint64
-	rsi    uint64
-	rbp    uint64
-	rsp    uint64
-	r8     uint64
-	r9     uint64
-	r10    uint64
-	r11    uint64
-	r12    uint64
-	r13    uint64
-	r14    uint64
-	r15    uint64
-	rip    uint64
-	rflags uint64
-	cs     uint64
-	fs     uint64
-	gs     uint64
-}
-
-type floatstate64 struct {
-	fpu_reserved  [2]int32
-	fpu_fcw       fpcontrol
-	fpu_fsw       fpstatus
-	fpu_ftw       uint8
-	fpu_rsrv1     uint8
-	fpu_fop       uint16
-	fpu_ip        uint32
-	fpu_cs        uint16
-	fpu_rsrv2     uint16
-	fpu_dp        uint32
-	fpu_ds        uint16
-	fpu_rsrv3     uint16
-	fpu_mxcsr     uint32
-	fpu_mxcsrmask uint32
-	fpu_stmm0     regmmst
-	fpu_stmm1     regmmst
-	fpu_stmm2     regmmst
-	fpu_stmm3     regmmst
-	fpu_stmm4     regmmst
-	fpu_stmm5     regmmst
-	fpu_stmm6     regmmst
-	fpu_stmm7     regmmst
-	fpu_xmm0      regxmm
-	fpu_xmm1      regxmm
-	fpu_xmm2      regxmm
-	fpu_xmm3      regxmm
-	fpu_xmm4      regxmm
-	fpu_xmm5      regxmm
-	fpu_xmm6      regxmm
-	fpu_xmm7      regxmm
-	fpu_xmm8      regxmm
-	fpu_xmm9      regxmm
-	fpu_xmm10     regxmm
-	fpu_xmm11     regxmm
-	fpu_xmm12     regxmm
-	fpu_xmm13     regxmm
-	fpu_xmm14     regxmm
-	fpu_xmm15     regxmm
-	fpu_rsrv4     [96]int8
-	fpu_reserved1 int32
-}
-
-type exceptionstate64 struct {
-	trapno     uint16
-	cpu        uint16
-	err        uint32
-	faultvaddr uint64
-}
-
-type mcontext64 struct {
-	es exceptionstate64
-	ss regs64
-	fs floatstate64
-}
-
-type regs32 struct {
-	eax    uint32
-	ebx    uint32
-	ecx    uint32
-	edx    uint32
-	edi    uint32
-	esi    uint32
-	ebp    uint32
-	esp    uint32
-	ss     uint32
-	eflags uint32
-	eip    uint32
-	cs     uint32
-	ds     uint32
-	es     uint32
-	fs     uint32
-	gs     uint32
-}
-
-type floatstate32 struct {
-	fpu_reserved  [2]int32
-	fpu_fcw       fpcontrol
-	fpu_fsw       fpstatus
-	fpu_ftw       uint8
-	fpu_rsrv1     uint8
-	fpu_fop       uint16
-	fpu_ip        uint32
-	fpu_cs        uint16
-	fpu_rsrv2     uint16
-	fpu_dp        uint32
-	fpu_ds        uint16
-	fpu_rsrv3     uint16
-	fpu_mxcsr     uint32
-	fpu_mxcsrmask uint32
-	fpu_stmm0     regmmst
-	fpu_stmm1     regmmst
-	fpu_stmm2     regmmst
-	fpu_stmm3     regmmst
-	fpu_stmm4     regmmst
-	fpu_stmm5     regmmst
-	fpu_stmm6     regmmst
-	fpu_stmm7     regmmst
-	fpu_xmm0      regxmm
-	fpu_xmm1      regxmm
-	fpu_xmm2      regxmm
-	fpu_xmm3      regxmm
-	fpu_xmm4      regxmm
-	fpu_xmm5      regxmm
-	fpu_xmm6      regxmm
-	fpu_xmm7      regxmm
-	fpu_rsrv4     [224]int8
-	fpu_reserved1 int32
-}
-
-type exceptionstate32 struct {
-	trapno     uint16
-	cpu        uint16
-	err        uint32
-	faultvaddr uint32
-}
-
-type mcontext32 struct {
-	es exceptionstate32
-	ss regs32
-	fs floatstate32
-}
-
-type ucontext struct {
-	uc_onstack  int32
-	uc_sigmask  uint32
-	uc_stack    stackt
-	uc_link     *ucontext
-	uc_mcsize   uint32
-	uc_mcontext *mcontext32
-}
-
-type keventt struct {
-	ident  uint32
-	filter int16
-	flags  uint16
-	fflags uint32
-	data   int32
-	udata  *byte
-}
-
-type pthread uintptr
-type pthreadattr struct {
-	X__sig    int32
-	X__opaque [36]int8
-}
-type pthreadmutex struct {
-	X__sig    int32
-	X__opaque [40]int8
-}
-type pthreadmutexattr struct {
-	X__sig    int32
-	X__opaque [8]int8
-}
-type pthreadcond struct {
-	X__sig    int32
-	X__opaque [24]int8
-}
-type pthreadcondattr struct {
-	X__sig    int32
-	X__opaque [4]int8
-}
-type machTimebaseInfo struct {
-	numer uint32
-	denom uint32
-}
diff --git a/src/runtime/defs_darwin_amd64.go b/src/runtime/defs_darwin_amd64.go
index 45c34a8..cbc26bf 100644
--- a/src/runtime/defs_darwin_amd64.go
+++ b/src/runtime/defs_darwin_amd64.go
@@ -8,6 +8,7 @@
 const (
 	_EINTR     = 0x4
 	_EFAULT    = 0xe
+	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
 	_PROT_NONE  = 0x0
diff --git a/src/runtime/defs_darwin_arm.go b/src/runtime/defs_darwin_arm.go
deleted file mode 100644
index 5e2af97..0000000
--- a/src/runtime/defs_darwin_arm.go
+++ /dev/null
@@ -1,230 +0,0 @@
-// Note: cgo can't handle some Darwin/ARM structures, so this file can't
-// be auto generated by cgo yet.
-// Created based on output of `cgo -cdefs defs_darwin.go` and Darwin/ARM
-// specific header (mainly mcontext and ucontext related stuff)
-
-package runtime
-
-import "unsafe"
-
-const (
-	_EINTR     = 0x4
-	_EFAULT    = 0xe
-	_ETIMEDOUT = 0x3c
-
-	_PROT_NONE  = 0x0
-	_PROT_READ  = 0x1
-	_PROT_WRITE = 0x2
-	_PROT_EXEC  = 0x4
-
-	_MAP_ANON    = 0x1000
-	_MAP_PRIVATE = 0x2
-	_MAP_FIXED   = 0x10
-
-	_MADV_DONTNEED      = 0x4
-	_MADV_FREE          = 0x5
-	_MADV_FREE_REUSABLE = 0x7
-	_MADV_FREE_REUSE    = 0x8
-
-	_SA_SIGINFO   = 0x40
-	_SA_RESTART   = 0x2
-	_SA_ONSTACK   = 0x1
-	_SA_USERTRAMP = 0x100
-	_SA_64REGSET  = 0x200
-
-	_SIGHUP    = 0x1
-	_SIGINT    = 0x2
-	_SIGQUIT   = 0x3
-	_SIGILL    = 0x4
-	_SIGTRAP   = 0x5
-	_SIGABRT   = 0x6
-	_SIGEMT    = 0x7
-	_SIGFPE    = 0x8
-	_SIGKILL   = 0x9
-	_SIGBUS    = 0xa
-	_SIGSEGV   = 0xb
-	_SIGSYS    = 0xc
-	_SIGPIPE   = 0xd
-	_SIGALRM   = 0xe
-	_SIGTERM   = 0xf
-	_SIGURG    = 0x10
-	_SIGSTOP   = 0x11
-	_SIGTSTP   = 0x12
-	_SIGCONT   = 0x13
-	_SIGCHLD   = 0x14
-	_SIGTTIN   = 0x15
-	_SIGTTOU   = 0x16
-	_SIGIO     = 0x17
-	_SIGXCPU   = 0x18
-	_SIGXFSZ   = 0x19
-	_SIGVTALRM = 0x1a
-	_SIGPROF   = 0x1b
-	_SIGWINCH  = 0x1c
-	_SIGINFO   = 0x1d
-	_SIGUSR1   = 0x1e
-	_SIGUSR2   = 0x1f
-
-	_FPE_INTDIV = 0x7
-	_FPE_INTOVF = 0x8
-	_FPE_FLTDIV = 0x1
-	_FPE_FLTOVF = 0x2
-	_FPE_FLTUND = 0x3
-	_FPE_FLTRES = 0x4
-	_FPE_FLTINV = 0x5
-	_FPE_FLTSUB = 0x6
-
-	_BUS_ADRALN = 0x1
-	_BUS_ADRERR = 0x2
-	_BUS_OBJERR = 0x3
-
-	_SEGV_MAPERR = 0x1
-	_SEGV_ACCERR = 0x2
-
-	_ITIMER_REAL    = 0x0
-	_ITIMER_VIRTUAL = 0x1
-	_ITIMER_PROF    = 0x2
-
-	_EV_ADD       = 0x1
-	_EV_DELETE    = 0x2
-	_EV_CLEAR     = 0x20
-	_EV_RECEIPT   = 0x40
-	_EV_ERROR     = 0x4000
-	_EV_EOF       = 0x8000
-	_EVFILT_READ  = -0x1
-	_EVFILT_WRITE = -0x2
-
-	_PTHREAD_CREATE_DETACHED = 0x2
-
-	_F_SETFD    = 0x2
-	_F_GETFL    = 0x3
-	_F_SETFL    = 0x4
-	_FD_CLOEXEC = 0x1
-
-	_O_NONBLOCK = 4
-)
-
-type stackt struct {
-	ss_sp    *byte
-	ss_size  uintptr
-	ss_flags int32
-}
-
-type sigactiont struct {
-	__sigaction_u [4]byte
-	sa_tramp      unsafe.Pointer
-	sa_mask       uint32
-	sa_flags      int32
-}
-
-type usigactiont struct {
-	__sigaction_u [4]byte
-	sa_mask       uint32
-	sa_flags      int32
-}
-
-type siginfo struct {
-	si_signo  int32
-	si_errno  int32
-	si_code   int32
-	si_pid    int32
-	si_uid    uint32
-	si_status int32
-	si_addr   uint32
-	si_value  [4]byte
-	si_band   int32
-	__pad     [7]uint32
-}
-
-type timeval struct {
-	tv_sec  int32
-	tv_usec int32
-}
-
-func (tv *timeval) set_usec(x int32) {
-	tv.tv_usec = x
-}
-
-type itimerval struct {
-	it_interval timeval
-	it_value    timeval
-}
-
-type timespec struct {
-	tv_sec  int32
-	tv_nsec int32
-}
-
-//go:nosplit
-func (ts *timespec) setNsec(ns int64) {
-	ts.tv_sec = timediv(ns, 1e9, &ts.tv_nsec)
-}
-
-type floatstate32 struct {
-	r     [32]uint32
-	fpscr uint32
-}
-
-type regs32 struct {
-	r    [13]uint32 // r0 to r12
-	sp   uint32     // r13
-	lr   uint32     // r14
-	pc   uint32     // r15
-	cpsr uint32
-}
-
-type exceptionstate32 struct {
-	trapno     uint32 // NOTE: on 386, the trapno field is split into trapno and cpu
-	err        uint32
-	faultvaddr uint32
-}
-
-type mcontext32 struct {
-	es exceptionstate32
-	ss regs32
-	fs floatstate32
-}
-
-type ucontext struct {
-	uc_onstack  int32
-	uc_sigmask  uint32
-	uc_stack    stackt
-	uc_link     *ucontext
-	uc_mcsize   uint32
-	uc_mcontext *mcontext32
-}
-
-type keventt struct {
-	ident  uint32
-	filter int16
-	flags  uint16
-	fflags uint32
-	data   int32
-	udata  *byte
-}
-
-type pthread uintptr
-type pthreadattr struct {
-	X__sig    int32
-	X__opaque [36]int8
-}
-type pthreadmutex struct {
-	X__sig    int32
-	X__opaque [40]int8
-}
-type pthreadmutexattr struct {
-	X__sig    int32
-	X__opaque [8]int8
-}
-type pthreadcond struct {
-	X__sig    int32
-	X__opaque [24]int8
-}
-type pthreadcondattr struct {
-	X__sig    int32
-	X__opaque [4]int8
-}
-
-type machTimebaseInfo struct {
-	numer uint32
-	denom uint32
-}
diff --git a/src/runtime/defs_darwin_arm64.go b/src/runtime/defs_darwin_arm64.go
index f673eb7..2f46604 100644
--- a/src/runtime/defs_darwin_arm64.go
+++ b/src/runtime/defs_darwin_arm64.go
@@ -8,6 +8,7 @@
 const (
 	_EINTR     = 0x4
 	_EFAULT    = 0xe
+	_EAGAIN    = 0x23
 	_ETIMEDOUT = 0x3c
 
 	_PROT_NONE  = 0x0
diff --git a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go
index 53c1508..e196dff 100644
--- a/src/runtime/defs_freebsd.go
+++ b/src/runtime/defs_freebsd.go
@@ -47,6 +47,11 @@
 const (
 	EINTR  = C.EINTR
 	EFAULT = C.EFAULT
+	EAGAIN = C.EAGAIN
+	ENOSYS = C.ENOSYS
+
+	O_NONBLOCK = C.O_NONBLOCK
+	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
 	PROT_READ  = C.PROT_READ
diff --git a/src/runtime/defs_freebsd_386.go b/src/runtime/defs_freebsd_386.go
index c4d5c89..7677554 100644
--- a/src/runtime/defs_freebsd_386.go
+++ b/src/runtime/defs_freebsd_386.go
@@ -15,6 +15,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -121,6 +126,8 @@
 	spare      [3]uintptr
 }
 
+type thread int32 // long
+
 type sigset struct {
 	__bits [4]uint32
 }
diff --git a/src/runtime/defs_freebsd_amd64.go b/src/runtime/defs_freebsd_amd64.go
index 89d36c2..5a83342 100644
--- a/src/runtime/defs_freebsd_amd64.go
+++ b/src/runtime/defs_freebsd_amd64.go
@@ -15,6 +15,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -122,6 +127,8 @@
 	spare      [3]uintptr
 }
 
+type thread int64 // long
+
 type sigset struct {
 	__bits [4]uint32
 }
diff --git a/src/runtime/defs_freebsd_arm.go b/src/runtime/defs_freebsd_arm.go
index cc8c924..b55dfd8 100644
--- a/src/runtime/defs_freebsd_arm.go
+++ b/src/runtime/defs_freebsd_arm.go
@@ -15,6 +15,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x100000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -121,6 +126,8 @@
 	spare      [3]uintptr
 }
 
+type thread int32 // long
+
 type sigset struct {
 	__bits [4]uint32
 }
diff --git a/src/runtime/defs_freebsd_arm64.go b/src/runtime/defs_freebsd_arm64.go
new file mode 100644
index 0000000..5b9d504
--- /dev/null
+++ b/src/runtime/defs_freebsd_arm64.go
@@ -0,0 +1,259 @@
+// created by cgo -cdefs and then converted to Go
+// cgo -cdefs defs_freebsd.go
+
+package runtime
+
+import "unsafe"
+
+const (
+	_NBBY            = 0x8
+	_CTL_MAXNAME     = 0x18
+	_CPU_LEVEL_WHICH = 0x3
+	_CPU_WHICH_PID   = 0x2
+)
+
+const (
+	_EINTR  = 0x4
+	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x100000
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x1000
+	_MAP_SHARED  = 0x1
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_FREE = 0x5
+
+	_SA_SIGINFO = 0x40
+	_SA_RESTART = 0x2
+	_SA_ONSTACK = 0x1
+
+	_CLOCK_MONOTONIC = 0x4
+	_CLOCK_REALTIME  = 0x0
+
+	_UMTX_OP_WAIT_UINT         = 0xb
+	_UMTX_OP_WAIT_UINT_PRIVATE = 0xf
+	_UMTX_OP_WAKE              = 0x3
+	_UMTX_OP_WAKE_PRIVATE      = 0x10
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGEMT    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGBUS    = 0xa
+	_SIGSEGV   = 0xb
+	_SIGSYS    = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGTERM   = 0xf
+	_SIGURG    = 0x10
+	_SIGSTOP   = 0x11
+	_SIGTSTP   = 0x12
+	_SIGCONT   = 0x13
+	_SIGCHLD   = 0x14
+	_SIGTTIN   = 0x15
+	_SIGTTOU   = 0x16
+	_SIGIO     = 0x17
+	_SIGXCPU   = 0x18
+	_SIGXFSZ   = 0x19
+	_SIGVTALRM = 0x1a
+	_SIGPROF   = 0x1b
+	_SIGWINCH  = 0x1c
+	_SIGINFO   = 0x1d
+	_SIGUSR1   = 0x1e
+	_SIGUSR2   = 0x1f
+
+	_FPE_INTDIV = 0x2
+	_FPE_INTOVF = 0x1
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EV_ADD       = 0x1
+	_EV_DELETE    = 0x2
+	_EV_CLEAR     = 0x20
+	_EV_RECEIPT   = 0x40
+	_EV_ERROR     = 0x4000
+	_EV_EOF       = 0x8000
+	_EVFILT_READ  = -0x1
+	_EVFILT_WRITE = -0x2
+)
+
+type rtprio struct {
+	_type uint16
+	prio  uint16
+}
+
+type thrparam struct {
+	start_func uintptr
+	arg        unsafe.Pointer
+	stack_base uintptr
+	stack_size uintptr
+	tls_base   unsafe.Pointer
+	tls_size   uintptr
+	child_tid  unsafe.Pointer // *int64
+	parent_tid *int64
+	flags      int32
+	pad_cgo_0  [4]byte
+	rtp        *rtprio
+	spare      [3]uintptr
+}
+
+type thread int64 // long
+
+type sigset struct {
+	__bits [4]uint32
+}
+
+type stackt struct {
+	ss_sp     uintptr
+	ss_size   uintptr
+	ss_flags  int32
+	pad_cgo_0 [4]byte
+}
+
+type siginfo struct {
+	si_signo  int32
+	si_errno  int32
+	si_code   int32
+	si_pid    int32
+	si_uid    uint32
+	si_status int32
+	si_addr   uint64
+	si_value  [8]byte
+	_reason   [40]byte
+}
+
+type gpregs struct {
+	gp_x    [30]uint64
+	gp_lr   uint64
+	gp_sp   uint64
+	gp_elr  uint64
+	gp_spsr uint32
+	gp_pad  int32
+}
+
+type fpregs struct {
+	fp_q     [64]uint64 // actually [32]uint128
+	fp_sr    uint32
+	fp_cr    uint32
+	fp_flags int32
+	fp_pad   int32
+}
+
+type mcontext struct {
+	mc_gpregs gpregs
+	mc_fpregs fpregs
+	mc_flags  int32
+	mc_pad    int32
+	mc_spare  [8]uint64
+}
+
+type ucontext struct {
+	uc_sigmask  sigset
+	uc_mcontext mcontext
+	uc_link     *ucontext
+	uc_stack    stackt
+	uc_flags    int32
+	__spare__   [4]int32
+	pad_cgo_0   [12]byte
+}
+
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+//go:nosplit
+func (ts *timespec) setNsec(ns int64) {
+	ts.tv_sec = ns / 1e9
+	ts.tv_nsec = ns % 1e9
+}
+
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type umtx_time struct {
+	_timeout timespec
+	_flags   uint32
+	_clockid uint32
+}
+
+type keventt struct {
+	ident  uint64
+	filter int16
+	flags  uint16
+	fflags uint32
+	data   int64
+	udata  *byte
+}
+
+type bintime struct {
+	sec  int64
+	frac uint64
+}
+
+type vdsoTimehands struct {
+	algo         uint32
+	gen          uint32
+	scale        uint64
+	offset_count uint32
+	counter_mask uint32
+	offset       bintime
+	boottime     bintime
+	physical     uint32
+	res          [7]uint32
+}
+
+type vdsoTimekeep struct {
+	ver       uint32
+	enabled   uint32
+	current   uint32
+	pad_cgo_0 [4]byte
+}
+
+const (
+	_VDSO_TK_VER_CURR = 0x1
+
+	vdsoTimehandsSize = 0x58
+	vdsoTimekeepSize  = 0x10
+)
diff --git a/src/runtime/defs_illumos_amd64.go b/src/runtime/defs_illumos_amd64.go
new file mode 100644
index 0000000..9c5413b
--- /dev/null
+++ b/src/runtime/defs_illumos_amd64.go
@@ -0,0 +1,14 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_RCTL_LOCAL_DENY = 0x2
+
+	_RCTL_LOCAL_MAXIMAL = 0x80000000
+
+	_RCTL_FIRST = 0x0
+	_RCTL_NEXT  = 0x1
+)
diff --git a/src/runtime/defs_linux.go b/src/runtime/defs_linux.go
index 2d81013..7b14063 100644
--- a/src/runtime/defs_linux.go
+++ b/src/runtime/defs_linux.go
@@ -37,6 +37,7 @@
 	EINTR  = C.EINTR
 	EAGAIN = C.EAGAIN
 	ENOMEM = C.ENOMEM
+	ENOSYS = C.ENOSYS
 
 	PROT_NONE  = C.PROT_NONE
 	PROT_READ  = C.PROT_READ
@@ -50,7 +51,7 @@
 	MADV_DONTNEED   = C.MADV_DONTNEED
 	MADV_FREE       = C.MADV_FREE
 	MADV_HUGEPAGE   = C.MADV_HUGEPAGE
-	MADV_NOHUGEPAGE = C.MADV_HNOUGEPAGE
+	MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
 
 	SA_RESTART = C.SA_RESTART
 	SA_ONSTACK = C.SA_ONSTACK
diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go
index e2fcbca..f4db8cf 100644
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -78,8 +79,9 @@
 	_ITIMER_VIRTUAL = 0x1
 	_ITIMER_PROF    = 0x2
 
-	_O_RDONLY  = 0x0
-	_O_CLOEXEC = 0x80000
+	_O_RDONLY   = 0x0
+	_O_NONBLOCK = 0x800
+	_O_CLOEXEC  = 0x80000
 
 	_EPOLLIN       = 0x1
 	_EPOLLOUT      = 0x4
@@ -93,7 +95,6 @@
 	_EPOLL_CTL_MOD = 0x3
 
 	_AF_UNIX    = 0x1
-	_F_SETFL    = 0x4
 	_SOCK_DGRAM = 0x2
 )
 
@@ -225,3 +226,14 @@
 	family uint16
 	path   [108]byte
 }
+
+const __NEW_UTS_LEN = 64
+
+type new_utsname struct {
+	sysname    [__NEW_UTS_LEN + 1]byte
+	nodename   [__NEW_UTS_LEN + 1]byte
+	release    [__NEW_UTS_LEN + 1]byte
+	version    [__NEW_UTS_LEN + 1]byte
+	machine    [__NEW_UTS_LEN + 1]byte
+	domainname [__NEW_UTS_LEN + 1]byte
+}
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go
index ddad7fd..8480d85 100644
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -90,7 +91,6 @@
 	_EPOLL_CTL_MOD = 0x3
 
 	_AF_UNIX    = 0x1
-	_F_SETFL    = 0x4
 	_SOCK_DGRAM = 0x2
 )
 
@@ -143,8 +143,9 @@
 // cgo -cdefs defs_linux.go defs1_linux.go
 
 const (
-	_O_RDONLY  = 0x0
-	_O_CLOEXEC = 0x80000
+	_O_RDONLY   = 0x0
+	_O_NONBLOCK = 0x800
+	_O_CLOEXEC  = 0x80000
 )
 
 type usigset struct {
@@ -261,3 +262,14 @@
 	family uint16
 	path   [108]byte
 }
+
+const __NEW_UTS_LEN = 64
+
+type new_utsname struct {
+	sysname    [__NEW_UTS_LEN + 1]byte
+	nodename   [__NEW_UTS_LEN + 1]byte
+	release    [__NEW_UTS_LEN + 1]byte
+	version    [__NEW_UTS_LEN + 1]byte
+	machine    [__NEW_UTS_LEN + 1]byte
+	domainname [__NEW_UTS_LEN + 1]byte
+}
diff --git a/src/runtime/defs_linux_arm.go b/src/runtime/defs_linux_arm.go
index 9d10d66..ea29fd9 100644
--- a/src/runtime/defs_linux_arm.go
+++ b/src/runtime/defs_linux_arm.go
@@ -5,6 +5,7 @@
 	_EINTR  = 0x4
 	_ENOMEM = 0xc
 	_EAGAIN = 0xb
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0
 	_PROT_READ  = 0x1
@@ -71,6 +72,7 @@
 	_ITIMER_PROF    = 0x2
 	_ITIMER_VIRTUAL = 0x1
 	_O_RDONLY       = 0
+	_O_NONBLOCK     = 0x800
 	_O_CLOEXEC      = 0x80000
 
 	_EPOLLIN       = 0x1
@@ -85,7 +87,6 @@
 	_EPOLL_CTL_MOD = 0x3
 
 	_AF_UNIX    = 0x1
-	_F_SETFL    = 0x4
 	_SOCK_DGRAM = 0x2
 )
 
diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go
index b325a22..0690cd3 100644
--- a/src/runtime/defs_linux_arm64.go
+++ b/src/runtime/defs_linux_arm64.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -90,7 +91,6 @@
 	_EPOLL_CTL_MOD = 0x3
 
 	_AF_UNIX    = 0x1
-	_F_SETFL    = 0x4
 	_SOCK_DGRAM = 0x2
 )
 
@@ -144,8 +144,9 @@
 // ../cmd/cgo/cgo -cdefs defs_linux.go defs1_linux.go defs2_linux.go
 
 const (
-	_O_RDONLY  = 0x0
-	_O_CLOEXEC = 0x80000
+	_O_RDONLY   = 0x0
+	_O_NONBLOCK = 0x800
+	_O_CLOEXEC  = 0x80000
 )
 
 type usigset struct {
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go
index a52d0d4..0fb53d5 100644
--- a/src/runtime/defs_linux_mips64x.go
+++ b/src/runtime/defs_linux_mips64x.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x59
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -145,6 +146,7 @@
 
 const (
 	_O_RDONLY    = 0x0
+	_O_NONBLOCK  = 0x80
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_linux_mipsx.go b/src/runtime/defs_linux_mipsx.go
index f3a1dd0..9315ba9 100644
--- a/src/runtime/defs_linux_mipsx.go
+++ b/src/runtime/defs_linux_mipsx.go
@@ -11,6 +11,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x59
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -143,6 +144,7 @@
 
 const (
 	_O_RDONLY    = 0x0
+	_O_NONBLOCK  = 0x80
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_linux_ppc64.go b/src/runtime/defs_linux_ppc64.go
index f438993..90b1dc1 100644
--- a/src/runtime/defs_linux_ppc64.go
+++ b/src/runtime/defs_linux_ppc64.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -145,6 +146,7 @@
 
 const (
 	_O_RDONLY    = 0x0
+	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_linux_ppc64le.go b/src/runtime/defs_linux_ppc64le.go
index f438993..90b1dc1 100644
--- a/src/runtime/defs_linux_ppc64le.go
+++ b/src/runtime/defs_linux_ppc64le.go
@@ -7,6 +7,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -145,6 +146,7 @@
 
 const (
 	_O_RDONLY    = 0x0
+	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_linux_riscv64.go b/src/runtime/defs_linux_riscv64.go
new file mode 100644
index 0000000..60da0fa
--- /dev/null
+++ b/src/runtime/defs_linux_riscv64.go
@@ -0,0 +1,209 @@
+// Generated using cgo, then manually converted into appropriate naming and code
+// for the Go runtime.
+// go tool cgo -godefs defs_linux.go defs1_linux.go defs2_linux.go
+
+package runtime
+
+const (
+	_EINTR  = 0x4
+	_EAGAIN = 0xb
+	_ENOMEM = 0xc
+	_ENOSYS = 0x26
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x20
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_DONTNEED   = 0x4
+	_MADV_FREE       = 0x8
+	_MADV_HUGEPAGE   = 0xe
+	_MADV_NOHUGEPAGE = 0xf
+
+	_SA_RESTART  = 0x10000000
+	_SA_ONSTACK  = 0x8000000
+	_SA_RESTORER = 0x0
+	_SA_SIGINFO  = 0x4
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGBUS    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGUSR1   = 0xa
+	_SIGSEGV   = 0xb
+	_SIGUSR2   = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGSTKFLT = 0x10
+	_SIGCHLD   = 0x11
+	_SIGCONT   = 0x12
+	_SIGSTOP   = 0x13
+	_SIGTSTP   = 0x14
+	_SIGTTIN   = 0x15
+	_SIGTTOU   = 0x16
+	_SIGURG    = 0x17
+	_SIGXCPU   = 0x18
+	_SIGXFSZ   = 0x19
+	_SIGVTALRM = 0x1a
+	_SIGPROF   = 0x1b
+	_SIGWINCH  = 0x1c
+	_SIGIO     = 0x1d
+	_SIGPWR    = 0x1e
+	_SIGSYS    = 0x1f
+
+	_FPE_INTDIV = 0x1
+	_FPE_INTOVF = 0x2
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EPOLLIN       = 0x1
+	_EPOLLOUT      = 0x4
+	_EPOLLERR      = 0x8
+	_EPOLLHUP      = 0x10
+	_EPOLLRDHUP    = 0x2000
+	_EPOLLET       = 0x80000000
+	_EPOLL_CLOEXEC = 0x80000
+	_EPOLL_CTL_ADD = 0x1
+	_EPOLL_CTL_DEL = 0x2
+	_EPOLL_CTL_MOD = 0x3
+)
+
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+//go:nosplit
+func (ts *timespec) setNsec(ns int64) {
+	ts.tv_sec = ns / 1e9
+	ts.tv_nsec = ns % 1e9
+}
+
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type sigactiont struct {
+	sa_handler  uintptr
+	sa_flags    uint64
+	sa_restorer uintptr
+	sa_mask     uint64
+}
+
+type siginfo struct {
+	si_signo int32
+	si_errno int32
+	si_code  int32
+	// below here is a union; si_addr is the only field we use
+	si_addr uint64
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type epollevent struct {
+	events    uint32
+	pad_cgo_0 [4]byte
+	data      [8]byte // unaligned uintptr
+}
+
+const (
+	_O_RDONLY   = 0x0
+	_O_NONBLOCK = 0x800
+	_O_CLOEXEC  = 0x80000
+)
+
+type user_regs_struct struct {
+	pc  uint64
+	ra  uint64
+	sp  uint64
+	gp  uint64
+	tp  uint64
+	t0  uint64
+	t1  uint64
+	t2  uint64
+	s0  uint64
+	s1  uint64
+	a0  uint64
+	a1  uint64
+	a2  uint64
+	a3  uint64
+	a4  uint64
+	a5  uint64
+	a6  uint64
+	a7  uint64
+	s2  uint64
+	s3  uint64
+	s4  uint64
+	s5  uint64
+	s6  uint64
+	s7  uint64
+	s8  uint64
+	s9  uint64
+	s10 uint64
+	s11 uint64
+	t3  uint64
+	t4  uint64
+	t5  uint64
+	t6  uint64
+}
+
+type user_fpregs_struct struct {
+	f [528]byte
+}
+
+type usigset struct {
+	us_x__val [16]uint64
+}
+
+type sigcontext struct {
+	sc_regs   user_regs_struct
+	sc_fpregs user_fpregs_struct
+}
+
+type stackt struct {
+	ss_sp    *byte
+	ss_flags int32
+	ss_size  uintptr
+}
+
+type ucontext struct {
+	uc_flags     uint64
+	uc_link      *ucontext
+	uc_stack     stackt
+	uc_sigmask   usigset
+	uc_x__unused [0]uint8
+	uc_pad_cgo_0 [8]byte
+	uc_mcontext  sigcontext
+}
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go
index 19b99b5..fa289d5 100644
--- a/src/runtime/defs_linux_s390x.go
+++ b/src/runtime/defs_linux_s390x.go
@@ -8,6 +8,7 @@
 	_EINTR  = 0x4
 	_EAGAIN = 0xb
 	_ENOMEM = 0xc
+	_ENOSYS = 0x26
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
@@ -138,6 +139,7 @@
 
 const (
 	_O_RDONLY    = 0x0
+	_O_NONBLOCK  = 0x800
 	_O_CLOEXEC   = 0x80000
 	_SA_RESTORER = 0
 )
diff --git a/src/runtime/defs_nacl_386.go b/src/runtime/defs_nacl_386.go
deleted file mode 100644
index 70dfcf2..0000000
--- a/src/runtime/defs_nacl_386.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package runtime
-
-const (
-	// These values are referred to in the source code
-	// but really don't matter. Even so, use the standard numbers.
-	_SIGQUIT = 3
-	_SIGTRAP = 5
-	_SIGSEGV = 11
-	_SIGPROF = 27
-)
-
-type timespec struct {
-	tv_sec  int64
-	tv_nsec int32
-}
-
-//go:nosplit
-func (ts *timespec) setNsec(ns int64) {
-	ts.tv_sec = int64(timediv(ns, 1e9, &ts.tv_nsec))
-}
-
-type excregs386 struct {
-	eax    uint32
-	ecx    uint32
-	edx    uint32
-	ebx    uint32
-	esp    uint32
-	ebp    uint32
-	esi    uint32
-	edi    uint32
-	eip    uint32
-	eflags uint32
-}
-
-type exccontext struct {
-	size                    uint32
-	portable_context_offset uint32
-	portable_context_size   uint32
-	arch                    uint32
-	regs_size               uint32
-	reserved                [11]uint32
-	regs                    excregs386
-}
-
-type excportablecontext struct {
-	pc uint32
-	sp uint32
-	fp uint32
-}
diff --git a/src/runtime/defs_nacl_amd64p32.go b/src/runtime/defs_nacl_amd64p32.go
deleted file mode 100644
index 3706748..0000000
--- a/src/runtime/defs_nacl_amd64p32.go
+++ /dev/null
@@ -1,71 +0,0 @@
-package runtime
-
-const (
-	// These values are referred to in the source code
-	// but really don't matter. Even so, use the standard numbers.
-	_SIGQUIT = 3
-	_SIGTRAP = 5
-	_SIGSEGV = 11
-	_SIGPROF = 27
-)
-
-type timespec struct {
-	tv_sec  int64
-	tv_nsec int32
-}
-
-//go:nosplit
-func (ts *timespec) setNsec(ns int64) {
-	ts.tv_sec = ns / 1e9
-	ts.tv_nsec = int32(ns % 1e9)
-}
-
-type excregs386 struct {
-	eax    uint32
-	ecx    uint32
-	edx    uint32
-	ebx    uint32
-	esp    uint32
-	ebp    uint32
-	esi    uint32
-	edi    uint32
-	eip    uint32
-	eflags uint32
-}
-
-type excregsamd64 struct {
-	rax    uint64
-	rcx    uint64
-	rdx    uint64
-	rbx    uint64
-	rsp    uint64
-	rbp    uint64
-	rsi    uint64
-	rdi    uint64
-	r8     uint64
-	r9     uint64
-	r10    uint64
-	r11    uint64
-	r12    uint64
-	r13    uint64
-	r14    uint64
-	r15    uint64
-	rip    uint64
-	rflags uint32
-}
-
-type exccontext struct {
-	size                    uint32
-	portable_context_offset uint32
-	portable_context_size   uint32
-	arch                    uint32
-	regs_size               uint32
-	reserved                [11]uint32
-	regs                    excregsamd64
-}
-
-type excportablecontext struct {
-	pc uint32
-	sp uint32
-	fp uint32
-}
diff --git a/src/runtime/defs_nacl_arm.go b/src/runtime/defs_nacl_arm.go
deleted file mode 100644
index 89e539e..0000000
--- a/src/runtime/defs_nacl_arm.go
+++ /dev/null
@@ -1,56 +0,0 @@
-package runtime
-
-const (
-	// These values are referred to in the source code
-	// but really don't matter. Even so, use the standard numbers.
-	_SIGQUIT = 3
-	_SIGTRAP = 5
-	_SIGSEGV = 11
-	_SIGPROF = 27
-)
-
-type timespec struct {
-	tv_sec  int64
-	tv_nsec int32
-}
-
-//go:nosplit
-func (ts *timespec) setNsec(ns int64) {
-	ts.tv_sec = int64(timediv(ns, 1e9, &ts.tv_nsec))
-}
-
-type excregsarm struct {
-	r0   uint32
-	r1   uint32
-	r2   uint32
-	r3   uint32
-	r4   uint32
-	r5   uint32
-	r6   uint32
-	r7   uint32
-	r8   uint32
-	r9   uint32 // the value reported here is undefined.
-	r10  uint32
-	r11  uint32
-	r12  uint32
-	sp   uint32 /* r13 */
-	lr   uint32 /* r14 */
-	pc   uint32 /* r15 */
-	cpsr uint32
-}
-
-type exccontext struct {
-	size                    uint32
-	portable_context_offset uint32
-	portable_context_size   uint32
-	arch                    uint32
-	regs_size               uint32
-	reserved                [11]uint32
-	regs                    excregsarm
-}
-
-type excportablecontext struct {
-	pc uint32
-	sp uint32
-	fp uint32
-}
diff --git a/src/runtime/defs_netbsd.go b/src/runtime/defs_netbsd.go
index 41aa07a..3f5ce5a 100644
--- a/src/runtime/defs_netbsd.go
+++ b/src/runtime/defs_netbsd.go
@@ -32,6 +32,11 @@
 const (
 	EINTR  = C.EINTR
 	EFAULT = C.EFAULT
+	EAGAIN = C.EAGAIN
+	ENOSYS = C.ENOSYS
+
+	O_NONBLOCK = C.O_NONBLOCK
+	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
 	PROT_READ  = C.PROT_READ
diff --git a/src/runtime/defs_openbsd.go b/src/runtime/defs_openbsd.go
index a328d25..4774e36 100644
--- a/src/runtime/defs_openbsd.go
+++ b/src/runtime/defs_openbsd.go
@@ -28,6 +28,11 @@
 const (
 	EINTR  = C.EINTR
 	EFAULT = C.EFAULT
+	EAGAIN = C.EAGAIN
+	ENOSYS = C.ENOSYS
+
+	O_NONBLOCK = C.O_NONBLOCK
+	O_CLOEXEC  = C.O_CLOEXEC
 
 	PROT_NONE  = C.PROT_NONE
 	PROT_READ  = C.PROT_READ
diff --git a/src/runtime/defs_openbsd_386.go b/src/runtime/defs_openbsd_386.go
index 0e59a05..35f2e53 100644
--- a/src/runtime/defs_openbsd_386.go
+++ b/src/runtime/defs_openbsd_386.go
@@ -8,6 +8,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs_openbsd_amd64.go b/src/runtime/defs_openbsd_amd64.go
index 5cefac5..c187a98 100644
--- a/src/runtime/defs_openbsd_amd64.go
+++ b/src/runtime/defs_openbsd_amd64.go
@@ -8,6 +8,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs_openbsd_arm.go b/src/runtime/defs_openbsd_arm.go
index b187e97..170bb38 100644
--- a/src/runtime/defs_openbsd_arm.go
+++ b/src/runtime/defs_openbsd_arm.go
@@ -8,6 +8,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs_openbsd_arm64.go b/src/runtime/defs_openbsd_arm64.go
index 6b9d601..8b8d5cd 100644
--- a/src/runtime/defs_openbsd_arm64.go
+++ b/src/runtime/defs_openbsd_arm64.go
@@ -5,6 +5,11 @@
 const (
 	_EINTR  = 0x4
 	_EFAULT = 0xe
+	_EAGAIN = 0x23
+	_ENOSYS = 0x4e
+
+	_O_NONBLOCK = 0x4
+	_O_CLOEXEC  = 0x10000
 
 	_PROT_NONE  = 0x0
 	_PROT_READ  = 0x1
diff --git a/src/runtime/defs_solaris.go b/src/runtime/defs_solaris.go
index 0638e0b..22df590 100644
--- a/src/runtime/defs_solaris.go
+++ b/src/runtime/defs_solaris.go
@@ -38,9 +38,12 @@
 	EBADF       = C.EBADF
 	EFAULT      = C.EFAULT
 	EAGAIN      = C.EAGAIN
+	EBUSY       = C.EBUSY
+	ETIME       = C.ETIME
 	ETIMEDOUT   = C.ETIMEDOUT
 	EWOULDBLOCK = C.EWOULDBLOCK
 	EINPROGRESS = C.EINPROGRESS
+	ENOSYS      = C.ENOSYS
 
 	PROT_NONE  = C.PROT_NONE
 	PROT_READ  = C.PROT_READ
@@ -118,6 +121,7 @@
 	MAXHOSTNAMELEN = C.MAXHOSTNAMELEN
 
 	O_NONBLOCK = C.O_NONBLOCK
+	O_CLOEXEC  = C.O_CLOEXEC
 	FD_CLOEXEC = C.FD_CLOEXEC
 	F_GETFL    = C.F_GETFL
 	F_SETFL    = C.F_SETFL
@@ -128,7 +132,9 @@
 	POLLHUP = C.POLLHUP
 	POLLERR = C.POLLERR
 
-	PORT_SOURCE_FD = C.PORT_SOURCE_FD
+	PORT_SOURCE_FD    = C.PORT_SOURCE_FD
+	PORT_SOURCE_ALERT = C.PORT_SOURCE_ALERT
+	PORT_ALERT_UPDATE = C.PORT_ALERT_UPDATE
 )
 
 type SemT C.sem_t
diff --git a/src/runtime/defs_windows.go b/src/runtime/defs_windows.go
index 9bd9107..43f358d 100644
--- a/src/runtime/defs_windows.go
+++ b/src/runtime/defs_windows.go
@@ -41,9 +41,13 @@
 	DUPLICATE_SAME_ACCESS   = C.DUPLICATE_SAME_ACCESS
 	THREAD_PRIORITY_HIGHEST = C.THREAD_PRIORITY_HIGHEST
 
-	SIGINT           = C.SIGINT
-	CTRL_C_EVENT     = C.CTRL_C_EVENT
-	CTRL_BREAK_EVENT = C.CTRL_BREAK_EVENT
+	SIGINT              = C.SIGINT
+	SIGTERM             = C.SIGTERM
+	CTRL_C_EVENT        = C.CTRL_C_EVENT
+	CTRL_BREAK_EVENT    = C.CTRL_BREAK_EVENT
+	CTRL_CLOSE_EVENT    = C.CTRL_CLOSE_EVENT
+	CTRL_LOGOFF_EVENT   = C.CTRL_LOGOFF_EVENT
+	CTRL_SHUTDOWN_EVENT = C.CTRL_SHUTDOWN_EVENT
 
 	CONTEXT_CONTROL = C.CONTEXT_CONTROL
 	CONTEXT_FULL    = C.CONTEXT_FULL
diff --git a/src/runtime/defs_windows_386.go b/src/runtime/defs_windows_386.go
index 8c0d6d8..3c5057b 100644
--- a/src/runtime/defs_windows_386.go
+++ b/src/runtime/defs_windows_386.go
@@ -15,9 +15,13 @@
 	_DUPLICATE_SAME_ACCESS   = 0x2
 	_THREAD_PRIORITY_HIGHEST = 0x2
 
-	_SIGINT           = 0x2
-	_CTRL_C_EVENT     = 0x0
-	_CTRL_BREAK_EVENT = 0x1
+	_SIGINT              = 0x2
+	_SIGTERM             = 0xF
+	_CTRL_C_EVENT        = 0x0
+	_CTRL_BREAK_EVENT    = 0x1
+	_CTRL_CLOSE_EVENT    = 0x2
+	_CTRL_LOGOFF_EVENT   = 0x5
+	_CTRL_SHUTDOWN_EVENT = 0x6
 
 	_CONTEXT_CONTROL = 0x10001
 	_CONTEXT_FULL    = 0x10007
diff --git a/src/runtime/defs_windows_amd64.go b/src/runtime/defs_windows_amd64.go
index 42a446d..ebb1506 100644
--- a/src/runtime/defs_windows_amd64.go
+++ b/src/runtime/defs_windows_amd64.go
@@ -15,9 +15,13 @@
 	_DUPLICATE_SAME_ACCESS   = 0x2
 	_THREAD_PRIORITY_HIGHEST = 0x2
 
-	_SIGINT           = 0x2
-	_CTRL_C_EVENT     = 0x0
-	_CTRL_BREAK_EVENT = 0x1
+	_SIGINT              = 0x2
+	_SIGTERM             = 0xF
+	_CTRL_C_EVENT        = 0x0
+	_CTRL_BREAK_EVENT    = 0x1
+	_CTRL_CLOSE_EVENT    = 0x2
+	_CTRL_LOGOFF_EVENT   = 0x5
+	_CTRL_SHUTDOWN_EVENT = 0x6
 
 	_CONTEXT_CONTROL = 0x100001
 	_CONTEXT_FULL    = 0x10000b
diff --git a/src/runtime/defs_windows_arm.go b/src/runtime/defs_windows_arm.go
index 049f5b6..b275b05 100644
--- a/src/runtime/defs_windows_arm.go
+++ b/src/runtime/defs_windows_arm.go
@@ -16,9 +16,13 @@
 	_DUPLICATE_SAME_ACCESS   = 0x2
 	_THREAD_PRIORITY_HIGHEST = 0x2
 
-	_SIGINT           = 0x2
-	_CTRL_C_EVENT     = 0x0
-	_CTRL_BREAK_EVENT = 0x1
+	_SIGINT              = 0x2
+	_SIGTERM             = 0xF
+	_CTRL_C_EVENT        = 0x0
+	_CTRL_BREAK_EVENT    = 0x1
+	_CTRL_CLOSE_EVENT    = 0x2
+	_CTRL_LOGOFF_EVENT   = 0x5
+	_CTRL_SHUTDOWN_EVENT = 0x6
 
 	_CONTEXT_CONTROL = 0x10001
 	_CONTEXT_FULL    = 0x10007
diff --git a/src/runtime/duff_mips64x.s b/src/runtime/duff_mips64x.s
index acf0a4e..c4e04cc 100644
--- a/src/runtime/duff_mips64x.s
+++ b/src/runtime/duff_mips64x.s
@@ -265,7 +265,645 @@
 	ADDV	$8, R1
 	RET
 
-// TODO: Implement runtime·duffcopy.
-TEXT runtime·duffcopy(SB),NOSPLIT|NOFRAME,$0-0
-	MOVV	R0, 2(R0)
+TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
+	MOVV	(R1), R23
+	ADDV	$8, R1
+	MOVV	R23, (R2)
+	ADDV	$8, R2
+
 	RET
diff --git a/src/runtime/duff_ppc64x.s b/src/runtime/duff_ppc64x.s
index 0c62d0a..d6b89ba 100644
--- a/src/runtime/duff_ppc64x.s
+++ b/src/runtime/duff_ppc64x.s
@@ -137,7 +137,5 @@
 	MOVDU	R0, 8(R3)
 	RET
 
-// TODO: Implement runtime·duffcopy.
-TEXT runtime·duffcopy(SB),NOSPLIT|NOFRAME,$0-0
-	MOVD	R0, 0(R0)
-	RET
+TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
+	UNDEF
diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go
index 03208c7..f95ff68 100644
--- a/src/runtime/env_posix.go
+++ b/src/runtime/env_posix.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd js,wasm linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd js,wasm linux netbsd openbsd solaris windows
 
 package runtime
 
diff --git a/src/runtime/error.go b/src/runtime/error.go
index 0085dfc..386569b 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -88,7 +88,7 @@
 	return string(e)
 }
 
-// An boundsError represents a an indexing or slicing operation gone wrong.
+// A boundsError represents an indexing or slicing operation gone wrong.
 type boundsError struct {
 	x int64
 	y int
@@ -185,11 +185,6 @@
 	String() string
 }
 
-func typestring(x interface{}) string {
-	e := efaceOf(&x)
-	return e._type.string()
-}
-
 // printany prints an argument passed to panic.
 // If panic is called with a value that has a String or Error method,
 // it has already been converted into a string by preprintpanics.
@@ -232,7 +227,51 @@
 	case string:
 		print(v)
 	default:
-		print("(", typestring(i), ") ", i)
+		printanycustomtype(i)
+	}
+}
+
+func printanycustomtype(i interface{}) {
+	eface := efaceOf(&i)
+	typestring := eface._type.string()
+
+	switch eface._type.kind {
+	case kindString:
+		print(typestring, `("`, *(*string)(eface.data), `")`)
+	case kindBool:
+		print(typestring, "(", *(*bool)(eface.data), ")")
+	case kindInt:
+		print(typestring, "(", *(*int)(eface.data), ")")
+	case kindInt8:
+		print(typestring, "(", *(*int8)(eface.data), ")")
+	case kindInt16:
+		print(typestring, "(", *(*int16)(eface.data), ")")
+	case kindInt32:
+		print(typestring, "(", *(*int32)(eface.data), ")")
+	case kindInt64:
+		print(typestring, "(", *(*int64)(eface.data), ")")
+	case kindUint:
+		print(typestring, "(", *(*uint)(eface.data), ")")
+	case kindUint8:
+		print(typestring, "(", *(*uint8)(eface.data), ")")
+	case kindUint16:
+		print(typestring, "(", *(*uint16)(eface.data), ")")
+	case kindUint32:
+		print(typestring, "(", *(*uint32)(eface.data), ")")
+	case kindUint64:
+		print(typestring, "(", *(*uint64)(eface.data), ")")
+	case kindUintptr:
+		print(typestring, "(", *(*uintptr)(eface.data), ")")
+	case kindFloat32:
+		print(typestring, "(", *(*float32)(eface.data), ")")
+	case kindFloat64:
+		print(typestring, "(", *(*float64)(eface.data), ")")
+	case kindComplex64:
+		print(typestring, *(*complex64)(eface.data))
+	case kindComplex128:
+		print(typestring, *(*complex128)(eface.data))
+	default:
+		print("(", typestring, ") ", eface.data)
 	}
 }
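// Illustrative sketch (not part of this patch): with printanycustomtype above,
// panicking with a value of a named basic type now prints the dynamic type
// together with the value, rather than the type and an opaque data pointer.
// The type name exitCode below is a hypothetical example:
//
//	package main
//
//	type exitCode int
//
//	func main() {
//		panic(exitCode(3)) // prints approximately: panic: main.exitCode(3)
//	}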
 
diff --git a/src/runtime/export_aix_test.go b/src/runtime/export_aix_test.go
new file mode 100644
index 0000000..162552d
--- /dev/null
+++ b/src/runtime/export_aix_test.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+var Fcntl = syscall_fcntl1
diff --git a/src/runtime/export_darwin_test.go b/src/runtime/export_darwin_test.go
new file mode 100644
index 0000000..e9b6eb3
--- /dev/null
+++ b/src/runtime/export_darwin_test.go
@@ -0,0 +1,13 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+func Fcntl(fd, cmd, arg uintptr) (uintptr, uintptr) {
+	r := fcntl(int32(fd), int32(cmd), int32(arg))
+	if r < 0 {
+		return ^uintptr(0), uintptr(-r)
+	}
+	return uintptr(r), 0
+}
diff --git a/src/runtime/export_debug_test.go b/src/runtime/export_debug_test.go
index e97dd52..ed4242e 100644
--- a/src/runtime/export_debug_test.go
+++ b/src/runtime/export_debug_test.go
@@ -20,7 +20,7 @@
 //
 // On success, InjectDebugCall returns the panic value of fn or nil.
 // If fn did not panic, its results will be available in args.
-func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error) (interface{}, error) {
+func InjectDebugCall(gp *g, fn, args interface{}, tkill func(tid int) error, returnOnUnsafePoint bool) (interface{}, error) {
 	if gp.lockedm == 0 {
 		return nil, plainError("goroutine not locked to thread")
 	}
@@ -48,6 +48,9 @@
 
 	h := new(debugCallHandler)
 	h.gp = gp
+	// gp may not be running right now, but we can still get the M
+	// it will run on since it's locked.
+	h.mp = gp.lockedm.ptr()
 	h.fv, h.argp, h.argSize = fv, argp, argSize
 	h.handleF = h.handle // Avoid allocating closure during signal
 
@@ -64,9 +67,16 @@
 		notetsleepg(&h.done, -1)
 		if h.err != "" {
 			switch h.err {
-			case "retry _Grunnable", "executing on Go runtime stack":
+			case "call not at safe point":
+				if returnOnUnsafePoint {
+					// This is for TestDebugCallUnsafePoint.
+					return nil, h.err
+				}
+				fallthrough
+			case "retry _Grunnable", "executing on Go runtime stack", "call from within the Go runtime":
 				// These are transient states. Try to get out of them.
 				if i < 100 {
+					usleep(100)
 					Gosched()
 					continue
 				}
@@ -79,6 +89,7 @@
 
 type debugCallHandler struct {
 	gp      *g
+	mp      *m
 	fv      *funcval
 	argp    unsafe.Pointer
 	argSize uintptr
@@ -95,8 +106,8 @@
 func (h *debugCallHandler) inject(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
 	switch h.gp.atomicstatus {
 	case _Grunning:
-		if getg().m != h.gp.m {
-			println("trap on wrong M", getg().m, h.gp.m)
+		if getg().m != h.mp {
+			println("trap on wrong M", getg().m, h.mp)
 			return false
 		}
 		// Push current PC on the stack.
@@ -128,8 +139,8 @@
 
 func (h *debugCallHandler) handle(info *siginfo, ctxt *sigctxt, gp2 *g) bool {
 	// Sanity check.
-	if getg().m != h.gp.m {
-		println("trap on wrong M", getg().m, h.gp.m)
+	if getg().m != h.mp {
+		println("trap on wrong M", getg().m, h.mp)
 		return false
 	}
 	f := findfunc(uintptr(ctxt.rip()))
diff --git a/src/runtime/export_futex_test.go b/src/runtime/export_futex_test.go
index 5e27236..a727a93 100644
--- a/src/runtime/export_futex_test.go
+++ b/src/runtime/export_futex_test.go
@@ -6,5 +6,14 @@
 
 package runtime
 
-var Futexsleep = futexsleep
 var Futexwakeup = futexwakeup
+
+//go:nosplit
+func Futexsleep(addr *uint32, val uint32, ns int64) {
+	// Temporarily disable preemption so that a preemption signal
+	// doesn't interrupt the system call.
+	poff := debug.asyncpreemptoff
+	debug.asyncpreemptoff = 1
+	futexsleep(addr, val, ns)
+	debug.asyncpreemptoff = poff
+}
diff --git a/src/runtime/export_linux_test.go b/src/runtime/export_linux_test.go
index c73f2f3..b7c901f 100644
--- a/src/runtime/export_linux_test.go
+++ b/src/runtime/export_linux_test.go
@@ -10,6 +10,9 @@
 
 var NewOSProc0 = newosproc0
 var Mincore = mincore
+var Add = add
+
+type EpollEvent epollevent
 
 func Epollctl(epfd, op, fd int32, ev unsafe.Pointer) int32 {
 	return epollctl(epfd, op, fd, (*epollevent)(ev))
diff --git a/src/runtime/export_mmap_test.go b/src/runtime/export_mmap_test.go
index 6c4a446..aeaf37f 100644
--- a/src/runtime/export_mmap_test.go
+++ b/src/runtime/export_mmap_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 // Export guts for testing.
 
diff --git a/src/runtime/export_solaris_test.go b/src/runtime/export_solaris_test.go
new file mode 100644
index 0000000..e865c77
--- /dev/null
+++ b/src/runtime/export_solaris_test.go
@@ -0,0 +1,9 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+func Fcntl(fd, cmd, arg uintptr) (uintptr, uintptr) {
+	return sysvicall3Err(&libc_fcntl, fd, cmd, arg)
+}
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 6009932..5ab03f3 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -35,9 +35,21 @@
 var Atoi32 = atoi32
 
 var Nanotime = nanotime
+var NetpollBreak = netpollBreak
+var Usleep = usleep
 
+var PhysPageSize = physPageSize
 var PhysHugePageSize = physHugePageSize
 
+var NetpollGenericInit = netpollGenericInit
+
+var ParseRelease = parseRelease
+
+var Memmove = memmove
+var MemclrNoHeapPointers = memclrNoHeapPointers
+
+const PreemptMSupported = preemptMSupported
+
 type LFNode struct {
 	Next    uint64
 	Pushcnt uintptr
@@ -51,6 +63,12 @@
 	return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
 }
 
+func Netpoll(delta int64) {
+	systemstack(func() {
+		netpoll(delta)
+	})
+}
+
 func GCMask(x interface{}) (ret []byte) {
 	systemstack(func() {
 		ret = getgcmask(x)
@@ -246,7 +264,7 @@
 	pagesInUse = uintptr(mheap_.pagesInUse)
 
 	for _, s := range mheap_.allspans {
-		if s.state == mSpanInUse {
+		if s.state.get() == mSpanInUse {
 			counted += s.npages
 		}
 	}
@@ -308,7 +326,7 @@
 
 		// Add up current allocations in spans.
 		for _, s := range mheap_.allspans {
-			if s.state != mSpanInUse {
+			if s.state.get() != mSpanInUse {
 				continue
 			}
 			if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
@@ -341,9 +359,17 @@
 			slow.BySize[i].Frees = bySize[i].Frees
 		}
 
-		for i := mheap_.free.start(0, 0); i.valid(); i = i.next() {
-			slow.HeapReleased += uint64(i.span().released())
+		for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
+			pg := mheap_.pages.chunkOf(i).scavenged.popcntRange(0, pallocChunkPages)
+			slow.HeapReleased += uint64(pg) * pageSize
 		}
+		for _, p := range allp {
+			pg := sys.OnesCount64(p.pcache.scav)
+			slow.HeapReleased += uint64(pg) * pageSize
+		}
+
+		// Unused space in the current arena also counts as released space.
+		slow.HeapReleased += uint64(mheap_.curArena.end - mheap_.curArena.base)
 
 		getg().m.mallocing--
 	})
@@ -457,6 +483,8 @@
 
 type G = g
 
+type Sudog = sudog
+
 func Getg() *G {
 	return getg()
 }
@@ -518,170 +546,6 @@
 	}
 }
 
-// UnscavHugePagesSlow returns the value of mheap_.freeHugePages
-// and the number of unscavenged huge pages calculated by
-// scanning the heap.
-func UnscavHugePagesSlow() (uintptr, uintptr) {
-	var base, slow uintptr
-	// Run on the system stack to avoid deadlock from stack growth
-	// trying to acquire the heap lock.
-	systemstack(func() {
-		lock(&mheap_.lock)
-		base = mheap_.free.unscavHugePages
-		for _, s := range mheap_.allspans {
-			if s.state == mSpanFree && !s.scavenged {
-				slow += s.hugePages()
-			}
-		}
-		unlock(&mheap_.lock)
-	})
-	return base, slow
-}
-
-// Span is a safe wrapper around an mspan, whose memory
-// is managed manually.
-type Span struct {
-	*mspan
-}
-
-func AllocSpan(base, npages uintptr, scavenged bool) Span {
-	var s *mspan
-	systemstack(func() {
-		lock(&mheap_.lock)
-		s = (*mspan)(mheap_.spanalloc.alloc())
-		unlock(&mheap_.lock)
-	})
-	s.init(base, npages)
-	s.scavenged = scavenged
-	return Span{s}
-}
-
-func (s *Span) Free() {
-	systemstack(func() {
-		lock(&mheap_.lock)
-		mheap_.spanalloc.free(unsafe.Pointer(s.mspan))
-		unlock(&mheap_.lock)
-	})
-	s.mspan = nil
-}
-
-func (s Span) Base() uintptr {
-	return s.mspan.base()
-}
-
-func (s Span) Pages() uintptr {
-	return s.mspan.npages
-}
-
-type TreapIterType treapIterType
-
-const (
-	TreapIterScav TreapIterType = TreapIterType(treapIterScav)
-	TreapIterHuge               = TreapIterType(treapIterHuge)
-	TreapIterBits               = treapIterBits
-)
-
-type TreapIterFilter treapIterFilter
-
-func TreapFilter(mask, match TreapIterType) TreapIterFilter {
-	return TreapIterFilter(treapFilter(treapIterType(mask), treapIterType(match)))
-}
-
-func (s Span) MatchesIter(mask, match TreapIterType) bool {
-	return treapFilter(treapIterType(mask), treapIterType(match)).matches(s.treapFilter())
-}
-
-type TreapIter struct {
-	treapIter
-}
-
-func (t TreapIter) Span() Span {
-	return Span{t.span()}
-}
-
-func (t TreapIter) Valid() bool {
-	return t.valid()
-}
-
-func (t TreapIter) Next() TreapIter {
-	return TreapIter{t.next()}
-}
-
-func (t TreapIter) Prev() TreapIter {
-	return TreapIter{t.prev()}
-}
-
-// Treap is a safe wrapper around mTreap for testing.
-//
-// It must never be heap-allocated because mTreap is
-// notinheap.
-//
-//go:notinheap
-type Treap struct {
-	mTreap
-}
-
-func (t *Treap) Start(mask, match TreapIterType) TreapIter {
-	return TreapIter{t.start(treapIterType(mask), treapIterType(match))}
-}
-
-func (t *Treap) End(mask, match TreapIterType) TreapIter {
-	return TreapIter{t.end(treapIterType(mask), treapIterType(match))}
-}
-
-func (t *Treap) Insert(s Span) {
-	// mTreap uses a fixalloc in mheap_ for treapNode
-	// allocation which requires the mheap_ lock to manipulate.
-	// Locking here is safe because the treap itself never allocs
-	// or otherwise ends up grabbing this lock.
-	systemstack(func() {
-		lock(&mheap_.lock)
-		t.insert(s.mspan)
-		unlock(&mheap_.lock)
-	})
-	t.CheckInvariants()
-}
-
-func (t *Treap) Find(npages uintptr) TreapIter {
-	return TreapIter{t.find(npages)}
-}
-
-func (t *Treap) Erase(i TreapIter) {
-	// mTreap uses a fixalloc in mheap_ for treapNode
-	// freeing which requires the mheap_ lock to manipulate.
-	// Locking here is safe because the treap itself never allocs
-	// or otherwise ends up grabbing this lock.
-	systemstack(func() {
-		lock(&mheap_.lock)
-		t.erase(i.treapIter)
-		unlock(&mheap_.lock)
-	})
-	t.CheckInvariants()
-}
-
-func (t *Treap) RemoveSpan(s Span) {
-	// See Erase about locking.
-	systemstack(func() {
-		lock(&mheap_.lock)
-		t.removeSpan(s.mspan)
-		unlock(&mheap_.lock)
-	})
-	t.CheckInvariants()
-}
-
-func (t *Treap) Size() int {
-	i := 0
-	t.mTreap.treap.walkTreap(func(t *treapNode) {
-		i++
-	})
-	return i
-}
-
-func (t *Treap) CheckInvariants() {
-	t.mTreap.treap.walkTreap(checkTreapNode)
-	t.mTreap.treap.validateInvariants()
-}
-
 func RunGetgThreadSwitchTest() {
 	// Test that getg works correctly with thread switch.
 	// With gccgo, if we generate getg inlined, the backend
@@ -715,3 +579,413 @@
 		panic("g1 != g3")
 	}
 }
+
+const (
+	PageSize         = pageSize
+	PallocChunkPages = pallocChunkPages
+	PageAlloc64Bit   = pageAlloc64Bit
+	PallocSumBytes   = pallocSumBytes
+)
+
+// Expose pallocSum for testing.
+type PallocSum pallocSum
+
+func PackPallocSum(start, max, end uint) PallocSum { return PallocSum(packPallocSum(start, max, end)) }
+func (m PallocSum) Start() uint                    { return pallocSum(m).start() }
+func (m PallocSum) Max() uint                      { return pallocSum(m).max() }
+func (m PallocSum) End() uint                      { return pallocSum(m).end() }
+
+// Expose pallocBits for testing.
+type PallocBits pallocBits
+
+func (b *PallocBits) Find(npages uintptr, searchIdx uint) (uint, uint) {
+	return (*pallocBits)(b).find(npages, searchIdx)
+}
+func (b *PallocBits) AllocRange(i, n uint)       { (*pallocBits)(b).allocRange(i, n) }
+func (b *PallocBits) Free(i, n uint)             { (*pallocBits)(b).free(i, n) }
+func (b *PallocBits) Summarize() PallocSum       { return PallocSum((*pallocBits)(b).summarize()) }
+func (b *PallocBits) PopcntRange(i, n uint) uint { return (*pageBits)(b).popcntRange(i, n) }
+
+// SummarizeSlow is a slow but more obviously correct implementation
+// of (*pallocBits).summarize. Used for testing.
+func SummarizeSlow(b *PallocBits) PallocSum {
+	var start, max, end uint
+
+	const N = uint(len(b)) * 64
+	for start < N && (*pageBits)(b).get(start) == 0 {
+		start++
+	}
+	for end < N && (*pageBits)(b).get(N-end-1) == 0 {
+		end++
+	}
+	run := uint(0)
+	for i := uint(0); i < N; i++ {
+		if (*pageBits)(b).get(i) == 0 {
+			run++
+		} else {
+			run = 0
+		}
+		if run > max {
+			max = run
+		}
+	}
+	return PackPallocSum(start, max, end)
+}
+
+// Expose non-trivial helpers for testing.
+func FindBitRange64(c uint64, n uint) uint { return findBitRange64(c, n) }
+
+// Given two PallocBits, returns a set of bit ranges where
+// they differ.
+func DiffPallocBits(a, b *PallocBits) []BitRange {
+	ba := (*pageBits)(a)
+	bb := (*pageBits)(b)
+
+	var d []BitRange
+	base, size := uint(0), uint(0)
+	for i := uint(0); i < uint(len(ba))*64; i++ {
+		if ba.get(i) != bb.get(i) {
+			if size == 0 {
+				base = i
+			}
+			size++
+		} else {
+			if size != 0 {
+				d = append(d, BitRange{base, size})
+			}
+			size = 0
+		}
+	}
+	if size != 0 {
+		d = append(d, BitRange{base, size})
+	}
+	return d
+}
+
+// StringifyPallocBits gets the bits in the bit range r from b,
+// and returns a string containing the bits as ASCII 0 and 1
+// characters.
+func StringifyPallocBits(b *PallocBits, r BitRange) string {
+	str := ""
+	for j := r.I; j < r.I+r.N; j++ {
+		if (*pageBits)(b).get(j) != 0 {
+			str += "1"
+		} else {
+			str += "0"
+		}
+	}
+	return str
+}
+
+// Expose pallocData for testing.
+type PallocData pallocData
+
+func (d *PallocData) FindScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
+	return (*pallocData)(d).findScavengeCandidate(searchIdx, min, max)
+}
+func (d *PallocData) AllocRange(i, n uint) { (*pallocData)(d).allocRange(i, n) }
+func (d *PallocData) ScavengedSetRange(i, n uint) {
+	(*pallocData)(d).scavenged.setRange(i, n)
+}
+func (d *PallocData) PallocBits() *PallocBits {
+	return (*PallocBits)(&(*pallocData)(d).pallocBits)
+}
+func (d *PallocData) Scavenged() *PallocBits {
+	return (*PallocBits)(&(*pallocData)(d).scavenged)
+}
+
+// Expose fillAligned for testing.
+func FillAligned(x uint64, m uint) uint64 { return fillAligned(x, m) }
+
+// Expose pageCache for testing.
+type PageCache pageCache
+
+const PageCachePages = pageCachePages
+
+func NewPageCache(base uintptr, cache, scav uint64) PageCache {
+	return PageCache(pageCache{base: base, cache: cache, scav: scav})
+}
+func (c *PageCache) Empty() bool   { return (*pageCache)(c).empty() }
+func (c *PageCache) Base() uintptr { return (*pageCache)(c).base }
+func (c *PageCache) Cache() uint64 { return (*pageCache)(c).cache }
+func (c *PageCache) Scav() uint64  { return (*pageCache)(c).scav }
+func (c *PageCache) Alloc(npages uintptr) (uintptr, uintptr) {
+	return (*pageCache)(c).alloc(npages)
+}
+func (c *PageCache) Flush(s *PageAlloc) {
+	(*pageCache)(c).flush((*pageAlloc)(s))
+}
+
+// Expose chunk index type.
+type ChunkIdx chunkIdx
+
+// Expose pageAlloc for testing. Note that because pageAlloc is
+// not in the heap, neither is PageAlloc.
+type PageAlloc pageAlloc
+
+func (p *PageAlloc) Alloc(npages uintptr) (uintptr, uintptr) {
+	return (*pageAlloc)(p).alloc(npages)
+}
+func (p *PageAlloc) AllocToCache() PageCache {
+	return PageCache((*pageAlloc)(p).allocToCache())
+}
+func (p *PageAlloc) Free(base, npages uintptr) {
+	(*pageAlloc)(p).free(base, npages)
+}
+func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) {
+	return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end)
+}
+func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
+	pp := (*pageAlloc)(p)
+	systemstack(func() {
+		lock(pp.mheapLock)
+		r = pp.scavenge(nbytes, mayUnlock)
+		unlock(pp.mheapLock)
+	})
+	return
+}
+func (p *PageAlloc) InUse() []AddrRange {
+	ranges := make([]AddrRange, 0, len(p.inUse.ranges))
+	for _, r := range p.inUse.ranges {
+		ranges = append(ranges, AddrRange{
+			Base:  r.base.addr(),
+			Limit: r.limit.addr(),
+		})
+	}
+	return ranges
+}
+
+// Returns nil if the PallocData's L2 is missing.
+func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
+	ci := chunkIdx(i)
+	l2 := (*pageAlloc)(p).chunks[ci.l1()]
+	if l2 == nil {
+		return nil
+	}
+	return (*PallocData)(&l2[ci.l2()])
+}
+
+// AddrRange represents a range over addresses.
+// Specifically, it represents the range [Base, Limit).
+type AddrRange struct {
+	Base, Limit uintptr
+}
+
+// BitRange represents a range over a bitmap.
+type BitRange struct {
+	I, N uint // bit index and length in bits
+}
+
+// NewPageAlloc creates a new page allocator for testing and
+// initializes it with the scav and chunks maps. Each key in these maps
+// represents a chunk index and each value is a series of bit ranges to
+// set within each bitmap's chunk.
+//
+// The initialization of the pageAlloc preserves the invariant that if a
+// scavenged bit is set the alloc bit is necessarily unset, so some
+// of the bits described by scav may be cleared in the final bitmap if
+// ranges in chunks overlap with them.
+//
+// scav is optional, and if nil, the scavenged bitmap will be cleared
+// (as opposed to all 1s, which it usually is). Furthermore, every
+// chunk index in scav must appear in chunks; ones that do not are
+// ignored.
+func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc {
+	p := new(pageAlloc)
+
+	// We've got an entry, so initialize the pageAlloc.
+	p.init(new(mutex), nil)
+	lockInit(p.mheapLock, lockRankMheap)
+	p.test = true
+
+	for i, init := range chunks {
+		addr := chunkBase(chunkIdx(i))
+
+		// Mark the chunk's existence in the pageAlloc.
+		p.grow(addr, pallocChunkBytes)
+
+		// Initialize the bitmap and update pageAlloc metadata.
+		chunk := p.chunkOf(chunkIndex(addr))
+
+		// Clear all the scavenged bits set by the grow call above.
+		chunk.scavenged.clearRange(0, pallocChunkPages)
+
+		// Apply scavenge state if applicable.
+		if scav != nil {
+			if scvg, ok := scav[i]; ok {
+				for _, s := range scvg {
+					// Ignore the case of s.N == 0. setRange doesn't handle
+					// it and it's a no-op anyway.
+					if s.N != 0 {
+						chunk.scavenged.setRange(s.I, s.N)
+					}
+				}
+			}
+		}
+
+		// Apply alloc state.
+		for _, s := range init {
+			// Ignore the case of s.N == 0. allocRange doesn't handle
+			// it and it's a no-op anyway.
+			if s.N != 0 {
+				chunk.allocRange(s.I, s.N)
+			}
+		}
+
+		// Update heap metadata for the allocRange calls above.
+		p.update(addr, pallocChunkPages, false, false)
+	}
+	systemstack(func() {
+		lock(p.mheapLock)
+		p.scavengeStartGen()
+		unlock(p.mheapLock)
+	})
+	return (*PageAlloc)(p)
+}
+
+// FreePageAlloc releases hard OS resources owned by the pageAlloc. Once this
+// is called the pageAlloc may no longer be used. The object itself will be
+// collected by the garbage collector once it is no longer live.
+func FreePageAlloc(pp *PageAlloc) {
+	p := (*pageAlloc)(pp)
+
+	// Free all the mapped space for the summary levels.
+	if pageAlloc64Bit != 0 {
+		for l := 0; l < summaryLevels; l++ {
+			sysFree(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes, nil)
+		}
+	} else {
+		resSize := uintptr(0)
+		for _, s := range p.summary {
+			resSize += uintptr(cap(s)) * pallocSumBytes
+		}
+		sysFree(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize), nil)
+	}
+
+	// Free the mapped space for chunks.
+	for i := range p.chunks {
+		if x := p.chunks[i]; x != nil {
+			p.chunks[i] = nil
+			// This memory comes from sysAlloc and will always be page-aligned.
+			sysFree(unsafe.Pointer(x), unsafe.Sizeof(*p.chunks[0]), nil)
+		}
+	}
+}
+
+// BaseChunkIdx is a convenient chunkIdx value that works on both
+// 64-bit and 32-bit platforms, allowing the tests to share code
+// between the two.
+//
+// This should not be higher than 0x100*pallocChunkBytes to support
+// mips and mipsle, which only have 31-bit address spaces.
+var BaseChunkIdx = ChunkIdx(chunkIndex(((0xc000*pageAlloc64Bit + 0x100*pageAlloc32Bit) * pallocChunkBytes) + arenaBaseOffset*sys.GoosAix))
+
+// PageBase returns an address given a chunk index and a page index
+// relative to that chunk.
+func PageBase(c ChunkIdx, pageIdx uint) uintptr {
+	return chunkBase(chunkIdx(c)) + uintptr(pageIdx)*pageSize
+}
+
+type BitsMismatch struct {
+	Base      uintptr
+	Got, Want uint64
+}
+
+func CheckScavengedBitsCleared(mismatches []BitsMismatch) (n int, ok bool) {
+	ok = true
+
+	// Run on the system stack to avoid stack growth allocation.
+	systemstack(func() {
+		getg().m.mallocing++
+
+		// Lock so that we can safely access the bitmap.
+		lock(&mheap_.lock)
+	chunkLoop:
+		for i := mheap_.pages.start; i < mheap_.pages.end; i++ {
+			chunk := mheap_.pages.chunkOf(i)
+			for j := 0; j < pallocChunkPages/64; j++ {
+				// Run over each 64-bit bitmap section and ensure
+				// scavenged is being cleared properly on allocation.
+				// If a used bit and scavenged bit are both set, that's
+				// an error, and could indicate a larger problem, or
+				// an accounting problem.
+				want := chunk.scavenged[j] &^ chunk.pallocBits[j]
+				got := chunk.scavenged[j]
+				if want != got {
+					ok = false
+					if n >= len(mismatches) {
+						break chunkLoop
+					}
+					mismatches[n] = BitsMismatch{
+						Base: chunkBase(i) + uintptr(j)*64*pageSize,
+						Got:  got,
+						Want: want,
+					}
+					n++
+				}
+			}
+		}
+		unlock(&mheap_.lock)
+
+		getg().m.mallocing--
+	})
+	return
+}
+
+func PageCachePagesLeaked() (leaked uintptr) {
+	stopTheWorld("PageCachePagesLeaked")
+
+	// Walk over destroyed Ps and look for unflushed caches.
+	deadp := allp[len(allp):cap(allp)]
+	for _, p := range deadp {
+		// Since we're going past len(allp) we may see nil Ps.
+		// Just ignore them.
+		if p != nil {
+			leaked += uintptr(sys.OnesCount64(p.pcache.cache))
+		}
+	}
+
+	startTheWorld()
+	return
+}
+
+var Semacquire = semacquire
+var Semrelease1 = semrelease1
+
+func SemNwait(addr *uint32) uint32 {
+	root := semroot(addr)
+	return atomic.Load(&root.nwait)
+}
+
+// MapHashCheck computes the hash of the key k for the map m, twice.
+// Method 1 uses the built-in hasher for the map.
+// Method 2 uses the typehash function (the one used by reflect).
+// Returns the two hash values, which should always be equal.
+func MapHashCheck(m interface{}, k interface{}) (uintptr, uintptr) {
+	// Unpack m.
+	mt := (*maptype)(unsafe.Pointer(efaceOf(&m)._type))
+	mh := (*hmap)(efaceOf(&m).data)
+
+	// Unpack k.
+	kt := efaceOf(&k)._type
+	var p unsafe.Pointer
+	if isDirectIface(kt) {
+		q := efaceOf(&k).data
+		p = unsafe.Pointer(&q)
+	} else {
+		p = efaceOf(&k).data
+	}
+
+	// Compute the hash functions.
+	x := mt.hasher(noescape(p), uintptr(mh.hash0))
+	y := typehash(kt, noescape(p), uintptr(mh.hash0))
+	return x, y
+}
+
+func MSpanCountAlloc(bits []byte) int {
+	s := mspan{
+		nelems:     uintptr(len(bits) * 8),
+		gcmarkBits: (*gcBits)(unsafe.Pointer(&bits[0])),
+	}
+	return s.countAlloc()
+}
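
Note: MapHashCheck above unpacks its interface{} arguments into a type word and a data word (via efaceOf and isDirectIface) before hashing. A minimal user-space sketch of that unpacking, assuming the current two-word eface layout (an implementation detail, not a stable API):

package main

import (
	"fmt"
	"unsafe"
)

// eface mirrors the runtime's empty-interface layout: a type pointer
// followed by a data pointer. This layout is an assumption about the
// current implementation, not a guaranteed API.
type eface struct {
	typ  unsafe.Pointer
	data unsafe.Pointer
}

// unpack returns the type and data words of an empty interface,
// roughly what efaceOf(&i)._type and efaceOf(&i).data give the runtime.
func unpack(i interface{}) (typ, data unsafe.Pointer) {
	e := (*eface)(unsafe.Pointer(&i))
	return e.typ, e.data
}

func main() {
	t1, _ := unpack(int64(1))
	t2, _ := unpack(int64(2))
	fmt.Println(t1 == t2) // true: same dynamic type, so the type words match
}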
diff --git a/src/runtime/export_unix_test.go b/src/runtime/export_unix_test.go
index eecdfb7..621488e 100644
--- a/src/runtime/export_unix_test.go
+++ b/src/runtime/export_unix_test.go
@@ -6,6 +6,13 @@
 
 package runtime
 
+import "unsafe"
+
+var NonblockingPipe = nonblockingPipe
+var Pipe = pipe
+var SetNonblock = setNonblock
+var Closeonexec = closeonexec
+
 func sigismember(mask *sigset, i int) bool {
 	clear := *mask
 	sigdelset(&clear, i)
@@ -17,3 +24,71 @@
 	sigprocmask(_SIG_SETMASK, nil, &sigmask)
 	return sigismember(&sigmask, i)
 }
+
+type M = m
+
+var waitForSigusr1 struct {
+	rdpipe int32
+	wrpipe int32
+	mID    int64
+}
+
+// WaitForSigusr1 blocks until a SIGUSR1 is received. It calls ready
+// when it is set up to receive SIGUSR1. The ready function should
+// cause a SIGUSR1 to be sent. The r and w arguments are a pipe that
+// the signal handler can use to report when the signal is received.
+//
+// Once SIGUSR1 is received, it returns the ID of the current M and
+// the ID of the M the SIGUSR1 was received on. If the caller writes
+// a non-zero byte to w, WaitForSigusr1 returns immediately with -1, -1.
+func WaitForSigusr1(r, w int32, ready func(mp *M)) (int64, int64) {
+	lockOSThread()
+	// Make sure we can receive SIGUSR1.
+	unblocksig(_SIGUSR1)
+
+	waitForSigusr1.rdpipe = r
+	waitForSigusr1.wrpipe = w
+
+	mp := getg().m
+	testSigusr1 = waitForSigusr1Callback
+	ready(mp)
+
+	// Wait for the signal. We use a pipe rather than a note
+	// because write is always async-signal-safe.
+	entersyscallblock()
+	var b byte
+	read(waitForSigusr1.rdpipe, noescape(unsafe.Pointer(&b)), 1)
+	exitsyscall()
+
+	gotM := waitForSigusr1.mID
+	testSigusr1 = nil
+
+	unlockOSThread()
+
+	if b != 0 {
+		// timeout signal from caller
+		return -1, -1
+	}
+	return mp.id, gotM
+}
+
+// waitForSigusr1Callback is called from the signal handler during
+// WaitForSigusr1. It must not have write barriers because there may
+// not be a P.
+//
+//go:nowritebarrierrec
+func waitForSigusr1Callback(gp *g) bool {
+	if gp == nil || gp.m == nil {
+		waitForSigusr1.mID = -1
+	} else {
+		waitForSigusr1.mID = gp.m.id
+	}
+	b := byte(0)
+	write(uintptr(waitForSigusr1.wrpipe), noescape(unsafe.Pointer(&b)), 1)
+	return true
+}
+
+// SendSigusr1 sends SIGUSR1 to mp.
+func SendSigusr1(mp *M) {
+	signalM(mp, _SIGUSR1)
+}
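
Note: a rough user-space analog of the WaitForSigusr1 pattern above, using os/signal instead of the runtime's private testSigusr1 hook (Unix-only; a hedged sketch, not the runtime's mechanism):

package main

import (
	"fmt"
	"os"
	"os/signal"
	"syscall"
)

func main() {
	ch := make(chan os.Signal, 1)
	signal.Notify(ch, syscall.SIGUSR1) // "ready": we can now receive SIGUSR1

	// Stand-in for the ready callback: arrange for SIGUSR1 to be sent.
	if err := syscall.Kill(syscall.Getpid(), syscall.SIGUSR1); err != nil {
		panic(err)
	}

	fmt.Println("received", <-ch) // blocks until the signal arrives
}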
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 2917efe..7316503 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -78,21 +78,6 @@
 	If the line ends with "(forced)", this GC was forced by a
 	runtime.GC() call.
 
-	Setting gctrace to any value > 0 also causes the garbage collector
-	to emit a summary when memory is released back to the system.
-	This process of returning memory to the system is called scavenging.
-	The format of this summary is subject to change.
-	Currently it is:
-		scvg#: # MB released  printed only if non-zero
-		scvg#: inuse: # idle: # sys: # released: # consumed: # (MB)
-	where the fields are as follows:
-		scvg#        the scavenge cycle number, incremented at each scavenge
-		inuse: #     MB used or partially used spans
-		idle: #      MB spans pending scavenging
-		sys: #       MB mapped from the system
-		released: #  MB released to the system
-		consumed: #  MB allocated from the system
-
 	madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED
 	instead of MADV_FREE on Linux when returning memory to the
 	kernel. This is less efficient, but causes RSS numbers to drop
@@ -102,7 +87,7 @@
 	When set to 0 memory profiling is disabled.  Refer to the description of
 	MemProfileRate for the default value.
 
-	invalidptr: defaults to invalidptr=1, causing the garbage collector and stack
+	invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
 	copier to crash the program if an invalid pointer value (for example, 1)
 	is found in a pointer-typed location. Setting invalidptr=0 disables this check.
 	This should only be used as a temporary workaround to diagnose buggy code.
@@ -114,6 +99,20 @@
 
 	scavenge: scavenge=1 enables debugging mode of heap scavenger.
 
+	scavtrace: setting scavtrace=1 causes the runtime to emit a single line to standard
+	error, roughly once per GC cycle, summarizing the amount of work done by the
+	scavenger as well as the total amount of memory returned to the operating system
+	and an estimate of physical memory utilization. The format of this line is subject
+	to change, but currently it is:
+		scav # # KiB work, # KiB total, #% util
+	where the fields are as follows:
+		scav #       the scavenge cycle number
+		# KiB work   the amount of memory returned to the OS since the last line
+		# KiB total  the total amount of memory returned to the OS
+		#% util      the fraction of all unscavenged memory which is in-use
+	If the line ends with "(forced)", then scavenging was forced by a
+	debug.FreeOSMemory() call.
+
 	scheddetail: setting schedtrace=X and scheddetail=1 causes the scheduler to emit
 	detailed multiline info every X milliseconds, describing state of the scheduler,
 	processors, threads and goroutines.
@@ -127,6 +126,13 @@
 	IDs will refer to the ID of the goroutine at the time of creation; it's possible for this
 	ID to be reused for another goroutine. Setting N to 0 will report no ancestry information.
 
+	asyncpreemptoff: asyncpreemptoff=1 disables signal-based
+	asynchronous goroutine preemption. This makes some loops
+	non-preemptible for long periods, which may delay GC and
+	goroutine scheduling. This is useful for debugging GC issues
+	because it also disables the conservative stack scanning used
+	for asynchronously preempted goroutines.
+
 The net, net/http, and crypto/tls packages also refer to debugging variables in GODEBUG.
 See the documentation for those packages for details.
 
@@ -200,7 +206,6 @@
 // directly is discouraged, as is using FuncForPC on any of the
 // returned PCs, since these cannot account for inlining or return
 // program counter adjustment.
-//go:noinline
 func Callers(skip int, pc []uintptr) int {
 	// runtime.callers uses pc.array==nil as a signal
 	// to print a stack trace. Pick off 0-length pc here
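
Note on the scavtrace setting documented above: the trace lines go to standard error of the traced process, so a harness might enable it for a child process along these lines (a hedged sketch; ./myprog is a hypothetical binary):

package main

import (
	"log"
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("./myprog") // hypothetical program to trace
	cmd.Env = append(os.Environ(), "GODEBUG=scavtrace=1")
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr // scav lines appear here, roughly once per GC cycle
	if err := cmd.Run(); err != nil {
		log.Fatal(err)
	}
}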
diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h
index d9a35c5..0fb50dd 100644
--- a/src/runtime/funcdata.h
+++ b/src/runtime/funcdata.h
@@ -17,6 +17,7 @@
 #define FUNCDATA_RegPointerMaps 2
 #define FUNCDATA_StackObjects 3
 #define FUNCDATA_InlTree 4
+#define FUNCDATA_OpenCodedDeferInfo 5 /* info for func with open-coded defers */
 
 // Pseudo-assembly statements.
 
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index d55a934..c5c8a4c 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -6,10 +6,13 @@
 
 import (
 	"fmt"
+	"math/rand"
 	"os"
 	"reflect"
 	"runtime"
 	"runtime/debug"
+	"sort"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"testing"
@@ -21,12 +24,6 @@
 	if os.Getenv("GOGC") == "off" {
 		t.Skip("skipping test; GOGC=off in environment")
 	}
-	if runtime.GOOS == "windows" {
-		t.Skip("skipping test; GOOS=windows http://golang.org/issue/27156")
-	}
-	if runtime.GOOS == "linux" && runtime.GOARCH == "arm64" {
-		t.Skip("skipping test; GOOS=linux GOARCH=arm64 https://github.com/golang/go/issues/27636")
-	}
 	got := runTestProg(t, "testprog", "GCSys")
 	want := "OK\n"
 	if got != want {
@@ -196,6 +193,15 @@
 	}
 }
 
+func TestGcZombieReporting(t *testing.T) {
+	// This test is somewhat sensitive to how the allocator works.
+	got := runTestProg(t, "testprog", "GCZombie")
+	want := "found pointer to free object"
+	if !strings.Contains(got, want) {
+		t.Fatalf("expected %q in output, but got %q", want, got)
+	}
+}
+
 func BenchmarkSetTypePtr(b *testing.B) {
 	benchSetType(b, new(*byte))
 }
@@ -470,25 +476,6 @@
 	}
 }
 
-func TestUnscavHugePages(t *testing.T) {
-	// Allocate 20 MiB and immediately free it a few times to increase
-	// the chance that unscavHugePages isn't zero and that some kind of
-	// accounting had to happen in the runtime.
-	for j := 0; j < 3; j++ {
-		var large [][]byte
-		for i := 0; i < 5; i++ {
-			large = append(large, make([]byte, runtime.PhysHugePageSize))
-		}
-		runtime.KeepAlive(large)
-		runtime.GC()
-	}
-	base, slow := runtime.UnscavHugePagesSlow()
-	if base != slow {
-		logDiff(t, "unscavHugePages", reflect.ValueOf(base), reflect.ValueOf(slow))
-		t.Fatal("unscavHugePages mismatch")
-	}
-}
-
 func logDiff(t *testing.T, prefix string, got, want reflect.Value) {
 	typ := got.Type()
 	switch typ.Kind() {
@@ -531,6 +518,90 @@
 	hugeSink = nil
 }
 
+func BenchmarkReadMemStatsLatency(b *testing.B) {
+	// We’ll apply load to the runtime with maxProcs-1 goroutines
+	// and use one more to actually benchmark. It doesn't make sense
+	// to try to run this test with only 1 P (that's what
+	// BenchmarkReadMemStats is for).
+	maxProcs := runtime.GOMAXPROCS(-1)
+	if maxProcs == 1 {
+		b.Skip("This benchmark can only be run with GOMAXPROCS > 1")
+	}
+
+	// Code to build a big tree with lots of pointers.
+	type node struct {
+		children [16]*node
+	}
+	var buildTree func(depth int) *node
+	buildTree = func(depth int) *node {
+		tree := new(node)
+		if depth != 0 {
+			for i := range tree.children {
+				tree.children[i] = buildTree(depth - 1)
+			}
+		}
+		return tree
+	}
+
+	// Keep the GC busy by continuously generating large trees.
+	done := make(chan struct{})
+	var wg sync.WaitGroup
+	for i := 0; i < maxProcs-1; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			var hold *node
+		loop:
+			for {
+				hold = buildTree(5)
+				select {
+				case <-done:
+					break loop
+				default:
+				}
+			}
+			runtime.KeepAlive(hold)
+		}()
+	}
+
+	// Collect the latency measurements here.
+	latencies := make([]time.Duration, 0, 1024)
+
+	// Run for b.N iterations, hitting ReadMemStats continuously
+	// and measuring the latency.
+	b.ResetTimer()
+	var ms runtime.MemStats
+	for i := 0; i < b.N; i++ {
+		// Sleep for a bit, otherwise we're just going to keep
+		// stopping the world and no one will get to do anything.
+		time.Sleep(100 * time.Millisecond)
+		start := time.Now()
+		runtime.ReadMemStats(&ms)
+		latencies = append(latencies, time.Now().Sub(start))
+	}
+	close(done)
+	// Make sure to stop the timer before we wait! The goroutines above
+	// are very heavy-weight and not easy to stop, so we could end up
+	// confusing the benchmarking framework for small b.N.
+	b.StopTimer()
+	wg.Wait()
+
+	// Disable the default */op metrics.
+	// ns/op doesn't mean anything because it's an average, but we
+	// have a sleep in our b.N loop above which skews this significantly.
+	b.ReportMetric(0, "ns/op")
+	b.ReportMetric(0, "B/op")
+	b.ReportMetric(0, "allocs/op")
+
+	// Sort latencies then report percentiles.
+	sort.Slice(latencies, func(i, j int) bool {
+		return latencies[i] < latencies[j]
+	})
+	b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
+	b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
+	b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
+}
+
 func TestUserForcedGC(t *testing.T) {
 	// Test that runtime.GC() triggers a GC even if GOGC=off.
 	defer debug.SetGCPercent(debug.SetGCPercent(-1))
@@ -691,6 +762,24 @@
 	close(teardown)
 }
 
+func BenchmarkMSpanCountAlloc(b *testing.B) {
+	// n is the number of bytes to benchmark against.
+	// n must always be a multiple of 8, since gcBits is
+	// always rounded up to 8 bytes.
+	for _, n := range []int{8, 16, 32, 64, 128} {
+		b.Run(fmt.Sprintf("bits=%d", n*8), func(b *testing.B) {
+			// Initialize a new byte slice with pseudo-random data.
+			bits := make([]byte, n)
+			rand.Read(bits)
+
+			b.ResetTimer()
+			for i := 0; i < b.N; i++ {
+				runtime.MSpanCountAlloc(bits)
+			}
+		})
+	}
+}
+
 func countpwg(n *int, ready *sync.WaitGroup, teardown chan bool) {
 	if *n == 0 {
 		ready.Done()
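
Note: the percentile reporting at the end of BenchmarkReadMemStatsLatency above boils down to sorting the samples and indexing at len*p/100; a small self-contained sketch of that helper (assuming a non-empty sample slice):

package main

import (
	"fmt"
	"sort"
	"time"
)

// percentile returns the p-th percentile of the samples using the same
// index arithmetic as the benchmark above. It assumes len(samples) > 0
// and 0 <= p <= 99.
func percentile(samples []time.Duration, p int) time.Duration {
	sorted := append([]time.Duration(nil), samples...)
	sort.Slice(sorted, func(i, j int) bool { return sorted[i] < sorted[j] })
	return sorted[len(sorted)*p/100]
}

func main() {
	samples := []time.Duration{
		3 * time.Millisecond, 1 * time.Millisecond,
		2 * time.Millisecond, 40 * time.Millisecond,
	}
	fmt.Println(percentile(samples, 50), percentile(samples, 90), percentile(samples, 99))
}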
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index 0741f63..ec1ba90 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -179,7 +179,7 @@
 			typeScalar, typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64
 			typePointer, typeScalar, // i string
 		}
-	case "arm64", "amd64", "mips64", "mips64le", "ppc64", "ppc64le", "s390x", "wasm":
+	case "arm64", "amd64", "mips64", "mips64le", "ppc64", "ppc64le", "riscv64", "s390x", "wasm":
 		return []byte{
 			typePointer,                        // q *int
 			typeScalar, typeScalar, typeScalar, // w byte; e [17]byte
@@ -187,14 +187,6 @@
 			typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64
 			typePointer, typeScalar, // i string
 		}
-	case "amd64p32":
-		return []byte{
-			typePointer,                                                // q *int
-			typeScalar, typeScalar, typeScalar, typeScalar, typeScalar, // w byte; e [17]byte
-			typePointer, typeScalar, typeScalar, // r []byte
-			typeScalar, typeScalar, typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64
-			typePointer, typeScalar, // i string
-		}
 	default:
 		panic("unknown arch")
 	}
diff --git a/src/runtime/go_tls.h b/src/runtime/go_tls.h
index 61f7dbe..a47e798 100644
--- a/src/runtime/go_tls.h
+++ b/src/runtime/go_tls.h
@@ -11,11 +11,6 @@
 #define	g(r)	0(r)(TLS*1)
 #endif
 
-#ifdef GOARCH_amd64p32
-#define	get_tls(r)	MOVL TLS, r
-#define	g(r)	0(r)(TLS*1)
-#endif
-
 #ifdef GOARCH_386
 #define	get_tls(r)	MOVL TLS, r
 #define	g(r)	0(r)(TLS*1)
diff --git a/src/runtime/hash32.go b/src/runtime/hash32.go
index 5574923..966f70e 100644
--- a/src/runtime/hash32.go
+++ b/src/runtime/hash32.go
@@ -20,10 +20,7 @@
 	m4 = 2336365089
 )
 
-func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
-	if GOARCH == "386" && GOOS != "nacl" && useAeshash {
-		return aeshash(p, seed, s)
-	}
+func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
 	h := uint32(seed + s*hashkey[0])
 tail:
 	switch {
@@ -81,7 +78,7 @@
 	return uintptr(h)
 }
 
-func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
+func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
 	h := uint32(seed + 4*hashkey[0])
 	h ^= readUnaligned32(p)
 	h = rotl_15(h*m1) * m2
@@ -93,7 +90,7 @@
 	return uintptr(h)
 }
 
-func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
+func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
 	h := uint32(seed + 8*hashkey[0])
 	h ^= readUnaligned32(p)
 	h = rotl_15(h*m1) * m2
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index c3f2b9b..d128382 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -6,7 +6,7 @@
 //   xxhash: https://code.google.com/p/xxhash/
 // cityhash: https://code.google.com/p/cityhash/
 
-// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x wasm
+// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm
 
 package runtime
 
@@ -20,11 +20,7 @@
 	m4 = 15839092249703872147
 )
 
-func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
-	if (GOARCH == "amd64" || GOARCH == "arm64") &&
-		GOOS != "nacl" && useAeshash {
-		return aeshash(p, seed, s)
-	}
+func memhashFallback(p unsafe.Pointer, seed, s uintptr) uintptr {
 	h := uint64(seed + s*hashkey[0])
 tail:
 	switch {
@@ -82,7 +78,7 @@
 	return uintptr(h)
 }
 
-func memhash32(p unsafe.Pointer, seed uintptr) uintptr {
+func memhash32Fallback(p unsafe.Pointer, seed uintptr) uintptr {
 	h := uint64(seed + 4*hashkey[0])
 	v := uint64(readUnaligned32(p))
 	h ^= v
@@ -94,7 +90,7 @@
 	return uintptr(h)
 }
 
-func memhash64(p unsafe.Pointer, seed uintptr) uintptr {
+func memhash64Fallback(p unsafe.Pointer, seed uintptr) uintptr {
 	h := uint64(seed + 8*hashkey[0])
 	h ^= uint64(readUnaligned32(p)) | uint64(readUnaligned32(add(p, 4)))<<32
 	h = rotl_31(h*m1) * m2
diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go
index fe25a7f..655ca18 100644
--- a/src/runtime/hash_test.go
+++ b/src/runtime/hash_test.go
@@ -8,6 +8,7 @@
 	"fmt"
 	"math"
 	"math/rand"
+	"reflect"
 	. "runtime"
 	"strings"
 	"testing"
@@ -48,6 +49,54 @@
 	}
 }
 
+func TestCompilerVsRuntimeHash(t *testing.T) {
+	// Test to make sure the compiler's hash function and the runtime's hash function agree.
+	// See issue 37716.
+	for _, m := range []interface{}{
+		map[bool]int{},
+		map[int8]int{},
+		map[uint8]int{},
+		map[int16]int{},
+		map[uint16]int{},
+		map[int32]int{},
+		map[uint32]int{},
+		map[int64]int{},
+		map[uint64]int{},
+		map[int]int{},
+		map[uint]int{},
+		map[uintptr]int{},
+		map[*byte]int{},
+		map[chan int]int{},
+		map[unsafe.Pointer]int{},
+		map[float32]int{},
+		map[float64]int{},
+		map[complex64]int{},
+		map[complex128]int{},
+		map[string]int{},
+		//map[interface{}]int{},
+		//map[interface{F()}]int{},
+		map[[8]uint64]int{},
+		map[[8]string]int{},
+		map[struct{ a, b, c, d int32 }]int{}, // Note: tests AMEM128
+		map[struct{ a, b, _, d int32 }]int{},
+		map[struct {
+			a, b int32
+			c    float32
+			d, e [8]byte
+		}]int{},
+		map[struct {
+			a int16
+			b int64
+		}]int{},
+	} {
+		k := reflect.New(reflect.TypeOf(m).Key()).Elem().Interface() // the zero key
+		x, y := MapHashCheck(m, k)
+		if x != y {
+			t.Errorf("hashes did not match (%x vs %x) for map %T", x, y, m)
+		}
+	}
+}
+
 // Smhasher is a torture test for hash functions.
 // https://code.google.com/p/smhasher/
 // This code is a port of some of the Smhasher tests to Go.
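
Note: the zero-key construction in TestCompilerVsRuntimeHash above (reflect.New(...).Elem().Interface()) works for any map key type; a minimal standalone illustration:

package main

import (
	"fmt"
	"reflect"
)

func main() {
	m := map[[8]uint64]int{}
	// Build the zero value of the map's key type as an interface{}.
	k := reflect.New(reflect.TypeOf(m).Key()).Elem().Interface()
	fmt.Printf("%T %v\n", k, k) // [8]uint64 with all elements zero
}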
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 992df63..cfd5c25 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -371,7 +371,12 @@
 		dumpint(uint64(d.sp))
 		dumpint(uint64(d.pc))
 		dumpint(uint64(uintptr(unsafe.Pointer(d.fn))))
-		dumpint(uint64(uintptr(unsafe.Pointer(d.fn.fn))))
+		if d.fn == nil {
+			// d.fn can be nil for open-coded defers
+			dumpint(uint64(0))
+		} else {
+			dumpint(uint64(uintptr(unsafe.Pointer(d.fn.fn))))
+		}
 		dumpint(uint64(uintptr(unsafe.Pointer(d.link))))
 	}
 	for p := gp._panic; p != nil; p = p.link {
@@ -430,7 +435,7 @@
 
 	// mspan.types
 	for _, s := range mheap_.allspans {
-		if s.state == mSpanInUse {
+		if s.state.get() == mSpanInUse {
 			// Finalizers
 			for sp := s.specials; sp != nil; sp = sp.next {
 				if sp.kind != _KindSpecialFinalizer {
@@ -453,7 +458,7 @@
 
 func dumpobjs() {
 	for _, s := range mheap_.allspans {
-		if s.state != mSpanInUse {
+		if s.state.get() != mSpanInUse {
 			continue
 		}
 		p := s.base()
@@ -616,7 +621,7 @@
 func dumpmemprof() {
 	iterate_memprof(dumpmemprof_callback)
 	for _, s := range mheap_.allspans {
-		if s.state != mSpanInUse {
+		if s.state.get() != mSpanInUse {
 			continue
 		}
 		for sp := s.specials; sp != nil; sp = sp.next {
@@ -637,7 +642,7 @@
 func mdump() {
 	// make sure we're done sweeping
 	for _, s := range mheap_.allspans {
-		if s.state == mSpanInUse {
+		if s.state.get() == mSpanInUse {
 			s.ensureSwept()
 		}
 	}
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index bb4eccc..0504b89 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -66,6 +66,12 @@
 	m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*sys.PtrSize, 0, &memstats.other_sys))
 	m.inter = inter
 	m._type = typ
+	// The hash is used in type switches. However, the compiler statically generates itab's
+	// for all interface/type pairs used in switches (which are added to itabTable
+	// in itabsinit). The dynamically-generated itab's never participate in type switches,
+	// and thus the hash is irrelevant.
+	// Note: m.hash is _not_ the hash used for the runtime itabTable hash table.
+	m.hash = 0
 	m.init()
 	itabAdd(m)
 	unlock(&itabLock)
@@ -233,11 +239,11 @@
 		return iname
 	}
 	m.fun[0] = uintptr(fun0)
-	m.hash = typ.hash
 	return ""
 }
 
 func itabsinit() {
+	lockInit(&itabLock, lockRankItab)
 	lock(&itabLock)
 	for _, md := range activeModules() {
 		for _, i := range md.itablinks {
@@ -295,11 +301,11 @@
 	stringEface interface{} = stringInterfacePtr("")
 	sliceEface  interface{} = sliceInterfacePtr(nil)
 
-	uint16Type *_type = (*eface)(unsafe.Pointer(&uint16Eface))._type
-	uint32Type *_type = (*eface)(unsafe.Pointer(&uint32Eface))._type
-	uint64Type *_type = (*eface)(unsafe.Pointer(&uint64Eface))._type
-	stringType *_type = (*eface)(unsafe.Pointer(&stringEface))._type
-	sliceType  *_type = (*eface)(unsafe.Pointer(&sliceEface))._type
+	uint16Type *_type = efaceOf(&uint16Eface)._type
+	uint32Type *_type = efaceOf(&uint32Eface)._type
+	uint64Type *_type = efaceOf(&uint64Eface)._type
+	stringType *_type = efaceOf(&stringEface)._type
+	sliceType  *_type = efaceOf(&sliceEface)._type
 )
 
 // The conv and assert functions below do very similar things.
@@ -326,8 +332,11 @@
 }
 
 func convT16(val uint16) (x unsafe.Pointer) {
-	if val == 0 {
-		x = unsafe.Pointer(&zeroVal[0])
+	if val < uint16(len(staticuint64s)) {
+		x = unsafe.Pointer(&staticuint64s[val])
+		if sys.BigEndian {
+			x = add(x, 6)
+		}
 	} else {
 		x = mallocgc(2, uint16Type, false)
 		*(*uint16)(x) = val
@@ -336,8 +345,11 @@
 }
 
 func convT32(val uint32) (x unsafe.Pointer) {
-	if val == 0 {
-		x = unsafe.Pointer(&zeroVal[0])
+	if val < uint32(len(staticuint64s)) {
+		x = unsafe.Pointer(&staticuint64s[val])
+		if sys.BigEndian {
+			x = add(x, 4)
+		}
 	} else {
 		x = mallocgc(4, uint32Type, false)
 		*(*uint32)(x) = val
@@ -346,8 +358,8 @@
 }
 
 func convT64(val uint64) (x unsafe.Pointer) {
-	if val == 0 {
-		x = unsafe.Pointer(&zeroVal[0])
+	if val < uint64(len(staticuint64s)) {
+		x = unsafe.Pointer(&staticuint64s[val])
 	} else {
 		x = mallocgc(8, uint64Type, false)
 		*(*uint64)(x) = val
@@ -516,8 +528,8 @@
 	}
 }
 
-// staticbytes is used to avoid convT2E for byte-sized values.
-var staticbytes = [...]byte{
+// staticuint64s is used to avoid allocating in convTx for small integer values.
+var staticuint64s = [...]uint64{
 	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
 	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
 	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
diff --git a/src/runtime/iface_test.go b/src/runtime/iface_test.go
index 6d8f861..4fab6c9 100644
--- a/src/runtime/iface_test.go
+++ b/src/runtime/iface_test.go
@@ -95,6 +95,19 @@
 	}
 }
 
+func BenchmarkConvT2EByteSized(b *testing.B) {
+	b.Run("bool", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			e = yes
+		}
+	})
+	b.Run("uint8", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			e = eight8
+		}
+	})
+}
+
 func BenchmarkConvT2ESmall(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		e = ts
@@ -310,18 +323,22 @@
 var (
 	eight8  uint8 = 8
 	eight8I T8    = 8
+	yes     bool  = true
 
-	zero16  uint16 = 0
-	zero16I T16    = 0
-	one16   uint16 = 1
+	zero16     uint16 = 0
+	zero16I    T16    = 0
+	one16      uint16 = 1
+	thousand16 uint16 = 1000
 
-	zero32  uint32 = 0
-	zero32I T32    = 0
-	one32   uint32 = 1
+	zero32     uint32 = 0
+	zero32I    T32    = 0
+	one32      uint32 = 1
+	thousand32 uint32 = 1000
 
-	zero64  uint64 = 0
-	zero64I T64    = 0
-	one64   uint64 = 1
+	zero64     uint64 = 0
+	zero64I    T64    = 0
+	one64      uint64 = 1
+	thousand64 uint64 = 1000
 
 	zerostr  string = ""
 	zerostrI Tstr   = ""
@@ -369,6 +386,23 @@
 		})
 	})
 	b.Run("nonzero", func(b *testing.B) {
+		b.Run("str", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				e = nzstr
+			}
+		})
+		b.Run("slice", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				e = nzslice
+			}
+		})
+		b.Run("big", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				e = nzbig
+			}
+		})
+	})
+	b.Run("smallint", func(b *testing.B) {
 		b.Run("16", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
 				e = one16
@@ -384,19 +418,21 @@
 				e = one64
 			}
 		})
-		b.Run("str", func(b *testing.B) {
+	})
+	b.Run("largeint", func(b *testing.B) {
+		b.Run("16", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
-				e = nzstr
+				e = thousand16
 			}
 		})
-		b.Run("slice", func(b *testing.B) {
+		b.Run("32", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
-				e = nzslice
+				e = thousand32
 			}
 		})
-		b.Run("big", func(b *testing.B) {
+		b.Run("64", func(b *testing.B) {
 			for i := 0; i < b.N; i++ {
-				e = nzbig
+				e = thousand64
 			}
 		})
 	})
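
Note: the smallint/largeint split in the benchmarks above reflects the new staticuint64s table in iface.go: boxing a non-constant integer below 256 into an interface can reuse a static cell, while larger values still allocate. A hedged way to observe this (exact counts depend on compiler optimizations):

package main

import (
	"fmt"
	"testing"
)

var sink interface{}

// Package-level variables so the conversions below are not constant-folded.
var small, large uint64 = 7, 1000

func main() {
	a := testing.AllocsPerRun(1000, func() { sink = small }) // expected ~0: hits staticuint64s
	b := testing.AllocsPerRun(1000, func() { sink = large }) // expected ~1: falls back to mallocgc
	fmt.Printf("small: %.0f allocs/op, large: %.0f allocs/op\n", a, b)
}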
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index 13289a8..9b9dc14 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -229,3 +229,9 @@
 	LOCK
 	ANDB	BX, (AX)
 	RET
+
+TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), BX
+	MOVB	val+4(FP), AX
+	XCHGB	AX, 0(BX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
index e18aee7..90c5642 100644
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -136,6 +136,12 @@
 TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
 	JMP	runtime∕internal∕atomic·Store(SB)
 
+TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
+	MOVQ	ptr+0(FP), BX
+	MOVB	val+8(FP), AX
+	XCHGB	AX, 0(BX)
+	RET
+
 TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 	MOVQ	ptr+0(FP), BX
 	MOVQ	val+8(FP), AX
diff --git a/src/runtime/internal/atomic/asm_amd64p32.s b/src/runtime/internal/atomic/asm_amd64p32.s
deleted file mode 100644
index 35b5ef2..0000000
--- a/src/runtime/internal/atomic/asm_amd64p32.s
+++ /dev/null
@@ -1,159 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// bool Cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-17
-	JMP	runtime∕internal∕atomic·Cas(SB)
-
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-17
-	JMP	runtime∕internal∕atomic·Cas(SB)
-
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime∕internal∕atomic·Load(SB)
-
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-12
-	JMP	runtime∕internal∕atomic·Load(SB)
-
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Store(SB)
-
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
-	JMP	runtime∕internal∕atomic·Load64(SB)
-
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
-	JMP	runtime∕internal∕atomic·Xadd64(SB)
-
-// bool	runtime∕internal∕atomic·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
-	MOVL	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-
-// bool Casp1(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-// uint32 Xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	delta+4(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	delta+8(FP), AX
-	MOVQ	AX, CX
-	LOCK
-	XADDQ	AX, 0(BX)
-	ADDQ	CX, AX
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime∕internal∕atomic·Xadd(SB)
-
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	TESTL	$7, BX
-	JZ	2(PC)
-	MOVL	0, BX // crash when unaligned
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime∕internal∕atomic·Xchg(SB)
-
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-8
-	JMP	runtime∕internal∕atomic·Store(SB)
-
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
-	MOVL	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ORB	AX, 0(BX)
-	RET
-
-// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ANDB	AX, 0(BX)
-	RET
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index 9cb1037..3290fb7 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -166,6 +166,14 @@
 	SYNC
 	RET
 
+TEXT ·Store8(SB), NOSPLIT, $0-9
+	MOVV	ptr+0(FP), R1
+	MOVB	val+8(FP), R2
+	SYNC
+	MOVB	R2, 0(R1)
+	SYNC
+	RET
+
 TEXT ·Store64(SB), NOSPLIT, $0-16
 	MOVV	ptr+0(FP), R1
 	MOVV	val+8(FP), R2
diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s
index af6bce5..62811a6 100644
--- a/src/runtime/internal/atomic/asm_mipsx.s
+++ b/src/runtime/internal/atomic/asm_mipsx.s
@@ -32,6 +32,14 @@
 	SYNC
 	RET
 
+TEXT ·Store8(SB),NOSPLIT,$0-5
+	MOVW	ptr+0(FP), R1
+	MOVB	val+4(FP), R2
+	SYNC
+	MOVB	R2, 0(R1)
+	SYNC
+	RET
+
 TEXT ·Load(SB),NOSPLIT,$0-8
 	MOVW	ptr+0(FP), R1
 	SYNC
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index 052b031..06dc931 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -170,6 +170,13 @@
 	MOVW	R4, 0(R3)
 	RET
 
+TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R3
+	MOVB	val+8(FP), R4
+	SYNC
+	MOVB	R4, 0(R3)
+	RET
+
 TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 	MOVD	ptr+0(FP), R3
 	MOVD	val+8(FP), R4
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
index 084f5b5..9a19bc0 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -12,6 +12,14 @@
 	SYNC
 	RET
 
+// func Store8(ptr *uint8, val uint8)
+TEXT ·Store8(SB), NOSPLIT, $0
+	MOVD	ptr+0(FP), R2
+	MOVB	val+8(FP), R3
+	MOVB	R3, 0(R2)
+	SYNC
+	RET
+
 // func Store64(ptr *uint64, val uint64)
 TEXT ·Store64(SB), NOSPLIT, $0
 	MOVD	ptr+0(FP), R2
@@ -168,37 +176,27 @@
 TEXT ·Or8(SB), NOSPLIT, $0-9
 	MOVD    ptr+0(FP), R3
 	MOVBZ   val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	SLD	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	OR	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to OR with the entire word atomically.
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	SLW	R5, R4               // R4 = uint32(v) << R5
+	LAO	R4, R6, 0(R3)        // R6 = *R3; *R3 |= R4; (atomic)
 	RET
 
 // func And8(addr *uint8, v uint8)
 TEXT ·And8(SB), NOSPLIT, $0-9
 	MOVD    ptr+0(FP), R3
 	MOVBZ   val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	OR	$-256, R4 // create 0xffffffffffffffxx
-	RLLG	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	AND	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to AND with the entire word atomically.
+	ORW	$~0xff, R4           // R4 = uint32(v) | 0xffffff00
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	RLL	R5, R4, R4           // R4 = rotl(R4, R5)
+	LAN	R4, R6, 0(R3)        // R6 = *R3; *R3 &= R4; (atomic)
 	RET
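
Note: the rewritten s390x Or8/And8 above rely on word-wide atomics plus a shifted mask because the machine has no byte-wide atomic read-modify-write. A portable Go sketch of the same idea, built on a 32-bit compare-and-swap from sync/atomic (little-endian shift shown here; the s390x code flips it for big-endian, and the target byte must live inside a word you own):

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

// or8 ORs v into *addr using only word-wide atomics on the aligned
// 32-bit word containing addr. Illustrative sketch only.
func or8(addr *uint8, v uint8) {
	word := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(addr)) &^ 3)) // containing aligned word
	shift := (uintptr(unsafe.Pointer(addr)) & 3) * 8                      // byte offset in bits (little-endian)
	mask := uint32(v) << shift
	for {
		old := atomic.LoadUint32(word)
		if atomic.CompareAndSwapUint32(word, old, old|mask) {
			return
		}
	}
}

func main() {
	var w uint32
	bytes := (*[4]uint8)(unsafe.Pointer(&w)) // view the word as four bytes
	or8(&bytes[2], 0x5a)
	fmt.Printf("%#08x\n", w)
}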
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index d7f82cc..8d002eb 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -75,6 +75,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 //go:noescape
diff --git a/src/runtime/internal/atomic/atomic_amd64x.go b/src/runtime/internal/atomic/atomic_amd64.go
similarity index 96%
rename from src/runtime/internal/atomic/atomic_amd64x.go
rename to src/runtime/internal/atomic/atomic_amd64.go
index 31c1636..14b8101 100644
--- a/src/runtime/internal/atomic/atomic_amd64x.go
+++ b/src/runtime/internal/atomic/atomic_amd64.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 amd64p32
-
 package atomic
 
 import "unsafe"
@@ -79,6 +77,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 //go:noescape
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index c1fc1f7..95713af 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -210,4 +210,7 @@
 func Load64(addr *uint64) uint64
 
 //go:noescape
+func Store8(addr *uint8, v uint8)
+
+//go:noescape
 func Store64(addr *uint64, v uint64)
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index 0182f30..26ca94d 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -57,6 +57,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index a7e8c35..a2eb756 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -48,6 +48,12 @@
 	STLRW	R1, (R0)
 	RET
 
+TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R0
+	MOVB	val+8(FP), R1
+	STLRB	R1, (R0)
+	RET
+
 TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
 	MOVD	ptr+0(FP), R0
 	MOVD	val+8(FP), R1
@@ -55,9 +61,9 @@
 	RET
 
 TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
-again:
 	MOVD	ptr+0(FP), R0
 	MOVW	new+8(FP), R1
+again:
 	LDAXRW	(R0), R2
 	STLXRW	R1, (R0), R3
 	CBNZ	R3, again
@@ -65,9 +71,9 @@
 	RET
 
 TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
-again:
 	MOVD	ptr+0(FP), R0
 	MOVD	new+8(FP), R1
+again:
 	LDAXR	(R0), R2
 	STLXR	R1, (R0), R3
 	CBNZ	R3, again
@@ -102,9 +108,9 @@
 //      *val += delta;
 //      return *val;
 TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
-again:
 	MOVD	ptr+0(FP), R0
 	MOVW	delta+8(FP), R1
+again:
 	LDAXRW	(R0), R2
 	ADDW	R2, R1, R2
 	STLXRW	R2, (R0), R3
@@ -113,9 +119,9 @@
 	RET
 
 TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
-again:
 	MOVD	ptr+0(FP), R0
 	MOVD	delta+8(FP), R1
+again:
 	LDAXR	(R0), R2
 	ADD	R2, R1, R2
 	STLXR	R2, (R0), R3
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index ce11e38..1d99778 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -59,6 +59,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index 6e39262..0e2d77a 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -141,6 +141,9 @@
 //go:noescape
 func Store(ptr *uint32, val uint32)
 
+//go:noescape
+func Store8(ptr *uint8, val uint8)
+
 // NO go:noescape annotation; see atomic_pointer.go.
 func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
 
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index 13805a5..a48ecf5 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -59,6 +59,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 //go:noescape
diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go
new file mode 100644
index 0000000..d525123
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_riscv64.go
@@ -0,0 +1,67 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Load(ptr *uint32) uint32
+
+//go:noescape
+func Load8(ptr *uint8) uint8
+
+//go:noescape
+func Load64(ptr *uint64) uint64
+
+// NO go:noescape annotation; *ptr escapes if result escapes (#31525)
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer
+
+//go:noescape
+func LoadAcq(ptr *uint32) uint32
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func CasRel(ptr *uint32, old, new uint32) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
+
+//go:noescape
+func StoreRel(ptr *uint32, val uint32)
diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s
new file mode 100644
index 0000000..d005325
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_riscv64.s
@@ -0,0 +1,232 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// RISC-V's atomic operations have two bits, aq ("acquire") and rl ("release"),
+// which may be toggled on and off. Their precise semantics are defined in
+// section 6.3 of the specification, but the basic idea is as follows:
+//
+//   - If neither aq nor rl is set, the CPU may reorder the atomic arbitrarily.
+//     It guarantees only that it will execute atomically.
+//
+//   - If aq is set, the CPU may move the instruction backward, but not forward.
+//
+//   - If rl is set, the CPU may move the instruction forward, but not backward.
+//
+//   - If both are set, the CPU may not reorder the instruction at all.
+//
+// These four modes correspond to other well-known memory models on other CPUs.
+// On ARM, aq corresponds to a dmb ishst, aq+rl corresponds to a dmb ish. On
+// Intel, aq corresponds to an lfence, rl to an sfence, and aq+rl to an mfence
+// (or a lock prefix).
+//
+// Go's memory model requires that
+//   - if a read happens after a write, the read must observe the write, and
+//     that
+//   - if a read happens concurrently with a write, the read may observe the
+//     write.
+// aq is sufficient to guarantee this, so that's what we use here. (This jibes
+// with ARM, which uses dmb ishst.)
+
+#include "textflag.h"
+
+// Atomically:
+//      if(*val == *old){
+//              *val = new;
+//              return 1;
+//      } else {
+//              return 0;
+//      }
+
+TEXT ·Cas(SB), NOSPLIT, $0-17
+	MOV	ptr+0(FP), A0
+	MOVW	old+8(FP), A1
+	MOVW	new+12(FP), A2
+cas_again:
+	LRW	(A0), A3
+	BNE	A3, A1, cas_fail
+	SCW	A2, (A0), A4
+	BNE	A4, ZERO, cas_again
+	MOV	$1, A0
+	MOVB	A0, ret+16(FP)
+	RET
+cas_fail:
+	MOV	$0, A0
+	MOV	A0, ret+16(FP)
+	RET
+
+// func Cas64(ptr *uint64, old, new uint64) bool
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+	MOV	ptr+0(FP), A0
+	MOV	old+8(FP), A1
+	MOV	new+16(FP), A2
+cas_again:
+	LRD	(A0), A3
+	BNE	A3, A1, cas_fail
+	SCD	A2, (A0), A4
+	BNE	A4, ZERO, cas_again
+	MOV	$1, A0
+	MOVB	A0, ret+24(FP)
+	RET
+cas_fail:
+	MOVB	ZERO, ret+24(FP)
+	RET
+
+// func Load(ptr *uint32) uint32
+TEXT ·Load(SB),NOSPLIT|NOFRAME,$0-12
+	MOV	ptr+0(FP), A0
+	LRW	(A0), A0
+	MOVW	A0, ret+8(FP)
+	RET
+
+// func Load8(ptr *uint8) uint8
+TEXT ·Load8(SB),NOSPLIT|NOFRAME,$0-9
+	MOV	ptr+0(FP), A0
+	FENCE
+	MOVBU	(A0), A1
+	FENCE
+	MOVB	A1, ret+8(FP)
+	RET
+
+// func Load64(ptr *uint64) uint64
+TEXT ·Load64(SB),NOSPLIT|NOFRAME,$0-16
+	MOV	ptr+0(FP), A0
+	LRD	(A0), A0
+	MOV	A0, ret+8(FP)
+	RET
+
+// func Store(ptr *uint32, val uint32)
+TEXT ·Store(SB), NOSPLIT, $0-12
+	MOV	ptr+0(FP), A0
+	MOVW	val+8(FP), A1
+	AMOSWAPW A1, (A0), ZERO
+	RET
+
+// func Store8(ptr *uint8, val uint8)
+TEXT ·Store8(SB), NOSPLIT, $0-9
+	MOV	ptr+0(FP), A0
+	MOVBU	val+8(FP), A1
+	FENCE
+	MOVB	A1, (A0)
+	FENCE
+	RET
+
+// func Store64(ptr *uint64, val uint64)
+TEXT ·Store64(SB), NOSPLIT, $0-16
+	MOV	ptr+0(FP), A0
+	MOV	val+8(FP), A1
+	AMOSWAPD A1, (A0), ZERO
+	RET
+
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+	JMP	·Cas64(SB)
+
+TEXT ·Casuintptr(SB),NOSPLIT,$0-25
+	JMP	·Cas64(SB)
+
+TEXT ·CasRel(SB), NOSPLIT, $0-17
+	JMP	·Cas(SB)
+
+TEXT ·Loaduintptr(SB),NOSPLIT,$0-16
+	JMP	·Load64(SB)
+
+TEXT ·Storeuintptr(SB),NOSPLIT,$0-16
+	JMP	·Store64(SB)
+
+TEXT ·Loaduint(SB),NOSPLIT,$0-16
+	JMP ·Loaduintptr(SB)
+
+TEXT ·Loadint64(SB),NOSPLIT,$0-16
+	JMP ·Loaduintptr(SB)
+
+TEXT ·Xaddint64(SB),NOSPLIT,$0-24
+	MOV	ptr+0(FP), A0
+	MOV	delta+8(FP), A1
+	AMOADDD A1, (A0), A0
+	ADD	A0, A1, A0
+	MOVW	A0, ret+16(FP)
+	RET
+
+TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
+	JMP	·Load(SB)
+
+// func Loadp(ptr unsafe.Pointer) unsafe.Pointer
+TEXT ·Loadp(SB),NOSPLIT,$0-16
+	JMP	·Load64(SB)
+
+// func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
+TEXT ·StoreRel(SB), NOSPLIT, $0-12
+	JMP	·Store(SB)
+
+// func Xchg(ptr *uint32, new uint32) uint32
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+	MOV	ptr+0(FP), A0
+	MOVW	new+8(FP), A1
+	AMOSWAPW A1, (A0), A1
+	MOVW	A1, ret+16(FP)
+	RET
+
+// func Xchg64(ptr *uint64, new uint64) uint64
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+	MOV	ptr+0(FP), A0
+	MOV	new+8(FP), A1
+	AMOSWAPD A1, (A0), A1
+	MOV	A1, ret+16(FP)
+	RET
+
+// Atomically:
+//      *val += delta;
+//      return *val;
+
+// func Xadd(ptr *uint32, delta int32) uint32
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+	MOV	ptr+0(FP), A0
+	MOVW	delta+8(FP), A1
+	AMOADDW A1, (A0), A2
+	ADD	A2,A1,A0
+	MOVW	A0, ret+16(FP)
+	RET
+
+// func Xadd64(ptr *uint64, delta int64) uint64
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+	MOV	ptr+0(FP), A0
+	MOV	delta+8(FP), A1
+	AMOADDD A1, (A0), A2
+	ADD	A2, A1, A0
+	MOV	A0, ret+16(FP)
+	RET
+
+// func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+	JMP	·Xadd64(SB)
+
+// func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+	JMP	·Xchg64(SB)
+
+// func And8(ptr *uint8, val uint8)
+TEXT ·And8(SB), NOSPLIT, $0-9
+	MOV	ptr+0(FP), A0
+	MOVBU	val+8(FP), A1
+	AND	$3, A0, A2
+	AND	$-4, A0
+	SLL	$3, A2
+	XOR	$255, A1
+	SLL	A2, A1
+	XOR	$-1, A1
+	AMOANDW A1, (A0), ZERO
+	RET
+
+// func Or8(ptr *uint8, val uint8)
+TEXT ·Or8(SB), NOSPLIT, $0-9
+	MOV	ptr+0(FP), A0
+	MOVBU	val+8(FP), A1
+	AND	$3, A0, A2
+	AND	$-4, A0
+	SLL	$3, A2
+	SLL	A2, A1
+	AMOORW	A1, (A0), ZERO
+	RET
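
Note: the memory-ordering comment at the top of this file reasons about the publish/observe guarantee Go's memory model needs from these atomics. From the user's side that guarantee is the usual release/acquire publication via sync/atomic (a hedged illustration, not the runtime's internal atomics):

package main

import (
	"fmt"
	"runtime"
	"sync/atomic"
)

func main() {
	var data int
	var ready uint32

	go func() {
		data = 42                     // plain write
		atomic.StoreUint32(&ready, 1) // release: publish the write above
	}()

	for atomic.LoadUint32(&ready) == 0 { // acquire: wait for publication
		runtime.Gosched()
	}
	fmt.Println(data) // observes 42 once the store has been observed
}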
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
index 25fd890..4d73b39 100644
--- a/src/runtime/internal/atomic/atomic_s390x.go
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -45,6 +45,9 @@
 func Store(ptr *uint32, val uint32)
 
 //go:noescape
+func Store8(ptr *uint8, val uint8)
+
+//go:noescape
 func Store64(ptr *uint64, val uint64)
 
 // NO go:noescape annotation; see atomic_pointer.go.
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
index 0ba7544..0c1125c 100644
--- a/src/runtime/internal/atomic/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -86,14 +86,8 @@
 	// a continual source of pain. Test that on 32-bit systems they crash
 	// instead of failing silently.
 
-	switch runtime.GOARCH {
-	default:
-		if unsafe.Sizeof(int(0)) != 4 {
-			t.Skip("test only runs on 32-bit systems")
-		}
-	case "amd64p32":
-		// amd64p32 can handle unaligned atomics.
-		t.Skipf("test not needed on %v", runtime.GOARCH)
+	if unsafe.Sizeof(int(0)) != 4 {
+		t.Skip("test only runs on 32-bit systems")
 	}
 
 	x := make([]uint32, 4)
@@ -109,3 +103,120 @@
 	shouldPanic(t, "Xchg64", func() { atomic.Xchg64(up64, 1) })
 	shouldPanic(t, "Cas64", func() { atomic.Cas64(up64, 1, 2) })
 }
+
+func TestAnd8(t *testing.T) {
+	// Basic sanity check.
+	x := uint8(0xff)
+	for i := uint8(0); i < 8; i++ {
+		atomic.And8(&x, ^(1 << i))
+		if r := uint8(0xff) << (i + 1); x != r {
+			t.Fatalf("clearing bit %#x: want %#x, got %#x", uint8(1<<i), r, x)
+		}
+	}
+
+	// Set every bit in array to 1.
+	a := make([]uint8, 1<<12)
+	for i := range a {
+		a[i] = 0xff
+	}
+
+	// Clear array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 8; i++ {
+		m := ^uint8(1 << i)
+		go func() {
+			for i := range a {
+				atomic.And8(&a[i], m)
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 8; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally cleared.
+	for i, v := range a {
+		if v != 0 {
+			t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint8(0), v)
+		}
+	}
+}
+
+func TestOr8(t *testing.T) {
+	// Basic sanity check.
+	x := uint8(0)
+	for i := uint8(0); i < 8; i++ {
+		atomic.Or8(&x, 1<<i)
+		if r := (uint8(1) << (i + 1)) - 1; x != r {
+			t.Fatalf("setting bit %#x: want %#x, got %#x", uint8(1)<<i, r, x)
+		}
+	}
+
+	// Start with every bit in array set to 0.
+	a := make([]uint8, 1<<12)
+
+	// Set every bit in array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 8; i++ {
+		m := uint8(1 << i)
+		go func() {
+			for i := range a {
+				atomic.Or8(&a[i], m)
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 8; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally set.
+	for i, v := range a {
+		if v != 0xff {
+			t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint8(0xff), v)
+		}
+	}
+}
+
+func TestBitwiseContended(t *testing.T) {
+	// Start with every bit in array set to 0.
+	a := make([]uint8, 16)
+
+	// Iterations to try.
+	N := 1 << 16
+	if testing.Short() {
+		N = 1 << 10
+	}
+
+	// Set and then clear every bit in the array bit-by-bit in different goroutines.
+	done := make(chan bool)
+	for i := 0; i < 8; i++ {
+		m := uint8(1 << i)
+		go func() {
+			for n := 0; n < N; n++ {
+				for i := range a {
+					atomic.Or8(&a[i], m)
+					if atomic.Load8(&a[i])&m != m {
+						t.Errorf("a[%v] bit %#x not set", i, m)
+					}
+					atomic.And8(&a[i], ^m)
+					if atomic.Load8(&a[i])&m != 0 {
+						t.Errorf("a[%v] bit %#x not clear", i, m)
+					}
+				}
+			}
+			done <- true
+		}()
+	}
+	for i := 0; i < 8; i++ {
+		<-done
+	}
+
+	// Check that the array has been totally cleared.
+	for i, v := range a {
+		if v != 0 {
+			t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint8(0), v)
+		}
+	}
+}
diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go
index 0731763..9037c2f 100644
--- a/src/runtime/internal/atomic/atomic_wasm.go
+++ b/src/runtime/internal/atomic/atomic_wasm.go
@@ -143,6 +143,12 @@
 
 //go:nosplit
 //go:noinline
+func Store8(ptr *uint8, val uint8) {
+	*ptr = val
+}
+
+//go:nosplit
+//go:noinline
 func Store64(ptr *uint64, val uint64) {
 	*ptr = val
 }
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index 083a75c..de71b0f 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -43,6 +43,46 @@
 	}
 }
 
+func BenchmarkAnd8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.And8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkAnd8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.And8(&x[255], i)
+			i++
+		}
+	})
+}
+
+func BenchmarkOr8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.Or8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkOr8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.Or8(&x[255], i)
+			i++
+		}
+	})
+}
+
 func BenchmarkXadd(b *testing.B) {
 	var x uint32
 	ptr := &x
diff --git a/src/runtime/internal/atomic/sys_linux_arm.s b/src/runtime/internal/atomic/sys_linux_arm.s
index df62f6c..192be4b 100644
--- a/src/runtime/internal/atomic/sys_linux_arm.s
+++ b/src/runtime/internal/atomic/sys_linux_arm.s
@@ -29,9 +29,9 @@
 	CMP	$7, R11
 	BLT	2(PC)
 	JMP	·armcas(SB)
-	JMP	·kernelcas<>(SB)
+	JMP	kernelcas<>(SB)
 
-TEXT runtime∕internal∕atomic·kernelcas<>(SB),NOSPLIT,$0
+TEXT kernelcas<>(SB),NOSPLIT,$0
 	MOVW	ptr+0(FP), R2
 	// trigger potential paging fault here,
 	// because we don't know how to traceback through __kuser_cmpxchg
@@ -120,3 +120,25 @@
 	MOVB	R1, ret+4(FP)
 	RET
 
+TEXT	·Store8(SB),NOSPLIT,$0-5
+	MOVW	addr+0(FP), R1
+	MOVB	v+4(FP), R2
+
+	MOVB	runtime·goarm(SB), R8
+	CMP	$7, R8
+	BGE	native_barrier
+	BL	memory_barrier<>(SB)
+	B	store
+native_barrier:
+	DMB	MB_ISH
+
+store:
+	MOVB	R2, (R1)
+
+	CMP	$7, R8
+	BGE	native_barrier2
+	BL	memory_barrier<>(SB)
+	RET
+native_barrier2:
+	DMB	MB_ISH
+	RET
diff --git a/src/runtime/internal/atomic/sys_nonlinux_arm.s b/src/runtime/internal/atomic/sys_nonlinux_arm.s
index 9d81334..57568b2 100644
--- a/src/runtime/internal/atomic/sys_nonlinux_arm.s
+++ b/src/runtime/internal/atomic/sys_nonlinux_arm.s
@@ -60,3 +60,20 @@
 
 	MOVB	R1, ret+4(FP)
 	RET
+
+TEXT	·Store8(SB),NOSPLIT,$0-5
+	MOVW	addr+0(FP), R1
+	MOVB	v+4(FP), R2
+
+	MOVB	runtime·goarm(SB), R8
+	CMP	$7, R8
+	BLT	2(PC)
+	DMB	MB_ISH
+
+	MOVB	R2, (R1)
+
+	CMP	$7, R8
+	BLT	2(PC)
+	DMB	MB_ISH
+	RET
+
diff --git a/src/runtime/internal/sys/arch.go b/src/runtime/internal/sys/arch.go
index 75beb78..13c00cf 100644
--- a/src/runtime/internal/sys/arch.go
+++ b/src/runtime/internal/sys/arch.go
@@ -14,6 +14,7 @@
 	MIPS
 	MIPS64
 	PPC64
+	RISCV64
 	S390X
 	WASM
 )
diff --git a/src/runtime/internal/sys/arch_386.go b/src/runtime/internal/sys/arch_386.go
index 3426fd1..b51f70a 100644
--- a/src/runtime/internal/sys/arch_386.go
+++ b/src/runtime/internal/sys/arch_386.go
@@ -7,7 +7,7 @@
 const (
 	ArchFamily          = I386
 	BigEndian           = false
-	DefaultPhysPageSize = GoosNacl*65536 + (1-GoosNacl)*4096 // 4k normally; 64k on NaCl
+	DefaultPhysPageSize = 4096
 	PCQuantum           = 1
 	Int64Align          = 4
 	MinFrameSize        = 0
diff --git a/src/runtime/internal/sys/arch_amd64p32.go b/src/runtime/internal/sys/arch_amd64p32.go
deleted file mode 100644
index d51c8a5..0000000
--- a/src/runtime/internal/sys/arch_amd64p32.go
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package sys
-
-const (
-	ArchFamily          = AMD64
-	BigEndian           = false
-	DefaultPhysPageSize = 65536*GoosNacl + 4096*(1-GoosNacl)
-	PCQuantum           = 1
-	Int64Align          = 8
-	MinFrameSize        = 0
-)
-
-type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_riscv64.go b/src/runtime/internal/sys/arch_riscv64.go
new file mode 100644
index 0000000..7cdcc8f
--- /dev/null
+++ b/src/runtime/internal/sys/arch_riscv64.go
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily          = RISCV64
+	BigEndian           = false
+	CacheLineSize       = 64
+	DefaultPhysPageSize = 4096
+	PCQuantum           = 4
+	Int64Align          = 8
+	HugePageSize        = 1 << 21
+	MinFrameSize        = 8
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
index ad6f0c3..3c88982 100644
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -4,13 +4,16 @@
 
 // +build !386
 
+// TODO finish intrinsifying 386, deadcode the assembly, remove build tags, merge w/ intrinsics_common
+// TODO replace all uses of CtzXX with TrailingZerosXX; they are the same.
+
 package sys
 
 // Using techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf
 
-const deBruijn64 = 0x0218a392cd3d5dbf
+const deBruijn64ctz = 0x0218a392cd3d5dbf
 
-var deBruijnIdx64 = [64]byte{
+var deBruijnIdx64ctz = [64]byte{
 	0, 1, 2, 7, 3, 13, 8, 19,
 	4, 25, 14, 28, 9, 34, 20, 40,
 	5, 17, 26, 38, 15, 46, 29, 48,
@@ -21,9 +24,9 @@
 	61, 22, 43, 51, 60, 42, 59, 58,
 }
 
-const deBruijn32 = 0x04653adf
+const deBruijn32ctz = 0x04653adf
 
-var deBruijnIdx32 = [32]byte{
+var deBruijnIdx32ctz = [32]byte{
 	0, 1, 2, 6, 3, 11, 7, 16,
 	4, 14, 12, 21, 8, 23, 17, 26,
 	31, 5, 10, 15, 13, 20, 22, 25,
@@ -33,20 +36,20 @@
 // Ctz64 counts trailing (low-order) zeroes,
 // and if all are zero, then 64.
 func Ctz64(x uint64) int {
-	x &= -x                      // isolate low-order bit
-	y := x * deBruijn64 >> 58    // extract part of deBruijn sequence
-	i := int(deBruijnIdx64[y])   // convert to bit index
-	z := int((x - 1) >> 57 & 64) // adjustment if zero
+	x &= -x                       // isolate low-order bit
+	y := x * deBruijn64ctz >> 58  // extract part of deBruijn sequence
+	i := int(deBruijnIdx64ctz[y]) // convert to bit index
+	z := int((x - 1) >> 57 & 64)  // adjustment if zero
 	return i + z
 }
 
 // Ctz32 counts trailing (low-order) zeroes,
 // and if all are zero, then 32.
 func Ctz32(x uint32) int {
-	x &= -x                      // isolate low-order bit
-	y := x * deBruijn32 >> 27    // extract part of deBruijn sequence
-	i := int(deBruijnIdx32[y])   // convert to bit index
-	z := int((x - 1) >> 26 & 32) // adjustment if zero
+	x &= -x                       // isolate low-order bit
+	y := x * deBruijn32ctz >> 27  // extract part of deBruijn sequence
+	i := int(deBruijnIdx32ctz[y]) // convert to bit index
+	z := int((x - 1) >> 26 & 32)  // adjustment if zero
 	return i + z
 }
 
@@ -55,25 +58,6 @@
 	return int(ntz8tab[x])
 }
 
-var ntz8tab = [256]uint8{
-	0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
-}
-
 // Bswap64 returns its input with byte order reversed
 // 0x0102030405060708 -> 0x0807060504030201
 func Bswap64(x uint64) uint64 {
diff --git a/src/runtime/internal/sys/intrinsics_common.go b/src/runtime/internal/sys/intrinsics_common.go
new file mode 100644
index 0000000..818d75e
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics_common.go
@@ -0,0 +1,143 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+// Copied from math/bits to avoid dependence.
+
+var len8tab = [256]uint8{
+	0x00, 0x01, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
+	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07, 0x07,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+	0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
+}
+
+var ntz8tab = [256]uint8{
+	0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+	0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00,
+}
+
+// Len64 returns the minimum number of bits required to represent x; the result is 0 for x == 0.
+func Len64(x uint64) (n int) {
+	if x >= 1<<32 {
+		x >>= 32
+		n = 32
+	}
+	if x >= 1<<16 {
+		x >>= 16
+		n += 16
+	}
+	if x >= 1<<8 {
+		x >>= 8
+		n += 8
+	}
+	return n + int(len8tab[x])
+}
+
+// --- OnesCount ---
+
+const m0 = 0x5555555555555555 // 01010101 ...
+const m1 = 0x3333333333333333 // 00110011 ...
+const m2 = 0x0f0f0f0f0f0f0f0f // 00001111 ...
+
+// OnesCount64 returns the number of one bits ("population count") in x.
+func OnesCount64(x uint64) int {
+	// Implementation: Parallel summing of adjacent bits.
+	// See "Hacker's Delight", Chap. 5: Counting Bits.
+	// The following pattern shows the general approach:
+	//
+	//   x = x>>1&(m0&m) + x&(m0&m)
+	//   x = x>>2&(m1&m) + x&(m1&m)
+	//   x = x>>4&(m2&m) + x&(m2&m)
+	//   x = x>>8&(m3&m) + x&(m3&m)
+	//   x = x>>16&(m4&m) + x&(m4&m)
+	//   x = x>>32&(m5&m) + x&(m5&m)
+	//   return int(x)
+	//
+	// Masking (& operations) can be left away when there's no
+	// danger that a field's sum will carry over into the next
+	// field: Since the result cannot be > 64, 8 bits is enough
+	// and we can ignore the masks for the shifts by 8 and up.
+	// Per "Hacker's Delight", the first line can be simplified
+	// more, but it saves at best one instruction, so we leave
+	// it alone for clarity.
+	const m = 1<<64 - 1
+	x = x>>1&(m0&m) + x&(m0&m)
+	x = x>>2&(m1&m) + x&(m1&m)
+	x = (x>>4 + x) & (m2 & m)
+	x += x >> 8
+	x += x >> 16
+	x += x >> 32
+	return int(x) & (1<<7 - 1)
+}
+
+var deBruijn64tab = [64]byte{
+	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
+	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
+	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
+	54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
+}
+
+const deBruijn64 = 0x03f79d71b4ca8b09
+
+// TrailingZeros64 returns the number of trailing zero bits in x; the result is 64 for x == 0.
+func TrailingZeros64(x uint64) int {
+	if x == 0 {
+		return 64
+	}
+	// If popcount is fast, replace code below with return popcount(^x & (x - 1)).
+	//
+	// x & -x leaves only the right-most bit set in the word. Let k be the
+	// index of that bit. Since only a single bit is set, the value is two
+	// to the power of k. Multiplying by a power of two is equivalent to
+	// left shifting, in this case by k bits. The de Bruijn (64 bit) constant
+	// is such that all six bit, consecutive substrings are distinct.
+	// Therefore, if we have a left shifted version of this constant we can
+	// find by how many bits it was shifted by looking at which six bit
+	// substring ended up at the top of the word.
+	// (Knuth, volume 4, section 7.3.1)
+	return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)])
+}
+
+// LeadingZeros64 returns the number of leading zero bits in x; the result is 64 for x == 0.
+func LeadingZeros64(x uint64) int { return 64 - Len64(x) }
+
+// LeadingZeros8 returns the number of leading zero bits in x; the result is 8 for x == 0.
+func LeadingZeros8(x uint8) int { return 8 - Len8(x) }
+
+// TrailingZeros8 returns the number of trailing zero bits in x; the result is 8 for x == 0.
+func TrailingZeros8(x uint8) int {
+	return int(ntz8tab[x])
+}
+
+// Len8 returns the minimum number of bits required to represent x; the result is 0 for x == 0.
+func Len8(x uint8) int {
+	return int(len8tab[x])
+}
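
TrailingZeros64 and OnesCount64 above are copied from math/bits, so they can be cross-checked against that package from ordinary user code. A standalone sketch, reimplementing them under local names with the table and constant copied from the file above:

    package main

    import (
        "fmt"
        "math/bits"
    )

    // Copied from the runtime file above (originally from math/bits).
    var deBruijn64tab = [64]byte{
        0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
        62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
        63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
        54, 26, 40, 15, 34, 20, 31, 10, 25, 14, 19, 9, 13, 8, 7, 6,
    }

    const deBruijn64 = 0x03f79d71b4ca8b09

    func trailingZeros64(x uint64) int {
        if x == 0 {
            return 64
        }
        // x&-x isolates the lowest set bit; multiplying shifts the de Bruijn
        // sequence left by that bit's index, and the top six bits index the table.
        return int(deBruijn64tab[(x&-x)*deBruijn64>>(64-6)])
    }

    func onesCount64(x uint64) int {
        const m0 = 0x5555555555555555
        const m1 = 0x3333333333333333
        const m2 = 0x0f0f0f0f0f0f0f0f
        // Parallel bit summing, as in the commented version above.
        x = x>>1&m0 + x&m0
        x = x>>2&m1 + x&m1
        x = (x>>4 + x) & m2
        x += x >> 8
        x += x >> 16
        x += x >> 32
        return int(x) & (1<<7 - 1)
    }

    func main() {
        for _, x := range []uint64{0, 1, 0x28, 0xdeadbeef, 1 << 63, ^uint64(0)} {
            fmt.Printf("%#016x: tz=%d/%d pop=%d/%d\n", x,
                trailingZeros64(x), bits.TrailingZeros64(x),
                onesCount64(x), bits.OnesCount64(x))
        }
    }
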
diff --git a/src/runtime/internal/sys/zgoarch_amd64p32.go b/src/runtime/internal/sys/zgoarch_amd64p32.go
deleted file mode 100644
index 13dc2e7..0000000
--- a/src/runtime/internal/sys/zgoarch_amd64p32.go
+++ /dev/null
@@ -1,31 +0,0 @@
-// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
-
-// +build amd64p32
-
-package sys
-
-const GOARCH = `amd64p32`
-
-const Goarch386 = 0
-const GoarchAmd64 = 0
-const GoarchAmd64p32 = 1
-const GoarchArm = 0
-const GoarchArmbe = 0
-const GoarchArm64 = 0
-const GoarchArm64be = 0
-const GoarchPpc64 = 0
-const GoarchPpc64le = 0
-const GoarchMips = 0
-const GoarchMipsle = 0
-const GoarchMips64 = 0
-const GoarchMips64le = 0
-const GoarchMips64p32 = 0
-const GoarchMips64p32le = 0
-const GoarchPpc = 0
-const GoarchRiscv = 0
-const GoarchRiscv64 = 0
-const GoarchS390 = 0
-const GoarchS390x = 0
-const GoarchSparc = 0
-const GoarchSparc64 = 0
-const GoarchWasm = 0
diff --git a/src/runtime/internal/sys/zgoos_nacl.go b/src/runtime/internal/sys/zgoos_nacl.go
deleted file mode 100644
index 9e65b6f..0000000
--- a/src/runtime/internal/sys/zgoos_nacl.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Code generated by gengoos.go using 'go generate'. DO NOT EDIT.
-
-// +build nacl
-
-package sys
-
-const GOOS = `nacl`
-
-const GoosAix = 0
-const GoosAndroid = 0
-const GoosDarwin = 0
-const GoosDragonfly = 0
-const GoosFreebsd = 0
-const GoosHurd = 0
-const GoosIllumos = 0
-const GoosJs = 0
-const GoosLinux = 0
-const GoosNacl = 1
-const GoosNetbsd = 0
-const GoosOpenbsd = 0
-const GoosPlan9 = 0
-const GoosSolaris = 0
-const GoosWindows = 0
-const GoosZos = 0
diff --git a/src/runtime/internal/sys/zversion.go b/src/runtime/internal/sys/zversion.go
index db1e69b..ffe1ac1 100644
--- a/src/runtime/internal/sys/zversion.go
+++ b/src/runtime/internal/sys/zversion.go
@@ -2,6 +2,6 @@
 
 package sys
 
-const TheVersion = `go1.13`
+const TheVersion = `go1.15beta1`
 const Goexperiment = ``
 const StackGuardMultiplierDefault = 1
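
sys.TheVersion is the string that runtime.Version() reports, so the bump is easy to confirm from any program built with the updated toolchain:

    package main

    import (
        "fmt"
        "runtime"
    )

    func main() {
        // Prints "go1.15beta1" when built with the toolchain from this change.
        fmt.Println(runtime.Version())
    }
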
diff --git a/src/runtime/lfstack_32bit.go b/src/runtime/lfstack_32bit.go
index d36ca50..f07ff1c 100644
--- a/src/runtime/lfstack_32bit.go
+++ b/src/runtime/lfstack_32bit.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 arm nacl mips mipsle
+// +build 386 arm mips mipsle
 
 package runtime
 
diff --git a/src/runtime/lfstack_64bit.go b/src/runtime/lfstack_64bit.go
index ea3455a..9d821b9 100644
--- a/src/runtime/lfstack_64bit.go
+++ b/src/runtime/lfstack_64bit.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 arm64 mips64 mips64le ppc64 ppc64le s390x wasm
+// +build amd64 arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x wasm
 
 package runtime
 
diff --git a/src/runtime/libfuzzer.go b/src/runtime/libfuzzer.go
new file mode 100644
index 0000000..0161955
--- /dev/null
+++ b/src/runtime/libfuzzer.go
@@ -0,0 +1,75 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build libfuzzer
+
+package runtime
+
+import _ "unsafe" // for go:linkname
+
+func libfuzzerCall(fn *byte, arg0, arg1 uintptr)
+
+func libfuzzerTraceCmp1(arg0, arg1 uint8) {
+	libfuzzerCall(&__sanitizer_cov_trace_cmp1, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceCmp2(arg0, arg1 uint16) {
+	libfuzzerCall(&__sanitizer_cov_trace_cmp2, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceCmp4(arg0, arg1 uint32) {
+	libfuzzerCall(&__sanitizer_cov_trace_cmp4, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceCmp8(arg0, arg1 uint64) {
+	libfuzzerCall(&__sanitizer_cov_trace_cmp8, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceConstCmp1(arg0, arg1 uint8) {
+	libfuzzerCall(&__sanitizer_cov_trace_const_cmp1, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceConstCmp2(arg0, arg1 uint16) {
+	libfuzzerCall(&__sanitizer_cov_trace_const_cmp2, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceConstCmp4(arg0, arg1 uint32) {
+	libfuzzerCall(&__sanitizer_cov_trace_const_cmp4, uintptr(arg0), uintptr(arg1))
+}
+
+func libfuzzerTraceConstCmp8(arg0, arg1 uint64) {
+	libfuzzerCall(&__sanitizer_cov_trace_const_cmp8, uintptr(arg0), uintptr(arg1))
+}
+
+//go:linkname __sanitizer_cov_trace_cmp1 __sanitizer_cov_trace_cmp1
+//go:cgo_import_static __sanitizer_cov_trace_cmp1
+var __sanitizer_cov_trace_cmp1 byte
+
+//go:linkname __sanitizer_cov_trace_cmp2 __sanitizer_cov_trace_cmp2
+//go:cgo_import_static __sanitizer_cov_trace_cmp2
+var __sanitizer_cov_trace_cmp2 byte
+
+//go:linkname __sanitizer_cov_trace_cmp4 __sanitizer_cov_trace_cmp4
+//go:cgo_import_static __sanitizer_cov_trace_cmp4
+var __sanitizer_cov_trace_cmp4 byte
+
+//go:linkname __sanitizer_cov_trace_cmp8 __sanitizer_cov_trace_cmp8
+//go:cgo_import_static __sanitizer_cov_trace_cmp8
+var __sanitizer_cov_trace_cmp8 byte
+
+//go:linkname __sanitizer_cov_trace_const_cmp1 __sanitizer_cov_trace_const_cmp1
+//go:cgo_import_static __sanitizer_cov_trace_const_cmp1
+var __sanitizer_cov_trace_const_cmp1 byte
+
+//go:linkname __sanitizer_cov_trace_const_cmp2 __sanitizer_cov_trace_const_cmp2
+//go:cgo_import_static __sanitizer_cov_trace_const_cmp2
+var __sanitizer_cov_trace_const_cmp2 byte
+
+//go:linkname __sanitizer_cov_trace_const_cmp4 __sanitizer_cov_trace_const_cmp4
+//go:cgo_import_static __sanitizer_cov_trace_const_cmp4
+var __sanitizer_cov_trace_const_cmp4 byte
+
+//go:linkname __sanitizer_cov_trace_const_cmp8 __sanitizer_cov_trace_const_cmp8
+//go:cgo_import_static __sanitizer_cov_trace_const_cmp8
+var __sanitizer_cov_trace_const_cmp8 byte
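
These trampolines are not called by hand: they exist so that comparison operands can be forwarded to libFuzzer's __sanitizer_cov_trace_* hooks when a build is instrumented for libFuzzer. A rough, self-contained illustration of the idea; the hook below is a plain Go stand-in for runtime.libfuzzerTraceConstCmp4, not the real runtime symbol:

    package main

    import "fmt"

    // traceConstCmp4 stands in for runtime.libfuzzerTraceConstCmp4, which would
    // forward both operands to __sanitizer_cov_trace_const_cmp4 in libFuzzer.
    func traceConstCmp4(expected, actual uint32) {
        fmt.Printf("cmp4 hook: %#x vs %#x\n", expected, actual)
    }

    // checkMagic is shaped the way instrumented code would be: report the
    // comparison operands, then perform the comparison.
    func checkMagic(x uint32) bool {
        traceConstCmp4(0xdeadbeef, x)
        return x == 0xdeadbeef
    }

    func main() {
        // false, but the fuzzer has now seen the target constant.
        fmt.Println(checkMagic(0x1234))
    }
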
diff --git a/src/runtime/libfuzzer_amd64.s b/src/runtime/libfuzzer_amd64.s
new file mode 100644
index 0000000..890fde3
--- /dev/null
+++ b/src/runtime/libfuzzer_amd64.s
@@ -0,0 +1,42 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build libfuzzer
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+// Based on race_amd64.s; see commentary there.
+
+#ifdef GOOS_windows
+#define RARG0 CX
+#define RARG1 DX
+#else
+#define RARG0 DI
+#define RARG1 SI
+#endif
+
+// void runtime·libfuzzerCall(fn, arg0, arg1 uintptr)
+// Calls C function fn from libFuzzer and passes 2 arguments to it.
+TEXT	runtime·libfuzzerCall(SB), NOSPLIT, $0-24
+	MOVQ	fn+0(FP), AX
+	MOVQ	arg0+8(FP), RARG0
+	MOVQ	arg1+16(FP), RARG1
+
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	MOVQ	g_m(R14), R13
+
+	// Switch to g0 stack.
+	MOVQ	SP, R12		// callee-saved, preserved across the CALL
+	MOVQ	m_g0(R13), R10
+	CMPQ	R10, R14
+	JE	call	// already on g0
+	MOVQ	(g_sched+gobuf_sp)(R10), SP
+call:
+	ANDQ	$~15, SP	// alignment for gcc ABI
+	CALL	AX
+	MOVQ	R12, SP
+	RET
diff --git a/src/runtime/libfuzzer_arm64.s b/src/runtime/libfuzzer_arm64.s
new file mode 100644
index 0000000..121673e
--- /dev/null
+++ b/src/runtime/libfuzzer_arm64.s
@@ -0,0 +1,31 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build libfuzzer
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Based on race_arm64.s; see commentary there.
+
+// func runtime·libfuzzerCall(fn, arg0, arg1 uintptr)
+// Calls C function fn from libFuzzer and passes 2 arguments to it.
+TEXT	runtime·libfuzzerCall(SB), NOSPLIT, $0-24
+	MOVD	fn+0(FP), R9
+	MOVD	arg0+8(FP), R0
+	MOVD	arg1+16(FP), R1
+
+	MOVD	g_m(g), R10
+
+	// Switch to g0 stack.
+	MOVD	RSP, R19	// callee-saved, preserved across the CALL
+	MOVD	m_g0(R10), R11
+	CMP	R11, g
+	BEQ	call	// already on g0
+	MOVD	(g_sched+gobuf_sp)(R11), R12
+	MOVD	R12, RSP
+call:
+	BL	R9
+	MOVD	R19, RSP
+	RET
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index d2828b1..91467fd 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -44,6 +44,10 @@
 }
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	gp := getg()
 
 	if gp.m.locks < 0 {
@@ -104,6 +108,10 @@
 }
 
 func unlock(l *mutex) {
+	unlockWithRank(l)
+}
+
+func unlock2(l *mutex) {
 	v := atomic.Xchg(key32(&l.key), mutex_unlocked)
 	if v == mutex_unlocked {
 		throw("unlock of unlocked lock")
@@ -230,8 +238,8 @@
 	return ok
 }
 
-func beforeIdle() bool {
-	return false
+func beforeIdle(int64) (*g, bool) {
+	return nil, false
 }
 
 func checkTimeouts() {}
diff --git a/src/runtime/lock_js.go b/src/runtime/lock_js.go
index c038499..14bdc76 100644
--- a/src/runtime/lock_js.go
+++ b/src/runtime/lock_js.go
@@ -26,6 +26,10 @@
 )
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	if l.key == mutex_locked {
 		// js/wasm is single-threaded so we should never
 		// observe this.
@@ -40,6 +44,10 @@
 }
 
 func unlock(l *mutex) {
+	unlockWithRank(l)
+}
+
+func unlock2(l *mutex) {
 	if l.key == mutex_unlocked {
 		throw("unlock of unlocked lock")
 	}
@@ -111,6 +119,8 @@
 		gopark(nil, nil, waitReasonSleep, traceEvNone, 1)
 
 		clearTimeoutEvent(id) // note might have woken early, clear timeout
+		clearIdleID()
+
 		mp = acquirem()
 		delete(notes, n)
 		delete(notesWithTimeout, n)
@@ -144,31 +154,64 @@
 	}
 }
 
-var returnedEventHandler *g
+// events is a stack of calls from JavaScript into Go.
+var events []*event
 
-func init() {
-	// At the toplevel we need an extra goroutine that handles asynchronous events.
-	initg := getg()
-	go func() {
-		returnedEventHandler = getg()
-		goready(initg, 1)
-
-		gopark(nil, nil, waitReasonZero, traceEvNone, 1)
-		returnedEventHandler = nil
-
-		pause(getcallersp() - 16)
-	}()
-	gopark(nil, nil, waitReasonZero, traceEvNone, 1)
+type event struct {
+	// g was the active goroutine when the call from JavaScript occurred.
+	// It needs to be active when returning to JavaScript.
+	gp *g
+	// returned reports whether the event handler has returned.
+	// When all goroutines are idle and the event handler has returned,
+	// then g gets resumed and returns the execution to JavaScript.
+	returned bool
 }
 
+// The timeout event started by beforeIdle.
+var idleID int32
+
 // beforeIdle gets called by the scheduler if no goroutine is awake.
-// We resume the event handler (if available) which will pause the execution.
-func beforeIdle() bool {
-	if returnedEventHandler != nil {
-		goready(returnedEventHandler, 1)
-		return true
+// If we are not already handling an event, then we pause for an async event.
+// If an event handler returned, we resume it and it will pause the execution.
+// beforeIdle either returns the specific goroutine to schedule next or
+// indicates with otherReady that some goroutine became ready.
+func beforeIdle(delay int64) (gp *g, otherReady bool) {
+	if delay > 0 {
+		clearIdleID()
+		if delay < 1e6 {
+			delay = 1
+		} else if delay < 1e15 {
+			delay = delay / 1e6
+		} else {
+			// An arbitrary cap on how long to wait for a timer.
+			// 1e9 ms == ~11.5 days.
+			delay = 1e9
+		}
+		idleID = scheduleTimeoutEvent(delay)
 	}
-	return false
+
+	if len(events) == 0 {
+		go handleAsyncEvent()
+		return nil, true
+	}
+
+	e := events[len(events)-1]
+	if e.returned {
+		return e.gp, false
+	}
+	return nil, false
+}
+
+func handleAsyncEvent() {
+	pause(getcallersp() - 16)
+}
+
+// clearIdleID clears our record of the timeout started by beforeIdle.
+func clearIdleID() {
+	if idleID != 0 {
+		clearTimeoutEvent(idleID)
+		idleID = 0
+	}
 }
 
 // pause sets SP to newsp and pauses the execution of Go's WebAssembly code until an event is triggered.
@@ -181,18 +224,29 @@
 // clearTimeoutEvent clears a timeout event scheduled by scheduleTimeoutEvent.
 func clearTimeoutEvent(id int32)
 
+// handleEvent gets invoked on a call from JavaScript into Go. It calls the event handler of the syscall/js package
+// and then parks the handler goroutine to allow other goroutines to run before giving execution back to JavaScript.
+// When no other goroutine is awake any more, beforeIdle resumes the handler goroutine. Now that the same goroutine
+// is running as was running when the call came in from JavaScript, execution can be safely passed back to JavaScript.
 func handleEvent() {
-	prevReturnedEventHandler := returnedEventHandler
-	returnedEventHandler = nil
+	e := &event{
+		gp:       getg(),
+		returned: false,
+	}
+	events = append(events, e)
 
-	checkTimeouts()
 	eventHandler()
 
-	returnedEventHandler = getg()
+	clearIdleID()
+
+	// wait until all goroutines are idle
+	e.returned = true
 	gopark(nil, nil, waitReasonZero, traceEvNone, 1)
 
-	returnedEventHandler = prevReturnedEventHandler
+	events[len(events)-1] = nil
+	events = events[:len(events)-1]
 
+	// return execution to JavaScript
 	pause(getcallersp() - 16)
 }
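
For delay > 0, beforeIdle converts the scheduler's nanosecond delay into the millisecond argument expected by scheduleTimeoutEvent, clamping it to at least 1 ms and at most 1e9 ms (~11.5 days). The conversion in isolation, as a quick check of those bounds:

    package main

    import "fmt"

    // clampDelayMillis mirrors the delay handling at the top of beforeIdle:
    // a positive delay in nanoseconds becomes a millisecond count in [1, 1e9].
    func clampDelayMillis(delayNanos int64) int64 {
        switch {
        case delayNanos < 1e6:
            return 1
        case delayNanos < 1e15:
            return delayNanos / 1e6
        default:
            return 1e9 // ~11.5 days; arbitrary cap on how long to wait for a timer
        }
    }

    func main() {
        for _, d := range []int64{1, 5e6, 1e15, 1e18} {
            fmt.Println(d, "ns ->", clampDelayMillis(d), "ms")
        }
    }
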
 
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index b36c97f..671e524 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin nacl netbsd openbsd plan9 solaris windows
+// +build aix darwin netbsd openbsd plan9 solaris windows
 
 package runtime
 
@@ -33,6 +33,10 @@
 )
 
 func lock(l *mutex) {
+	lockWithRank(l, getLockRank(l))
+}
+
+func lock2(l *mutex) {
 	gp := getg()
 	if gp.m.locks < 0 {
 		throw("runtime·lock: lock count")
@@ -89,9 +93,13 @@
 	}
 }
 
+func unlock(l *mutex) {
+	unlockWithRank(l)
+}
+
 //go:nowritebarrier
 // We might not be holding a p in this code.
-func unlock(l *mutex) {
+func unlock2(l *mutex) {
 	gp := getg()
 	var mp *m
 	for {
@@ -289,8 +297,8 @@
 	return ok
 }
 
-func beforeIdle() bool {
-	return false
+func beforeIdle(int64) (*g, bool) {
+	return nil, false
 }
 
 func checkTimeouts() {}
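
Across lock_futex.go, lock_js.go, and lock_sema.go the change is the same mechanical split: lock/unlock become thin wrappers that route through lockWithRank/unlockWithRank, while the original bodies move to lock2/unlock2. A minimal sketch of that indirection with stand-in types (sync.Mutex plays the role of the futex/semaphore primitive; names are illustrative, not the runtime's):

    package main

    import (
        "fmt"
        "sync"
    )

    // mutex stands in for runtime.mutex.
    type mutex struct {
        impl sync.Mutex
        rank int
    }

    // lock2/unlock2 hold the original lock/unlock bodies.
    func lock2(l *mutex)   { l.impl.Lock() }
    func unlock2(l *mutex) { l.impl.Unlock() }

    // With the staticlockranking experiment off these reduce to lock2/unlock2;
    // with it on they record and check the rank around the real operation.
    func lockWithRank(l *mutex, rank int) {
        fmt.Println("acquire, rank", rank)
        lock2(l)
    }

    func unlockWithRank(l *mutex) {
        unlock2(l)
        fmt.Println("release")
    }

    func lock(l *mutex)   { lockWithRank(l, l.rank) }
    func unlock(l *mutex) { unlockWithRank(l) }

    func main() {
        var l mutex
        l.rank = 7
        lock(&l)
        unlock(&l)
    }
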
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
new file mode 100644
index 0000000..0001935
--- /dev/null
+++ b/src/runtime/lockrank.go
@@ -0,0 +1,254 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file records the static ranks of the locks in the runtime. If a lock
+// is not given a rank, then it is assumed to be a leaf lock, which means no other
+// lock can be acquired while it is held. Therefore, leaf locks do not need to be
+// given an explicit rank. We list all of the architecture-independent leaf locks
+// for documentation purposes, but don't list any of the architecture-dependent
+// locks (which are all leaf locks). debugLock is ignored for ranking, since it is used
+// when printing out lock ranking errors.
+//
+// lockInit(l *mutex, rank lockRank) is used to set the rank of a lock before it is used.
+// If there is no clear place to initialize a lock, then the rank of a lock can be
+// specified during the lock call itself via lockWithRank(l *mutex, rank lockRank).
+//
+// Besides the static lock ranking (which is a total ordering of the locks), we
+// also represent and enforce the actual partial order among the locks in the
+// arcs[] array below. That is, if it is possible that lock B can be acquired when
+// lock A is the previous acquired lock that is still held, then there should be
+// an entry for A in arcs[B][]. We will currently fail not only if the total order
+// (the lock ranking) is violated, but also if there is a missing entry in the
+// partial order.
+
+package runtime
+
+type lockRank int
+
+// Constants representing the lock rank of the architecture-independent locks in
+// the runtime. Locks with lower rank must be taken before locks with higher
+// rank.
+const (
+	lockRankDummy lockRank = iota
+
+	// Locks held above sched
+	lockRankSysmon
+	lockRankScavenge
+	lockRankForcegc
+	lockRankSweepWaiters
+	lockRankAssistQueue
+	lockRankCpuprof
+	lockRankSweep
+
+	lockRankSched
+	lockRankDeadlock
+	lockRankPanic
+	lockRankAllg
+	lockRankAllp
+	lockRankPollDesc
+
+	lockRankTimers // Multiple timers locked simultaneously in destroy()
+	lockRankItab
+	lockRankReflectOffs
+	lockRankHchan // Multiple hchans acquired in lock order in syncadjustsudogs()
+	lockRankFin
+	lockRankNotifyList
+	lockRankTraceBuf
+	lockRankTraceStrings
+	lockRankMspanSpecial
+	lockRankProf
+	lockRankGcBitsArenas
+	lockRankRoot
+	lockRankTrace
+	lockRankTraceStackTab
+	lockRankNetpollInit
+
+	lockRankRwmutexW
+	lockRankRwmutexR
+
+	lockRankMcentral // For !go115NewMCentralImpl
+	lockRankSpine    // For !go115NewMCentralImpl
+	lockRankSpanSetSpine
+	lockRankGscan
+	lockRankStackpool
+	lockRankStackLarge
+	lockRankDefer
+	lockRankSudog
+
+	// Memory-related non-leaf locks
+	lockRankWbufSpans
+	lockRankMheap
+	lockRankMheapSpecial
+
+	// Memory-related leaf locks
+	lockRankGlobalAlloc
+
+	// Other leaf locks
+	lockRankGFree
+	// Generally, hchan must be acquired before gscan. But in one specific
+	// case (in syncadjustsudogs from markroot after the g has been suspended
+	// by suspendG), we allow gscan to be acquired, and then an hchan lock. To
+	// allow this case, we get this lockRankHchanLeaf rank in
+	// syncadjustsudogs(), rather than lockRankHchan. By using this special
+	// rank, we don't allow any further locks to be acquired other than more
+	// hchan locks.
+	lockRankHchanLeaf
+
+	// Leaf locks with no dependencies, so these constants are not actually used anywhere.
+	// There are other architecture-dependent leaf locks as well.
+	lockRankNewmHandoff
+	lockRankDebugPtrmask
+	lockRankFaketimeState
+	lockRankTicks
+	lockRankRaceFini
+	lockRankPollCache
+	lockRankDebug
+)
+
+// lockRankLeafRank is the rank of a lock that does not have a declared rank, and hence is
+// a leaf lock.
+const lockRankLeafRank lockRank = 1000
+
+// lockNames gives the names associated with each of the above ranks
+var lockNames = []string{
+	lockRankDummy: "",
+
+	lockRankSysmon:       "sysmon",
+	lockRankScavenge:     "scavenge",
+	lockRankForcegc:      "forcegc",
+	lockRankSweepWaiters: "sweepWaiters",
+	lockRankAssistQueue:  "assistQueue",
+	lockRankCpuprof:      "cpuprof",
+	lockRankSweep:        "sweep",
+
+	lockRankSched:    "sched",
+	lockRankDeadlock: "deadlock",
+	lockRankPanic:    "panic",
+	lockRankAllg:     "allg",
+	lockRankAllp:     "allp",
+	lockRankPollDesc: "pollDesc",
+
+	lockRankTimers:      "timers",
+	lockRankItab:        "itab",
+	lockRankReflectOffs: "reflectOffs",
+
+	lockRankHchan:         "hchan",
+	lockRankFin:           "fin",
+	lockRankNotifyList:    "notifyList",
+	lockRankTraceBuf:      "traceBuf",
+	lockRankTraceStrings:  "traceStrings",
+	lockRankMspanSpecial:  "mspanSpecial",
+	lockRankProf:          "prof",
+	lockRankGcBitsArenas:  "gcBitsArenas",
+	lockRankRoot:          "root",
+	lockRankTrace:         "trace",
+	lockRankTraceStackTab: "traceStackTab",
+	lockRankNetpollInit:   "netpollInit",
+
+	lockRankRwmutexW: "rwmutexW",
+	lockRankRwmutexR: "rwmutexR",
+
+	lockRankMcentral:     "mcentral",
+	lockRankSpine:        "spine",
+	lockRankSpanSetSpine: "spanSetSpine",
+	lockRankGscan:        "gscan",
+	lockRankStackpool:    "stackpool",
+	lockRankStackLarge:   "stackLarge",
+	lockRankDefer:        "defer",
+	lockRankSudog:        "sudog",
+
+	lockRankWbufSpans:    "wbufSpans",
+	lockRankMheap:        "mheap",
+	lockRankMheapSpecial: "mheapSpecial",
+
+	lockRankGlobalAlloc: "globalAlloc.mutex",
+
+	lockRankGFree:     "gFree",
+	lockRankHchanLeaf: "hchanLeaf",
+
+	lockRankNewmHandoff:   "newmHandoff.lock",
+	lockRankDebugPtrmask:  "debugPtrmask.lock",
+	lockRankFaketimeState: "faketimeState.lock",
+	lockRankTicks:         "ticks.lock",
+	lockRankRaceFini:      "raceFiniLock",
+	lockRankPollCache:     "pollCache.lock",
+	lockRankDebug:         "debugLock",
+}
+
+func (rank lockRank) String() string {
+	if rank == 0 {
+		return "UNKNOWN"
+	}
+	if rank == lockRankLeafRank {
+		return "LEAF"
+	}
+	return lockNames[rank]
+}
+
+// lockPartialOrder is a partial order among the various lock types, listing the immediate
+// ordering that has actually been observed in the runtime. Each entry (which
+// corresponds to a particular lock rank) specifies the list of locks that can
+// already be held immediately "above" it.
+//
+// So, for example, the lockRankSched entry shows that all the locks preceding it in
+// rank can actually be held. The fin lock shows that only the sched, timers, or
+// hchan lock can be held immediately above it when it is acquired.
+var lockPartialOrder [][]lockRank = [][]lockRank{
+	lockRankDummy:         {},
+	lockRankSysmon:        {},
+	lockRankScavenge:      {lockRankSysmon},
+	lockRankForcegc:       {lockRankSysmon},
+	lockRankSweepWaiters:  {},
+	lockRankAssistQueue:   {},
+	lockRankCpuprof:       {},
+	lockRankSweep:         {},
+	lockRankSched:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
+	lockRankDeadlock:      {lockRankDeadlock},
+	lockRankPanic:         {lockRankDeadlock},
+	lockRankAllg:          {lockRankSysmon, lockRankSched, lockRankPanic},
+	lockRankAllp:          {lockRankSysmon, lockRankSched},
+	lockRankPollDesc:      {},
+	lockRankTimers:        {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
+	lockRankItab:          {},
+	lockRankReflectOffs:   {lockRankItab},
+	lockRankHchan:         {lockRankScavenge, lockRankSweep, lockRankHchan},
+	lockRankFin:           {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan},
+	lockRankNotifyList:    {},
+	lockRankTraceBuf:      {lockRankSysmon, lockRankScavenge},
+	lockRankTraceStrings:  {lockRankTraceBuf},
+	lockRankMspanSpecial:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings},
+	lockRankProf:          {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankGcBitsArenas:  {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankRoot:          {},
+	lockRankTrace:         {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankSched, lockRankHchan, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankSweep},
+	lockRankTraceStackTab: {lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankTimers, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankTrace},
+	lockRankNetpollInit:   {lockRankTimers},
+
+	lockRankRwmutexW: {},
+	lockRankRwmutexR: {lockRankRwmutexW},
+
+	lockRankMcentral:     {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankSpine:        {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankGscan:        {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankTraceBuf, lockRankTraceStrings, lockRankRoot, lockRankNotifyList, lockRankProf, lockRankGcBitsArenas, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine},
+	lockRankStackpool:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankMcentral, lockRankSpine, lockRankSpanSetSpine, lockRankGscan},
+	lockRankStackLarge:   {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankSpanSetSpine, lockRankGscan},
+	lockRankDefer:        {},
+	lockRankSudog:        {lockRankNotifyList, lockRankHchan},
+	lockRankWbufSpans:    {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
+	lockRankMheap:        {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankMcentral, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
+	lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
+	lockRankGlobalAlloc:  {lockRankProf, lockRankSpine, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
+
+	lockRankGFree:     {lockRankSched},
+	lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf},
+
+	lockRankNewmHandoff:   {},
+	lockRankDebugPtrmask:  {},
+	lockRankFaketimeState: {},
+	lockRankTicks:         {},
+	lockRankRaceFini:      {},
+	lockRankPollCache:     {},
+	lockRankDebug:         {},
+}
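
This table is consulted by checkRanks in lockrank_on.go: an acquisition is allowed only if the new rank is not lower than the most recently held rank and, additionally, the held rank appears in the new rank's lockPartialOrder entry. A standalone sketch of that check with a toy four-lock table; the ranks and edges below are illustrative, though they mirror the sched/allg/hchan/fin entries above:

    package main

    import "fmt"

    // rank gives the total order; partialOrder[r] lists the ranks that may be
    // held immediately before acquiring a lock of rank r.
    type rank int

    const (
        rankSched rank = iota + 1
        rankAllg
        rankHchan
        rankFin
    )

    var partialOrder = map[rank][]rank{
        rankSched: {},
        rankAllg:  {rankSched},
        rankHchan: {},
        rankFin:   {rankSched, rankAllg, rankHchan},
    }

    // mayAcquire reports whether a lock of rank next may be acquired while a
    // lock of rank held is the most recently acquired lock still held.
    func mayAcquire(held, next rank) bool {
        if next < held {
            return false // violates the total order outright
        }
        for _, r := range partialOrder[next] {
            if r == held {
                return true
            }
        }
        return false // equal or higher rank, but no recorded edge
    }

    func main() {
        fmt.Println("hchan then fin:", mayAcquire(rankHchan, rankFin)) // true
        fmt.Println("fin then hchan:", mayAcquire(rankFin, rankHchan)) // false
    }
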
diff --git a/src/runtime/lockrank_off.go b/src/runtime/lockrank_off.go
new file mode 100644
index 0000000..891589c
--- /dev/null
+++ b/src/runtime/lockrank_off.go
@@ -0,0 +1,32 @@
+// +build !goexperiment.staticlockranking
+
+package runtime
+
+// lockRankStruct is embedded in mutex, but is empty when staticlockranking is
+// disabled (the default).
+type lockRankStruct struct {
+}
+
+func lockInit(l *mutex, rank lockRank) {
+}
+
+func getLockRank(l *mutex) lockRank {
+	return 0
+}
+
+func lockWithRank(l *mutex, rank lockRank) {
+	lock2(l)
+}
+
+func acquireLockRank(rank lockRank) {
+}
+
+func unlockWithRank(l *mutex) {
+	unlock2(l)
+}
+
+func releaseLockRank(rank lockRank) {
+}
+
+func lockWithRankMayAcquire(l *mutex, rank lockRank) {
+}
diff --git a/src/runtime/lockrank_on.go b/src/runtime/lockrank_on.go
new file mode 100644
index 0000000..cf4151f
--- /dev/null
+++ b/src/runtime/lockrank_on.go
@@ -0,0 +1,210 @@
+// +build goexperiment.staticlockranking
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+// lockRankStruct is embedded in mutex
+type lockRankStruct struct {
+	// static lock ranking of the lock
+	rank lockRank
+	// pad field to make sure lockRankStruct is a multiple of 8 bytes, even on
+	// 32-bit systems.
+	pad int
+}
+
+// init checks that the partial order in lockPartialOrder fits within the total
+// order determined by the order of the lockRank constants.
+func init() {
+	for rank, list := range lockPartialOrder {
+		for _, entry := range list {
+			if entry > lockRank(rank) {
+				println("lockPartial order row", lockRank(rank).String(), "entry", entry.String())
+				throw("lockPartialOrder table is inconsistent with total lock ranking order")
+			}
+		}
+	}
+}
+
+func lockInit(l *mutex, rank lockRank) {
+	l.rank = rank
+}
+
+func getLockRank(l *mutex) lockRank {
+	return l.rank
+}
+
+// The following functions are the entry-points to record lock
+// operations.
+// All of these are nosplit and switch to the system stack immediately
+// to avoid stack growths. Since a stack growth could itself have lock
+// operations, this prevents re-entrant calls.
+
+// lockWithRank is like lock(l), but allows the caller to specify a lock rank
+// when acquiring a non-static lock.
+//go:nosplit
+func lockWithRank(l *mutex, rank lockRank) {
+	if l == &debuglock || l == &paniclk {
+		// debuglock is only used for println/printlock(). Don't do lock
+		// rank recording for it, since print/println are used when
+		// printing out a lock ordering problem below.
+		//
+		// paniclk has an ordering problem, since it can be acquired
+		// during a panic with any other locks held (especially if the
+		// panic is because of a directed segv), and yet also allg is
+		// acquired after paniclk in tracebackothers(). This is a genuine
+		// problem, so for now we don't do lock rank recording for paniclk
+		// either.
+		lock2(l)
+		return
+	}
+	if rank == 0 {
+		rank = lockRankLeafRank
+	}
+	gp := getg()
+	// Log the new class.
+	systemstack(func() {
+		i := gp.m.locksHeldLen
+		if i >= len(gp.m.locksHeld) {
+			throw("too many locks held concurrently for rank checking")
+		}
+		gp.m.locksHeld[i].rank = rank
+		gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l))
+		gp.m.locksHeldLen++
+
+		// i is the index of the lock being acquired
+		if i > 0 {
+			checkRanks(gp, gp.m.locksHeld[i-1].rank, rank)
+		}
+		lock2(l)
+	})
+}
+
+// acquireLockRank acquires a rank which is not associated with a mutex lock
+//go:nosplit
+func acquireLockRank(rank lockRank) {
+	gp := getg()
+	// Log the new class.
+	systemstack(func() {
+		i := gp.m.locksHeldLen
+		if i >= len(gp.m.locksHeld) {
+			throw("too many locks held concurrently for rank checking")
+		}
+		gp.m.locksHeld[i].rank = rank
+		gp.m.locksHeld[i].lockAddr = 0
+		gp.m.locksHeldLen++
+
+		// i is the index of the lock being acquired
+		if i > 0 {
+			checkRanks(gp, gp.m.locksHeld[i-1].rank, rank)
+		}
+	})
+}
+
+// checkRanks checks if goroutine g, which has most recently acquired a lock
+// with rank 'prevRank', can now acquire a lock with rank 'rank'.
+func checkRanks(gp *g, prevRank, rank lockRank) {
+	rankOK := false
+	if rank < prevRank {
+		// If rank < prevRank, then we definitely have a rank error
+		rankOK = false
+	} else if rank == lockRankLeafRank {
+		// If new lock is a leaf lock, then the preceding lock can
+		// be anything except another leaf lock.
+		rankOK = prevRank < lockRankLeafRank
+	} else {
+		// We've now verified the total lock ranking, but we
+		// also enforce the partial ordering specified by
+		// lockPartialOrder as well. Two locks with the same rank
+		// can only be acquired at the same time if explicitly
+		// listed in the lockPartialOrder table.
+		list := lockPartialOrder[rank]
+		for _, entry := range list {
+			if entry == prevRank {
+				rankOK = true
+				break
+			}
+		}
+	}
+	if !rankOK {
+		printlock()
+		println(gp.m.procid, " ======")
+		for j, held := range gp.m.locksHeld[:gp.m.locksHeldLen] {
+			println(j, ":", held.rank.String(), held.rank, unsafe.Pointer(gp.m.locksHeld[j].lockAddr))
+		}
+		throw("lock ordering problem")
+	}
+}
+
+//go:nosplit
+func unlockWithRank(l *mutex) {
+	if l == &debuglock || l == &paniclk {
+		// See comment at beginning of lockWithRank.
+		unlock2(l)
+		return
+	}
+	gp := getg()
+	systemstack(func() {
+		found := false
+		for i := gp.m.locksHeldLen - 1; i >= 0; i-- {
+			if gp.m.locksHeld[i].lockAddr == uintptr(unsafe.Pointer(l)) {
+				found = true
+				copy(gp.m.locksHeld[i:gp.m.locksHeldLen-1], gp.m.locksHeld[i+1:gp.m.locksHeldLen])
+				gp.m.locksHeldLen--
+				break
+			}
+		}
+		if !found {
+			println(gp.m.procid, ":", l.rank.String(), l.rank, l)
+			throw("unlock without matching lock acquire")
+		}
+		unlock2(l)
+	})
+}
+
+// releaseLockRank releases a rank which is not associated with a mutex lock
+//go:nosplit
+func releaseLockRank(rank lockRank) {
+	gp := getg()
+	systemstack(func() {
+		found := false
+		for i := gp.m.locksHeldLen - 1; i >= 0; i-- {
+			if gp.m.locksHeld[i].rank == rank && gp.m.locksHeld[i].lockAddr == 0 {
+				found = true
+				copy(gp.m.locksHeld[i:gp.m.locksHeldLen-1], gp.m.locksHeld[i+1:gp.m.locksHeldLen])
+				gp.m.locksHeldLen--
+				break
+			}
+		}
+		if !found {
+			println(gp.m.procid, ":", rank.String(), rank)
+			throw("lockRank release without matching lockRank acquire")
+		}
+	})
+}
+
+//go:nosplit
+func lockWithRankMayAcquire(l *mutex, rank lockRank) {
+	gp := getg()
+	if gp.m.locksHeldLen == 0 {
+		// No possibility of a lock ordering problem if no other locks are held
+		return
+	}
+
+	systemstack(func() {
+		i := gp.m.locksHeldLen
+		if i >= len(gp.m.locksHeld) {
+			throw("too many locks held concurrently for rank checking")
+		}
+		// Temporarily add this lock to the locksHeld list, so
+		// checkRanks() will print out list, including this lock, if there
+		// is a lock ordering problem.
+		gp.m.locksHeld[i].rank = rank
+		gp.m.locksHeld[i].lockAddr = uintptr(unsafe.Pointer(l))
+		gp.m.locksHeldLen++
+		checkRanks(gp, gp.m.locksHeld[i-1].rank, rank)
+		gp.m.locksHeldLen--
+	})
+}
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index d768054..eaf8db7 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -19,7 +19,7 @@
 //	fixalloc: a free-list allocator for fixed-size off-heap objects,
 //		used to manage storage used by the allocator.
 //	mheap: the malloc heap, managed at page (8192-byte) granularity.
-//	mspan: a run of pages managed by the mheap.
+//	mspan: a run of in-use pages managed by the mheap.
 //	mcentral: collects all spans of a given size class.
 //	mcache: a per-P cache of mspans with free space.
 //	mstats: allocation statistics.
@@ -56,20 +56,16 @@
 //	   it is placed on the mcentral free list for the mspan's size
 //	   class.
 //
-//	3. Otherwise, if all objects in the mspan are free, the mspan
-//	   is now "idle", so it is returned to the mheap and no longer
-//	   has a size class.
-//	   This may coalesce it with adjacent idle mspans.
-//
-//	4. If an mspan remains idle for long enough, return its pages
-//	   to the operating system.
+//	3. Otherwise, if all objects in the mspan are free, the mspan's
+//	   pages are returned to the mheap and the mspan is now dead.
 //
 // Allocating and freeing a large object uses the mheap
 // directly, bypassing the mcache and mcentral.
 //
-// Free object slots in an mspan are zeroed only if mspan.needzero is
-// false. If needzero is true, objects are zeroed as they are
-// allocated. There are various benefits to delaying zeroing this way:
+// If mspan.needzero is false, then free object slots in the mspan are
+// already zeroed. Otherwise if needzero is true, objects are zeroed as
+// they are allocated. There are various benefits to delaying zeroing
+// this way:
 //
 //	1. Stack frame allocation can avoid zeroing altogether.
 //
@@ -197,17 +193,21 @@
 	// exceed Go's 48 bit limit, it's extremely unlikely in
 	// practice.
 	//
-	// On aix/ppc64, the limits is increased to 1<<60 to accept addresses
-	// returned by mmap syscall. These are in range:
-	//  0x0a00000000000000 - 0x0afffffffffffff
-	//
 	// On 32-bit platforms, we accept the full 32-bit address
 	// space because doing so is cheap.
 	// mips32 only has access to the low 2GB of virtual memory, so
 	// we further limit it to 31 bits.
 	//
+	// On darwin/arm64, although 64-bit pointers are presumably
+	// available, pointers are truncated to 33 bits. Furthermore,
+	// only the top 4 GiB of the address space are actually available
+	// to the application, but we allow the whole 33 bits anyway for
+	// simplicity.
+	// TODO(mknyszek): Consider limiting it to 32 bits and using
+	// arenaBaseOffset to offset into the top 4 GiB.
+	//
 	// WebAssembly currently has a limit of 4GB linear memory.
-	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosAix))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 60*sys.GoosAix
+	heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosDarwin*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosDarwin*sys.GoarchArm64
 
 	// maxAlloc is the maximum size of an allocation. On 64-bit,
 	// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
@@ -226,7 +226,6 @@
 	//       Platform  Addr bits  Arena size  L1 entries   L2 entries
 	// --------------  ---------  ----------  ----------  -----------
 	//       */64-bit         48        64MB           1    4M (32MB)
-	//     aix/64-bit         60       256MB        4096    4M (32MB)
 	// windows/64-bit         48         4MB          64    1M  (8MB)
 	//       */32-bit         32         4MB           1  1024  (4KB)
 	//     */mips(le)         31         4MB           1   512  (2KB)
@@ -248,7 +247,7 @@
 	// logHeapArenaBytes is log_2 of heapArenaBytes. For clarity,
 	// prefer using heapArenaBytes where possible (we need the
 	// constant to compute some other constants).
-	logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoosAix)*(1-sys.GoarchWasm)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (8+20)*sys.GoosAix + (2+20)*sys.GoarchWasm
+	logHeapArenaBytes = (6+20)*(_64bit*(1-sys.GoosWindows)*(1-sys.GoarchWasm)) + (2+20)*(_64bit*sys.GoosWindows) + (2+20)*(1-_64bit) + (2+20)*sys.GoarchWasm
 
 	// heapArenaBitmapBytes is the size of each heap arena's bitmap.
 	heapArenaBitmapBytes = heapArenaBytes / (sys.PtrSize * 8 / 2)
@@ -268,10 +267,7 @@
 	// We use the L1 map on 64-bit Windows because the arena size
 	// is small, but the address space is still 48 bits, and
 	// there's a high cost to having a large L2.
-	//
-	// We use the L1 map on aix/ppc64 to keep the same L2 value
-	// as on Linux.
-	arenaL1Bits = 6*(_64bit*sys.GoosWindows) + 12*sys.GoosAix
+	arenaL1Bits = 6 * (_64bit * sys.GoosWindows)
 
 	// arenaL2Bits is the number of bits of the arena number
 	// covered by the second level arena index.
@@ -298,9 +294,15 @@
 	// bits. This offset lets us handle "negative" addresses (or
 	// high addresses if viewed as unsigned).
 	//
+	// On aix/ppc64, this offset allows heapAddrBits to stay at 48.
+	// Otherwise, it would have to be 60 in order to handle mmap addresses
+	// (in the range 0x0a00000000000000 - 0x0afffffffffffff), but in that
+	// case the memory reserved in (s *pageAlloc).init for chunks
+	// causes significant slowdowns.
+	//
 	// On other platforms, the user address space is contiguous
 	// and starts at 0, so no offset is necessary.
-	arenaBaseOffset uintptr = sys.GoarchAmd64 * (1 << 47)
+	arenaBaseOffset = 0xffff800000000000*sys.GoarchAmd64 + 0x0a00000000000000*sys.GoosAix
 
 	// Max number of threads to run garbage collection.
 	// 2, 3, and 4 are all plausible maximums depending
@@ -433,6 +435,10 @@
 		// The OS init code failed to fetch the physical page size.
 		throw("failed to get system page size")
 	}
+	if physPageSize > maxPhysPageSize {
+		print("system page size (", physPageSize, ") is larger than maximum page size (", maxPhysPageSize, ")\n")
+		throw("bad system page size")
+	}
 	if physPageSize < minPhysPageSize {
 		print("system page size (", physPageSize, ") is smaller than minimum page size (", minPhysPageSize, ")\n")
 		throw("bad system page size")
@@ -445,6 +451,13 @@
 		print("system huge page size (", physHugePageSize, ") must be a power of 2\n")
 		throw("bad system huge page size")
 	}
+	if physHugePageSize > maxPhysHugePageSize {
+		// physHugePageSize is greater than the maximum supported huge page size.
+		// Don't throw here, like in the other cases, since a system configured
+		// in this way isn't wrong, we just don't have the code to support them.
+		// in this way isn't wrong; we just don't have the code to support it.
+		physHugePageSize = 0
+	}
 	if physHugePageSize != 0 {
 		// Since physHugePageSize is a power of 2, it suffices to increase
 		// physHugePageShift until 1<<physHugePageShift == physHugePageSize.
@@ -452,11 +465,21 @@
 			physHugePageShift++
 		}
 	}
+	if pagesPerArena%pagesPerSpanRoot != 0 {
+		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerSpanRoot (", pagesPerSpanRoot, ")\n")
+		throw("bad pagesPerSpanRoot")
+	}
+	if pagesPerArena%pagesPerReclaimerChunk != 0 {
+		print("pagesPerArena (", pagesPerArena, ") is not divisible by pagesPerReclaimerChunk (", pagesPerReclaimerChunk, ")\n")
+		throw("bad pagesPerReclaimerChunk")
+	}
 
 	// Initialize the heap.
 	mheap_.init()
-	_g_ := getg()
-	_g_.m.mcache = allocmcache()
+	mcache0 = allocmcache()
+	lockInit(&gcBitsArenas.lock, lockRankGcBitsArenas)
+	lockInit(&proflock, lockRankProf)
+	lockInit(&globalAlloc.mutex, lockRankGlobalAlloc)
 
 	// Create initial arena growth hints.
 	if sys.PtrSize == 8 {
@@ -490,6 +513,7 @@
 		// allocation at 0x40 << 32 because when using 4k pages with 3-level
 		// translation buffers, the user address space is limited to 39 bits
 		// On darwin/arm64, the address space is even smaller.
+		//
 		// On AIX, mmaps starts at 0x0A00000000000000 for 64-bit.
 		// processes.
 		for i := 0x7f; i >= 0; i-- {
@@ -568,7 +592,7 @@
 		if mheap_.heapArenaAlloc.next <= p && p < mheap_.heapArenaAlloc.end {
 			p = mheap_.heapArenaAlloc.end
 		}
-		p = round(p+(256<<10), heapArenaBytes)
+		p = alignUp(p+(256<<10), heapArenaBytes)
 		// Because we're worried about fragmentation on
 		// 32-bit, we try to make a large initial reservation.
 		arenaSizes := []uintptr{
@@ -580,7 +604,7 @@
 			a, size := sysReserveAligned(unsafe.Pointer(p), arenaSize, heapArenaBytes)
 			if a != nil {
 				mheap_.arena.init(uintptr(a), size)
-				p = uintptr(a) + size // For hint below
+				p = mheap_.arena.end // For hint below
 				break
 			}
 		}
@@ -601,7 +625,7 @@
 //
 // h must be locked.
 func (h *mheap) sysAlloc(n uintptr) (v unsafe.Pointer, size uintptr) {
-	n = round(n, heapArenaBytes)
+	n = alignUp(n, heapArenaBytes)
 
 	// First, try the arena pre-reservation.
 	v = h.arena.alloc(n, heapArenaBytes, &memstats.heap_sys)
@@ -784,7 +808,7 @@
 		// re-reserve the aligned sub-region. This may race,
 		// so we may have to try again.
 		sysFree(unsafe.Pointer(p), size+align, nil)
-		p = round(p, align)
+		p = alignUp(p, align)
 		p2 := sysReserve(unsafe.Pointer(p), size)
 		if p != uintptr(p2) {
 			// Must have raced. Try again.
@@ -798,7 +822,7 @@
 		return p2, size
 	default:
 		// Trim off the unaligned parts.
-		pAligned := round(p, align)
+		pAligned := alignUp(p, align)
 		sysFree(unsafe.Pointer(p), pAligned-p, nil)
 		end := pAligned + size
 		endLen := (p + size + align) - end
@@ -939,7 +963,20 @@
 
 	shouldhelpgc := false
 	dataSize := size
-	c := gomcache()
+	var c *mcache
+	if mp.p != 0 {
+		c = mp.p.ptr().mcache
+	} else {
+		// We will be called without a P while bootstrapping,
+		// in which case we use mcache0, which is set in mallocinit.
+		// mcache0 is cleared when bootstrapping is complete,
+		// by procresize.
+		c = mcache0
+		if c == nil {
+			throw("malloc called with no P")
+		}
+	}
+	var span *mspan
 	var x unsafe.Pointer
 	noscan := typ == nil || typ.ptrdata == 0
 	if size <= maxSmallSize {
@@ -976,11 +1013,11 @@
 			off := c.tinyoffset
 			// Align tiny pointer for required (conservative) alignment.
 			if size&7 == 0 {
-				off = round(off, 8)
+				off = alignUp(off, 8)
 			} else if size&3 == 0 {
-				off = round(off, 4)
+				off = alignUp(off, 4)
 			} else if size&1 == 0 {
-				off = round(off, 2)
+				off = alignUp(off, 2)
 			}
 			if off+size <= maxTinySize && c.tiny != 0 {
 				// The object fits into existing tiny block.
@@ -992,10 +1029,10 @@
 				return x
 			}
 			// Allocate a new maxTinySize block.
-			span := c.alloc[tinySpanClass]
+			span = c.alloc[tinySpanClass]
 			v := nextFreeFast(span)
 			if v == 0 {
-				v, _, shouldhelpgc = c.nextFree(tinySpanClass)
+				v, span, shouldhelpgc = c.nextFree(tinySpanClass)
 			}
 			x = unsafe.Pointer(v)
 			(*[2]uint64)(x)[0] = 0
@@ -1010,13 +1047,13 @@
 		} else {
 			var sizeclass uint8
 			if size <= smallSizeMax-8 {
-				sizeclass = size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]
+				sizeclass = size_to_class8[divRoundUp(size, smallSizeDiv)]
 			} else {
-				sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
+				sizeclass = size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]
 			}
 			size = uintptr(class_to_size[sizeclass])
 			spc := makeSpanClass(sizeclass, noscan)
-			span := c.alloc[spc]
+			span = c.alloc[spc]
 			v := nextFreeFast(span)
 			if v == 0 {
 				v, span, shouldhelpgc = c.nextFree(spc)
@@ -1027,15 +1064,14 @@
 			}
 		}
 	} else {
-		var s *mspan
 		shouldhelpgc = true
 		systemstack(func() {
-			s = largeAlloc(size, needzero, noscan)
+			span = largeAlloc(size, needzero, noscan)
 		})
-		s.freeindex = 1
-		s.allocCount = 1
-		x = unsafe.Pointer(s.base())
-		size = s.elemsize
+		span.freeindex = 1
+		span.allocCount = 1
+		x = unsafe.Pointer(span.base())
+		size = span.elemsize
 	}
 
 	var scanSize uintptr
@@ -1076,7 +1112,7 @@
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
 	if gcphase != _GCoff {
-		gcmarknewobject(uintptr(x), size, scanSize)
+		gcmarknewobject(span, uintptr(x), size, scanSize)
 	}
 
 	if raceenabled {
@@ -1135,10 +1171,16 @@
 	// pays the debt down to npage pages.
 	deductSweepCredit(npages*_PageSize, npages)
 
-	s := mheap_.alloc(npages, makeSpanClass(0, noscan), true, needzero)
+	spc := makeSpanClass(0, noscan)
+	s := mheap_.alloc(npages, spc, needzero)
 	if s == nil {
 		throw("out of memory")
 	}
+	if go115NewMCentralImpl {
+		// Put the large span in the mcentral swept list so that it's
+		// visible to the background sweeper.
+		mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
+	}
 	s.limit = s.base() + size
 	heapBitsForAddr(s.base()).initSpan(s)
 	return s
@@ -1179,7 +1221,16 @@
 }
 
 func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	mp.mcache.next_sample = nextSample()
+	var c *mcache
+	if mp.p != 0 {
+		c = mp.p.ptr().mcache
+	} else {
+		c = mcache0
+		if c == nil {
+			throw("profilealloc called with no P")
+		}
+	}
+	c.next_sample = nextSample()
 	mProf_Malloc(x, size)
 }
 
@@ -1313,7 +1364,7 @@
 		lock(&globalAlloc.mutex)
 		persistent = &globalAlloc.persistentAlloc
 	}
-	persistent.off = round(persistent.off, align)
+	persistent.off = alignUp(persistent.off, align)
 	if persistent.off+size > persistentChunkSize || persistent.base == nil {
 		persistent.base = (*notInHeap)(sysAlloc(persistentChunkSize, &memstats.other_sys))
 		if persistent.base == nil {
@@ -1331,7 +1382,7 @@
 				break
 			}
 		}
-		persistent.off = round(sys.PtrSize, align)
+		persistent.off = alignUp(sys.PtrSize, align)
 	}
 	p := persistent.base.add(persistent.off)
 	persistent.off += size
@@ -1372,17 +1423,24 @@
 }
 
 func (l *linearAlloc) init(base, size uintptr) {
+	if base+size < base {
+		// Chop off the last byte. The runtime isn't prepared
+		// to deal with situations where the bounds could overflow.
+		// Leave that memory reserved, though, so we don't map it
+		// later.
+		size -= 1
+	}
 	l.next, l.mapped = base, base
 	l.end = base + size
 }
 
 func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
-	p := round(l.next, align)
+	p := alignUp(l.next, align)
 	if p+size > l.end {
 		return nil
 	}
 	l.next = p + size
-	if pEnd := round(l.next-1, physPageSize); pEnd > l.mapped {
+	if pEnd := alignUp(l.next-1, physPageSize); pEnd > l.mapped {
 		// Transition from Reserved to Prepared to Ready.
 		sysMap(unsafe.Pointer(l.mapped), pEnd-l.mapped, sysStat)
 		sysUsed(unsafe.Pointer(l.mapped), pEnd-l.mapped)
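
This hunk replaces round with alignUp and the open-coded (n+d-1)/d divisions with divRoundUp. Assuming the conventional power-of-two implementations for those helpers (they are defined elsewhere in the runtime, not in this hunk), the tiny-allocator alignment decision looks like this in isolation:

    package main

    import "fmt"

    // alignUp rounds n up to a multiple of a, where a is a power of two.
    // (Assumed implementation: the usual (n + a - 1) &^ (a - 1) idiom.)
    func alignUp(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

    // divRoundUp returns ceil(n / b). (Assumed implementation.)
    func divRoundUp(n, b uintptr) uintptr { return (n + b - 1) / b }

    // tinyOffset mirrors the alignment choice in mallocgc's tiny allocator:
    // align the tiny-block offset to the largest power of two dividing the size.
    func tinyOffset(off, size uintptr) uintptr {
        if size&7 == 0 {
            return alignUp(off, 8)
        } else if size&3 == 0 {
            return alignUp(off, 4)
        } else if size&1 == 0 {
            return alignUp(off, 2)
        }
        return off
    }

    func main() {
        fmt.Println(tinyOffset(5, 8))  // 8: 8-byte object gets 8-byte alignment
        fmt.Println(tinyOffset(5, 12)) // 8: 12 is divisible by 4
        fmt.Println(tinyOffset(5, 7))  // 5: odd sizes need no alignment
        fmt.Println(divRoundUp(13, 4)) // 4
        fmt.Println(alignUp(100, 64))  // 128
    }
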
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index a2d5864..5c97f54 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -168,6 +168,14 @@
 	}
 }
 
+func TestPageCacheLeak(t *testing.T) {
+	defer GOMAXPROCS(GOMAXPROCS(1))
+	leaked := PageCachePagesLeaked()
+	if leaked != 0 {
+		t.Fatalf("found %d leaked pages in page caches", leaked)
+	}
+}
+
 func TestPhysicalMemoryUtilization(t *testing.T) {
 	got := runTestProg(t, "testprog", "GCPhys")
 	want := "OK\n"
@@ -176,6 +184,19 @@
 	}
 }
 
+func TestScavengedBitsCleared(t *testing.T) {
+	var mismatches [128]BitsMismatch
+	if n, ok := CheckScavengedBitsCleared(mismatches[:]); !ok {
+		t.Errorf("uncleared scavenged bits")
+		for _, m := range mismatches[:n] {
+			t.Logf("\t@ address 0x%x", m.Base)
+			t.Logf("\t|  got: %064b", m.Got)
+			t.Logf("\t| want: %064b", m.Want)
+		}
+		t.FailNow()
+	}
+}
+
 type acLink struct {
 	x [1 << 20]byte
 }
@@ -183,14 +204,6 @@
 var arenaCollisionSink []*acLink
 
 func TestArenaCollision(t *testing.T) {
-	if GOOS == "darwin" && race.Enabled {
-		// Skip this test on Darwin in race mode because Darwin 10.10 has
-		// issues following arena hints and runs out of them in race mode, so
-		// MAP_FIXED is used to ensure we keep the heap in the memory region the
-		// race detector expects.
-		// TODO(mknyszek): Delete this when Darwin 10.10 is no longer supported.
-		t.Skip("disabled on Darwin with race mode since MAP_FIXED is used")
-	}
 	testenv.MustHaveExec(t)
 
 	// Test that mheap.sysAlloc handles collisions with other
diff --git a/src/runtime/map.go b/src/runtime/map.go
index 386f965..399c1b0 100644
--- a/src/runtime/map.go
+++ b/src/runtime/map.go
@@ -66,7 +66,7 @@
 	bucketCnt     = 1 << bucketCntBits
 
 	// Maximum average load of a bucket that triggers growth is 6.5.
-	// Represent as loadFactorNum/loadFactDen, to allow integer math.
+	// Represent as loadFactorNum/loadFactorDen, to allow integer math.
 	loadFactorNum = 13
 	loadFactorDen = 2
 
@@ -403,15 +403,14 @@
 	}
 	if h == nil || h.count == 0 {
 		if t.hashMightPanic() {
-			t.key.alg.hash(key, 0) // see issue 23734
+			t.hasher(key, 0) // see issue 23734
 		}
 		return unsafe.Pointer(&zeroVal[0])
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
 	}
-	alg := t.key.alg
-	hash := alg.hash(key, uintptr(h.hash0))
+	hash := t.hasher(key, uintptr(h.hash0))
 	m := bucketMask(h.B)
 	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 	if c := h.oldbuckets; c != nil {
@@ -438,7 +437,7 @@
 			if t.indirectkey() {
 				k = *((*unsafe.Pointer)(k))
 			}
-			if alg.equal(key, k) {
+			if t.key.equal(key, k) {
 				e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
 				if t.indirectelem() {
 					e = *((*unsafe.Pointer)(e))
@@ -462,15 +461,14 @@
 	}
 	if h == nil || h.count == 0 {
 		if t.hashMightPanic() {
-			t.key.alg.hash(key, 0) // see issue 23734
+			t.hasher(key, 0) // see issue 23734
 		}
 		return unsafe.Pointer(&zeroVal[0]), false
 	}
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map read and map write")
 	}
-	alg := t.key.alg
-	hash := alg.hash(key, uintptr(h.hash0))
+	hash := t.hasher(key, uintptr(h.hash0))
 	m := bucketMask(h.B)
 	b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
 	if c := h.oldbuckets; c != nil {
@@ -497,7 +495,7 @@
 			if t.indirectkey() {
 				k = *((*unsafe.Pointer)(k))
 			}
-			if alg.equal(key, k) {
+			if t.key.equal(key, k) {
 				e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
 				if t.indirectelem() {
 					e = *((*unsafe.Pointer)(e))
@@ -514,8 +512,7 @@
 	if h == nil || h.count == 0 {
 		return nil, nil
 	}
-	alg := t.key.alg
-	hash := alg.hash(key, uintptr(h.hash0))
+	hash := t.hasher(key, uintptr(h.hash0))
 	m := bucketMask(h.B)
 	b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + (hash&m)*uintptr(t.bucketsize)))
 	if c := h.oldbuckets; c != nil {
@@ -542,7 +539,7 @@
 			if t.indirectkey() {
 				k = *((*unsafe.Pointer)(k))
 			}
-			if alg.equal(key, k) {
+			if t.key.equal(key, k) {
 				e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.elemsize))
 				if t.indirectelem() {
 					e = *((*unsafe.Pointer)(e))
@@ -587,10 +584,9 @@
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map writes")
 	}
-	alg := t.key.alg
-	hash := alg.hash(key, uintptr(h.hash0))
+	hash := t.hasher(key, uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash, since alg.hash may panic,
+	// Set hashWriting after calling t.hasher, since t.hasher may panic,
 	// in which case we have not actually done a write.
 	h.flags ^= hashWriting
 
@@ -627,7 +623,7 @@
 			if t.indirectkey() {
 				k = *((*unsafe.Pointer)(k))
 			}
-			if !alg.equal(key, k) {
+			if !t.key.equal(key, k) {
 				continue
 			}
 			// already have a mapping for key. Update it.
@@ -698,7 +694,7 @@
 	}
 	if h == nil || h.count == 0 {
 		if t.hashMightPanic() {
-			t.key.alg.hash(key, 0) // see issue 23734
+			t.hasher(key, 0) // see issue 23734
 		}
 		return
 	}
@@ -706,10 +702,9 @@
 		throw("concurrent map writes")
 	}
 
-	alg := t.key.alg
-	hash := alg.hash(key, uintptr(h.hash0))
+	hash := t.hasher(key, uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash, since alg.hash may panic,
+	// Set hashWriting after calling t.hasher, since t.hasher may panic,
 	// in which case we have not actually done a write (delete).
 	h.flags ^= hashWriting
 
@@ -734,7 +729,7 @@
 			if t.indirectkey() {
 				k2 = *((*unsafe.Pointer)(k2))
 			}
-			if !alg.equal(key, k2) {
+			if !t.key.equal(key, k2) {
 				continue
 			}
 			// Only clear key if there are pointers in it.
@@ -862,7 +857,6 @@
 	b := it.bptr
 	i := it.i
 	checkBucket := it.checkBucket
-	alg := t.key.alg
 
 next:
 	if b == nil {
@@ -916,10 +910,10 @@
 			// through the oldbucket, skipping any keys that will go
 			// to the other new bucket (each oldbucket expands to two
 			// buckets during a grow).
-			if t.reflexivekey() || alg.equal(k, k) {
+			if t.reflexivekey() || t.key.equal(k, k) {
 				// If the item in the oldbucket is not destined for
 				// the current new bucket in the iteration, skip it.
-				hash := alg.hash(k, uintptr(h.hash0))
+				hash := t.hasher(k, uintptr(h.hash0))
 				if hash&bucketMask(it.B) != checkBucket {
 					continue
 				}
@@ -937,7 +931,7 @@
 			}
 		}
 		if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) ||
-			!(t.reflexivekey() || alg.equal(k, k)) {
+			!(t.reflexivekey() || t.key.equal(k, k)) {
 			// This is the golden data, we can return it.
 			// OR
 			// key!=key, so the entry can't be deleted or updated, so we can just return it.
@@ -1174,8 +1168,8 @@
 				if !h.sameSizeGrow() {
 					// Compute hash to make our evacuation decision (whether we need
 					// to send this key/elem to bucket x or bucket y).
-					hash := t.key.alg.hash(k2, uintptr(h.hash0))
-					if h.flags&iterator != 0 && !t.reflexivekey() && !t.key.alg.equal(k2, k2) {
+					hash := t.hasher(k2, uintptr(h.hash0))
+					if h.flags&iterator != 0 && !t.reflexivekey() && !t.key.equal(k2, k2) {
 						// If key != key (NaNs), then the hash could be (and probably
 						// will be) entirely different from the old hash. Moreover,
 						// it isn't reproducible. Reproducibility is required in the
@@ -1269,16 +1263,12 @@
 	}
 }
 
-func ismapkey(t *_type) bool {
-	return t.alg.hash != nil
-}
-
 // Reflect stubs. Called from ../reflect/asm_*.s
 
 //go:linkname reflect_makemap reflect.makemap
 func reflect_makemap(t *maptype, cap int) *hmap {
 	// Check invariants and reflects math.
-	if !ismapkey(t.key) {
+	if t.key.equal == nil {
 		throw("runtime.reflect_makemap: unsupported map key type")
 	}
 	if t.key.size > maxKeySize && (!t.indirectkey() || t.keysize != uint8(sys.PtrSize)) ||
@@ -1381,10 +1371,5 @@
 	return h.count
 }
 
-//go:linkname reflect_ismapkey reflect.ismapkey
-func reflect_ismapkey(t *_type) bool {
-	return ismapkey(t)
-}
-
-const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go
+const maxZero = 1024 // must match value in cmd/compile/internal/gc/walk.go:zeroValSize
 var zeroVal [maxZero]byte
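
[Illustrative note, not part of the patch] The map.go hunks above replace every t.key.alg.hash / alg.equal call with t.hasher / t.key.equal, following the removal of the per-type alg table. A toy model of that shape change; the struct and field names below are stand-ins for illustration, not the runtime's real maptype definition:

package main

import "fmt"

// keyType models a key's type descriptor carrying its own equality function.
type keyType struct {
	equal func(a, b string) bool
}

// mapType models a map type descriptor carrying the hash function directly,
// instead of reaching through a nested alg object.
type mapType struct {
	key    keyType
	hasher func(k string, seed uintptr) uintptr
}

func main() {
	mt := mapType{
		key: keyType{equal: func(a, b string) bool { return a == b }},
		hasher: func(k string, seed uintptr) uintptr {
			h := seed
			for i := 0; i < len(k); i++ {
				h = h*31 + uintptr(k[i])
			}
			return h
		},
	}
	// Prints the toy hash of "foo" and "true".
	fmt.Println(mt.hasher("foo", 0), mt.key.equal("foo", "foo"))
}
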
diff --git a/src/runtime/map_benchmark_test.go b/src/runtime/map_benchmark_test.go
index d37dadc..893cb6c 100644
--- a/src/runtime/map_benchmark_test.go
+++ b/src/runtime/map_benchmark_test.go
@@ -251,7 +251,7 @@
 }
 
 func BenchmarkMapCycle(b *testing.B) {
-	// Arrange map entries to be a permuation, so that
+	// Arrange map entries to be a permutation, so that
 	// we hit all entries, and one lookup is data dependent
 	// on the previous lookup.
 	const N = 3127
@@ -483,3 +483,52 @@
 		})
 	}
 }
+
+var BoolSink bool
+
+func BenchmarkMapInterfaceString(b *testing.B) {
+	m := map[interface{}]bool{}
+
+	for i := 0; i < 100; i++ {
+		m[fmt.Sprintf("%d", i)] = true
+	}
+
+	key := (interface{})("A")
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		BoolSink = m[key]
+	}
+}
+func BenchmarkMapInterfacePtr(b *testing.B) {
+	m := map[interface{}]bool{}
+
+	for i := 0; i < 100; i++ {
+		i := i
+		m[&i] = true
+	}
+
+	key := new(int)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		BoolSink = m[key]
+	}
+}
+
+var (
+	hintLessThan8    = 7
+	hintGreaterThan8 = 32
+)
+
+func BenchmarkNewEmptyMapHintLessThan8(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		_ = make(map[int]int, hintLessThan8)
+	}
+}
+
+func BenchmarkNewEmptyMapHintGreaterThan8(b *testing.B) {
+	b.ReportAllocs()
+	for i := 0; i < b.N; i++ {
+		_ = make(map[int]int, hintGreaterThan8)
+	}
+}
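
[Illustrative note, not part of the patch] The two hint benchmarks above bracket the bucketCnt threshold (8): with a hint below it, makemap can defer allocating the bucket array, while a larger hint forces it up front. They can also be driven programmatically; a small sketch using the standard testing.Benchmark helper:

package main

import (
	"fmt"
	"testing"
)

func main() {
	// Mirrors BenchmarkNewEmptyMapHintLessThan8 above, but run directly.
	r := testing.Benchmark(func(b *testing.B) {
		b.ReportAllocs()
		for i := 0; i < b.N; i++ {
			_ = make(map[int]int, 7) // hint below bucketCnt (8)
		}
	})
	fmt.Println(r.String(), r.MemString())
}
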
diff --git a/src/runtime/map_fast32.go b/src/runtime/map_fast32.go
index 0ab75ca..534454f 100644
--- a/src/runtime/map_fast32.go
+++ b/src/runtime/map_fast32.go
@@ -25,7 +25,7 @@
 		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
-		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+		hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 		m := bucketMask(h.B)
 		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 		if c := h.oldbuckets; c != nil {
@@ -65,7 +65,7 @@
 		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
-		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+		hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 		m := bucketMask(h.B)
 		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 		if c := h.oldbuckets; c != nil {
@@ -100,9 +100,9 @@
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map writes")
 	}
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapassign.
+	// Set hashWriting after calling t.hasher for consistency with mapassign.
 	h.flags ^= hashWriting
 
 	if h.buckets == nil {
@@ -190,9 +190,9 @@
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map writes")
 	}
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapassign.
+	// Set hashWriting after calling t.hasher for consistency with mapassign.
 	h.flags ^= hashWriting
 
 	if h.buckets == nil {
@@ -281,9 +281,9 @@
 		throw("concurrent map writes")
 	}
 
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapdelete
+	// Set hashWriting after calling t.hasher for consistency with mapdelete
 	h.flags ^= hashWriting
 
 	bucket := hash & bucketMask(h.B)
@@ -400,7 +400,7 @@
 				if !h.sameSizeGrow() {
 					// Compute hash to make our evacuation decision (whether we need
 					// to send this key/elem to bucket x or bucket y).
-					hash := t.key.alg.hash(k, uintptr(h.hash0))
+					hash := t.hasher(k, uintptr(h.hash0))
 					if hash&newbit != 0 {
 						useY = 1
 					}
diff --git a/src/runtime/map_fast64.go b/src/runtime/map_fast64.go
index 4d420e7..1669c7c 100644
--- a/src/runtime/map_fast64.go
+++ b/src/runtime/map_fast64.go
@@ -25,7 +25,7 @@
 		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
-		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+		hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 		m := bucketMask(h.B)
 		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 		if c := h.oldbuckets; c != nil {
@@ -65,7 +65,7 @@
 		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
-		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+		hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 		m := bucketMask(h.B)
 		b = (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 		if c := h.oldbuckets; c != nil {
@@ -100,9 +100,9 @@
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map writes")
 	}
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapassign.
+	// Set hashWriting after calling t.hasher for consistency with mapassign.
 	h.flags ^= hashWriting
 
 	if h.buckets == nil {
@@ -190,9 +190,9 @@
 	if h.flags&hashWriting != 0 {
 		throw("concurrent map writes")
 	}
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapassign.
+	// Set hashWriting after calling t.hasher for consistency with mapassign.
 	h.flags ^= hashWriting
 
 	if h.buckets == nil {
@@ -281,9 +281,9 @@
 		throw("concurrent map writes")
 	}
 
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapdelete
+	// Set hashWriting after calling t.hasher for consistency with mapdelete
 	h.flags ^= hashWriting
 
 	bucket := hash & bucketMask(h.B)
@@ -400,7 +400,7 @@
 				if !h.sameSizeGrow() {
 					// Compute hash to make our evacuation decision (whether we need
 					// to send this key/elem to bucket x or bucket y).
-					hash := t.key.alg.hash(k, uintptr(h.hash0))
+					hash := t.hasher(k, uintptr(h.hash0))
 					if hash&newbit != 0 {
 						useY = 1
 					}
diff --git a/src/runtime/map_faststr.go b/src/runtime/map_faststr.go
index 069994f..069cda6 100644
--- a/src/runtime/map_faststr.go
+++ b/src/runtime/map_faststr.go
@@ -76,7 +76,7 @@
 		return unsafe.Pointer(&zeroVal[0])
 	}
 dohash:
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
 	m := bucketMask(h.B)
 	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 	if c := h.oldbuckets; c != nil {
@@ -171,7 +171,7 @@
 		return unsafe.Pointer(&zeroVal[0]), false
 	}
 dohash:
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
 	m := bucketMask(h.B)
 	b := (*bmap)(add(h.buckets, (hash&m)*uintptr(t.bucketsize)))
 	if c := h.oldbuckets; c != nil {
@@ -211,9 +211,9 @@
 		throw("concurrent map writes")
 	}
 	key := stringStructOf(&s)
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&s)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&s)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapassign.
+	// Set hashWriting after calling t.hasher for consistency with mapassign.
 	h.flags ^= hashWriting
 
 	if h.buckets == nil {
@@ -307,9 +307,9 @@
 	}
 
 	key := stringStructOf(&ky)
-	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+	hash := t.hasher(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
 
-	// Set hashWriting after calling alg.hash for consistency with mapdelete
+	// Set hashWriting after calling t.hasher for consistency with mapdelete
 	h.flags ^= hashWriting
 
 	bucket := hash & bucketMask(h.B)
@@ -429,7 +429,7 @@
 				if !h.sameSizeGrow() {
 					// Compute hash to make our evacuation decision (whether we need
 					// to send this key/elem to bucket x or bucket y).
-					hash := t.key.alg.hash(k, uintptr(h.hash0))
+					hash := t.hasher(k, uintptr(h.hash0))
 					if hash&newbit != 0 {
 						useY = 1
 					}
diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go
index ee9468d..1b7ccad 100644
--- a/src/runtime/map_test.go
+++ b/src/runtime/map_test.go
@@ -1156,3 +1156,64 @@
 	}
 	runtime.MapTombstoneCheck(m)
 }
+
+type canString int
+
+func (c canString) String() string {
+	return fmt.Sprintf("%d", int(c))
+}
+
+func TestMapInterfaceKey(t *testing.T) {
+	// Test all the special cases in runtime.typehash.
+	type GrabBag struct {
+		f32  float32
+		f64  float64
+		c64  complex64
+		c128 complex128
+		s    string
+		i0   interface{}
+		i1   interface {
+			String() string
+		}
+		a [4]string
+	}
+
+	m := map[interface{}]bool{}
+	// Put a bunch of data in m, so that a bad hash is likely to
+	// lead to a bad bucket, which will lead to a missed lookup.
+	for i := 0; i < 1000; i++ {
+		m[i] = true
+	}
+	m[GrabBag{f32: 1.0}] = true
+	if !m[GrabBag{f32: 1.0}] {
+		panic("f32 not found")
+	}
+	m[GrabBag{f64: 1.0}] = true
+	if !m[GrabBag{f64: 1.0}] {
+		panic("f64 not found")
+	}
+	m[GrabBag{c64: 1.0i}] = true
+	if !m[GrabBag{c64: 1.0i}] {
+		panic("c64 not found")
+	}
+	m[GrabBag{c128: 1.0i}] = true
+	if !m[GrabBag{c128: 1.0i}] {
+		panic("c128 not found")
+	}
+	m[GrabBag{s: "foo"}] = true
+	if !m[GrabBag{s: "foo"}] {
+		panic("string not found")
+	}
+	m[GrabBag{i0: "foo"}] = true
+	if !m[GrabBag{i0: "foo"}] {
+		panic("interface{} not found")
+	}
+	m[GrabBag{i1: canString(5)}] = true
+	if !m[GrabBag{i1: canString(5)}] {
+		panic("interface{String() string} not found")
+	}
+	m[GrabBag{a: [4]string{"foo", "bar", "baz", "bop"}}] = true
+	if !m[GrabBag{a: [4]string{"foo", "bar", "baz", "bop"}}] {
+		panic("array not found")
+	}
+}
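
[Illustrative note, not part of the patch] TestMapInterfaceKey above walks through every hashable kind that typehash special-cases. The complementary failure mode, an interface key whose dynamic type has no equality function, still panics at assignment or lookup time, as in this small sketch:

package main

import "fmt"

func main() {
	m := map[interface{}]bool{}
	defer func() {
		// Expect a runtime error along the lines of
		// "hash of unhashable type []int".
		fmt.Println("recovered:", recover())
	}()
	m[[]int{1, 2, 3}] = true // slices have no equality, so this panics
	fmt.Println("unreachable")
}
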
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index df3ab6f..f7875d3 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -157,8 +157,8 @@
 	if dst == src {
 		return
 	}
-	if typ.ptrdata != 0 {
-		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.size)
+	if writeBarrier.needed && typ.ptrdata != 0 {
+		bulkBarrierPreWrite(uintptr(dst), uintptr(src), typ.ptrdata)
 	}
 	// There's a race here: if some other goroutine can write to
 	// src, it may change some pointer in src after we've
@@ -193,17 +193,18 @@
 
 // typedmemmovepartial is like typedmemmove but assumes that
 // dst and src point off bytes into the value and only copies size bytes.
+// off must be a multiple of sys.PtrSize.
 //go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
 func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
-	if writeBarrier.needed && typ.ptrdata != 0 && size >= sys.PtrSize {
-		// Pointer-align start address for bulk barrier.
-		adst, asrc, asize := dst, src, size
-		if frag := -off & (sys.PtrSize - 1); frag != 0 {
-			adst = add(dst, frag)
-			asrc = add(src, frag)
-			asize -= frag
+	if writeBarrier.needed && typ.ptrdata > off && size >= sys.PtrSize {
+		if off&(sys.PtrSize-1) != 0 {
+			panic("reflect: internal error: misaligned offset")
 		}
-		bulkBarrierPreWrite(uintptr(adst), uintptr(asrc), asize&^(sys.PtrSize-1))
+		pwsize := alignDown(size, sys.PtrSize)
+		if poff := typ.ptrdata - off; pwsize > poff {
+			pwsize = poff
+		}
+		bulkBarrierPreWrite(uintptr(dst), uintptr(src), pwsize)
 	}
 
 	memmove(dst, src, size)
@@ -230,16 +231,14 @@
 }
 
 //go:nosplit
-func typedslicecopy(typ *_type, dst, src slice) int {
-	n := dst.len
-	if n > src.len {
-		n = src.len
+func typedslicecopy(typ *_type, dstPtr unsafe.Pointer, dstLen int, srcPtr unsafe.Pointer, srcLen int) int {
+	n := dstLen
+	if n > srcLen {
+		n = srcLen
 	}
 	if n == 0 {
 		return 0
 	}
-	dstp := dst.array
-	srcp := src.array
 
 	// The compiler emits calls to typedslicecopy before
 	// instrumentation runs, so unlike the other copying and
@@ -248,19 +247,19 @@
 	if raceenabled {
 		callerpc := getcallerpc()
 		pc := funcPC(slicecopy)
-		racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc)
-		racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc)
+		racewriterangepc(dstPtr, uintptr(n)*typ.size, callerpc, pc)
+		racereadrangepc(srcPtr, uintptr(n)*typ.size, callerpc, pc)
 	}
 	if msanenabled {
-		msanwrite(dstp, uintptr(n)*typ.size)
-		msanread(srcp, uintptr(n)*typ.size)
+		msanwrite(dstPtr, uintptr(n)*typ.size)
+		msanread(srcPtr, uintptr(n)*typ.size)
 	}
 
 	if writeBarrier.cgo {
-		cgoCheckSliceCopy(typ, dst, src, n)
+		cgoCheckSliceCopy(typ, dstPtr, srcPtr, n)
 	}
 
-	if dstp == srcp {
+	if dstPtr == srcPtr {
 		return n
 	}
 
@@ -270,11 +269,12 @@
 	// before calling typedslicecopy.
 	size := uintptr(n) * typ.size
 	if writeBarrier.needed {
-		bulkBarrierPreWrite(uintptr(dstp), uintptr(srcp), size)
+		pwsize := size - typ.size + typ.ptrdata
+		bulkBarrierPreWrite(uintptr(dstPtr), uintptr(srcPtr), pwsize)
 	}
 	// See typedmemmove for a discussion of the race between the
 	// barrier and memmove.
-	memmove(dstp, srcp, size)
+	memmove(dstPtr, srcPtr, size)
 	return n
 }
 
@@ -304,7 +304,7 @@
 		memmove(dst.array, src.array, size)
 		return n
 	}
-	return typedslicecopy(elemType, dst, src)
+	return typedslicecopy(elemType, dst.array, dst.len, src.array, src.len)
 }
 
 // typedmemclr clears the typed memory at ptr with type typ. The
@@ -317,8 +317,8 @@
 //
 //go:nosplit
 func typedmemclr(typ *_type, ptr unsafe.Pointer) {
-	if typ.ptrdata != 0 {
-		bulkBarrierPreWrite(uintptr(ptr), 0, typ.size)
+	if writeBarrier.needed && typ.ptrdata != 0 {
+		bulkBarrierPreWrite(uintptr(ptr), 0, typ.ptrdata)
 	}
 	memclrNoHeapPointers(ptr, typ.size)
 }
@@ -330,7 +330,7 @@
 
 //go:linkname reflect_typedmemclrpartial reflect.typedmemclrpartial
 func reflect_typedmemclrpartial(typ *_type, ptr unsafe.Pointer, off, size uintptr) {
-	if typ.ptrdata != 0 {
+	if writeBarrier.needed && typ.ptrdata != 0 {
 		bulkBarrierPreWrite(uintptr(ptr), 0, size)
 	}
 	memclrNoHeapPointers(ptr, size)
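
[Illustrative note, not part of the patch] Two of the mbarrier.go hunks above shrink the range passed to bulkBarrierPreWrite from typ.size to typ.ptrdata (or, for slices, size - typ.size + typ.ptrdata), since bytes past the last pointer never need a barrier. A small worked example with made-up numbers, just to show the arithmetic:

package main

import "fmt"

func main() {
	// Hypothetical element type: 24 bytes, with all pointers in the
	// first 8 bytes (ptrdata = 8). Copying n = 3 elements:
	const size, ptrdata, n = 24, 8, 3
	total := n * size                // bytes moved by memmove
	pwsize := total - size + ptrdata // bytes that need the write barrier
	fmt.Println(total, pwsize)       // 72 56
}
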
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 30ec5f1..35332c9 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -243,6 +243,10 @@
 }
 
 // isFree reports whether the index'th object in s is unallocated.
+//
+// The caller must ensure s.state is mSpanInUse, and there must have
+// been no preemption points since ensuring this (which could allow a
+// GC transition, which would allow the state to change).
 func (s *mspan) isFree(index uintptr) bool {
 	if index < s.freeindex {
 		return false
@@ -349,6 +353,33 @@
 	return
 }
 
+// badPointer throws bad pointer in heap panic.
+func badPointer(s *mspan, p, refBase, refOff uintptr) {
+	// Typically this indicates an incorrect use
+	// of unsafe or cgo to store a bad pointer in
+	// the Go heap. It may also indicate a runtime
+	// bug.
+	//
+	// TODO(austin): We could be more aggressive
+	// and detect pointers to unallocated objects
+	// in allocated spans.
+	printlock()
+	print("runtime: pointer ", hex(p))
+	state := s.state.get()
+	if state != mSpanInUse {
+		print(" to unallocated span")
+	} else {
+		print(" to unused region of span")
+	}
+	print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", state, "\n")
+	if refBase != 0 {
+		print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
+		gcDumpObject("object", refBase, refOff)
+	}
+	getg().m.traceback = 2
+	throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
+}
+
 // findObject returns the base address for the heap object containing
 // the address p, the object's span, and the index of the object in s.
 // If p does not point into a heap object, it returns base == 0.
@@ -359,42 +390,30 @@
 // refBase and refOff optionally give the base address of the object
 // in which the pointer p was found and the byte offset at which it
 // was found. These are used for error reporting.
+//
+// It is nosplit so it is safe for p to be a pointer to the current goroutine's stack.
+// Since p is a uintptr, it would not be adjusted if the stack were to move.
+//go:nosplit
 func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex uintptr) {
 	s = spanOf(p)
+	// If s is nil, the virtual address has never been part of the heap.
+	// This pointer may be to some mmap'd region, so we allow it.
+	if s == nil {
+		return
+	}
 	// If p is a bad pointer, it may not be in s's bounds.
-	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
-		if s == nil || s.state == mSpanManual {
-			// If s is nil, the virtual address has never been part of the heap.
-			// This pointer may be to some mmap'd region, so we allow it.
-			// Pointers into stacks are also ok, the runtime manages these explicitly.
+	//
+	// Check s.state to synchronize with span initialization
+	// before checking other fields. See also spanOfHeap.
+	if state := s.state.get(); state != mSpanInUse || p < s.base() || p >= s.limit {
+		// Pointers into stacks are also ok, the runtime manages these explicitly.
+		if state == mSpanManual {
 			return
 		}
-
 		// The following ensures that we are rigorous about what data
 		// structures hold valid pointers.
 		if debug.invalidptr != 0 {
-			// Typically this indicates an incorrect use
-			// of unsafe or cgo to store a bad pointer in
-			// the Go heap. It may also indicate a runtime
-			// bug.
-			//
-			// TODO(austin): We could be more aggressive
-			// and detect pointers to unallocated objects
-			// in allocated spans.
-			printlock()
-			print("runtime: pointer ", hex(p))
-			if s.state != mSpanInUse {
-				print(" to unallocated span")
-			} else {
-				print(" to unused region of span")
-			}
-			print(" span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
-			if refBase != 0 {
-				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
-				gcDumpObject("object", refBase, refOff)
-			}
-			getg().m.traceback = 2
-			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
+			badPointer(s, p, refBase, refOff)
 		}
 		return
 	}
@@ -609,7 +628,7 @@
 			}
 		}
 		return
-	} else if s.state != mSpanInUse || dst < s.base() || s.limit <= dst {
+	} else if s.state.get() != mSpanInUse || dst < s.base() || s.limit <= dst {
 		// dst was heap memory at some point, but isn't now.
 		// It can't be a global. It must be either our stack,
 		// or in the case of direct channel sends, it could be
@@ -781,29 +800,19 @@
 // words to pointer/scan.
 // Otherwise, it initializes all words to scalar/dead.
 func (h heapBits) initSpan(s *mspan) {
-	size, n, total := s.layout()
-
-	// Init the markbit structures
-	s.freeindex = 0
-	s.allocCache = ^uint64(0) // all 1s indicating all free.
-	s.nelems = n
-	s.allocBits = nil
-	s.gcmarkBits = nil
-	s.gcmarkBits = newMarkBits(s.nelems)
-	s.allocBits = newAllocBits(s.nelems)
-
 	// Clear bits corresponding to objects.
-	nw := total / sys.PtrSize
+	nw := (s.npages << _PageShift) / sys.PtrSize
 	if nw%wordsPerBitmapByte != 0 {
 		throw("initSpan: unaligned length")
 	}
 	if h.shift != 0 {
 		throw("initSpan: unaligned base")
 	}
+	isPtrs := sys.PtrSize == 8 && s.elemsize == sys.PtrSize
 	for nw > 0 {
 		hNext, anw := h.forwardOrBoundary(nw)
 		nbyte := anw / wordsPerBitmapByte
-		if sys.PtrSize == 8 && size == sys.PtrSize {
+		if isPtrs {
 			bitp := h.bitp
 			for i := uintptr(0); i < nbyte; i++ {
 				*bitp = bitPointerAll | bitScanAll
@@ -856,58 +865,22 @@
 	}
 }
 
-// oneBitCount is indexed by byte and produces the
-// number of 1 bits in that byte. For example 128 has 1 bit set
-// and oneBitCount[128] will holds 1.
-var oneBitCount = [256]uint8{
-	0, 1, 1, 2, 1, 2, 2, 3,
-	1, 2, 2, 3, 2, 3, 3, 4,
-	1, 2, 2, 3, 2, 3, 3, 4,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	1, 2, 2, 3, 2, 3, 3, 4,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	1, 2, 2, 3, 2, 3, 3, 4,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	4, 5, 5, 6, 5, 6, 6, 7,
-	1, 2, 2, 3, 2, 3, 3, 4,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	4, 5, 5, 6, 5, 6, 6, 7,
-	2, 3, 3, 4, 3, 4, 4, 5,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	4, 5, 5, 6, 5, 6, 6, 7,
-	3, 4, 4, 5, 4, 5, 5, 6,
-	4, 5, 5, 6, 5, 6, 6, 7,
-	4, 5, 5, 6, 5, 6, 6, 7,
-	5, 6, 6, 7, 6, 7, 7, 8}
-
 // countAlloc returns the number of objects allocated in span s by
 // scanning the allocation bitmap.
-// TODO:(rlh) Use popcount intrinsic.
 func (s *mspan) countAlloc() int {
 	count := 0
-	maxIndex := s.nelems / 8
-	for i := uintptr(0); i < maxIndex; i++ {
-		mrkBits := *s.gcmarkBits.bytep(i)
-		count += int(oneBitCount[mrkBits])
-	}
-	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
-		mrkBits := *s.gcmarkBits.bytep(maxIndex)
-		mask := uint8((1 << bitsInLastByte) - 1)
-		bits := mrkBits & mask
-		count += int(oneBitCount[bits])
+	bytes := divRoundUp(s.nelems, 8)
+	// Iterate over each 8-byte chunk and count allocations
+	// with an intrinsic. Note that newMarkBits guarantees that
+	// gcmarkBits will be 8-byte aligned, so we don't have to
+	// worry about edge cases, irrelevant bits will simply be zero.
+	for i := uintptr(0); i < bytes; i += 8 {
+		// Extract 64 bits from the byte pointer and get a OnesCount.
+		// Note that the unsafe cast here doesn't preserve endianness,
+		// but that's OK. We only care about how many bits are 1, not
+		// about the order we discover them in.
+		mrkBits := *(*uint64)(unsafe.Pointer(s.gcmarkBits.bytep(i)))
+		count += sys.OnesCount64(mrkBits)
 	}
 	return count
 }
@@ -1912,7 +1885,11 @@
 // The bitmask starts at s.startAddr.
 // The result must be deallocated with dematerializeGCProg.
 func materializeGCProg(ptrdata uintptr, prog *byte) *mspan {
-	s := mheap_.allocManual((ptrdata/(8*sys.PtrSize)+pageSize-1)/pageSize, &memstats.gc_sys)
+	// Each word of ptrdata needs one bit in the bitmap.
+	bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize)
+	// Compute the number of pages needed for bitmapBytes.
+	pages := divRoundUp(bitmapBytes, pageSize)
+	s := mheap_.allocManual(pages, &memstats.gc_sys)
 	runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
 	return s
 }
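
[Illustrative note, not part of the patch] countAlloc above now popcounts the mark bitmap 64 bits at a time (sys.OnesCount64 is the runtime-internal twin of math/bits.OnesCount64), and materializeGCProg sizes its bitmap with divRoundUp. A short sketch of both ideas using the standard library; divRoundUp here just mirrors the ceil-division helper referenced in the hunks:

package main

import (
	"fmt"
	"math/bits"
)

// divRoundUp returns ceil(n / a).
func divRoundUp(n, a uintptr) uintptr { return (n + a - 1) / a }

func main() {
	// Popcount a toy mark bitmap one 64-bit word at a time.
	marks := []uint64{0xF0F0F0F0F0F0F0F0, 0x1}
	count := 0
	for _, w := range marks {
		count += bits.OnesCount64(w)
	}
	fmt.Println(count)              // 33 objects marked
	fmt.Println(divRoundUp(100, 8)) // 13: bytes needed for a 100-bit bitmap
}
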
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index d4fa9a0..5bceb51 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -131,7 +131,11 @@
 		if s.sweepgen != mheap_.sweepgen+3 {
 			throw("bad sweepgen in refill")
 		}
-		atomic.Store(&s.sweepgen, mheap_.sweepgen)
+		if go115NewMCentralImpl {
+			mheap_.central[spc].mcentral.uncacheSpan(s)
+		} else {
+			atomic.Store(&s.sweepgen, mheap_.sweepgen)
+		}
 	}
 
 	// Get a new cached span from the central lists.
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index cd59010..ed49d86 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -20,8 +20,31 @@
 type mcentral struct {
 	lock      mutex
 	spanclass spanClass
-	nonempty  mSpanList // list of spans with a free object, ie a nonempty free list
-	empty     mSpanList // list of spans with no free objects (or cached in an mcache)
+
+	// For !go115NewMCentralImpl.
+	nonempty mSpanList // list of spans with a free object, ie a nonempty free list
+	empty    mSpanList // list of spans with no free objects (or cached in an mcache)
+
+	// partial and full contain two mspan sets: one of swept in-use
+	// spans, and one of unswept in-use spans. These two trade
+	// roles on each GC cycle. The unswept set is drained either by
+	// allocation or by the background sweeper in every GC cycle,
+	// so only two roles are necessary.
+	//
+	// sweepgen is increased by 2 on each GC cycle, so the swept
+	// spans are in partial[sweepgen/2%2] and the unswept spans are in
+	// partial[1-sweepgen/2%2]. Sweeping pops spans from the
+	// unswept set and pushes spans that are still in-use on the
+	// swept set. Likewise, allocating an in-use span pushes it
+	// on the swept set.
+	//
+	// Some parts of the sweeper can sweep arbitrary spans, and hence
+	// can't remove them from the unswept set, but will add the span
+	// to the appropriate swept list. As a result, the parts of the
+	// sweeper and mcentral that do consume from the unswept list may
+	// encounter swept spans, and these should be ignored.
+	partial [2]spanSet // list of spans with a free object
+	full    [2]spanSet // list of spans with no free objects
 
 	// nmalloc is the cumulative count of objects allocated from
 	// this mcentral, assuming all spans in mcaches are
@@ -32,12 +55,168 @@
 // Initialize a single central free list.
 func (c *mcentral) init(spc spanClass) {
 	c.spanclass = spc
-	c.nonempty.init()
-	c.empty.init()
+	if go115NewMCentralImpl {
+		lockInit(&c.partial[0].spineLock, lockRankSpanSetSpine)
+		lockInit(&c.partial[1].spineLock, lockRankSpanSetSpine)
+		lockInit(&c.full[0].spineLock, lockRankSpanSetSpine)
+		lockInit(&c.full[1].spineLock, lockRankSpanSetSpine)
+	} else {
+		c.nonempty.init()
+		c.empty.init()
+		lockInit(&c.lock, lockRankMcentral)
+	}
+}
+
+// partialUnswept returns the spanSet which holds partially-filled
+// unswept spans for this sweepgen.
+func (c *mcentral) partialUnswept(sweepgen uint32) *spanSet {
+	return &c.partial[1-sweepgen/2%2]
+}
+
+// partialSwept returns the spanSet which holds partially-filled
+// swept spans for this sweepgen.
+func (c *mcentral) partialSwept(sweepgen uint32) *spanSet {
+	return &c.partial[sweepgen/2%2]
+}
+
+// fullUnswept returns the spanSet which holds unswept spans without any
+// free slots for this sweepgen.
+func (c *mcentral) fullUnswept(sweepgen uint32) *spanSet {
+	return &c.full[1-sweepgen/2%2]
+}
+
+// fullSwept returns the spanSet which holds swept spans without any
+// free slots for this sweepgen.
+func (c *mcentral) fullSwept(sweepgen uint32) *spanSet {
+	return &c.full[sweepgen/2%2]
 }
 
 // Allocate a span to use in an mcache.
 func (c *mcentral) cacheSpan() *mspan {
+	if !go115NewMCentralImpl {
+		return c.oldCacheSpan()
+	}
+	// Deduct credit for this span allocation and sweep if necessary.
+	spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
+	deductSweepCredit(spanBytes, 0)
+
+	sg := mheap_.sweepgen
+
+	traceDone := false
+	if trace.enabled {
+		traceGCSweepStart()
+	}
+
+	// If we sweep spanBudget spans without finding any free
+	// space, just allocate a fresh span. This limits the amount
+	// of time we can spend trying to find free space and
+	// amortizes the cost of small object sweeping over the
+	// benefit of having a full free span to allocate from. By
+	// setting this to 100, we limit the space overhead to 1%.
+	//
+	// TODO(austin,mknyszek): This still has bad worst-case
+	// throughput. For example, this could find just one free slot
+	// on the 100th swept span. That limits allocation latency, but
+	// still has very poor throughput. We could instead keep a
+	// running free-to-used budget and switch to fresh span
+	// allocation if the budget runs low.
+	spanBudget := 100
+
+	var s *mspan
+
+	// Try partial swept spans first.
+	if s = c.partialSwept(sg).pop(); s != nil {
+		goto havespan
+	}
+
+	// Now try partial unswept spans.
+	for ; spanBudget >= 0; spanBudget-- {
+		s = c.partialUnswept(sg).pop()
+		if s == nil {
+			break
+		}
+		if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			// We got ownership of the span, so let's sweep it and use it.
+			s.sweep(true)
+			goto havespan
+		}
+		// We failed to get ownership of the span, which means it's being or
+		// has been swept by an asynchronous sweeper that just couldn't remove it
+		// from the unswept list. That sweeper took ownership of the span and
+		// responsibility for either freeing it to the heap or putting it on the
+		// right swept list. Either way, we should just ignore it (and it's unsafe
+		// for us to do anything else).
+	}
+	// Now try full unswept spans, sweeping them and putting them into the
+	// right list if we fail to get a span.
+	for ; spanBudget >= 0; spanBudget-- {
+		s = c.fullUnswept(sg).pop()
+		if s == nil {
+			break
+		}
+		if atomic.Load(&s.sweepgen) == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			// We got ownership of the span, so let's sweep it.
+			s.sweep(true)
+			// Check if there's any free space.
+			freeIndex := s.nextFreeIndex()
+			if freeIndex != s.nelems {
+				s.freeindex = freeIndex
+				goto havespan
+			}
+			// Add it to the swept list, because sweeping didn't give us any free space.
+			c.fullSwept(sg).push(s)
+		}
+		// See comment for partial unswept spans.
+	}
+	if trace.enabled {
+		traceGCSweepDone()
+		traceDone = true
+	}
+
+	// We failed to get a span from the mcentral so get one from mheap.
+	s = c.grow()
+	if s == nil {
+		return nil
+	}
+
+	// At this point s is a span that should have free slots.
+havespan:
+	if trace.enabled && !traceDone {
+		traceGCSweepDone()
+	}
+	n := int(s.nelems) - int(s.allocCount)
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
+		throw("span has no free objects")
+	}
+	// Assume all objects from this span will be allocated in the
+	// mcache. If it gets uncached, we'll adjust this.
+	atomic.Xadd64(&c.nmalloc, int64(n))
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
+	}
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
+	return s
+}
+
+// Allocate a span to use in an mcache.
+//
+// For !go115NewMCentralImpl.
+func (c *mcentral) oldCacheSpan() *mspan {
 	// Deduct credit for this span allocation and sweep if necessary.
 	spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
 	deductSweepCredit(spanBytes, 0)
@@ -147,7 +326,77 @@
 }
 
 // Return span from an mcache.
+//
+// s must have a span class corresponding to this
+// mcentral and it must not be empty.
 func (c *mcentral) uncacheSpan(s *mspan) {
+	if !go115NewMCentralImpl {
+		c.oldUncacheSpan(s)
+		return
+	}
+	if s.allocCount == 0 {
+		throw("uncaching span but s.allocCount == 0")
+	}
+
+	sg := mheap_.sweepgen
+	stale := s.sweepgen == sg+1
+
+	// Fix up sweepgen.
+	if stale {
+		// Span was cached before sweep began. It's our
+		// responsibility to sweep it.
+		//
+		// Set sweepgen to indicate it's not cached but needs
+		// sweeping and can't be allocated from. sweep will
+		// set s.sweepgen to indicate s is swept.
+		atomic.Store(&s.sweepgen, sg-1)
+	} else {
+		// Indicate that s is no longer cached.
+		atomic.Store(&s.sweepgen, sg)
+	}
+	n := int(s.nelems) - int(s.allocCount)
+
+	// Fix up statistics.
+	if n > 0 {
+		// cacheSpan updated alloc assuming all objects on s
+		// were going to be allocated. Adjust for any that
+		// weren't. We must do this before potentially
+		// sweeping the span.
+		atomic.Xadd64(&c.nmalloc, -int64(n))
+
+		if !stale {
+			// (*mcentral).cacheSpan conservatively counted
+			// unallocated slots in heap_live. Undo this.
+			//
+			// If this span was cached before sweep, then
+			// heap_live was totally recomputed since
+			// caching this span, so we don't do this for
+			// stale spans.
+			atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+		}
+	}
+
+	// Put the span in the appropriate place.
+	if stale {
+		// It's stale, so just sweep it. Sweeping will put it on
+		// the right list.
+		s.sweep(false)
+	} else {
+		if n > 0 {
+			// Put it back on the partial swept list.
+			c.partialSwept(sg).push(s)
+		} else {
+			// There's no free space and it's not stale, so put it on the
+			// full swept list.
+			c.fullSwept(sg).push(s)
+		}
+	}
+}
+
+// Return span from an mcache.
+//
+// For !go115NewMCentralImpl.
+func (c *mcentral) oldUncacheSpan(s *mspan) {
 	if s.allocCount == 0 {
 		throw("uncaching span but s.allocCount == 0")
 	}
@@ -206,6 +455,8 @@
 // freeSpan reports whether s was returned to the heap.
 // If preserve=true, it does not move s (the caller
 // must take care of it).
+//
+// For !go115NewMCentralImpl.
 func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
 	if sg := mheap_.sweepgen; s.sweepgen == sg+1 || s.sweepgen == sg+3 {
 		throw("freeSpan given cached span")
@@ -243,7 +494,7 @@
 
 	c.nonempty.remove(s)
 	unlock(&c.lock)
-	mheap_.freeSpan(s, false)
+	mheap_.freeSpan(s)
 	return true
 }
 
@@ -252,7 +503,7 @@
 	npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
 	size := uintptr(class_to_size[c.spanclass.sizeclass()])
 
-	s := mheap_.alloc(npages, c.spanclass, false, true)
+	s := mheap_.alloc(npages, c.spanclass, true)
 	if s == nil {
 		return nil
 	}
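
[Illustrative note, not part of the patch] The new mcentral layout above keeps two partial and two full spanSets and picks between them by sweepgen parity, so the swept and unswept roles swap automatically each GC cycle (sweepgen advances by 2 per cycle). A tiny sketch of just that index arithmetic:

package main

import "fmt"

func sweptIdx(sweepgen uint32) uint32   { return sweepgen / 2 % 2 }
func unsweptIdx(sweepgen uint32) uint32 { return 1 - sweepgen/2%2 }

func main() {
	for _, sg := range []uint32{2, 4, 6, 8} {
		fmt.Printf("sweepgen=%d swept set=%d unswept set=%d\n", sg, sweptIdx(sg), unsweptIdx(sg))
	}
	// The two indices alternate every cycle, so last cycle's swept set
	// becomes this cycle's unswept set without moving any spans.
}
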
diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go
index eeebfa7..7e145b0 100644
--- a/src/runtime/mem_aix.go
+++ b/src/runtime/mem_aix.go
@@ -63,14 +63,15 @@
 	mSysStatInc(sysStat, n)
 
 	// AIX does not allow mapping a range that is already mapped.
-	// So always unmap first even if it is already unmapped.
-	munmap(v, n)
-	p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
-
+	// So, call mprotect to change permissions.
+	// Note that sysMap is always called with a non-nil pointer
+	// since it transitions a Reserved memory region to Prepared,
+	// so mprotect is always possible.
+	_, err := mprotect(v, n, _PROT_READ|_PROT_WRITE)
 	if err == _ENOMEM {
 		throw("runtime: out of memory")
 	}
-	if p != v || err != 0 {
+	if err != 0 {
 		throw("runtime: cannot map pages in arena address space")
 	}
 }
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index 08a2391..4d860e7 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build dragonfly freebsd nacl netbsd openbsd solaris
+// +build dragonfly freebsd netbsd openbsd solaris
 
 package runtime
 
@@ -44,8 +44,18 @@
 	mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE|_MAP_FIXED, -1, 0)
 }
 
+// Indicates not to reserve swap space for the mapping.
+const _sunosMAP_NORESERVE = 0x40
+
 func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
-	p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
+	flags := int32(_MAP_ANON | _MAP_PRIVATE)
+	if GOOS == "solaris" || GOOS == "illumos" {
+		// Be explicit that we don't want to reserve swap space
+		// for PROT_NONE anonymous mappings. This avoids an issue
+		// wherein large mappings can cause fork to fail.
+		flags |= _sunosMAP_NORESERVE
+	}
+	p, err := mmap(v, n, _PROT_NONE, flags, -1, 0)
 	if err != 0 {
 		return nil
 	}
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
index 86d9fca..3b5d565 100644
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -49,19 +49,7 @@
 }
 
 func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
-	flags := int32(_MAP_ANON | _MAP_PRIVATE)
-	if raceenabled {
-		// Currently the race detector expects memory to live within a certain
-		// range, and on Darwin 10.10 mmap is prone to ignoring hints, moreso
-		// than later versions and other BSDs (#26475). So, even though it's
-		// potentially dangerous to MAP_FIXED, we do it in the race detection
-		// case because it'll help maintain the race detector's invariants.
-		//
-		// TODO(mknyszek): Drop this once support for Darwin 10.10 is dropped,
-		// and reconsider this when #24133 is addressed.
-		flags |= _MAP_FIXED
-	}
-	p, err := mmap(v, n, _PROT_NONE, flags, -1, 0)
+	p, err := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if err != 0 {
 		return nil
 	}
diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go
index de90f53..092b3d4 100644
--- a/src/runtime/mem_js.go
+++ b/src/runtime/mem_js.go
@@ -7,7 +7,6 @@
 package runtime
 
 import (
-	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -52,18 +51,23 @@
 		return nil
 	}
 
-	if reserveEnd < lastmoduledatap.end {
-		reserveEnd = lastmoduledatap.end
+	// Round up the initial reserveEnd to 64 KiB so that
+	// reservations are always aligned to the page size.
+	initReserveEnd := alignUp(lastmoduledatap.end, physPageSize)
+	if reserveEnd < initReserveEnd {
+		reserveEnd = initReserveEnd
 	}
 	v = unsafe.Pointer(reserveEnd)
-	reserveEnd += n
+	reserveEnd += alignUp(n, physPageSize)
 
 	current := currentMemory()
-	needed := int32(reserveEnd/sys.DefaultPhysPageSize + 1)
+	// reserveEnd is always at a page boundary.
+	needed := int32(reserveEnd / physPageSize)
 	if current < needed {
 		if growMemory(needed-current) == -1 {
 			return nil
 		}
+		resetMemoryDataView()
 	}
 
 	return v
@@ -72,6 +76,10 @@
 func currentMemory() int32
 func growMemory(pages int32) int32
 
+// resetMemoryDataView signals the JS front-end that WebAssembly's memory.grow instruction has been used.
+// This allows the front-end to replace the old DataView object with a new one.
+func resetMemoryDataView()
+
 func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
 	mSysStatInc(sysStat, n)
 }
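
[Illustrative note, not part of the patch] The mem_js.go hunk above rounds reservations to physPageSize because WebAssembly memory grows in 64 KiB pages, which also makes the needed page count an exact division. A quick arithmetic sketch with an arbitrary size:

package main

import "fmt"

func main() {
	const pageSize = 64 << 10 // 64 KiB Wasm page
	end := uintptr(1234567)
	reserveEnd := (end + pageSize - 1) &^ (pageSize - 1) // alignUp(end, pageSize)
	fmt.Println(reserveEnd, reserveEnd/pageSize)         // 1245184 19
}
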
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index 524915f..59b0bca 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -70,11 +70,11 @@
 		var head, tail uintptr
 		if uintptr(v)&(physHugePageSize-1) != 0 {
 			// Compute huge page containing v.
-			head = uintptr(v) &^ (physHugePageSize - 1)
+			head = alignDown(uintptr(v), physHugePageSize)
 		}
 		if (uintptr(v)+n)&(physHugePageSize-1) != 0 {
 			// Compute huge page containing v+n-1.
-			tail = (uintptr(v) + n - 1) &^ (physHugePageSize - 1)
+			tail = alignDown(uintptr(v)+n-1, physHugePageSize)
 		}
 
 		// Note that madvise will return EINVAL if the flag is
@@ -131,9 +131,9 @@
 func sysHugePage(v unsafe.Pointer, n uintptr) {
 	if physHugePageSize != 0 {
 		// Round v up to a huge page boundary.
-		beg := (uintptr(v) + (physHugePageSize - 1)) &^ (physHugePageSize - 1)
+		beg := alignUp(uintptr(v), physHugePageSize)
 		// Round v+n down to a huge page boundary.
-		end := (uintptr(v) + n) &^ (physHugePageSize - 1)
+		end := alignDown(uintptr(v)+n, physHugePageSize)
 
 		if beg < end {
 			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go
index 688cdd3..4fea851 100644
--- a/src/runtime/mem_plan9.go
+++ b/src/runtime/mem_plan9.go
@@ -193,7 +193,7 @@
 		// so try to extend the address space.
 		p = sbrk(n)
 	}
-	if p == nil {
+	if p == nil && v == nil {
 		p = memAlloc(n)
 		memCheck()
 	}
diff --git a/src/runtime/memclr_amd64p32.s b/src/runtime/memclr_amd64p32.s
deleted file mode 100644
index 71040f3..0000000
--- a/src/runtime/memclr_amd64p32.s
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
-TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-8
-	MOVL	ptr+0(FP), DI
-	MOVL	n+4(FP), CX
-	MOVQ	CX, BX
-	ANDQ	$3, BX
-	SHRQ	$2, CX
-	MOVQ	$0, AX
-	CLD
-	REP
-	STOSL
-	MOVQ	BX, CX
-	REP
-	STOSB
-	// Note: we zero only 4 bytes at a time so that the tail is at most
-	// 3 bytes. That guarantees that we aren't zeroing pointers with STOSB.
-	// See issue 13160.
-	RET
diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s
index ea3c67a..7326b8b 100644
--- a/src/runtime/memclr_arm.s
+++ b/src/runtime/memclr_arm.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/memset-arm.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memset-arm.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memset-arm.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
diff --git a/src/runtime/memclr_mips64x.s b/src/runtime/memclr_mips64x.s
index 111983b..4c2292e 100644
--- a/src/runtime/memclr_mips64x.s
+++ b/src/runtime/memclr_mips64x.s
@@ -4,6 +4,7 @@
 
 // +build mips64 mips64le
 
+#include "go_asm.h"
 #include "textflag.h"
 
 // func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
@@ -12,6 +13,60 @@
 	MOVV	n+8(FP), R2
 	ADDV	R1, R2, R4
 
+	// if less than 16 bytes or no MSA, do words check
+	SGTU	$16, R2, R3
+	BNE	R3, no_msa
+	MOVBU	internal∕cpu·MIPS64X+const_offsetMIPS64XHasMSA(SB), R3
+	BEQ	R3, R0, no_msa
+
+	VMOVB	$0, W0
+
+	SGTU	$128, R2, R3
+	BEQ	R3, msa_large
+
+	AND	$15, R2, R5
+	XOR	R2, R5, R6
+	ADDVU	R1, R6
+
+msa_small:
+	VMOVB	W0, (R1)
+	ADDVU	$16, R1
+	SGTU	R6, R1, R3
+	BNE	R3, R0, msa_small
+	BEQ	R5, R0, done
+	VMOVB	W0, -16(R4)
+	JMP	done
+
+msa_large:
+	AND	$127, R2, R5
+	XOR	R2, R5, R6
+	ADDVU	R1, R6
+
+msa_large_loop:
+	VMOVB	W0, (R1)
+	VMOVB	W0, 16(R1)
+	VMOVB	W0, 32(R1)
+	VMOVB	W0, 48(R1)
+	VMOVB	W0, 64(R1)
+	VMOVB	W0, 80(R1)
+	VMOVB	W0, 96(R1)
+	VMOVB	W0, 112(R1)
+
+	ADDVU	$128, R1
+	SGTU	R6, R1, R3
+	BNE	R3, R0, msa_large_loop
+	BEQ	R5, R0, done
+	VMOVB	W0, -128(R4)
+	VMOVB	W0, -112(R4)
+	VMOVB	W0, -96(R4)
+	VMOVB	W0, -80(R4)
+	VMOVB	W0, -64(R4)
+	VMOVB	W0, -48(R4)
+	VMOVB	W0, -32(R4)
+	VMOVB	W0, -16(R4)
+	JMP	done
+
+no_msa:
 	// if less than 8 bytes, do one byte at a time
 	SGTU	$8, R2, R3
 	BNE	R3, out
diff --git a/src/runtime/memclr_riscv64.s b/src/runtime/memclr_riscv64.s
new file mode 100644
index 0000000..ba7704e
--- /dev/null
+++ b/src/runtime/memclr_riscv64.s
@@ -0,0 +1,44 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// void runtime·memclrNoHeapPointers(void*, uintptr)
+TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16
+	MOV	ptr+0(FP), T1
+	MOV	n+8(FP), T2
+	ADD	T1, T2, T4
+
+	// If less than eight bytes, do one byte at a time.
+	SLTU	$8, T2, T3
+	BNE	T3, ZERO, outcheck
+
+	// Do one byte at a time until eight-aligned.
+	JMP	aligncheck
+align:
+	MOVB	ZERO, (T1)
+	ADD	$1, T1
+aligncheck:
+	AND	$7, T1, T3
+	BNE	T3, ZERO, align
+
+	// Do eight bytes at a time as long as there is room.
+	ADD	$-7, T4, T5
+	JMP	wordscheck
+words:
+	MOV	ZERO, (T1)
+	ADD	$8, T1
+wordscheck:
+	SLTU	T5, T1, T3
+	BNE	T3, ZERO, words
+
+	JMP	outcheck
+out:
+	MOVB	ZERO, (T1)
+	ADD	$1, T1
+outcheck:
+	BNE	T1, T4, out
+
+done:
+	RET
diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s
index 7b54070..d99546c 100644
--- a/src/runtime/memmove_386.s
+++ b/src/runtime/memmove_386.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/memmove-386.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -28,6 +28,8 @@
 #include "go_asm.h"
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $0-12
 	MOVL	to+0(FP), DI
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index b4243a8..d91641a 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -1,5 +1,5 @@
 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -28,6 +28,8 @@
 #include "go_asm.h"
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $0-24
 
diff --git a/src/runtime/memmove_amd64p32.s b/src/runtime/memmove_amd64p32.s
deleted file mode 100644
index 1140773..0000000
--- a/src/runtime/memmove_amd64p32.s
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// This could use MOVSQ, but we use MOVSL so that if an object ends in
-// a 4 byte pointer, we copy it as a unit instead of byte by byte.
-
-// func memmove(to, from unsafe.Pointer, n uintptr)
-TEXT runtime·memmove(SB), NOSPLIT, $0-12
-	MOVL	to+0(FP), DI
-	MOVL	from+4(FP), SI
-	MOVL	n+8(FP), BX
-
-	CMPL	SI, DI
-	JLS back
-
-forward:
-	MOVL	BX, CX
-	SHRL	$2, CX
-	ANDL	$3, BX
-	REP; MOVSL
-	MOVL	BX, CX
-	REP; MOVSB
-	RET
-
-back:
-	MOVL	SI, CX
-	ADDL	BX, CX
-	CMPL	CX, DI
-	JLS forward
-
-	ADDL	BX, DI
-	ADDL	BX, SI
-	STD
-
-	MOVL	BX, CX
-	SHRL	$2, CX
-	ANDL	$3, BX
-	SUBL	$4, DI
-	SUBL	$4, SI
-	REP; MOVSL
-	ADDL	$3, DI
-	ADDL	$3, SI
-	MOVL	BX, CX
-	REP; MOVSB
-	CLD
-
-	// Note: we copy only 4 bytes at a time so that the tail is at most
-	// 3 bytes. That guarantees that we aren't copying pointers with MOVSB.
-	// See issue 13160.
-	RET
diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s
index 8352fb7..43d53fa 100644
--- a/src/runtime/memmove_arm.s
+++ b/src/runtime/memmove_arm.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/memmove-arm.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-arm.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-arm.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -58,6 +58,8 @@
 #define FW3	R4
 #define FR3	R8					/* shared with TE */
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $4-12
 _memmove:
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index ac29f94..dbb7e9a 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -4,6 +4,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD	to+0(FP), R3
@@ -22,7 +24,7 @@
 	CMP	R3, R4
 	BLT	backward
 
-	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+	// Copying forward proceeds by copying R7/32 quadwords then R6 <= 31 tail bytes.
 	// R3 and R4 are advanced as we copy.
 
 	// (There may be implementations of armv8 where copying by bytes until
@@ -30,11 +32,12 @@
 	// optimization, but the on the one tested so far (xgene) it did not
 	// make a significance difference.)
 
-	CBZ	R7, noforwardlarge	// Do we need to do any doubleword-by-doubleword copying?
+	CBZ	R7, noforwardlarge	// Do we need to do any quadword copying?
 
 	ADD	R3, R7, R9	// R9 points just past where we copy by word
 
 forwardlargeloop:
+	// Copy 32 bytes at a time.
 	LDP.P	32(R4), (R8, R10)
 	STP.P	(R8, R10), 32(R3)
 	LDP	-16(R4), (R11, R12)
@@ -43,10 +46,26 @@
 	CBNZ	R7, forwardlargeloop
 
 noforwardlarge:
-	CBNZ	R6, forwardtail		// Do we need to do any byte-by-byte copying?
+	CBNZ	R6, forwardtail		// Do we need to copy any tail bytes?
 	RET
 
 forwardtail:
+	// There are R6 <= 31 bytes remaining to copy.
+	// This is large enough to still contain pointers,
+	// which must be copied atomically.
+	// Copy the next 16 bytes, then 8 bytes, then any remaining bytes.
+	TBZ	$4, R6, 3(PC)	// write 16 bytes if R6&16 != 0
+	LDP.P	16(R4), (R8, R10)
+	STP.P	(R8, R10), 16(R3)
+
+	TBZ	$3, R6, 3(PC)	// write 8 bytes if R6&8 != 0
+	MOVD.P	8(R4), R8
+	MOVD.P	R8, 8(R3)
+
+	AND	$7, R6
+	CBNZ	R6, 2(PC)
+	RET
+
 	ADD	R3, R6, R9	// R9 points just past the destination memory
 
 forwardtailloop:
@@ -90,7 +109,7 @@
 	RET
 
 backward:
-	// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
+	// Copying backwards first copies R6 <= 31 tail bytes, then R7/32 quadwords.
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
@@ -99,13 +118,28 @@
 
 	CBZ	R6, nobackwardtail	// Do we need to do any byte-by-byte copying?
 
-	SUB	R6, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
+	AND	$7, R6, R12
+	CBZ	R12, backwardtaillarge
+
+	SUB	R12, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
 backwardtailloop:
+	// Copy sub-pointer-size tail.
 	MOVBU.W	-1(R4), R8
 	MOVBU.W	R8, -1(R3)
 	CMP	R9, R3
 	BNE	backwardtailloop
 
+backwardtaillarge:
+	// Do 8/16-byte write if possible.
+	// See comment at forwardtail.
+	TBZ	$3, R6, 3(PC)
+	MOVD.W	-8(R4), R8
+	MOVD.W	R8, -8(R3)
+
+	TBZ	$4, R6, 3(PC)
+	LDP.W	-16(R4), (R8, R10)
+	STP.W	(R8, R10), -16(R3)
+
 nobackwardtail:
 	CBNZ     R7, backwardlarge	// Do we need to do any doubleword-by-doubleword copying?
 	RET
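
The forwardtail/backwardtaillarge blocks above exist because the final <= 31 bytes can still hold 8-byte pointers, which the concurrent GC must never observe half-written. As a rough, illustrative Go sketch of the forward control flow only (the function name and the use of copy are mine; the real atomicity comes from the single LDP/STP and MOVD machine accesses, which copy here merely stands in for):

    // tailPlan mirrors the forwardtail bit tests: for a remainder n < 32,
    // move one 16-byte chunk if bit 4 of n is set, one 8-byte chunk if
    // bit 3 is set, and the final n&7 bytes one at a time, so no
    // pointer-sized region is ever split across accesses.
    // dst and src are assumed to be the remaining byte windows.
    func tailPlan(dst, src []byte, n int) {
    	i := 0
    	if n&16 != 0 { // the TBZ $4 test
    		copy(dst[i:i+16], src[i:i+16])
    		i += 16
    	}
    	if n&8 != 0 { // the TBZ $3 test
    		copy(dst[i:i+8], src[i:i+8])
    		i += 8
    	}
    	for ; i < n; i++ { // remaining n&7 bytes
    		dst[i] = src[i]
    	}
    }

For example, n = 25 moves 16 + 8 + 1 bytes, matching the assembly's three stages.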
diff --git a/src/runtime/memmove_mips64x.s b/src/runtime/memmove_mips64x.s
index a4cb7dc..8a1b88a 100644
--- a/src/runtime/memmove_mips64x.s
+++ b/src/runtime/memmove_mips64x.s
@@ -6,6 +6,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVV	to+0(FP), R1
diff --git a/src/runtime/memmove_mipsx.s b/src/runtime/memmove_mipsx.s
index 13544a3..6c86558 100644
--- a/src/runtime/memmove_mipsx.s
+++ b/src/runtime/memmove_mipsx.s
@@ -14,6 +14,8 @@
 #define MOVWLO  MOVWL
 #endif
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB),NOSPLIT,$-0-12
 	MOVW	n+8(FP), R3
diff --git a/src/runtime/memmove_plan9_386.s b/src/runtime/memmove_plan9_386.s
index 65dec93..cfce0e9 100644
--- a/src/runtime/memmove_plan9_386.s
+++ b/src/runtime/memmove_plan9_386.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/memmove-386.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -25,6 +25,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $0-12
 	MOVL	to+0(FP), DI
diff --git a/src/runtime/memmove_plan9_amd64.s b/src/runtime/memmove_plan9_amd64.s
index b729c7c..217aa60 100644
--- a/src/runtime/memmove_plan9_amd64.s
+++ b/src/runtime/memmove_plan9_amd64.s
@@ -1,5 +1,5 @@
 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/memmove-386.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/memmove-386.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -25,6 +25,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $0-24
 
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 60cbcc4..edc6452 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -6,79 +6,115 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
+
+// target address
+#define TGT R3
+// source address
+#define SRC R4
+// length to move
+#define LEN R5
+// number of doublewords
+#define DWORDS R6
+// number of bytes < 8
+#define BYTES R7
+// const 16 used as index
+#define IDX16 R8
+// temp used for copies, etc.
+#define TMP R9
+// number of 32 byte chunks
+#define QWORDS R10
+
 TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
-	MOVD	to+0(FP), R3
-	MOVD	from+8(FP), R4
-	MOVD	n+16(FP), R5
+	MOVD	to+0(FP), TGT
+	MOVD	from+8(FP), SRC
+	MOVD	n+16(FP), LEN
 
 	// Determine if there are doublewords to
 	// copy so a more efficient move can be done
 check:
-	ANDCC	$7, R5, R7	// R7: bytes to copy
-	SRD	$3, R5, R6	// R6: double words to copy
-	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
+	ANDCC	$7, LEN, BYTES	// R7: bytes to copy
+	SRD	$3, LEN, DWORDS	// R6: double words to copy
+	MOVFL	CR0, CR3	// save CR from ANDCC
+	CMP	DWORDS, $0, CR1	// CR1[EQ] set if no double words to copy
 
 	// Determine overlap by subtracting dest - src and comparing against the
-	// length.  The catches the cases where src and dest are in different types
+	// length.  This catches the cases where src and dest are in different types
 	// of storage such as stack and static to avoid doing backward move when not
 	// necessary.
 
-	SUB	R4, R3, R8	// dest - src
-	CMPU	R8, R5, CR2	// < len?
+	SUB	SRC, TGT, TMP	// dest - src
+	CMPU	TMP, LEN, CR2	// < len?
 	BC	12, 8, backward // BLT CR2 backward
 
 	// Copying forward if no overlap.
 
-	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
-	SRDCC	$2,R6,R8		// 32 byte chunks?
-	BNE	forward32setup		//
-	MOVD	R6,CTR			// R6 = number of double words
-
-	// Move double words
-
-forward8:
-	MOVD    0(R4), R8		// double word
-	ADD     $8,R4
-	MOVD    R8, 0(R3)		//
-	ADD     $8,R3
-	BC      16, 0, forward8
-	BR	noforwardlarge		// handle remainder
+	BC	12, 6, checkbytes	// BEQ CR1, checkbytes
+	SRDCC	$2, DWORDS, QWORDS	// 32 byte chunks?
+	BEQ	lt32gt8			// < 32 bytes
 
 	// Prepare for moves of 32 bytes at a time.
 
 forward32setup:
-	DCBTST	(R3)			// prepare data cache
-	DCBT	(R4)
-	MOVD	R8, CTR			// double work count
-	MOVD	$16, R8
+	DCBTST	(TGT)			// prepare data cache
+	DCBT	(SRC)
+	MOVD	QWORDS, CTR		// Number of 32 byte chunks
+	MOVD	$16, IDX16		// 16 for index
 
 forward32:
-	LXVD2X	(R4+R0), VS32		// load 16 bytes
-	LXVD2X	(R4+R8), VS33
-	ADD	$32, R4
-	STXVD2X	VS32, (R3+R0)		// store 16 bytes
-	STXVD2X	VS33, (R3+R8)
-	ADD	$32,R3			// bump up for next set
+	LXVD2X	(R0)(SRC), VS32		// load 16 bytes
+	LXVD2X	(IDX16)(SRC), VS33	// load 16 bytes
+	ADD	$32, SRC
+	STXVD2X	VS32, (R0)(TGT)		// store 16 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
+	ADD	$32,TGT			// bump up for next set
 	BC	16, 0, forward32	// continue
-	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
-	BEQ	noforwardlarge
-	MOVD	R6,CTR			// set up the CTR
-	BR	forward8
+	ANDCC	$3, DWORDS		// remaining doublewords
+	BEQ	checkbytes		// only bytes remain
 
-noforwardlarge:
-	CMP	R7,$0			// any remaining bytes
-	BC	4, 1, LR		// ble lr
+lt32gt8:
+        // At this point >= 8 and < 32
+	// Move 16 bytes if possible
+	CMP     DWORDS, $2
+	BLT     lt16
+	LXVD2X	(R0)(SRC), VS32
+	ADD	$-2, DWORDS
+	STXVD2X	VS32, (R0)(TGT)
+	ADD     $16, SRC
+	ADD     $16, TGT
 
-forwardtail:
-	MOVD	R7, CTR			// move tail bytes
-
-forwardtailloop:
-	MOVBZ	0(R4), R8		// move single bytes
-	ADD	$1,R4
-	MOVBZ	R8, 0(R3)
-	ADD	$1,R3
-	BC	16, 0, forwardtailloop
+lt16:	// Move 8 bytes if possible
+	CMP     DWORDS, $1
+	BLT     checkbytes
+	MOVD    0(SRC), TMP
+	ADD	$8, SRC
+	MOVD    TMP, 0(TGT)
+	ADD     $8, TGT
+checkbytes:
+	BC	12, 14, LR		// BEQ lr
+lt8:	// Move word if possible
+	CMP BYTES, $4
+	BLT lt4
+	MOVWZ 0(SRC), TMP
+	ADD $-4, BYTES
+	MOVW TMP, 0(TGT)
+	ADD $4, SRC
+	ADD $4, TGT
+lt4:	// Move halfword if possible
+	CMP BYTES, $2
+	BLT lt2
+	MOVHZ 0(SRC), TMP
+	ADD $-2, BYTES
+	MOVH TMP, 0(TGT)
+	ADD $2, SRC
+	ADD $2, TGT
+lt2:	// Move last byte if 1 left
+	CMP BYTES, $1
+	BC 12, 0, LR	// ble lr
+	MOVBZ 0(SRC), TMP
+	MOVBZ TMP, 0(TGT)
 	RET
 
 backward:
@@ -86,51 +122,51 @@
 	// R3 and R4 are advanced to the end of the destination/source buffers
 	// respectively and moved back as we copy.
 
-	ADD	R5, R4, R4		// end of source
-	ADD	R3, R5, R3		// end of dest
+	ADD	LEN, SRC, SRC		// end of source
+	ADD	TGT, LEN, TGT		// end of dest
 
 	BEQ	nobackwardtail		// earlier condition
 
-	MOVD	R7, CTR			// bytes to move
+	MOVD	BYTES, CTR			// bytes to move
 
 backwardtailloop:
-	MOVBZ 	-1(R4), R8		// point to last byte
-	SUB	$1,R4
-	MOVBZ 	R8, -1(R3)
-	SUB	$1,R3
+	MOVBZ 	-1(SRC), TMP		// point to last byte
+	SUB	$1,SRC
+	MOVBZ 	TMP, -1(TGT)
+	SUB	$1,TGT
 	BC	16, 0, backwardtailloop // bndz
 
 nobackwardtail:
 	BC	4, 5, LR		// ble CR1 lr
 
 backwardlarge:
-	MOVD	R6, CTR
-	SUB	R3, R4, R9		// Use vsx if moving
-	CMP	R9, $32			// at least 32 byte chunks
+	MOVD	DWORDS, CTR
+	SUB	TGT, SRC, TMP		// Use vsx if moving
+	CMP	TMP, $32		// at least 32 byte chunks
 	BLT	backwardlargeloop	// and distance >= 32
-	SRDCC	$2,R6,R8		// 32 byte chunks
+	SRDCC	$2,DWORDS,QWORDS	// 32 byte chunks
 	BNE	backward32setup
 
 backwardlargeloop:
-	MOVD 	-8(R4), R8
-	SUB	$8,R4
-	MOVD 	R8, -8(R3)
-	SUB	$8,R3
+	MOVD 	-8(SRC), TMP
+	SUB	$8,SRC
+	MOVD 	TMP, -8(TGT)
+	SUB	$8,TGT
 	BC	16, 0, backwardlargeloop // bndz
 	RET
 
 backward32setup:
-	MOVD	R8, CTR			// set up loop ctr
-	MOVD	$16, R8			// 32 bytes at at time
+	MOVD	QWORDS, CTR			// set up loop ctr
+	MOVD	$16, IDX16			// 32 bytes at a time
 
 backward32loop:
-	SUB	$32, R4
-	SUB	$32, R3
-	LXVD2X	(R4+R0), VS32           // load 16 bytes
-	LXVD2X	(R4+R8), VS33
-	STXVD2X	VS32, (R3+R0)           // store 16 bytes
-	STXVD2X	VS33, (R3+R8)
+	SUB	$32, TGT
+	SUB	$32, SRC
+	LXVD2X	(R0)(SRC), VS32           // load 16 bytes
+	LXVD2X	(IDX16)(SRC), VS33
+	STXVD2X	VS32, (R0)(TGT)           // store 16 bytes
+	STXVD2X	VS33, (IDX16)(TGT)
 	BC      16, 0, backward32loop   // bndz
 	BC	4, 5, LR		// ble CR1 lr
-	MOVD	R6, CTR
+	MOVD	DWORDS, CTR
 	BR	backwardlargeloop
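
Two pieces of the rewritten PPC64 routine are easy to restate in Go: the forward/backward decision (a single unsigned compare of dest - src against the length) and the size decomposition into 32-byte chunks, leftover doublewords, and sub-doubleword tail bytes. A small self-contained sketch (package main and the function names are mine, for illustration only):

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    // needBackward restates the overlap check at the top of memmove:
    // copy backward only when dst lies inside [src, src+n), which the
    // unsigned subtraction dst-src < n detects in one compare.
    func needBackward(dst, src unsafe.Pointer, n uintptr) bool {
    	return uintptr(dst)-uintptr(src) < n
    }

    func main() {
    	// The size decomposition used by the forward path, e.g. n = 77:
    	n := uintptr(77)
    	bytes := n & 7        // 5  (BYTES, sub-doubleword tail)
    	dwords := n >> 3      // 9  (DWORDS)
    	qwords := dwords >> 2 // 2  (QWORDS, 32-byte chunks)
    	rem := dwords & 3     // 1  (doublewords left after the 32-byte loop)
    	fmt.Println(qwords*32+rem*8+bytes == n) // true: 64 + 8 + 5 = 77
    }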
diff --git a/src/runtime/memmove_riscv64.s b/src/runtime/memmove_riscv64.s
new file mode 100644
index 0000000..5dec8d0
--- /dev/null
+++ b/src/runtime/memmove_riscv64.s
@@ -0,0 +1,98 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// See memmove Go doc for important implementation constraints.
+
+// void runtime·memmove(void*, void*, uintptr)
+TEXT runtime·memmove(SB),NOSPLIT,$-0-24
+	MOV	to+0(FP), T0
+	MOV	from+8(FP), T1
+	MOV	n+16(FP), T2
+	ADD	T1, T2, T5
+
+	// If the destination is ahead of the source, start at the end of the
+	// buffer and go backward.
+	BLTU	T1, T0, b
+
+	// If less than eight bytes, do one byte at a time.
+	SLTU	$8, T2, T3
+	BNE	T3, ZERO, f_outcheck
+
+	// Do one byte at a time until from is eight-aligned.
+	JMP	f_aligncheck
+f_align:
+	MOVB	(T1), T3
+	MOVB	T3, (T0)
+	ADD	$1, T0
+	ADD	$1, T1
+f_aligncheck:
+	AND	$7, T1, T3
+	BNE	T3, ZERO, f_align
+
+	// Do eight bytes at a time as long as there is room.
+	ADD	$-7, T5, T6
+	JMP	f_wordscheck
+f_words:
+	MOV	(T1), T3
+	MOV	T3, (T0)
+	ADD	$8, T0
+	ADD	$8, T1
+f_wordscheck:
+	SLTU	T6, T1, T3
+	BNE	T3, ZERO, f_words
+
+	// Finish off the remaining partial word.
+	JMP 	f_outcheck
+f_out:
+	MOVB	(T1), T3
+	MOVB	T3, (T0)
+	ADD	$1, T0
+	ADD	$1, T1
+f_outcheck:
+	BNE	T1, T5, f_out
+
+	RET
+
+b:
+	ADD	T0, T2, T4
+	// If less than eight bytes, do one byte at a time.
+	SLTU	$8, T2, T3
+	BNE	T3, ZERO, b_outcheck
+
+	// Do one byte at a time until from+n is eight-aligned.
+	JMP	b_aligncheck
+b_align:
+	ADD	$-1, T4
+	ADD	$-1, T5
+	MOVB	(T5), T3
+	MOVB	T3, (T4)
+b_aligncheck:
+	AND	$7, T5, T3
+	BNE	T3, ZERO, b_align
+
+	// Do eight bytes at a time as long as there is room.
+	ADD	$7, T1, T6
+	JMP	b_wordscheck
+b_words:
+	ADD	$-8, T4
+	ADD	$-8, T5
+	MOV	(T5), T3
+	MOV	T3, (T4)
+b_wordscheck:
+	SLTU	T5, T6, T3
+	BNE	T3, ZERO, b_words
+
+	// Finish off the remaining partial word.
+	JMP	b_outcheck
+b_out:
+	ADD	$-1, T4
+	ADD	$-1, T5
+	MOVB	(T5), T3
+	MOVB	T3, (T4)
+b_outcheck:
+	BNE	T5, T1, b_out
+
+	RET
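
The new riscv64 routine follows a simple three-phase pattern: byte copies until the source is 8-byte aligned, 8-byte word copies while at least a word remains, then the trailing partial word. The following Go sketch only illustrates that control flow; it ignores the runtime's pointer-write atomicity rules for the destination and the unsafe.Pointer/uintptr conversion rules, so it is not a usable replacement for memmove:

    package sketch

    import "unsafe"

    func memmoveForwardSketch(to, from unsafe.Pointer, n uintptr) {
    	d, s := uintptr(to), uintptr(from)
    	end := s + n
    	if n >= 8 {
    		for s&7 != 0 { // byte copies until from is 8-aligned
    			*(*byte)(unsafe.Pointer(d)) = *(*byte)(unsafe.Pointer(s))
    			d, s = d+1, s+1
    		}
    		for s+8 <= end { // 8 bytes at a time while there is room
    			*(*uint64)(unsafe.Pointer(d)) = *(*uint64)(unsafe.Pointer(s))
    			d, s = d+8, s+8
    		}
    	}
    	for s < end { // finish the remaining partial word
    		*(*byte)(unsafe.Pointer(d)) = *(*byte)(unsafe.Pointer(s))
    		d, s = d+1, s+1
    	}
    }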
diff --git a/src/runtime/memmove_s390x.s b/src/runtime/memmove_s390x.s
index 4ce98b0..f4c2b87 100644
--- a/src/runtime/memmove_s390x.s
+++ b/src/runtime/memmove_s390x.s
@@ -4,6 +4,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB),NOSPLIT|NOFRAME,$0-24
 	MOVD	to+0(FP), R6
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 0b2e191..396c130 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -11,7 +11,9 @@
 	"internal/race"
 	"internal/testenv"
 	. "runtime"
+	"sync/atomic"
 	"testing"
+	"unsafe"
 )
 
 func TestMemmove(t *testing.T) {
@@ -206,6 +208,71 @@
 	return l
 }
 
+// Ensure that memmove writes pointers atomically, so the GC won't
+// observe a partially updated pointer.
+func TestMemmoveAtomicity(t *testing.T) {
+	if race.Enabled {
+		t.Skip("skip under the race detector -- this test is intentionally racy")
+	}
+
+	var x int
+
+	for _, backward := range []bool{true, false} {
+		for _, n := range []int{3, 4, 5, 6, 7, 8, 9, 10, 15, 25, 49} {
+			n := n
+
+			// test copying [N]*int.
+			sz := uintptr(n * PtrSize)
+			name := fmt.Sprint(sz)
+			if backward {
+				name += "-backward"
+			} else {
+				name += "-forward"
+			}
+			t.Run(name, func(t *testing.T) {
+				// Use overlapping src and dst to force forward/backward copy.
+				var s [100]*int
+				src := s[n-1 : 2*n-1]
+				dst := s[:n]
+				if backward {
+					src, dst = dst, src
+				}
+				for i := range src {
+					src[i] = &x
+				}
+				for i := range dst {
+					dst[i] = nil
+				}
+
+				var ready uint32
+				go func() {
+					sp := unsafe.Pointer(&src[0])
+					dp := unsafe.Pointer(&dst[0])
+					atomic.StoreUint32(&ready, 1)
+					for i := 0; i < 10000; i++ {
+						Memmove(dp, sp, sz)
+						MemclrNoHeapPointers(dp, sz)
+					}
+					atomic.StoreUint32(&ready, 2)
+				}()
+
+				for atomic.LoadUint32(&ready) == 0 {
+					Gosched()
+				}
+
+				for atomic.LoadUint32(&ready) != 2 {
+					for i := range dst {
+						p := dst[i]
+						if p != nil && p != &x {
+							t.Fatalf("got partially updated pointer %p at dst[%d], want either nil or %p", p, i, &x)
+						}
+					}
+				}
+			})
+		}
+	}
+}
+
 func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
 	for _, n := range sizes {
 		b.Run(fmt.Sprint(n), func(b *testing.B) {
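
To exercise just the new atomicity test from a Go checkout, an invocation along these lines should work (the exact flags are a matter of taste):

    go test -run TestMemmoveAtomicity -count=10 runtime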
diff --git a/src/runtime/memmove_wasm.s b/src/runtime/memmove_wasm.s
index d5e2016..8525fea 100644
--- a/src/runtime/memmove_wasm.s
+++ b/src/runtime/memmove_wasm.s
@@ -4,6 +4,8 @@
 
 #include "textflag.h"
 
+// See memmove Go doc for important implementation constraints.
+
 // func memmove(to, from unsafe.Pointer, n uintptr)
 TEXT runtime·memmove(SB), NOSPLIT, $0-24
 	MOVD to+0(FP), R0
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 37b2c38..d6c85a8 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -407,9 +407,9 @@
 	// compute size needed for return parameters
 	nret := uintptr(0)
 	for _, t := range ft.out() {
-		nret = round(nret, uintptr(t.align)) + uintptr(t.size)
+		nret = alignUp(nret, uintptr(t.align)) + uintptr(t.size)
 	}
-	nret = round(nret, sys.PtrSize)
+	nret = alignUp(nret, sys.PtrSize)
 
 	// make sure we have a finalizer goroutine
 	createfing()
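
The old round helper is replaced by alignUp here (and divRoundUp appears in mgcmark.go below). Neither helper is part of this hunk; to the best of my reading they are the usual power-of-two rounding utilities, roughly:

    // alignUp rounds n up to a multiple of a; a must be a power of two.
    func alignUp(n, a uintptr) uintptr {
    	return (n + a - 1) &^ (a - 1) // e.g. alignUp(13, 8) == 16
    }

    // divRoundUp returns ceil(n / a).
    func divRoundUp(n, a uintptr) uintptr {
    	return (n + a - 1) / a // e.g. divRoundUp(300<<10, 256<<10) == 2
    }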
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 823b556..b349951 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -139,6 +139,10 @@
 	_ConcurrentSweep = true
 	_FinBlockSize    = 4 * 1024
 
+	// debugScanConservative enables debug logging for stack
+	// frames that are scanned conservatively.
+	debugScanConservative = false
+
 	// sweepMinHeapDistance is a lower bound on the heap distance
 	// (in bytes) reserved for concurrent sweeping between GC
 	// cycles.
@@ -187,6 +191,9 @@
 
 	work.startSema = 1
 	work.markDoneSema = 1
+	lockInit(&work.sweepWaiters.lock, lockRankSweepWaiters)
+	lockInit(&work.assistQueue.lock, lockRankAssistQueue)
+	lockInit(&work.wbufSpans.lock, lockRankWbufSpans)
 }
 
 func readgogc() int32 {
@@ -488,25 +495,25 @@
 	}
 	live := atomic.Load64(&memstats.heap_live)
 
-	var heapGoal, scanWorkExpected int64
-	if live <= memstats.next_gc {
-		// We're under the soft goal. Pace GC to complete at
-		// next_gc assuming the heap is in steady-state.
-		heapGoal = int64(memstats.next_gc)
+	// Assume we're under the soft goal. Pace GC to complete at
+	// next_gc assuming the heap is in steady-state.
+	heapGoal := int64(memstats.next_gc)
 
-		// Compute the expected scan work remaining.
-		//
-		// This is estimated based on the expected
-		// steady-state scannable heap. For example, with
-		// GOGC=100, only half of the scannable heap is
-		// expected to be live, so that's what we target.
-		//
-		// (This is a float calculation to avoid overflowing on
-		// 100*heap_scan.)
-		scanWorkExpected = int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
-	} else {
-		// We're past the soft goal. Pace GC so that in the
-		// worst case it will complete by the hard goal.
+	// Compute the expected scan work remaining.
+	//
+	// This is estimated based on the expected
+	// steady-state scannable heap. For example, with
+	// GOGC=100, only half of the scannable heap is
+	// expected to be live, so that's what we target.
+	//
+	// (This is a float calculation to avoid overflowing on
+	// 100*heap_scan.)
+	scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
+
+	if live > memstats.next_gc || c.scanWork > scanWorkExpected {
+		// We're past the soft goal, or we've already done more scan
+		// work than we expected. Pace GC so that in the worst case it
+		// will complete by the hard goal.
 		const maxOvershoot = 1.1
 		heapGoal = int64(float64(memstats.next_gc) * maxOvershoot)
 
@@ -518,7 +525,7 @@
 	//
 	// Note that we currently count allocations during GC as both
 	// scannable heap (heap_scan) and scan work completed
-	// (scanWork), so allocation will change this difference will
+	// (scanWork), so allocation will change this difference
 	// slowly in the soft regime and not at all in the hard
 	// regime.
 	scanWorkRemaining := scanWorkExpected - c.scanWork
@@ -764,17 +771,39 @@
 	}
 
 	// Set the trigger ratio, capped to reasonable bounds.
-	if triggerRatio < 0 {
-		// This can happen if the mutator is allocating very
-		// quickly or the GC is scanning very slowly.
-		triggerRatio = 0
-	} else if gcpercent >= 0 {
+	if gcpercent >= 0 {
+		scalingFactor := float64(gcpercent) / 100
 		// Ensure there's always a little margin so that the
 		// mutator assist ratio isn't infinity.
-		maxTriggerRatio := 0.95 * float64(gcpercent) / 100
+		maxTriggerRatio := 0.95 * scalingFactor
 		if triggerRatio > maxTriggerRatio {
 			triggerRatio = maxTriggerRatio
 		}
+
+		// If we let triggerRatio go too low, then if the application
+		// is allocating very rapidly we might end up in a situation
+		// where we're allocating black during a nearly always-on GC.
+		// The result of this is a growing heap and ultimately an
+		// increase in RSS. By capping us at a point >0, we're essentially
+		// saying that we're OK using more CPU during the GC to prevent
+		// this growth in RSS.
+		//
+		// The current constant was chosen empirically: given a sufficiently
+		// fast/scalable allocator with 48 Ps that could drive the trigger ratio
+		// to <0.05, this constant causes applications to retain the same peak
+		// RSS compared to not having this allocator.
+		minTriggerRatio := 0.6 * scalingFactor
+		if triggerRatio < minTriggerRatio {
+			triggerRatio = minTriggerRatio
+		}
+	} else if triggerRatio < 0 {
+		// gcpercent < 0, so just make sure we're not getting a negative
+		// triggerRatio. This case isn't expected to happen in practice,
+		// and doesn't really matter because if gcpercent < 0 then we won't
+		// ever consume triggerRatio further on in this function, but let's
+		// just be defensive here; the triggerRatio being negative is almost
+		// certainly undesirable.
+		triggerRatio = 0
 	}
 	memstats.triggerRatio = triggerRatio
 
@@ -845,7 +874,8 @@
 			heapDistance = _PageSize
 		}
 		pagesSwept := atomic.Load64(&mheap_.pagesSwept)
-		sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
+		pagesInUse := atomic.Load64(&mheap_.pagesInUse)
+		sweepDistancePages := int64(pagesInUse) - int64(pagesSwept)
 		if sweepDistancePages <= 0 {
 			mheap_.sweepPagesPerByte = 0
 		} else {
@@ -1248,6 +1278,7 @@
 	}
 
 	// Ok, we're doing it! Stop everybody else
+	semacquire(&gcsema)
 	semacquire(&worldsema)
 
 	if trace.enabled {
@@ -1287,6 +1318,7 @@
 	systemstack(func() {
 		finishsweep_m()
 	})
+
 	// clearpools before we start the GC. If we wait they memory will not be
 	// reclaimed until the next GC cycle.
 	clearpools()
@@ -1340,15 +1372,26 @@
 	// the world.
 	gcController.markStartTime = now
 
+	// In STW mode, we could block the instant systemstack
+	// returns, so make sure we're not preemptible.
+	mp = acquirem()
+
 	// Concurrent mark.
 	systemstack(func() {
 		now = startTheWorldWithSema(trace.enabled)
 		work.pauseNS += now - work.pauseStart
 		work.tMark = now
 	})
-	// In STW mode, we could block the instant systemstack
-	// returns, so don't do anything important here. Make sure we
-	// block rather than returning to user code.
+
+	// Release the world sema before Gosched() in STW mode
+	// because we will need to reacquire it later but before
+	// this goroutine becomes runnable again, and we could
+	// self-deadlock otherwise.
+	semrelease(&worldsema)
+	releasem(mp)
+
+	// Make sure we block instead of returning to user code
+	// in STW mode.
 	if mode != gcBackgroundMode {
 		Gosched()
 	}
@@ -1415,6 +1458,10 @@
 		return
 	}
 
+	// forEachP needs worldsema to execute, and we'll need it to
+	// stop the world later, so acquire worldsema now.
+	semacquire(&worldsema)
+
 	// Flush all local buffers and collect flushedWork flags.
 	gcMarkDoneFlushed = 0
 	systemstack(func() {
@@ -1475,6 +1522,7 @@
 		// work to do. Keep going. It's possible the
 		// transition condition became true again during the
 		// ragged barrier, so re-check it.
+		semrelease(&worldsema)
 		goto top
 	}
 
@@ -1551,6 +1599,7 @@
 				now := startTheWorldWithSema(true)
 				work.pauseNS += now - work.pauseStart
 			})
+			semrelease(&worldsema)
 			goto top
 		}
 	}
@@ -1649,6 +1698,10 @@
 		throw("gc done but gcphase != _GCoff")
 	}
 
+	// Record next_gc and heap_inuse for scavenger.
+	memstats.last_next_gc = memstats.next_gc
+	memstats.last_heap_inuse = memstats.heap_inuse
+
 	// Update GC trigger and pacing for the next cycle.
 	gcSetTriggerRatio(nextTriggerRatio)
 
@@ -1761,6 +1814,7 @@
 	}
 
 	semrelease(&worldsema)
+	semrelease(&gcsema)
 	// Careful: another GC cycle may start now.
 
 	releasem(mp)
@@ -2083,6 +2137,9 @@
 
 // gcSweep must be called on the system stack because it acquires the heap
 // lock. See mheap for details.
+//
+// The world must be stopped.
+//
 //go:systemstack
 func gcSweep(mode gcMode) {
 	if gcphase != _GCoff {
@@ -2092,7 +2149,7 @@
 	lock(&mheap_.lock)
 	mheap_.sweepgen += 2
 	mheap_.sweepdone = 0
-	if mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
+	if !go115NewMCentralImpl && mheap_.sweepSpans[mheap_.sweepgen/2%2].index != 0 {
 		// We should have drained this list during the last
 		// sweep phase. We certainly need to start this phase
 		// with an empty swept list.
@@ -2104,6 +2161,10 @@
 	mheap_.reclaimCredit = 0
 	unlock(&mheap_.lock)
 
+	if go115NewMCentralImpl {
+		sweep.centralIndex.clear()
+	}
+
 	if !_ConcurrentSweep || mode == gcForceBlockMode {
 		// Special case synchronous sweep.
 		// Record that no proportional sweeping has to happen.
@@ -2150,8 +2211,7 @@
 	// allgs doesn't change.
 	lock(&allglock)
 	for _, gp := range allgs {
-		gp.gcscandone = false  // set to true in gcphasework
-		gp.gcscanvalid = false // stack has not been scanned
+		gp.gcscandone = false // set to true in gcphasework
 		gp.gcAssistBytes = 0
 	}
 	unlock(&allglock)
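
The pacing change above clamps the trigger ratio to a band rather than only capping it. Restated as a free-standing function (the name is mine; the logic is a direct transcription of the hunk), with GOGC=100 the band is [0.6, 0.95], so a computed ratio of 0.05 is raised to 0.6 and 1.2 is lowered to 0.95:

    func clampTriggerRatio(triggerRatio float64, gcpercent int32) float64 {
    	if gcpercent >= 0 {
    		scalingFactor := float64(gcpercent) / 100
    		if max := 0.95 * scalingFactor; triggerRatio > max {
    			triggerRatio = max // keep some margin for assists
    		}
    		if min := 0.6 * scalingFactor; triggerRatio < min {
    			triggerRatio = min // trade CPU for bounded RSS growth
    		}
    	} else if triggerRatio < 0 {
    		triggerRatio = 0 // defensive: never return a negative ratio
    	}
    	return triggerRatio
    }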
diff --git a/src/runtime/mgclarge.go b/src/runtime/mgclarge.go
deleted file mode 100644
index 414db10..0000000
--- a/src/runtime/mgclarge.go
+++ /dev/null
@@ -1,657 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Page heap.
-//
-// See malloc.go for the general overview.
-//
-// Allocation policy is the subject of this file. All free spans live in
-// a treap for most of their time being free. See
-// https://en.wikipedia.org/wiki/Treap or
-// https://faculty.washington.edu/aragon/pubs/rst89.pdf for an overview.
-// sema.go also holds an implementation of a treap.
-//
-// Each treapNode holds a single span. The treap is sorted by base address
-// and each span necessarily has a unique base address.
-// Spans are returned based on a first-fit algorithm, acquiring the span
-// with the lowest base address which still satisfies the request.
-//
-// The first-fit algorithm is possible due to an augmentation of each
-// treapNode to maintain the size of the largest span in the subtree rooted
-// at that treapNode. Below we refer to this invariant as the maxPages
-// invariant.
-//
-// The primary routines are
-// insert: adds a span to the treap
-// remove: removes the span from that treap that best fits the required size
-// removeSpan: which removes a specific span from the treap
-//
-// Whenever a pointer to a span which is owned by the treap is acquired, that
-// span must not be mutated. To mutate a span in the treap, remove it first.
-//
-// mheap_.lock must be held when manipulating this data structure.
-
-package runtime
-
-import (
-	"unsafe"
-)
-
-//go:notinheap
-type mTreap struct {
-	treap           *treapNode
-	unscavHugePages uintptr // number of unscavenged huge pages in the treap
-}
-
-//go:notinheap
-type treapNode struct {
-	right    *treapNode      // all treapNodes > this treap node
-	left     *treapNode      // all treapNodes < this treap node
-	parent   *treapNode      // direct parent of this node, nil if root
-	key      uintptr         // base address of the span, used as primary sort key
-	span     *mspan          // span at base address key
-	maxPages uintptr         // the maximum size of any span in this subtree, including the root
-	priority uint32          // random number used by treap algorithm to keep tree probabilistically balanced
-	types    treapIterFilter // the types of spans available in this subtree
-}
-
-// updateInvariants is a helper method which has a node recompute its own
-// maxPages and types values by looking at its own span as well as the
-// values of its direct children.
-//
-// Returns true if anything changed.
-func (t *treapNode) updateInvariants() bool {
-	m, i := t.maxPages, t.types
-	t.maxPages = t.span.npages
-	t.types = t.span.treapFilter()
-	if t.left != nil {
-		t.types |= t.left.types
-		if t.maxPages < t.left.maxPages {
-			t.maxPages = t.left.maxPages
-		}
-	}
-	if t.right != nil {
-		t.types |= t.right.types
-		if t.maxPages < t.right.maxPages {
-			t.maxPages = t.right.maxPages
-		}
-	}
-	return m != t.maxPages || i != t.types
-}
-
-// findMinimal finds the minimal (lowest base addressed) node in the treap
-// which matches the criteria set out by the filter f and returns nil if
-// none exists.
-//
-// This algorithm is functionally the same as (*mTreap).find, so see that
-// method for more details.
-func (t *treapNode) findMinimal(f treapIterFilter) *treapNode {
-	if t == nil || !f.matches(t.types) {
-		return nil
-	}
-	for t != nil {
-		if t.left != nil && f.matches(t.left.types) {
-			t = t.left
-		} else if f.matches(t.span.treapFilter()) {
-			break
-		} else if t.right != nil && f.matches(t.right.types) {
-			t = t.right
-		} else {
-			println("runtime: f=", f)
-			throw("failed to find minimal node matching filter")
-		}
-	}
-	return t
-}
-
-// findMaximal finds the maximal (highest base addressed) node in the treap
-// which matches the criteria set out by the filter f and returns nil if
-// none exists.
-//
-// This algorithm is the logical inversion of findMinimal and just changes
-// the order of the left and right tests.
-func (t *treapNode) findMaximal(f treapIterFilter) *treapNode {
-	if t == nil || !f.matches(t.types) {
-		return nil
-	}
-	for t != nil {
-		if t.right != nil && f.matches(t.right.types) {
-			t = t.right
-		} else if f.matches(t.span.treapFilter()) {
-			break
-		} else if t.left != nil && f.matches(t.left.types) {
-			t = t.left
-		} else {
-			println("runtime: f=", f)
-			throw("failed to find minimal node matching filter")
-		}
-	}
-	return t
-}
-
-// pred returns the predecessor of t in the treap subject to the criteria
-// specified by the filter f. Returns nil if no such predecessor exists.
-func (t *treapNode) pred(f treapIterFilter) *treapNode {
-	if t.left != nil && f.matches(t.left.types) {
-		// The node has a left subtree which contains at least one matching
-		// node, find the maximal matching node in that subtree.
-		return t.left.findMaximal(f)
-	}
-	// Lacking a left subtree, look to the parents.
-	p := t // previous node
-	t = t.parent
-	for t != nil {
-		// Walk up the tree until we find a node that has a left subtree
-		// that we haven't already visited.
-		if t.right == p {
-			if f.matches(t.span.treapFilter()) {
-				// If this node matches, then it's guaranteed to be the
-				// predecessor since everything to its left is strictly
-				// greater.
-				return t
-			} else if t.left != nil && f.matches(t.left.types) {
-				// Failing the root of this subtree, if its left subtree has
-				// something, that's where we'll find our predecessor.
-				return t.left.findMaximal(f)
-			}
-		}
-		p = t
-		t = t.parent
-	}
-	// If the parent is nil, then we've hit the root without finding
-	// a suitable left subtree containing the node (and the predecessor
-	// wasn't on the path). Thus, there's no predecessor, so just return
-	// nil.
-	return nil
-}
-
-// succ returns the successor of t in the treap subject to the criteria
-// specified by the filter f. Returns nil if no such successor exists.
-func (t *treapNode) succ(f treapIterFilter) *treapNode {
-	// See pred. This method is just the logical inversion of it.
-	if t.right != nil && f.matches(t.right.types) {
-		return t.right.findMinimal(f)
-	}
-	p := t
-	t = t.parent
-	for t != nil {
-		if t.left == p {
-			if f.matches(t.span.treapFilter()) {
-				return t
-			} else if t.right != nil && f.matches(t.right.types) {
-				return t.right.findMinimal(f)
-			}
-		}
-		p = t
-		t = t.parent
-	}
-	return nil
-}
-
-// isSpanInTreap is handy for debugging. One should hold the heap lock, usually
-// mheap_.lock().
-func (t *treapNode) isSpanInTreap(s *mspan) bool {
-	if t == nil {
-		return false
-	}
-	return t.span == s || t.left.isSpanInTreap(s) || t.right.isSpanInTreap(s)
-}
-
-// walkTreap is handy for debugging and testing.
-// Starting at some treapnode t, for example the root, do a depth first preorder walk of
-// the tree executing fn at each treap node. One should hold the heap lock, usually
-// mheap_.lock().
-func (t *treapNode) walkTreap(fn func(tn *treapNode)) {
-	if t == nil {
-		return
-	}
-	fn(t)
-	t.left.walkTreap(fn)
-	t.right.walkTreap(fn)
-}
-
-// checkTreapNode when used in conjunction with walkTreap can usually detect a
-// poorly formed treap.
-func checkTreapNode(t *treapNode) {
-	if t == nil {
-		return
-	}
-	if t.span.next != nil || t.span.prev != nil || t.span.list != nil {
-		throw("span may be on an mSpanList while simultaneously in the treap")
-	}
-	if t.span.base() != t.key {
-		println("runtime: checkTreapNode treapNode t=", t, "     t.key=", t.key,
-			"t.span.base()=", t.span.base())
-		throw("why does span.base() and treap.key do not match?")
-	}
-	if t.left != nil && t.key < t.left.key {
-		throw("found out-of-order spans in treap (left child has greater base address)")
-	}
-	if t.right != nil && t.key > t.right.key {
-		throw("found out-of-order spans in treap (right child has lesser base address)")
-	}
-}
-
-// validateInvariants is handy for debugging and testing.
-// It ensures that the various invariants on each treap node are
-// appropriately maintained throughout the treap by walking the
-// treap in a post-order manner.
-func (t *treapNode) validateInvariants() (uintptr, treapIterFilter) {
-	if t == nil {
-		return 0, 0
-	}
-	leftMax, leftTypes := t.left.validateInvariants()
-	rightMax, rightTypes := t.right.validateInvariants()
-	max := t.span.npages
-	if leftMax > max {
-		max = leftMax
-	}
-	if rightMax > max {
-		max = rightMax
-	}
-	if max != t.maxPages {
-		println("runtime: t.maxPages=", t.maxPages, "want=", max)
-		throw("maxPages invariant violated in treap")
-	}
-	typ := t.span.treapFilter() | leftTypes | rightTypes
-	if typ != t.types {
-		println("runtime: t.types=", t.types, "want=", typ)
-		throw("types invariant violated in treap")
-	}
-	return max, typ
-}
-
-// treapIterType represents the type of iteration to perform
-// over the treap. Each different flag is represented by a bit
-// in the type, and types may be combined together by a bitwise
-// or operation.
-//
-// Note that only 5 bits are available for treapIterType, do not
-// use the 3 higher-order bits. This constraint is to allow for
-// expansion into a treapIterFilter, which is a uint32.
-type treapIterType uint8
-
-const (
-	treapIterScav treapIterType = 1 << iota // scavenged spans
-	treapIterHuge                           // spans containing at least one huge page
-	treapIterBits = iota
-)
-
-// treapIterFilter is a bitwise filter of different spans by binary
-// properties. Each bit of a treapIterFilter represents a unique
-// combination of bits set in a treapIterType, in other words, it
-// represents the power set of a treapIterType.
-//
-// The purpose of this representation is to allow the existence of
-// a specific span type to bubble up in the treap (see the types
-// field on treapNode).
-//
-// More specifically, any treapIterType may be transformed into a
-// treapIterFilter for a specific combination of flags via the
-// following operation: 1 << (0x1f&treapIterType).
-type treapIterFilter uint32
-
-// treapFilterAll represents the filter which allows all spans.
-const treapFilterAll = ^treapIterFilter(0)
-
-// treapFilter creates a new treapIterFilter from two treapIterTypes.
-// mask represents a bitmask for which flags we should check against
-// and match for the expected result after applying the mask.
-func treapFilter(mask, match treapIterType) treapIterFilter {
-	allow := treapIterFilter(0)
-	for i := treapIterType(0); i < 1<<treapIterBits; i++ {
-		if mask&i == match {
-			allow |= 1 << i
-		}
-	}
-	return allow
-}
-
-// matches returns true if m and f intersect.
-func (f treapIterFilter) matches(m treapIterFilter) bool {
-	return f&m != 0
-}
-
-// treapFilter returns the treapIterFilter exactly matching this span,
-// i.e. popcount(result) == 1.
-func (s *mspan) treapFilter() treapIterFilter {
-	have := treapIterType(0)
-	if s.scavenged {
-		have |= treapIterScav
-	}
-	if s.hugePages() > 0 {
-		have |= treapIterHuge
-	}
-	return treapIterFilter(uint32(1) << (0x1f & have))
-}
-
-// treapIter is a bidirectional iterator type which may be used to iterate over a
-// an mTreap in-order forwards (increasing order) or backwards (decreasing order).
-// Its purpose is to hide details about the treap from users when trying to iterate
-// over it.
-//
-// To create iterators over the treap, call start or end on an mTreap.
-type treapIter struct {
-	f treapIterFilter
-	t *treapNode
-}
-
-// span returns the span at the current position in the treap.
-// If the treap is not valid, span will panic.
-func (i *treapIter) span() *mspan {
-	return i.t.span
-}
-
-// valid returns whether the iterator represents a valid position
-// in the mTreap.
-func (i *treapIter) valid() bool {
-	return i.t != nil
-}
-
-// next moves the iterator forward by one. Once the iterator
-// ceases to be valid, calling next will panic.
-func (i treapIter) next() treapIter {
-	i.t = i.t.succ(i.f)
-	return i
-}
-
-// prev moves the iterator backwards by one. Once the iterator
-// ceases to be valid, calling prev will panic.
-func (i treapIter) prev() treapIter {
-	i.t = i.t.pred(i.f)
-	return i
-}
-
-// start returns an iterator which points to the start of the treap (the
-// left-most node in the treap) subject to mask and match constraints.
-func (root *mTreap) start(mask, match treapIterType) treapIter {
-	f := treapFilter(mask, match)
-	return treapIter{f, root.treap.findMinimal(f)}
-}
-
-// end returns an iterator which points to the end of the treap (the
-// right-most node in the treap) subject to mask and match constraints.
-func (root *mTreap) end(mask, match treapIterType) treapIter {
-	f := treapFilter(mask, match)
-	return treapIter{f, root.treap.findMaximal(f)}
-}
-
-// mutate allows one to mutate the span without removing it from the treap via a
-// callback. The span's base and size are allowed to change as long as the span
-// remains in the same order relative to its predecessor and successor.
-//
-// Note however that any operation that causes a treap rebalancing inside of fn
-// is strictly forbidden, as that may cause treap node metadata to go
-// out-of-sync.
-func (root *mTreap) mutate(i treapIter, fn func(span *mspan)) {
-	s := i.span()
-	// Save some state about the span for later inspection.
-	hpages := s.hugePages()
-	scavenged := s.scavenged
-	// Call the mutator.
-	fn(s)
-	// Update unscavHugePages appropriately.
-	if !scavenged {
-		mheap_.free.unscavHugePages -= hpages
-	}
-	if !s.scavenged {
-		mheap_.free.unscavHugePages += s.hugePages()
-	}
-	// Update the key in case the base changed.
-	i.t.key = s.base()
-	// Updating invariants up the tree needs to happen if
-	// anything changed at all, so just go ahead and do it
-	// unconditionally.
-	//
-	// If it turns out nothing changed, it'll exit quickly.
-	t := i.t
-	for t != nil && t.updateInvariants() {
-		t = t.parent
-	}
-}
-
-// insert adds span to the large span treap.
-func (root *mTreap) insert(span *mspan) {
-	if !span.scavenged {
-		root.unscavHugePages += span.hugePages()
-	}
-	base := span.base()
-	var last *treapNode
-	pt := &root.treap
-	for t := *pt; t != nil; t = *pt {
-		last = t
-		if t.key < base {
-			pt = &t.right
-		} else if t.key > base {
-			pt = &t.left
-		} else {
-			throw("inserting span already in treap")
-		}
-	}
-
-	// Add t as new leaf in tree of span size and unique addrs.
-	// The balanced tree is a treap using priority as the random heap priority.
-	// That is, it is a binary tree ordered according to the key,
-	// but then among the space of possible binary trees respecting those
-	// keys, it is kept balanced on average by maintaining a heap ordering
-	// on the priority: s.priority <= both s.right.priority and s.right.priority.
-	// https://en.wikipedia.org/wiki/Treap
-	// https://faculty.washington.edu/aragon/pubs/rst89.pdf
-
-	t := (*treapNode)(mheap_.treapalloc.alloc())
-	t.key = span.base()
-	t.priority = fastrand()
-	t.span = span
-	t.maxPages = span.npages
-	t.types = span.treapFilter()
-	t.parent = last
-	*pt = t // t now at a leaf.
-
-	// Update the tree to maintain the various invariants.
-	i := t
-	for i.parent != nil && i.parent.updateInvariants() {
-		i = i.parent
-	}
-
-	// Rotate up into tree according to priority.
-	for t.parent != nil && t.parent.priority > t.priority {
-		if t != nil && t.span.base() != t.key {
-			println("runtime: insert t=", t, "t.key=", t.key)
-			println("runtime:      t.span=", t.span, "t.span.base()=", t.span.base())
-			throw("span and treap node base addresses do not match")
-		}
-		if t.parent.left == t {
-			root.rotateRight(t.parent)
-		} else {
-			if t.parent.right != t {
-				throw("treap insert finds a broken treap")
-			}
-			root.rotateLeft(t.parent)
-		}
-	}
-}
-
-func (root *mTreap) removeNode(t *treapNode) {
-	if !t.span.scavenged {
-		root.unscavHugePages -= t.span.hugePages()
-	}
-	if t.span.base() != t.key {
-		throw("span and treap node base addresses do not match")
-	}
-	// Rotate t down to be leaf of tree for removal, respecting priorities.
-	for t.right != nil || t.left != nil {
-		if t.right == nil || t.left != nil && t.left.priority < t.right.priority {
-			root.rotateRight(t)
-		} else {
-			root.rotateLeft(t)
-		}
-	}
-	// Remove t, now a leaf.
-	if t.parent != nil {
-		p := t.parent
-		if p.left == t {
-			p.left = nil
-		} else {
-			p.right = nil
-		}
-		// Walk up the tree updating invariants until no updates occur.
-		for p != nil && p.updateInvariants() {
-			p = p.parent
-		}
-	} else {
-		root.treap = nil
-	}
-	// Return the found treapNode's span after freeing the treapNode.
-	mheap_.treapalloc.free(unsafe.Pointer(t))
-}
-
-// find searches for, finds, and returns the treap iterator over all spans
-// representing the position of the span with the smallest base address which is
-// at least npages in size. If no span has at least npages it returns an invalid
-// iterator.
-//
-// This algorithm is as follows:
-// * If there's a left child and its subtree can satisfy this allocation,
-//   continue down that subtree.
-// * If there's no such left child, check if the root of this subtree can
-//   satisfy the allocation. If so, we're done.
-// * If the root cannot satisfy the allocation either, continue down the
-//   right subtree if able.
-// * Else, break and report that we cannot satisfy the allocation.
-//
-// The preference for left, then current, then right, results in us getting
-// the left-most node which will contain the span with the lowest base
-// address.
-//
-// Note that if a request cannot be satisfied the fourth case will be
-// reached immediately at the root, since neither the left subtree nor
-// the right subtree will have a sufficient maxPages, whilst the root
-// node is also unable to satisfy it.
-func (root *mTreap) find(npages uintptr) treapIter {
-	t := root.treap
-	for t != nil {
-		if t.span == nil {
-			throw("treap node with nil span found")
-		}
-		// Iterate over the treap trying to go as far left
-		// as possible while simultaneously ensuring that the
-		// subtrees we choose always have a span which can
-		// satisfy the allocation.
-		if t.left != nil && t.left.maxPages >= npages {
-			t = t.left
-		} else if t.span.npages >= npages {
-			// Before going right, if this span can satisfy the
-			// request, stop here.
-			break
-		} else if t.right != nil && t.right.maxPages >= npages {
-			t = t.right
-		} else {
-			t = nil
-		}
-	}
-	return treapIter{treapFilterAll, t}
-}
-
-// removeSpan searches for, finds, deletes span along with
-// the associated treap node. If the span is not in the treap
-// then t will eventually be set to nil and the t.span
-// will throw.
-func (root *mTreap) removeSpan(span *mspan) {
-	base := span.base()
-	t := root.treap
-	for t.span != span {
-		if t.key < base {
-			t = t.right
-		} else if t.key > base {
-			t = t.left
-		}
-	}
-	root.removeNode(t)
-}
-
-// erase removes the element referred to by the current position of the
-// iterator. This operation consumes the given iterator, so it should no
-// longer be used. It is up to the caller to get the next or previous
-// iterator before calling erase, if need be.
-func (root *mTreap) erase(i treapIter) {
-	root.removeNode(i.t)
-}
-
-// rotateLeft rotates the tree rooted at node x.
-// turning (x a (y b c)) into (y (x a b) c).
-func (root *mTreap) rotateLeft(x *treapNode) {
-	// p -> (x a (y b c))
-	p := x.parent
-	a, y := x.left, x.right
-	b, c := y.left, y.right
-
-	y.left = x
-	x.parent = y
-	y.right = c
-	if c != nil {
-		c.parent = y
-	}
-	x.left = a
-	if a != nil {
-		a.parent = x
-	}
-	x.right = b
-	if b != nil {
-		b.parent = x
-	}
-
-	y.parent = p
-	if p == nil {
-		root.treap = y
-	} else if p.left == x {
-		p.left = y
-	} else {
-		if p.right != x {
-			throw("large span treap rotateLeft")
-		}
-		p.right = y
-	}
-
-	x.updateInvariants()
-	y.updateInvariants()
-}
-
-// rotateRight rotates the tree rooted at node y.
-// turning (y (x a b) c) into (x a (y b c)).
-func (root *mTreap) rotateRight(y *treapNode) {
-	// p -> (y (x a b) c)
-	p := y.parent
-	x, c := y.left, y.right
-	a, b := x.left, x.right
-
-	x.left = a
-	if a != nil {
-		a.parent = x
-	}
-	x.right = y
-	y.parent = x
-	y.left = b
-	if b != nil {
-		b.parent = y
-	}
-	y.right = c
-	if c != nil {
-		c.parent = y
-	}
-
-	x.parent = p
-	if p == nil {
-		root.treap = x
-	} else if p.left == y {
-		p.left = x
-	} else {
-		if p.right != y {
-			throw("large span treap rotateRight")
-		}
-		p.right = x
-	}
-
-	y.updateInvariants()
-	x.updateInvariants()
-}
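
For the record, the power-set filter encoding that the deleted comments describe is compact but easy to miss: a span's own filter has exactly one bit set (1 << its type bits), while an iteration filter sets one bit per allowed combination, and a match is any intersection. A short sketch using treapFilter exactly as it appeared in the deleted code:

    type treapIterType uint8
    type treapIterFilter uint32

    const (
    	treapIterScav treapIterType = 1 << iota // scavenged spans
    	treapIterHuge                           // spans with >= 1 huge page
    	treapIterBits = iota                    // == 2
    )

    // treapFilter (copied from the deleted file): bit i of the result is
    // set for every type combination i satisfying mask&i == match.
    func treapFilter(mask, match treapIterType) treapIterFilter {
    	allow := treapIterFilter(0)
    	for i := treapIterType(0); i < 1<<treapIterBits; i++ {
    		if mask&i == match {
    			allow |= 1 << i
    		}
    	}
    	return allow
    }

    // Example: treapFilter(treapIterScav, treapIterScav) == 0b1010, which
    // intersects the per-span filter 1<<1 == 0b0010 of a scavenged,
    // non-huge span, so that span matches a "scavenged only" iteration.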
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 2c63724..fe988c4 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -21,10 +21,6 @@
 	// BSS root.
 	rootBlockBytes = 256 << 10
 
-	// rootBlockSpans is the number of spans to scan per span
-	// root.
-	rootBlockSpans = 8 * 1024 // 64MB worth of spans
-
 	// maxObletBytes is the maximum bytes of an object to scan at
 	// once. Larger objects will be split up into "oblets" of at
 	// most this size. Since we can scan 1–2 MB/ms, 128 KB bounds
@@ -41,22 +37,32 @@
 	// a syscall, so its overhead is nontrivial). Higher values
 	// make the system less responsive to incoming work.
 	drainCheckThreshold = 100000
+
+	// pagesPerSpanRoot indicates how many pages to scan from a span root
+	// at a time. Used by special root marking.
+	//
+	// Higher values improve throughput by increasing locality, but
+	// increase the minimum latency of a marking operation.
+	//
+	// Must be a multiple of the pageInUse bitmap element size and
+	// must also evenly divide pagesPerArena.
+	pagesPerSpanRoot = 512
+
+	// go115NewMarkrootSpans is a feature flag that indicates whether
+	// to use the new bitmap-based markrootSpans implementation.
+	go115NewMarkrootSpans = true
 )
 
 // gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
 // some miscellany) and initializes scanning-related state.
 //
-// The caller must have call gcCopySpans().
-//
 // The world must be stopped.
-//
-//go:nowritebarrier
 func gcMarkRootPrepare() {
 	work.nFlushCacheRoots = 0
 
 	// Compute how many data and BSS root blocks there are.
 	nBlocks := func(bytes uintptr) int {
-		return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
+		return int(divRoundUp(bytes, rootBlockBytes))
 	}
 
 	work.nDataRoots = 0
@@ -81,13 +87,24 @@
 	//
 	// We depend on addfinalizer to mark objects that get
 	// finalizers after root marking.
-	//
-	// We're only interested in scanning the in-use spans,
-	// which will all be swept at this point. More spans
-	// may be added to this list during concurrent GC, but
-	// we only care about spans that were allocated before
-	// this mark phase.
-	work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
+	if go115NewMarkrootSpans {
+		// We're going to scan the whole heap (that was available at the time the
+		// mark phase started, i.e. markArenas) for in-use spans which have specials.
+		//
+		// Break up the work into arenas, and further into chunks.
+		//
+		// Snapshot allArenas as markArenas. This snapshot is safe because allArenas
+		// is append-only.
+		mheap_.markArenas = mheap_.allArenas[:len(mheap_.allArenas):len(mheap_.allArenas)]
+		work.nSpanRoots = len(mheap_.markArenas) * (pagesPerArena / pagesPerSpanRoot)
+	} else {
+		// We're only interested in scanning the in-use spans,
+		// which will all be swept at this point. More spans
+		// may be added to this list during concurrent GC, but
+		// we only care about spans that were allocated before
+		// this mark phase.
+		work.nSpanRoots = mheap_.sweepSpans[mheap_.sweepgen/2%2].numBlocks()
+	}
 
 	// Scan stacks.
 	//
@@ -125,8 +142,7 @@
 fail:
 	println("gp", gp, "goid", gp.goid,
 		"status", readgstatus(gp),
-		"gcscandone", gp.gcscandone,
-		"gcscanvalid", gp.gcscanvalid)
+		"gcscandone", gp.gcscandone)
 	unlock(&allglock) // Avoid self-deadlock with traceback.
 	throw("scan missed a g")
 }
@@ -197,7 +213,7 @@
 			gp.waitsince = work.tstart
 		}
 
-		// scang must be done on the system stack in case
+		// scanstack must be done on the system stack in case
 		// we're trying to scan our own stack.
 		systemstack(func() {
 			// If this is a self-scan, put the user G in
@@ -211,14 +227,24 @@
 				userG.waitreason = waitReasonGarbageCollectionScan
 			}
 
-			// TODO: scang blocks until gp's stack has
-			// been scanned, which may take a while for
+			// TODO: suspendG blocks (and spins) until gp
+			// stops, which may take a while for
 			// running goroutines. Consider doing this in
 			// two phases where the first is non-blocking:
 			// we scan the stacks we can and ask running
 			// goroutines to scan themselves; and the
 			// second blocks.
-			scang(gp, gcw)
+			stopped := suspendG(gp)
+			if stopped.dead {
+				gp.gcscandone = true
+				return
+			}
+			if gp.gcscandone {
+				throw("g already scanned")
+			}
+			scanstack(gp, gcw)
+			gp.gcscandone = true
+			resumeG(stopped)
 
 			if selfScan {
 				casgstatus(userG, _Gwaiting, _Grunning)
@@ -237,14 +263,18 @@
 		throw("rootBlockBytes must be a multiple of 8*ptrSize")
 	}
 
-	b := b0 + uintptr(shard)*rootBlockBytes
-	if b >= b0+n0 {
+	// Note that if b0 is toward the end of the address space,
+	// then b0 + rootBlockBytes might wrap around.
+	// These tests are written to avoid any possible overflow.
+	off := uintptr(shard) * rootBlockBytes
+	if off >= n0 {
 		return
 	}
+	b := b0 + off
 	ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*sys.PtrSize))))
 	n := uintptr(rootBlockBytes)
-	if b+n > b0+n0 {
-		n = b0 + n0 - b
+	if off+n > n0 {
+		n = n0 - off
 	}
 
 	// Scan this shard.
@@ -282,10 +312,96 @@
 	unlock(&sched.gFree.lock)
 }
 
-// markrootSpans marks roots for one shard of work.spans.
+// markrootSpans marks roots for one shard of markArenas.
 //
 //go:nowritebarrier
 func markrootSpans(gcw *gcWork, shard int) {
+	if !go115NewMarkrootSpans {
+		oldMarkrootSpans(gcw, shard)
+		return
+	}
+	// Objects with finalizers have two GC-related invariants:
+	//
+	// 1) Everything reachable from the object must be marked.
+	// This ensures that when we pass the object to its finalizer,
+	// everything the finalizer can reach will be retained.
+	//
+	// 2) Finalizer specials (which are not in the garbage
+	// collected heap) are roots. In practice, this means the fn
+	// field must be scanned.
+	sg := mheap_.sweepgen
+
+	// Find the arena and page index into that arena for this shard.
+	ai := mheap_.markArenas[shard/(pagesPerArena/pagesPerSpanRoot)]
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+	arenaPage := uint(uintptr(shard) * pagesPerSpanRoot % pagesPerArena)
+
+	// Construct slice of bitmap which we'll iterate over.
+	specialsbits := ha.pageSpecials[arenaPage/8:]
+	specialsbits = specialsbits[:pagesPerSpanRoot/8]
+	for i := range specialsbits {
+		// Find set bits, which correspond to spans with specials.
+		specials := atomic.Load8(&specialsbits[i])
+		if specials == 0 {
+			continue
+		}
+		for j := uint(0); j < 8; j++ {
+			if specials&(1<<j) == 0 {
+				continue
+			}
+			// Find the span for this bit.
+			//
+			// This value is guaranteed to be non-nil because having
+			// specials implies that the span is in-use, and since we're
+			// currently marking we can be sure that we don't have to worry
+			// about the span being freed and re-used.
+			s := ha.spans[arenaPage+uint(i)*8+j]
+
+			// The state must be mSpanInUse if the specials bit is set, so
+			// sanity check that.
+			if state := s.state.get(); state != mSpanInUse {
+				print("s.state = ", state, "\n")
+				throw("non in-use span found with specials bit set")
+			}
+			// Check that this span was swept (it may be cached or uncached).
+			if !useCheckmark && !(s.sweepgen == sg || s.sweepgen == sg+3) {
+				// sweepgen was updated (+2) during non-checkmark GC pass
+				print("sweep ", s.sweepgen, " ", sg, "\n")
+				throw("gc: unswept span")
+			}
+
+			// Lock the specials to prevent a special from being
+			// removed from the list while we're traversing it.
+			lock(&s.speciallock)
+			for sp := s.specials; sp != nil; sp = sp.next {
+				if sp.kind != _KindSpecialFinalizer {
+					continue
+				}
+				// don't mark finalized object, but scan it so we
+				// retain everything it points to.
+				spf := (*specialfinalizer)(unsafe.Pointer(sp))
+				// A finalizer can be set for an inner byte of an object, find object beginning.
+				p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
+
+				// Mark everything that can be reached from
+				// the object (but *not* the object itself or
+				// we'll never collect it).
+				scanobject(p, gcw)
+
+				// The special itself is a root.
+				scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw, nil)
+			}
+			unlock(&s.speciallock)
+		}
+	}
+}
+
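+// To make the shard arithmetic above concrete: on a typical 64-bit
+// linux target (an assumption here: 64 MB heap arenas and 8 KB runtime
+// pages) pagesPerArena is 8192, so each arena is covered by
+// 8192/512 = 16 span-root shards. A standalone sketch of the lookup:
+//
+//	const (
+//		pagesPerArena    = 8192 // assumption: heapArenaBytes / pageSize
+//		pagesPerSpanRoot = 512
+//	)
+//
+//	func spanRootShardLocation(shard int) (arenaIdx int, arenaPage uint) {
+//		arenaIdx = shard / (pagesPerArena / pagesPerSpanRoot) // markArenas index
+//		arenaPage = uint(shard) * pagesPerSpanRoot % pagesPerArena
+//		return
+//	}
+//
+// e.g. spanRootShardLocation(35) == (2, 1536): the 36th shard scans the
+// pageSpecials bits for pages [1536, 2048) of the third marked arena.
+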
+// oldMarkrootSpans marks roots for one shard of work.spans.
+//
+// For go115NewMarkrootSpans = false.
+//
+//go:nowritebarrier
+func oldMarkrootSpans(gcw *gcWork, shard int) {
 	// Objects with finalizers have two GC-related invariants:
 	//
 	// 1) Everything reachable from the object must be marked.
@@ -302,13 +418,21 @@
 	sg := mheap_.sweepgen
 	spans := mheap_.sweepSpans[mheap_.sweepgen/2%2].block(shard)
 	// Note that work.spans may not include spans that were
-	// allocated between entering the scan phase and now. This is
-	// okay because any objects with finalizers in those spans
-	// must have been allocated and given finalizers after we
-	// entered the scan phase, so addfinalizer will have ensured
-	// the above invariants for them.
-	for _, s := range spans {
-		if s.state != mSpanInUse {
+	// allocated between entering the scan phase and now. We may
+	// also race with spans being added into sweepSpans when they're
+	// just created, and as a result we may see nil pointers in the
+	// spans slice. This is okay because any objects with finalizers
+	// in those spans must have been allocated and given finalizers
+	// after we entered the scan phase, so addfinalizer will have
+	// ensured the above invariants for them.
+	for i := 0; i < len(spans); i++ {
+		// sweepBuf.block requires that we read pointers from the block atomically.
+		// It also requires that we ignore nil pointers.
+		s := (*mspan)(atomic.Loadp(unsafe.Pointer(&spans[i])))
+
+		// This is racing with spans being initialized, so
+		// check the state carefully.
+		if s == nil || s.state.get() != mSpanInUse {
 			continue
 		}
 		// Check that this span was swept (it may be cached or uncached).
@@ -654,16 +778,16 @@
 
 // scanstack scans gp's stack, greying all pointers found on the stack.
 //
+// scanstack will also shrink the stack if it is safe to do so. If it
+// is not, it schedules a stack shrink for the next synchronous safe
+// point.
+//
 // scanstack is marked go:systemstack because it must not be preempted
 // while using a workbuf.
 //
 //go:nowritebarrier
 //go:systemstack
 func scanstack(gp *g, gcw *gcWork) {
-	if gp.gcscanvalid {
-		return
-	}
-
 	if readgstatus(gp)&_Gscan == 0 {
 		print("runtime:scanstack: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", hex(readgstatus(gp)), "\n")
 		throw("scanstack - bad status")
@@ -686,8 +810,13 @@
 		throw("can't scan our own stack")
 	}
 
-	// Shrink the stack if not much of it is being used.
-	shrinkstack(gp)
+	if isShrinkStackSafe(gp) {
+		// Shrink the stack if not much of it is being used.
+		shrinkstack(gp)
+	} else {
+		// Otherwise, shrink the stack at the next sync safe point.
+		gp.preemptShrink = true
+	}
 
 	var state stackScanState
 	state.stack = gp.stack
@@ -696,6 +825,10 @@
 		println("stack trace goroutine", gp.goid)
 	}
 
+	if debugScanConservative && gp.asyncSafePoint {
+		print("scanning async preempted goroutine ", gp.goid, " stack [", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n")
+	}
+
 	// Scan the saved context register. This is effectively a live
 	// register that gets moved back and forth between the
 	// register and sched.ctxt without a write barrier.
@@ -737,13 +870,17 @@
 	}
 	if gp._panic != nil {
 		// Panics are always stack allocated.
-		state.putPtr(uintptr(unsafe.Pointer(gp._panic)))
+		state.putPtr(uintptr(unsafe.Pointer(gp._panic)), false)
 	}
 
 	// Find and scan all reachable stack objects.
+	//
+	// The state's pointer queue prioritizes precise pointers over
+	// conservative pointers so that we'll prefer scanning stack
+	// objects precisely.
 	state.buildIndex()
 	for {
-		p := state.getPtr()
+		p, conservative := state.getPtr()
 		if p == 0 {
 			break
 		}
@@ -758,7 +895,13 @@
 		}
 		obj.setType(nil) // Don't scan it again.
 		if stackTraceDebug {
-			println("  live stkobj at", hex(state.stack.lo+uintptr(obj.off)), "of type", t.string())
+			printlock()
+			print("  live stkobj at", hex(state.stack.lo+uintptr(obj.off)), "of type", t.string())
+			if conservative {
+				print(" (conservative)")
+			}
+			println()
+			printunlock()
 		}
 		gcdata := t.gcdata
 		var s *mspan
@@ -776,7 +919,12 @@
 			gcdata = (*byte)(unsafe.Pointer(s.startAddr))
 		}
 
-		scanblock(state.stack.lo+uintptr(obj.off), t.ptrdata, gcdata, gcw, &state)
+		b := state.stack.lo + uintptr(obj.off)
+		if conservative {
+			scanConservative(b, t.ptrdata, gcdata, gcw, &state)
+		} else {
+			scanblock(b, t.ptrdata, gcdata, gcw, &state)
+		}
 
 		if s != nil {
 			dematerializeGCProg(s)
@@ -800,11 +948,9 @@
 		x.nobj = 0
 		putempty((*workbuf)(unsafe.Pointer(x)))
 	}
-	if state.buf != nil || state.freeBuf != nil {
+	if state.buf != nil || state.cbuf != nil || state.freeBuf != nil {
 		throw("remaining pointer buffers")
 	}
-
-	gp.gcscanvalid = true
 }
 
 // Scan a stack frame: local variables and function arguments/results.
@@ -814,6 +960,50 @@
 		print("scanframe ", funcname(frame.fn), "\n")
 	}
 
+	isAsyncPreempt := frame.fn.valid() && frame.fn.funcID == funcID_asyncPreempt
+	isDebugCall := frame.fn.valid() && frame.fn.funcID == funcID_debugCallV1
+	if state.conservative || isAsyncPreempt || isDebugCall {
+		if debugScanConservative {
+			println("conservatively scanning function", funcname(frame.fn), "at PC", hex(frame.continpc))
+		}
+
+		// Conservatively scan the frame. Unlike the precise
+		// case, this includes the outgoing argument space
+		// since we may have stopped while this function was
+		// setting up a call.
+		//
+		// TODO: We could narrow this down if the compiler
+		// produced a single map per function of stack slots
+		// and registers that ever contain a pointer.
+		if frame.varp != 0 {
+			size := frame.varp - frame.sp
+			if size > 0 {
+				scanConservative(frame.sp, size, nil, gcw, state)
+			}
+		}
+
+		// Scan arguments to this frame.
+		if frame.arglen != 0 {
+			// TODO: We could pass the entry argument map
+			// to narrow this down further.
+			scanConservative(frame.argp, frame.arglen, nil, gcw, state)
+		}
+
+		if isAsyncPreempt || isDebugCall {
+			// This function's frame contained the
+			// registers for the asynchronously stopped
+			// parent frame. Scan the parent
+			// conservatively.
+			state.conservative = true
+		} else {
+			// We only wanted to scan those two frames
+			// conservatively. Clear the flag for future
+			// frames.
+			state.conservative = false
+		}
+		return
+	}
+
 	locals, args, objs := getStackMap(frame, &state.cache, false)
 
 	// Scan local variables if stack frame has been allocated.
@@ -879,6 +1069,8 @@
 // credit to gcController.bgScanCredit every gcCreditSlack units of
 // scan work.
 //
+// gcDrain will always return if there is a pending STW.
+//
 //go:nowritebarrier
 func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 	if !writeBarrier.needed {
@@ -907,7 +1099,8 @@
 
 	// Drain root marking jobs.
 	if work.markrootNext < work.markrootJobs {
-		for !(preemptible && gp.preempt) {
+		// Stop if we're preemptible or if someone wants to STW.
+		for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
 			job := atomic.Xadd(&work.markrootNext, +1) - 1
 			if job >= work.markrootJobs {
 				break
@@ -920,7 +1113,8 @@
 	}
 
 	// Drain heap marking jobs.
-	for !(preemptible && gp.preempt) {
+	// Stop if we're preemptible or if someone wants to STW.
+	for !(gp.preempt && (preemptible || atomic.Load(&sched.gcwaiting) != 0)) {
 		// Try to keep work available on the global queue. We used to
 		// check if there were waiting workers, but it's better to
 		// just keep work available than to make workers wait. In the
@@ -1086,7 +1280,7 @@
 					if obj, span, objIndex := findObject(p, b, i); obj != 0 {
 						greyobject(obj, b, i, span, gcw, objIndex)
 					} else if stk != nil && p >= stk.stack.lo && p < stk.stack.hi {
-						stk.putPtr(p)
+						stk.putPtr(p, false)
 					}
 				}
 			}
@@ -1196,6 +1390,101 @@
 	gcw.scanWork += int64(i)
 }
 
+// scanConservative scans block [b, b+n) conservatively, treating any
+// pointer-like value in the block as a pointer.
+//
+// If ptrmask != nil, only words that are marked in ptrmask are
+// considered as potential pointers.
+//
+// If state != nil, it's assumed that [b, b+n) is a block in the stack
+// and may contain pointers to stack objects.
+func scanConservative(b, n uintptr, ptrmask *uint8, gcw *gcWork, state *stackScanState) {
+	if debugScanConservative {
+		printlock()
+		print("conservatively scanning [", hex(b), ",", hex(b+n), ")\n")
+		hexdumpWords(b, b+n, func(p uintptr) byte {
+			if ptrmask != nil {
+				word := (p - b) / sys.PtrSize
+				bits := *addb(ptrmask, word/8)
+				if (bits>>(word%8))&1 == 0 {
+					return '$'
+				}
+			}
+
+			val := *(*uintptr)(unsafe.Pointer(p))
+			if state != nil && state.stack.lo <= val && val < state.stack.hi {
+				return '@'
+			}
+
+			span := spanOfHeap(val)
+			if span == nil {
+				return ' '
+			}
+			idx := span.objIndex(val)
+			if span.isFree(idx) {
+				return ' '
+			}
+			return '*'
+		})
+		printunlock()
+	}
+
+	for i := uintptr(0); i < n; i += sys.PtrSize {
+		if ptrmask != nil {
+			word := i / sys.PtrSize
+			bits := *addb(ptrmask, word/8)
+			if bits == 0 {
+				// Skip 8 words (the loop increment will do the 8th)
+				//
+				// This must be the first time we've
+				// seen this word of ptrmask, so i
+				// must be 8-word-aligned, but check
+				// our reasoning just in case.
+				if i%(sys.PtrSize*8) != 0 {
+					throw("misaligned mask")
+				}
+				i += sys.PtrSize*8 - sys.PtrSize
+				continue
+			}
+			if (bits>>(word%8))&1 == 0 {
+				continue
+			}
+		}
+
+		val := *(*uintptr)(unsafe.Pointer(b + i))
+
+		// Check if val points into the stack.
+		if state != nil && state.stack.lo <= val && val < state.stack.hi {
+			// val may point to a stack object. This
+			// object may be dead from last cycle and
+			// hence may contain pointers to unallocated
+			// objects, but unlike heap objects we can't
+			// tell if it's already dead. Hence, if all
+			// pointers to this object are from
+			// conservative scanning, we have to scan it
+			// defensively, too.
+			state.putPtr(val, true)
+			continue
+		}
+
+		// Check if val points to a heap span.
+		span := spanOfHeap(val)
+		if span == nil {
+			continue
+		}
+
+		// Check if val points to an allocated object.
+		idx := span.objIndex(val)
+		if span.isFree(idx) {
+			continue
+		}
+
+		// val points to an allocated object. Mark it.
+		obj := span.base() + idx*span.elemsize
+		greyobject(obj, b, i, span, gcw, idx)
+	}
+}
+
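// The per-word decision scanConservative makes can be illustrated with a
// small standalone sketch: a word is treated as a pointer only if it lands
// within the goroutine's stack bounds or inside an allocated heap object.
// stackBounds, heapObject, and classifyWord below are made-up illustrations,
// not runtime types or APIs.
package main

import "fmt"

type stackBounds struct{ lo, hi uintptr }
type heapObject struct{ base, size uintptr }

// classifyWord mirrors the three outcomes above: possible stack pointer
// (queued and scanned defensively), heap pointer (greyed), or not a pointer.
func classifyWord(val uintptr, stk stackBounds, heap []heapObject) string {
	if stk.lo <= val && val < stk.hi {
		return "possible stack pointer: queue it and scan the stack object conservatively"
	}
	for _, obj := range heap {
		if obj.base <= val && val < obj.base+obj.size {
			return "heap pointer: grey the object"
		}
	}
	return "not a pointer: ignore"
}

func main() {
	stk := stackBounds{lo: 0x1000, hi: 0x2000}       // example stack bounds
	heap := []heapObject{{base: 0x9000, size: 64}}   // one allocated object
	for _, v := range []uintptr{0x1010, 0x9020, 0x42} {
		fmt.Printf("%#x -> %s\n", v, classifyWord(v, stk, heap))
	}
}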
 // Shade the object if it isn't already.
 // The object is not nil and known to be in the heap.
 // Preemption must be disabled.
@@ -1294,15 +1583,15 @@
 		return
 	}
 	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, " s.state=")
-	if 0 <= s.state && int(s.state) < len(mSpanStateNames) {
-		print(mSpanStateNames[s.state], "\n")
+	if state := s.state.get(); 0 <= state && int(state) < len(mSpanStateNames) {
+		print(mSpanStateNames[state], "\n")
 	} else {
-		print("unknown(", s.state, ")\n")
+		print("unknown(", state, ")\n")
 	}
 
 	skipped := false
 	size := s.elemsize
-	if s.state == mSpanManual && size == 0 {
+	if s.state.get() == mSpanManual && size == 0 {
 		// We're printing something from a stack frame. We
 		// don't know how big it is, so just show up to and
 		// including off.
@@ -1338,11 +1627,21 @@
 //
 //go:nowritebarrier
 //go:nosplit
-func gcmarknewobject(obj, size, scanSize uintptr) {
+func gcmarknewobject(span *mspan, obj, size, scanSize uintptr) {
 	if useCheckmark { // The world should be stopped so this should not happen.
 		throw("gcmarknewobject called while doing checkmark")
 	}
-	markBitsForAddr(obj).setMarked()
+
+	// Mark object.
+	objIndex := span.objIndex(obj)
+	span.markBitsForIndex(objIndex).setMarked()
+
+	// Mark span.
+	arena, pageIdx, pageMask := pageIndexOf(span.base())
+	if arena.pageMarks[pageIdx]&pageMask == 0 {
+		atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
+	}
+
 	gcw := &getg().m.p.ptr().gcw
 	gcw.bytesMarked += uint64(size)
 	gcw.scanWork += int64(scanSize)
@@ -1390,7 +1689,7 @@
 func initCheckmarks() {
 	useCheckmark = true
 	for _, s := range mheap_.allspans {
-		if s.state == mSpanInUse {
+		if s.state.get() == mSpanInUse {
 			heapBitsForAddr(s.base()).initCheckmarkSpan(s.layout())
 		}
 	}
@@ -1399,7 +1698,7 @@
 func clearCheckmarks() {
 	useCheckmark = false
 	for _, s := range mheap_.allspans {
-		if s.state == mSpanInUse {
+		if s.state.get() == mSpanInUse {
 			heapBitsForAddr(s.base()).clearCheckmarkSpan(s.layout())
 		}
 	}
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 45a9eb2..b74da10 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -17,7 +17,29 @@
 // scavenger's primary goal is to bring the estimated heap RSS of the
 // application down to a goal.
 //
-// That goal is defined as (retainExtraPercent+100) / 100 * next_gc.
+// That goal is defined as:
+//   (retainExtraPercent+100) / 100 * (next_gc / last_next_gc) * last_heap_inuse
+//
+// Essentially, we wish to have the application's RSS track the heap goal, but
+// the heap goal is defined in terms of bytes of objects, rather than pages like
+// RSS. As a result, we need to account for fragmentation internal to
+// spans. next_gc / last_next_gc defines the ratio between the current heap goal
+// and the last heap goal, which tells us by how much the heap is growing and
+// shrinking. We estimate what the heap will grow to in terms of pages by taking
+// this ratio and multiplying it by heap_inuse at the end of the last GC, which
+// allows us to account for this additional fragmentation. Note that this
+// procedure makes the assumption that the degree of fragmentation won't change
+// dramatically over the next GC cycle. Overestimating the amount of
+// fragmentation simply results in higher memory use, which will be accounted
+// for by the next pacing update. Underestimating the fragmentation, however,
+// could lead to performance degradation. Handling this case is not within the
+// scope of the scavenger. Situations where the amount of fragmentation balloons
+// over the course of a single GC cycle should be considered pathologies,
+// flagged as bugs, and fixed appropriately.
+//
+// An additional factor of retainExtraPercent is added as a buffer to help ensure
+// that there's more unscavenged memory to allocate out of, since each allocation
+// out of scavenged memory incurs a potentially expensive page fault.
 //
 // The goal is updated after each GC and the scavenger's pacing parameters
 // (which live in mheap_) are updated to match. The pacing parameters work much
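// As a rough illustration of the goal formula above, the following standalone
// sketch plugs in made-up example values (a 120 MiB current heap goal, a
// 100 MiB previous goal, and 110 MiB of heap_inuse at the end of the last GC);
// it only mirrors the arithmetic and is not runtime code.
package main

import "fmt"

func main() {
	const (
		mib                = 1 << 20
		nextGC             = 120 * mib // current heap goal (example value)
		lastNextGC         = 100 * mib // previous heap goal (example value)
		lastHeapInuse      = 110 * mib // heap_inuse after the last GC (example value)
		retainExtraPercent = 10
	)
	// goal = (retainExtraPercent+100)/100 * (next_gc/last_next_gc) * last_heap_inuse
	goalRatio := float64(nextGC) / float64(lastNextGC)
	retainedGoal := uint64(float64(lastHeapInuse) * goalRatio)
	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))
	fmt.Println("retained RSS goal (MiB):", retainedGoal/mib) // 145
}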
@@ -33,25 +55,18 @@
 
 package runtime
 
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
 const (
 	// The background scavenger is paced according to these parameters.
 	//
 	// scavengePercent represents the portion of mutator time we're willing
 	// to spend on scavenging in percent.
-	//
-	// scavengePageLatency is a worst-case estimate (order-of-magnitude) of
-	// the time it takes to scavenge one (regular-sized) page of memory.
-	// scavengeHugePageLatency is the same but for huge pages.
-	//
-	// scavengePagePeriod is derived from scavengePercent and scavengePageLatency,
-	// and represents the average time between scavenging one page that we're
-	// aiming for. scavengeHugePagePeriod is the same but for huge pages.
-	// These constants are core to the scavenge pacing algorithm.
-	scavengePercent         = 1    // 1%
-	scavengePageLatency     = 10e3 // 10µs
-	scavengeHugePageLatency = 10e3 // 10µs
-	scavengePagePeriod      = scavengePageLatency / (scavengePercent / 100.0)
-	scavengeHugePagePeriod  = scavengePageLatency / (scavengePercent / 100.0)
+	scavengePercent = 1 // 1%
 
 	// retainExtraPercent represents the amount of memory over the heap goal
 	// that the scavenger should keep as a buffer space for the allocator.
@@ -61,34 +76,62 @@
 	// incurs an additional cost), to account for heap fragmentation and
 	// the ever-changing layout of the heap.
 	retainExtraPercent = 10
+
+	// maxPagesPerPhysPage is the maximum number of supported runtime pages per
+	// physical page, based on maxPhysPageSize.
+	maxPagesPerPhysPage = maxPhysPageSize / pageSize
+
+	// scavengeCostRatio is the approximate ratio between the costs of using previously
+	// scavenged memory and scavenging memory.
+	//
+	// For most systems the cost of scavenging greatly outweighs the costs
+	// associated with using scavenged memory, making this constant 0. On other systems
+	// (especially ones where "sysUsed" is not just a no-op) this cost is non-trivial.
+	//
+	// This ratio is used as part of multiplicative factor to help the scavenger account
+	// for the additional costs of using scavenged memory in its pacing.
+	scavengeCostRatio = 0.7 * sys.GoosDarwin
+
+	// scavengeReservationShards determines the amount of memory the scavenger
+	// should reserve for scavenging at a time. Specifically, the amount of
+	// memory reserved is (heap size in bytes) / scavengeReservationShards.
+	scavengeReservationShards = 64
 )
 
 // heapRetained returns an estimate of the current heap RSS.
-//
-// mheap_.lock must be held or the world must be stopped.
 func heapRetained() uint64 {
-	return memstats.heap_sys - memstats.heap_released
+	return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released)
 }
 
 // gcPaceScavenger updates the scavenger's pacing, particularly
 // its rate and RSS goal.
 //
 // The RSS goal is based on the current heap goal with a small overhead
-// to accomodate non-determinism in the allocator.
+// to accommodate non-determinism in the allocator.
 //
 // The pacing is based on scavengePageRate, which applies to both regular and
 // huge pages. See that constant for more information.
 //
 // mheap_.lock must be held or the world must be stopped.
 func gcPaceScavenger() {
-	// Compute our scavenging goal and align it to a physical page boundary
-	// to make the following calculations more exact.
-	retainedGoal := memstats.next_gc
+	// If we're called before the first GC completed, disable scavenging.
+	// We never scavenge before the 2nd GC cycle anyway (we don't have enough
+	// information about the heap yet) so this is fine, and avoids a fault
+	// or garbage data later.
+	if memstats.last_next_gc == 0 {
+		mheap_.scavengeGoal = ^uint64(0)
+		return
+	}
+	// Compute our scavenging goal.
+	goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc)
+	retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
 	// Add retainExtraPercent overhead to retainedGoal. This calculation
 	// looks strange but the purpose is to arrive at an integer division
 	// (e.g. if retainExtraPercent = 12.5, then we get a divisor of 8)
 	// that also avoids the overflow from a multiplication.
 	retainedGoal += retainedGoal / (1.0 / (retainExtraPercent / 100.0))
+	// Align it to a physical page boundary to make the following calculations
+	// a bit more exact.
 	retainedGoal = (retainedGoal + uint64(physPageSize) - 1) &^ (uint64(physPageSize) - 1)
 
 	// Represents where we are now in the heap's contribution to RSS in bytes.
@@ -104,87 +147,47 @@
 	// physical page.
 	retainedNow := heapRetained()
 
-	// If we're already below our goal, publish the goal in case it changed
-	// then disable the background scavenger.
-	if retainedNow <= retainedGoal {
-		mheap_.scavengeRetainedGoal = retainedGoal
-		mheap_.scavengeBytesPerNS = 0
+	// If we're already below our goal, or within one page of our goal, then disable
+	// the background scavenger. We disable the background scavenger if there's
+	// less than one physical page of work to do because it's not worth it.
+	if retainedNow <= retainedGoal || retainedNow-retainedGoal < uint64(physPageSize) {
+		mheap_.scavengeGoal = ^uint64(0)
 		return
 	}
-
-	// Now we start to compute the total amount of work necessary and the total
-	// amount of time we're willing to give the scavenger to complete this work.
-	// This will involve calculating how much of the work consists of huge pages
-	// and how much consists of regular pages since the former can let us scavenge
-	// more memory in the same time.
-	totalWork := retainedNow - retainedGoal
-
-	// On systems without huge page support, all work is regular work.
-	regularWork := totalWork
-	hugeTime := uint64(0)
-
-	// On systems where we have huge pages, we want to do as much of the
-	// scavenging work as possible on huge pages, because the costs are the
-	// same per page, but we can give back more more memory in a shorter
-	// period of time.
-	if physHugePageSize != 0 {
-		// Start by computing the amount of free memory we have in huge pages
-		// in total. Trivially, this is all the huge page work we need to do.
-		hugeWork := uint64(mheap_.free.unscavHugePages) << physHugePageShift
-
-		// ...but it could turn out that there's more huge work to do than
-		// total work, so cap it at total work. This might happen for very large
-		// heaps where the additional factor of retainExtraPercent can make it so
-		// that there are free chunks of memory larger than a huge page that we don't want
-		// to scavenge.
-		if hugeWork >= totalWork {
-			hugePages := totalWork >> physHugePageShift
-			hugeWork = hugePages << physHugePageShift
-		}
-		// Everything that's not huge work is regular work. At this point we
-		// know huge work so we can calculate how much time that will take
-		// based on scavengePageRate (which applies to pages of any size).
-		regularWork = totalWork - hugeWork
-		hugeTime = (hugeWork >> physHugePageShift) * scavengeHugePagePeriod
-	}
-	// Finally, we can compute how much time it'll take to do the regular work
-	// and the total time to do all the work.
-	regularTime := regularWork / uint64(physPageSize) * scavengePagePeriod
-	totalTime := hugeTime + regularTime
-
-	now := nanotime()
-
-	lock(&scavenge.lock)
-
-	// Update all the pacing parameters in mheap with scavenge.lock held,
-	// so that scavenge.gen is kept in sync with the updated values.
-	mheap_.scavengeRetainedGoal = retainedGoal
-	mheap_.scavengeRetainedBasis = retainedNow
-	mheap_.scavengeTimeBasis = now
-	mheap_.scavengeBytesPerNS = float64(totalWork) / float64(totalTime)
-	scavenge.gen++ // increase scavenge generation
-
-	// Wake up background scavenger if needed, since the pacing was just updated.
-	wakeScavengerLocked()
-
-	unlock(&scavenge.lock)
+	mheap_.scavengeGoal = retainedGoal
 }
 
-// State of the background scavenger.
+// Sleep/wait state of the background scavenger.
 var scavenge struct {
-	lock   mutex
-	g      *g
-	parked bool
-	timer  *timer
-	gen    uint32 // read with either lock or mheap_.lock, write with both
+	lock       mutex
+	g          *g
+	parked     bool
+	timer      *timer
+	sysmonWake uint32 // Set atomically.
 }
 
-// wakeScavengerLocked unparks the scavenger if necessary. It must be called
-// after any pacing update.
+// readyForScavenger signals sysmon to wake the scavenger because
+// there may be new work to do.
 //
-// scavenge.lock must be held.
-func wakeScavengerLocked() {
+// There may be a significant delay between when this function runs
+// and when the scavenger is kicked awake, but it may be safely invoked
+// in contexts where wakeScavenger is unsafe to call directly.
+func readyForScavenger() {
+	atomic.Store(&scavenge.sysmonWake, 1)
+}
+
+// wakeScavenger immediately unparks the scavenger if necessary.
+//
+// May run without a P, but it may allocate, so it must not be called
+// on any allocation path.
+//
+// mheap_.lock, scavenge.lock, and sched.lock must not be held.
+func wakeScavenger() {
+	lock(&scavenge.lock)
 	if scavenge.parked {
+		// Notify sysmon that it shouldn't bother waking up the scavenger.
+		atomic.Store(&scavenge.sysmonWake, 0)
+
 		// Try to stop the timer but we don't really care if we succeed.
 		// It's possible that either a timer was never started, or that
 		// we're racing with it.
@@ -194,45 +197,51 @@
 		stopTimer(scavenge.timer)
 
 		// Unpark the goroutine and tell it that there may have been a pacing
-		// change.
+		// change. Note that we skip the scheduler's runnext slot because we
+		// want to avoid having the scavenger interfere with the fair
+		// scheduling of user goroutines. In effect, this schedules the
+		// scavenger at a "lower priority" but that's OK because it'll
+		// catch up on the work it missed when it does get scheduled.
 		scavenge.parked = false
-		ready(scavenge.g, 0, true)
+
+		// Ready the goroutine by injecting it. We use injectglist instead
+		// of ready or goready in order to allow us to run this function
+		// without a P. injectglist also avoids placing the goroutine in
+		// the current P's runnext slot, which is desirable to prevent
+		// the scavenger from interfering with user goroutine scheduling
+		// too much.
+		var list gList
+		list.push(scavenge.g)
+		injectglist(&list)
 	}
+	unlock(&scavenge.lock)
 }
 
 // scavengeSleep attempts to put the scavenger to sleep for ns.
-// It also checks to see if gen != scavenge.gen before going to sleep,
-// and aborts if true (meaning an update had occurred).
 //
 // Note that this function should only be called by the scavenger.
 //
 // The scavenger may be woken up earlier by a pacing change, and it may not go
 // to sleep at all if there's a pending pacing change.
 //
-// Returns false if awoken early (i.e. true means a complete sleep).
-func scavengeSleep(gen uint32, ns int64) bool {
+// Returns the amount of time actually slept.
+func scavengeSleep(ns int64) int64 {
 	lock(&scavenge.lock)
 
-	// If there was an update, just abort the sleep.
-	if scavenge.gen != gen {
-		unlock(&scavenge.lock)
-		return false
-	}
-
 	// Set the timer.
-	now := nanotime()
-	scavenge.timer.when = now + ns
-	startTimer(scavenge.timer)
+	//
+	// This must happen here instead of inside gopark
+	// because we can't close over any variables without
+	// failing escape analysis.
+	start := nanotime()
+	resetTimer(scavenge.timer, start+ns)
 
-	// Park the goroutine. It's fine that we don't publish the
-	// fact that the timer was set; even if the timer wakes up
-	// and fire scavengeReady before we park, it'll block on
-	// scavenge.lock.
+	// Mark ourself as asleep and go to sleep.
 	scavenge.parked = true
 	goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)
 
-	// Return true if we completed the full sleep.
-	return (nanotime() - now) >= ns
+	// Return how long we actually slept for.
+	return nanotime() - start
 }
 
 // Background scavenger.
@@ -243,123 +252,676 @@
 func bgscavenge(c chan int) {
 	scavenge.g = getg()
 
+	lockInit(&scavenge.lock, lockRankScavenge)
 	lock(&scavenge.lock)
 	scavenge.parked = true
 
 	scavenge.timer = new(timer)
 	scavenge.timer.f = func(_ interface{}, _ uintptr) {
-		lock(&scavenge.lock)
-		wakeScavengerLocked()
-		unlock(&scavenge.lock)
+		wakeScavenger()
 	}
 
 	c <- 1
 	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
 
-	// Parameters for sleeping.
+	// Exponentially-weighted moving average of the fraction of time this
+	// goroutine spends scavenging (that is, percent of a single CPU).
+	// It represents a measure of scheduling overheads which might extend
+	// the sleep or the critical time beyond what's expected. Assume no
+	// overhead to begin with.
 	//
-	// If we end up doing more work than we need, we should avoid spinning
-	// until we have more work to do: instead, we know exactly how much time
-	// until more work will need to be done, so we sleep.
-	//
-	// We should avoid sleeping for less than minSleepNS because Gosched()
-	// overheads among other things will work out better in that case.
-	//
-	// There's no reason to set a maximum on sleep time because we'll always
-	// get woken up earlier if there's any kind of update that could change
-	// the scavenger's pacing.
-	//
-	// retryDelayNS tracks how much to sleep next time we fail to do any
-	// useful work.
-	const minSleepNS = int64(100 * 1000) // 100 µs
-
-	retryDelayNS := minSleepNS
+	// TODO(mknyszek): Consider making this based on total CPU time of the
+	// application (i.e. scavengePercent * GOMAXPROCS). This isn't really
+	// feasible now because the scavenger acquires the heap lock over the
+	// scavenging operation, which means scavenging effectively blocks
+	// allocators and isn't scalable. However, given a scalable allocator,
+	// it makes sense to also make the scavenger scale with it; if you're
+	// allocating more frequently, then presumably you're also generating
+	// more work for the scavenger.
+	const idealFraction = scavengePercent / 100.0
+	scavengeEWMA := float64(idealFraction)
 
 	for {
 		released := uintptr(0)
-		park := false
-		ttnext := int64(0)
-		gen := uint32(0)
+
+		// Time in scavenging critical section.
+		crit := float64(0)
 
 		// Run on the system stack since we grab the heap lock,
 		// and a stack growth with the heap lock means a deadlock.
 		systemstack(func() {
 			lock(&mheap_.lock)
 
-			gen = scavenge.gen
-
 			// If background scavenging is disabled or if there's no work to do just park.
-			retained := heapRetained()
-			if mheap_.scavengeBytesPerNS == 0 || retained <= mheap_.scavengeRetainedGoal {
+			retained, goal := heapRetained(), mheap_.scavengeGoal
+			if retained <= goal {
 				unlock(&mheap_.lock)
-				park = true
 				return
 			}
 
-			// Calculate how big we want the retained heap to be
-			// at this point in time.
-			//
-			// The formula is for that of a line, y = b - mx
-			// We want y (want),
-			//   m = scavengeBytesPerNS (> 0)
-			//   x = time between scavengeTimeBasis and now
-			//   b = scavengeRetainedBasis
-			rate := mheap_.scavengeBytesPerNS
-			tdist := nanotime() - mheap_.scavengeTimeBasis
-			rdist := uint64(rate * float64(tdist))
-			want := mheap_.scavengeRetainedBasis - rdist
+			// Scavenge one page, and measure the amount of time spent scavenging.
+			start := nanotime()
+			released = mheap_.pages.scavenge(physPageSize, true)
+			mheap_.pages.scav.released += released
+			crit = float64(nanotime() - start)
 
-			// If we're above the line, scavenge to get below the
-			// line.
-			if retained > want {
-				released = mheap_.scavengeLocked(uintptr(retained - want))
-			}
 			unlock(&mheap_.lock)
-
-			// If we over-scavenged a bit, calculate how much time it'll
-			// take at the current rate for us to make that up. We definitely
-			// won't have any work to do until at least that amount of time
-			// passes.
-			if released > uintptr(retained-want) {
-				extra := released - uintptr(retained-want)
-				ttnext = int64(float64(extra) / rate)
-			}
 		})
 
-		if park {
+		if released == 0 {
 			lock(&scavenge.lock)
 			scavenge.parked = true
 			goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
 			continue
 		}
 
-		if debug.gctrace > 0 {
-			if released > 0 {
-				print("scvg: ", released>>20, " MB released\n")
-			}
-			print("scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
+		if released < physPageSize {
+			// If this happens, it means that we may have attempted to release part
+			// of a physical page, but the likely effect of that is that it released
+			// the whole physical page, some of which may have still been in-use.
+			// This could lead to memory corruption. Throw.
+			throw("released less than one physical page of memory")
 		}
 
-		if released == 0 {
-			// If we were unable to release anything this may be because there's
-			// no free memory available to scavenge. Go to sleep and try again.
-			if scavengeSleep(gen, retryDelayNS) {
-				// If we successfully slept through the delay, back off exponentially.
-				retryDelayNS *= 2
-			}
-			continue
-		}
-		retryDelayNS = minSleepNS
-
-		if ttnext > 0 && ttnext > minSleepNS {
-			// If there's an appreciable amount of time until the next scavenging
-			// goal, just sleep. We'll get woken up if anything changes and this
-			// way we avoid spinning.
-			scavengeSleep(gen, ttnext)
-			continue
+		// On some platforms we may see crit as zero if the time it takes to scavenge
+		// memory is less than the minimum granularity of its clock (e.g. Windows).
+		// In this case, just assume scavenging takes 10 µs per regular physical page
+		// (determined empirically), and conservatively ignore the impact of huge pages
+		// on timing.
+		//
+		// We shouldn't ever see a crit value less than zero unless there's a bug of
+		// some kind, either on our side or in the platform we're running on, but be
+		// defensive in that case as well.
+		const approxCritNSPerPhysicalPage = 10e3
+		if crit <= 0 {
+			crit = approxCritNSPerPhysicalPage * float64(released/physPageSize)
 		}
 
-		// Give something else a chance to run, no locks are held.
-		Gosched()
+		// Multiply the critical time by 1 + the ratio of the costs of using
+		// scavenged memory vs. scavenging memory. This forces us to pay down
+		// the cost of reusing this memory eagerly by sleeping for a longer period
+		// of time and scavenging less frequently. More concretely, we avoid situations
+		// where we end up scavenging so often that we hurt allocation performance
+		// because of the additional overheads of using scavenged memory.
+		crit *= 1 + scavengeCostRatio
+
+		// If we spent more than 10 ms (for example, if the OS scheduled us away, or someone
+		// put their machine to sleep) in the critical section, cap the time we use in
+		// the calculation at 10 ms to avoid letting the sleep time get arbitrarily high.
+		const maxCrit = 10e6
+		if crit > maxCrit {
+			crit = maxCrit
+		}
+
+		// Compute the amount of time to sleep, assuming we want to use at most
+		// scavengePercent of CPU time. Take into account scheduling overheads
+		// that may extend the length of our sleep by multiplying by how far
+		// off we are from the ideal ratio. For example, if we're sleeping too
+		// much, then scavengeEWMA < idealFraction, so we'll adjust the sleep time
+		// down.
+		adjust := scavengeEWMA / idealFraction
+		sleepTime := int64(adjust * crit / (scavengePercent / 100.0))
+
+		// Go to sleep.
+		slept := scavengeSleep(sleepTime)
+
+		// Compute the new ratio.
+		fraction := crit / (crit + float64(slept))
+
+		// Set a lower bound on the fraction.
+		// Due to OS-related anomalies we may "sleep" for an inordinate amount
+		// of time. Let's avoid letting the ratio get out of hand by bounding
+		// the sleep time we use in our EWMA.
+		const minFraction = 1.0 / 1000 // use a floating-point constant; 1 / 1000 is integer division and yields 0
+		if fraction < minFraction {
+			fraction = minFraction
+		}
+
+		// Update scavengeEWMA by merging in the new crit/slept ratio.
+		const alpha = 0.5
+		scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
 	}
 }
+
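// The pacing arithmetic above can be illustrated with a standalone sketch
// using made-up numbers: a 30µs critical section, an actual sleep of 4ms,
// and a zero scavengeCostRatio (a Linux-like system where sysUsed is a
// no-op). None of this is runtime code; it only mirrors the formulas.
package main

import "fmt"

func main() {
	const (
		scavengePercent   = 1
		idealFraction     = scavengePercent / 100.0
		scavengeCostRatio = 0.0 // assumed: no extra cost for reusing scavenged memory
		alpha             = 0.5
	)
	scavengeEWMA := float64(idealFraction)

	// One scavenge critical section took 30µs (example value).
	crit := 30e3 * (1 + scavengeCostRatio)

	// Sleep long enough that the critical section is ~scavengePercent of
	// wall time, adjusted by how far the EWMA is from the ideal fraction.
	adjust := scavengeEWMA / idealFraction
	sleepTime := int64(adjust * crit / (scavengePercent / 100.0))
	fmt.Println("sleep ns:", sleepTime) // 3000000 (3ms)

	// Suppose scheduling delays stretched the sleep to 4ms; fold the
	// observed fraction back into the EWMA.
	slept := int64(4e6)
	fraction := crit / (crit + float64(slept))
	scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
	fmt.Printf("new EWMA: %.4f\n", scavengeEWMA) // 0.0087
}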
+// scavenge scavenges nbytes worth of free pages, starting with the
+// highest address first. Successive calls continue from where it left
+// off until the heap is exhausted. Call scavengeStartGen to bring it
+// back to the top of the heap.
+//
+// Returns the amount of memory scavenged in bytes.
+//
+// s.mheapLock must be held, but may be temporarily released if
+// mayUnlock == true.
+//
+// Must run on the system stack because s.mheapLock must be held.
+//
+//go:systemstack
+func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
+	var (
+		addrs addrRange
+		gen   uint32
+	)
+	released := uintptr(0)
+	for released < nbytes {
+		if addrs.size() == 0 {
+			if addrs, gen = s.scavengeReserve(); addrs.size() == 0 {
+				break
+			}
+		}
+		r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock)
+		released += r
+		addrs = a
+	}
+	// Only unreserve the space which hasn't been scavenged or searched
+	// to ensure we always make progress.
+	s.scavengeUnreserve(addrs, gen)
+	return released
+}
+
+// printScavTrace prints a scavenge trace line to standard error.
+//
+// released should be the amount of memory released since the last time this
+// was called, and forced indicates whether the scavenge was forced by the
+// application.
+func printScavTrace(gen uint32, released uintptr, forced bool) {
+	printlock()
+	print("scav ", gen, " ",
+		released>>10, " KiB work, ",
+		atomic.Load64(&memstats.heap_released)>>10, " KiB total, ",
+		(atomic.Load64(&memstats.heap_inuse)*100)/heapRetained(), "% util",
+	)
+	if forced {
+		print(" (forced)")
+	}
+	println()
+	printunlock()
+}
+
+// scavengeStartGen starts a new scavenge generation, resetting
+// the scavenger's search space to the full in-use address space.
+//
+// s.mheapLock must be held.
+//
+// Must run on the system stack because s.mheapLock must be held.
+//
+//go:systemstack
+func (s *pageAlloc) scavengeStartGen() {
+	if debug.scavtrace > 0 {
+		printScavTrace(s.scav.gen, s.scav.released, false)
+	}
+	s.inUse.cloneInto(&s.scav.inUse)
+
+	// Pick the new starting address for the scavenger cycle.
+	var startAddr offAddr
+	if s.scav.scavLWM.lessThan(s.scav.freeHWM) {
+		// The "free" high watermark exceeds the "scavenged" low watermark,
+		// so there are free scavengable pages in parts of the address space
+		// that the scavenger already searched, the high watermark being the
+		// highest one. Pick that as our new starting point to ensure we
+		// see those pages.
+		startAddr = s.scav.freeHWM
+	} else {
+		// The "free" high watermark does not exceed the "scavenged" low
+		// watermark. This means the allocator didn't free any memory in
+		// the range we scavenged last cycle, so we might as well continue
+		// scavenging from where we were.
+		startAddr = s.scav.scavLWM
+	}
+	s.scav.inUse.removeGreaterEqual(startAddr.addr())
+
+	// reservationBytes may be zero if s.inUse.totalBytes is small, or if
+	// scavengeReservationShards is large. This case is fine as the scavenger
+	// will simply be turned off, but it does mean that scavengeReservationShards,
+	// in concert with pallocChunkBytes, dictates the minimum heap size at which
+	// the scavenger triggers. In practice this minimum is generally less than an
+	// arena in size, so virtually every heap has the scavenger on.
+	s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
+	s.scav.gen++
+	s.scav.released = 0
+	s.scav.freeHWM = minOffAddr
+	s.scav.scavLWM = maxOffAddr
+}
+
+// scavengeReserve reserves a contiguous range of the address space
+// for scavenging. The maximum amount of space it reserves is proportional
+// to the size of the heap. The ranges are reserved from the high addresses
+// first.
+//
+// Returns the reserved range and the scavenge generation number for it.
+//
+// s.mheapLock must be held.
+//
+// Must run on the system stack because s.mheapLock must be held.
+//
+//go:systemstack
+func (s *pageAlloc) scavengeReserve() (addrRange, uint32) {
+	// Start by reserving the minimum.
+	r := s.scav.inUse.removeLast(s.scav.reservationBytes)
+
+	// Return early if the size is zero; we don't want to use
+	// the bogus address below.
+	if r.size() == 0 {
+		return r, s.scav.gen
+	}
+
+	// The scavenger requires that base be aligned to a
+	// palloc chunk because that's the unit of operation for
+	// the scavenger, so align down, potentially extending
+	// the range.
+	newBase := alignDown(r.base.addr(), pallocChunkBytes)
+
+	// Remove from inUse however much extra we just pulled out.
+	s.scav.inUse.removeGreaterEqual(newBase)
+	r.base = offAddr{newBase}
+	return r, s.scav.gen
+}
+
+// scavengeUnreserve returns an unscavenged portion of a range that was
+// previously reserved with scavengeReserve.
+//
+// s.mheapLock must be held.
+//
+// Must run on the system stack because s.mheapLock must be held.
+//
+//go:systemstack
+func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
+	if r.size() == 0 || gen != s.scav.gen {
+		return
+	}
+	if r.base.addr()%pallocChunkBytes != 0 {
+		throw("unreserving unaligned region")
+	}
+	s.scav.inUse.add(r)
+}
+
+// scavengeOne walks over address range work until it finds
+// a contiguous run of pages to scavenge. It will try to scavenge
+// at most max bytes at once, but may scavenge more to avoid
+// breaking huge pages. Once it scavenges some memory it returns
+// how much it scavenged in bytes.
+//
+// Returns the number of bytes scavenged and the part of work
+// which was not yet searched.
+//
+// work's base address must be aligned to pallocChunkBytes.
+//
+// s.mheapLock must be held, but may be temporarily released if
+// mayUnlock == true.
+//
+// Must run on the system stack because s.mheapLock must be held.
+//
+//go:systemstack
+func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
+	// Defensively check if we've received an empty address range.
+	// If so, just return.
+	if work.size() == 0 {
+		// Nothing to do.
+		return 0, work
+	}
+	// Check the prerequisites of work.
+	if work.base.addr()%pallocChunkBytes != 0 {
+		throw("scavengeOne called with unaligned work region")
+	}
+	// Calculate the maximum number of pages to scavenge.
+	//
+	// This should be alignUp(max, pageSize) / pageSize but max can and will
+	// be ^uintptr(0), so we need to be very careful not to overflow here.
+	// Rather than use alignUp, calculate the number of pages rounded down
+	// first, then add back one if necessary.
+	maxPages := max / pageSize
+	if max%pageSize != 0 {
+		maxPages++
+	}
+
+	// Calculate the minimum number of pages we can scavenge.
+	//
+	// Because we can only scavenge whole physical pages, we must
+	// ensure that we scavenge at least minPages each time, aligned
+	// to minPages*pageSize.
+	minPages := physPageSize / pageSize
+	if minPages < 1 {
+		minPages = 1
+	}
+
+	// Helpers for locking and unlocking only if mayUnlock == true.
+	lockHeap := func() {
+		if mayUnlock {
+			lock(s.mheapLock)
+		}
+	}
+	unlockHeap := func() {
+		if mayUnlock {
+			unlock(s.mheapLock)
+		}
+	}
+
+	// Fast path: check the chunk containing the top-most address in work,
+	// starting at that address's page index in the chunk.
+	//
+	// Note that work.end() is exclusive, so get the chunk we care about
+	// by subtracting 1.
+	maxAddr := work.limit.addr() - 1
+	maxChunk := chunkIndex(maxAddr)
+	if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) {
+		// We only bother looking for a candidate if there are at least
+		// minPages free pages at all.
+		base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
+
+		// If we found something, scavenge it and return!
+		if npages != 0 {
+			work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)}
+			return uintptr(npages) * pageSize, work
+		}
+	}
+	// Update the limit to reflect the fact that we checked maxChunk already.
+	work.limit = offAddr{chunkBase(maxChunk)}
+
+	// findCandidate finds the next scavenge candidate in work optimistically.
+	//
+	// Returns the candidate chunk index and true on success, and false on failure.
+	//
+	// The heap need not be locked.
+	findCandidate := func(work addrRange) (chunkIdx, bool) {
+		// Iterate over this work's chunks.
+		for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- {
+			// If this chunk is totally in-use or has no unscavenged pages, don't bother
+			// doing a more sophisticated check.
+			//
+			// Note we're accessing the summary and the chunks without a lock, but
+			// that's fine. We're being optimistic anyway.
+
+			// Check quickly if there are enough free pages at all.
+			if s.summary[len(s.summary)-1][i].max() < uint(minPages) {
+				continue
+			}
+
+			// Run over the chunk looking harder for a candidate. Again, we could
+			// race with a lot of different pieces of code, but we're just being
+			// optimistic. Make sure we load the l2 pointer atomically though, to
+			// avoid races with heap growth. It may or may not be possible to also
+			// see a nil pointer in this case if we do race with heap growth, but
+			// just defensively ignore the nils. This operation is optimistic anyway.
+			l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()])))
+			if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) {
+				return i, true
+			}
+		}
+		return 0, false
+	}
+
+	// Slow path: iterate optimistically over the in-use address space
+	// looking for any free and unscavenged page. If we think we see something,
+	// lock and verify it!
+	for work.size() != 0 {
+		unlockHeap()
+
+		// Search for the candidate.
+		candidateChunkIdx, ok := findCandidate(work)
+
+		// Lock the heap. We need to do this now whether we found a candidate or not.
+		// If we did, we'll verify it. If not, we need to lock before returning
+		// anyway.
+		lockHeap()
+
+		if !ok {
+			// We didn't find a candidate, so we're done.
+			work.limit = work.base
+			break
+		}
+
+		// Find, verify, and scavenge if we can.
+		chunk := s.chunkOf(candidateChunkIdx)
+		base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
+		if npages > 0 {
+			work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)}
+			return uintptr(npages) * pageSize, work
+		}
+
+		// We were fooled, so let's continue from where we left off.
+		work.limit = offAddr{chunkBase(candidateChunkIdx)}
+	}
+	return 0, work
+}
+
+// scavengeRangeLocked scavenges the given region of memory.
+// The region of memory is described by its chunk index (ci),
+// the starting page index of the region relative to that
+// chunk (base), and the length of the region in pages (npages).
+//
+// Returns the base address of the scavenged region.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
+	s.chunkOf(ci).scavenged.setRange(base, npages)
+
+	// Compute the full address for the start of the range.
+	addr := chunkBase(ci) + uintptr(base)*pageSize
+
+	// Update the scavenge low watermark.
+	if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) {
+		s.scav.scavLWM = oAddr
+	}
+
+	// Only perform the actual scavenging if we're not in a test.
+	// It's dangerous to do so otherwise.
+	if s.test {
+		return addr
+	}
+	sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
+
+	// Update global accounting only when not in test, otherwise
+	// the runtime's accounting will be wrong.
+	mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize)
+	return addr
+}
+
+// fillAligned returns x but with all zeroes in m-aligned
+// groups of m bits set to 1 if any bit in the group is non-zero.
+//
+// For example, fillAligned(0x0100a3, 8) == 0xff00ff.
+//
+// Note that if m == 1, this is a no-op.
+//
+// m must be a power of 2 <= maxPagesPerPhysPage.
+func fillAligned(x uint64, m uint) uint64 {
+	apply := func(x uint64, c uint64) uint64 {
+		// The technique used here is derived from
+		// https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
+		// and extended for more than just bytes (like nibbles
+		// and uint16s) by using an appropriate constant.
+		//
+		// To summarize the technique, quoting from that page:
+		// "[It] works by first zeroing the high bits of the [8]
+		// bytes in the word. Subsequently, it adds a number that
+		// will result in an overflow to the high bit of a byte if
+		// any of the low bits were initially set. Next the high
+		// bits of the original word are ORed with these values;
+		// thus, the high bit of a byte is set iff any bit in the
+		// byte was set. Finally, we determine if any of these high
+		// bits are zero by ORing with ones everywhere except the
+		// high bits and inverting the result."
+		return ^((((x & c) + c) | x) | c)
+	}
+	// Transform x to contain a 1 bit at the top of each m-aligned
+	// group of m zero bits.
+	switch m {
+	case 1:
+		return x
+	case 2:
+		x = apply(x, 0x5555555555555555)
+	case 4:
+		x = apply(x, 0x7777777777777777)
+	case 8:
+		x = apply(x, 0x7f7f7f7f7f7f7f7f)
+	case 16:
+		x = apply(x, 0x7fff7fff7fff7fff)
+	case 32:
+		x = apply(x, 0x7fffffff7fffffff)
+	case 64: // == maxPagesPerPhysPage
+		x = apply(x, 0x7fffffffffffffff)
+	default:
+		throw("bad m value")
+	}
+	// Now, the top bit of each m-aligned group in x is set
+	// if that group was all zero in the original x.
+
+	// From each group of m bits subtract 1.
+	// Because we know only the top bits of each
+	// m-aligned group are set, we know this will
+	// set each group to have all the bits set except
+	// the top bit, so just OR with the original
+	// result to set all the bits.
+	return ^((x - (x >> (m - 1))) | x)
+}
+
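// The bit trick apply builds on is easiest to see for m == 8, where it is the
// classic zero-byte detection hack cited above: the result has the high bit of
// each byte set iff that byte of the input was entirely zero. The subsequent
// subtract-and-complement step in fillAligned then expands those marks into
// the documented fillAligned(0x0100a3, 8) == 0xff00ff. A standalone demo:
package main

import "fmt"

func main() {
	const c = 0x7f7f7f7f7f7f7f7f // the m == 8 constant from the switch above
	zeroByteHighBits := func(x uint64) uint64 {
		// High bit of each byte is set iff that byte of x is zero.
		return ^((((x & c) + c) | x) | c)
	}
	x := uint64(0x0100a3) // bytes 0 and 2 are non-zero, the rest are zero
	fmt.Printf("%016x\n", zeroByteHighBits(x)) // 8080808080008000
}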
+// hasScavengeCandidate returns true if there's any min-page-aligned groups of
+// min pages of free-and-unscavenged memory in the region represented by this
+// pallocData.
+//
+// min must be a non-zero power of 2 <= maxPagesPerPhysPage.
+func (m *pallocData) hasScavengeCandidate(min uintptr) bool {
+	if min&(min-1) != 0 || min == 0 {
+		print("runtime: min = ", min, "\n")
+		throw("min must be a non-zero power of 2")
+	} else if min > maxPagesPerPhysPage {
+		print("runtime: min = ", min, "\n")
+		throw("min too large")
+	}
+
+	// The goal of this search is to see if the chunk contains any free and unscavenged memory.
+	for i := len(m.scavenged) - 1; i >= 0; i-- {
+		// 1s are scavenged OR non-free => 0s are unscavenged AND free
+		//
+		// TODO(mknyszek): Consider splitting up fillAligned into two
+		// functions, since here we technically could get by with just
+		// the first half of its computation. It'll save a few instructions
+		// but adds some additional code complexity.
+		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
+
+		// Quickly skip over chunks of non-free or scavenged pages.
+		if x != ^uint64(0) {
+			return true
+		}
+	}
+	return false
+}
+
+// findScavengeCandidate returns a start index and a size for this pallocData
+// segment which represents a contiguous region of free and unscavenged memory.
+//
+// searchIdx indicates the page index within this chunk to start the search, but
+// note that findScavengeCandidate searches backwards through the pallocData. As a
+// result, it will return the highest scavenge candidate in address order.
+//
+// min indicates a hard minimum size and alignment for runs of pages. That is,
+// findScavengeCandidate will not return a region smaller than min pages in size,
+// or that is min pages or greater in size but not aligned to min. min must be
+// a non-zero power of 2 <= maxPagesPerPhysPage.
+//
+// max is a hint for how big of a region is desired. If max >= pallocChunkPages, then
+// findScavengeCandidate effectively returns entire free and unscavenged regions.
+// If max < pallocChunkPages, it may truncate the returned region such that size is
+// max. However, findScavengeCandidate may still return a larger region if, for
+// example, it chooses to preserve huge pages, or if max is not aligned to min (it
+// will round up). That is, even if max is small, the returned size is not guaranteed
+// to be equal to max. max is allowed to be less than min, in which case it is as if
+// max == min.
+func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (uint, uint) {
+	if min&(min-1) != 0 || min == 0 {
+		print("runtime: min = ", min, "\n")
+		throw("min must be a non-zero power of 2")
+	} else if min > maxPagesPerPhysPage {
+		print("runtime: min = ", min, "\n")
+		throw("min too large")
+	}
+	// max may not be min-aligned, so we might accidentally truncate to
+	// a max value which causes us to return a non-min-aligned value.
+	// To prevent this, align max up to a multiple of min (which is always
+	// a power of 2). This also prevents max from ever being less than
+	// min, unless it's zero, so handle that explicitly.
+	if max == 0 {
+		max = min
+	} else {
+		max = alignUp(max, min)
+	}
+
+	i := int(searchIdx / 64)
+	// Start by quickly skipping over blocks of non-free or scavenged pages.
+	for ; i >= 0; i-- {
+		// 1s are scavenged OR non-free => 0s are unscavenged AND free
+		x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
+		if x != ^uint64(0) {
+			break
+		}
+	}
+	if i < 0 {
+		// Failed to find any free/unscavenged pages.
+		return 0, 0
+	}
+	// We have something in the 64-bit chunk at i, but it could
+	// extend further. Loop until we find the extent of it.
+
+	// 1s are scavenged OR non-free => 0s are unscavenged AND free
+	x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min))
+	z1 := uint(sys.LeadingZeros64(^x))
+	run, end := uint(0), uint(i)*64+(64-z1)
+	if x<<z1 != 0 {
+		// After shifting out z1 bits, we still have 1s,
+		// so the run ends inside this word.
+		run = uint(sys.LeadingZeros64(x << z1))
+	} else {
+		// After shifting out z1 bits, we have no more 1s.
+		// This means the run extends to the bottom of the
+		// word so it may extend into further words.
+		run = 64 - z1
+		for j := i - 1; j >= 0; j-- {
+			x := fillAligned(m.scavenged[j]|m.pallocBits[j], uint(min))
+			run += uint(sys.LeadingZeros64(x))
+			if x != 0 {
+				// The run stopped in this word.
+				break
+			}
+		}
+	}
+
+	// Split the run we found if it's larger than max but hold on to
+	// our original length, since we may need it later.
+	size := run
+	if size > uint(max) {
+		size = uint(max)
+	}
+	start := end - size
+
+	// Each huge page is guaranteed to fit in a single palloc chunk.
+	//
+	// TODO(mknyszek): Support larger huge page sizes.
+	// TODO(mknyszek): Consider taking pages-per-huge-page as a parameter
+	// so we can write tests for this.
+	if physHugePageSize > pageSize && physHugePageSize > physPageSize {
+		// We have huge pages, so let's ensure we don't break one by scavenging
+		// over a huge page boundary. If the range [start, start+size) overlaps with
+		// a free-and-unscavenged huge page, we want to grow the region we scavenge
+		// to include that huge page.
+
+		// Compute the huge page boundary above our candidate.
+		pagesPerHugePage := uintptr(physHugePageSize / pageSize)
+		hugePageAbove := uint(alignUp(uintptr(start), pagesPerHugePage))
+
+		// If that boundary is within our current candidate, then we may be breaking
+		// a huge page.
+		if hugePageAbove <= end {
+			// Compute the huge page boundary below our candidate.
+			hugePageBelow := uint(alignDown(uintptr(start), pagesPerHugePage))
+
+			if hugePageBelow >= end-run {
+				// We're in danger of breaking apart a huge page since start+size crosses
+				// a huge page boundary and rounding down start to the nearest huge
+				// page boundary is included in the full run we found. Include the entire
+				// huge page in the bound by rounding down to the huge page size.
+				size = size + (start - hugePageBelow)
+				start = hugePageBelow
+			}
+		}
+	}
+	return start, size
+}
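// The huge-page preservation step at the end of findScavengeCandidate can be
// reproduced in isolation. growToHugePage below is an illustrative helper, not
// a runtime function, and the numbers in main are made-up examples: 512
// runtime pages per huge page, a free run covering pages [0, 520), and an
// initial 16-page candidate at the top of that run.
package main

import "fmt"

func alignDown(x, a uint) uint { return x &^ (a - 1) } // a must be a power of 2
func alignUp(x, a uint) uint   { return alignDown(x+a-1, a) }

// growToHugePage mirrors the rounding above: if the candidate [start, start+size)
// crosses a huge page boundary and the boundary below start still lies within
// the free run [end-run, end), grow the candidate down to that boundary so the
// whole free huge page is scavenged together.
func growToHugePage(start, size, end, run, pagesPerHugePage uint) (uint, uint) {
	hugePageAbove := alignUp(start, pagesPerHugePage)
	if hugePageAbove <= end {
		hugePageBelow := alignDown(start, pagesPerHugePage)
		if hugePageBelow >= end-run {
			size += start - hugePageBelow
			start = hugePageBelow
		}
	}
	return start, size
}

func main() {
	start, size := growToHugePage(504, 16, 520, 520, 512)
	fmt.Println(start, size) // 0 520: the candidate grows to cover the whole huge page
}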
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
new file mode 100644
index 0000000..7f619b1
--- /dev/null
+++ b/src/runtime/mgcscavenge_test.go
@@ -0,0 +1,443 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	"math/rand"
+	. "runtime"
+	"testing"
+)
+
+// makePallocData produces an initialized PallocData by setting
+// the ranges described in alloc and scavenged.
+func makePallocData(alloc, scavenged []BitRange) *PallocData {
+	b := new(PallocData)
+	for _, v := range alloc {
+		if v.N == 0 {
+			// Skip N==0. It's harmless and allocRange doesn't
+			// handle this case.
+			continue
+		}
+		b.AllocRange(v.I, v.N)
+	}
+	for _, v := range scavenged {
+		if v.N == 0 {
+			// See the previous loop.
+			continue
+		}
+		b.ScavengedSetRange(v.I, v.N)
+	}
+	return b
+}
+
+func TestFillAligned(t *testing.T) {
+	fillAlignedSlow := func(x uint64, m uint) uint64 {
+		if m == 1 {
+			return x
+		}
+		out := uint64(0)
+		for i := uint(0); i < 64; i += m {
+			for j := uint(0); j < m; j++ {
+				if x&(uint64(1)<<(i+j)) != 0 {
+					out |= ((uint64(1) << m) - 1) << i
+					break
+				}
+			}
+		}
+		return out
+	}
+	check := func(x uint64, m uint) {
+		want := fillAlignedSlow(x, m)
+		if got := FillAligned(x, m); got != want {
+			t.Logf("got:  %064b", got)
+			t.Logf("want: %064b", want)
+			t.Errorf("bad fillAligned(%016x, %d)", x, m)
+		}
+	}
+	for m := uint(1); m <= 64; m *= 2 {
+		tests := []uint64{
+			0x0000000000000000,
+			0x00000000ffffffff,
+			0xffffffff00000000,
+			0x8000000000000001,
+			0xf00000000000000f,
+			0xf00000010050000f,
+			0xffffffffffffffff,
+			0x0000000000000001,
+			0x0000000000000002,
+			0x0000000000000008,
+			uint64(1) << (m - 1),
+			uint64(1) << m,
+			// Try a few fixed arbitrary examples.
+			0xb02b9effcf137016,
+			0x3975a076a9fbff18,
+			0x0f8c88ec3b81506e,
+			0x60f14d80ef2fa0e6,
+		}
+		for _, test := range tests {
+			check(test, m)
+		}
+		for i := 0; i < 1000; i++ {
+			// Try pseudo-random numbers.
+			check(rand.Uint64(), m)
+
+			if m > 1 {
+				// For m != 1, let's construct a slightly more interesting
+				// random test. Generate a bitmap which is either 0 or
+				// randomly set bits for each m-aligned group of m bits.
+				val := uint64(0)
+				for n := uint(0); n < 64; n += m {
+					// For each group of m bits, flip a coin:
+					// * Leave them as zero.
+					// * Set them randomly.
+					if rand.Uint64()%2 == 0 {
+						val |= (rand.Uint64() & ((1 << m) - 1)) << n
+					}
+				}
+				check(val, m)
+			}
+		}
+	}
+}
+
+func TestPallocDataFindScavengeCandidate(t *testing.T) {
+	type test struct {
+		alloc, scavenged []BitRange
+		min, max         uintptr
+		want             BitRange
+	}
+	tests := map[string]test{
+		"MixedMin1": {
+			alloc:     []BitRange{{0, 40}, {42, PallocChunkPages - 42}},
+			scavenged: []BitRange{{0, 41}, {42, PallocChunkPages - 42}},
+			min:       1,
+			max:       PallocChunkPages,
+			want:      BitRange{41, 1},
+		},
+		"MultiMin1": {
+			alloc:     []BitRange{{0, 63}, {65, 20}, {87, PallocChunkPages - 87}},
+			scavenged: []BitRange{{86, 1}},
+			min:       1,
+			max:       PallocChunkPages,
+			want:      BitRange{85, 1},
+		},
+	}
+	// Try out different page minimums.
+	for m := uintptr(1); m <= 64; m *= 2 {
+		suffix := fmt.Sprintf("Min%d", m)
+		tests["AllFree"+suffix] = test{
+			min:  m,
+			max:  PallocChunkPages,
+			want: BitRange{0, PallocChunkPages},
+		}
+		tests["AllScavenged"+suffix] = test{
+			scavenged: []BitRange{{0, PallocChunkPages}},
+			min:       m,
+			max:       PallocChunkPages,
+			want:      BitRange{0, 0},
+		}
+		tests["NoneFree"+suffix] = test{
+			alloc:     []BitRange{{0, PallocChunkPages}},
+			scavenged: []BitRange{{PallocChunkPages / 2, PallocChunkPages / 2}},
+			min:       m,
+			max:       PallocChunkPages,
+			want:      BitRange{0, 0},
+		}
+		tests["StartFree"+suffix] = test{
+			alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}},
+			min:   m,
+			max:   PallocChunkPages,
+			want:  BitRange{0, uint(m)},
+		}
+		tests["StartFree"+suffix] = test{
+			alloc: []BitRange{{uint(m), PallocChunkPages - uint(m)}},
+			min:   m,
+			max:   PallocChunkPages,
+			want:  BitRange{0, uint(m)},
+		}
+		tests["EndFree"+suffix] = test{
+			alloc: []BitRange{{0, PallocChunkPages - uint(m)}},
+			min:   m,
+			max:   PallocChunkPages,
+			want:  BitRange{PallocChunkPages - uint(m), uint(m)},
+		}
+		tests["Straddle64"+suffix] = test{
+			alloc: []BitRange{{0, 64 - uint(m)}, {64 + uint(m), PallocChunkPages - (64 + uint(m))}},
+			min:   m,
+			max:   2 * m,
+			want:  BitRange{64 - uint(m), 2 * uint(m)},
+		}
+		tests["BottomEdge64WithFull"+suffix] = test{
+			alloc:     []BitRange{{64, 64}, {128 + 3*uint(m), PallocChunkPages - (128 + 3*uint(m))}},
+			scavenged: []BitRange{{1, 10}},
+			min:       m,
+			max:       3 * m,
+			want:      BitRange{128, 3 * uint(m)},
+		}
+		tests["BottomEdge64WithPocket"+suffix] = test{
+			alloc:     []BitRange{{64, 62}, {127, 1}, {128 + 3*uint(m), PallocChunkPages - (128 + 3*uint(m))}},
+			scavenged: []BitRange{{1, 10}},
+			min:       m,
+			max:       3 * m,
+			want:      BitRange{128, 3 * uint(m)},
+		}
+		tests["Max0"+suffix] = test{
+			scavenged: []BitRange{{0, PallocChunkPages - uint(m)}},
+			min:       m,
+			max:       0,
+			want:      BitRange{PallocChunkPages - uint(m), uint(m)},
+		}
+		if m <= 8 {
+			tests["OneFree"] = test{
+				alloc: []BitRange{{0, 40}, {40 + uint(m), PallocChunkPages - (40 + uint(m))}},
+				min:   m,
+				max:   PallocChunkPages,
+				want:  BitRange{40, uint(m)},
+			}
+			tests["OneScavenged"] = test{
+				alloc:     []BitRange{{0, 40}, {40 + uint(m), PallocChunkPages - (40 + uint(m))}},
+				scavenged: []BitRange{{40, 1}},
+				min:       m,
+				max:       PallocChunkPages,
+				want:      BitRange{0, 0},
+			}
+		}
+		if m > 1 {
+			tests["MaxUnaligned"+suffix] = test{
+				scavenged: []BitRange{{0, PallocChunkPages - uint(m*2-1)}},
+				min:       m,
+				max:       m - 2,
+				want:      BitRange{PallocChunkPages - uint(m), uint(m)},
+			}
+			tests["SkipSmall"+suffix] = test{
+				alloc: []BitRange{{0, 64 - uint(m)}, {64, 5}, {70, 11}, {82, PallocChunkPages - 82}},
+				min:   m,
+				max:   m,
+				want:  BitRange{64 - uint(m), uint(m)},
+			}
+			tests["SkipMisaligned"+suffix] = test{
+				alloc: []BitRange{{0, 64 - uint(m)}, {64, 63}, {127 + uint(m), PallocChunkPages - (127 + uint(m))}},
+				min:   m,
+				max:   m,
+				want:  BitRange{64 - uint(m), uint(m)},
+			}
+			tests["MaxLessThan"+suffix] = test{
+				scavenged: []BitRange{{0, PallocChunkPages - uint(m)}},
+				min:       m,
+				max:       1,
+				want:      BitRange{PallocChunkPages - uint(m), uint(m)},
+			}
+		}
+	}
+	if PhysHugePageSize > uintptr(PageSize) {
+		// Check hugepage preserving behavior.
+		bits := uint(PhysHugePageSize / uintptr(PageSize))
+		tests["PreserveHugePageBottom"] = test{
+			alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}},
+			min:   1,
+			max:   3, // Make it so that max would have us try to break the huge page.
+			want:  BitRange{0, bits + 2},
+		}
+		if 3*bits < PallocChunkPages {
+			// We need at least 3 huge pages in a chunk for this test to make sense.
+			tests["PreserveHugePageMiddle"] = test{
+				alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}},
+				min:   1,
+				max:   12, // Make it so that max would have us try to break the huge page.
+				want:  BitRange{bits, bits + 10},
+			}
+		}
+		tests["PreserveHugePageTop"] = test{
+			alloc: []BitRange{{0, PallocChunkPages - bits}},
+			min:   1,
+			max:   1, // Even one page would break a huge page in this case.
+			want:  BitRange{PallocChunkPages - bits, bits},
+		}
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := makePallocData(v.alloc, v.scavenged)
+			start, size := b.FindScavengeCandidate(PallocChunkPages-1, v.min, v.max)
+			got := BitRange{start, size}
+			if !(got.N == 0 && v.want.N == 0) && got != v.want {
+				t.Fatalf("candidate mismatch: got %v, want %v", got, v.want)
+			}
+		})
+	}
+}
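A minimal sketch of the keying pattern the table above relies on: entries built inside the loop carry a computed "MinN" suffix so that iterations do not overwrite one another. Package and helper names here are illustrative only, not part of the runtime.

package scavsketch

import "fmt"

// buildKeys mirrors the suffix scheme above: one map entry per page
// minimum, keyed with "MinN" so later iterations don't clobber
// earlier ones.
func buildKeys() map[string]uintptr {
	keys := make(map[string]uintptr)
	for m := uintptr(1); m <= 64; m *= 2 {
		keys["AllFree"+fmt.Sprintf("Min%d", m)] = m
	}
	return keys // 7 entries: AllFreeMin1 through AllFreeMin64
}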
+
+// Tests end-to-end scavenging on a pageAlloc.
+func TestPageAllocScavenge(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	type test struct {
+		request, expect uintptr
+	}
+	minPages := PhysPageSize / PageSize
+	if minPages < 1 {
+		minPages = 1
+	}
+	type setup struct {
+		beforeAlloc map[ChunkIdx][]BitRange
+		beforeScav  map[ChunkIdx][]BitRange
+		expect      []test
+		afterScav   map[ChunkIdx][]BitRange
+	}
+	tests := map[string]setup{
+		"AllFreeUnscavExhaust": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {},
+			},
+			expect: []test{
+				{^uintptr(0), 3 * PallocChunkPages * PageSize},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+		},
+		"NoneFreeUnscavExhaust": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {},
+			},
+			expect: []test{
+				{^uintptr(0), 0},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {},
+			},
+		},
+		"ScavHighestPageFirst": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}},
+			},
+			expect: []test{
+				{1, minPages * PageSize},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(minPages)}},
+			},
+		},
+		"ScavMultiple": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{uint(minPages), PallocChunkPages - uint(2*minPages)}},
+			},
+			expect: []test{
+				{minPages * PageSize, minPages * PageSize},
+				{minPages * PageSize, minPages * PageSize},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+		},
+		"ScavMultiple2": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{uint(minPages), PallocChunkPages - uint(2*minPages)}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages - uint(2*minPages)}},
+			},
+			expect: []test{
+				{2 * minPages * PageSize, 2 * minPages * PageSize},
+				{minPages * PageSize, minPages * PageSize},
+				{minPages * PageSize, minPages * PageSize},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+		},
+		"ScavDiscontiguous": {
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:       {},
+				BaseChunkIdx + 0xe: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:       {{uint(minPages), PallocChunkPages - uint(2*minPages)}},
+				BaseChunkIdx + 0xe: {{uint(2 * minPages), PallocChunkPages - uint(2*minPages)}},
+			},
+			expect: []test{
+				{2 * minPages * PageSize, 2 * minPages * PageSize},
+				{^uintptr(0), 2 * minPages * PageSize},
+				{^uintptr(0), 0},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:       {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xe: {{0, PallocChunkPages}},
+			},
+		},
+	}
+	if PageAlloc64Bit != 0 {
+		tests["ScavAllVeryDiscontiguous"] = setup{
+			beforeAlloc: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:          {},
+				BaseChunkIdx + 0x1000: {},
+			},
+			beforeScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:          {},
+				BaseChunkIdx + 0x1000: {},
+			},
+			expect: []test{
+				{^uintptr(0), 2 * PallocChunkPages * PageSize},
+				{^uintptr(0), 0},
+			},
+			afterScav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:          {{0, PallocChunkPages}},
+				BaseChunkIdx + 0x1000: {{0, PallocChunkPages}},
+			},
+		}
+	}
+	for name, v := range tests {
+		v := v
+		runTest := func(t *testing.T, mayUnlock bool) {
+			b := NewPageAlloc(v.beforeAlloc, v.beforeScav)
+			defer FreePageAlloc(b)
+
+			for iter, h := range v.expect {
+				if got := b.Scavenge(h.request, mayUnlock); got != h.expect {
+					t.Fatalf("bad scavenge #%d: want %d, got %d", iter+1, h.expect, got)
+				}
+			}
+			want := NewPageAlloc(v.beforeAlloc, v.afterScav)
+			defer FreePageAlloc(want)
+
+			checkPageAlloc(t, want, b)
+		}
+		t.Run(name, func(t *testing.T) {
+			runTest(t, false)
+		})
+		t.Run(name+"MayUnlock", func(t *testing.T) {
+			runTest(t, true)
+		})
+	}
+}
diff --git a/src/runtime/mgcstack.go b/src/runtime/mgcstack.go
index baeaa4f..211d882 100644
--- a/src/runtime/mgcstack.go
+++ b/src/runtime/mgcstack.go
@@ -175,12 +175,23 @@
 	// stack limits
 	stack stack
 
+	// conservative indicates that the next frame must be scanned conservatively.
+	// This applies only to the innermost frame at an async safe-point.
+	conservative bool
+
 	// buf contains the set of possible pointers to stack objects.
 	// Organized as a LIFO linked list of buffers.
 	// All buffers except possibly the head buffer are full.
 	buf     *stackWorkBuf
 	freeBuf *stackWorkBuf // keep around one free buffer for allocation hysteresis
 
+	// cbuf contains conservative pointers to stack objects. If
+	// all pointers to a stack object are obtained via
+	// conservative scanning, then the stack object may be dead
+	// and may contain dead pointers, so it must be scanned
+	// defensively.
+	cbuf *stackWorkBuf
+
 	// list of stack objects
 	// Objects are in increasing address order.
 	head  *stackObjectBuf
@@ -194,17 +205,21 @@
 
 // Add p as a potential pointer to a stack object.
 // p must be a stack address.
-func (s *stackScanState) putPtr(p uintptr) {
+func (s *stackScanState) putPtr(p uintptr, conservative bool) {
 	if p < s.stack.lo || p >= s.stack.hi {
 		throw("address not a stack address")
 	}
-	buf := s.buf
+	head := &s.buf
+	if conservative {
+		head = &s.cbuf
+	}
+	buf := *head
 	if buf == nil {
 		// Initial setup.
 		buf = (*stackWorkBuf)(unsafe.Pointer(getempty()))
 		buf.nobj = 0
 		buf.next = nil
-		s.buf = buf
+		*head = buf
 	} else if buf.nobj == len(buf.obj) {
 		if s.freeBuf != nil {
 			buf = s.freeBuf
@@ -213,8 +228,8 @@
 			buf = (*stackWorkBuf)(unsafe.Pointer(getempty()))
 		}
 		buf.nobj = 0
-		buf.next = s.buf
-		s.buf = buf
+		buf.next = *head
+		*head = buf
 	}
 	buf.obj[buf.nobj] = p
 	buf.nobj++
@@ -222,30 +237,39 @@
 
 // Remove and return a potential pointer to a stack object.
 // Returns 0 if there are no more pointers available.
-func (s *stackScanState) getPtr() uintptr {
-	buf := s.buf
-	if buf == nil {
-		// Never had any data.
-		return 0
-	}
-	if buf.nobj == 0 {
-		if s.freeBuf != nil {
-			// Free old freeBuf.
-			putempty((*workbuf)(unsafe.Pointer(s.freeBuf)))
-		}
-		// Move buf to the freeBuf.
-		s.freeBuf = buf
-		buf = buf.next
-		s.buf = buf
+//
+// This prefers non-conservative pointers so we scan stack objects
+// precisely if there are any non-conservative pointers to them.
+func (s *stackScanState) getPtr() (p uintptr, conservative bool) {
+	for _, head := range []**stackWorkBuf{&s.buf, &s.cbuf} {
+		buf := *head
 		if buf == nil {
-			// No more data.
-			putempty((*workbuf)(unsafe.Pointer(s.freeBuf)))
-			s.freeBuf = nil
-			return 0
+			// Never had any data.
+			continue
 		}
+		if buf.nobj == 0 {
+			if s.freeBuf != nil {
+				// Free old freeBuf.
+				putempty((*workbuf)(unsafe.Pointer(s.freeBuf)))
+			}
+			// Move buf to the freeBuf.
+			s.freeBuf = buf
+			buf = buf.next
+			*head = buf
+			if buf == nil {
+				// No more data in this list.
+				continue
+			}
+		}
+		buf.nobj--
+		return buf.obj[buf.nobj], head == &s.cbuf
 	}
-	buf.nobj--
-	return buf.obj[buf.nobj]
+	// No more data in either list.
+	if s.freeBuf != nil {
+		putempty((*workbuf)(unsafe.Pointer(s.freeBuf)))
+		s.freeBuf = nil
+	}
+	return 0, false
 }
 
 // addObject adds a stack object at addr of type typ to the set of stack objects.
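A simplified sketch of the two-list discipline putPtr and getPtr implement above, using plain slices instead of chained work buffers; the names are illustrative, not the runtime's.

package stacksketch

// ptrQueue keeps precise and conservative candidate pointers apart.
// Draining precise pointers first means an object reached by any
// precise pointer gets scanned precisely.
type ptrQueue struct {
	precise      []uintptr
	conservative []uintptr
}

func (q *ptrQueue) put(p uintptr, conservative bool) {
	if conservative {
		q.conservative = append(q.conservative, p)
		return
	}
	q.precise = append(q.precise, p)
}

// get pops the next pointer, reporting whether it came from the
// conservative list; it returns 0 when both lists are empty.
func (q *ptrQueue) get() (uintptr, bool) {
	if n := len(q.precise); n > 0 {
		p := q.precise[n-1]
		q.precise = q.precise[:n-1]
		return p, false
	}
	if n := len(q.conservative); n > 0 {
		p := q.conservative[n-1]
		q.conservative = q.conservative[:n-1]
		return p, true
	}
	return 0, false
}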
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 5f1c90b..3aa3afc 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -10,7 +10,7 @@
 //   can free a whole span if none of the objects are marked, but that
 //   isn't its goal. This can be driven either synchronously by
 //   mcentral.cacheSpan for mcentral spans, or asynchronously by
-//   sweepone from the list of all in-use spans in mheap_.sweepSpans.
+//   sweepone, which looks at all the mcentral lists.
 //
 // * The span reclaimer looks for spans that contain no marked objects
 //   and frees whole spans. This is a separate algorithm because
@@ -40,6 +40,80 @@
 
 	nbgsweep    uint32
 	npausesweep uint32
+
+	// centralIndex is the current unswept span class.
+	// It represents an index into the mcentral span
+	// sets. Accessed and updated via its load and
+	// update methods. Not protected by a lock.
+	//
+	// Reset at mark termination.
+	// Used by mheap.nextSpanForSweep.
+	centralIndex sweepClass
+}
+
+// sweepClass is a spanClass and one bit to represent whether we're currently
+// sweeping partial or full spans.
+type sweepClass uint32
+
+const (
+	numSweepClasses            = numSpanClasses * 2
+	sweepClassDone  sweepClass = sweepClass(^uint32(0))
+)
+
+func (s *sweepClass) load() sweepClass {
+	return sweepClass(atomic.Load((*uint32)(s)))
+}
+
+func (s *sweepClass) update(sNew sweepClass) {
+	// Only update *s if its current value is less than sNew,
+	// since *s increases monotonically.
+	sOld := s.load()
+	for sOld < sNew && !atomic.Cas((*uint32)(s), uint32(sOld), uint32(sNew)) {
+		sOld = s.load()
+	}
+	// TODO(mknyszek): This isn't the only place we have
+	// an atomic monotonically increasing counter. It would
+	// be nice to have an "atomic max" which is just implemented
+	// as the above on most architectures. Some architectures
+	// like RISC-V however have native support for an atomic max.
+}
+
+func (s *sweepClass) clear() {
+	atomic.Store((*uint32)(s), 0)
+}
+
+// split returns the underlying span class as well as
+// whether we're interested in the full or partial
+// unswept lists for that class, indicated as a boolean
+// (true means "full").
+func (s sweepClass) split() (spc spanClass, full bool) {
+	return spanClass(s >> 1), s&1 == 0
+}
+
+// nextSpanForSweep finds and pops the next span for sweeping from the
+// central sweep buffers. It returns ownership of the span to the caller.
+// Returns nil if no such span exists.
+func (h *mheap) nextSpanForSweep() *mspan {
+	sg := h.sweepgen
+	for sc := sweep.centralIndex.load(); sc < numSweepClasses; sc++ {
+		spc, full := sc.split()
+		c := &h.central[spc].mcentral
+		var s *mspan
+		if full {
+			s = c.fullUnswept(sg).pop()
+		} else {
+			s = c.partialUnswept(sg).pop()
+		}
+		if s != nil {
+			// Write down that we found something so future sweepers
+			// can start from here.
+			sweep.centralIndex.update(sc)
+			return s
+		}
+	}
+	// Write down that we found nothing.
+	sweep.centralIndex.update(sweepClassDone)
+	return nil
 }
 
 // finishsweep_m ensures that all spans are swept.
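A standalone sketch, under assumed names, of the two ideas introduced above: the sweep position packs a span class together with a partial/full bit, and update is an "atomic max" built from a CAS loop.

package sweepsketch

import "sync/atomic"

// sweepPos packs a class in the high bits and a list selector in bit 0:
// clear means the "full" unswept list, set means the "partial" one,
// matching sweepClass.split above.
type sweepPos uint32

func (s sweepPos) split() (class uint32, full bool) {
	return uint32(s) >> 1, s&1 == 0
}

// advance raises *s to at least want: an atomic max implemented as a
// load-compare-CAS loop, like sweepClass.update.
func advance(s *uint32, want uint32) {
	for {
		old := atomic.LoadUint32(s)
		if old >= want || atomic.CompareAndSwapUint32(s, old, want) {
			return
		}
	}
}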
@@ -58,12 +132,31 @@
 		sweep.npausesweep++
 	}
 
+	if go115NewMCentralImpl {
+		// Reset all the unswept buffers, which should be empty.
+		// Do this in sweep termination as opposed to mark termination
+		// so that we can catch unswept spans and reclaim blocks as
+		// soon as possible.
+		sg := mheap_.sweepgen
+		for i := range mheap_.central {
+			c := &mheap_.central[i].mcentral
+			c.partialUnswept(sg).reset()
+			c.fullUnswept(sg).reset()
+		}
+	}
+
+	// Sweeping is done, so if the scavenger isn't already awake,
+	// wake it up. There's definitely work for it to do at this
+	// point.
+	wakeScavenger()
+
 	nextMarkBitArenaEpoch()
 }
 
 func bgsweep(c chan int) {
 	sweep.g = getg()
 
+	lockInit(&sweep.lock, lockRankSweep)
 	lock(&sweep.lock)
 	sweep.parked = true
 	c <- 1
@@ -109,17 +202,21 @@
 	var s *mspan
 	sg := mheap_.sweepgen
 	for {
-		s = mheap_.sweepSpans[1-sg/2%2].pop()
+		if go115NewMCentralImpl {
+			s = mheap_.nextSpanForSweep()
+		} else {
+			s = mheap_.sweepSpans[1-sg/2%2].pop()
+		}
 		if s == nil {
 			atomic.Store(&mheap_.sweepdone, 1)
 			break
 		}
-		if s.state != mSpanInUse {
+		if state := s.state.get(); state != mSpanInUse {
 			// This can happen if direct sweeping already
 			// swept this span, but in that case the sweep
 			// generation should always be up-to-date.
 			if !(s.sweepgen == sg || s.sweepgen == sg+3) {
-				print("runtime: bad span s.state=", s.state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
+				print("runtime: bad span s.state=", state, " s.sweepgen=", s.sweepgen, " sweepgen=", sg, "\n")
 				throw("non in-use span in unswept list")
 			}
 			continue
@@ -149,6 +246,27 @@
 	// Decrement the number of active sweepers and if this is the
 	// last one print trace information.
 	if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 {
+		// Since the sweeper is done, move the scavenge gen forward (signalling
+		// that there's new work to do) and wake the scavenger.
+		//
+		// The scavenger is signaled by the last sweeper because once
+		// sweeping is done, we will definitely have useful work for
+		// the scavenger to do, since the scavenger only runs over the
+		// heap once per GC cycle. This update is not done during sweep
+		// termination because in some cases there may be a long delay
+		// between sweep done and sweep termination (e.g. not enough
+		// allocations to trigger a GC) which would be nice to fill in
+		// with scavenging work.
+		systemstack(func() {
+			lock(&mheap_.lock)
+			mheap_.pages.scavengeStartGen()
+			unlock(&mheap_.lock)
+		})
+		// Since we might sweep in an allocation path, it's not possible
+		// for us to wake the scavenger directly via wakeScavenger, since
+		// it could allocate. Ask sysmon to do it for us instead.
+		readyForScavenger()
+
 		if debug.gcpacertrace > 0 {
 			print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n")
 		}
@@ -204,6 +322,9 @@
 // If preserve=true, don't return it to heap nor relink in mcentral lists;
 // caller takes care of it.
 func (s *mspan) sweep(preserve bool) bool {
+	if !go115NewMCentralImpl {
+		return s.oldSweep(preserve)
+	}
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
 	_g_ := getg()
@@ -211,8 +332,8 @@
 		throw("mspan.sweep: m is not locked")
 	}
 	sweepgen := mheap_.sweepgen
-	if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
-		print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+	if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
 		throw("mspan.sweep: bad span state")
 	}
 
@@ -224,10 +345,8 @@
 
 	spc := s.spanclass
 	size := s.elemsize
-	res := false
 
-	c := _g_.m.mcache
-	freeToHeap := false
+	c := _g_.m.p.ptr().mcache
 
 	// The allocBits indicate which unmarked objects don't need to be
 	// processed since they were free at the end of the last GC cycle
@@ -245,6 +364,7 @@
 	// 2. A tiny object can have several finalizers setup for different offsets.
 	//    If such object is not marked, we need to queue all finalizers at once.
 	// Both 1 and 2 are possible at the same time.
+	hadSpecials := s.specials != nil
 	specialp := &s.specials
 	special := *specialp
 	for special != nil {
@@ -289,6 +409,262 @@
 			special = *specialp
 		}
 	}
+	if hadSpecials && s.specials == nil {
+		spanHasNoSpecials(s)
+	}
+
+	if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled {
+		// Find all newly freed objects. This doesn't have to
+		// be efficient; allocfreetrace has massive overhead.
+		mbits := s.markBitsForBase()
+		abits := s.allocBitsForIndex(0)
+		for i := uintptr(0); i < s.nelems; i++ {
+			if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
+				x := s.base() + i*s.elemsize
+				if debug.allocfreetrace != 0 {
+					tracefree(unsafe.Pointer(x), size)
+				}
+				if debug.clobberfree != 0 {
+					clobberfree(unsafe.Pointer(x), size)
+				}
+				if raceenabled {
+					racefree(unsafe.Pointer(x), size)
+				}
+				if msanenabled {
+					msanfree(unsafe.Pointer(x), size)
+				}
+			}
+			mbits.advance()
+			abits.advance()
+		}
+	}
+
+	// Check for zombie objects.
+	if s.freeindex < s.nelems {
+		// Everything < freeindex is allocated and hence
+		// cannot be zombies.
+		//
+		// Check the first bitmap byte, where we have to be
+		// careful with freeindex.
+		obj := s.freeindex
+		if (*s.gcmarkBits.bytep(obj / 8)&^*s.allocBits.bytep(obj / 8))>>(obj%8) != 0 {
+			s.reportZombies()
+		}
+		// Check remaining bytes.
+		for i := obj/8 + 1; i < divRoundUp(s.nelems, 8); i++ {
+			if *s.gcmarkBits.bytep(i)&^*s.allocBits.bytep(i) != 0 {
+				s.reportZombies()
+			}
+		}
+	}
+
+	// Count the number of free objects in this span.
+	nalloc := uint16(s.countAlloc())
+	nfreed := s.allocCount - nalloc
+	if nalloc > s.allocCount {
+		// The zombie check above should have caught this in
+		// more detail.
+		print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+		throw("sweep increased allocation count")
+	}
+
+	s.allocCount = nalloc
+	s.freeindex = 0 // reset allocation index to start of span.
+	if trace.enabled {
+		getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
+	}
+
+	// gcmarkBits becomes the allocBits.
+	// get a fresh cleared gcmarkBits in preparation for next GC
+	s.allocBits = s.gcmarkBits
+	s.gcmarkBits = newMarkBits(s.nelems)
+
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)
+
+	// The span must be in our exclusive ownership until we update sweepgen,
+	// check for potential races.
+	if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		throw("mspan.sweep: bad span state after sweep")
+	}
+	if s.sweepgen == sweepgen+1 || s.sweepgen == sweepgen+3 {
+		throw("swept cached span")
+	}
+
+	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
+	// because of the potential for a concurrent free/SetFinalizer.
+	//
+	// But we need to set it before we make the span available for allocation
+	// (return it to heap or mcentral), because allocation code assumes that a
+	// span is already swept if available for allocation.
+	//
+	// Serialization point.
+	// At this point the mark bits are cleared and allocation ready
+	// to go so release the span.
+	atomic.Store(&s.sweepgen, sweepgen)
+
+	if spc.sizeclass() != 0 {
+		// Handle spans for small objects.
+		if nfreed > 0 {
+			// Only mark the span as needing zeroing if we've freed any
+			// objects, because a fresh span that had been allocated into,
+			// wasn't totally filled, but then swept, still has all of its
+			// free slots zeroed.
+			s.needzero = 1
+			c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+		}
+		if !preserve {
+			// The caller may not have removed this span from whatever
+			// unswept set its on but taken ownership of the span for
+			// unswept set it's on but taken ownership of the span for
+			// an unswept set, then the mcentral will pop it off the
+			// set, check its sweepgen, and ignore it.
+			if nalloc == 0 {
+				// Free totally free span directly back to the heap.
+				mheap_.freeSpan(s)
+				return true
+			}
+			// Return span back to the right mcentral list.
+			if uintptr(nalloc) == s.nelems {
+				mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s)
+			} else {
+				mheap_.central[spc].mcentral.partialSwept(sweepgen).push(s)
+			}
+		}
+	} else if !preserve {
+		// Handle spans for large objects.
+		if nfreed != 0 {
+			// Free large object span to heap.
+
+			// NOTE(rsc,dvyukov): The original implementation of efence
+			// in CL 22060046 used sysFree instead of sysFault, so that
+			// the operating system would eventually give the memory
+			// back to us again, so that an efence program could run
+			// longer without running out of memory. Unfortunately,
+			// calling sysFree here without any kind of adjustment of the
+			// heap data structures means that when the memory does
+			// come back to us, we have the wrong metadata for it, either in
+			// the mspan structures or in the garbage collection bitmap.
+			// Using sysFault here means that the program will run out of
+			// memory fairly quickly in efence mode, but at least it won't
+			// have mysterious crashes due to confused memory reuse.
+			// It should be possible to switch back to sysFree if we also
+			// implement and then call some kind of mheap.deleteSpan.
+			if debug.efence > 0 {
+				s.limit = 0 // prevent mlookup from finding this span
+				sysFault(unsafe.Pointer(s.base()), size)
+			} else {
+				mheap_.freeSpan(s)
+			}
+			c.local_nlargefree++
+			c.local_largefree += size
+			return true
+		}
+
+		// Add a large span directly onto the full+swept list.
+		mheap_.central[spc].mcentral.fullSwept(sweepgen).push(s)
+	}
+	return false
+}
+
+// Sweep frees or collects finalizers for blocks not marked in the mark phase.
+// It clears the mark bits in preparation for the next GC round.
+// Returns true if the span was returned to heap.
+// If preserve=true, don't return it to heap nor relink in mcentral lists;
+// caller takes care of it.
+//
+// For !go115NewMCentralImpl.
+func (s *mspan) oldSweep(preserve bool) bool {
+	// It's critical that we enter this function with preemption disabled,
+	// GC must not start while we are in the middle of this function.
+	_g_ := getg()
+	if _g_.m.locks == 0 && _g_.m.mallocing == 0 && _g_ != _g_.m.g0 {
+		throw("mspan.sweep: m is not locked")
+	}
+	sweepgen := mheap_.sweepgen
+	if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
+		print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		throw("mspan.sweep: bad span state")
+	}
+
+	if trace.enabled {
+		traceGCSweepSpan(s.npages * _PageSize)
+	}
+
+	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
+
+	spc := s.spanclass
+	size := s.elemsize
+	res := false
+
+	c := _g_.m.p.ptr().mcache
+	freeToHeap := false
+
+	// The allocBits indicate which unmarked objects don't need to be
+	// processed since they were free at the end of the last GC cycle
+	// and were not allocated since then.
+	// If the allocBits index is >= s.freeindex and the bit
+	// is not marked then the object remains unallocated
+	// since the last GC.
+	// This situation is analogous to being on a freelist.
+
+	// Unlink & free special records for any objects we're about to free.
+	// Two complications here:
+	// 1. An object can have both finalizer and profile special records.
+	//    In such case we need to queue finalizer for execution,
+	//    mark the object as live and preserve the profile special.
+	// 2. A tiny object can have several finalizers setup for different offsets.
+	//    If such object is not marked, we need to queue all finalizers at once.
+	// Both 1 and 2 are possible at the same time.
+	hadSpecials := s.specials != nil
+	specialp := &s.specials
+	special := *specialp
+	for special != nil {
+		// A finalizer can be set for an inner byte of an object, find object beginning.
+		objIndex := uintptr(special.offset) / size
+		p := s.base() + objIndex*size
+		mbits := s.markBitsForIndex(objIndex)
+		if !mbits.isMarked() {
+			// This object is not marked and has at least one special record.
+			// Pass 1: see if it has at least one finalizer.
+			hasFin := false
+			endOffset := p - s.base() + size
+			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+				if tmp.kind == _KindSpecialFinalizer {
+					// Stop freeing of object if it has a finalizer.
+					mbits.setMarkedNonAtomic()
+					hasFin = true
+					break
+				}
+			}
+			// Pass 2: queue all finalizers _or_ handle profile record.
+			for special != nil && uintptr(special.offset) < endOffset {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p := s.base() + uintptr(special.offset)
+				if special.kind == _KindSpecialFinalizer || !hasFin {
+					// Splice out special record.
+					y := special
+					special = special.next
+					*specialp = special
+					freespecial(y, unsafe.Pointer(p), size)
+				} else {
+					// This is a profile record, but the object has finalizers (so kept alive).
+					// Keep special record.
+					specialp = &special.next
+					special = *specialp
+				}
+			}
+		} else {
+			// object is still live: keep special record
+			specialp = &special.next
+			special = *specialp
+		}
+	}
+	if go115NewMarkrootSpans && hadSpecials && s.specials == nil {
+		spanHasNoSpecials(s)
+	}
 
 	if debug.allocfreetrace != 0 || debug.clobberfree != 0 || raceenabled || msanenabled {
 		// Find all newly freed objects. This doesn't have to
@@ -351,8 +727,8 @@
 	if freeToHeap || nfreed == 0 {
 		// The span must be in our exclusive ownership until we update sweepgen,
 		// check for potential races.
-		if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
-			print("mspan.sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
+		if state := s.state.get(); state != mSpanInUse || s.sweepgen != sweepgen-1 {
+			print("mspan.sweep: state=", state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
 			throw("mspan.sweep: bad span state after sweep")
 		}
 		// Serialization point.
@@ -386,7 +762,7 @@
 			s.limit = 0 // prevent mlookup from finding this span
 			sysFault(unsafe.Pointer(s.base()), size)
 		} else {
-			mheap_.freeSpan(s, true)
+			mheap_.freeSpan(s)
 		}
 		c.local_nlargefree++
 		c.local_largefree += size
@@ -400,6 +776,57 @@
 	return res
 }
 
+// reportZombies reports any marked but free objects in s and throws.
+//
+// This generally means one of the following:
+//
+// 1. User code converted a pointer to a uintptr and then back
+// unsafely, and a GC ran while the uintptr was the only reference to
+// an object.
+//
+// 2. User code (or a compiler bug) constructed a bad pointer that
+// points to a free slot, often a past-the-end pointer.
+//
+// 3. The GC two cycles ago missed a pointer and freed a live object,
+// but it was still live in the last cycle, so this GC cycle found a
+// pointer to that object and marked it.
+func (s *mspan) reportZombies() {
+	printlock()
+	print("runtime: marked free object in span ", s, ", elemsize=", s.elemsize, " freeindex=", s.freeindex, " (bad use of unsafe.Pointer? try -d=checkptr)\n")
+	mbits := s.markBitsForBase()
+	abits := s.allocBitsForIndex(0)
+	for i := uintptr(0); i < s.nelems; i++ {
+		addr := s.base() + i*s.elemsize
+		print(hex(addr))
+		alloc := i < s.freeindex || abits.isMarked()
+		if alloc {
+			print(" alloc")
+		} else {
+			print(" free ")
+		}
+		if mbits.isMarked() {
+			print(" marked  ")
+		} else {
+			print(" unmarked")
+		}
+		zombie := mbits.isMarked() && !alloc
+		if zombie {
+			print(" zombie")
+		}
+		print("\n")
+		if zombie {
+			length := s.elemsize
+			if length > 1024 {
+				length = 1024
+			}
+			hexdumpWords(addr, addr+length, nil)
+		}
+		mbits.advance()
+		abits.advance()
+	}
+	throw("found pointer to free object")
+}
+
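The zombie check reduces to a little bit arithmetic per bitmap byte: an object is suspect when its mark bit is set but its alloc bit is not. A minimal sketch with an assumed helper name:

package sweepsketch

// hasZombie reports whether any object at or above bit firstCheck in
// an 8-object bitmap byte is marked (markByte) but not allocated
// (allocByte), the condition the sweep above reports as a zombie.
func hasZombie(markByte, allocByte uint8, firstCheck uint) bool {
	return (markByte&^allocByte)>>firstCheck != 0
}

// For example, with object 3 marked but only objects 0-2 allocated:
// hasZombie(0b00001000, 0b00000111, 0) == true.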
 // deductSweepCredit deducts sweep credit for allocating a span of
 // size spanBytes. This must be performed *before* the span is
 // allocated to ensure the system has enough credit. If necessary, it
diff --git a/src/runtime/mgcsweepbuf.go b/src/runtime/mgcsweepbuf.go
index 0491f7c..1f722c3 100644
--- a/src/runtime/mgcsweepbuf.go
+++ b/src/runtime/mgcsweepbuf.go
@@ -111,8 +111,9 @@
 		unlock(&b.spineLock)
 	}
 
-	// We have a block. Insert the span.
-	block.spans[bottom] = s
+	// We have a block. Insert the span atomically, since there may be
+	// concurrent readers via the block API.
+	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s))
 }
 
 // pop removes and returns a span from buffer b, or nil if b is empty.
@@ -143,11 +144,13 @@
 // intervening pops. Spans that are pushed after the call may also
 // appear in these blocks.
 func (b *gcSweepBuf) numBlocks() int {
-	return int((atomic.Load(&b.index) + gcSweepBlockEntries - 1) / gcSweepBlockEntries)
+	return int(divRoundUp(uintptr(atomic.Load(&b.index)), gcSweepBlockEntries))
 }
 
 // block returns the spans in the i'th block of buffer b. block is
-// safe to call concurrently with push.
+// safe to call concurrently with push. The block may contain nil
+// pointers that must be ignored, and each entry in the block must be
+// loaded atomically.
 func (b *gcSweepBuf) block(i int) []*mspan {
 	// Perform bounds check before loading spine address since
 	// push ensures the allocated length is at least spineLen.
@@ -169,11 +172,5 @@
 	} else {
 		spans = block.spans[:bottom]
 	}
-
-	// push may have reserved a slot but not filled it yet, so
-	// trim away unused entries.
-	for len(spans) > 0 && spans[len(spans)-1] == nil {
-		spans = spans[:len(spans)-1]
-	}
 	return spans
 }
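A sketch of the publication rule the two changes above establish for gcSweepBuf blocks: writers store each slot atomically and readers load it atomically, tolerating nil for slots that were reserved but not yet filled. The exported sync/atomic pointer helpers stand in for the runtime-internal ones.

package bufsketch

import (
	"sync/atomic"
	"unsafe"
)

type span struct{ base uintptr }

// block holds a fixed number of slots written by a pusher and read
// concurrently by scanners.
type block struct {
	slots [512]unsafe.Pointer // each holds a *span, accessed atomically
}

// publish makes the span visible to concurrent readers of this slot.
func (b *block) publish(i int, s *span) {
	atomic.StorePointer(&b.slots[i], unsafe.Pointer(s))
}

// load returns the span in slot i, or nil if it hasn't been published.
func (b *block) load(i int) *span {
	return (*span)(atomic.LoadPointer(&b.slots[i]))
}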
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index f2c16d7..4610165 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -126,12 +126,12 @@
 	if debugCachedWork {
 		alreadyFailed := w.putGen == w.pauseGen
 		w.putGen = w.pauseGen
-		if m := getg().m; m.locks > 0 || m.mallocing != 0 || m.preemptoff != "" || m.p.ptr().status != _Prunning {
+		if !canPreemptM(getg().m) {
 			// If we were to spin, the runtime may
-			// deadlock: the condition above prevents
-			// preemption (see newstack), which could
-			// prevent gcMarkDone from finishing the
-			// ragged barrier and releasing the spin.
+			// deadlock. Since we can't be preempted, the
+			// spin could prevent gcMarkDone from
+			// finishing the ragged barrier, which is what
+			// releases us from the spin.
 			return
 		}
 		for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
@@ -178,6 +178,10 @@
 
 	flushed := false
 	wbuf := w.wbuf1
+	// Record that this may acquire the wbufSpans or heap lock to
+	// allocate a workbuf.
+	lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans)
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if wbuf == nil {
 		w.init()
 		wbuf = w.wbuf1
@@ -423,6 +427,10 @@
 			b.checkempty()
 		}
 	}
+	// Record that this may acquire the wbufSpans or heap lock to
+	// allocate a workbuf.
+	lockWithRankMayAcquire(&work.wbufSpans.lock, lockRankWbufSpans)
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if b == nil {
 		// Allocate more workbufs.
 		var s *mspan
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 706603a..2c7bfd8 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -15,10 +15,45 @@
 	"unsafe"
 )
 
-// minPhysPageSize is a lower-bound on the physical page size. The
-// true physical page size may be larger than this. In contrast,
-// sys.PhysPageSize is an upper-bound on the physical page size.
-const minPhysPageSize = 4096
+const (
+	// minPhysPageSize is a lower-bound on the physical page size. The
+	// true physical page size may be larger than this. In contrast,
+	// sys.PhysPageSize is an upper-bound on the physical page size.
+	minPhysPageSize = 4096
+
+	// maxPhysPageSize is the maximum page size the runtime supports.
+	maxPhysPageSize = 512 << 10
+
+	// maxPhysHugePageSize sets an upper-bound on the maximum huge page size
+	// that the runtime supports.
+	maxPhysHugePageSize = pallocChunkBytes
+
+	// pagesPerReclaimerChunk indicates how many pages to scan from the
+	// pageInUse bitmap at a time. Used by the page reclaimer.
+	//
+	// Higher values reduce contention on scanning indexes (such as
+	// h.reclaimIndex), but increase the minimum latency of the
+	// operation.
+	//
+	// The time required to scan this many pages can vary a lot depending
+	// on how many spans are actually freed. Experimentally, it can
+	// scan for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only
+	// free spans at ~32 MB/ms. Using 512 pages bounds this at
+	// roughly 100µs.
+	//
+	// Must be a multiple of the pageInUse bitmap element size and
+	// must also evenly divide pagesPerArena.
+	pagesPerReclaimerChunk = 512
+
+	// go115NewMCentralImpl is a feature flag for the new mcentral implementation.
+	//
+	// This flag depends on go115NewMarkrootSpans because the new mcentral
+	// implementation requires that markroot spans no longer rely on mgcsweepbufs.
+	// The definition of this flag helps ensure that if there's a problem with
+	// the new markroot spans implementation and it gets turned off, the new
+	// mcentral implementation also gets turned off so the runtime isn't broken.
+	go115NewMCentralImpl = true && go115NewMarkrootSpans
+)
 
 // Main malloc heap.
 // The heap itself is the "free" and "scav" treaps,
@@ -32,10 +67,10 @@
 	// lock must only be acquired on the system stack, otherwise a g
 	// could self-deadlock if its stack grows with the lock held.
 	lock      mutex
-	free      mTreap // free spans
-	sweepgen  uint32 // sweep generation, see comment in mspan
-	sweepdone uint32 // all spans are swept
-	sweepers  uint32 // number of active sweepone calls
+	pages     pageAlloc // page allocation data structure
+	sweepgen  uint32    // sweep generation, see comment in mspan; written during STW
+	sweepdone uint32    // all spans are swept
+	sweepers  uint32    // number of active sweepone calls
 
 	// allspans is a slice of all mspans ever created. Each mspan
 	// appears exactly once.
@@ -59,6 +94,8 @@
 	// unswept stack and pushes spans that are still in-use on the
 	// swept stack. Likewise, allocating an in-use span pushes it
 	// on the swept stack.
+	//
+	// For !go115NewMCentralImpl.
 	sweepSpans [2]gcSweepBuf
 
 	_ uint32 // align uint64 fields on 32-bit for atomics
@@ -81,7 +118,7 @@
 	// accounting for current progress. If we could only adjust
 	// the slope, it would create a discontinuity in debt if any
 	// progress has already been made.
-	pagesInUse         uint64  // pages of spans in stats mSpanInUse; R/W with mheap.lock
+	pagesInUse         uint64  // pages of spans in stats mSpanInUse; updated atomically
 	pagesSwept         uint64  // pages swept this cycle; updated atomically
 	pagesSweptBasis    uint64  // pagesSwept to use as the origin of the sweep ratio; updated atomically
 	sweepHeapLiveBasis uint64  // value of heap_live to use as the origin of sweep ratio; written with lock, read without
@@ -89,24 +126,10 @@
 	// TODO(austin): pagesInUse should be a uintptr, but the 386
 	// compiler can't 8-byte align fields.
 
-	// Scavenger pacing parameters
-	//
-	// The two basis parameters and the scavenge ratio parallel the proportional
-	// sweeping implementation, the primary differences being that:
-	//  * Scavenging concerns itself with RSS, estimated as heapRetained()
-	//  * Rather than pacing the scavenger to the GC, it is paced to a
-	//    time-based rate computed in gcPaceScavenger.
-	//
-	// scavengeRetainedGoal represents our goal RSS.
-	//
-	// All fields must be accessed with lock.
-	//
-	// TODO(mknyszek): Consider abstracting the basis fields and the scavenge ratio
-	// into its own type so that this logic may be shared with proportional sweeping.
-	scavengeTimeBasis     int64
-	scavengeRetainedBasis uint64
-	scavengeBytesPerNS    float64
-	scavengeRetainedGoal  uint64
+	// scavengeGoal is the amount of total retained heap memory (measured by
+	// heapRetained) that the runtime will try to maintain by returning memory
+	// to the OS.
+	scavengeGoal uint64
 
 	// Page reclaimer state
 
@@ -185,7 +208,19 @@
 	// simply blocking GC (by disabling preemption).
 	sweepArenas []arenaIdx
 
-	_ uint32 // ensure 64-bit alignment of central
+	// markArenas is a snapshot of allArenas taken at the beginning
+	// of the mark cycle. Because allArenas is append-only, neither
+	// this slice nor its contents will change during the mark, so
+	// it can be read safely.
+	markArenas []arenaIdx
+
+	// curArena is the arena that the heap is currently growing
+	// into. This should always be physPageSize-aligned.
+	curArena struct {
+		base, end uintptr
+	}
+
+	// _ uint32 // ensure 64-bit alignment of central
 
 	// central free lists for small size classes.
 	// the padding makes sure that the mcentrals are
@@ -199,7 +234,6 @@
 
 	spanalloc             fixalloc // allocator for span*
 	cachealloc            fixalloc // allocator for mcache*
-	treapalloc            fixalloc // allocator for treapNodes*
 	specialfinalizeralloc fixalloc // allocator for specialfinalizer*
 	specialprofilealloc   fixalloc // allocator for specialprofile*
 	speciallock           mutex    // lock for special record allocators.
@@ -213,10 +247,6 @@
 // A heapArena stores metadata for a heap arena. heapArenas are stored
 // outside of the Go heap and accessed via the mheap_.arenas index.
 //
-// This gets allocated directly from the OS, so ideally it should be a
-// multiple of the system page size. For example, avoid adding small
-// fields.
-//
 //go:notinheap
 type heapArena struct {
 	// bitmap stores the pointer/scalar bitmap for the words in
@@ -242,7 +272,7 @@
 	// but only the bit corresponding to the first page in each
 	// span is used.
 	//
-	// Writes are protected by mheap_.lock.
+	// Reads and writes are atomic.
 	pageInUse [pagesPerArena / 8]uint8
 
 	// pageMarks is a bitmap that indicates which spans have any
@@ -259,6 +289,28 @@
 	// faster scanning, but we don't have 64-bit atomic bit
 	// operations.
 	pageMarks [pagesPerArena / 8]uint8
+
+	// pageSpecials is a bitmap that indicates which spans have
+	// specials (finalizers or other). Like pageInUse, only the bit
+	// corresponding to the first page in each span is used.
+	//
+	// Writes are done atomically whenever a special is added to
+	// a span and whenever the last special is removed from a span.
+	// Reads are done atomically to find spans containing specials
+	// during marking.
+	pageSpecials [pagesPerArena / 8]uint8
+
+	// zeroedBase marks the first byte of the first page in this
+	// arena which hasn't been used yet and is therefore already
+	// zero. zeroedBase is relative to the arena base.
+	// Increases monotonically until it hits heapArenaBytes.
+	//
+	// This field is sufficient to determine if an allocation
+	// needs to be zeroed because the page allocator follows an
+	// address-ordered first-fit policy.
+	//
+	// Read atomically and written with an atomic CAS.
+	zeroedBase uintptr
 }
 
 // arenaHint is a hint for where to grow the heap arenas. See
@@ -298,13 +350,20 @@
 // * During GC (gcphase != _GCoff), a span *must not* transition from
 //   manual or in-use to free. Because concurrent GC may read a pointer
 //   and then look up its span, the span state must be monotonic.
+//
+// Setting mspan.state to mSpanInUse or mSpanManual must be done
+// atomically and only after all other span fields are valid.
+// Likewise, if inspecting a span is contingent on it being
+// mSpanInUse, the state should be loaded atomically and checked
+// before depending on other fields. This allows the garbage collector
+// to safely deal with potentially invalid pointers, since resolving
+// such pointers may race with a span being allocated.
 type mSpanState uint8
 
 const (
 	mSpanDead   mSpanState = iota
 	mSpanInUse             // allocated for garbage collected heap
 	mSpanManual            // allocated for manual management (e.g., stack allocator)
-	mSpanFree
 )
 
 // mSpanStateNames are the names of the span states, indexed by
@@ -316,6 +375,21 @@
 	"mSpanFree",
 }
 
+// mSpanStateBox holds an mSpanState and provides atomic operations on
+// it. This is a separate type to disallow accidental comparison or
+// assignment with mSpanState.
+type mSpanStateBox struct {
+	s mSpanState
+}
+
+func (b *mSpanStateBox) set(s mSpanState) {
+	atomic.Store8((*uint8)(&b.s), uint8(s))
+}
+
+func (b *mSpanStateBox) get() mSpanState {
+	return mSpanState(atomic.Load8((*uint8)(&b.s)))
+}
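A user-land sketch of the box pattern above: keep the state reachable only through atomic get and set so it cannot be compared or assigned directly. The exported sync/atomic package has no 8-bit operations, so this stand-in uses a uint32.

package spansketch

import "sync/atomic"

type spanState uint32

const (
	stateDead spanState = iota
	stateInUse
	stateManual
)

// stateBox hides the raw value; all access goes through atomic
// load and store, mirroring mSpanStateBox.
type stateBox struct {
	v uint32
}

func (b *stateBox) set(s spanState) { atomic.StoreUint32(&b.v, uint32(s)) }
func (b *stateBox) get() spanState  { return spanState(atomic.LoadUint32(&b.v)) }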
+
 // mSpanList heads a linked list of spans.
 //
 //go:notinheap
@@ -397,19 +471,18 @@
 	// h->sweepgen is incremented by 2 after every GC
 
 	sweepgen    uint32
-	divMul      uint16     // for divide by elemsize - divMagic.mul
-	baseMask    uint16     // if non-0, elemsize is a power of 2, & this will get object allocation base
-	allocCount  uint16     // number of allocated objects
-	spanclass   spanClass  // size class and noscan (uint8)
-	state       mSpanState // mspaninuse etc
-	needzero    uint8      // needs to be zeroed before allocation
-	divShift    uint8      // for divide by elemsize - divMagic.shift
-	divShift2   uint8      // for divide by elemsize - divMagic.shift2
-	scavenged   bool       // whether this span has had its pages released to the OS
-	elemsize    uintptr    // computed from sizeclass or from npages
-	limit       uintptr    // end of data in span
-	speciallock mutex      // guards specials list
-	specials    *special   // linked list of special records sorted by offset.
+	divMul      uint16        // for divide by elemsize - divMagic.mul
+	baseMask    uint16        // if non-0, elemsize is a power of 2, & this will get object allocation base
+	allocCount  uint16        // number of allocated objects
+	spanclass   spanClass     // size class and noscan (uint8)
+	state       mSpanStateBox // mSpanInUse etc; accessed atomically (get/set methods)
+	needzero    uint8         // needs to be zeroed before allocation
+	divShift    uint8         // for divide by elemsize - divMagic.shift
+	divShift2   uint8         // for divide by elemsize - divMagic.shift2
+	elemsize    uintptr       // computed from sizeclass or from npages
+	limit       uintptr       // end of data in span
+	speciallock mutex         // guards specials list
+	specials    *special      // linked list of special records sorted by offset.
 }
 
 func (s *mspan) base() uintptr {
@@ -425,181 +498,6 @@
 	return
 }
 
-// physPageBounds returns the start and end of the span
-// rounded in to the physical page size.
-func (s *mspan) physPageBounds() (uintptr, uintptr) {
-	start := s.base()
-	end := start + s.npages<<_PageShift
-	if physPageSize > _PageSize {
-		// Round start and end in.
-		start = (start + physPageSize - 1) &^ (physPageSize - 1)
-		end &^= physPageSize - 1
-	}
-	return start, end
-}
-
-func (h *mheap) coalesce(s *mspan) {
-	// merge is a helper which merges other into s, deletes references to other
-	// in heap metadata, and then discards it. other must be adjacent to s.
-	merge := func(a, b, other *mspan) {
-		// Caller must ensure a.startAddr < b.startAddr and that either a or
-		// b is s. a and b must be adjacent. other is whichever of the two is
-		// not s.
-
-		if pageSize < physPageSize && a.scavenged && b.scavenged {
-			// If we're merging two scavenged spans on systems where
-			// pageSize < physPageSize, then their boundary should always be on
-			// a physical page boundary, due to the realignment that happens
-			// during coalescing. Throw if this case is no longer true, which
-			// means the implementation should probably be changed to scavenge
-			// along the boundary.
-			_, start := a.physPageBounds()
-			end, _ := b.physPageBounds()
-			if start != end {
-				println("runtime: a.base=", hex(a.base()), "a.npages=", a.npages)
-				println("runtime: b.base=", hex(b.base()), "b.npages=", b.npages)
-				println("runtime: physPageSize=", physPageSize, "pageSize=", pageSize)
-				throw("neighboring scavenged spans boundary is not a physical page boundary")
-			}
-		}
-
-		// Adjust s via base and npages and also in heap metadata.
-		s.npages += other.npages
-		s.needzero |= other.needzero
-		if a == s {
-			h.setSpan(s.base()+s.npages*pageSize-1, s)
-		} else {
-			s.startAddr = other.startAddr
-			h.setSpan(s.base(), s)
-		}
-
-		// The size is potentially changing so the treap needs to delete adjacent nodes and
-		// insert back as a combined node.
-		h.free.removeSpan(other)
-		other.state = mSpanDead
-		h.spanalloc.free(unsafe.Pointer(other))
-	}
-
-	// realign is a helper which shrinks other and grows s such that their
-	// boundary is on a physical page boundary.
-	realign := func(a, b, other *mspan) {
-		// Caller must ensure a.startAddr < b.startAddr and that either a or
-		// b is s. a and b must be adjacent. other is whichever of the two is
-		// not s.
-
-		// If pageSize >= physPageSize then spans are always aligned
-		// to physical page boundaries, so just exit.
-		if pageSize >= physPageSize {
-			return
-		}
-		// Since we're resizing other, we must remove it from the treap.
-		h.free.removeSpan(other)
-
-		// Round boundary to the nearest physical page size, toward the
-		// scavenged span.
-		boundary := b.startAddr
-		if a.scavenged {
-			boundary &^= (physPageSize - 1)
-		} else {
-			boundary = (boundary + physPageSize - 1) &^ (physPageSize - 1)
-		}
-		a.npages = (boundary - a.startAddr) / pageSize
-		b.npages = (b.startAddr + b.npages*pageSize - boundary) / pageSize
-		b.startAddr = boundary
-
-		h.setSpan(boundary-1, a)
-		h.setSpan(boundary, b)
-
-		// Re-insert other now that it has a new size.
-		h.free.insert(other)
-	}
-
-	hpMiddle := s.hugePages()
-
-	// Coalesce with earlier, later spans.
-	var hpBefore uintptr
-	if before := spanOf(s.base() - 1); before != nil && before.state == mSpanFree {
-		if s.scavenged == before.scavenged {
-			hpBefore = before.hugePages()
-			merge(before, s, before)
-		} else {
-			realign(before, s, before)
-		}
-	}
-
-	// Now check to see if next (greater addresses) span is free and can be coalesced.
-	var hpAfter uintptr
-	if after := spanOf(s.base() + s.npages*pageSize); after != nil && after.state == mSpanFree {
-		if s.scavenged == after.scavenged {
-			hpAfter = after.hugePages()
-			merge(s, after, after)
-		} else {
-			realign(s, after, after)
-		}
-	}
-	if !s.scavenged && s.hugePages() > hpBefore+hpMiddle+hpAfter {
-		// If s has grown such that it now may contain more huge pages than it
-		// and its now-coalesced neighbors did before, then mark the whole region
-		// as huge-page-backable.
-		//
-		// Otherwise, on systems where we break up huge pages (like Linux)
-		// s may not be backed by huge pages because it could be made up of
-		// pieces which are broken up in the underlying VMA. The primary issue
-		// with this is that it can lead to a poor estimate of the amount of
-		// free memory backed by huge pages for determining the scavenging rate.
-		//
-		// TODO(mknyszek): Measure the performance characteristics of sysHugePage
-		// and determine whether it makes sense to only sysHugePage on the pages
-		// that matter, or if it's better to just mark the whole region.
-		sysHugePage(unsafe.Pointer(s.base()), s.npages*pageSize)
-	}
-}
-
-// hugePages returns the number of aligned physical huge pages in the memory
-// regioned owned by this mspan.
-func (s *mspan) hugePages() uintptr {
-	if physHugePageSize == 0 || s.npages < physHugePageSize/pageSize {
-		return 0
-	}
-	start := s.base()
-	end := start + s.npages*pageSize
-	if physHugePageSize > pageSize {
-		// Round start and end in.
-		start = (start + physHugePageSize - 1) &^ (physHugePageSize - 1)
-		end &^= physHugePageSize - 1
-	}
-	if start < end {
-		return (end - start) >> physHugePageShift
-	}
-	return 0
-}
-
-func (s *mspan) scavenge() uintptr {
-	// start and end must be rounded in, otherwise madvise
-	// will round them *out* and release more memory
-	// than we want.
-	start, end := s.physPageBounds()
-	if end <= start {
-		// start and end don't span a whole physical page.
-		return 0
-	}
-	released := end - start
-	memstats.heap_released += uint64(released)
-	s.scavenged = true
-	sysUnused(unsafe.Pointer(start), released)
-	return released
-}
-
-// released returns the number of bytes in this span
-// which were returned back to the OS.
-func (s *mspan) released() uintptr {
-	if !s.scavenged {
-		return 0
-	}
-	start, end := s.physPageBounds()
-	return end - start
-}
-
 // recordspan adds a newly allocated span to h.allspans.
 //
 // This only happens the first time a span is allocated from
@@ -678,13 +576,13 @@
 //
 //go:nosplit
 func arenaIndex(p uintptr) arenaIdx {
-	return arenaIdx((p + arenaBaseOffset) / heapArenaBytes)
+	return arenaIdx((p - arenaBaseOffset) / heapArenaBytes)
 }
 
 // arenaBase returns the low address of the region covered by heap
 // arena i.
 func arenaBase(i arenaIdx) uintptr {
-	return uintptr(i)*heapArenaBytes - arenaBaseOffset
+	return uintptr(i)*heapArenaBytes + arenaBaseOffset
 }
 
 type arenaIdx uint
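The index math above flips the sign convention for arenaBaseOffset; a small sketch of why the pair stays consistent, with made-up constants (real values are platform-specific, and the real runtime also relies on unsigned wraparound for addresses below the offset):

package arenasketch

const (
	heapArenaBytes  = 1 << 26 // assumption for this 64-bit-only sketch
	arenaBaseOffset = 1 << 47 // assumption for this 64-bit-only sketch
)

// For any p >= arenaBaseOffset, subtracting the offset before dividing
// and adding it back after multiplying round-trips:
//
//	arenaBase(arenaIndex(p)) <= p < arenaBase(arenaIndex(p))+heapArenaBytes
func arenaIndex(p uintptr) uintptr { return (p - arenaBaseOffset) / heapArenaBytes }
func arenaBase(i uintptr) uintptr  { return i*heapArenaBytes + arenaBaseOffset }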
@@ -726,7 +624,7 @@
 	if s == nil || b < s.base() {
 		return false
 	}
-	switch s.state {
+	switch s.state.get() {
 	case mSpanInUse, mSpanManual:
 		return b < s.limit
 	default:
@@ -793,9 +691,12 @@
 //go:nosplit
 func spanOfHeap(p uintptr) *mspan {
 	s := spanOf(p)
-	// If p is not allocated, it may point to a stale span, so we
-	// have to check the span's bounds and state.
-	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
+	// s is nil if it's never been allocated. Otherwise, we check
+	// its state first because we don't trust this pointer, so we
+	// have to synchronize with span initialization. Then, it's
+	// still possible we picked up a stale span pointer, so we
+	// have to check the span's bounds.
+	if s == nil || s.state.get() != mSpanInUse || p < s.base() || p >= s.limit {
 		return nil
 	}
 	return s
@@ -813,7 +714,11 @@
 
 // Initialize the heap.
 func (h *mheap) init() {
-	h.treapalloc.init(unsafe.Sizeof(treapNode{}), nil, nil, &memstats.other_sys)
+	lockInit(&h.lock, lockRankMheap)
+	lockInit(&h.sweepSpans[0].spineLock, lockRankSpine)
+	lockInit(&h.sweepSpans[1].spineLock, lockRankSpine)
+	lockInit(&h.speciallock, lockRankMheapSpecial)
+
 	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
 	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
 	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
@@ -834,6 +739,8 @@
 	for i := range h.central {
 		h.central[i].mcentral.init(spanClass(i))
 	}
+
+	h.pages.init(&h.lock, &memstats.gc_sys)
 }
 
 // reclaim sweeps and reclaims at least npage pages into the heap.
@@ -843,23 +750,10 @@
 //
 // h must NOT be locked.
 func (h *mheap) reclaim(npage uintptr) {
-	// This scans pagesPerChunk at a time. Higher values reduce
-	// contention on h.reclaimPos, but increase the minimum
-	// latency of performing a reclaim.
-	//
-	// Must be a multiple of the pageInUse bitmap element size.
-	//
-	// The time required by this can vary a lot depending on how
-	// many spans are actually freed. Experimentally, it can scan
-	// for pages at ~300 GB/ms on a 2.6GHz Core i7, but can only
-	// free spans at ~32 MB/ms. Using 512 pages bounds this at
-	// roughly 100µs.
-	//
 	// TODO(austin): Half of the time spent freeing spans is in
 	// locking/unlocking the heap (even with low contention). We
 	// could make the slow path here several times faster by
 	// batching heap frees.
-	const pagesPerChunk = 512
 
 	// Bail early if there's no more reclaim work.
 	if atomic.Load64(&h.reclaimIndex) >= 1<<63 {
@@ -892,7 +786,7 @@
 		}
 
 		// Claim a chunk of work.
-		idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerChunk) - pagesPerChunk)
+		idx := uintptr(atomic.Xadd64(&h.reclaimIndex, pagesPerReclaimerChunk) - pagesPerReclaimerChunk)
 		if idx/pagesPerArena >= uintptr(len(arenas)) {
 			// Page reclaiming is done.
 			atomic.Store64(&h.reclaimIndex, 1<<63)
@@ -906,7 +800,7 @@
 		}
 
 		// Scan this chunk.
-		nfound := h.reclaimChunk(arenas, idx, pagesPerChunk)
+		nfound := h.reclaimChunk(arenas, idx, pagesPerReclaimerChunk)
 		if nfound <= npage {
 			npage -= nfound
 		} else {
@@ -928,7 +822,9 @@
 // reclaimChunk sweeps unmarked spans that start at page indexes [pageIdx, pageIdx+n).
 // It returns the number of pages returned to the heap.
 //
-// h.lock must be held and the caller must be non-preemptible.
+// h.lock must be held and the caller must be non-preemptible. Note: h.lock may be
+// temporarily unlocked and re-locked in order to do sweeping or if tracing is
+// enabled.
 func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
 	// The heap lock must be held because this accesses the
 	// heapArena.spans arrays using potentially non-live pointers.
@@ -954,7 +850,7 @@
 		// Scan this bitmap chunk for spans that are in-use
 		// but have no marked objects on them.
 		for i := range inUse {
-			inUseUnmarked := inUse[i] &^ marked[i]
+			inUseUnmarked := atomic.Load8(&inUse[i]) &^ marked[i]
 			if inUseUnmarked == 0 {
 				continue
 			}
@@ -973,7 +869,7 @@
 						// spans were freed when we dropped the
 						// lock and we don't want to get stale
 						// pointers from the spans array.
-						inUseUnmarked = inUse[i] &^ marked[i]
+						inUseUnmarked = atomic.Load8(&inUse[i]) &^ marked[i]
 					}
 				}
 			}
@@ -984,106 +880,31 @@
 		n -= uintptr(len(inUse) * 8)
 	}
 	if trace.enabled {
+		unlock(&h.lock)
 		// Account for pages scanned but not reclaimed.
 		traceGCSweepSpan((n0 - nFreed) * pageSize)
+		lock(&h.lock)
 	}
 	return nFreed
 }
 
-// alloc_m is the internal implementation of mheap.alloc.
-//
-// alloc_m must run on the system stack because it locks the heap, so
-// any stack growth during alloc_m would self-deadlock.
-//
-//go:systemstack
-func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan {
-	_g_ := getg()
-
-	// To prevent excessive heap growth, before allocating n pages
-	// we need to sweep and reclaim at least n pages.
-	if h.sweepdone == 0 {
-		h.reclaim(npage)
-	}
-
-	lock(&h.lock)
-	// transfer stats from cache to global
-	memstats.heap_scan += uint64(_g_.m.mcache.local_scan)
-	_g_.m.mcache.local_scan = 0
-	memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
-	_g_.m.mcache.local_tinyallocs = 0
-
-	s := h.allocSpanLocked(npage, &memstats.heap_inuse)
-	if s != nil {
-		// Record span info, because gc needs to be
-		// able to map interior pointer to containing span.
-		atomic.Store(&s.sweepgen, h.sweepgen)
-		h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list.
-		s.state = mSpanInUse
-		s.allocCount = 0
-		s.spanclass = spanclass
-		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
-			s.elemsize = s.npages << _PageShift
-			s.divShift = 0
-			s.divMul = 0
-			s.divShift2 = 0
-			s.baseMask = 0
-		} else {
-			s.elemsize = uintptr(class_to_size[sizeclass])
-			m := &class_to_divmagic[sizeclass]
-			s.divShift = m.shift
-			s.divMul = m.mul
-			s.divShift2 = m.shift2
-			s.baseMask = m.baseMask
-		}
-
-		// Mark in-use span in arena page bitmap.
-		arena, pageIdx, pageMask := pageIndexOf(s.base())
-		arena.pageInUse[pageIdx] |= pageMask
-
-		// update stats, sweep lists
-		h.pagesInUse += uint64(npage)
-		if large {
-			memstats.heap_objects++
-			mheap_.largealloc += uint64(s.elemsize)
-			mheap_.nlargealloc++
-			atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
-		}
-	}
-	// heap_scan and heap_live were updated.
-	if gcBlackenEnabled != 0 {
-		gcController.revise()
-	}
-
-	if trace.enabled {
-		traceHeapAlloc()
-	}
-
-	// h.spans is accessed concurrently without synchronization
-	// from other threads. Hence, there must be a store/store
-	// barrier here to ensure the writes to h.spans above happen
-	// before the caller can publish a pointer p to an object
-	// allocated from s. As soon as this happens, the garbage
-	// collector running on another processor could read p and
-	// look up s in h.spans. The unlock acts as the barrier to
-	// order these writes. On the read side, the data dependency
-	// between p and the index in h.spans orders the reads.
-	unlock(&h.lock)
-	return s
-}
-
 // alloc allocates a new span of npage pages from the GC'd heap.
 //
-// Either large must be true or spanclass must indicates the span's
-// size class and scannability.
+// spanclass indicates the span's size class and scannability.
 //
 // If needzero is true, the memory for the returned span will be zeroed.
-func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan {
+func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan {
 	// Don't do any operations that lock the heap on the G stack.
 	// It might trigger stack growth, and the stack growth code needs
 	// to be able to allocate heap.
 	var s *mspan
 	systemstack(func() {
-		s = h.alloc_m(npage, spanclass, large)
+		// To prevent excessive heap growth, before allocating n pages
+		// we need to sweep and reclaim at least n pages.
+		if h.sweepdone == 0 {
+			h.reclaim(npages)
+		}
+		s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse)
 	})
 
 	if s != nil {
@@ -1105,35 +926,12 @@
 // The memory backing the returned span may not be zeroed if
 // span.needzero is set.
 //
-// allocManual must be called on the system stack because it acquires
-// the heap lock. See mheap for details.
+// allocManual must be called on the system stack because it may
+// acquire the heap lock via allocSpan. See mheap for details.
 //
 //go:systemstack
-func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan {
-	lock(&h.lock)
-	s := h.allocSpanLocked(npage, stat)
-	if s != nil {
-		s.state = mSpanManual
-		s.manualFreeList = 0
-		s.allocCount = 0
-		s.spanclass = 0
-		s.nelems = 0
-		s.elemsize = 0
-		s.limit = s.base() + s.npages<<_PageShift
-		// Manually managed memory doesn't count toward heap_sys.
-		memstats.heap_sys -= uint64(s.npages << _PageShift)
-	}
-
-	// This unlock acts as a release barrier. See mheap.alloc_m.
-	unlock(&h.lock)
-
-	return s
-}
-
-// setSpan modifies the span map so spanOf(base) is s.
-func (h *mheap) setSpan(base uintptr, s *mspan) {
-	ai := arenaIndex(base)
-	h.arenas[ai.l1()][ai.l2()].spans[(base/pageSize)%pagesPerArena] = s
+func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan {
+	return h.allocSpan(npages, true, 0, stat)
 }
 
 // setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
@@ -1152,94 +950,377 @@
 	}
 }
 
-// Allocates a span of the given size.  h must be locked.
-// The returned span has been removed from the
-// free structures, but its state is still mSpanFree.
-func (h *mheap) allocSpanLocked(npage uintptr, stat *uint64) *mspan {
-	t := h.free.find(npage)
-	if t.valid() {
-		goto HaveSpan
+// allocNeedsZero checks if the region of address space [base, base+npage*pageSize),
+// assumed to be allocated, needs to be zeroed, updating heap arena metadata for
+// future allocations.
+//
+// This must be called each time pages are allocated from the heap, even if the page
+// allocator can otherwise prove the memory it's allocating is already zero because
+// they're fresh from the operating system. It updates heapArena metadata that is
+// critical for future page allocations.
+//
+// There are no locking constraints on this method.
+func (h *mheap) allocNeedsZero(base, npage uintptr) (needZero bool) {
+	for npage > 0 {
+		ai := arenaIndex(base)
+		ha := h.arenas[ai.l1()][ai.l2()]
+
+		zeroedBase := atomic.Loaduintptr(&ha.zeroedBase)
+		arenaBase := base % heapArenaBytes
+		if arenaBase < zeroedBase {
+			// We extended into the non-zeroed part of the
+			// arena, so this region needs to be zeroed before use.
+			//
+			// zeroedBase is monotonically increasing, so if we see this now then
+			// we can be sure we need to zero this memory region.
+			//
+			// We still need to update zeroedBase for this arena, and
+			// potentially more arenas.
+			needZero = true
+		}
+		// We may observe arenaBase > zeroedBase if we're racing with one or more
+		// allocations which are acquiring memory directly before us in the address
+		// space. But, because we know no one else is acquiring *this* memory, it's
+		// still safe to not zero.
+
+		// Compute how far we extend into the arena, capped
+		// at heapArenaBytes.
+		arenaLimit := arenaBase + npage*pageSize
+		if arenaLimit > heapArenaBytes {
+			arenaLimit = heapArenaBytes
+		}
+		// Increase ha.zeroedBase so it's >= arenaLimit.
+		// We may be racing with other updates.
+		for arenaLimit > zeroedBase {
+			if atomic.Casuintptr(&ha.zeroedBase, zeroedBase, arenaLimit) {
+				break
+			}
+			zeroedBase = atomic.Loaduintptr(&ha.zeroedBase)
+			// Sanity check zeroedBase.
+			if zeroedBase <= arenaLimit && zeroedBase > arenaBase {
+				// The zeroedBase moved into the space we were trying to
+				// claim. That's very bad, and indicates someone allocated
+				// the same region we did.
+				throw("potentially overlapping in-use allocations detected")
+			}
+		}
+
+		// Move base forward and subtract from npage to move into
+		// the next arena, or finish.
+		base += arenaLimit - arenaBase
+		npage -= (arenaLimit - arenaBase) / pageSize
 	}
-	if !h.grow(npage) {
+	return
+}
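
allocNeedsZero is built around a per-arena "zeroed watermark": memory below zeroedBase may have been handed out before and must be zeroed, memory at or above it is fresh from the OS and known zero, and the watermark only ever moves up via CAS. A simplified, standalone sketch of that idea, assuming a single arena and ignoring the multi-arena loop (illustrative, not the runtime implementation):

package main

import (
	"fmt"
	"sync/atomic"
)

type arena struct {
	zeroedBase uintptr // offset within the arena; only ever grows
}

// needsZero reports whether [base, base+n) must be zeroed and advances
// the watermark so the region is never handed out as "known zero" again.
func (a *arena) needsZero(base, n uintptr) bool {
	limit := base + n
	zb := atomic.LoadUintptr(&a.zeroedBase)
	needs := base < zb // overlaps previously handed-out memory
	for limit > zb {
		if atomic.CompareAndSwapUintptr(&a.zeroedBase, zb, limit) {
			break
		}
		zb = atomic.LoadUintptr(&a.zeroedBase)
	}
	return needs
}

func main() {
	var a arena
	fmt.Println(a.needsZero(0, 4096))    // false: fresh, known-zero memory
	fmt.Println(a.needsZero(1024, 4096)) // true: overlaps the watermark
}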
+
+// tryAllocMSpan attempts to allocate an mspan object from
+// the P-local cache, but may fail.
+//
+// h need not be locked.
+//
+// The caller must ensure that its P won't change underneath
+// it during this function. Currently we enforce this by requiring
+// that the function run on the system stack, because that's the
+// only place it is used now. In the future, this requirement may
+// be relaxed if it needs to be used elsewhere.
+//
+//go:systemstack
+func (h *mheap) tryAllocMSpan() *mspan {
+	pp := getg().m.p.ptr()
+	// If we don't have a p or the cache is empty, we can't do
+	// anything here.
+	if pp == nil || pp.mspancache.len == 0 {
 		return nil
 	}
-	t = h.free.find(npage)
-	if t.valid() {
-		goto HaveSpan
+	// Pull off the last entry in the cache.
+	s := pp.mspancache.buf[pp.mspancache.len-1]
+	pp.mspancache.len--
+	return s
+}
+
+// allocMSpanLocked allocates an mspan object.
+//
+// h must be locked.
+//
+// allocMSpanLocked must be called on the system stack because
+// its caller holds the heap lock. See mheap for details.
+// Running on the system stack also ensures that we won't
+// switch Ps during this function. See tryAllocMSpan for details.
+//
+//go:systemstack
+func (h *mheap) allocMSpanLocked() *mspan {
+	pp := getg().m.p.ptr()
+	if pp == nil {
+		// We don't have a p so just do the normal thing.
+		return (*mspan)(h.spanalloc.alloc())
 	}
-	throw("grew heap, but no adequate free span found")
+	// Refill the cache if necessary.
+	if pp.mspancache.len == 0 {
+		const refillCount = len(pp.mspancache.buf) / 2
+		for i := 0; i < refillCount; i++ {
+			pp.mspancache.buf[i] = (*mspan)(h.spanalloc.alloc())
+		}
+		pp.mspancache.len = refillCount
+	}
+	// Pull off the last entry in the cache.
+	s := pp.mspancache.buf[pp.mspancache.len-1]
+	pp.mspancache.len--
+	return s
+}
+
+// freeMSpanLocked frees an mspan object.
+//
+// h must be locked.
+//
+// freeMSpanLocked must be called on the system stack because
+// its caller holds the heap lock. See mheap for details.
+// Running on the system stack also ensures that we won't
+// switch Ps during this function. See tryAllocMSpan for details.
+//
+//go:systemstack
+func (h *mheap) freeMSpanLocked(s *mspan) {
+	pp := getg().m.p.ptr()
+	// First try to free the mspan directly to the cache.
+	if pp != nil && pp.mspancache.len < len(pp.mspancache.buf) {
+		pp.mspancache.buf[pp.mspancache.len] = s
+		pp.mspancache.len++
+		return
+	}
+	// Failing that (or if we don't have a p), just free it to
+	// the heap.
+	h.spanalloc.free(unsafe.Pointer(s))
+}
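
tryAllocMSpan, allocMSpanLocked, and freeMSpanLocked together implement a small per-P object cache: a fixed-size stack of pre-allocated mspans, refilled to half capacity from the locked allocator when empty. A rough standalone sketch of the same pattern (the slowAlloc stand-in and the sizes are illustrative, not the runtime's):

package main

import "fmt"

type span struct{ id int }

type spanCache struct {
	buf [128]*span
	len int
}

var nextID int

// slowAlloc stands in for the locked fixalloc path.
func slowAlloc() *span {
	nextID++
	return &span{id: nextID}
}

// tryGet is the lock-free fast path: it only succeeds if the cache is non-empty.
func (c *spanCache) tryGet() *span {
	if c.len == 0 {
		return nil
	}
	c.len--
	return c.buf[c.len]
}

// getLocked refills the cache to half capacity when empty, then pops one entry.
func (c *spanCache) getLocked() *span {
	if c.len == 0 {
		refill := len(c.buf) / 2
		for i := 0; i < refill; i++ {
			c.buf[i] = slowAlloc()
		}
		c.len = refill
	}
	c.len--
	return c.buf[c.len]
}

// put returns an object to the cache; when full, the real pattern frees it
// back to the slow allocator instead (dropped here for brevity).
func (c *spanCache) put(s *span) {
	if c.len < len(c.buf) {
		c.buf[c.len] = s
		c.len++
	}
}

func main() {
	var c spanCache
	fmt.Println(c.tryGet() == nil) // true: cache starts empty
	s := c.getLocked()             // refills, then pops
	c.put(s)
	fmt.Println(c.tryGet() == s) // true: fast path now succeeds
}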
+
+// allocSpan allocates an mspan which owns npages worth of memory.
+//
+// If manual == false, allocSpan allocates a heap span of class spanclass
+// and updates heap accounting. If manual == true, allocSpan allocates a
+// manually-managed span (spanclass is ignored), and the caller is
+// responsible for any accounting related to its use of the span. Either
+// way, allocSpan will atomically add the bytes in the newly allocated
+// span to *sysStat.
+//
+// The returned span is fully initialized.
+//
+// h must not be locked.
+//
+// allocSpan must be called on the system stack both because it acquires
+// the heap lock and because it must block GC transitions.
+//
+//go:systemstack
+func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) {
+	// Function-global state.
+	gp := getg()
+	base, scav := uintptr(0), uintptr(0)
+
+	// If the allocation is small enough, try the page cache!
+	pp := gp.m.p.ptr()
+	if pp != nil && npages < pageCachePages/4 {
+		c := &pp.pcache
+
+		// If the cache is empty, refill it.
+		if c.empty() {
+			lock(&h.lock)
+			*c = h.pages.allocToCache()
+			unlock(&h.lock)
+		}
+
+		// Try to allocate from the cache.
+		base, scav = c.alloc(npages)
+		if base != 0 {
+			s = h.tryAllocMSpan()
+
+			if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) {
+				goto HaveSpan
+			}
+			// We're either running during GC, failed to acquire an mspan,
+			// or the allocation is for a large object. This means we
+			// have to lock the heap and do a bunch of extra work,
+			// so go down the HaveBaseLocked path.
+			//
+			// We must do this during GC to avoid skew with heap_scan
+			// since we flush mcache stats whenever we lock.
+			//
+			// TODO(mknyszek): It would be nice to not have to
+			// lock the heap if it's a large allocation, but
+			// it's fine for now. The critical section here is
+			// short and large object allocations are relatively
+			// infrequent.
+		}
+	}
+
+	// For one reason or another, we couldn't get the
+	// whole job done without the heap lock.
+	lock(&h.lock)
+
+	if base == 0 {
+		// Try to acquire a base address.
+		base, scav = h.pages.alloc(npages)
+		if base == 0 {
+			if !h.grow(npages) {
+				unlock(&h.lock)
+				return nil
+			}
+			base, scav = h.pages.alloc(npages)
+			if base == 0 {
+				throw("grew heap, but no adequate free space found")
+			}
+		}
+	}
+	if s == nil {
+		// We failed to get an mspan earlier, so grab
+		// one now that we have the heap lock.
+		s = h.allocMSpanLocked()
+	}
+	if !manual {
+		// This is a heap span, so we should do some additional accounting
+		// which may only be done with the heap locked.
+
+		// Transfer stats from mcache to global.
+		var c *mcache
+		if gp.m.p != 0 {
+			c = gp.m.p.ptr().mcache
+		} else {
+			// This case occurs while bootstrapping.
+			// See the similar code in mallocgc.
+			c = mcache0
+			if c == nil {
+				throw("mheap.allocSpan called with no P")
+			}
+		}
+		memstats.heap_scan += uint64(c.local_scan)
+		c.local_scan = 0
+		memstats.tinyallocs += uint64(c.local_tinyallocs)
+		c.local_tinyallocs = 0
+
+		// Do some additional accounting if it's a large allocation.
+		if spanclass.sizeclass() == 0 {
+			mheap_.largealloc += uint64(npages * pageSize)
+			mheap_.nlargealloc++
+			atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+		}
+
+		// Either heap_live or heap_scan could have been updated.
+		if gcBlackenEnabled != 0 {
+			gcController.revise()
+		}
+	}
+	unlock(&h.lock)
 
 HaveSpan:
-	s := t.span()
-	if s.state != mSpanFree {
-		throw("candidate mspan for allocation is not free")
+	// At this point, both s != nil and base != 0, and the heap
+	// lock is no longer held. Initialize the span.
+	s.init(base, npages)
+	if h.allocNeedsZero(base, npages) {
+		s.needzero = 1
 	}
-
-	// First, subtract any memory that was released back to
-	// the OS from s. We will add back what's left if necessary.
-	memstats.heap_released -= uint64(s.released())
-
-	if s.npages == npage {
-		h.free.erase(t)
-	} else if s.npages > npage {
-		// Trim off the lower bits and make that our new span.
-		// Do this in-place since this operation does not
-		// affect the original span's location in the treap.
-		n := (*mspan)(h.spanalloc.alloc())
-		h.free.mutate(t, func(s *mspan) {
-			n.init(s.base(), npage)
-			s.npages -= npage
-			s.startAddr = s.base() + npage*pageSize
-			h.setSpan(s.base()-1, n)
-			h.setSpan(s.base(), s)
-			h.setSpan(n.base(), n)
-			n.needzero = s.needzero
-			// n may not be big enough to actually be scavenged, but that's fine.
-			// We still want it to appear to be scavenged so that we can do the
-			// right bookkeeping later on in this function (i.e. sysUsed).
-			n.scavenged = s.scavenged
-			// Check if s is still scavenged.
-			if s.scavenged {
-				start, end := s.physPageBounds()
-				if start < end {
-					memstats.heap_released += uint64(end - start)
-				} else {
-					s.scavenged = false
-				}
-			}
-		})
-		s = n
+	nbytes := npages * pageSize
+	if manual {
+		s.manualFreeList = 0
+		s.nelems = 0
+		s.limit = s.base() + s.npages*pageSize
+		// Manually managed memory doesn't count toward heap_sys.
+		mSysStatDec(&memstats.heap_sys, s.npages*pageSize)
+		s.state.set(mSpanManual)
 	} else {
-		throw("candidate mspan for allocation is too small")
+		// We must set span properties before the span is published anywhere
+		// since we're not holding the heap lock.
+		s.spanclass = spanclass
+		if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
+			s.elemsize = nbytes
+			s.nelems = 1
+
+			s.divShift = 0
+			s.divMul = 0
+			s.divShift2 = 0
+			s.baseMask = 0
+		} else {
+			s.elemsize = uintptr(class_to_size[sizeclass])
+			s.nelems = nbytes / s.elemsize
+
+			m := &class_to_divmagic[sizeclass]
+			s.divShift = m.shift
+			s.divMul = m.mul
+			s.divShift2 = m.shift2
+			s.baseMask = m.baseMask
+		}
+
+		// Initialize mark and allocation structures.
+		s.freeindex = 0
+		s.allocCache = ^uint64(0) // all 1s indicating all free.
+		s.gcmarkBits = newMarkBits(s.nelems)
+		s.allocBits = newAllocBits(s.nelems)
+
+		// It's safe to access h.sweepgen without the heap lock because it's
+		// only ever updated with the world stopped and we run on the
+		// systemstack which blocks a STW transition.
+		atomic.Store(&s.sweepgen, h.sweepgen)
+
+		// Now that the span is filled in, set its state. This
+		// is a publication barrier for the other fields in
+		// the span. While valid pointers into this span
+		// should never be visible until the span is returned,
+		// if the garbage collector finds an invalid pointer,
+		// access to the span may race with initialization of
+		// the span. We resolve this race by atomically
+		// setting the state after the span is fully
+		// initialized, and atomically checking the state in
+		// any situation where a pointer is suspect.
+		s.state.set(mSpanInUse)
 	}
-	// "Unscavenge" s only AFTER splitting so that
-	// we only sysUsed whatever we actually need.
-	if s.scavenged {
+
+	// Commit and account for any scavenged memory that the span now owns.
+	if scav != 0 {
 		// sysUsed all the pages that are actually available
-		// in the span. Note that we don't need to decrement
-		// heap_released since we already did so earlier.
-		sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
-		s.scavenged = false
+		// in the span since some of them might be scavenged.
+		sysUsed(unsafe.Pointer(base), nbytes)
+		mSysStatDec(&memstats.heap_released, scav)
+	}
+	// Update stats.
+	mSysStatInc(sysStat, nbytes)
+	mSysStatDec(&memstats.heap_idle, nbytes)
 
-		// Since we allocated out of a scavenged span, we just
-		// grew the RSS. Mitigate this by scavenging enough free
-		// space to make up for it but only if we need to.
+	// Publish the span in various locations.
+
+	// This is safe to call without the lock held because the slots
+	// related to this span will only ever be read or modified by
+	// this thread until pointers into the span are published (and
+	// we execute a publication barrier at the end of this function
+	// before that happens) or pageInUse is updated.
+	h.setSpans(s.base(), npages, s)
+
+	if !manual {
+		if !go115NewMCentralImpl {
+			// Add to swept in-use list.
+			//
+			// This publishes the span to root marking.
+			//
+			// h.sweepgen is guaranteed to only change during STW,
+			// and preemption is disabled in the page allocator.
+			h.sweepSpans[h.sweepgen/2%2].push(s)
+		}
+
+		// Mark in-use span in arena page bitmap.
 		//
-		// scavengeLocked may cause coalescing, so prevent
-		// coalescing with s by temporarily changing its state.
-		s.state = mSpanManual
-		h.scavengeIfNeededLocked(s.npages * pageSize)
-		s.state = mSpanFree
+		// This publishes the span to the page sweeper, so
+		// it's imperative that the span be completely initialized
+		// prior to this line.
+		arena, pageIdx, pageMask := pageIndexOf(s.base())
+		atomic.Or8(&arena.pageInUse[pageIdx], pageMask)
+
+		// Update related page sweeper stats.
+		atomic.Xadd64(&h.pagesInUse, int64(npages))
+
+		if trace.enabled {
+			// Trace that a heap alloc occurred.
+			traceHeapAlloc()
+		}
 	}
 
-	h.setSpans(s.base(), npage, s)
+	// Make sure the newly allocated span will be observed
+	// by the GC before pointers into the span are published.
+	publicationBarrier()
 
-	*stat += uint64(npage << _PageShift)
-	memstats.heap_idle -= uint64(npage << _PageShift)
-
-	if s.inList() {
-		throw("still in list")
-	}
 	return s
 }
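
Structurally, allocSpan is a lock-free fast path (per-P page cache plus per-P mspan cache) that falls back to the heap lock only for whatever it could not get, and finishes span initialization after dropping the lock. A much-reduced sketch of that shape, assuming a trivial bump allocator behind the lock (illustrative only):

package main

import (
	"fmt"
	"sync"
)

var (
	mu        sync.Mutex
	nextBase  uintptr = 0x1000
	perPCache []uintptr // stands in for the per-P page cache
)

// allocFast is the lock-free fast path; it may fail.
func allocFast() (uintptr, bool) {
	if n := len(perPCache); n > 0 {
		base := perPCache[n-1]
		perPCache = perPCache[:n-1]
		return base, true
	}
	return 0, false
}

func allocSpan(npages uintptr) uintptr {
	base, ok := allocFast()
	if !ok {
		mu.Lock() // slow path: shared state needs the lock
		base = nextBase
		nextBase += npages * 8192
		mu.Unlock()
	}
	// Finish initialization outside the lock, then publish.
	return base
}

func main() {
	fmt.Printf("%#x\n", allocSpan(4)) // slow path: 0x1000
	perPCache = append(perPCache, 0x9000)
	fmt.Printf("%#x\n", allocSpan(1)) // fast path: 0x9000
}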
 
@@ -1248,54 +1329,92 @@
 //
 // h must be locked.
 func (h *mheap) grow(npage uintptr) bool {
-	ask := npage << _PageShift
-	v, size := h.sysAlloc(ask)
-	if v == nil {
-		print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
-		return false
+	// We must grow the heap in whole palloc chunks.
+	ask := alignUp(npage, pallocChunkPages) * pageSize
+
+	totalGrowth := uintptr(0)
+	// This may overflow because ask could be very large
+	// and is otherwise unrelated to h.curArena.base.
+	end := h.curArena.base + ask
+	nBase := alignUp(end, physPageSize)
+	if nBase > h.curArena.end || /* overflow */ end < h.curArena.base {
+		// Not enough room in the current arena. Allocate more
+		// arena space. This may not be contiguous with the
+		// current arena, so we have to request the full ask.
+		av, asize := h.sysAlloc(ask)
+		if av == nil {
+			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
+			return false
+		}
+
+		if uintptr(av) == h.curArena.end {
+			// The new space is contiguous with the old
+			// space, so just extend the current space.
+			h.curArena.end = uintptr(av) + asize
+		} else {
+			// The new space is discontiguous. Track what
+			// remains of the current space and switch to
+			// the new space. This should be rare.
+			if size := h.curArena.end - h.curArena.base; size != 0 {
+				h.pages.grow(h.curArena.base, size)
+				totalGrowth += size
+			}
+			// Switch to the new space.
+			h.curArena.base = uintptr(av)
+			h.curArena.end = uintptr(av) + asize
+		}
+
+		// The memory just allocated counts as both released
+		// and idle, even though it's not yet backed by spans.
+		//
+		// The allocation is always aligned to the heap arena
+		// size which is always > physPageSize, so it's safe to
+		// just add directly to heap_released.
+		mSysStatInc(&memstats.heap_released, asize)
+		mSysStatInc(&memstats.heap_idle, asize)
+
+		// Recalculate nBase.
+		// We know this won't overflow, because sysAlloc returned
+		// a valid region starting at h.curArena.base which is at
+		// least ask bytes in size.
+		nBase = alignUp(h.curArena.base+ask, physPageSize)
 	}
 
-	// Create a fake "in use" span and free it, so that the
-	// right accounting and coalescing happens.
-	s := (*mspan)(h.spanalloc.alloc())
-	s.init(uintptr(v), size/pageSize)
-	h.setSpans(s.base(), s.npages, s)
-	s.state = mSpanFree
-	memstats.heap_idle += uint64(size)
-	// (*mheap).sysAlloc returns untouched/uncommitted memory.
-	s.scavenged = true
-	// s is always aligned to the heap arena size which is always > physPageSize,
-	// so its totally safe to just add directly to heap_released. Coalescing,
-	// if possible, will also always be correct in terms of accounting, because
-	// s.base() must be a physical page boundary.
-	memstats.heap_released += uint64(size)
-	h.coalesce(s)
-	h.free.insert(s)
+	// Grow into the current arena.
+	v := h.curArena.base
+	h.curArena.base = nBase
+	h.pages.grow(v, nBase-v)
+	totalGrowth += nBase - v
+
+	// We just caused a heap growth, so scavenge down what will soon be used.
+	// By scavenging inline we deal with the failure to allocate out of
+	// memory fragments by scavenging the memory fragments that are least
+	// likely to be re-used.
+	if retained := heapRetained(); retained+uint64(totalGrowth) > h.scavengeGoal {
+		todo := totalGrowth
+		if overage := uintptr(retained + uint64(totalGrowth) - h.scavengeGoal); todo > overage {
+			todo = overage
+		}
+		h.pages.scavenge(todo, false)
+	}
 	return true
 }
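
The rounding in grow is plain alignUp/alignDown arithmetic on page counts and addresses. A quick standalone check of that arithmetic, using pallocChunkPages = 512 from this patch and assuming the usual 8 KiB runtime page size:

package main

import "fmt"

// alignUp/alignDown round n up/down to a multiple of a (a must be a power of two).
func alignUp(n, a uintptr) uintptr   { return (n + a - 1) &^ (a - 1) }
func alignDown(n, a uintptr) uintptr { return n &^ (a - 1) }

func main() {
	const (
		pallocChunkPages = 512
		pageSize         = 8192
	)
	// Asking for 700 pages grows the heap by two whole chunks (1024 pages).
	npage := uintptr(700)
	ask := alignUp(npage, pallocChunkPages) * pageSize
	fmt.Println(ask/pageSize, "pages,", ask, "bytes")      // 1024 pages, 8388608 bytes
	fmt.Println(alignDown(uintptr(700), pallocChunkPages)) // 512
}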
 
 // Free the span back into the heap.
-//
-// large must match the value of large passed to mheap.alloc. This is
-// used for accounting.
-func (h *mheap) freeSpan(s *mspan, large bool) {
+func (h *mheap) freeSpan(s *mspan) {
 	systemstack(func() {
-		mp := getg().m
+		c := getg().m.p.ptr().mcache
 		lock(&h.lock)
-		memstats.heap_scan += uint64(mp.mcache.local_scan)
-		mp.mcache.local_scan = 0
-		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
-		mp.mcache.local_tinyallocs = 0
+		memstats.heap_scan += uint64(c.local_scan)
+		c.local_scan = 0
+		memstats.tinyallocs += uint64(c.local_tinyallocs)
+		c.local_tinyallocs = 0
 		if msanenabled {
 			// Tell msan that this entire span is no longer in use.
 			base := unsafe.Pointer(s.base())
 			bytes := s.npages << _PageShift
 			msanfree(base, bytes)
 		}
-		if large {
-			// Match accounting done in mheap.alloc.
-			memstats.heap_objects--
-		}
 		if gcBlackenEnabled != 0 {
 			// heap_scan changed.
 			gcController.revise()
@@ -1319,14 +1438,14 @@
 func (h *mheap) freeManual(s *mspan, stat *uint64) {
 	s.needzero = 1
 	lock(&h.lock)
-	*stat -= uint64(s.npages << _PageShift)
-	memstats.heap_sys += uint64(s.npages << _PageShift)
+	mSysStatDec(stat, s.npages*pageSize)
+	mSysStatInc(&memstats.heap_sys, s.npages*pageSize)
 	h.freeSpanLocked(s, false, true)
 	unlock(&h.lock)
 }
 
 func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) {
-	switch s.state {
+	switch s.state.get() {
 	case mSpanManual:
 		if s.allocCount != 0 {
 			throw("mheap.freeSpanLocked - invalid stack free")
@@ -1336,161 +1455,50 @@
 			print("mheap.freeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
 			throw("mheap.freeSpanLocked - invalid free")
 		}
-		h.pagesInUse -= uint64(s.npages)
+		atomic.Xadd64(&h.pagesInUse, -int64(s.npages))
 
 		// Clear in-use bit in arena page bitmap.
 		arena, pageIdx, pageMask := pageIndexOf(s.base())
-		arena.pageInUse[pageIdx] &^= pageMask
+		atomic.And8(&arena.pageInUse[pageIdx], ^pageMask)
 	default:
 		throw("mheap.freeSpanLocked - invalid span state")
 	}
 
 	if acctinuse {
-		memstats.heap_inuse -= uint64(s.npages << _PageShift)
+		mSysStatDec(&memstats.heap_inuse, s.npages*pageSize)
 	}
 	if acctidle {
-		memstats.heap_idle += uint64(s.npages << _PageShift)
+		mSysStatInc(&memstats.heap_idle, s.npages*pageSize)
 	}
-	s.state = mSpanFree
 
-	// Coalesce span with neighbors.
-	h.coalesce(s)
+	// Mark the space as free.
+	h.pages.free(s.base(), s.npages)
 
-	// Insert s into the treap.
-	h.free.insert(s)
+	// Free the span structure. We no longer have a use for it.
+	s.state.set(mSpanDead)
+	h.freeMSpanLocked(s)
 }
 
-// scavengeSplit takes t.span() and attempts to split off a span containing size
-// (in bytes) worth of physical pages from the back.
-//
-// The split point is only approximately defined by size since the split point
-// is aligned to physPageSize and pageSize every time. If physHugePageSize is
-// non-zero and the split point would break apart a huge page in the span, then
-// the split point is also aligned to physHugePageSize.
-//
-// If the desired split point ends up at the base of s, or if size is obviously
-// much larger than s, then a split is not possible and this method returns nil.
-// Otherwise if a split occurred it returns the newly-created span.
-func (h *mheap) scavengeSplit(t treapIter, size uintptr) *mspan {
-	s := t.span()
-	start, end := s.physPageBounds()
-	if end <= start || end-start <= size {
-		// Size covers the whole span.
-		return nil
-	}
-	// The span is bigger than what we need, so compute the base for the new
-	// span if we decide to split.
-	base := end - size
-	// Round down to the next physical or logical page, whichever is bigger.
-	base &^= (physPageSize - 1) | (pageSize - 1)
-	if base <= start {
-		return nil
-	}
-	if physHugePageSize > pageSize && base&^(physHugePageSize-1) >= start {
-		// We're in danger of breaking apart a huge page, so include the entire
-		// huge page in the bound by rounding down to the huge page size.
-		// base should still be aligned to pageSize.
-		base &^= physHugePageSize - 1
-	}
-	if base == start {
-		// After all that we rounded base down to s.base(), so no need to split.
-		return nil
-	}
-	if base < start {
-		print("runtime: base=", base, ", s.npages=", s.npages, ", s.base()=", s.base(), ", size=", size, "\n")
-		print("runtime: physPageSize=", physPageSize, ", physHugePageSize=", physHugePageSize, "\n")
-		throw("bad span split base")
-	}
-
-	// Split s in-place, removing from the back.
-	n := (*mspan)(h.spanalloc.alloc())
-	nbytes := s.base() + s.npages*pageSize - base
-	h.free.mutate(t, func(s *mspan) {
-		n.init(base, nbytes/pageSize)
-		s.npages -= nbytes / pageSize
-		h.setSpan(n.base()-1, s)
-		h.setSpan(n.base(), n)
-		h.setSpan(n.base()+nbytes-1, n)
-		n.needzero = s.needzero
-		n.state = s.state
-	})
-	return n
-}
-
-// scavengeLocked scavenges nbytes worth of spans in the free treap by
-// starting from the span with the highest base address and working down.
-// It then takes those spans and places them in scav.
-//
-// Returns the amount of memory scavenged in bytes. h must be locked.
-func (h *mheap) scavengeLocked(nbytes uintptr) uintptr {
-	released := uintptr(0)
-	// Iterate over spans with huge pages first, then spans without.
-	const mask = treapIterScav | treapIterHuge
-	for _, match := range []treapIterType{treapIterHuge, 0} {
-		// Iterate over the treap backwards (from highest address to lowest address)
-		// scavenging spans until we've reached our quota of nbytes.
-		for t := h.free.end(mask, match); released < nbytes && t.valid(); {
-			s := t.span()
-			start, end := s.physPageBounds()
-			if start >= end {
-				// This span doesn't cover at least one physical page, so skip it.
-				t = t.prev()
-				continue
-			}
-			n := t.prev()
-			if span := h.scavengeSplit(t, nbytes-released); span != nil {
-				s = span
-			} else {
-				h.free.erase(t)
-			}
-			released += s.scavenge()
-			// Now that s is scavenged, we must eagerly coalesce it
-			// with its neighbors to prevent having two spans with
-			// the same scavenged state adjacent to each other.
-			h.coalesce(s)
-			t = n
-			h.free.insert(s)
-		}
-	}
-	return released
-}
-
-// scavengeIfNeededLocked calls scavengeLocked if we're currently above the
-// scavenge goal in order to prevent the mutator from out-running the
-// the scavenger.
-//
-// h must be locked.
-func (h *mheap) scavengeIfNeededLocked(size uintptr) {
-	if r := heapRetained(); r+uint64(size) > h.scavengeRetainedGoal {
-		todo := uint64(size)
-		// If we're only going to go a little bit over, just request what
-		// we actually need done.
-		if overage := r + uint64(size) - h.scavengeRetainedGoal; overage < todo {
-			todo = overage
-		}
-		h.scavengeLocked(uintptr(todo))
-	}
-}
-
-// scavengeAll visits each node in the free treap and scavenges the
-// treapNode's span. It then removes the scavenged span from
-// unscav and adds it into scav before continuing.
+// scavengeAll acquires the heap lock (blocking any additional
+// manipulation of the page allocator) and iterates over the whole
+// heap, scavenging every free page available.
 func (h *mheap) scavengeAll() {
 	// Disallow malloc or panic while holding the heap lock. We do
-	// this here because this is an non-mallocgc entry-point to
+	// this here because this is a non-mallocgc entry-point to
 	// the mheap API.
 	gp := getg()
 	gp.m.mallocing++
 	lock(&h.lock)
-	released := h.scavengeLocked(^uintptr(0))
+	// Start a new scavenge generation so we have a chance to walk
+	// over the whole heap.
+	h.pages.scavengeStartGen()
+	released := h.pages.scavenge(^uintptr(0), false)
+	gen := h.pages.scav.gen
 	unlock(&h.lock)
 	gp.m.mallocing--
 
-	if debug.gctrace > 0 {
-		if released > 0 {
-			print("forced scvg: ", released>>20, " MB released\n")
-		}
-		print("forced scvg: inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
+	if debug.scavtrace > 0 {
+		printScavTrace(gen, released, true)
 	}
 }
 
@@ -1511,14 +1519,14 @@
 	span.allocCount = 0
 	span.spanclass = 0
 	span.elemsize = 0
-	span.state = mSpanDead
-	span.scavenged = false
 	span.speciallock.key = 0
 	span.specials = nil
 	span.needzero = 0
 	span.freeindex = 0
 	span.allocBits = nil
 	span.gcmarkBits = nil
+	span.state.set(mSpanDead)
+	lockInit(&span.speciallock, lockRankMspanSpecial)
 }
 
 func (span *mspan) inList() bool {
@@ -1632,6 +1640,22 @@
 	kind   byte     // kind of special
 }
 
+// spanHasSpecials marks a span as having specials in the arena bitmap.
+func spanHasSpecials(s *mspan) {
+	arenaPage := (s.base() / pageSize) % pagesPerArena
+	ai := arenaIndex(s.base())
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+	atomic.Or8(&ha.pageSpecials[arenaPage/8], uint8(1)<<(arenaPage%8))
+}
+
+// spanHasNoSpecials marks a span as having no specials in the arena bitmap.
+func spanHasNoSpecials(s *mspan) {
+	arenaPage := (s.base() / pageSize) % pagesPerArena
+	ai := arenaIndex(s.base())
+	ha := mheap_.arenas[ai.l1()][ai.l2()]
+	atomic.And8(&ha.pageSpecials[arenaPage/8], ^(uint8(1) << (arenaPage % 8)))
+}
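
spanHasSpecials and spanHasNoSpecials reduce to atomically setting and clearing a single bit in a bitmap. A standalone sketch of the same operation using CAS loops from sync/atomic (the runtime uses its internal atomic.Or8/And8 helpers instead; the uint32 word size here is purely for illustration):

package main

import (
	"fmt"
	"sync/atomic"
)

// setBit atomically sets bit i in a uint32-word bitmap using a CAS loop.
func setBit(bitmap []uint32, i uint) {
	w, mask := &bitmap[i/32], uint32(1)<<(i%32)
	for {
		old := atomic.LoadUint32(w)
		if old&mask != 0 || atomic.CompareAndSwapUint32(w, old, old|mask) {
			return
		}
	}
}

// clearBit atomically clears bit i.
func clearBit(bitmap []uint32, i uint) {
	w, mask := &bitmap[i/32], uint32(1)<<(i%32)
	for {
		old := atomic.LoadUint32(w)
		if old&mask == 0 || atomic.CompareAndSwapUint32(w, old, old&^mask) {
			return
		}
	}
}

func main() {
	bitmap := make([]uint32, 2)
	setBit(bitmap, 37)
	fmt.Printf("%032b\n", bitmap[1]) // bit 5 of word 1 set
	clearBit(bitmap, 37)
	fmt.Printf("%032b\n", bitmap[1]) // back to zero
}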
+
 // Adds the special record s to the list of special records for
 // the object p. All fields of s should be filled in except for
 // offset & next, which this routine will fill in.
@@ -1677,6 +1701,9 @@
 	s.offset = uint16(offset)
 	s.next = *t
 	*t = s
+	if go115NewMarkrootSpans {
+		spanHasSpecials(span)
+	}
 	unlock(&span.speciallock)
 	releasem(mp)
 
@@ -1700,6 +1727,7 @@
 
 	offset := uintptr(p) - span.base()
 
+	var result *special
 	lock(&span.speciallock)
 	t := &span.specials
 	for {
@@ -1711,15 +1739,17 @@
 		// "interior" specials (p must be exactly equal to s->offset).
 		if offset == uintptr(s.offset) && kind == s.kind {
 			*t = s.next
-			unlock(&span.speciallock)
-			releasem(mp)
-			return s
+			result = s
+			break
 		}
 		t = &s.next
 	}
+	if go115NewMarkrootSpans && span.specials == nil {
+		spanHasNoSpecials(span)
+	}
 	unlock(&span.speciallock)
 	releasem(mp)
-	return nil
+	return result
 }
 
 // The described object has a finalizer set for it.
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index b6fe701..6c7a4cf 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -194,7 +194,9 @@
 }
 
 func copyPPC64x(w io.Writer) {
-	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
+	// duffcopy is not used on PPC64.
+	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+	fmt.Fprintln(w, "\tUNDEF")
 }
 
 func tagsMIPS64x(w io.Writer) {
@@ -216,5 +218,13 @@
 }
 
 func copyMIPS64x(w io.Writer) {
-	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
+	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOVV\t(R1), R23")
+		fmt.Fprintln(w, "\tADDV\t$8, R1")
+		fmt.Fprintln(w, "\tMOVV\tR23, (R2)")
+		fmt.Fprintln(w, "\tADDV\t$8, R2")
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintln(w, "\tRET")
 }
diff --git a/src/runtime/mknacl.sh b/src/runtime/mknacl.sh
deleted file mode 100644
index 306ae3d..0000000
--- a/src/runtime/mknacl.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/usr/bin/env bash
-# Copyright 2013 The Go Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style
-# license that can be found in the LICENSE file.
-
-cat /Users/rsc/pub/native_client/src/trusted/service_runtime/include/bits/nacl_syscalls.h |
-	awk '
-	BEGIN {
-		printf("// Code generated by mknacl.sh; DO NOT EDIT.\n")
-	}
-	NF==3 && $1=="#define" && $2~/^NACL_sys_/ {
-		name=$2
-		sub(/^NACL_sys_/, "SYS_", name)
-		printf("#define %s %s\n", name, $3)
-	}' >syscall_nacl.h
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
new file mode 100644
index 0000000..1fe7766
--- /dev/null
+++ b/src/runtime/mkpreempt.go
@@ -0,0 +1,575 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// mkpreempt generates the asyncPreempt functions for each
+// architecture.
+package main
+
+import (
+	"flag"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"strings"
+)
+
+// Copied from cmd/compile/internal/ssa/gen/*Ops.go
+
+var regNames386 = []string{
+	"AX",
+	"CX",
+	"DX",
+	"BX",
+	"SP",
+	"BP",
+	"SI",
+	"DI",
+	"X0",
+	"X1",
+	"X2",
+	"X3",
+	"X4",
+	"X5",
+	"X6",
+	"X7",
+}
+
+var regNamesAMD64 = []string{
+	"AX",
+	"CX",
+	"DX",
+	"BX",
+	"SP",
+	"BP",
+	"SI",
+	"DI",
+	"R8",
+	"R9",
+	"R10",
+	"R11",
+	"R12",
+	"R13",
+	"R14",
+	"R15",
+	"X0",
+	"X1",
+	"X2",
+	"X3",
+	"X4",
+	"X5",
+	"X6",
+	"X7",
+	"X8",
+	"X9",
+	"X10",
+	"X11",
+	"X12",
+	"X13",
+	"X14",
+	"X15",
+}
+
+var out io.Writer
+
+var arches = map[string]func(){
+	"386":     gen386,
+	"amd64":   genAMD64,
+	"arm":     genARM,
+	"arm64":   genARM64,
+	"mips64x": func() { genMIPS(true) },
+	"mipsx":   func() { genMIPS(false) },
+	"ppc64x":  genPPC64,
+	"riscv64": genRISCV64,
+	"s390x":   genS390X,
+	"wasm":    genWasm,
+}
+var beLe = map[string]bool{"mips64x": true, "mipsx": true, "ppc64x": true}
+
+func main() {
+	flag.Parse()
+	if flag.NArg() > 0 {
+		out = os.Stdout
+		for _, arch := range flag.Args() {
+			gen, ok := arches[arch]
+			if !ok {
+				log.Fatalf("unknown arch %s", arch)
+			}
+			header(arch)
+			gen()
+		}
+		return
+	}
+
+	for arch, gen := range arches {
+		f, err := os.Create(fmt.Sprintf("preempt_%s.s", arch))
+		if err != nil {
+			log.Fatal(err)
+		}
+		out = f
+		header(arch)
+		gen()
+		if err := f.Close(); err != nil {
+			log.Fatal(err)
+		}
+	}
+}
+
+func header(arch string) {
+	fmt.Fprintf(out, "// Code generated by mkpreempt.go; DO NOT EDIT.\n\n")
+	if beLe[arch] {
+		base := arch[:len(arch)-1]
+		fmt.Fprintf(out, "// +build %s %sle\n\n", base, base)
+	}
+	fmt.Fprintf(out, "#include \"go_asm.h\"\n")
+	fmt.Fprintf(out, "#include \"textflag.h\"\n\n")
+	fmt.Fprintf(out, "TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0\n")
+}
+
+func p(f string, args ...interface{}) {
+	fmted := fmt.Sprintf(f, args...)
+	fmt.Fprintf(out, "\t%s\n", strings.Replace(fmted, "\n", "\n\t", -1))
+}
+
+func label(l string) {
+	fmt.Fprintf(out, "%s\n", l)
+}
+
+type layout struct {
+	stack int
+	regs  []regPos
+	sp    string // stack pointer register
+}
+
+type regPos struct {
+	pos int
+
+	op  string
+	reg string
+
+	// If this register requires special save and restore, these
+	// give those operations with a %d placeholder for the stack
+	// offset.
+	save, restore string
+}
+
+func (l *layout) add(op, reg string, size int) {
+	l.regs = append(l.regs, regPos{op: op, reg: reg, pos: l.stack})
+	l.stack += size
+}
+
+func (l *layout) addSpecial(save, restore string, size int) {
+	l.regs = append(l.regs, regPos{save: save, restore: restore, pos: l.stack})
+	l.stack += size
+}
+
+func (l *layout) save() {
+	for _, reg := range l.regs {
+		if reg.save != "" {
+			p(reg.save, reg.pos)
+		} else {
+			p("%s %s, %d(%s)", reg.op, reg.reg, reg.pos, l.sp)
+		}
+	}
+}
+
+func (l *layout) restore() {
+	for i := len(l.regs) - 1; i >= 0; i-- {
+		reg := l.regs[i]
+		if reg.restore != "" {
+			p(reg.restore, reg.pos)
+		} else {
+			p("%s %d(%s), %s", reg.op, reg.pos, l.sp, reg.reg)
+		}
+	}
+}
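
As a rough illustration of how the layout helper is driven, here is a hypothetical snippet (not part of the tool) that could sit in this file and emit paired save/restore sequences for two registers; the real generators below do the same thing with full register lists:

// exampleLayout is a hypothetical driver for the layout type above. It saves
// AX and BX around a call, emitting the same kind of paired MOV sequences the
// real generators produce.
func exampleLayout() {
	out = os.Stdout
	l := layout{sp: "SP"}
	l.add("MOVQ", "AX", 8)
	l.add("MOVQ", "BX", 8)
	p("ADJSP $%d", l.stack) // reserve 16 bytes of frame
	l.save()                // MOVQ AX, 0(SP); MOVQ BX, 8(SP)
	p("CALL ·asyncPreempt2(SB)")
	l.restore() // MOVQ 8(SP), BX; MOVQ 0(SP), AX
	p("ADJSP $%d", -l.stack)
}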
+
+func gen386() {
+	p("PUSHFL")
+
+	// Save general purpose registers.
+	var l = layout{sp: "SP"}
+	for _, reg := range regNames386 {
+		if reg == "SP" || strings.HasPrefix(reg, "X") {
+			continue
+		}
+		l.add("MOVL", reg, 4)
+	}
+
+	// Save the 387 state.
+	l.addSpecial(
+		"FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)",
+		"FRSTOR %d(SP)",
+		108)
+
+	// Save SSE state only if supported.
+	lSSE := layout{stack: l.stack, sp: "SP"}
+	for i := 0; i < 8; i++ {
+		lSSE.add("MOVUPS", fmt.Sprintf("X%d", i), 16)
+	}
+
+	p("ADJSP $%d", lSSE.stack)
+	p("NOP SP")
+	l.save()
+	p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse")
+	lSSE.save()
+	label("nosse:")
+	p("CALL ·asyncPreempt2(SB)")
+	p("CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1\nJNE nosse2")
+	lSSE.restore()
+	label("nosse2:")
+	l.restore()
+	p("ADJSP $%d", -lSSE.stack)
+
+	p("POPFL")
+	p("RET")
+}
+
+func genAMD64() {
+	// Assign stack offsets.
+	var l = layout{sp: "SP"}
+	for _, reg := range regNamesAMD64 {
+		if reg == "SP" || reg == "BP" {
+			continue
+		}
+		if strings.HasPrefix(reg, "X") {
+			l.add("MOVUPS", reg, 16)
+		} else {
+			l.add("MOVQ", reg, 8)
+		}
+	}
+
+	// TODO: MXCSR register?
+
+	p("PUSHQ BP")
+	p("MOVQ SP, BP")
+	p("// Save flags before clobbering them")
+	p("PUSHFQ")
+	p("// obj doesn't understand ADD/SUB on SP, but does understand ADJSP")
+	p("ADJSP $%d", l.stack)
+	p("// But vet doesn't know ADJSP, so suppress vet stack checking")
+	p("NOP SP")
+
+	// Apparently, the signal handling code path in the darwin kernel leaves
+	// the upper bits of Y registers in a dirty state, which causes
+	// many SSE operations (128-bit and narrower) to become much slower.
+	// Clear the upper bits to get to a clean state. See issue #37174.
+	// It is safe here as Go code doesn't use the upper bits of Y registers.
+	p("#ifdef GOOS_darwin")
+	p("CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0")
+	p("JE 2(PC)")
+	p("VZEROUPPER")
+	p("#endif")
+
+	l.save()
+	p("CALL ·asyncPreempt2(SB)")
+	l.restore()
+	p("ADJSP $%d", -l.stack)
+	p("POPFQ")
+	p("POPQ BP")
+	p("RET")
+}
+
+func genARM() {
+	// Add integer registers R0-R12.
+	// R13 (SP), R14 (LR), R15 (PC) are special and not saved here.
+	var l = layout{sp: "R13", stack: 4} // add LR slot
+	for i := 0; i <= 12; i++ {
+		reg := fmt.Sprintf("R%d", i)
+		if i == 10 {
+			continue // R10 is g register, no need to save/restore
+		}
+		l.add("MOVW", reg, 4)
+	}
+	// Add flag register.
+	l.addSpecial(
+		"MOVW CPSR, R0\nMOVW R0, %d(R13)",
+		"MOVW %d(R13), R0\nMOVW R0, CPSR",
+		4)
+
+	// Add floating point registers F0-F15 and flag register.
+	var lfp = layout{stack: l.stack, sp: "R13"}
+	lfp.addSpecial(
+		"MOVW FPCR, R0\nMOVW R0, %d(R13)",
+		"MOVW %d(R13), R0\nMOVW R0, FPCR",
+		4)
+	for i := 0; i <= 15; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		lfp.add("MOVD", reg, 8)
+	}
+
+	p("MOVW.W R14, -%d(R13)", lfp.stack) // allocate frame, save LR
+	l.save()
+	p("MOVB ·goarm(SB), R0\nCMP $6, R0\nBLT nofp") // test goarm, and skip FP registers if goarm=5.
+	lfp.save()
+	label("nofp:")
+	p("CALL ·asyncPreempt2(SB)")
+	p("MOVB ·goarm(SB), R0\nCMP $6, R0\nBLT nofp2") // test goarm, and skip FP registers if goarm=5.
+	lfp.restore()
+	label("nofp2:")
+	l.restore()
+
+	p("MOVW %d(R13), R14", lfp.stack)     // sigctxt.pushCall pushes LR on stack, restore it
+	p("MOVW.P %d(R13), R15", lfp.stack+4) // load PC, pop frame (including the space pushed by sigctxt.pushCall)
+	p("UNDEF")                            // shouldn't get here
+}
+
+func genARM64() {
+	// Add integer registers R0-R26
+	// R27 (REGTMP), R28 (g), R29 (FP), R30 (LR), R31 (SP) are special
+	// and not saved here.
+	var l = layout{sp: "RSP", stack: 8} // add slot to save PC of interrupted instruction
+	for i := 0; i <= 26; i++ {
+		if i == 18 {
+			continue // R18 is not used, skip
+		}
+		reg := fmt.Sprintf("R%d", i)
+		l.add("MOVD", reg, 8)
+	}
+	// Add flag registers.
+	l.addSpecial(
+		"MOVD NZCV, R0\nMOVD R0, %d(RSP)",
+		"MOVD %d(RSP), R0\nMOVD R0, NZCV",
+		8)
+	l.addSpecial(
+		"MOVD FPSR, R0\nMOVD R0, %d(RSP)",
+		"MOVD %d(RSP), R0\nMOVD R0, FPSR",
+		8)
+	// TODO: FPCR? I don't think we'll change it, so no need to save.
+	// Add floating point registers F0-F31.
+	for i := 0; i <= 31; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		l.add("FMOVD", reg, 8)
+	}
+	if l.stack%16 != 0 {
+		l.stack += 8 // SP needs 16-byte alignment
+	}
+
+	// allocate frame, save PC of interrupted instruction (in LR)
+	p("MOVD R30, %d(RSP)", -l.stack)
+	p("SUB $%d, RSP", l.stack)
+	p("#ifdef GOOS_linux")
+	p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux)
+	p("SUB $8, RSP, R29")  // set up new frame pointer
+	p("#endif")
+	// On darwin, save the LR again after decrementing SP. We run the
+	// signal handler on the G stack (as it doesn't support SA_ONSTACK),
+	// so any writes below SP may be clobbered.
+	p("#ifdef GOOS_darwin")
+	p("MOVD R30, (RSP)")
+	p("#endif")
+
+	l.save()
+	p("CALL ·asyncPreempt2(SB)")
+	l.restore()
+
+	p("MOVD %d(RSP), R30", l.stack) // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
+	p("#ifdef GOOS_linux")
+	p("MOVD -8(RSP), R29") // restore frame pointer
+	p("#endif")
+	p("MOVD (RSP), R27")          // load PC to REGTMP
+	p("ADD $%d, RSP", l.stack+16) // pop frame (including the space pushed by sigctxt.pushCall)
+	p("JMP (R27)")
+}
+
+func genMIPS(_64bit bool) {
+	mov := "MOVW"
+	movf := "MOVF"
+	add := "ADD"
+	sub := "SUB"
+	r28 := "R28"
+	regsize := 4
+	softfloat := "GOMIPS_softfloat"
+	if _64bit {
+		mov = "MOVV"
+		movf = "MOVD"
+		add = "ADDV"
+		sub = "SUBV"
+		r28 = "RSB"
+		regsize = 8
+		softfloat = "GOMIPS64_softfloat"
+	}
+
+	// Add integer registers R1-R22, R24-R25, R28
+	// R0 (zero), R23 (REGTMP), R29 (SP), R30 (g), R31 (LR) are special,
+	// and not saved here. R26 and R27 are reserved by the kernel and not used.
+	var l = layout{sp: "R29", stack: regsize} // add slot to save PC of interrupted instruction (in LR)
+	for i := 1; i <= 25; i++ {
+		if i == 23 {
+			continue // R23 is REGTMP
+		}
+		reg := fmt.Sprintf("R%d", i)
+		l.add(mov, reg, regsize)
+	}
+	l.add(mov, r28, regsize)
+	l.addSpecial(
+		mov+" HI, R1\n"+mov+" R1, %d(R29)",
+		mov+" %d(R29), R1\n"+mov+" R1, HI",
+		regsize)
+	l.addSpecial(
+		mov+" LO, R1\n"+mov+" R1, %d(R29)",
+		mov+" %d(R29), R1\n"+mov+" R1, LO",
+		regsize)
+
+	// Add floating point control/status register FCR31 (FCR0-FCR30 are irrelevant)
+	var lfp = layout{sp: "R29", stack: l.stack}
+	lfp.addSpecial(
+		mov+" FCR31, R1\n"+mov+" R1, %d(R29)",
+		mov+" %d(R29), R1\n"+mov+" R1, FCR31",
+		regsize)
+	// Add floating point registers F0-F31.
+	for i := 0; i <= 31; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		lfp.add(movf, reg, regsize)
+	}
+
+	// allocate frame, save PC of interrupted instruction (in LR)
+	p(mov+" R31, -%d(R29)", lfp.stack)
+	p(sub+" $%d, R29", lfp.stack)
+
+	l.save()
+	p("#ifndef %s", softfloat)
+	lfp.save()
+	p("#endif")
+	p("CALL ·asyncPreempt2(SB)")
+	p("#ifndef %s", softfloat)
+	lfp.restore()
+	p("#endif")
+	l.restore()
+
+	p(mov+" %d(R29), R31", lfp.stack)     // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
+	p(mov + " (R29), R23")                // load PC to REGTMP
+	p(add+" $%d, R29", lfp.stack+regsize) // pop frame (including the space pushed by sigctxt.pushCall)
+	p("JMP (R23)")
+}
+
+func genPPC64() {
+	// Add integer registers R3-R29
+	// R0 (zero), R1 (SP), R30 (g) are special and not saved here.
+	// R2 (TOC pointer in PIC mode), R12 (function entry address in PIC mode) have been saved in sigctxt.pushCall.
+	// R31 (REGTMP) will be saved manually.
+	var l = layout{sp: "R1", stack: 32 + 8} // MinFrameSize on PPC64, plus one word for saving R31
+	for i := 3; i <= 29; i++ {
+		if i == 12 || i == 13 {
+			// R12 has been saved in sigctxt.pushCall.
+			// R13 is the TLS pointer, not used by Go code. We must NOT
+			// restore it, otherwise if we parked and resumed on a
+			// different thread we'll mess up TLS addresses.
+			continue
+		}
+		reg := fmt.Sprintf("R%d", i)
+		l.add("MOVD", reg, 8)
+	}
+	l.addSpecial(
+		"MOVW CR, R31\nMOVW R31, %d(R1)",
+		"MOVW %d(R1), R31\nMOVFL R31, $0xff", // this is MOVW R31, CR
+		8)                                    // CR is 4-byte wide, but just keep the alignment
+	l.addSpecial(
+		"MOVD XER, R31\nMOVD R31, %d(R1)",
+		"MOVD %d(R1), R31\nMOVD R31, XER",
+		8)
+	// Add floating point registers F0-F31.
+	for i := 0; i <= 31; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		l.add("FMOVD", reg, 8)
+	}
+	// Add floating point control/status register FPSCR.
+	l.addSpecial(
+		"MOVFL FPSCR, F0\nFMOVD F0, %d(R1)",
+		"FMOVD %d(R1), F0\nMOVFL F0, FPSCR",
+		8)
+
+	p("MOVD R31, -%d(R1)", l.stack-32) // save R31 first, we'll use R31 for saving LR
+	p("MOVD LR, R31")
+	p("MOVDU R31, -%d(R1)", l.stack) // allocate frame, save PC of interrupted instruction (in LR)
+
+	l.save()
+	p("CALL ·asyncPreempt2(SB)")
+	l.restore()
+
+	p("MOVD %d(R1), R31", l.stack) // sigctxt.pushCall has pushed LR, R2, R12 (at interrupt) on stack, restore them
+	p("MOVD R31, LR")
+	p("MOVD %d(R1), R2", l.stack+8)
+	p("MOVD %d(R1), R12", l.stack+16)
+	p("MOVD (R1), R31") // load PC to CTR
+	p("MOVD R31, CTR")
+	p("MOVD 32(R1), R31")        // restore R31
+	p("ADD $%d, R1", l.stack+32) // pop frame (including the space pushed by sigctxt.pushCall)
+	p("JMP (CTR)")
+}
+
+func genRISCV64() {
+	// X0 (zero), X1 (LR), X2 (SP), X4 (g), X31 (TMP) are special.
+	var l = layout{sp: "X2", stack: 8}
+
+	// Add integer registers (X3, X5-X30).
+	for i := 3; i < 31; i++ {
+		if i == 4 {
+			continue
+		}
+		reg := fmt.Sprintf("X%d", i)
+		l.add("MOV", reg, 8)
+	}
+
+	// Add floating point registers (F0-F31).
+	for i := 0; i <= 31; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		l.add("MOVD", reg, 8)
+	}
+
+	p("MOV X1, -%d(X2)", l.stack)
+	p("ADD $-%d, X2", l.stack)
+	l.save()
+	p("CALL ·asyncPreempt2(SB)")
+	l.restore()
+	p("MOV %d(X2), X1", l.stack)
+	p("MOV (X2), X31")
+	p("ADD $%d, X2", l.stack+8)
+	p("JMP (X31)")
+}
+
+func genS390X() {
+	// Add integer registers R0-R12
+	// R13 (g), R14 (LR), R15 (SP) are special, and not saved here.
+	// Saving R10 (REGTMP) is not necessary, but it is saved anyway.
+	var l = layout{sp: "R15", stack: 16} // add slot to save PC of interrupted instruction and flags
+	l.addSpecial(
+		"STMG R0, R12, %d(R15)",
+		"LMG %d(R15), R0, R12",
+		13*8)
+	// Add floating point registers F0-F15.
+	for i := 0; i <= 15; i++ {
+		reg := fmt.Sprintf("F%d", i)
+		l.add("FMOVD", reg, 8)
+	}
+
+	// allocate frame, save PC of interrupted instruction (in LR) and flags (condition code)
+	p("IPM R10") // save flags upfront, as ADD will clobber flags
+	p("MOVD R14, -%d(R15)", l.stack)
+	p("ADD $-%d, R15", l.stack)
+	p("MOVW R10, 8(R15)") // save flags
+
+	l.save()
+	p("CALL ·asyncPreempt2(SB)")
+	l.restore()
+
+	p("MOVD %d(R15), R14", l.stack)    // sigctxt.pushCall has pushed LR (at interrupt) on stack, restore it
+	p("ADD $%d, R15", l.stack+8)       // pop frame (including the space pushed by sigctxt.pushCall)
+	p("MOVWZ -%d(R15), R10", l.stack)  // load flags to REGTMP
+	p("TMLH R10, $(3<<12)")            // restore flags
+	p("MOVD -%d(R15), R10", l.stack+8) // load PC to REGTMP
+	p("JMP (R10)")
+}
+
+func genWasm() {
+	p("// No async preemption on wasm")
+	p("UNDEF")
+}
+
+func notImplemented() {
+	p("// Not implemented yet")
+	p("JMP ·abort(SB)")
+}
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
index 2868f3f..9fe31cb 100644
--- a/src/runtime/mmap.go
+++ b/src/runtime/mmap.go
@@ -5,7 +5,6 @@
 // +build !plan9
 // +build !solaris
 // +build !windows
-// +build !nacl
 // +build !linux !amd64
 // +build !linux !arm64
 // +build !js
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
new file mode 100644
index 0000000..60f7f9f
--- /dev/null
+++ b/src/runtime/mpagealloc.go
@@ -0,0 +1,951 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page allocator.
+//
+// The page allocator manages mapped pages (defined by pageSize, NOT
+// physPageSize) for allocation and re-use. It is embedded into mheap.
+//
+// Pages are managed using a bitmap that is sharded into chunks.
+// In the bitmap, 1 means in-use, and 0 means free. The bitmap spans the
+// process's address space. Chunks are managed in a sparse-array-style structure
+// similar to mheap.arenas, since the bitmap may be large on some systems.
+//
+// The bitmap is efficiently searched by using a radix tree in combination
+// with fast bit-wise intrinsics. Allocation is performed using an address-ordered
+// first-fit approach.
+//
+// Each entry in the radix tree is a summary that describes three properties of
+// a particular region of the address space: the number of contiguous free pages
+// at the start and end of the region it represents, and the maximum number of
+// contiguous free pages found anywhere in that region.
+//
+// Each level of the radix tree is stored as one contiguous array, which represents
+// a different granularity of subdivision of the process's address space. Thus, this
+// radix tree is actually implicit in these large arrays, as opposed to having explicit
+// dynamically-allocated pointer-based node structures. Naturally, these arrays may be
+// quite large for systems with large address spaces, so in these cases they are mapped
+// into memory as needed. The leaf summaries of the tree correspond to a bitmap chunk.
+//
+// The root level (referred to as L0 and index 0 in pageAlloc.summary) has each
+// summary represent the largest section of address space (16 GiB on 64-bit systems),
+// with each subsequent level representing successively smaller subsections until we
+// reach the finest granularity at the leaves, a chunk.
+//
+// More specifically, each summary in each level (except for leaf summaries)
+// represents some number of entries in the following level. For example, each
+// summary in the root level may represent a 16 GiB region of address space,
+// and in the next level there could be 8 corresponding entries which represent 2
+// GiB subsections of that 16 GiB region, each of which could correspond to 8
+// entries in the next level which each represent 256 MiB regions, and so on.
+//
+// Thus, this design only scales to heaps of a bounded size, but it can always be extended to
+// larger heaps by simply adding levels to the radix tree, which mostly costs
+// additional virtual address space. The choice of managing large arrays also means
+// that a large amount of virtual address space may be reserved by the runtime.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+const (
+	// The size of a bitmap chunk, i.e. the amount of bits (that is, pages) to consider
+	// in the bitmap at once.
+	pallocChunkPages    = 1 << logPallocChunkPages
+	pallocChunkBytes    = pallocChunkPages * pageSize
+	logPallocChunkPages = 9
+	logPallocChunkBytes = logPallocChunkPages + pageShift
+
+	// The number of radix bits for each level.
+	//
+	// The value of 3 is chosen such that the block of summaries we need to scan at
+	// each level fits in 64 bytes (2^3 summaries * 8 bytes per summary), which is
+	// close to the L1 cache line width on many systems. Also, a value of 3 fits 4 tree
+	// levels perfectly into the 21-bit pallocBits summary field at the root level.
+	//
+	// The following equation explains how each of the constants relate:
+	// summaryL0Bits + (summaryLevels-1)*summaryLevelBits + logPallocChunkBytes = heapAddrBits
+	//
+	// summaryLevels is an architecture-dependent value defined in mpagealloc_*.go.
+	summaryLevelBits = 3
+	summaryL0Bits    = heapAddrBits - logPallocChunkBytes - (summaryLevels-1)*summaryLevelBits
+
+	// pallocChunksL2Bits is the number of bits of the chunk index number
+	// covered by the second level of the chunks map.
+	//
+	// See (*pageAlloc).chunks for more details. Update the documentation
+	// there should this change.
+	pallocChunksL2Bits  = heapAddrBits - logPallocChunkBytes - pallocChunksL1Bits
+	pallocChunksL1Shift = pallocChunksL2Bits
+)
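
Plugging concrete values into the equation above is a useful sanity check. The snippet below assumes heapAddrBits = 48 and summaryLevels = 5, the values used on typical 64-bit platforms (both are defined elsewhere in the runtime), and recovers the 16 GiB root-level granularity mentioned in the package comment:

package main

import "fmt"

func main() {
	const (
		heapAddrBits        = 48 // typical 64-bit value (assumed)
		summaryLevels       = 5  // 64-bit value from mpagealloc_64bit.go (assumed)
		summaryLevelBits    = 3
		logPallocChunkBytes = 9 + 13 // logPallocChunkPages + pageShift

		summaryL0Bits = heapAddrBits - logPallocChunkBytes - (summaryLevels-1)*summaryLevelBits
	)
	fmt.Println(summaryL0Bits)                               // 14 bits of root-level index
	fmt.Println((1 << (heapAddrBits - summaryL0Bits)) >> 30) // 16: GiB covered per root summary
}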
+
+// Maximum searchAddr value, which indicates that the heap has no free space.
+//
+// We alias maxOffAddr just to make it clear that this is the maximum address
+// for the page allocator's search space. See maxOffAddr for details.
+var maxSearchAddr = maxOffAddr
+
+// Global chunk index.
+//
+// Represents an index into the leaf level of the radix tree.
+// Similar to arenaIndex, except instead of arenas, it divides the address
+// space into chunks.
+type chunkIdx uint
+
+// chunkIndex returns the global index of the palloc chunk containing the
+// pointer p.
+func chunkIndex(p uintptr) chunkIdx {
+	return chunkIdx((p - arenaBaseOffset) / pallocChunkBytes)
+}
+
+// chunkBase returns the base address of the palloc chunk at index ci.
+func chunkBase(ci chunkIdx) uintptr {
+	return uintptr(ci)*pallocChunkBytes + arenaBaseOffset
+}
+
+// chunkPageIndex computes the index of the page that contains p,
+// relative to the chunk which contains p.
+func chunkPageIndex(p uintptr) uint {
+	return uint(p % pallocChunkBytes / pageSize)
+}
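
A quick worked example of the chunk arithmetic above, with arenaBaseOffset taken as 0 and the 8 KiB page size assumed (illustrative only):

package main

import "fmt"

func main() {
	const (
		pageSize         = 8192
		pallocChunkBytes = 512 * pageSize // 4 MiB per bitmap chunk
	)
	// An address a little more than 37 pages into chunk 10.
	p := uintptr(10*pallocChunkBytes + 37*pageSize + 100)

	ci := p / pallocChunkBytes            // chunkIndex
	base := ci * pallocChunkBytes         // chunkBase
	pi := p % pallocChunkBytes / pageSize // chunkPageIndex

	fmt.Println(ci, pi, base == 10*pallocChunkBytes) // 10 37 true
}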
+
+// l1 returns the index into the first level of (*pageAlloc).chunks.
+func (i chunkIdx) l1() uint {
+	if pallocChunksL1Bits == 0 {
+		// Let the compiler optimize this away if there's no
+		// L1 map.
+		return 0
+	} else {
+		return uint(i) >> pallocChunksL1Shift
+	}
+}
+
+// l2 returns the index into the second level of (*pageAlloc).chunks.
+func (i chunkIdx) l2() uint {
+	if pallocChunksL1Bits == 0 {
+		return uint(i)
+	} else {
+		return uint(i) & (1<<pallocChunksL2Bits - 1)
+	}
+}
+
+// offAddrToLevelIndex converts an address in the offset address space
+// to the index into summary[level] containing addr.
+func offAddrToLevelIndex(level int, addr offAddr) int {
+	return int((addr.a - arenaBaseOffset) >> levelShift[level])
+}
+
+// levelIndexToOffAddr converts an index into summary[level] into
+// the corresponding address in the offset address space.
+func levelIndexToOffAddr(level, idx int) offAddr {
+	return offAddr{(uintptr(idx) << levelShift[level]) + arenaBaseOffset}
+}
+
+// addrsToSummaryRange converts base and limit pointers into a range
+// of entries for the given summary level.
+//
+// The returned range is inclusive on the lower bound and exclusive on
+// the upper bound.
+func addrsToSummaryRange(level int, base, limit uintptr) (lo int, hi int) {
+	// This is slightly more nuanced than just a shift for the exclusive
+	// upper-bound. Note that the exclusive upper bound may be within a
+	// summary at this level, meaning if we just do the obvious computation
+	// hi will end up being an inclusive upper bound. Unfortunately, just
+	// adding 1 to that is too broad since we might be on the very edge of
+	// a summary's max page count boundary for this level
+	// (1 << levelLogPages[level]). So, make limit an inclusive upper bound
+	// then shift, then add 1, so we get an exclusive upper bound at the end.
+	lo = int((base - arenaBaseOffset) >> levelShift[level])
+	hi = int(((limit-1)-arenaBaseOffset)>>levelShift[level]) + 1
+	return
+}
+
+// blockAlignSummaryRange aligns indices into the given level to that
+// level's block width (1 << levelBits[level]). It assumes lo is inclusive
+// and hi is exclusive, and so aligns them down and up respectively.
+func blockAlignSummaryRange(level int, lo, hi int) (int, int) {
+	e := uintptr(1) << levelBits[level]
+	return int(alignDown(uintptr(lo), e)), int(alignUp(uintptr(hi), e))
+}
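+
+// For example, with levelBits[level] = 3 the block width is 8, so
+// blockAlignSummaryRange(level, 5, 13) returns (0, 16).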
+
+type pageAlloc struct {
+	// Radix tree of summaries.
+	//
+	// Each slice's cap represents the whole memory reservation.
+	// Each slice's len reflects the allocator's maximum known
+	// mapped heap address for that level.
+	//
+	// The backing store of each summary level is reserved in init
+	// and may or may not be committed in grow (small address spaces
+	// may commit all the memory in init).
+	//
+	// The purpose of keeping len <= cap is to enforce bounds checks
+	// on the top end of the slice so that instead of an unknown
+	// runtime segmentation fault, we get a much friendlier out-of-bounds
+	// error.
+	//
+	// To iterate over a summary level, use inUse to determine which ranges
+	// are currently available. Otherwise one might try to access
+	// memory which is only Reserved, which may result in a hard fault.
+	//
+	// We may still get segmentation faults < len since some of that
+	// memory may not be committed yet.
+	summary [summaryLevels][]pallocSum
+
+	// chunks is a slice of bitmap chunks.
+	//
+	// The total size of chunks is quite large on most 64-bit platforms
+	// (O(GiB) or more) if flattened, so rather than making one large mapping
+	// (which has problems on some platforms, even when PROT_NONE) we use a
+	// two-level sparse array approach similar to the arena index in mheap.
+	//
+	// To find the chunk containing a memory address `a`, do:
+	//   chunkOf(chunkIndex(a))
+	//
+	// Below is a table describing the configuration for chunks for various
+	// heapAddrBits supported by the runtime.
+	//
+	// heapAddrBits | L1 Bits | L2 Bits | L2 Entry Size
+	// ------------------------------------------------
+	// 32           | 0       | 10      | 128 KiB
+	// 33 (iOS)     | 0       | 11      | 256 KiB
+	// 48           | 13      | 13      | 1 MiB
+	//
+	// There's no reason to use the L1 part of chunks on 32-bit: the
+	// address space is small, so the L2 alone stays small. For platforms
+	// with a 48-bit address space, we pick the L1 such that the L2 is
+	// 1 MiB in size, which is a good balance between fine granularity
+	// and keeping the impact on BSS low (note the L1 is stored directly
+	// in pageAlloc).
+	//
+	// To iterate over the bitmap, use inUse to determine which ranges
+	// are currently available. Otherwise one might iterate over unused
+	// ranges.
+	//
+	// TODO(mknyszek): Consider changing the definition of the bitmap
+	// such that 1 means free and 0 means in-use so that summaries and
+	// the bitmaps align better on zero-values.
+	chunks [1 << pallocChunksL1Bits]*[1 << pallocChunksL2Bits]pallocData
+
+	// The address to start an allocation search with. It must never
+	// point to any memory that is not contained in inUse, i.e.
+	// inUse.contains(searchAddr.addr()) must always be true.
+	//
+	// In the offset address space (that is, once arenaBaseOffset has
+	// been added to both this value and the heap addresses compared
+	// against it), all valid heap addresses below this value are
+	// guaranteed to be allocated and not worth searching.
+	//
+	// Note that adding in arenaBaseOffset transforms addresses
+	// to a new address space with a linear view of the full address
+	// space on architectures with segmented address spaces.
+	searchAddr offAddr
+
+	// start and end represent the chunk indices
+	// which pageAlloc knows about. It assumes
+	// chunks in the range [start, end) are
+	// currently ready to use.
+	start, end chunkIdx
+
+	// inUse is a slice of ranges of address space which are
+	// known by the page allocator to be currently in-use (passed
+	// to grow).
+	//
+	// This field is currently unused on 32-bit architectures but
+	// is harmless to track. We care much more about having a
+	// contiguous heap in these cases and take additional measures
+	// to ensure that, so in nearly all cases this should have just
+	// 1 element.
+	//
+	// All access is protected by the mheapLock.
+	inUse addrRanges
+
+	// scav stores the scavenger state.
+	//
+	// All fields are protected by mheapLock.
+	scav struct {
+		// inUse is a slice of ranges of address space which have not
+		// yet been looked at by the scavenger.
+		inUse addrRanges
+
+		// gen is the scavenge generation number.
+		gen uint32
+
+		// reservationBytes is how large of a reservation should be made
+		// in bytes of address space for each scavenge iteration.
+		reservationBytes uintptr
+
+		// released is the amount of memory released this generation.
+		released uintptr
+
+		// scavLWM is the lowest (offset) address that the scavenger reached this
+		// scavenge generation.
+		scavLWM offAddr
+
+		// freeHWM is the highest (offset) address of a page that was freed to
+		// the page allocator this scavenge generation.
+		freeHWM offAddr
+	}
+
+	// mheap_.lock. This level of indirection makes it possible
+	// to test pageAlloc independently of the runtime allocator.
+	mheapLock *mutex
+
+	// sysStat is the runtime memstat to update when new system
+	// memory is committed by the pageAlloc for allocation metadata.
+	sysStat *uint64
+
+	// Whether or not this struct is being used in tests.
+	test bool
+}
+
+func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
+	if levelLogPages[0] > logMaxPackedValue {
+		// We can't represent 1<<levelLogPages[0] pages, the maximum number
+		// of pages we need to represent at the root level, in a summary, which
+		// is a big problem. Throw.
+		print("runtime: root level max pages = ", 1<<levelLogPages[0], "\n")
+		print("runtime: summary max pages = ", maxPackedValue, "\n")
+		throw("root level max pages doesn't fit in summary")
+	}
+	s.sysStat = sysStat
+
+	// Initialize s.inUse.
+	s.inUse.init(sysStat)
+
+	// System-dependent initialization.
+	s.sysInit()
+
+	// Start with the searchAddr in a state indicating there's no free memory.
+	s.searchAddr = maxSearchAddr
+
+	// Set the mheapLock.
+	s.mheapLock = mheapLock
+
+	// Initialize scavenge tracking state.
+	s.scav.scavLWM = maxSearchAddr
+}
+
+// chunkOf returns the chunk at the given chunk index.
+func (s *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
+	return &s.chunks[ci.l1()][ci.l2()]
+}
+
+// grow sets up the metadata for the address range [base, base+size).
+// It may allocate metadata, in which case *s.sysStat will be updated.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) grow(base, size uintptr) {
+	// Round up to chunks, since we can't deal with increments smaller
+	// than chunks. Also, sysGrow expects aligned values.
+	limit := alignUp(base+size, pallocChunkBytes)
+	base = alignDown(base, pallocChunkBytes)
+
+	// Grow the summary levels in a system-dependent manner.
+	// We just update a bunch of additional metadata here.
+	s.sysGrow(base, limit)
+
+	// Update s.start and s.end.
+	// If no growth happened yet, start == 0. This is generally
+	// safe since the zero page is unmapped.
+	firstGrowth := s.start == 0
+	start, end := chunkIndex(base), chunkIndex(limit)
+	if firstGrowth || start < s.start {
+		s.start = start
+	}
+	if end > s.end {
+		s.end = end
+	}
+	// Note that [base, limit) will never overlap with any existing
+	// range inUse because grow only ever adds never-used memory
+	// regions to the page allocator.
+	s.inUse.add(makeAddrRange(base, limit))
+
+	// A grow operation is a lot like a free operation, so if our
+	// chunk ends up below s.searchAddr, update s.searchAddr to the
+	// new address, just like in free.
+	if b := (offAddr{base}); b.lessThan(s.searchAddr) {
+		s.searchAddr = b
+	}
+
+	// Add entries into chunks, which is sparse, if needed. Then,
+	// initialize the bitmap.
+	//
+	// Newly-grown memory is always considered scavenged.
+	// Set all the bits in the scavenged bitmaps high.
+	for c := chunkIndex(base); c < chunkIndex(limit); c++ {
+		if s.chunks[c.l1()] == nil {
+			// Create the necessary l2 entry.
+			//
+			// Store it atomically to avoid races with readers which
+			// don't acquire the heap lock.
+			r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat)
+			atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r)
+		}
+		s.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
+	}
+
+	// Update summaries accordingly. The grow acts like a free, so
+	// we need to ensure this newly-free memory is visible in the
+	// summaries.
+	s.update(base, size/pageSize, true, false)
+}
+
+// update updates heap metadata. It must be called each time the bitmap
+// is updated.
+//
+// If contig is true, update does some optimizations assuming that there was
+// a contiguous allocation or free between addr and addr+npages. alloc indicates
+// whether the operation performed was an allocation or a free.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
+	// base, limit, start, and end are inclusive.
+	limit := base + npages*pageSize - 1
+	sc, ec := chunkIndex(base), chunkIndex(limit)
+
+	// Handle updating the lowest level first.
+	if sc == ec {
+		// Fast path: the allocation doesn't span more than one chunk,
+		// so update this one and if the summary didn't change, return.
+		x := s.summary[len(s.summary)-1][sc]
+		y := s.chunkOf(sc).summarize()
+		if x == y {
+			return
+		}
+		s.summary[len(s.summary)-1][sc] = y
+	} else if contig {
+		// Slow contiguous path: the allocation spans more than one chunk
+		// and at least one summary is guaranteed to change.
+		summary := s.summary[len(s.summary)-1]
+
+		// Update the summary for chunk sc.
+		summary[sc] = s.chunkOf(sc).summarize()
+
+		// Update the summaries for chunks in between, which are
+		// either totally allocated or freed.
+		whole := s.summary[len(s.summary)-1][sc+1 : ec]
+		if alloc {
+			// Should optimize into a memclr.
+			for i := range whole {
+				whole[i] = 0
+			}
+		} else {
+			for i := range whole {
+				whole[i] = freeChunkSum
+			}
+		}
+
+		// Update the summary for chunk ec.
+		summary[ec] = s.chunkOf(ec).summarize()
+	} else {
+		// Slow general path: the allocation spans more than one chunk
+		// and at least one summary is guaranteed to change.
+		//
+		// We can't assume a contiguous allocation happened, so walk over
+		// every chunk in the range and manually recompute the summary.
+		summary := s.summary[len(s.summary)-1]
+		for c := sc; c <= ec; c++ {
+			summary[c] = s.chunkOf(c).summarize()
+		}
+	}
+
+	// Walk up the radix tree and update the summaries appropriately.
+	changed := true
+	for l := len(s.summary) - 2; l >= 0 && changed; l-- {
+		// Update summaries at level l from summaries at level l+1.
+		changed = false
+
+		// "Constants" for the previous level which we
+		// need to compute the summary from that level.
+		logEntriesPerBlock := levelBits[l+1]
+		logMaxPages := levelLogPages[l+1]
+
+		// lo and hi describe all the parts of the level we need to look at.
+		lo, hi := addrsToSummaryRange(l, base, limit+1)
+
+		// Iterate over each block, updating the corresponding summary in the less-granular level.
+		for i := lo; i < hi; i++ {
+			children := s.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock]
+			sum := mergeSummaries(children, logMaxPages)
+			old := s.summary[l][i]
+			if old != sum {
+				changed = true
+				s.summary[l][i] = sum
+			}
+		}
+	}
+}
+
+// allocRange marks the range of memory [base, base+npages*pageSize) as
+// allocated. It also updates the summaries to reflect the newly-updated
+// bitmap.
+//
+// Returns the amount of scavenged memory in bytes present in the
+// allocated range.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
+	limit := base + npages*pageSize - 1
+	sc, ec := chunkIndex(base), chunkIndex(limit)
+	si, ei := chunkPageIndex(base), chunkPageIndex(limit)
+
+	scav := uint(0)
+	if sc == ec {
+		// The range doesn't cross any chunk boundaries.
+		chunk := s.chunkOf(sc)
+		scav += chunk.scavenged.popcntRange(si, ei+1-si)
+		chunk.allocRange(si, ei+1-si)
+	} else {
+		// The range crosses at least one chunk boundary.
+		chunk := s.chunkOf(sc)
+		scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si)
+		chunk.allocRange(si, pallocChunkPages-si)
+		for c := sc + 1; c < ec; c++ {
+			chunk := s.chunkOf(c)
+			scav += chunk.scavenged.popcntRange(0, pallocChunkPages)
+			chunk.allocAll()
+		}
+		chunk = s.chunkOf(ec)
+		scav += chunk.scavenged.popcntRange(0, ei+1)
+		chunk.allocRange(0, ei+1)
+	}
+	s.update(base, npages, true, true)
+	return uintptr(scav) * pageSize
+}
+
+// find searches for the first (address-ordered) contiguous free region of
+// npages in size and returns a base address for that region.
+//
+// It uses s.searchAddr to prune its search and assumes that no palloc chunks
+// below chunkIndex(s.searchAddr) contain any free memory at all.
+//
+// find also computes and returns a candidate s.searchAddr, which may or
+// may not prune more of the address space than s.searchAddr already does.
+//
+// find represents the slow path and the full radix tree search.
+//
+// Returns a base address of 0 on failure, in which case the candidate
+// searchAddr returned is invalid and must be ignored.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
+	// Search algorithm.
+	//
+	// This algorithm walks each level l of the radix tree from the root level
+	// to the leaf level. It iterates over at most 1 << levelBits[l] entries
+	// in a given level in the radix tree, and uses the summary information to
+	// find either:
+	//  1) That a given subtree contains a large enough contiguous region, at
+	//     which point it continues iterating on the next level, or
+	//  2) That there are enough contiguous boundary-crossing bits to satisfy
+	//     the allocation, at which point it knows exactly where to start
+	//     allocating from.
+	//
+	// i tracks the index into the current level l's structure for the
+	// contiguous 1 << levelBits[l] entries we're actually interested in.
+	//
+	// NOTE: Technically this search could allocate a region which crosses
+	// the arenaBaseOffset boundary, which when arenaBaseOffset != 0, is
+	// a discontinuity. However, the only way this could happen is if the
+	// page at the zero address is mapped, and this is impossible on
+	// every system we support where arenaBaseOffset != 0. So, the
+	// discontinuity is already encoded in the fact that the OS will never
+	// map the zero page for us, and this function doesn't try to handle
+	// this case in any way.
+
+	// i is the beginning of the block of entries we're searching at the
+	// current level.
+	i := 0
+
+	// firstFree is the region of address space in which we are certain to
+	// find the first free page in the heap. base and bound are the inclusive
+	// bounds of this window, and both are addresses in the linearized, contiguous
+	// view of the address space (with arenaBaseOffset pre-added). At each level,
+	// this window is narrowed as we find the memory region containing the
+	// first free page of memory. To begin with, the range reflects the
+	// full process address space.
+	//
+	// firstFree is updated by calling foundFree each time free space in the
+	// heap is discovered.
+	//
+	// At the end of the search, base.addr() is the best new
+	// searchAddr we could deduce in this search.
+	firstFree := struct {
+		base, bound offAddr
+	}{
+		base:  minOffAddr,
+		bound: maxOffAddr,
+	}
+	// foundFree takes the given address range [addr, addr+size) and
+	// updates firstFree if it is a narrower range. The input range must
+	// either be fully contained within firstFree or not overlap with it
+	// at all.
+	//
+	// This way, we'll record the first summary we find with any free
+	// pages on the root level and narrow that down if we descend into
+	// that summary. But as soon as we need to iterate beyond that summary
+	// in a level to find a large enough range, we'll stop narrowing.
+	foundFree := func(addr offAddr, size uintptr) {
+		if firstFree.base.lessEqual(addr) && addr.add(size-1).lessEqual(firstFree.bound) {
+			// This range fits within the current firstFree window, so narrow
+			// down the firstFree window to the base and bound of this range.
+			firstFree.base = addr
+			firstFree.bound = addr.add(size - 1)
+		} else if !(addr.add(size-1).lessThan(firstFree.base) || firstFree.bound.lessThan(addr)) {
+			// This range only partially overlaps with the firstFree range,
+			// so throw.
+			print("runtime: addr = ", hex(addr.addr()), ", size = ", size, "\n")
+			print("runtime: base = ", hex(firstFree.base.addr()), ", bound = ", hex(firstFree.bound.addr()), "\n")
+			throw("range partially overlaps")
+		}
+	}
+
+	// lastSum is the summary which we saw on the previous level that made us
+	// move on to the next level. Used to print additional information in the
+	// case of a catastrophic failure.
+	// lastSumIdx is that summary's index in the previous level.
+	lastSum := packPallocSum(0, 0, 0)
+	lastSumIdx := -1
+
+nextLevel:
+	for l := 0; l < len(s.summary); l++ {
+		// For the root level, entriesPerBlock is the whole level.
+		entriesPerBlock := 1 << levelBits[l]
+		logMaxPages := levelLogPages[l]
+
+		// We've moved into a new level, so let's update i to our new
+		// starting index. This is a no-op for level 0.
+		i <<= levelBits[l]
+
+		// Slice out the block of entries we care about.
+		entries := s.summary[l][i : i+entriesPerBlock]
+
+		// Determine j0, the first index we should start iterating from.
+		// The searchAddr may help us eliminate iterations if we followed the
+	// searchAddr on the previous level or we're on the root level, in which
+		// case the searchAddr should be the same as i after levelShift.
+		j0 := 0
+		if searchIdx := offAddrToLevelIndex(l, s.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
+			j0 = searchIdx & (entriesPerBlock - 1)
+		}
+
+		// Run over the level entries looking for
+		// a contiguous run of at least npages either
+		// within an entry or across entries.
+		//
+		// base contains the page index (relative to
+		// the first entry's first page) of the currently
+		// considered run of consecutive pages.
+		//
+		// size contains the size of the currently considered
+		// run of consecutive pages.
+		var base, size uint
+		for j := j0; j < len(entries); j++ {
+			sum := entries[j]
+			if sum == 0 {
+				// A fully allocated entry (sum == 0) breaks any
+				// streak of free pages, so skip it altogether.
+				size = 0
+				continue
+			}
+
+			// We've encountered a non-zero summary which means
+			// free memory, so update firstFree.
+			foundFree(levelIndexToOffAddr(l, i+j), (uintptr(1)<<logMaxPages)*pageSize)
+
+			s := sum.start()
+			if size+s >= uint(npages) {
+				// If size == 0 we don't have a run yet,
+				// which means base isn't valid. So, set
+				// base to the first page in this block.
+				if size == 0 {
+					base = uint(j) << logMaxPages
+				}
+				// We hit npages; we're done!
+				size += s
+				break
+			}
+			if sum.max() >= uint(npages) {
+				// The entry itself contains npages contiguous
+				// free pages, so continue on the next level
+				// to find that run.
+				i += j
+				lastSumIdx = i
+				lastSum = sum
+				continue nextLevel
+			}
+			if size == 0 || s < 1<<logMaxPages {
+				// We either don't have a current run started, or this entry
+				// isn't totally free (meaning we can't continue the current
+				// one), so try to begin a new run by setting size and base
+				// based on sum.end.
+				size = sum.end()
+				base = uint(j+1)<<logMaxPages - size
+				continue
+			}
+			// The entry is completely free, so continue the run.
+			size += 1 << logMaxPages
+		}
+		if size >= uint(npages) {
+			// We found a sufficiently large run of free pages straddling
+			// some boundary, so compute the address and return it.
+			addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr()
+			return addr, firstFree.base
+		}
+		if l == 0 {
+			// We're at level zero, so that means we've exhausted our search.
+			return 0, maxSearchAddr
+		}
+
+		// We're not at level zero, and we exhausted the level we were looking in.
+		// This means that either our calculations were wrong or the level above
+		// lied to us. In either case, dump some useful state and throw.
+		print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n")
+		print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n")
+		print("runtime: s.searchAddr = ", hex(s.searchAddr.addr()), ", i = ", i, "\n")
+		print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n")
+		for j := 0; j < len(entries); j++ {
+			sum := entries[j]
+			print("runtime: summary[", l, "][", i+j, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
+		}
+		throw("bad summary data")
+	}
+
+	// Since we've gotten to this point, that means we haven't found a
+	// sufficiently-sized free region straddling some boundary (chunk or larger).
+	// This means the last summary we inspected must have had a large enough "max"
+	// value, so look inside the chunk to find a suitable run.
+	//
+	// After iterating over all levels, i must contain a chunk index which
+	// is what the final level represents.
+	ci := chunkIdx(i)
+	j, searchIdx := s.chunkOf(ci).find(npages, 0)
+	if j == ^uint(0) {
+		// We couldn't find any space in this chunk despite the summaries telling
+		// us it should be there. There's likely a bug, so dump some state and throw.
+		sum := s.summary[len(s.summary)-1][i]
+		print("runtime: summary[", len(s.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
+		print("runtime: npages = ", npages, "\n")
+		throw("bad summary data")
+	}
+
+	// Compute the address at which the free space starts.
+	addr := chunkBase(ci) + uintptr(j)*pageSize
+
+	// Since we actually searched the chunk, we may have
+	// found an even narrower free window.
+	searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize
+	foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr)
+	return addr, firstFree.base
+}
+
+// alloc allocates npages worth of memory from the page heap, returning the base
+// address for the allocation and the amount of scavenged memory in bytes
+// contained in the region [base address, base address + npages*pageSize).
+//
+// Returns a 0 base address on failure, in which case other returned values
+// should be ignored.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
+	// If the searchAddr refers to a region which has a higher address than
+	// any known chunk, then we know we're out of memory.
+	if chunkIndex(s.searchAddr.addr()) >= s.end {
+		return 0, 0
+	}
+
+	// If npages has a chance of fitting in the chunk where the searchAddr is,
+	// search it directly.
+	searchAddr := minOffAddr
+	if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) {
+		// npages is guaranteed to be no greater than pallocChunkPages here.
+		i := chunkIndex(s.searchAddr.addr())
+		if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) {
+			j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr()))
+			if j == ^uint(0) {
+				print("runtime: max = ", max, ", npages = ", npages, "\n")
+				print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n")
+				throw("bad summary data")
+			}
+			addr = chunkBase(i) + uintptr(j)*pageSize
+			searchAddr = offAddr{chunkBase(i) + uintptr(searchIdx)*pageSize}
+			goto Found
+		}
+	}
+	// We failed to use a searchAddr for one reason or another, so try
+	// the slow path.
+	addr, searchAddr = s.find(npages)
+	if addr == 0 {
+		if npages == 1 {
+			// We failed to find a single free page, the smallest unit
+			// of allocation. This means we know the heap is completely
+			// exhausted. Otherwise, the heap still might have free
+			// space in it, just not enough contiguous space to
+			// accommodate npages.
+			s.searchAddr = maxSearchAddr
+		}
+		return 0, 0
+	}
+Found:
+	// Go ahead and actually mark the bits now that we have an address.
+	scav = s.allocRange(addr, npages)
+
+	// If we found a higher searchAddr, we know that all the
+	// heap memory before that searchAddr in an offset address space is
+	// allocated, so bump s.searchAddr up to the new one.
+	if s.searchAddr.lessThan(searchAddr) {
+		s.searchAddr = searchAddr
+	}
+	return addr, scav
+}
+
+// free returns npages worth of memory starting at base back to the page heap.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) free(base, npages uintptr) {
+	// If we're freeing pages below the s.searchAddr, update searchAddr.
+	if b := (offAddr{base}); b.lessThan(s.searchAddr) {
+		s.searchAddr = b
+	}
+	// Update the free high watermark for the scavenger.
+	limit := base + npages*pageSize - 1
+	if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) {
+		s.scav.freeHWM = offLimit
+	}
+	if npages == 1 {
+		// Fast path: we're clearing a single bit, and we know exactly
+		// where it is, so mark it directly.
+		i := chunkIndex(base)
+		s.chunkOf(i).free1(chunkPageIndex(base))
+	} else {
+		// Slow path: we're clearing more bits so we may need to iterate.
+		sc, ec := chunkIndex(base), chunkIndex(limit)
+		si, ei := chunkPageIndex(base), chunkPageIndex(limit)
+
+		if sc == ec {
+			// The range doesn't cross any chunk boundaries.
+			s.chunkOf(sc).free(si, ei+1-si)
+		} else {
+			// The range crosses at least one chunk boundary.
+			s.chunkOf(sc).free(si, pallocChunkPages-si)
+			for c := sc + 1; c < ec; c++ {
+				s.chunkOf(c).freeAll()
+			}
+			s.chunkOf(ec).free(0, ei+1)
+		}
+	}
+	s.update(base, npages, true, false)
+}
+
+const (
+	pallocSumBytes = unsafe.Sizeof(pallocSum(0))
+
+	// maxPackedValue is the maximum value that any of the three fields in
+	// the pallocSum may take on.
+	maxPackedValue    = 1 << logMaxPackedValue
+	logMaxPackedValue = logPallocChunkPages + (summaryLevels-1)*summaryLevelBits
+
+	freeChunkSum = pallocSum(uint64(pallocChunkPages) |
+		uint64(pallocChunkPages<<logMaxPackedValue) |
+		uint64(pallocChunkPages<<(2*logMaxPackedValue)))
+)
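+
+// Note that freeChunkSum is equivalent to
+// packPallocSum(pallocChunkPages, pallocChunkPages, pallocChunkPages),
+// i.e. the summary of a chunk in which every page is free.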
+
+// pallocSum is a packed summary type which packs three numbers: start, max,
+// and end into a single 8-byte value. Each of these values are a summary of
+// a bitmap and are thus counts, each of which may have a maximum value of
+// 2^21 - 1, or all three may be equal to 2^21. The latter case is represented
+// by just setting the 64th bit.
+type pallocSum uint64
+
+// packPallocSum takes a start, max, and end value and produces a pallocSum.
+func packPallocSum(start, max, end uint) pallocSum {
+	if max == maxPackedValue {
+		return pallocSum(uint64(1 << 63))
+	}
+	return pallocSum((uint64(start) & (maxPackedValue - 1)) |
+		((uint64(max) & (maxPackedValue - 1)) << logMaxPackedValue) |
+		((uint64(end) & (maxPackedValue - 1)) << (2 * logMaxPackedValue)))
+}
+
+// start extracts the start value from a packed sum.
+func (p pallocSum) start() uint {
+	if uint64(p)&uint64(1<<63) != 0 {
+		return maxPackedValue
+	}
+	return uint(uint64(p) & (maxPackedValue - 1))
+}
+
+// max extracts the max value from a packed sum.
+func (p pallocSum) max() uint {
+	if uint64(p)&uint64(1<<63) != 0 {
+		return maxPackedValue
+	}
+	return uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1))
+}
+
+// end extracts the end value from a packed sum.
+func (p pallocSum) end() uint {
+	if uint64(p)&uint64(1<<63) != 0 {
+		return maxPackedValue
+	}
+	return uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1))
+}
+
+// unpack unpacks all three values from the summary.
+func (p pallocSum) unpack() (uint, uint, uint) {
+	if uint64(p)&uint64(1<<63) != 0 {
+		return maxPackedValue, maxPackedValue, maxPackedValue
+	}
+	return uint(uint64(p) & (maxPackedValue - 1)),
+		uint((uint64(p) >> logMaxPackedValue) & (maxPackedValue - 1)),
+		uint((uint64(p) >> (2 * logMaxPackedValue)) & (maxPackedValue - 1))
+}
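+
+// For example:
+//
+//	s := packPallocSum(3, 10, 5)
+//	s.start(), s.max(), s.end() // 3, 10, 5
+//
+// while packPallocSum(maxPackedValue, maxPackedValue, maxPackedValue) sets
+// only bit 63, so all three accessors report maxPackedValue.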
+
+// mergeSummaries merges consecutive summaries, each of which may represent
+// at most 1 << logMaxPagesPerSum pages, into one summary.
+func mergeSummaries(sums []pallocSum, logMaxPagesPerSum uint) pallocSum {
+	// Merge the summaries in sums into one.
+	//
+	// We do this by keeping a running summary representing the merged
+	// summaries of sums[:i] in start, max, and end.
+	start, max, end := sums[0].unpack()
+	for i := 1; i < len(sums); i++ {
+		// Merge in sums[i].
+		si, mi, ei := sums[i].unpack()
+
+		// Merge in sums[i].start only if the running summary is
+		// completely free, otherwise this summary's start
+		// plays no role in the combined sum.
+		if start == uint(i)<<logMaxPagesPerSum {
+			start += si
+		}
+
+		// Recompute the max value of the running sum by looking
+		// across the boundary between the running sum and sums[i]
+		// and at the max sums[i], taking the greatest of those two
+		// and the max of the running sum.
+		if end+si > max {
+			max = end + si
+		}
+		if mi > max {
+			max = mi
+		}
+
+		// Merge in end by checking if this new summary is totally
+		// free. If it is, then we want to extend the running sum's
+		// end by the new summary. If not, then we have some alloc'd
+		// pages in there and we just want to take the end value in
+		// sums[i].
+		if ei == 1<<logMaxPagesPerSum {
+			end += 1 << logMaxPagesPerSum
+		} else {
+			end = ei
+		}
+	}
+	return packPallocSum(start, max, end)
+}
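+
+// As a worked example, take logMaxPagesPerSum = 9 (512 pages per summary):
+//
+//	a := packPallocSum(0, 3, 2) // 2 free pages at the end of a's region
+//	b := packPallocSum(4, 5, 1) // 4 free pages at the start of b's region
+//	mergeSummaries([]pallocSum{a, b}, 9) == packPallocSum(0, 6, 1)
+//
+// The runs of 2 and 4 free pages join across the boundary to form the
+// merged max of 6, while start and end come from a and b respectively.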
diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go
new file mode 100644
index 0000000..6658a90
--- /dev/null
+++ b/src/runtime/mpagealloc_32bit.go
@@ -0,0 +1,116 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 arm mips mipsle wasm darwin,arm64
+
+// wasm is treated as a 32-bit architecture for the purposes of the page
+// allocator, even though it has 64-bit pointers. This is because any wasm
+// pointer always has its top 32 bits as zero, so the effective heap address
+// space is only 2^32 bytes in size (see heapAddrBits).
+
+// darwin/arm64 is treated as a 32-bit architecture for the purposes of the
+// page allocator, even though it has 64-bit pointers and a 33-bit address
+// space (see heapAddrBits). The 33-bit address space cannot be rounded up
+// to 64 bits because there are too many summary levels to fit in just 33
+// bits.
+
+package runtime
+
+import "unsafe"
+
+const (
+	// The number of levels in the radix tree.
+	summaryLevels = 4
+
+	// Constants for testing.
+	pageAlloc32Bit = 1
+	pageAlloc64Bit = 0
+
+	// Number of bits needed to represent all indices into the L1 of the
+	// chunks map.
+	//
+	// See (*pageAlloc).chunks for more details. Update the documentation
+	// there should this number change.
+	pallocChunksL1Bits = 0
+)
+
+// See comment in mpagealloc_64bit.go.
+var levelBits = [summaryLevels]uint{
+	summaryL0Bits,
+	summaryLevelBits,
+	summaryLevelBits,
+	summaryLevelBits,
+}
+
+// See comment in mpagealloc_64bit.go.
+var levelShift = [summaryLevels]uint{
+	heapAddrBits - summaryL0Bits,
+	heapAddrBits - summaryL0Bits - 1*summaryLevelBits,
+	heapAddrBits - summaryL0Bits - 2*summaryLevelBits,
+	heapAddrBits - summaryL0Bits - 3*summaryLevelBits,
+}
+
+// See comment in mpagealloc_64bit.go.
+var levelLogPages = [summaryLevels]uint{
+	logPallocChunkPages + 3*summaryLevelBits,
+	logPallocChunkPages + 2*summaryLevelBits,
+	logPallocChunkPages + 1*summaryLevelBits,
+	logPallocChunkPages,
+}
+
+// See mpagealloc_64bit.go for details.
+func (s *pageAlloc) sysInit() {
+	// Calculate how much memory all our entries will take up.
+	//
+	// This should be around 12 KiB or less.
+	totalSize := uintptr(0)
+	for l := 0; l < summaryLevels; l++ {
+		totalSize += (uintptr(1) << (heapAddrBits - levelShift[l])) * pallocSumBytes
+	}
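+
+	// For example, assuming heapAddrBits = 32, logPallocChunkBytes = 22,
+	// summaryLevels = 4, and summaryLevelBits = 3, the per-level entry
+	// counts are 2, 16, 128, and 1024, so totalSize is 1170*8 = 9360
+	// bytes before rounding up to physPageSize.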
+	totalSize = alignUp(totalSize, physPageSize)
+
+	// Reserve memory for all levels in one go. There shouldn't be much for 32-bit.
+	reservation := sysReserve(nil, totalSize)
+	if reservation == nil {
+		throw("failed to reserve page summary memory")
+	}
+	// There isn't much. Just map it and mark it as used immediately.
+	sysMap(reservation, totalSize, s.sysStat)
+	sysUsed(reservation, totalSize)
+
+	// Iterate over the reservation and cut it up into slices.
+	//
+	// Maintain i as the byte offset from reservation where
+	// the new slice should start.
+	for l, shift := range levelShift {
+		entries := 1 << (heapAddrBits - shift)
+
+		// Put this reservation into a slice.
+		sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries}
+		s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+
+		reservation = add(reservation, uintptr(entries)*pallocSumBytes)
+	}
+}
+
+// See mpagealloc_64bit.go for details.
+func (s *pageAlloc) sysGrow(base, limit uintptr) {
+	if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
+		print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
+		throw("sysGrow bounds not aligned to pallocChunkBytes")
+	}
+
+	// Walk up the tree and update the summary slices.
+	for l := len(s.summary) - 1; l >= 0; l-- {
+		// Figure out what part of the summary array this new address space needs.
+		// Note that we need to align the ranges to the block width (1<<levelBits[l])
+		// at this level because the full block is needed to compute the summary for
+		// the next level.
+		lo, hi := addrsToSummaryRange(l, base, limit)
+		_, hi = blockAlignSummaryRange(l, lo, hi)
+		if hi > len(s.summary[l]) {
+			s.summary[l] = s.summary[l][:hi]
+		}
+	}
+}
diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go
new file mode 100644
index 0000000..831626e
--- /dev/null
+++ b/src/runtime/mpagealloc_64bit.go
@@ -0,0 +1,180 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
+
+// See mpagealloc_32bit.go for why darwin/arm64 is excluded here.
+
+package runtime
+
+import "unsafe"
+
+const (
+	// The number of levels in the radix tree.
+	summaryLevels = 5
+
+	// Constants for testing.
+	pageAlloc32Bit = 0
+	pageAlloc64Bit = 1
+
+	// Number of bits needed to represent all indices into the L1 of the
+	// chunks map.
+	//
+	// See (*pageAlloc).chunks for more details. Update the documentation
+	// there should this number change.
+	pallocChunksL1Bits = 13
+)
+
+// levelBits is the number of bits in the radix for a given level in the super summary
+// structure.
+//
+// The sum of all the entries of levelBits plus logPallocChunkBytes should
+// equal heapAddrBits.
+var levelBits = [summaryLevels]uint{
+	summaryL0Bits,
+	summaryLevelBits,
+	summaryLevelBits,
+	summaryLevelBits,
+	summaryLevelBits,
+}
+
+// levelShift is the number of bits to shift to acquire the radix for a given level
+// in the super summary structure.
+//
+// With levelShift, one can compute the index of the summary at level l related to a
+// pointer p by doing:
+//   p >> levelShift[l]
+var levelShift = [summaryLevels]uint{
+	heapAddrBits - summaryL0Bits,
+	heapAddrBits - summaryL0Bits - 1*summaryLevelBits,
+	heapAddrBits - summaryL0Bits - 2*summaryLevelBits,
+	heapAddrBits - summaryL0Bits - 3*summaryLevelBits,
+	heapAddrBits - summaryL0Bits - 4*summaryLevelBits,
+}
+
+// levelLogPages is log2 the maximum number of runtime pages in the address space
+// a summary in the given level represents.
+//
+// The leaf level always represents exactly log2 of 1 chunk's worth of pages.
+var levelLogPages = [summaryLevels]uint{
+	logPallocChunkPages + 4*summaryLevelBits,
+	logPallocChunkPages + 3*summaryLevelBits,
+	logPallocChunkPages + 2*summaryLevelBits,
+	logPallocChunkPages + 1*summaryLevelBits,
+	logPallocChunkPages,
+}
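+
+// For illustration, with heapAddrBits = 48, logPallocChunkBytes = 22,
+// logPallocChunkPages = 9, and 8 KiB pages, the tables above work out to:
+//
+//	levelBits     = [14, 3, 3, 3, 3]
+//	levelShift    = [34, 31, 28, 25, 22]
+//	levelLogPages = [21, 18, 15, 12, 9]
+//
+// so the root level holds 1<<14 summaries, each describing up to 1<<21
+// pages (16 GiB), and each leaf summary describes one 4 MiB chunk of
+// 512 pages.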
+
+// sysInit performs architecture-dependent initialization of fields
+// in pageAlloc. pageAlloc should be uninitialized except for sysStat
+// if any runtime statistic should be updated.
+func (s *pageAlloc) sysInit() {
+	// Reserve memory for each level. This will get mapped in
+	// as R/W by setArenas.
+	for l, shift := range levelShift {
+		entries := 1 << (heapAddrBits - shift)
+
+		// Reserve b bytes of memory anywhere in the address space.
+		b := alignUp(uintptr(entries)*pallocSumBytes, physPageSize)
+		r := sysReserve(nil, b)
+		if r == nil {
+			throw("failed to reserve page summary memory")
+		}
+
+		// Put this reservation into a slice.
+		sl := notInHeapSlice{(*notInHeap)(r), 0, entries}
+		s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+	}
+}
+
+// sysGrow performs architecture-dependent operations on heap
+// growth for the page allocator, such as mapping in new memory
+// for summaries. It also updates the length of the slices in
+// s.summary.
+//
+// base is the base of the newly-added heap memory and limit is
+// the first address past the end of the newly-added heap memory.
+// Both must be aligned to pallocChunkBytes.
+//
+// The caller must update s.start and s.end after calling sysGrow.
+func (s *pageAlloc) sysGrow(base, limit uintptr) {
+	if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
+		print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
+		throw("sysGrow bounds not aligned to pallocChunkBytes")
+	}
+
+	// addrRangeToSummaryRange converts a range of addresses into a range
+	// of summary indices which must be mapped to support those addresses
+	// in the summary range.
+	addrRangeToSummaryRange := func(level int, r addrRange) (int, int) {
+		sumIdxBase, sumIdxLimit := addrsToSummaryRange(level, r.base.addr(), r.limit.addr())
+		return blockAlignSummaryRange(level, sumIdxBase, sumIdxLimit)
+	}
+
+	// summaryRangeToSumAddrRange converts a range of indices in any
+	// level of s.summary into page-aligned addresses which cover that
+	// range of indices.
+	summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange {
+		baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize)
+		limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize)
+		base := unsafe.Pointer(&s.summary[level][0])
+		return addrRange{
+			offAddr{uintptr(add(base, baseOffset))},
+			offAddr{uintptr(add(base, limitOffset))},
+		}
+	}
+
+	// addrRangeToSumAddrRange is a convenience function that converts
+	// an address range r to the address range of the given summary level
+	// that stores the summaries for r.
+	addrRangeToSumAddrRange := func(level int, r addrRange) addrRange {
+		sumIdxBase, sumIdxLimit := addrRangeToSummaryRange(level, r)
+		return summaryRangeToSumAddrRange(level, sumIdxBase, sumIdxLimit)
+	}
+
+	// Find the first inUse index which is strictly greater than base.
+	//
+	// Because this function will never be asked to remap the same memory
+	// twice, this index is effectively the index at which we would insert
+	// this new growth, and base will never overlap/be contained within
+	// any existing range.
+	//
+	// This will be used to look at what memory in the summary array is already
+	// mapped before and after this new range.
+	inUseIndex := s.inUse.findSucc(base)
+
+	// Walk up the radix tree and map summaries in as needed.
+	for l := range s.summary {
+		// Figure out what part of the summary array this new address space needs.
+		needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit))
+
+		// Update the summary slices with a new upper-bound. This ensures
+		// we get tight bounds checks on at least the top bound.
+		//
+		// We must do this regardless of whether we map new memory.
+		if needIdxLimit > len(s.summary[l]) {
+			s.summary[l] = s.summary[l][:needIdxLimit]
+		}
+
+		// Compute the needed address range in the summary array for level l.
+		need := summaryRangeToSumAddrRange(l, needIdxBase, needIdxLimit)
+
+		// Prune need down to what needs to be newly mapped. Some parts of it may
+		// already be mapped by what inUse describes due to page alignment requirements
+		// for mapping. subtract's invariants are guaranteed by the fact that this
+		// function will never be asked to remap the same memory twice.
+		if inUseIndex > 0 {
+			need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1]))
+		}
+		if inUseIndex < len(s.inUse.ranges) {
+			need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex]))
+		}
+		// It's possible that after our pruning above, there's nothing new to map.
+		if need.size() == 0 {
+			continue
+		}
+
+		// Map and commit need.
+		sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat)
+		sysUsed(unsafe.Pointer(need.base.addr()), need.size())
+	}
+}
diff --git a/src/runtime/mpagealloc_test.go b/src/runtime/mpagealloc_test.go
new file mode 100644
index 0000000..89a4a25
--- /dev/null
+++ b/src/runtime/mpagealloc_test.go
@@ -0,0 +1,978 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	. "runtime"
+	"testing"
+)
+
+func checkPageAlloc(t *testing.T, want, got *PageAlloc) {
+	// Ensure start and end are correct.
+	wantStart, wantEnd := want.Bounds()
+	gotStart, gotEnd := got.Bounds()
+	if gotStart != wantStart {
+		t.Fatalf("start values not equal: got %d, want %d", gotStart, wantStart)
+	}
+	if gotEnd != wantEnd {
+		t.Fatalf("end values not equal: got %d, want %d", gotEnd, wantEnd)
+	}
+
+	for i := gotStart; i < gotEnd; i++ {
+		// Check the bitmaps. Note that we may have nil data.
+		gb, wb := got.PallocData(i), want.PallocData(i)
+		if gb == nil && wb == nil {
+			continue
+		}
+		if (gb == nil && wb != nil) || (gb != nil && wb == nil) {
+			t.Errorf("chunk %d nilness mismatch", i)
+		}
+		if !checkPallocBits(t, gb.PallocBits(), wb.PallocBits()) {
+			t.Logf("in chunk %d (mallocBits)", i)
+		}
+		if !checkPallocBits(t, gb.Scavenged(), wb.Scavenged()) {
+			t.Logf("in chunk %d (scavenged)", i)
+		}
+	}
+	// TODO(mknyszek): Verify summaries too?
+}
+
+func TestPageAllocGrow(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	type test struct {
+		chunks []ChunkIdx
+		inUse  []AddrRange
+	}
+	tests := map[string]test{
+		"One": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+			},
+		},
+		"Contiguous2": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 1,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
+			},
+		},
+		"Contiguous5": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 1,
+				BaseChunkIdx + 2,
+				BaseChunkIdx + 3,
+				BaseChunkIdx + 4,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)},
+			},
+		},
+		"Discontiguous": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 2,
+				BaseChunkIdx + 4,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+				{PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
+				{PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+			},
+		},
+		"Mixed": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 1,
+				BaseChunkIdx + 2,
+				BaseChunkIdx + 4,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)},
+				{PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+			},
+		},
+		"WildlyDiscontiguous": {
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 1,
+				BaseChunkIdx + 0x10,
+				BaseChunkIdx + 0x21,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
+				{PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)},
+				{PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)},
+			},
+		},
+		"ManyDiscontiguous": {
+			// The initial cap is 16. Test 33 ranges, to exercise the growth path (twice).
+			chunks: []ChunkIdx{
+				BaseChunkIdx, BaseChunkIdx + 2, BaseChunkIdx + 4, BaseChunkIdx + 6,
+				BaseChunkIdx + 8, BaseChunkIdx + 10, BaseChunkIdx + 12, BaseChunkIdx + 14,
+				BaseChunkIdx + 16, BaseChunkIdx + 18, BaseChunkIdx + 20, BaseChunkIdx + 22,
+				BaseChunkIdx + 24, BaseChunkIdx + 26, BaseChunkIdx + 28, BaseChunkIdx + 30,
+				BaseChunkIdx + 32, BaseChunkIdx + 34, BaseChunkIdx + 36, BaseChunkIdx + 38,
+				BaseChunkIdx + 40, BaseChunkIdx + 42, BaseChunkIdx + 44, BaseChunkIdx + 46,
+				BaseChunkIdx + 48, BaseChunkIdx + 50, BaseChunkIdx + 52, BaseChunkIdx + 54,
+				BaseChunkIdx + 56, BaseChunkIdx + 58, BaseChunkIdx + 60, BaseChunkIdx + 62,
+				BaseChunkIdx + 64,
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+				{PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
+				{PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+				{PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)},
+				{PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)},
+				{PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)},
+				{PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)},
+				{PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)},
+				{PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)},
+				{PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)},
+				{PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)},
+				{PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)},
+				{PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)},
+				{PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)},
+				{PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)},
+				{PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)},
+				{PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)},
+				{PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)},
+				{PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)},
+				{PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)},
+				{PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)},
+				{PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)},
+				{PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)},
+				{PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)},
+				{PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)},
+				{PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)},
+				{PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)},
+				{PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)},
+				{PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)},
+				{PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)},
+				{PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)},
+				{PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)},
+				{PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)},
+			},
+		},
+	}
+	if PageAlloc64Bit != 0 {
+		tests["ExtremelyDiscontiguous"] = test{
+			chunks: []ChunkIdx{
+				BaseChunkIdx,
+				BaseChunkIdx + 0x100000, // constant translates to O(TiB)
+			},
+			inUse: []AddrRange{
+				{PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+				{PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)},
+			},
+		}
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			// By creating a new pageAlloc, we will
+			// grow it for each chunk defined in x.
+			x := make(map[ChunkIdx][]BitRange)
+			for _, c := range v.chunks {
+				x[c] = []BitRange{}
+			}
+			b := NewPageAlloc(x, nil)
+			defer FreePageAlloc(b)
+
+			got := b.InUse()
+			want := v.inUse
+
+			// Check for mismatches.
+			if len(got) != len(want) {
+				t.Fail()
+			} else {
+				for i := range want {
+					if want[i] != got[i] {
+						t.Fail()
+						break
+					}
+				}
+			}
+			if t.Failed() {
+				t.Logf("found inUse mismatch")
+				t.Logf("got:")
+				for i, r := range got {
+					t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+				}
+				t.Logf("want:")
+				for i, r := range want {
+					t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+				}
+			}
+		})
+	}
+}
+
+func TestPageAllocAlloc(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	type hit struct {
+		npages, base, scav uintptr
+	}
+	type test struct {
+		scav   map[ChunkIdx][]BitRange
+		before map[ChunkIdx][]BitRange
+		after  map[ChunkIdx][]BitRange
+		hits   []hit
+	}
+	tests := map[string]test{
+		"AllFree1": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 1}, {2, 2}},
+			},
+			hits: []hit{
+				{1, PageBase(BaseChunkIdx, 0), PageSize},
+				{1, PageBase(BaseChunkIdx, 1), 0},
+				{1, PageBase(BaseChunkIdx, 2), PageSize},
+				{1, PageBase(BaseChunkIdx, 3), PageSize},
+				{1, PageBase(BaseChunkIdx, 4), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 5}},
+			},
+		},
+		"ManyArena1": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages - 1}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+			hits: []hit{
+				{1, PageBase(BaseChunkIdx+2, PallocChunkPages-1), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+		},
+		"NotContiguous1": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{0, 0}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{0, PallocChunkPages}},
+			},
+			hits: []hit{
+				{1, PageBase(BaseChunkIdx+0xff, 0), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{0, 1}},
+			},
+		},
+		"AllFree2": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 3}, {7, 1}},
+			},
+			hits: []hit{
+				{2, PageBase(BaseChunkIdx, 0), 2 * PageSize},
+				{2, PageBase(BaseChunkIdx, 2), PageSize},
+				{2, PageBase(BaseChunkIdx, 4), 0},
+				{2, PageBase(BaseChunkIdx, 6), PageSize},
+				{2, PageBase(BaseChunkIdx, 8), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 10}},
+			},
+		},
+		"Straddle2": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages - 1}},
+				BaseChunkIdx + 1: {{1, PallocChunkPages - 1}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{PallocChunkPages - 1, 1}},
+				BaseChunkIdx + 1: {},
+			},
+			hits: []hit{
+				{2, PageBase(BaseChunkIdx, PallocChunkPages-1), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+		},
+		"AllFree5": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 8}, {9, 1}, {17, 5}},
+			},
+			hits: []hit{
+				{5, PageBase(BaseChunkIdx, 0), 5 * PageSize},
+				{5, PageBase(BaseChunkIdx, 5), 4 * PageSize},
+				{5, PageBase(BaseChunkIdx, 10), 0},
+				{5, PageBase(BaseChunkIdx, 15), 3 * PageSize},
+				{5, PageBase(BaseChunkIdx, 20), 2 * PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 25}},
+			},
+		},
+		"AllFree64": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{21, 1}, {63, 65}},
+			},
+			hits: []hit{
+				{64, PageBase(BaseChunkIdx, 0), 2 * PageSize},
+				{64, PageBase(BaseChunkIdx, 64), 64 * PageSize},
+				{64, PageBase(BaseChunkIdx, 128), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 192}},
+			},
+		},
+		"AllFree65": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{129, 1}},
+			},
+			hits: []hit{
+				{65, PageBase(BaseChunkIdx, 0), 0},
+				{65, PageBase(BaseChunkIdx, 65), PageSize},
+				{65, PageBase(BaseChunkIdx, 130), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 195}},
+			},
+		},
+		"ExhaustPallocChunkPages-3": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{10, 1}},
+			},
+			hits: []hit{
+				{PallocChunkPages - 3, PageBase(BaseChunkIdx, 0), PageSize},
+				{PallocChunkPages - 3, 0, 0},
+				{1, PageBase(BaseChunkIdx, PallocChunkPages-3), 0},
+				{2, PageBase(BaseChunkIdx, PallocChunkPages-2), 0},
+				{1, 0, 0},
+				{PallocChunkPages - 3, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+		},
+		"AllFreePallocChunkPages": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 1}, {PallocChunkPages - 1, 1}},
+			},
+			hits: []hit{
+				{PallocChunkPages, PageBase(BaseChunkIdx, 0), 2 * PageSize},
+				{PallocChunkPages, 0, 0},
+				{1, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+		},
+		"StraddlePallocChunkPages": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {{PallocChunkPages / 2, PallocChunkPages / 2}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {{3, 100}},
+			},
+			hits: []hit{
+				{PallocChunkPages, PageBase(BaseChunkIdx, PallocChunkPages/2), 100 * PageSize},
+				{PallocChunkPages, 0, 0},
+				{1, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+		},
+		"StraddlePallocChunkPages+1": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+			hits: []hit{
+				{PallocChunkPages + 1, PageBase(BaseChunkIdx, PallocChunkPages/2), (PallocChunkPages + 1) * PageSize},
+				{PallocChunkPages, 0, 0},
+				{1, PageBase(BaseChunkIdx+1, PallocChunkPages/2+1), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages/2 + 2}},
+			},
+		},
+		"AllFreePallocChunkPages*2": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+			hits: []hit{
+				{PallocChunkPages * 2, PageBase(BaseChunkIdx, 0), 0},
+				{PallocChunkPages * 2, 0, 0},
+				{1, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+		},
+		"NotContiguousPallocChunkPages*2": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {},
+				BaseChunkIdx + 0x40: {},
+				BaseChunkIdx + 0x41: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0x40: {},
+				BaseChunkIdx + 0x41: {},
+			},
+			hits: []hit{
+				{PallocChunkPages * 2, PageBase(BaseChunkIdx+0x40, 0), 0},
+				{21, PageBase(BaseChunkIdx, 0), 21 * PageSize},
+				{1, PageBase(BaseChunkIdx, 21), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, 22}},
+				BaseChunkIdx + 0x40: {{0, PallocChunkPages}},
+				BaseChunkIdx + 0x41: {{0, PallocChunkPages}},
+			},
+		},
+		"StraddlePallocChunkPages*2": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {{PallocChunkPages / 2, PallocChunkPages / 2}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, 7}},
+				BaseChunkIdx + 1: {{3, 5}, {121, 10}},
+				BaseChunkIdx + 2: {{PallocChunkPages/2 + 12, 2}},
+			},
+			hits: []hit{
+				{PallocChunkPages * 2, PageBase(BaseChunkIdx, PallocChunkPages/2), 15 * PageSize},
+				{PallocChunkPages * 2, 0, 0},
+				{1, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+		},
+		"StraddlePallocChunkPages*5/4": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages * 3 / 4}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages * 3 / 4}},
+				BaseChunkIdx + 3: {{0, 0}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{PallocChunkPages / 2, PallocChunkPages/4 + 1}},
+				BaseChunkIdx + 2: {{PallocChunkPages / 3, 1}},
+				BaseChunkIdx + 3: {{PallocChunkPages * 2 / 3, 1}},
+			},
+			hits: []hit{
+				{PallocChunkPages * 5 / 4, PageBase(BaseChunkIdx+2, PallocChunkPages*3/4), PageSize},
+				{PallocChunkPages * 5 / 4, 0, 0},
+				{1, PageBase(BaseChunkIdx+1, PallocChunkPages*3/4), PageSize},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages*3/4 + 1}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+				BaseChunkIdx + 3: {{0, PallocChunkPages}},
+			},
+		},
+		"AllFreePallocChunkPages*7+5": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {},
+				BaseChunkIdx + 3: {},
+				BaseChunkIdx + 4: {},
+				BaseChunkIdx + 5: {},
+				BaseChunkIdx + 6: {},
+				BaseChunkIdx + 7: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{50, 1}},
+				BaseChunkIdx + 1: {{31, 1}},
+				BaseChunkIdx + 2: {{7, 1}},
+				BaseChunkIdx + 3: {{200, 1}},
+				BaseChunkIdx + 4: {{3, 1}},
+				BaseChunkIdx + 5: {{51, 1}},
+				BaseChunkIdx + 6: {{20, 1}},
+				BaseChunkIdx + 7: {{1, 1}},
+			},
+			hits: []hit{
+				{PallocChunkPages*7 + 5, PageBase(BaseChunkIdx, 0), 8 * PageSize},
+				{PallocChunkPages*7 + 5, 0, 0},
+				{1, PageBase(BaseChunkIdx+7, 5), 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+				BaseChunkIdx + 3: {{0, PallocChunkPages}},
+				BaseChunkIdx + 4: {{0, PallocChunkPages}},
+				BaseChunkIdx + 5: {{0, PallocChunkPages}},
+				BaseChunkIdx + 6: {{0, PallocChunkPages}},
+				BaseChunkIdx + 7: {{0, 6}},
+			},
+		},
+	}
+	if PageAlloc64Bit != 0 {
+		const chunkIdxBigJump = 0x100000 // chunk index offset which translates to O(TiB)
+
+		// This test attempts to trigger a bug wherein we look at unmapped summary
+		// memory in cases other than when we exhaust the heap.
+		//
+		// It achieves this by placing a chunk such that its summary will be
+		// at the very end of a physical page. It then also places another chunk
+		// much further up in the address space, such that any allocations into the
+		// first chunk do not exhaust the heap and the second chunk's summary is not in the
+		// page immediately adjacent to the first chunk's summary's page.
+		// Allocating into this first chunk to exhaustion and then into the second
+		// chunk may then trigger a check in the allocator which erroneously looks at
+		// unmapped summary memory and crashes.
+
+		// Figure out how many chunk summaries fit in a physical page, then align BaseChunkIdx
+		// to a physical page in the chunk summary array. Here we only assume that
+		// each summary array is aligned to some physical page.
+		sumsPerPhysPage := ChunkIdx(PhysPageSize / PallocSumBytes)
+		baseChunkIdx := BaseChunkIdx &^ (sumsPerPhysPage - 1)
+		tests["DiscontiguousMappedSumBoundary"] = test{
+			before: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {},
+				baseChunkIdx + chunkIdxBigJump:     {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {},
+				baseChunkIdx + chunkIdxBigJump:     {},
+			},
+			hits: []hit{
+				{PallocChunkPages - 1, PageBase(baseChunkIdx+sumsPerPhysPage-1, 0), 0},
+				{1, PageBase(baseChunkIdx+sumsPerPhysPage-1, PallocChunkPages-1), 0},
+				{1, PageBase(baseChunkIdx+chunkIdxBigJump, 0), 0},
+				{PallocChunkPages - 1, PageBase(baseChunkIdx+chunkIdxBigJump, 1), 0},
+				{1, 0, 0},
+			},
+			after: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {{0, PallocChunkPages}},
+				baseChunkIdx + chunkIdxBigJump:     {{0, PallocChunkPages}},
+			},
+		}
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := NewPageAlloc(v.before, v.scav)
+			defer FreePageAlloc(b)
+
+			for iter, i := range v.hits {
+				a, s := b.Alloc(i.npages)
+				if a != i.base {
+					t.Fatalf("bad alloc #%d: want base 0x%x, got 0x%x", iter+1, i.base, a)
+				}
+				if s != i.scav {
+					t.Fatalf("bad alloc #%d: want scav %d, got %d", iter+1, i.scav, s)
+				}
+			}
+			want := NewPageAlloc(v.after, v.scav)
+			defer FreePageAlloc(want)
+
+			checkPageAlloc(t, want, b)
+		})
+	}
+}
+
+func TestPageAllocExhaust(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	for _, npages := range []uintptr{1, 2, 3, 4, 5, 8, 16, 64, 1024, 1025, 2048, 2049} {
+		npages := npages
+		t.Run(fmt.Sprintf("%d", npages), func(t *testing.T) {
+			// Construct b.
+			bDesc := make(map[ChunkIdx][]BitRange)
+			for i := ChunkIdx(0); i < 4; i++ {
+				bDesc[BaseChunkIdx+i] = []BitRange{}
+			}
+			b := NewPageAlloc(bDesc, nil)
+			defer FreePageAlloc(b)
+
+			// Allocate into b with npages until we've exhausted the heap.
+			nAlloc := (PallocChunkPages * 4) / int(npages)
+			for i := 0; i < nAlloc; i++ {
+				addr := PageBase(BaseChunkIdx, uint(i)*uint(npages))
+				if a, _ := b.Alloc(npages); a != addr {
+					t.Fatalf("bad alloc #%d: want 0x%x, got 0x%x", i+1, addr, a)
+				}
+			}
+
+			// Check to make sure the next allocation fails.
+			if a, _ := b.Alloc(npages); a != 0 {
+				t.Fatalf("bad alloc #%d: want 0, got 0x%x", nAlloc, a)
+			}
+
+			// Construct what we want the heap to look like now.
+			allocPages := nAlloc * int(npages)
+			wantDesc := make(map[ChunkIdx][]BitRange)
+			for i := ChunkIdx(0); i < 4; i++ {
+				if allocPages >= PallocChunkPages {
+					wantDesc[BaseChunkIdx+i] = []BitRange{{0, PallocChunkPages}}
+					allocPages -= PallocChunkPages
+				} else if allocPages > 0 {
+					wantDesc[BaseChunkIdx+i] = []BitRange{{0, uint(allocPages)}}
+					allocPages = 0
+				} else {
+					wantDesc[BaseChunkIdx+i] = []BitRange{}
+				}
+			}
+			want := NewPageAlloc(wantDesc, nil)
+			defer FreePageAlloc(want)
+
+			// Check to make sure the heap b matches what we want.
+			checkPageAlloc(t, want, b)
+		})
+	}
+}
+
+func TestPageAllocFree(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	tests := map[string]struct {
+		before map[ChunkIdx][]BitRange
+		after  map[ChunkIdx][]BitRange
+		npages uintptr
+		frees  []uintptr
+	}{
+		"Free1": {
+			npages: 1,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+				PageBase(BaseChunkIdx, 1),
+				PageBase(BaseChunkIdx, 2),
+				PageBase(BaseChunkIdx, 3),
+				PageBase(BaseChunkIdx, 4),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{5, PallocChunkPages - 5}},
+			},
+		},
+		"ManyArena1": {
+			npages: 1,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, PallocChunkPages/2),
+				PageBase(BaseChunkIdx+1, 0),
+				PageBase(BaseChunkIdx+2, PallocChunkPages-1),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}, {PallocChunkPages/2 + 1, PallocChunkPages/2 - 1}},
+				BaseChunkIdx + 1: {{1, PallocChunkPages - 1}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages - 1}},
+			},
+		},
+		"Free2": {
+			npages: 2,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+				PageBase(BaseChunkIdx, 2),
+				PageBase(BaseChunkIdx, 4),
+				PageBase(BaseChunkIdx, 6),
+				PageBase(BaseChunkIdx, 8),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{10, PallocChunkPages - 10}},
+			},
+		},
+		"Straddle2": {
+			npages: 2,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{PallocChunkPages - 1, 1}},
+				BaseChunkIdx + 1: {{0, 1}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, PallocChunkPages-1),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+		},
+		"Free5": {
+			npages: 5,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+				PageBase(BaseChunkIdx, 5),
+				PageBase(BaseChunkIdx, 10),
+				PageBase(BaseChunkIdx, 15),
+				PageBase(BaseChunkIdx, 20),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{25, PallocChunkPages - 25}},
+			},
+		},
+		"Free64": {
+			npages: 64,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+				PageBase(BaseChunkIdx, 64),
+				PageBase(BaseChunkIdx, 128),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{192, PallocChunkPages - 192}},
+			},
+		},
+		"Free65": {
+			npages: 65,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+				PageBase(BaseChunkIdx, 65),
+				PageBase(BaseChunkIdx, 130),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{195, PallocChunkPages - 195}},
+			},
+		},
+		"FreePallocChunkPages": {
+			npages: PallocChunkPages,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+		},
+		"StraddlePallocChunkPages": {
+			npages: PallocChunkPages,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{PallocChunkPages / 2, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages / 2}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, PallocChunkPages/2),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+		},
+		"StraddlePallocChunkPages+1": {
+			npages: PallocChunkPages + 1,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, PallocChunkPages/2),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {{PallocChunkPages/2 + 1, PallocChunkPages/2 - 1}},
+			},
+		},
+		"FreePallocChunkPages*2": {
+			npages: PallocChunkPages * 2,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+			},
+		},
+		"StraddlePallocChunkPages*2": {
+			npages: PallocChunkPages * 2,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, PallocChunkPages/2),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages / 2}},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {{PallocChunkPages / 2, PallocChunkPages / 2}},
+			},
+		},
+		"AllFreePallocChunkPages*7+5": {
+			npages: PallocChunkPages*7 + 5,
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+				BaseChunkIdx + 3: {{0, PallocChunkPages}},
+				BaseChunkIdx + 4: {{0, PallocChunkPages}},
+				BaseChunkIdx + 5: {{0, PallocChunkPages}},
+				BaseChunkIdx + 6: {{0, PallocChunkPages}},
+				BaseChunkIdx + 7: {{0, PallocChunkPages}},
+			},
+			frees: []uintptr{
+				PageBase(BaseChunkIdx, 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {},
+				BaseChunkIdx + 3: {},
+				BaseChunkIdx + 4: {},
+				BaseChunkIdx + 5: {},
+				BaseChunkIdx + 6: {},
+				BaseChunkIdx + 7: {{5, PallocChunkPages - 5}},
+			},
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := NewPageAlloc(v.before, nil)
+			defer FreePageAlloc(b)
+
+			for _, addr := range v.frees {
+				b.Free(addr, v.npages)
+			}
+			want := NewPageAlloc(v.after, nil)
+			defer FreePageAlloc(want)
+
+			checkPageAlloc(t, want, b)
+		})
+	}
+}
+
+func TestPageAllocAllocAndFree(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	type hit struct {
+		alloc  bool
+		npages uintptr
+		base   uintptr
+	}
+	tests := map[string]struct {
+		init map[ChunkIdx][]BitRange
+		hits []hit
+	}{
+		// TODO(mknyszek): Write more tests here.
+		"Chunks8": {
+			init: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {},
+				BaseChunkIdx + 1: {},
+				BaseChunkIdx + 2: {},
+				BaseChunkIdx + 3: {},
+				BaseChunkIdx + 4: {},
+				BaseChunkIdx + 5: {},
+				BaseChunkIdx + 6: {},
+				BaseChunkIdx + 7: {},
+			},
+			hits: []hit{
+				{true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{false, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+				{true, 1, PageBase(BaseChunkIdx, 0)},
+				{false, 1, PageBase(BaseChunkIdx, 0)},
+				{true, PallocChunkPages * 8, PageBase(BaseChunkIdx, 0)},
+			},
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := NewPageAlloc(v.init, nil)
+			defer FreePageAlloc(b)
+
+			for iter, i := range v.hits {
+				if i.alloc {
+					if a, _ := b.Alloc(i.npages); a != i.base {
+						t.Fatalf("bad alloc #%d: want 0x%x, got 0x%x", iter+1, i.base, a)
+					}
+				} else {
+					b.Free(i.base, i.npages)
+				}
+			}
+		})
+	}
+}
diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go
new file mode 100644
index 0000000..683a997
--- /dev/null
+++ b/src/runtime/mpagecache.go
@@ -0,0 +1,161 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
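+// pageCachePages is the number of pages a single pageCache covers: one
+// page per bit of the 8-byte cache field below, i.e. 64 pages.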
+const pageCachePages = 8 * unsafe.Sizeof(pageCache{}.cache)
+
+// pageCache represents a per-p cache of pages the allocator can
+// allocate from without a lock. More specifically, it represents
+// a pageCachePages*pageSize chunk of memory with 0 or more free
+// pages in it.
+type pageCache struct {
+	base  uintptr // base address of the chunk
+	cache uint64  // 64-bit bitmap representing free pages (1 means free)
+	scav  uint64  // 64-bit bitmap representing scavenged pages (1 means scavenged)
+}
+
+// empty returns true if the pageCache has no free pages, and false
+// otherwise.
+func (c *pageCache) empty() bool {
+	return c.cache == 0
+}
+
+// alloc allocates npages from the page cache and is the main entry
+// point for allocation.
+//
+// Returns a base address and the amount of scavenged memory in the
+// allocated region in bytes.
+//
+// Returns a base address of zero on failure, in which case the
+// amount of scavenged memory should be ignored.
+func (c *pageCache) alloc(npages uintptr) (uintptr, uintptr) {
+	if c.cache == 0 {
+		return 0, 0
+	}
+	if npages == 1 {
+		i := uintptr(sys.TrailingZeros64(c.cache))
+		scav := (c.scav >> i) & 1
+		c.cache &^= 1 << i // clear free bit to mark in-use
+		c.scav &^= 1 << i  // clear bit to mark unscavenged
+		return c.base + i*pageSize, uintptr(scav) * pageSize
+	}
+	return c.allocN(npages)
+}
+
+// allocN is a helper which attempts to allocate npages worth of pages
+// from the cache. It represents the general case for allocating from
+// the page cache.
+//
+// Returns a base address and the amount of scavenged memory in the
+// allocated region in bytes.
+func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) {
+	i := findBitRange64(c.cache, uint(npages))
+	if i >= 64 {
+		return 0, 0
+	}
+	mask := ((uint64(1) << npages) - 1) << i
+	scav := sys.OnesCount64(c.scav & mask)
+	c.cache &^= mask // mark in-use bits
+	c.scav &^= mask  // clear scavenged bits
+	return c.base + uintptr(i*pageSize), uintptr(scav) * pageSize
+}
+
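+// As a rough illustration of the bitmap math above (the values are chosen
+// for the example only), with free pages marked by 1 bits:
+//
+//	c := pageCache{base: 0x100000, cache: 0b11110110}
+//	base, _ := c.alloc(2) // finds the 2-page run at bit 1
+//	// base == c.base + 1*pageSize and c.cache is now 0b11110000.
+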
+// flush empties out unallocated free pages in the given cache
+// into s. Then, it clears the cache, such that empty returns
+// true.
+//
+// s.mheapLock must be held or the world must be stopped.
+func (c *pageCache) flush(s *pageAlloc) {
+	if c.empty() {
+		return
+	}
+	ci := chunkIndex(c.base)
+	pi := chunkPageIndex(c.base)
+
+	// This method is called very infrequently, so just do the
+	// slower, safer thing by iterating over each bit individually.
+	for i := uint(0); i < 64; i++ {
+		if c.cache&(1<<i) != 0 {
+			s.chunkOf(ci).free1(pi + i)
+		}
+		if c.scav&(1<<i) != 0 {
+			s.chunkOf(ci).scavenged.setRange(pi+i, 1)
+		}
+	}
+	// Since this is a lot like a free, we need to make sure
+	// we update the searchAddr just like free does.
+	if b := (offAddr{c.base}); b.lessThan(s.searchAddr) {
+		s.searchAddr = b
+	}
+	s.update(c.base, pageCachePages, false, false)
+	*c = pageCache{}
+}
+
+// allocToCache acquires a pageCachePages-aligned chunk of free pages which
+// may not be contiguous, and returns a pageCache structure which owns the
+// chunk.
+//
+// s.mheapLock must be held.
+func (s *pageAlloc) allocToCache() pageCache {
+	// If the searchAddr refers to a region which has a higher address than
+	// any known chunk, then we know we're out of memory.
+	if chunkIndex(s.searchAddr.addr()) >= s.end {
+		return pageCache{}
+	}
+	c := pageCache{}
+	ci := chunkIndex(s.searchAddr.addr()) // chunk index
+	if s.summary[len(s.summary)-1][ci] != 0 {
+		// Fast path: there are free pages at or near the searchAddr address.
+		chunk := s.chunkOf(ci)
+		j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr()))
+		if j == ^uint(0) {
+			throw("bad summary data")
+		}
+		c = pageCache{
+			base:  chunkBase(ci) + alignDown(uintptr(j), 64)*pageSize,
+			cache: ^chunk.pages64(j),
+			scav:  chunk.scavenged.block64(j),
+		}
+	} else {
+		// Slow path: the searchAddr address had nothing there, so go find
+		// the first free page the slow way.
+		addr, _ := s.find(1)
+		if addr == 0 {
+			// We failed to find adequate free space, so mark the searchAddr as OoM
+			// and return an empty pageCache.
+			s.searchAddr = maxSearchAddr
+			return pageCache{}
+		}
+		ci := chunkIndex(addr)
+		chunk := s.chunkOf(ci)
+		c = pageCache{
+			base:  alignDown(addr, 64*pageSize),
+			cache: ^chunk.pages64(chunkPageIndex(addr)),
+			scav:  chunk.scavenged.block64(chunkPageIndex(addr)),
+		}
+	}
+
+	// Set the bits as allocated and clear the scavenged bits.
+	s.allocRange(c.base, pageCachePages)
+
+	// Update as an allocation, but note that it's not contiguous.
+	s.update(c.base, pageCachePages, false, true)
+
+	// Set the search address to the last page represented by the cache.
+	// Since all of the pages in this block are going to the cache, and we
+	// searched for the first free page, we can confidently start at the
+	// next page.
+	//
+	// However, s.searchAddr is not allowed to point into unmapped heap memory
+	// unless it is maxSearchAddr, so make it the last page as opposed to
+	// the page after.
+	s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)}
+	return c
+}
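+
+// A sketch of how these pieces are intended to fit together (the per-P
+// plumbing and locking around them are assumed here, not shown in this file):
+//
+//	if base, scav := c.alloc(npages); base != 0 {
+//		return base, scav
+//	}
+//	// The cache is dry: refill it from the page allocator (with
+//	// s.mheapLock held) and retry.
+//	*c = s.allocToCache()
+//	return c.alloc(npages)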
diff --git a/src/runtime/mpagecache_test.go b/src/runtime/mpagecache_test.go
new file mode 100644
index 0000000..2ed0c0a
--- /dev/null
+++ b/src/runtime/mpagecache_test.go
@@ -0,0 +1,399 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"math/rand"
+	. "runtime"
+	"testing"
+)
+
+func checkPageCache(t *testing.T, got, want PageCache) {
+	if got.Base() != want.Base() {
+		t.Errorf("bad pageCache base: got 0x%x, want 0x%x", got.Base(), want.Base())
+	}
+	if got.Cache() != want.Cache() {
+		t.Errorf("bad pageCache bits: got %016x, want %016x", got.Cache(), want.Cache())
+	}
+	if got.Scav() != want.Scav() {
+		t.Errorf("bad pageCache scav: got %016x, want %016x", got.Scav(), want.Scav())
+	}
+}
+
+func TestPageCacheAlloc(t *testing.T) {
+	base := PageBase(BaseChunkIdx, 0)
+	type hit struct {
+		npages uintptr
+		base   uintptr
+		scav   uintptr
+	}
+	tests := map[string]struct {
+		cache PageCache
+		hits  []hit
+	}{
+		"Empty": {
+			cache: NewPageCache(base, 0, 0),
+			hits: []hit{
+				{1, 0, 0},
+				{2, 0, 0},
+				{3, 0, 0},
+				{4, 0, 0},
+				{5, 0, 0},
+				{11, 0, 0},
+				{12, 0, 0},
+				{16, 0, 0},
+				{27, 0, 0},
+				{32, 0, 0},
+				{43, 0, 0},
+				{57, 0, 0},
+				{64, 0, 0},
+				{121, 0, 0},
+			},
+		},
+		"Lo1": {
+			cache: NewPageCache(base, 0x1, 0x1),
+			hits: []hit{
+				{1, base, PageSize},
+				{1, 0, 0},
+				{10, 0, 0},
+			},
+		},
+		"Hi1": {
+			cache: NewPageCache(base, 0x1<<63, 0x1),
+			hits: []hit{
+				{1, base + 63*PageSize, 0},
+				{1, 0, 0},
+				{10, 0, 0},
+			},
+		},
+		"Swiss1": {
+			cache: NewPageCache(base, 0x20005555, 0x5505),
+			hits: []hit{
+				{2, 0, 0},
+				{1, base, PageSize},
+				{1, base + 2*PageSize, PageSize},
+				{1, base + 4*PageSize, 0},
+				{1, base + 6*PageSize, 0},
+				{1, base + 8*PageSize, PageSize},
+				{1, base + 10*PageSize, PageSize},
+				{1, base + 12*PageSize, PageSize},
+				{1, base + 14*PageSize, PageSize},
+				{1, base + 29*PageSize, 0},
+				{1, 0, 0},
+				{10, 0, 0},
+			},
+		},
+		"Lo2": {
+			cache: NewPageCache(base, 0x3, 0x2<<62),
+			hits: []hit{
+				{2, base, 0},
+				{2, 0, 0},
+				{1, 0, 0},
+			},
+		},
+		"Hi2": {
+			cache: NewPageCache(base, 0x3<<62, 0x3<<62),
+			hits: []hit{
+				{2, base + 62*PageSize, 2 * PageSize},
+				{2, 0, 0},
+				{1, 0, 0},
+			},
+		},
+		"Swiss2": {
+			cache: NewPageCache(base, 0x3333<<31, 0x3030<<31),
+			hits: []hit{
+				{2, base + 31*PageSize, 0},
+				{2, base + 35*PageSize, 2 * PageSize},
+				{2, base + 39*PageSize, 0},
+				{2, base + 43*PageSize, 2 * PageSize},
+				{2, 0, 0},
+			},
+		},
+		"Hi53": {
+			cache: NewPageCache(base, ((uint64(1)<<53)-1)<<10, ((uint64(1)<<16)-1)<<10),
+			hits: []hit{
+				{53, base + 10*PageSize, 16 * PageSize},
+				{53, 0, 0},
+				{1, 0, 0},
+			},
+		},
+		"Full53": {
+			cache: NewPageCache(base, ^uint64(0), ((uint64(1)<<16)-1)<<10),
+			hits: []hit{
+				{53, base, 16 * PageSize},
+				{53, 0, 0},
+				{1, base + 53*PageSize, 0},
+			},
+		},
+		"Full64": {
+			cache: NewPageCache(base, ^uint64(0), ^uint64(0)),
+			hits: []hit{
+				{64, base, 64 * PageSize},
+				{64, 0, 0},
+				{1, 0, 0},
+			},
+		},
+		"FullMixed": {
+			cache: NewPageCache(base, ^uint64(0), ^uint64(0)),
+			hits: []hit{
+				{5, base, 5 * PageSize},
+				{7, base + 5*PageSize, 7 * PageSize},
+				{1, base + 12*PageSize, 1 * PageSize},
+				{23, base + 13*PageSize, 23 * PageSize},
+				{63, 0, 0},
+				{3, base + 36*PageSize, 3 * PageSize},
+				{3, base + 39*PageSize, 3 * PageSize},
+				{3, base + 42*PageSize, 3 * PageSize},
+				{12, base + 45*PageSize, 12 * PageSize},
+				{11, 0, 0},
+				{4, base + 57*PageSize, 4 * PageSize},
+				{4, 0, 0},
+				{6, 0, 0},
+				{36, 0, 0},
+				{2, base + 61*PageSize, 2 * PageSize},
+				{3, 0, 0},
+				{1, base + 63*PageSize, 1 * PageSize},
+				{4, 0, 0},
+				{2, 0, 0},
+				{62, 0, 0},
+				{1, 0, 0},
+			},
+		},
+	}
+	for name, test := range tests {
+		test := test
+		t.Run(name, func(t *testing.T) {
+			c := test.cache
+			for i, h := range test.hits {
+				b, s := c.Alloc(h.npages)
+				if b != h.base {
+					t.Fatalf("bad alloc base #%d: got 0x%x, want 0x%x", i, b, h.base)
+				}
+				if s != h.scav {
+					t.Fatalf("bad alloc scav #%d: got %d, want %d", i, s, h.scav)
+				}
+			}
+		})
+	}
+}
+
+func TestPageCacheFlush(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	bits64ToBitRanges := func(bits uint64, base uint) []BitRange {
+		var ranges []BitRange
+		start, size := uint(0), uint(0)
+		for i := 0; i < 64; i++ {
+			if bits&(1<<i) != 0 {
+				if size == 0 {
+					start = uint(i) + base
+				}
+				size++
+			} else {
+				if size != 0 {
+					ranges = append(ranges, BitRange{start, size})
+					size = 0
+				}
+			}
+		}
+		if size != 0 {
+			ranges = append(ranges, BitRange{start, size})
+		}
+		return ranges
+	}
+	runTest := func(t *testing.T, base uint, cache, scav uint64) {
+		// Set up the before state.
+		beforeAlloc := map[ChunkIdx][]BitRange{
+			BaseChunkIdx: {{base, 64}},
+		}
+		beforeScav := map[ChunkIdx][]BitRange{
+			BaseChunkIdx: {},
+		}
+		b := NewPageAlloc(beforeAlloc, beforeScav)
+		defer FreePageAlloc(b)
+
+		// Create and flush the cache.
+		c := NewPageCache(PageBase(BaseChunkIdx, base), cache, scav)
+		c.Flush(b)
+		if !c.Empty() {
+			t.Errorf("pageCache flush did not clear cache")
+		}
+
+		// Set up the expected after state.
+		afterAlloc := map[ChunkIdx][]BitRange{
+			BaseChunkIdx: bits64ToBitRanges(^cache, base),
+		}
+		afterScav := map[ChunkIdx][]BitRange{
+			BaseChunkIdx: bits64ToBitRanges(scav, base),
+		}
+		want := NewPageAlloc(afterAlloc, afterScav)
+		defer FreePageAlloc(want)
+
+		// Check to see if it worked.
+		checkPageAlloc(t, want, b)
+	}
+
+	// Empty.
+	runTest(t, 0, 0, 0)
+
+	// Full.
+	runTest(t, 0, ^uint64(0), ^uint64(0))
+
+	// Random.
+	for i := 0; i < 100; i++ {
+		// Generate random valid base within a chunk.
+		base := uint(rand.Intn(PallocChunkPages/64)) * 64
+
+		// Generate random cache.
+		cache := rand.Uint64()
+		scav := rand.Uint64() & cache
+
+		// Run the test.
+		runTest(t, base, cache, scav)
+	}
+}
+
+func TestPageAllocAllocToCache(t *testing.T) {
+	if GOOS == "openbsd" && testing.Short() {
+		t.Skip("skipping because virtual memory is limited; see #36210")
+	}
+	type test struct {
+		before map[ChunkIdx][]BitRange
+		scav   map[ChunkIdx][]BitRange
+		hits   []PageCache // expected base addresses and patterns
+		after  map[ChunkIdx][]BitRange
+	}
+	tests := map[string]test{
+		"AllFree": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{1, 1}, {64, 64}},
+			},
+			hits: []PageCache{
+				NewPageCache(PageBase(BaseChunkIdx, 0), ^uint64(0), 0x2),
+				NewPageCache(PageBase(BaseChunkIdx, 64), ^uint64(0), ^uint64(0)),
+				NewPageCache(PageBase(BaseChunkIdx, 128), ^uint64(0), 0),
+				NewPageCache(PageBase(BaseChunkIdx, 192), ^uint64(0), 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 256}},
+			},
+		},
+		"ManyArena": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages - 64}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {},
+			},
+			hits: []PageCache{
+				NewPageCache(PageBase(BaseChunkIdx+2, PallocChunkPages-64), ^uint64(0), 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:     {{0, PallocChunkPages}},
+				BaseChunkIdx + 1: {{0, PallocChunkPages}},
+				BaseChunkIdx + 2: {{0, PallocChunkPages}},
+			},
+		},
+		"NotContiguous": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{0, 0}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{31, 67}},
+			},
+			hits: []PageCache{
+				NewPageCache(PageBase(BaseChunkIdx+0xff, 0), ^uint64(0), ((uint64(1)<<33)-1)<<31),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx:        {{0, PallocChunkPages}},
+				BaseChunkIdx + 0xff: {{0, 64}},
+			},
+		},
+		"First": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 32}, {33, 31}, {96, 32}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{1, 4}, {31, 5}, {66, 2}},
+			},
+			hits: []PageCache{
+				NewPageCache(PageBase(BaseChunkIdx, 0), 1<<32, 1<<32),
+				NewPageCache(PageBase(BaseChunkIdx, 64), (uint64(1)<<32)-1, 0x3<<2),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, 128}},
+			},
+		},
+		"Fail": {
+			before: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+			hits: []PageCache{
+				NewPageCache(0, 0, 0),
+				NewPageCache(0, 0, 0),
+				NewPageCache(0, 0, 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				BaseChunkIdx: {{0, PallocChunkPages}},
+			},
+		},
+	}
+	if PageAlloc64Bit != 0 {
+		const chunkIdxBigJump = 0x100000 // chunk index offset which translates to O(TiB)
+
+		// This test is similar to the one with the same name for
+		// pageAlloc.alloc and serves the same purpose.
+		// See mpagealloc_test.go for details.
+		sumsPerPhysPage := ChunkIdx(PhysPageSize / PallocSumBytes)
+		baseChunkIdx := BaseChunkIdx &^ (sumsPerPhysPage - 1)
+		tests["DiscontiguousMappedSumBoundary"] = test{
+			before: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {{0, PallocChunkPages - 1}},
+				baseChunkIdx + chunkIdxBigJump:     {{1, PallocChunkPages - 1}},
+			},
+			scav: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {},
+				baseChunkIdx + chunkIdxBigJump:     {},
+			},
+			hits: []PageCache{
+				NewPageCache(PageBase(baseChunkIdx+sumsPerPhysPage-1, PallocChunkPages-64), 1<<63, 0),
+				NewPageCache(PageBase(baseChunkIdx+chunkIdxBigJump, 0), 1, 0),
+				NewPageCache(0, 0, 0),
+			},
+			after: map[ChunkIdx][]BitRange{
+				baseChunkIdx + sumsPerPhysPage - 1: {{0, PallocChunkPages}},
+				baseChunkIdx + chunkIdxBigJump:     {{0, PallocChunkPages}},
+			},
+		}
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := NewPageAlloc(v.before, v.scav)
+			defer FreePageAlloc(b)
+
+			for _, expect := range v.hits {
+				checkPageCache(t, b.AllocToCache(), expect)
+				if t.Failed() {
+					return
+				}
+			}
+			want := NewPageAlloc(v.after, v.scav)
+			defer FreePageAlloc(want)
+
+			checkPageAlloc(t, want, b)
+		})
+	}
+}
diff --git a/src/runtime/mpallocbits.go b/src/runtime/mpallocbits.go
new file mode 100644
index 0000000..a801134
--- /dev/null
+++ b/src/runtime/mpallocbits.go
@@ -0,0 +1,388 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+)
+
+// pageBits is a bitmap representing one bit per page in a palloc chunk.
+type pageBits [pallocChunkPages / 64]uint64
+
+// get returns the value of the i'th bit in the bitmap.
+func (b *pageBits) get(i uint) uint {
+	return uint((b[i/64] >> (i % 64)) & 1)
+}
+
+// block64 returns the 64-bit aligned block of bits containing the i'th bit.
+func (b *pageBits) block64(i uint) uint64 {
+	return b[i/64]
+}
+
+// set sets bit i of pageBits.
+func (b *pageBits) set(i uint) {
+	b[i/64] |= 1 << (i % 64)
+}
+
+// setRange sets bits in the range [i, i+n).
+func (b *pageBits) setRange(i, n uint) {
+	_ = b[i/64]
+	if n == 1 {
+		// Fast path for the n == 1 case.
+		b.set(i)
+		return
+	}
+	// Set bits [i, j].
+	j := i + n - 1
+	if i/64 == j/64 {
+		b[i/64] |= ((uint64(1) << n) - 1) << (i % 64)
+		return
+	}
+	_ = b[j/64]
+	// Set leading bits.
+	b[i/64] |= ^uint64(0) << (i % 64)
+	for k := i/64 + 1; k < j/64; k++ {
+		b[k] = ^uint64(0)
+	}
+	// Set trailing bits.
+	b[j/64] |= (uint64(1) << (j%64 + 1)) - 1
+}
+
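+// For example (an illustrative sketch), a range that straddles a word
+// boundary is split into a leading mask and a trailing mask:
+//
+//	var b pageBits
+//	b.setRange(62, 4)
+//	// b[0] == 0xc000000000000000 (bits 62 and 63)
+//	// b[1] == 0x3                (bits 0 and 1)
+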
+// setAll sets all the bits of b.
+func (b *pageBits) setAll() {
+	for i := range b {
+		b[i] = ^uint64(0)
+	}
+}
+
+// clear clears bit i of pageBits.
+func (b *pageBits) clear(i uint) {
+	b[i/64] &^= 1 << (i % 64)
+}
+
+// clearRange clears bits in the range [i, i+n).
+func (b *pageBits) clearRange(i, n uint) {
+	_ = b[i/64]
+	if n == 1 {
+		// Fast path for the n == 1 case.
+		b.clear(i)
+		return
+	}
+	// Clear bits [i, j].
+	j := i + n - 1
+	if i/64 == j/64 {
+		b[i/64] &^= ((uint64(1) << n) - 1) << (i % 64)
+		return
+	}
+	_ = b[j/64]
+	// Clear leading bits.
+	b[i/64] &^= ^uint64(0) << (i % 64)
+	for k := i/64 + 1; k < j/64; k++ {
+		b[k] = 0
+	}
+	// Clear trailing bits.
+	b[j/64] &^= (uint64(1) << (j%64 + 1)) - 1
+}
+
+// clearAll frees all the bits of b.
+func (b *pageBits) clearAll() {
+	for i := range b {
+		b[i] = 0
+	}
+}
+
+// popcntRange counts the number of set bits in the
+// range [i, i+n).
+func (b *pageBits) popcntRange(i, n uint) (s uint) {
+	if n == 1 {
+		return uint((b[i/64] >> (i % 64)) & 1)
+	}
+	_ = b[i/64]
+	j := i + n - 1
+	if i/64 == j/64 {
+		return uint(sys.OnesCount64((b[i/64] >> (i % 64)) & ((1 << n) - 1)))
+	}
+	_ = b[j/64]
+	s += uint(sys.OnesCount64(b[i/64] >> (i % 64)))
+	for k := i/64 + 1; k < j/64; k++ {
+		s += uint(sys.OnesCount64(b[k]))
+	}
+	s += uint(sys.OnesCount64(b[j/64] & ((1 << (j%64 + 1)) - 1)))
+	return
+}
+
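+// For example (illustrative values), counting a run that straddles the
+// boundary between two words:
+//
+//	var b pageBits
+//	b.setRange(60, 8)
+//	// b.popcntRange(58, 12) == 8, while b.popcntRange(0, 64) == 4
+//	// and b.popcntRange(64, 64) == 4.
+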
+// pallocBits is a bitmap that tracks page allocations for at most one
+// palloc chunk.
+//
+// The precise representation is an implementation detail, but for the
+// sake of documentation, 0s are free pages and 1s are allocated pages.
+type pallocBits pageBits
+
+// consec8tab is a table containing the maximum number of consecutive
+// zero bits for any uint8 value.
+//
+// The table is generated by calling consec8(i) for each
+// possible uint8 value, which is defined as:
+//
+// // consec8 counts the maximum number of consecutive 0 bits
+// // in a uint8.
+// func consec8(n uint8) int {
+// 	n = ^n
+// 	i := 0
+// 	for n != 0 {
+// 		n &= (n << 1)
+// 		i++
+// 	}
+// 	return i
+// }
+var consec8tab = [256]uint{
+	8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
+	4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
+	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
+	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
+	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
+	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
+	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
+	7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
+	4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2,
+	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
+	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
+	6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2,
+	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1,
+	5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1,
+	4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0,
+}
+
+// summarize returns a packed summary of the bitmap in pallocBits.
+func (b *pallocBits) summarize() pallocSum {
+	// TODO(mknyszek): There may be something more clever to be done
+	// here to make the summarize operation more efficient. For example,
+	// we can compute start and end with 64-bit wide operations easily,
+	// but max is a bit more complex. Perhaps there exists some way to
+	// leverage the 64-bit start and end to our advantage?
+	var start, max, end uint
+	for i := 0; i < len(b); i++ {
+		a := b[i]
+		for j := 0; j < 64; j += 8 {
+			k := uint8(a >> j)
+
+			// Compute start.
+			si := uint(sys.TrailingZeros8(k))
+			if start == uint(i*64+j) {
+				start += si
+			}
+
+			// Compute max.
+			if end+si > max {
+				max = end + si
+			}
+			if mi := consec8tab[k]; mi > max {
+				max = mi
+			}
+
+			// Compute end.
+			if k == 0 {
+				end += 8
+			} else {
+				end = uint(sys.LeadingZeros8(k))
+			}
+		}
+	}
+	return packPallocSum(start, max, end)
+}
+
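+// As a rough illustration of what the packed summary carries (the values
+// here are hypothetical; packPallocSum is the helper used in the return
+// above):
+//
+//	b := new(pallocBits)
+//	b.allocAll()
+//	b.free(0, 3)                  // 3 free pages at the start
+//	b.free(100, 10)               // a 10-page free run in the middle
+//	b.free(pallocChunkPages-5, 5) // 5 free pages at the end
+//	// b.summarize() == packPallocSum(3, 10, 5)
+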
+// find searches for npages contiguous free pages in pallocBits and returns
+// the index where that run starts, as well as the index of the first free page
+// it found in the search. searchIdx represents the first known free page and
+// where to begin the search from.
+//
+// If find fails to find any free space, it returns an index of ^uint(0) and
+// the new searchIdx should be ignored.
+//
+// Note that if npages == 1, the two returned values will always be identical.
+func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) {
+	if npages == 1 {
+		addr := b.find1(searchIdx)
+		return addr, addr
+	} else if npages <= 64 {
+		return b.findSmallN(npages, searchIdx)
+	}
+	return b.findLargeN(npages, searchIdx)
+}
+
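+// A small usage sketch (illustrative values only):
+//
+//	b := new(pallocBits)
+//	b.allocRange(0, 10)          // pages 0 through 9 are allocated
+//	j, searchIdx := b.find(4, 0) // j == 10 and searchIdx == 10
+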
+// find1 is a helper for find which searches for a single free page
+// in the pallocBits and returns the index.
+//
+// See find for an explanation of the searchIdx parameter.
+func (b *pallocBits) find1(searchIdx uint) uint {
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		x := b[i]
+		if x == ^uint64(0) {
+			continue
+		}
+		return i*64 + uint(sys.TrailingZeros64(^x))
+	}
+	return ^uint(0)
+}
+
+// findSmallN is a helper for find which searches for npages contiguous free pages
+// in this pallocBits and returns the index where that run of contiguous pages
+// starts as well as the index of the first free page it finds in its search.
+//
+// See find for an explanation of the searchIdx parameter.
+//
+// Returns a ^uint(0) index on failure and the new searchIdx should be ignored.
+//
+// findSmallN assumes npages <= 64, where any such contiguous run of pages
+// crosses at most one aligned 64-bit boundary in the bits.
+func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) {
+	end, newSearchIdx := uint(0), ^uint(0)
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		bi := b[i]
+		if bi == ^uint64(0) {
+			end = 0
+			continue
+		}
+		// First see if we can pack our allocation in the trailing
+		// zeros plus the end of the last 64 bits.
+		start := uint(sys.TrailingZeros64(bi))
+		if newSearchIdx == ^uint(0) {
+			// The new searchIdx is going to be at these 64 bits after any
+			// 1s we skip over, so count trailing 1s.
+			newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi))
+		}
+		if end+start >= uint(npages) {
+			return i*64 - end, newSearchIdx
+		}
+		// Next, check the interior of the 64-bit chunk.
+		j := findBitRange64(^bi, uint(npages))
+		if j < 64 {
+			return i*64 + j, newSearchIdx
+		}
+		end = uint(sys.LeadingZeros64(bi))
+	}
+	return ^uint(0), newSearchIdx
+}
+
+// findLargeN is a helper for find which searches for npages contiguous free pages
+// in this pallocBits and returns the index where that run starts, as well as the
+// index of the first free page it found in its search.
+//
+// See find for an explanation of the searchIdx parameter.
+//
+// Returns a ^uint(0) index on failure and the new searchIdx should be ignored.
+//
+// findLargeN assumes npages > 64, where any such run of free pages
+// crosses at least one aligned 64-bit boundary in the bits.
+func (b *pallocBits) findLargeN(npages uintptr, searchIdx uint) (uint, uint) {
+	start, size, newSearchIdx := ^uint(0), uint(0), ^uint(0)
+	for i := searchIdx / 64; i < uint(len(b)); i++ {
+		x := b[i]
+		if x == ^uint64(0) {
+			size = 0
+			continue
+		}
+		if newSearchIdx == ^uint(0) {
+			// The new searchIdx is going to be at these 64 bits after any
+			// 1s we skip over, so count trailing 1s.
+			newSearchIdx = i*64 + uint(sys.TrailingZeros64(^x))
+		}
+		if size == 0 {
+			size = uint(sys.LeadingZeros64(x))
+			start = i*64 + 64 - size
+			continue
+		}
+		s := uint(sys.TrailingZeros64(x))
+		if s+size >= uint(npages) {
+			size += s
+			return start, newSearchIdx
+		}
+		if s < 64 {
+			size = uint(sys.LeadingZeros64(x))
+			start = i*64 + 64 - size
+			continue
+		}
+		size += 64
+	}
+	if size < uint(npages) {
+		return ^uint(0), newSearchIdx
+	}
+	return start, newSearchIdx
+}
+
+// allocRange allocates the range [i, i+n).
+func (b *pallocBits) allocRange(i, n uint) {
+	(*pageBits)(b).setRange(i, n)
+}
+
+// allocAll allocates all the bits of b.
+func (b *pallocBits) allocAll() {
+	(*pageBits)(b).setAll()
+}
+
+// free1 frees a single page in the pallocBits at i.
+func (b *pallocBits) free1(i uint) {
+	(*pageBits)(b).clear(i)
+}
+
+// free frees the range [i, i+n) of pages in the pallocBits.
+func (b *pallocBits) free(i, n uint) {
+	(*pageBits)(b).clearRange(i, n)
+}
+
+// freeAll frees all the bits of b.
+func (b *pallocBits) freeAll() {
+	(*pageBits)(b).clearAll()
+}
+
+// pages64 returns a 64-bit bitmap representing a block of 64 pages aligned
+// to 64 pages. The returned block of pages is the one containing the i'th
+// page in this pallocBits. Each bit represents whether the page is in-use.
+func (b *pallocBits) pages64(i uint) uint64 {
+	return (*pageBits)(b).block64(i)
+}
+
+// findBitRange64 returns the bit index of the first set of
+// n consecutive 1 bits. If no consecutive set of 1 bits of
+// size n may be found in c, then it returns an integer >= 64.
+func findBitRange64(c uint64, n uint) uint {
+	i := uint(0)
+	cont := uint(sys.TrailingZeros64(^c))
+	for cont < n && i < 64 {
+		i += cont
+		i += uint(sys.TrailingZeros64(c >> i))
+		cont = uint(sys.TrailingZeros64(^(c >> i)))
+	}
+	return i
+}
+
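+// For example (illustrative values):
+//
+//	findBitRange64(0b11100110, 2) == 1  // the run at bits 1-2
+//	findBitRange64(0b11100110, 3) == 5  // the run at bits 5-7
+//	findBitRange64(0b11100110, 4) >= 64 // no run of 4 consecutive 1 bits
+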
+// pallocData encapsulates pallocBits and a bitmap for
+// whether or not a given page is scavenged in a single
+// structure. It's effectively a pallocBits with
+// additional functionality.
+//
+// Update the comment on (*pageAlloc).chunks should this
+// structure change.
+type pallocData struct {
+	pallocBits
+	scavenged pageBits
+}
+
+// allocRange sets bits [i, i+n) in the bitmap to 1 and
+// updates the scavenged bits appropriately.
+func (m *pallocData) allocRange(i, n uint) {
+	// Clear the scavenged bits when we alloc the range.
+	m.pallocBits.allocRange(i, n)
+	m.scavenged.clearRange(i, n)
+}
+
+// allocAll sets every bit in the bitmap to 1 and updates
+// the scavenged bits appropriately.
+func (m *pallocData) allocAll() {
+	// Clear the scavenged bits when we alloc the range.
+	m.pallocBits.allocAll()
+	m.scavenged.clearAll()
+}
diff --git a/src/runtime/mpallocbits_test.go b/src/runtime/mpallocbits_test.go
new file mode 100644
index 0000000..71a29f3
--- /dev/null
+++ b/src/runtime/mpallocbits_test.go
@@ -0,0 +1,510 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"fmt"
+	"math/rand"
+	. "runtime"
+	"testing"
+)
+
+// Ensures that got and want are the same, and if not, reports
+// detailed diff information.
+func checkPallocBits(t *testing.T, got, want *PallocBits) bool {
+	d := DiffPallocBits(got, want)
+	if len(d) != 0 {
+		t.Errorf("%d range(s) different", len(d))
+		for _, bits := range d {
+			t.Logf("\t@ bit index %d", bits.I)
+			t.Logf("\t|  got: %s", StringifyPallocBits(got, bits))
+			t.Logf("\t| want: %s", StringifyPallocBits(want, bits))
+		}
+		return false
+	}
+	return true
+}
+
+// makePallocBits produces an initialized PallocBits by setting
+// the ranges in s to 1 and the rest to zero.
+func makePallocBits(s []BitRange) *PallocBits {
+	b := new(PallocBits)
+	for _, v := range s {
+		b.AllocRange(v.I, v.N)
+	}
+	return b
+}
+
+// Ensures that PallocBits.AllocRange works, which is a fundamental
+// method used for testing and initialization since it's used by
+// makePallocBits.
+func TestPallocBitsAllocRange(t *testing.T) {
+	test := func(t *testing.T, i, n uint, want *PallocBits) {
+		checkPallocBits(t, makePallocBits([]BitRange{{i, n}}), want)
+	}
+	t.Run("OneLow", func(t *testing.T) {
+		want := new(PallocBits)
+		want[0] = 0x1
+		test(t, 0, 1, want)
+	})
+	t.Run("OneHigh", func(t *testing.T) {
+		want := new(PallocBits)
+		want[PallocChunkPages/64-1] = 1 << 63
+		test(t, PallocChunkPages-1, 1, want)
+	})
+	t.Run("Inner", func(t *testing.T) {
+		want := new(PallocBits)
+		want[2] = 0x3e
+		test(t, 129, 5, want)
+	})
+	t.Run("Aligned", func(t *testing.T) {
+		want := new(PallocBits)
+		want[2] = ^uint64(0)
+		want[3] = ^uint64(0)
+		test(t, 128, 128, want)
+	})
+	t.Run("Begin", func(t *testing.T) {
+		want := new(PallocBits)
+		want[0] = ^uint64(0)
+		want[1] = ^uint64(0)
+		want[2] = ^uint64(0)
+		want[3] = ^uint64(0)
+		want[4] = ^uint64(0)
+		want[5] = 0x1
+		test(t, 0, 321, want)
+	})
+	t.Run("End", func(t *testing.T) {
+		want := new(PallocBits)
+		want[PallocChunkPages/64-1] = ^uint64(0)
+		want[PallocChunkPages/64-2] = ^uint64(0)
+		want[PallocChunkPages/64-3] = ^uint64(0)
+		want[PallocChunkPages/64-4] = 1 << 63
+		test(t, PallocChunkPages-(64*3+1), 64*3+1, want)
+	})
+	t.Run("All", func(t *testing.T) {
+		want := new(PallocBits)
+		for i := range want {
+			want[i] = ^uint64(0)
+		}
+		test(t, 0, PallocChunkPages, want)
+	})
+}
+
+// Inverts every bit in the PallocBits.
+func invertPallocBits(b *PallocBits) {
+	for i := range b {
+		b[i] = ^b[i]
+	}
+}
+
+// Ensures two packed summaries are identical, and reports a detailed description
+// of the difference if they're not.
+func checkPallocSum(t *testing.T, got, want PallocSum) {
+	if got.Start() != want.Start() {
+		t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start())
+	}
+	if got.Max() != want.Max() {
+		t.Errorf("inconsistent max: got %d, want %d", got.Max(), want.Max())
+	}
+	if got.End() != want.End() {
+		t.Errorf("inconsistent end: got %d, want %d", got.End(), want.End())
+	}
+}
+
+func TestMallocBitsPopcntRange(t *testing.T) {
+	type test struct {
+		i, n uint // bit range to popcnt over.
+		want uint // expected popcnt result on that range.
+	}
+	tests := map[string]struct {
+		init  []BitRange // bit ranges to set to 1 in the bitmap.
+		tests []test     // a set of popcnt tests to run over the bitmap.
+	}{
+		"None": {
+			tests: []test{
+				{0, 1, 0},
+				{5, 3, 0},
+				{2, 11, 0},
+				{PallocChunkPages/4 + 1, PallocChunkPages / 2, 0},
+				{0, PallocChunkPages, 0},
+			},
+		},
+		"All": {
+			init: []BitRange{{0, PallocChunkPages}},
+			tests: []test{
+				{0, 1, 1},
+				{5, 3, 3},
+				{2, 11, 11},
+				{PallocChunkPages/4 + 1, PallocChunkPages / 2, PallocChunkPages / 2},
+				{0, PallocChunkPages, PallocChunkPages},
+			},
+		},
+		"Half": {
+			init: []BitRange{{PallocChunkPages / 2, PallocChunkPages / 2}},
+			tests: []test{
+				{0, 1, 0},
+				{5, 3, 0},
+				{2, 11, 0},
+				{PallocChunkPages/2 - 1, 1, 0},
+				{PallocChunkPages / 2, 1, 1},
+				{PallocChunkPages/2 + 10, 1, 1},
+				{PallocChunkPages/2 - 1, 2, 1},
+				{PallocChunkPages / 4, PallocChunkPages / 4, 0},
+				{PallocChunkPages / 4, PallocChunkPages/4 + 1, 1},
+				{PallocChunkPages/4 + 1, PallocChunkPages / 2, PallocChunkPages/4 + 1},
+				{0, PallocChunkPages, PallocChunkPages / 2},
+			},
+		},
+		"OddBound": {
+			init: []BitRange{{0, 111}},
+			tests: []test{
+				{0, 1, 1},
+				{5, 3, 3},
+				{2, 11, 11},
+				{110, 2, 1},
+				{99, 50, 12},
+				{110, 1, 1},
+				{111, 1, 0},
+				{99, 1, 1},
+				{120, 1, 0},
+				{PallocChunkPages / 2, PallocChunkPages / 2, 0},
+				{0, PallocChunkPages, 111},
+			},
+		},
+		"Scattered": {
+			init: []BitRange{
+				{1, 3}, {5, 1}, {7, 1}, {10, 2}, {13, 1}, {15, 4},
+				{21, 1}, {23, 1}, {26, 2}, {30, 5}, {36, 2}, {40, 3},
+				{44, 6}, {51, 1}, {53, 2}, {58, 3}, {63, 1}, {67, 2},
+				{71, 10}, {84, 1}, {89, 7}, {99, 2}, {103, 1}, {107, 2},
+				{111, 1}, {113, 1}, {115, 1}, {118, 1}, {120, 2}, {125, 5},
+			},
+			tests: []test{
+				{0, 11, 6},
+				{0, 64, 39},
+				{13, 64, 40},
+				{64, 64, 34},
+				{0, 128, 73},
+				{1, 128, 74},
+				{0, PallocChunkPages, 75},
+			},
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := makePallocBits(v.init)
+			for _, h := range v.tests {
+				if got := b.PopcntRange(h.i, h.n); got != h.want {
+					t.Errorf("bad popcnt (i=%d, n=%d): got %d, want %d", h.i, h.n, got, h.want)
+				}
+			}
+		})
+	}
+}
+
+// Ensures computing bit summaries works as expected by generating random
+// bitmaps and checking against a reference implementation.
+func TestPallocBitsSummarizeRandom(t *testing.T) {
+	b := new(PallocBits)
+	for i := 0; i < 1000; i++ {
+		// Randomize bitmap.
+		for i := range b {
+			b[i] = rand.Uint64()
+		}
+		// Check summary against reference implementation.
+		checkPallocSum(t, b.Summarize(), SummarizeSlow(b))
+	}
+}
+
+// Ensures computing bit summaries works as expected.
+func TestPallocBitsSummarize(t *testing.T) {
+	var emptySum = PackPallocSum(PallocChunkPages, PallocChunkPages, PallocChunkPages)
+	type test struct {
+		free []BitRange // Ranges of free (zero) bits.
+		hits []PallocSum
+	}
+	tests := make(map[string]test)
+	tests["NoneFree"] = test{
+		free: []BitRange{},
+		hits: []PallocSum{
+			PackPallocSum(0, 0, 0),
+		},
+	}
+	tests["OnlyStart"] = test{
+		free: []BitRange{{0, 10}},
+		hits: []PallocSum{
+			PackPallocSum(10, 10, 0),
+		},
+	}
+	tests["OnlyEnd"] = test{
+		free: []BitRange{{PallocChunkPages - 40, 40}},
+		hits: []PallocSum{
+			PackPallocSum(0, 40, 40),
+		},
+	}
+	tests["StartAndEnd"] = test{
+		free: []BitRange{{0, 11}, {PallocChunkPages - 23, 23}},
+		hits: []PallocSum{
+			PackPallocSum(11, 23, 23),
+		},
+	}
+	tests["StartMaxEnd"] = test{
+		free: []BitRange{{0, 4}, {50, 100}, {PallocChunkPages - 4, 4}},
+		hits: []PallocSum{
+			PackPallocSum(4, 100, 4),
+		},
+	}
+	tests["OnlyMax"] = test{
+		free: []BitRange{{1, 20}, {35, 241}, {PallocChunkPages - 50, 30}},
+		hits: []PallocSum{
+			PackPallocSum(0, 241, 0),
+		},
+	}
+	tests["MultiMax"] = test{
+		free: []BitRange{{35, 2}, {40, 5}, {100, 5}},
+		hits: []PallocSum{
+			PackPallocSum(0, 5, 0),
+		},
+	}
+	tests["One"] = test{
+		free: []BitRange{{2, 1}},
+		hits: []PallocSum{
+			PackPallocSum(0, 1, 0),
+		},
+	}
+	tests["AllFree"] = test{
+		free: []BitRange{{0, PallocChunkPages}},
+		hits: []PallocSum{
+			emptySum,
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := makePallocBits(v.free)
+			// In the PallocBits we create, 1's represent free spots, but in our actual
+			// PallocBits 1 means not free, so invert.
+			invertPallocBits(b)
+			for _, h := range v.hits {
+				checkPallocSum(t, b.Summarize(), h)
+			}
+		})
+	}
+}
+
+// Benchmarks how quickly we can summarize a PallocBits.
+func BenchmarkPallocBitsSummarize(b *testing.B) {
+	buf0 := new(PallocBits)
+	buf1 := new(PallocBits)
+	for i := 0; i < len(buf1); i++ {
+		buf1[i] = ^uint64(0)
+	}
+	bufa := new(PallocBits)
+	for i := 0; i < len(bufa); i++ {
+		bufa[i] = 0xaa
+	}
+	for _, buf := range []*PallocBits{buf0, buf1, bufa} {
+		b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				buf.Summarize()
+			}
+		})
+	}
+}
+
+// Ensures page allocation works.
+func TestPallocBitsAlloc(t *testing.T) {
+	tests := map[string]struct {
+		before []BitRange
+		after  []BitRange
+		npages uintptr
+		hits   []uint
+	}{
+		"AllFree1": {
+			npages: 1,
+			hits:   []uint{0, 1, 2, 3, 4, 5},
+			after:  []BitRange{{0, 6}},
+		},
+		"AllFree2": {
+			npages: 2,
+			hits:   []uint{0, 2, 4, 6, 8, 10},
+			after:  []BitRange{{0, 12}},
+		},
+		"AllFree5": {
+			npages: 5,
+			hits:   []uint{0, 5, 10, 15, 20},
+			after:  []BitRange{{0, 25}},
+		},
+		"AllFree64": {
+			npages: 64,
+			hits:   []uint{0, 64, 128},
+			after:  []BitRange{{0, 192}},
+		},
+		"AllFree65": {
+			npages: 65,
+			hits:   []uint{0, 65, 130},
+			after:  []BitRange{{0, 195}},
+		},
+		"SomeFree64": {
+			before: []BitRange{{0, 32}, {64, 32}, {100, PallocChunkPages - 100}},
+			npages: 64,
+			hits:   []uint{^uint(0)},
+			after:  []BitRange{{0, 32}, {64, 32}, {100, PallocChunkPages - 100}},
+		},
+		"NoneFree1": {
+			before: []BitRange{{0, PallocChunkPages}},
+			npages: 1,
+			hits:   []uint{^uint(0), ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"NoneFree2": {
+			before: []BitRange{{0, PallocChunkPages}},
+			npages: 2,
+			hits:   []uint{^uint(0), ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"NoneFree5": {
+			before: []BitRange{{0, PallocChunkPages}},
+			npages: 5,
+			hits:   []uint{^uint(0), ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"NoneFree65": {
+			before: []BitRange{{0, PallocChunkPages}},
+			npages: 65,
+			hits:   []uint{^uint(0), ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"ExactFit1": {
+			before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 - 2, PallocChunkPages/2 + 2}},
+			npages: 1,
+			hits:   []uint{PallocChunkPages/2 - 3, ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"ExactFit2": {
+			before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 - 1, PallocChunkPages/2 + 1}},
+			npages: 2,
+			hits:   []uint{PallocChunkPages/2 - 3, ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"ExactFit5": {
+			before: []BitRange{{0, PallocChunkPages/2 - 3}, {PallocChunkPages/2 + 2, PallocChunkPages/2 - 2}},
+			npages: 5,
+			hits:   []uint{PallocChunkPages/2 - 3, ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"ExactFit65": {
+			before: []BitRange{{0, PallocChunkPages/2 - 31}, {PallocChunkPages/2 + 34, PallocChunkPages/2 - 34}},
+			npages: 65,
+			hits:   []uint{PallocChunkPages/2 - 31, ^uint(0)},
+			after:  []BitRange{{0, PallocChunkPages}},
+		},
+		"SomeFree161": {
+			before: []BitRange{{0, 185}, {331, 1}},
+			npages: 161,
+			hits:   []uint{332},
+			after:  []BitRange{{0, 185}, {331, 162}},
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := makePallocBits(v.before)
+			for iter, i := range v.hits {
+				a, _ := b.Find(v.npages, 0)
+				if i != a {
+					t.Fatalf("find #%d picked wrong index: want %d, got %d", iter+1, i, a)
+				}
+				if i != ^uint(0) {
+					b.AllocRange(a, uint(v.npages))
+				}
+			}
+			want := makePallocBits(v.after)
+			checkPallocBits(t, b, want)
+		})
+	}
+}
+
+// Ensures page freeing works.
+func TestPallocBitsFree(t *testing.T) {
+	tests := map[string]struct {
+		beforeInv []BitRange
+		afterInv  []BitRange
+		frees     []uint
+		npages    uintptr
+	}{
+		"SomeFree": {
+			npages:    1,
+			beforeInv: []BitRange{{0, 32}, {64, 32}, {100, 1}},
+			frees:     []uint{32},
+			afterInv:  []BitRange{{0, 33}, {64, 32}, {100, 1}},
+		},
+		"NoneFree1": {
+			npages:   1,
+			frees:    []uint{0, 1, 2, 3, 4, 5},
+			afterInv: []BitRange{{0, 6}},
+		},
+		"NoneFree2": {
+			npages:   2,
+			frees:    []uint{0, 2, 4, 6, 8, 10},
+			afterInv: []BitRange{{0, 12}},
+		},
+		"NoneFree5": {
+			npages:   5,
+			frees:    []uint{0, 5, 10, 15, 20},
+			afterInv: []BitRange{{0, 25}},
+		},
+		"NoneFree64": {
+			npages:   64,
+			frees:    []uint{0, 64, 128},
+			afterInv: []BitRange{{0, 192}},
+		},
+		"NoneFree65": {
+			npages:   65,
+			frees:    []uint{0, 65, 130},
+			afterInv: []BitRange{{0, 195}},
+		},
+	}
+	for name, v := range tests {
+		v := v
+		t.Run(name, func(t *testing.T) {
+			b := makePallocBits(v.beforeInv)
+			invertPallocBits(b)
+			for _, i := range v.frees {
+				b.Free(i, uint(v.npages))
+			}
+			want := makePallocBits(v.afterInv)
+			invertPallocBits(want)
+			checkPallocBits(t, b, want)
+		})
+	}
+}
+
+func TestFindBitRange64(t *testing.T) {
+	check := func(x uint64, n uint, result uint) {
+		i := FindBitRange64(x, n)
+		if result == ^uint(0) && i < 64 {
+			t.Errorf("case (%016x, %d): got %d, want failure", x, n, i)
+		} else if result != ^uint(0) && i != result {
+			t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result)
+		}
+	}
+	for i := uint(0); i <= 64; i++ {
+		check(^uint64(0), i, 0)
+	}
+	check(0, 0, 0)
+	for i := uint(1); i <= 64; i++ {
+		check(0, i, ^uint(0))
+	}
+	check(0x8000000000000000, 1, 63)
+	check(0xc000010001010000, 2, 62)
+	check(0xc000010001030000, 2, 16)
+	check(0xe000030001030000, 3, 61)
+	check(0xe000030001070000, 3, 16)
+	check(0xffff03ff01070000, 16, 48)
+	check(0xffff03ff0107ffff, 16, 0)
+	check(0x0fff03ff01079fff, 16, ^uint(0))
+}
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 2bd41b6..128498d 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -711,13 +711,16 @@
 	return
 }
 
-// GoroutineProfile returns n, the number of records in the active goroutine stack profile.
-// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
-// If len(p) < n, GoroutineProfile does not change p and returns n, false.
-//
-// Most clients should use the runtime/pprof package instead
-// of calling GoroutineProfile directly.
-func GoroutineProfile(p []StackRecord) (n int, ok bool) {
+//go:linkname runtime_goroutineProfileWithLabels runtime/pprof.runtime_goroutineProfileWithLabels
+func runtime_goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
+	return goroutineProfileWithLabels(p, labels)
+}
+
+// labels may be nil. If labels is non-nil, it must have the same length as p.
+func goroutineProfileWithLabels(p []StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
+	if labels != nil && len(labels) != len(p) {
+		labels = nil
+	}
 	gp := getg()
 
 	isOK := func(gp1 *g) bool {
@@ -737,7 +740,7 @@
 
 	if n <= len(p) {
 		ok = true
-		r := p
+		r, lbl := p, labels
 
 		// Save current goroutine.
 		sp := getcallersp()
@@ -747,6 +750,12 @@
 		})
 		r = r[1:]
 
+		// If we have a place to put our goroutine labelmap, insert it there.
+		if labels != nil {
+			lbl[0] = gp.labels
+			lbl = lbl[1:]
+		}
+
 		// Save other goroutines.
 		for _, gp1 := range allgs {
 			if isOK(gp1) {
@@ -756,16 +765,30 @@
 					break
 				}
 				saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
+				if labels != nil {
+					lbl[0] = gp1.labels
+					lbl = lbl[1:]
+				}
 				r = r[1:]
 			}
 		}
 	}
 
 	startTheWorld()
-
 	return n, ok
 }
 
+// GoroutineProfile returns n, the number of records in the active goroutine stack profile.
+// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.
+// If len(p) < n, GoroutineProfile does not change p and returns n, false.
+//
+// Most clients should use the runtime/pprof package instead
+// of calling GoroutineProfile directly.
+func GoroutineProfile(p []StackRecord) (n int, ok bool) {
+
+	return goroutineProfileWithLabels(p, nil)
+}
+
 func saveg(pc, sp uintptr, gp *g, r *StackRecord) {
 	n := gentraceback(pc, sp, 0, gp, 0, &r.Stack0[0], len(r.Stack0), nil, nil, 0)
 	if n < len(r.Stack0) {
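
GoroutineProfile keeps its documented two-step calling convention while delegating to goroutineProfileWithLabels. A typical caller (illustrative sketch, not part of this patch) sizes the slice first and retries if goroutines were created in between:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	// First call with a nil slice to learn how many records exist,
	// then allocate with a little headroom and retry until it fits.
	n, _ := runtime.GoroutineProfile(nil)
	var records []runtime.StackRecord
	for {
		records = make([]runtime.StackRecord, n+10)
		var ok bool
		n, ok = runtime.GoroutineProfile(records)
		if ok {
			records = records[:n]
			break
		}
	}
	fmt.Println("goroutines captured:", len(records))
}
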
diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go
new file mode 100644
index 0000000..e23d077
--- /dev/null
+++ b/src/runtime/mranges.go
@@ -0,0 +1,321 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Address range data structure.
+//
+// This file contains an implementation of a data structure which
+// manages ordered address ranges.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// addrRange represents a region of address space.
+//
+// An addrRange must never span a gap in the address space.
+type addrRange struct {
+	// base and limit together represent the region of address space
+	// [base, limit). That is, base is inclusive, limit is exclusive.
+	// These are addresses over an offset view of the address space on
+	// platforms with a segmented address space, that is, on platforms
+	// where arenaBaseOffset != 0.
+	base, limit offAddr
+}
+
+// makeAddrRange creates a new address range from two virtual addresses.
+//
+// Throws if the base and limit are not in the same memory segment.
+func makeAddrRange(base, limit uintptr) addrRange {
+	r := addrRange{offAddr{base}, offAddr{limit}}
+	if (base-arenaBaseOffset >= base) != (limit-arenaBaseOffset >= limit) {
+		throw("addr range base and limit are not in the same memory segment")
+	}
+	return r
+}
+
+// size returns the size of the range represented in bytes.
+func (a addrRange) size() uintptr {
+	if !a.base.lessThan(a.limit) {
+		return 0
+	}
+	// Subtraction is safe because limit and base must be in the same
+	// segment of the address space.
+	return a.limit.diff(a.base)
+}
+
+// contains returns whether or not the range contains a given address.
+func (a addrRange) contains(addr uintptr) bool {
+	return a.base.lessEqual(offAddr{addr}) && (offAddr{addr}).lessThan(a.limit)
+}
+
+// subtract removes any overlap between a and b from a and returns
+// the resulting range. subtract assumes that a and b either don't
+// overlap at all, only overlap on one side, or are equal.
+// If b is strictly contained in a, thus forcing a split, it will throw.
+func (a addrRange) subtract(b addrRange) addrRange {
+	if b.base.lessEqual(a.base) && a.limit.lessEqual(b.limit) {
+		return addrRange{}
+	} else if a.base.lessThan(b.base) && b.limit.lessThan(a.limit) {
+		throw("bad prune")
+	} else if b.limit.lessThan(a.limit) && a.base.lessThan(b.limit) {
+		a.base = b.limit
+	} else if a.base.lessThan(b.base) && b.base.lessThan(a.limit) {
+		a.limit = b.base
+	}
+	return a
+}
+
+// removeGreaterEqual removes all addresses in a greater than or equal
+// to addr and returns the new range.
+func (a addrRange) removeGreaterEqual(addr uintptr) addrRange {
+	if (offAddr{addr}).lessEqual(a.base) {
+		return addrRange{}
+	}
+	if a.limit.lessEqual(offAddr{addr}) {
+		return a
+	}
+	return makeAddrRange(a.base.addr(), addr)
+}
+
+var (
+	// minOffAddr is the minimum address in the offset space, and
+	// it corresponds to the virtual address arenaBaseOffset.
+	minOffAddr = offAddr{arenaBaseOffset}
+
+	// maxOffAddr is the maximum address in the offset address
+	// space. It corresponds to the highest virtual address representable
+	// by the page alloc chunk and heap arena maps.
+	maxOffAddr = offAddr{(((1 << heapAddrBits) - 1) + arenaBaseOffset) & uintptrMask}
+)
+
+// offAddr represents an address in a contiguous view
+// of the address space on systems where the address space is
+// segmented. On other systems, it's just a normal address.
+type offAddr struct {
+	// a is just the virtual address, but should never be used
+	// directly. Call addr() to get this value instead.
+	a uintptr
+}
+
+// add adds a uintptr offset to the offAddr.
+func (l offAddr) add(bytes uintptr) offAddr {
+	return offAddr{a: l.a + bytes}
+}
+
+// sub subtracts a uintptr offset from the offAddr.
+func (l offAddr) sub(bytes uintptr) offAddr {
+	return offAddr{a: l.a - bytes}
+}
+
+// diff returns the number of bytes between the
+// two offAddrs.
+func (l1 offAddr) diff(l2 offAddr) uintptr {
+	return l1.a - l2.a
+}
+
+// lessThan returns true if l1 is less than l2 in the offset
+// address space.
+func (l1 offAddr) lessThan(l2 offAddr) bool {
+	return (l1.a - arenaBaseOffset) < (l2.a - arenaBaseOffset)
+}
+
+// lessEqual returns true if l1 is less than or equal to l2 in
+// the offset address space.
+func (l1 offAddr) lessEqual(l2 offAddr) bool {
+	return (l1.a - arenaBaseOffset) <= (l2.a - arenaBaseOffset)
+}
+
+// equal returns true if the two offAddr values are equal.
+func (l1 offAddr) equal(l2 offAddr) bool {
+	// No need to compare in the offset space, it
+	// means the same thing.
+	return l1 == l2
+}
+
+// addr returns the virtual address for this offset address.
+func (l offAddr) addr() uintptr {
+	return l.a
+}
+
+// addrRanges is a data structure holding a collection of ranges of
+// address space.
+//
+// The ranges are coalesced eagerly to reduce the
+// number of ranges it holds.
+//
+// The slice backing store for this field is persistentalloc'd
+// and thus there is no way to free it.
+//
+// addrRanges is not thread-safe.
+type addrRanges struct {
+	// ranges is a slice of ranges sorted by base.
+	ranges []addrRange
+
+	// totalBytes is the total amount of address space in bytes counted by
+	// this addrRanges.
+	totalBytes uintptr
+
+	// sysStat is the stat to track allocations by this type
+	sysStat *uint64
+}
+
+func (a *addrRanges) init(sysStat *uint64) {
+	ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges))
+	ranges.len = 0
+	ranges.cap = 16
+	ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, sysStat))
+	a.sysStat = sysStat
+	a.totalBytes = 0
+}
+
+// findSucc returns the first index in a such that addr is
+// less than the base of the addrRange at that index.
+func (a *addrRanges) findSucc(addr uintptr) int {
+	// TODO(mknyszek): Consider a binary search for large arrays.
+	// While iterating over these ranges is potentially expensive,
+	// the expected number of ranges is small, ideally just 1,
+	// since Go heaps are usually mostly contiguous.
+	base := offAddr{addr}
+	for i := range a.ranges {
+		if base.lessThan(a.ranges[i].base) {
+			return i
+		}
+	}
+	return len(a.ranges)
+}
+
+// contains returns true if a covers the address addr.
+func (a *addrRanges) contains(addr uintptr) bool {
+	i := a.findSucc(addr)
+	if i == 0 {
+		return false
+	}
+	return a.ranges[i-1].contains(addr)
+}
+
+// add inserts a new address range to a.
+//
+// r must not overlap with any address range in a.
+func (a *addrRanges) add(r addrRange) {
+	// The copies in this function are potentially expensive, but this data
+	// structure is meant to represent the Go heap. At worst, copying this
+	// would take ~160µs assuming a conservative copying rate of 25 GiB/s (the
+	// copy will almost never trigger a page fault) for a 1 TiB heap with 4 MiB
+	// arenas which is completely discontiguous. ~160µs is still a lot, but in
+	// practice most platforms have 64 MiB arenas (which cuts this by a factor
+	// of 16) and Go heaps are usually mostly contiguous, so the chance that
+	// an addrRanges even grows to that size is extremely low.
+
+	// Because we assume r is not currently represented in a,
+	// findSucc gives us our insertion index.
+	i := a.findSucc(r.base.addr())
+	coalescesDown := i > 0 && a.ranges[i-1].limit.equal(r.base)
+	coalescesUp := i < len(a.ranges) && r.limit.equal(a.ranges[i].base)
+	if coalescesUp && coalescesDown {
+		// We have neighbors and they both border us.
+		// Merge a.ranges[i-1], r, and a.ranges[i] together into a.ranges[i-1].
+		a.ranges[i-1].limit = a.ranges[i].limit
+
+		// Delete a.ranges[i].
+		copy(a.ranges[i:], a.ranges[i+1:])
+		a.ranges = a.ranges[:len(a.ranges)-1]
+	} else if coalescesDown {
+		// We have a neighbor at a lower address only and it borders us.
+		// Merge the new space into a.ranges[i-1].
+		a.ranges[i-1].limit = r.limit
+	} else if coalescesUp {
+		// We have a neighbor at a higher address only and it borders us.
+		// Merge the new space into a.ranges[i].
+		a.ranges[i].base = r.base
+	} else {
+		// We may or may not have neighbors which don't border us.
+		// Add the new range.
+		if len(a.ranges)+1 > cap(a.ranges) {
+			// Grow the array. Note that this leaks the old array, but since
+			// we're doubling we have at most 2x waste. For a 1 TiB heap and
+			// 4 MiB arenas which are all discontiguous (both very conservative
+			// assumptions), this would waste at most 4 MiB of memory.
+			oldRanges := a.ranges
+			ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges))
+			ranges.len = len(oldRanges) + 1
+			ranges.cap = cap(oldRanges) * 2
+			ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, a.sysStat))
+
+			// Copy in the old array, but make space for the new range.
+			copy(a.ranges[:i], oldRanges[:i])
+			copy(a.ranges[i+1:], oldRanges[i:])
+		} else {
+			a.ranges = a.ranges[:len(a.ranges)+1]
+			copy(a.ranges[i+1:], a.ranges[i:])
+		}
+		a.ranges[i] = r
+	}
+	a.totalBytes += r.size()
+}
+
+// removeLast removes and returns the highest-addressed contiguous range
+// of a, or the last nBytes of that range, whichever is smaller. If a is
+// empty, it returns an empty range.
+func (a *addrRanges) removeLast(nBytes uintptr) addrRange {
+	if len(a.ranges) == 0 {
+		return addrRange{}
+	}
+	r := a.ranges[len(a.ranges)-1]
+	size := r.size()
+	if size > nBytes {
+		newEnd := r.limit.sub(nBytes)
+		a.ranges[len(a.ranges)-1].limit = newEnd
+		a.totalBytes -= nBytes
+		return addrRange{newEnd, r.limit}
+	}
+	a.ranges = a.ranges[:len(a.ranges)-1]
+	a.totalBytes -= size
+	return r
+}
+
+// removeGreaterEqual removes the ranges of a which are above addr, and additionally
+// splits any range containing addr.
+func (a *addrRanges) removeGreaterEqual(addr uintptr) {
+	pivot := a.findSucc(addr)
+	if pivot == 0 {
+		// addr is before all ranges in a.
+		a.totalBytes = 0
+		a.ranges = a.ranges[:0]
+		return
+	}
+	removed := uintptr(0)
+	for _, r := range a.ranges[pivot:] {
+		removed += r.size()
+	}
+	if r := a.ranges[pivot-1]; r.contains(addr) {
+		removed += r.size()
+		r = r.removeGreaterEqual(addr)
+		if r.size() == 0 {
+			pivot--
+		} else {
+			removed -= r.size()
+			a.ranges[pivot-1] = r
+		}
+	}
+	a.ranges = a.ranges[:pivot]
+	a.totalBytes -= removed
+}
+
+// cloneInto makes a deep clone of a's state into b, re-using
+// b's ranges if able.
+func (a *addrRanges) cloneInto(b *addrRanges) {
+	if len(a.ranges) > cap(b.ranges) {
+		// Grow the array.
+		ranges := (*notInHeapSlice)(unsafe.Pointer(&b.ranges))
+		ranges.len = 0
+		ranges.cap = cap(a.ranges)
+		ranges.array = (*notInHeap)(persistentalloc(unsafe.Sizeof(addrRange{})*uintptr(ranges.cap), sys.PtrSize, b.sysStat))
+	}
+	b.ranges = b.ranges[:len(a.ranges)]
+	b.totalBytes = a.totalBytes
+	copy(b.ranges, a.ranges)
+}
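
addrRanges.add coalesces with a bordering neighbor on either side before falling back to a shifted insert. The same decision logic, restated as a small self-contained sketch over ordinary slices (hypothetical names; no persistentalloc or offset-address machinery):

package main

import "fmt"

// rng is a half-open [base, limit) interval; a stand-in for addrRange.
type rng struct{ base, limit uintptr }

// addRange inserts r into the sorted, non-overlapping slice ranges,
// merging with a bordering neighbor on either side, as addrRanges.add does.
func addRange(ranges []rng, r rng) []rng {
	// Find the first range whose base is greater than r's base (findSucc).
	i := 0
	for i < len(ranges) && ranges[i].base <= r.base {
		i++
	}
	down := i > 0 && ranges[i-1].limit == r.base
	up := i < len(ranges) && r.limit == ranges[i].base
	switch {
	case down && up:
		// Bridge the gap: merge ranges[i-1], r, and ranges[i].
		ranges[i-1].limit = ranges[i].limit
		return append(ranges[:i], ranges[i+1:]...)
	case down:
		ranges[i-1].limit = r.limit
		return ranges
	case up:
		ranges[i].base = r.base
		return ranges
	default:
		// No bordering neighbor: shift and insert at i.
		ranges = append(ranges, rng{})
		copy(ranges[i+1:], ranges[i:])
		ranges[i] = r
		return ranges
	}
}

func main() {
	var rs []rng
	rs = addRange(rs, rng{0x1000, 0x2000})
	rs = addRange(rs, rng{0x3000, 0x4000})
	rs = addRange(rs, rng{0x2000, 0x3000}) // bridges the gap
	fmt.Println(rs)                        // [{4096 16384}]
}
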
diff --git a/src/runtime/msan_arm64.s b/src/runtime/msan_arm64.s
index 4dfe5e3..5e29f1a 100644
--- a/src/runtime/msan_arm64.s
+++ b/src/runtime/msan_arm64.s
@@ -47,9 +47,10 @@
 
 // Switches SP to g0 stack and calls (FARG). Arguments already set.
 TEXT	msancall<>(SB), NOSPLIT, $0-0
+	MOVD	RSP, R19                  // callee-saved
+	CBZ	g, g0stack                // no g, still on a system stack
 	MOVD	g_m(g), R10
 	MOVD	m_g0(R10), R11
-	MOVD	RSP, R19	// callee-saved
 	CMP	R11, g
 	BEQ	g0stack
 
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 0accb83..c56aa5a 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -13,13 +13,13 @@
 func roundupsize(size uintptr) uintptr {
 	if size < _MaxSmallSize {
 		if size <= smallSizeMax-8 {
-			return uintptr(class_to_size[size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv]])
+			return uintptr(class_to_size[size_to_class8[divRoundUp(size, smallSizeDiv)]])
 		} else {
-			return uintptr(class_to_size[size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]])
+			return uintptr(class_to_size[size_to_class128[divRoundUp(size-smallSizeMax, largeSizeDiv)]])
 		}
 	}
 	if size+_PageSize < size {
 		return size
 	}
-	return round(size, _PageSize)
+	return alignUp(size, _PageSize)
 }
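
roundupsize now uses the divRoundUp and alignUp helpers instead of open-coded arithmetic. Their intended behavior is plain ceiling division and rounding up to a power-of-two boundary; a minimal sketch, not necessarily the runtime's exact source:

package main

import "fmt"

// divRoundUp returns ceil(n / a). Only meaningful for a > 0.
func divRoundUp(n, a uintptr) uintptr {
	return (n + a - 1) / a
}

// alignUp rounds n up to a multiple of a, where a must be a power of two.
func alignUp(n, a uintptr) uintptr {
	return (n + a - 1) &^ (a - 1)
}

func main() {
	fmt.Println(divRoundUp(13, 8))   // 2
	fmt.Println(alignUp(13, 8))      // 16
	fmt.Println(alignUp(8192, 8192)) // 8192: already page-aligned
}
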
diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go
new file mode 100644
index 0000000..490eed4
--- /dev/null
+++ b/src/runtime/mspanset.go
@@ -0,0 +1,354 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"internal/cpu"
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// A spanSet is a set of *mspans.
+//
+// spanSet is safe for concurrent push and pop operations.
+type spanSet struct {
+	// A spanSet is a two-level data structure consisting of a
+	// growable spine that points to fixed-sized blocks. The spine
+	// can be accessed without locks, but adding a block or
+	// growing it requires taking the spine lock.
+	//
+	// Because each mspan covers at least 8K of heap and takes at
+	// most 8 bytes in the spanSet, the growth of the spine is
+	// quite limited.
+	//
+	// The spine and all blocks are allocated off-heap, which
+	// allows this to be used in the memory manager and avoids the
+	// need for write barriers on all of these. spanSetBlocks are
+	// managed in a pool, though never freed back to the operating
+	// system. We never release spine memory because there could be
+	// concurrent lock-free access and we're likely to reuse it
+	// anyway. (In principle, we could do this during STW.)
+
+	spineLock mutex
+	spine     unsafe.Pointer // *[N]*spanSetBlock, accessed atomically
+	spineLen  uintptr        // Spine array length, accessed atomically
+	spineCap  uintptr        // Spine array cap, accessed under lock
+
+	// index is the head and tail of the spanSet in a single field.
+	// The head and the tail both represent an index into the logical
+	// concatenation of all blocks, with the head always behind or
+	// equal to the tail (indicating an empty set). This field is
+	// always accessed atomically.
+	//
+	// The head and the tail are only 32 bits wide, which means we
+	// can only support up to 2^32 pushes before a reset. If every
+	// span in the heap were stored in this set, and each span were
+	// the minimum size (1 runtime page, 8 KiB), then roughly the
+	// smallest heap which would be unrepresentable is 32 TiB in size.
+	index headTailIndex
+}
+
+const (
+	spanSetBlockEntries = 512 // 4KB on 64-bit
+	spanSetInitSpineCap = 256 // Enough for 1GB heap on 64-bit
+)
+
+type spanSetBlock struct {
+	// Free spanSetBlocks are managed via a lock-free stack.
+	lfnode
+
+	// popped is the number of pop operations that have occurred on
+	// this block. This number is used to help determine when a block
+	// may be safely recycled.
+	popped uint32
+
+	// spans is the set of spans in this block.
+	spans [spanSetBlockEntries]*mspan
+}
+
+// push adds span s to buffer b. push is safe to call concurrently
+// with other push and pop operations.
+func (b *spanSet) push(s *mspan) {
+	// Obtain our slot.
+	cursor := uintptr(b.index.incTail().tail() - 1)
+	top, bottom := cursor/spanSetBlockEntries, cursor%spanSetBlockEntries
+
+	// Do we need to add a block?
+	spineLen := atomic.Loaduintptr(&b.spineLen)
+	var block *spanSetBlock
+retry:
+	if top < spineLen {
+		spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+		blockp := add(spine, sys.PtrSize*top)
+		block = (*spanSetBlock)(atomic.Loadp(blockp))
+	} else {
+		// Add a new block to the spine, potentially growing
+		// the spine.
+		lock(&b.spineLock)
+		// spineLen cannot change until we release the lock,
+		// but may have changed while we were waiting.
+		spineLen = atomic.Loaduintptr(&b.spineLen)
+		if top < spineLen {
+			unlock(&b.spineLock)
+			goto retry
+		}
+
+		if spineLen == b.spineCap {
+			// Grow the spine.
+			newCap := b.spineCap * 2
+			if newCap == 0 {
+				newCap = spanSetInitSpineCap
+			}
+			newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
+			if b.spineCap != 0 {
+				// Blocks are allocated off-heap, so
+				// no write barriers.
+				memmove(newSpine, b.spine, b.spineCap*sys.PtrSize)
+			}
+			// Spine is allocated off-heap, so no write barrier.
+			atomic.StorepNoWB(unsafe.Pointer(&b.spine), newSpine)
+			b.spineCap = newCap
+			// We can't immediately free the old spine
+			// since a concurrent push with a lower index
+			// could still be reading from it. We let it
+			// leak because even a 1TB heap would waste
+			// less than 2MB of memory on old spines. If
+			// this is a problem, we could free old spines
+			// during STW.
+		}
+
+		// Allocate a new block from the pool.
+		block = spanSetBlockPool.alloc()
+
+		// Add it to the spine.
+		blockp := add(b.spine, sys.PtrSize*top)
+		// Blocks are allocated off-heap, so no write barrier.
+		atomic.StorepNoWB(blockp, unsafe.Pointer(block))
+		atomic.Storeuintptr(&b.spineLen, spineLen+1)
+		unlock(&b.spineLock)
+	}
+
+	// We have a block. Insert the span atomically, since there may be
+	// concurrent readers via the block API.
+	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), unsafe.Pointer(s))
+}
+
+// pop removes and returns a span from buffer b, or nil if b is empty.
+// pop is safe to call concurrently with other pop and push operations.
+func (b *spanSet) pop() *mspan {
+	var head, tail uint32
+claimLoop:
+	for {
+		headtail := b.index.load()
+		head, tail = headtail.split()
+		if head >= tail {
+			// The buf is empty, as far as we can tell.
+			return nil
+		}
+		// Check if the head position we want to claim is actually
+		// backed by a block.
+		spineLen := atomic.Loaduintptr(&b.spineLen)
+		if spineLen <= uintptr(head)/spanSetBlockEntries {
+			// We're racing with a spine growth and the allocation of
+			// a new block (and maybe a new spine!), and trying to grab
+			// the span at the index which is currently being pushed.
+			// Instead of spinning, let's just notify the caller that
+			// there's nothing currently here. Spinning on this is
+			// almost definitely not worth it.
+			return nil
+		}
+		// Try to claim the current head by CASing in an updated head.
+		// This may fail transiently due to a push which modifies the
+		// tail, so keep trying while the head isn't changing.
+		want := head
+		for want == head {
+			if b.index.cas(headtail, makeHeadTailIndex(want+1, tail)) {
+				break claimLoop
+			}
+			headtail = b.index.load()
+			head, tail = headtail.split()
+		}
+		// We failed to claim the spot we were after and the head changed,
+		// meaning a popper got ahead of us. Try again from the top because
+		// the buf may not be empty.
+	}
+	top, bottom := head/spanSetBlockEntries, head%spanSetBlockEntries
+
+	// We may be reading a stale spine pointer, but because the length
+	// grows monotonically and we've already verified it, we'll definitely
+	// be reading from a valid block.
+	spine := atomic.Loadp(unsafe.Pointer(&b.spine))
+	blockp := add(spine, sys.PtrSize*uintptr(top))
+
+	// Given that the spine length is correct, we know we will never
+	// see a nil block here, since the length is always updated after
+	// the block is set.
+	block := (*spanSetBlock)(atomic.Loadp(blockp))
+	s := (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom])))
+	for s == nil {
+		// We raced with the span actually being set, but given that we
+		// know a block for this span exists, the race window here is
+		// extremely small. Try again.
+		s = (*mspan)(atomic.Loadp(unsafe.Pointer(&block.spans[bottom])))
+	}
+	// Clear the pointer. This isn't strictly necessary, but defensively
+	// avoids accidentally re-using blocks which could lead to memory
+	// corruption. This way, we'll get a nil pointer access instead.
+	atomic.StorepNoWB(unsafe.Pointer(&block.spans[bottom]), nil)
+
+	// Increase the popped count. If we are the last possible popper
+	// in the block (note that bottom need not equal spanSetBlockEntries-1
+	// due to races) then it's our responsibility to free the block.
+	//
+	// If we increment popped to spanSetBlockEntries, we can be sure that
+	// we're the last popper for this block, and it's thus safe to free it.
+	// Every other popper must have crossed this barrier (and thus finished
+	// popping its corresponding mspan) by the time we get here. Because
+	// we're the last popper, we also don't have to worry about concurrent
+	// pushers (there can't be any). Note that we may not be the popper
+	// which claimed the last slot in the block, we're just the last one
+	// to finish popping.
+	if atomic.Xadd(&block.popped, 1) == spanSetBlockEntries {
+		// Clear the block's pointer.
+		atomic.StorepNoWB(blockp, nil)
+
+		// Return the block to the block pool.
+		spanSetBlockPool.free(block)
+	}
+	return s
+}
+
+// reset resets a spanSet which is empty. It will also clean up
+// any leftover blocks.
+//
+// Throws if the buf is not empty.
+//
+// reset may not be called concurrently with any other operations
+// on the span set.
+func (b *spanSet) reset() {
+	head, tail := b.index.load().split()
+	if head < tail {
+		print("head = ", head, ", tail = ", tail, "\n")
+		throw("attempt to clear non-empty span set")
+	}
+	top := head / spanSetBlockEntries
+	if uintptr(top) < b.spineLen {
+		// If the head catches up to the tail and the set is empty,
+		// we may not clean up the block containing the head and tail
+		// since it may be pushed into again. In order to avoid leaking
+		// memory since we're going to reset the head and tail, clean
+		// up such a block now, if it exists.
+		blockp := (**spanSetBlock)(add(b.spine, sys.PtrSize*uintptr(top)))
+		block := *blockp
+		if block != nil {
+			// Sanity check the popped value.
+			if block.popped == 0 {
+				// popped should never be zero here: a non-nil block
+				// pointer means at least one value was pushed into this
+				// block, and since the set is empty it must also have
+				// been popped.
+				throw("span set block with unpopped elements found in reset")
+			}
+			if block.popped == spanSetBlockEntries {
+				// popped should also never be equal to spanSetBlockEntries
+				// because the last popper should have made the block pointer
+				// in this slot nil.
+				throw("fully empty unfreed span set block found in reset")
+			}
+
+			// Clear the pointer to the block.
+			atomic.StorepNoWB(unsafe.Pointer(blockp), nil)
+
+			// Return the block to the block pool.
+			spanSetBlockPool.free(block)
+		}
+	}
+	b.index.reset()
+	atomic.Storeuintptr(&b.spineLen, 0)
+}
+
+// spanSetBlockPool is a global pool of spanSetBlocks.
+var spanSetBlockPool spanSetBlockAlloc
+
+// spanSetBlockAlloc represents a concurrent pool of spanSetBlocks.
+type spanSetBlockAlloc struct {
+	stack lfstack
+}
+
+// alloc tries to grab a spanSetBlock out of the pool, and if it fails
+// persistentallocs a new one and returns it.
+func (p *spanSetBlockAlloc) alloc() *spanSetBlock {
+	if s := (*spanSetBlock)(p.stack.pop()); s != nil {
+		return s
+	}
+	return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
+}
+
+// free returns a spanSetBlock back to the pool.
+func (p *spanSetBlockAlloc) free(block *spanSetBlock) {
+	atomic.Store(&block.popped, 0)
+	p.stack.push(&block.lfnode)
+}
+
+// headTailIndex packs a 32-bit head and a 32-bit tail of
+// a queue into a single 64-bit value.
+type headTailIndex uint64
+
+// makeHeadTailIndex creates a headTailIndex value from a separate
+// head and tail.
+func makeHeadTailIndex(head, tail uint32) headTailIndex {
+	return headTailIndex(uint64(head)<<32 | uint64(tail))
+}
+
+// head returns the head of a headTailIndex value.
+func (h headTailIndex) head() uint32 {
+	return uint32(h >> 32)
+}
+
+// tail returns the tail of a headTailIndex value.
+func (h headTailIndex) tail() uint32 {
+	return uint32(h)
+}
+
+// split splits the headTailIndex value into its parts.
+func (h headTailIndex) split() (head uint32, tail uint32) {
+	return h.head(), h.tail()
+}
+
+// load atomically reads a headTailIndex value.
+func (h *headTailIndex) load() headTailIndex {
+	return headTailIndex(atomic.Load64((*uint64)(h)))
+}
+
+// cas atomically compares-and-swaps a headTailIndex value.
+func (h *headTailIndex) cas(old, new headTailIndex) bool {
+	return atomic.Cas64((*uint64)(h), uint64(old), uint64(new))
+}
+
+// incHead atomically increments the head of a headTailIndex.
+func (h *headTailIndex) incHead() headTailIndex {
+	return headTailIndex(atomic.Xadd64((*uint64)(h), (1 << 32)))
+}
+
+// decHead atomically decrements the head of a headTailIndex.
+func (h *headTailIndex) decHead() headTailIndex {
+	return headTailIndex(atomic.Xadd64((*uint64)(h), -(1 << 32)))
+}
+
+// incTail atomically increments the tail of a headTailIndex.
+func (h *headTailIndex) incTail() headTailIndex {
+	ht := headTailIndex(atomic.Xadd64((*uint64)(h), +1))
+	// Check for overflow.
+	if ht.tail() == 0 {
+		print("runtime: head = ", ht.head(), ", tail = ", ht.tail(), "\n")
+		throw("headTailIndex overflow")
+	}
+	return ht
+}
+
+// reset clears the headTailIndex to (0, 0).
+func (h *headTailIndex) reset() {
+	atomic.Store64((*uint64)(h), 0)
+}
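
Because the head and tail share one 64-bit word, a single atomic add of 1 moves only the tail and an add of 1<<32 moves only the head (barring overflow). A tiny non-atomic round-trip of the packing helpers, for illustration only:

package main

import "fmt"

type headTailIndex uint64

func makeHeadTailIndex(head, tail uint32) headTailIndex {
	return headTailIndex(uint64(head)<<32 | uint64(tail))
}

func (h headTailIndex) head() uint32 { return uint32(h >> 32) }
func (h headTailIndex) tail() uint32 { return uint32(h) }

func main() {
	h := makeHeadTailIndex(3, 7)
	h++        // bumps only the tail
	h += 1 << 32 // bumps only the head
	fmt.Println(h.head(), h.tail()) // 4 8
}
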
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 421580e..6a8a34d 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -31,7 +31,7 @@
 	nfree       uint64 // number of frees
 
 	// Statistics about malloc heap.
-	// Protected by mheap.lock
+	// Updated atomically, or with the world stopped.
 	//
 	// Like MemStats, heap_sys and heap_inuse do not count memory
 	// in manually-managed spans.
@@ -40,19 +40,22 @@
 	heap_idle     uint64 // bytes in idle spans
 	heap_inuse    uint64 // bytes in mSpanInUse spans
 	heap_released uint64 // bytes released to the os
-	heap_objects  uint64 // total number of allocated objects
+
+	// heap_objects is not used by the runtime directly and instead
+	// computed on the fly by updatememstats.
+	heap_objects uint64 // total number of allocated objects
 
 	// Statistics about allocation of low-level fixed-size structures.
 	// Protected by FixAlloc locks.
-	stacks_inuse uint64 // bytes in manually-managed stack spans
+	stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW
 	stacks_sys   uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
 	mspan_inuse  uint64 // mspan structures
 	mspan_sys    uint64
 	mcache_inuse uint64 // mcache structures
 	mcache_sys   uint64
 	buckhash_sys uint64 // profiling bucket hash table
-	gc_sys       uint64
-	other_sys    uint64
+	gc_sys       uint64 // updated atomically or during STW
+	other_sys    uint64 // updated atomically or during STW
 
 	// Statistics about garbage collector.
 	// Protected by mheap or stopping the world during GC.
@@ -79,6 +82,8 @@
 
 	last_gc_nanotime uint64 // last gc (monotonic time)
 	tinyallocs       uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+	last_next_gc     uint64 // next_gc for the previous GC
+	last_heap_inuse  uint64 // heap_inuse at mark termination of the previous GC
 
 	// triggerRatio is the heap growth ratio that triggers marking.
 	//
@@ -508,6 +513,12 @@
 
 //go:nowritebarrier
 func updatememstats() {
+	// Flush mcaches to mcentral before doing anything else.
+	//
+	// Flushing to the mcentral may in general cause stats to
+	// change as mcentral data structures are manipulated.
+	systemstack(flushallmcaches)
+
 	memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
 	memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
 	memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
@@ -518,7 +529,7 @@
 
 	// Calculate memory allocator stats.
 	// During program execution we only count number of frees and amount of freed memory.
-	// Current number of alive object in the heap and amount of alive heap memory
+	// Current number of alive objects in the heap and amount of alive heap memory
 	// are calculated by scanning all spans.
 	// Total number of mallocs is calculated as number of frees plus number of alive objects.
 	// Similarly, total amount of allocated memory is calculated as amount of freed memory
@@ -532,9 +543,6 @@
 		memstats.by_size[i].nfree = 0
 	}
 
-	// Flush mcache's to mcentral.
-	systemstack(flushallmcaches)
-
 	// Aggregate local stats.
 	cachestats()
 
diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go
index f444452..632769c 100644
--- a/src/runtime/mwbbuf.go
+++ b/src/runtime/mwbbuf.go
@@ -296,6 +296,13 @@
 			continue
 		}
 		mbits.setMarked()
+
+		// Mark span.
+		arena, pageIdx, pageMask := pageIndexOf(span.base())
+		if arena.pageMarks[pageIdx]&pageMask == 0 {
+			atomic.Or8(&arena.pageMarks[pageIdx], pageMask)
+		}
+
 		if span.spanclass.noscan() {
 			gcw.bytesMarked += uint64(span.elemsize)
 			continue
diff --git a/src/runtime/nbpipe_fcntl_libc_test.go b/src/runtime/nbpipe_fcntl_libc_test.go
new file mode 100644
index 0000000..b38c583
--- /dev/null
+++ b/src/runtime/nbpipe_fcntl_libc_test.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin solaris
+
+package runtime_test
+
+import (
+	"runtime"
+	"syscall"
+)
+
+// Call fcntl libc function rather than calling syscall.
+func fcntl(fd uintptr, cmd int, arg uintptr) (uintptr, syscall.Errno) {
+	res, errno := runtime.Fcntl(fd, uintptr(cmd), arg)
+	return res, syscall.Errno(errno)
+}
diff --git a/src/runtime/nbpipe_fcntl_unix_test.go b/src/runtime/nbpipe_fcntl_unix_test.go
new file mode 100644
index 0000000..75acdb6
--- /dev/null
+++ b/src/runtime/nbpipe_fcntl_unix_test.go
@@ -0,0 +1,17 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build dragonfly freebsd linux netbsd openbsd
+
+package runtime_test
+
+import (
+	"internal/syscall/unix"
+	"syscall"
+)
+
+func fcntl(fd uintptr, cmd int, arg uintptr) (uintptr, syscall.Errno) {
+	res, _, err := syscall.Syscall(unix.FcntlSyscall, fd, uintptr(cmd), arg)
+	return res, err
+}
diff --git a/src/runtime/nbpipe_pipe.go b/src/runtime/nbpipe_pipe.go
new file mode 100644
index 0000000..822b294
--- /dev/null
+++ b/src/runtime/nbpipe_pipe.go
@@ -0,0 +1,19 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly
+
+package runtime
+
+func nonblockingPipe() (r, w int32, errno int32) {
+	r, w, errno = pipe()
+	if errno != 0 {
+		return -1, -1, errno
+	}
+	closeonexec(r)
+	setNonblock(r)
+	closeonexec(w)
+	setNonblock(w)
+	return r, w, errno
+}
diff --git a/src/runtime/nbpipe_pipe2.go b/src/runtime/nbpipe_pipe2.go
new file mode 100644
index 0000000..e3639d9
--- /dev/null
+++ b/src/runtime/nbpipe_pipe2.go
@@ -0,0 +1,22 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build freebsd linux netbsd openbsd solaris
+
+package runtime
+
+func nonblockingPipe() (r, w int32, errno int32) {
+	r, w, errno = pipe2(_O_NONBLOCK | _O_CLOEXEC)
+	if errno == -_ENOSYS {
+		r, w, errno = pipe()
+		if errno != 0 {
+			return -1, -1, errno
+		}
+		closeonexec(r)
+		setNonblock(r)
+		closeonexec(w)
+		setNonblock(w)
+	}
+	return r, w, errno
+}
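
The same create-then-flag pattern applies in user code. A rough user-space analogue of the pipe2-with-fallback logic, as a Linux-only sketch built on the syscall package rather than the runtime's internal wrappers:

// +build linux

package main

import (
	"fmt"
	"syscall"
)

// nonblockingPipe mirrors the runtime's strategy: try pipe2 with the
// flags set atomically, and fall back to pipe plus per-fd fixups.
func nonblockingPipe() (r, w int, err error) {
	var p [2]int
	err = syscall.Pipe2(p[:], syscall.O_NONBLOCK|syscall.O_CLOEXEC)
	if err == nil {
		return p[0], p[1], nil
	}
	if err != syscall.ENOSYS {
		return -1, -1, err
	}
	if err = syscall.Pipe(p[:]); err != nil {
		return -1, -1, err
	}
	for _, fd := range p {
		syscall.CloseOnExec(fd)
		if err = syscall.SetNonblock(fd, true); err != nil {
			return -1, -1, err
		}
	}
	return p[0], p[1], nil
}

func main() {
	r, w, err := nonblockingPipe()
	if err != nil {
		panic(err)
	}
	defer syscall.Close(r)
	defer syscall.Close(w)
	fmt.Println("pipe fds:", r, w)
}
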
diff --git a/src/runtime/nbpipe_test.go b/src/runtime/nbpipe_test.go
new file mode 100644
index 0000000..d739f57
--- /dev/null
+++ b/src/runtime/nbpipe_test.go
@@ -0,0 +1,93 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package runtime_test
+
+import (
+	"runtime"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+func TestNonblockingPipe(t *testing.T) {
+	t.Parallel()
+
+	// NonblockingPipe is the name nonblockingPipe is exported under for testing.
+	r, w, errno := runtime.NonblockingPipe()
+	if errno != 0 {
+		t.Fatal(syscall.Errno(errno))
+	}
+	defer func() {
+		runtime.Close(r)
+		runtime.Close(w)
+	}()
+
+	checkIsPipe(t, r, w)
+	checkNonblocking(t, r, "reader")
+	checkCloseonexec(t, r, "reader")
+	checkNonblocking(t, w, "writer")
+	checkCloseonexec(t, w, "writer")
+}
+
+func checkIsPipe(t *testing.T, r, w int32) {
+	bw := byte(42)
+	if n := runtime.Write(uintptr(w), unsafe.Pointer(&bw), 1); n != 1 {
+		t.Fatalf("Write(w, &b, 1) == %d, expected 1", n)
+	}
+	var br byte
+	if n := runtime.Read(r, unsafe.Pointer(&br), 1); n != 1 {
+		t.Fatalf("Read(r, &b, 1) == %d, expected 1", n)
+	}
+	if br != bw {
+		t.Errorf("pipe read %d, expected %d", br, bw)
+	}
+}
+
+func checkNonblocking(t *testing.T, fd int32, name string) {
+	t.Helper()
+	flags, errno := fcntl(uintptr(fd), syscall.F_GETFL, 0)
+	if errno != 0 {
+		t.Errorf("fcntl(%s, F_GETFL) failed: %v", name, syscall.Errno(errno))
+	} else if flags&syscall.O_NONBLOCK == 0 {
+		t.Errorf("O_NONBLOCK not set in %s flags %#x", name, flags)
+	}
+}
+
+func checkCloseonexec(t *testing.T, fd int32, name string) {
+	t.Helper()
+	flags, errno := fcntl(uintptr(fd), syscall.F_GETFD, 0)
+	if errno != 0 {
+		t.Errorf("fcntl(%s, F_GETFD) failed: %v", name, syscall.Errno(errno))
+	} else if flags&syscall.FD_CLOEXEC == 0 {
+		t.Errorf("FD_CLOEXEC not set in %s flags %#x", name, flags)
+	}
+}
+
+func TestSetNonblock(t *testing.T) {
+	t.Parallel()
+
+	r, w, errno := runtime.Pipe()
+	if errno != 0 {
+		t.Fatal(syscall.Errno(errno))
+	}
+	defer func() {
+		runtime.Close(r)
+		runtime.Close(w)
+	}()
+
+	checkIsPipe(t, r, w)
+
+	runtime.SetNonblock(r)
+	runtime.SetNonblock(w)
+	checkNonblocking(t, r, "reader")
+	checkNonblocking(t, w, "writer")
+
+	runtime.Closeonexec(r)
+	runtime.Closeonexec(w)
+	checkCloseonexec(t, r, "reader")
+	checkCloseonexec(t, w, "writer")
+}
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 73bbc5e..34ea82a 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd js,wasm linux nacl netbsd openbsd solaris windows
+// +build aix darwin dragonfly freebsd js,wasm linux netbsd openbsd solaris windows
 
 package runtime
 
@@ -12,12 +12,35 @@
 )
 
 // Integrated network poller (platform-independent part).
-// A particular implementation (epoll/kqueue) must define the following functions:
-// func netpollinit()			// to initialize the poller
-// func netpollopen(fd uintptr, pd *pollDesc) int32	// to arm edge-triggered notifications
-// and associate fd with pd.
-// An implementation must call the following function to denote that the pd is ready.
-// func netpollready(gpp **g, pd *pollDesc, mode int32)
+// A particular implementation (epoll/kqueue/port/AIX/Windows)
+// must define the following functions:
+//
+// func netpollinit()
+//     Initialize the poller. Only called once.
+//
+// func netpollopen(fd uintptr, pd *pollDesc) int32
+//     Arm edge-triggered notifications for fd. The pd argument is to pass
+//     back to netpollready when fd is ready. Return an errno value.
+//
+// func netpoll(delta int64) gList
+//     Poll the network. If delta < 0, block indefinitely. If delta == 0,
+//     poll without blocking. If delta > 0, block for up to delta nanoseconds.
+//     Return a list of goroutines built by calling netpollready.
+//
+// func netpollBreak()
+//     Wake up the network poller, assumed to be blocked in netpoll.
+//
+// func netpollIsPollDescriptor(fd uintptr) bool
+//     Reports whether fd is a file descriptor used by the poller.
+
+// Error codes returned by runtime_pollReset and runtime_pollWait.
+// These must match the values in internal/poll/fd_poll_runtime.go.
+const (
+	pollNoError        = 0 // no error
+	pollErrClosing     = 1 // descriptor is closed
+	pollErrTimeout     = 2 // I/O timeout
+	pollErrNotPollable = 3 // general error polling descriptor
+)
 
 // pollDesc contains 2 binary semaphores, rg and wg, to park reader and writer
 // goroutines respectively. The semaphore can be in the following states:
@@ -25,12 +48,12 @@
 //           a goroutine consumes the notification by changing the state to nil.
 // pdWait - a goroutine prepares to park on the semaphore, but not yet parked;
 //          the goroutine commits to park by changing the state to G pointer,
-//          or, alternatively, concurrent io notification changes the state to READY,
+//          or, alternatively, concurrent io notification changes the state to pdReady,
 //          or, alternatively, concurrent timeout/close changes the state to nil.
 // G pointer - the goroutine is blocked on the semaphore;
-//             io notification or timeout/close changes the state to READY or nil respectively
+//             io notification or timeout/close changes the state to pdReady or nil respectively
 //             and unparks the goroutine.
-// nil - nothing of the above.
+// nil - none of the above.
 const (
 	pdReady uintptr = 1
 	pdWait  uintptr = 2
@@ -79,15 +102,28 @@
 }
 
 var (
-	netpollInited  uint32
+	netpollInitLock mutex
+	netpollInited   uint32
+
 	pollcache      pollCache
 	netpollWaiters uint32
 )
 
 //go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit
 func poll_runtime_pollServerInit() {
-	netpollinit()
-	atomic.Store(&netpollInited, 1)
+	netpollGenericInit()
+}
+
+func netpollGenericInit() {
+	if atomic.Load(&netpollInited) == 0 {
+		lockInit(&netpollInitLock, lockRankNetpollInit)
+		lock(&netpollInitLock)
+		if netpollInited == 0 {
+			netpollinit()
+			atomic.Store(&netpollInited, 1)
+		}
+		unlock(&netpollInitLock)
+	}
 }
 
 func netpollinited() bool {
@@ -99,14 +135,7 @@
 // poll_runtime_isPollServerDescriptor reports whether fd is a
 // descriptor being used by netpoll.
 func poll_runtime_isPollServerDescriptor(fd uintptr) bool {
-	fds := netpolldescriptor()
-	if GOOS != "aix" {
-		return fd == fds
-	} else {
-		// AIX have a pipe in its netpoll implementation.
-		// Therefore, two fd are returned by netpolldescriptor using a mask.
-		return fd == fds&0xFFFF || fd == (fds>>16)&0xFFFF
-	}
+	return netpollIsPollDescriptor(fd)
 }
 
 //go:linkname poll_runtime_pollOpen internal/poll.runtime_pollOpen
@@ -157,40 +186,47 @@
 	unlock(&c.lock)
 }
 
+// poll_runtime_pollReset, which is internal/poll.runtime_pollReset,
+// prepares a descriptor for polling in mode, which is 'r' or 'w'.
+// This returns an error code; the codes are defined above.
 //go:linkname poll_runtime_pollReset internal/poll.runtime_pollReset
 func poll_runtime_pollReset(pd *pollDesc, mode int) int {
-	err := netpollcheckerr(pd, int32(mode))
-	if err != 0 {
-		return err
+	errcode := netpollcheckerr(pd, int32(mode))
+	if errcode != pollNoError {
+		return errcode
 	}
 	if mode == 'r' {
 		pd.rg = 0
 	} else if mode == 'w' {
 		pd.wg = 0
 	}
-	return 0
+	return pollNoError
 }
 
+// poll_runtime_pollWait, which is internal/poll.runtime_pollWait,
+// waits for a descriptor to be ready for reading or writing,
+// according to mode, which is 'r' or 'w'.
+// This returns an error code; the codes are defined above.
 //go:linkname poll_runtime_pollWait internal/poll.runtime_pollWait
 func poll_runtime_pollWait(pd *pollDesc, mode int) int {
-	err := netpollcheckerr(pd, int32(mode))
-	if err != 0 {
-		return err
+	errcode := netpollcheckerr(pd, int32(mode))
+	if errcode != pollNoError {
+		return errcode
 	}
 	// As for now only Solaris, illumos, and AIX use level-triggered IO.
 	if GOOS == "solaris" || GOOS == "illumos" || GOOS == "aix" {
 		netpollarm(pd, mode)
 	}
 	for !netpollblock(pd, int32(mode), false) {
-		err = netpollcheckerr(pd, int32(mode))
-		if err != 0 {
-			return err
+		errcode = netpollcheckerr(pd, int32(mode))
+		if errcode != pollNoError {
+			return errcode
 		}
 		// Can happen if timeout has fired and unblocked us,
 		// but before we had a chance to run, timeout has been reset.
 		// Pretend it has not happened and retry.
 	}
-	return 0
+	return pollNoError
 }
 
 //go:linkname poll_runtime_pollWaitCanceled internal/poll.runtime_pollWaitCanceled
@@ -232,13 +268,12 @@
 	if pd.rt.f == nil {
 		if pd.rd > 0 {
 			pd.rt.f = rtf
-			pd.rt.when = pd.rd
 			// Copy current seq into the timer arg.
 			// Timer func will check the seq against current descriptor seq,
 			// if they differ the descriptor was reused or timers were reset.
 			pd.rt.arg = pd
 			pd.rt.seq = pd.rseq
-			addtimer(&pd.rt)
+			resettimer(&pd.rt, pd.rd)
 		}
 	} else if pd.rd != rd0 || combo != combo0 {
 		pd.rseq++ // invalidate current timers
@@ -252,10 +287,9 @@
 	if pd.wt.f == nil {
 		if pd.wd > 0 && !combo {
 			pd.wt.f = netpollWriteDeadline
-			pd.wt.when = pd.wd
 			pd.wt.arg = pd
 			pd.wt.seq = pd.wseq
-			addtimer(&pd.wt)
+			resettimer(&pd.wt, pd.wd)
 		}
 	} else if pd.wd != wd0 || combo != combo0 {
 		pd.wseq++ // invalidate current timers
@@ -316,8 +350,13 @@
 	}
 }
 
-// make pd ready, newly runnable goroutines (if any) are added to toRun.
-// May run during STW, so write barriers are not allowed.
+// netpollready is called by the platform-specific netpoll function.
+// It declares that the fd associated with pd is ready for I/O.
+// The toRun argument is used to build a list of goroutines to return
+// from netpoll. The mode argument is 'r', 'w', or 'r'+'w' to indicate
+// whether the fd is ready for reading or writing or both.
+//
+// This may run while the world is stopped, so write barriers are not allowed.
 //go:nowritebarrier
 func netpollready(toRun *gList, pd *pollDesc, mode int32) {
 	var rg, wg *g
@@ -337,18 +376,18 @@
 
 func netpollcheckerr(pd *pollDesc, mode int32) int {
 	if pd.closing {
-		return 1 // ErrFileClosing or ErrNetClosing
+		return pollErrClosing
 	}
 	if (mode == 'r' && pd.rd < 0) || (mode == 'w' && pd.wd < 0) {
-		return 2 // ErrTimeout
+		return pollErrTimeout
 	}
 	// Report an event scanning error only on a read event.
 	// An error on a write event will be captured in a subsequent
 	// write call that is able to report a more specific error.
 	if mode == 'r' && pd.everr {
-		return 3 // ErrNotPollable
+		return pollErrNotPollable
 	}
-	return 0
+	return pollNoError
 }
 
 func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
@@ -375,7 +414,7 @@
 		gpp = &pd.wg
 	}
 
-	// set the gpp semaphore to WAIT
+	// set the gpp semaphore to pdWait
 	for {
 		old := *gpp
 		if old == pdReady {
@@ -390,13 +429,13 @@
 		}
 	}
 
-	// need to recheck error states after setting gpp to WAIT
+	// need to recheck error states after setting gpp to pdWait
 	// this is necessary because runtime_pollUnblock/runtime_pollSetDeadline/deadlineimpl
 	// do the opposite: store to closing/rd/wd, membarrier, load of rg/wg
 	if waitio || netpollcheckerr(pd, mode) == 0 {
 		gopark(netpollblockcommit, unsafe.Pointer(gpp), waitReasonIOWait, traceEvGoBlockNet, 5)
 	}
-	// be careful to not lose concurrent READY notification
+	// be careful to not lose concurrent pdReady notification
 	old := atomic.Xchguintptr(gpp, 0)
 	if old > pdWait {
 		throw("runtime: corrupted polldesc")
@@ -416,7 +455,7 @@
 			return nil
 		}
 		if old == 0 && !ioready {
-			// Only set READY for ioready. runtime_pollWait
+			// Only set pdReady for ioready. runtime_pollWait
 			// will check for timeout/cancel before waiting.
 			return nil
 		}
@@ -425,7 +464,7 @@
 			new = pdReady
 		}
 		if atomic.Casuintptr(gpp, old, new) {
-			if old == pdReady || old == pdWait {
+			if old == pdWait {
 				old = 0
 			}
 			return (*g)(unsafe.Pointer(old))
@@ -504,6 +543,7 @@
 	}
 	pd := c.first
 	c.first = pd.link
+	lockInit(&pd.lock, lockRankPollDesc)
 	unlock(&c.lock)
 	return pd
 }
diff --git a/src/runtime/netpoll_aix.go b/src/runtime/netpoll_aix.go
index f0ba094..4590ed8 100644
--- a/src/runtime/netpoll_aix.go
+++ b/src/runtime/netpoll_aix.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // This is based on the former libgo/runtime/netpoll_select.c implementation
 // except that it uses poll instead of select and is written in Go.
@@ -21,12 +24,6 @@
 	return int32(r), int32(err)
 }
 
-//go:nosplit
-func fcntl(fd, cmd int32, arg uintptr) int32 {
-	r, _ := syscall3(&libc_fcntl, uintptr(fd), uintptr(cmd), arg)
-	return int32(r)
-}
-
 // pollfd represents the poll structure for AIX operating system.
 type pollfd struct {
 	fd      int32
@@ -38,7 +35,6 @@
 const _POLLOUT = 0x0002
 const _POLLHUP = 0x2000
 const _POLLERR = 0x4000
-const _O_NONBLOCK = 0x4
 
 var (
 	pfds           []pollfd
@@ -48,25 +44,18 @@
 	rdwake         int32
 	wrwake         int32
 	pendingUpdates int32
+
+	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
-	var p [2]int32
-
 	// Create the pipe we use to wakeup poll.
-	if err := pipe(&p[0]); err < 0 {
+	r, w, errno := nonblockingPipe()
+	if errno != 0 {
 		throw("netpollinit: failed to create pipe")
 	}
-	rdwake = p[0]
-	wrwake = p[1]
-
-	fl := uintptr(fcntl(rdwake, _F_GETFL, 0))
-	fcntl(rdwake, _F_SETFL, fl|_O_NONBLOCK)
-	fcntl(rdwake, _F_SETFD, _FD_CLOEXEC)
-
-	fl = uintptr(fcntl(wrwake, _F_GETFL, 0))
-	fcntl(wrwake, _F_SETFL, fl|_O_NONBLOCK)
-	fcntl(wrwake, _F_SETFD, _FD_CLOEXEC)
+	rdwake = r
+	wrwake = w
 
 	// Pre-allocate array of pollfd structures for poll.
 	pfds = make([]pollfd, 1, 128)
@@ -79,12 +68,8 @@
 	pds[0] = nil
 }
 
-func netpolldescriptor() uintptr {
-	// Both fd must be returned
-	if rdwake > 0xFFFF || wrwake > 0xFFFF {
-		throw("netpolldescriptor: invalid fd number")
-	}
-	return uintptr(rdwake<<16 | wrwake)
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return fd == uintptr(rdwake) || fd == uintptr(wrwake)
 }
 
 // netpollwakeup writes on wrwake to wakeup poll before any changes.
@@ -148,12 +133,35 @@
 	unlock(&mtxset)
 }
 
+// netpollBreak interrupts a poll.
+func netpollBreak() {
+	if atomic.Cas(&netpollWakeSig, 0, 1) {
+		b := [1]byte{0}
+		write(uintptr(wrwake), unsafe.Pointer(&b[0]), 1)
+	}
+}
+
+// netpoll checks for ready network connections.
+// Returns list of goroutines that become runnable.
+// delay < 0: blocks indefinitely
+// delay == 0: does not block, just polls
+// delay > 0: block for up to that many nanoseconds
 //go:nowritebarrierrec
-func netpoll(block bool) gList {
-	timeout := ^uintptr(0)
-	if !block {
-		timeout = 0
+func netpoll(delay int64) gList {
+	var timeout uintptr
+	if delay < 0 {
+		timeout = ^uintptr(0)
+	} else if delay == 0 {
+		// TODO: call poll with timeout == 0
 		return gList{}
+	} else if delay < 1e6 {
+		timeout = 1
+	} else if delay < 1e15 {
+		timeout = uintptr(delay / 1e6)
+	} else {
+		// An arbitrary cap on how long to wait for a timer.
+		// 1e9 ms == ~11.5 days.
+		timeout = 1e9
 	}
 retry:
 	lock(&mtxpoll)
@@ -168,20 +176,30 @@
 			throw("poll failed")
 		}
 		unlock(&mtxset)
+		// If a timed sleep was interrupted, just return to
+		// recalculate how long we should sleep now.
+		if timeout > 0 {
+			return gList{}
+		}
 		goto retry
 	}
 	// Check if some descriptors need to be changed
 	if n != 0 && pfds[0].revents&(_POLLIN|_POLLHUP|_POLLERR) != 0 {
-		var b [1]byte
-		for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 {
+		if delay != 0 {
+			// A netpollwakeup could be picked up by a
+			// non-blocking poll. Only clear the wakeup
+			// if blocking.
+			var b [1]byte
+			for read(rdwake, unsafe.Pointer(&b[0]), 1) == 1 {
+			}
+			atomic.Store(&netpollWakeSig, 0)
 		}
-		// Do not look at the other fds in this case as the mode may have changed
-		// XXX only additions of flags are made, so maybe it is ok
-		unlock(&mtxset)
-		goto retry
+		// Still look at the other fds even if the mode may have
+		// changed, as netpollBreak might have been called.
+		n--
 	}
 	var toRun gList
-	for i := 0; i < len(pfds) && n > 0; i++ {
+	for i := 1; i < len(pfds) && n > 0; i++ {
 		pfd := &pfds[i]
 
 		var mode int32
@@ -203,8 +221,5 @@
 		}
 	}
 	unlock(&mtxset)
-	if block && toRun.empty() {
-		goto retry
-	}
 	return toRun
 }
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index 8f49309..58f4fa8 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 func epollcreate(size int32) int32
 func epollcreate1(flags int32) int32
@@ -20,24 +23,42 @@
 
 var (
 	epfd int32 = -1 // epoll descriptor
+
+	netpollBreakRd, netpollBreakWr uintptr // for netpollBreak
+
+	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
 	epfd = epollcreate1(_EPOLL_CLOEXEC)
-	if epfd >= 0 {
-		return
-	}
-	epfd = epollcreate(1024)
-	if epfd >= 0 {
+	if epfd < 0 {
+		epfd = epollcreate(1024)
+		if epfd < 0 {
+			println("runtime: epollcreate failed with", -epfd)
+			throw("runtime: netpollinit failed")
+		}
 		closeonexec(epfd)
-		return
 	}
-	println("runtime: epollcreate failed with", -epfd)
-	throw("runtime: netpollinit failed")
+	r, w, errno := nonblockingPipe()
+	if errno != 0 {
+		println("runtime: pipe failed with", -errno)
+		throw("runtime: pipe failed")
+	}
+	ev := epollevent{
+		events: _EPOLLIN,
+	}
+	*(**uintptr)(unsafe.Pointer(&ev.data)) = &netpollBreakRd
+	errno = epollctl(epfd, _EPOLL_CTL_ADD, r, &ev)
+	if errno != 0 {
+		println("runtime: epollctl failed with", -errno)
+		throw("runtime: epollctl failed")
+	}
+	netpollBreakRd = uintptr(r)
+	netpollBreakWr = uintptr(w)
 }
 
-func netpolldescriptor() uintptr {
-	return uintptr(epfd)
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return fd == uintptr(epfd) || fd == netpollBreakRd || fd == netpollBreakWr
 }
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -56,15 +77,49 @@
 	throw("runtime: unused")
 }
 
-// polls for ready network connections
-// returns list of goroutines that become runnable
-func netpoll(block bool) gList {
+// netpollBreak interrupts an epollwait.
+func netpollBreak() {
+	if atomic.Cas(&netpollWakeSig, 0, 1) {
+		for {
+			var b byte
+			n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
+			if n == 1 {
+				break
+			}
+			if n == -_EINTR {
+				continue
+			}
+			if n == -_EAGAIN {
+				return
+			}
+			println("runtime: netpollBreak write failed with", -n)
+			throw("runtime: netpollBreak write failed")
+		}
+	}
+}
+
+// netpoll checks for ready network connections.
+// Returns list of goroutines that become runnable.
+// delay < 0: blocks indefinitely
+// delay == 0: does not block, just polls
+// delay > 0: block for up to that many nanoseconds
+func netpoll(delay int64) gList {
 	if epfd == -1 {
 		return gList{}
 	}
-	waitms := int32(-1)
-	if !block {
+	var waitms int32
+	if delay < 0 {
+		waitms = -1
+	} else if delay == 0 {
 		waitms = 0
+	} else if delay < 1e6 {
+		waitms = 1
+	} else if delay < 1e15 {
+		waitms = int32(delay / 1e6)
+	} else {
+		// An arbitrary cap on how long to wait for a timer.
+		// 1e9 ms == ~11.5 days.
+		waitms = 1e9
 	}
 	var events [128]epollevent
 retry:
@@ -74,6 +129,11 @@
 			println("runtime: epollwait on fd", epfd, "failed with", -n)
 			throw("runtime: netpoll failed")
 		}
+		// If a timed sleep was interrupted, just return to
+		// recalculate how long we should sleep now.
+		if waitms > 0 {
+			return gList{}
+		}
 		goto retry
 	}
 	var toRun gList
@@ -82,6 +142,23 @@
 		if ev.events == 0 {
 			continue
 		}
+
+		if *(**uintptr)(unsafe.Pointer(&ev.data)) == &netpollBreakRd {
+			if ev.events != _EPOLLIN {
+				println("runtime: netpoll: break fd ready for", ev.events)
+				throw("runtime: netpoll: break fd ready for something unexpected")
+			}
+			if delay != 0 {
+				// netpollBreak could be picked up by a
+				// nonblocking poll. Only read the byte
+				// if blocking.
+				var tmp [16]byte
+				read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp)))
+				atomic.Store(&netpollWakeSig, 0)
+			}
+			continue
+		}
+
 		var mode int32
 		if ev.events&(_EPOLLIN|_EPOLLRDHUP|_EPOLLHUP|_EPOLLERR) != 0 {
 			mode += 'r'
@@ -98,8 +175,5 @@
 			netpollready(&toRun, pd, mode)
 		}
 	}
-	if block && toRun.empty() {
-		goto retry
-	}
 	return toRun
 }
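
Each poller backend now converts the nanosecond delay into its native timeout with the same clamping rules. Restating the epoll variant's conversion as a self-contained helper (hypothetical name, illustrative only):

package main

import "fmt"

// delayToMillis converts a netpoll-style delay in nanoseconds into an
// epoll_wait timeout in milliseconds, using the rules above: negative
// blocks forever (-1), zero polls, sub-millisecond sleeps round up to
// 1ms, and very large delays are capped at 1e9 ms (~11.5 days).
func delayToMillis(delay int64) int32 {
	switch {
	case delay < 0:
		return -1
	case delay == 0:
		return 0
	case delay < 1e6:
		return 1
	case delay < 1e15:
		return int32(delay / 1e6)
	default:
		return 1e9
	}
}

func main() {
	fmt.Println(delayToMillis(-1))      // -1: block indefinitely
	fmt.Println(delayToMillis(500))     // 1: round sub-millisecond up
	fmt.Println(delayToMillis(2500000)) // 2
}
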
diff --git a/src/runtime/netpoll_fake.go b/src/runtime/netpoll_fake.go
index 5b1a63a..b2af3b8 100644
--- a/src/runtime/netpoll_fake.go
+++ b/src/runtime/netpoll_fake.go
@@ -2,18 +2,18 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Fake network poller for NaCl and wasm/js.
-// Should never be used, because NaCl and wasm/js network connections do not honor "SetNonblock".
+// Fake network poller for wasm/js.
+// Should never be used, because wasm/js network connections do not honor "SetNonblock".
 
-// +build nacl js,wasm
+// +build js,wasm
 
 package runtime
 
 func netpollinit() {
 }
 
-func netpolldescriptor() uintptr {
-	return ^uintptr(0)
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return false
 }
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -27,6 +27,9 @@
 func netpollarm(pd *pollDesc, mode int) {
 }
 
-func netpoll(block bool) gList {
+func netpollBreak() {
+}
+
+func netpoll(delay int64) gList {
 	return gList{}
 }
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index a8880e8..3bd93c1 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -8,10 +8,17 @@
 
 // Integrated network poller (kqueue-based implementation).
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var (
 	kq int32 = -1
+
+	netpollBreakRd, netpollBreakWr uintptr // for netpollBreak
+
+	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func netpollinit() {
@@ -21,10 +28,27 @@
 		throw("runtime: netpollinit failed")
 	}
 	closeonexec(kq)
+	r, w, errno := nonblockingPipe()
+	if errno != 0 {
+		println("runtime: pipe failed with", -errno)
+		throw("runtime: pipe failed")
+	}
+	ev := keventt{
+		filter: _EVFILT_READ,
+		flags:  _EV_ADD,
+	}
+	*(*uintptr)(unsafe.Pointer(&ev.ident)) = uintptr(r)
+	n := kevent(kq, &ev, 1, nil, 0, nil)
+	if n < 0 {
+		println("runtime: kevent failed with", -n)
+		throw("runtime: kevent failed")
+	}
+	netpollBreakRd = uintptr(r)
+	netpollBreakWr = uintptr(w)
 }
 
-func netpolldescriptor() uintptr {
-	return uintptr(kq)
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return fd == uintptr(kq) || fd == netpollBreakRd || fd == netpollBreakWr
 }
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -57,15 +81,45 @@
 	throw("runtime: unused")
 }
 
-// Polls for ready network connections.
+// netpollBreak interrupts a kevent.
+func netpollBreak() {
+	if atomic.Cas(&netpollWakeSig, 0, 1) {
+		for {
+			var b byte
+			n := write(netpollBreakWr, unsafe.Pointer(&b), 1)
+			if n == 1 || n == -_EAGAIN {
+				break
+			}
+			if n == -_EINTR {
+				continue
+			}
+			println("runtime: netpollBreak write failed with", -n)
+			throw("runtime: netpollBreak write failed")
+		}
+	}
+}
+
+// netpoll checks for ready network connections.
 // Returns list of goroutines that become runnable.
-func netpoll(block bool) gList {
+// delay < 0: blocks indefinitely
+// delay == 0: does not block, just polls
+// delay > 0: block for up to that many nanoseconds
+func netpoll(delay int64) gList {
 	if kq == -1 {
 		return gList{}
 	}
 	var tp *timespec
 	var ts timespec
-	if !block {
+	if delay < 0 {
+		tp = nil
+	} else if delay == 0 {
+		tp = &ts
+	} else {
+		ts.setNsec(delay)
+		if ts.tv_sec > 1e6 {
+			// Darwin returns EINVAL if the sleep time is too long.
+			ts.tv_sec = 1e6
+		}
 		tp = &ts
 	}
 	var events [64]keventt
@@ -76,11 +130,33 @@
 			println("runtime: kevent on fd", kq, "failed with", -n)
 			throw("runtime: netpoll failed")
 		}
+		// If a timed sleep was interrupted, just return to
+		// recalculate how long we should sleep now.
+		if delay > 0 {
+			return gList{}
+		}
 		goto retry
 	}
 	var toRun gList
 	for i := 0; i < int(n); i++ {
 		ev := &events[i]
+
+		if uintptr(ev.ident) == netpollBreakRd {
+			if ev.filter != _EVFILT_READ {
+				println("runtime: netpoll: break fd ready for", ev.filter)
+				throw("runtime: netpoll: break fd ready for something unexpected")
+			}
+			if delay != 0 {
+				// netpollBreak could be picked up by a
+				// nonblocking poll. Only read the byte
+				// if blocking.
+				var tmp [16]byte
+				read(int32(netpollBreakRd), noescape(unsafe.Pointer(&tmp[0])), int32(len(tmp)))
+				atomic.Store(&netpollWakeSig, 0)
+			}
+			continue
+		}
+
 		var mode int32
 		switch ev.filter {
 		case _EVFILT_READ:
@@ -110,8 +186,5 @@
 			netpollready(&toRun, pd, mode)
 		}
 	}
-	if block && toRun.empty() {
-		goto retry
-	}
 	return toRun
 }
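
Across the epoll, kqueue, and event-port backends, the old block bool parameter becomes a single delay in nanoseconds with a three-way convention: negative blocks indefinitely, zero polls without blocking, and a positive value sleeps for at most that long, capped so Darwin's kevent never sees an over-long timespec. A minimal sketch of that conversion follows; the struct and field names are illustrative, not the runtime's internal timespec.

package main

import "fmt"

// timespec stands in for the C struct passed to kevent.
type timespec struct {
	sec  int64
	nsec int64
}

// timeoutFor reproduces the delay convention used by netpoll: nil means
// block forever, a zero timespec means poll, and a positive delay is
// split into seconds and nanoseconds and capped.
func timeoutFor(delay int64) *timespec {
	if delay < 0 {
		return nil // block indefinitely
	}
	ts := &timespec{}
	if delay == 0 {
		return ts // non-blocking poll
	}
	ts.sec = delay / 1e9
	ts.nsec = delay % 1e9
	if ts.sec > 1e6 {
		// Mirror the runtime's cap: Darwin rejects very long sleeps.
		ts.sec = 1e6
	}
	return ts
}

func main() {
	for _, d := range []int64{-1, 0, 2500, 3e18} {
		fmt.Printf("%d -> %+v\n", d, timeoutFor(d))
	}
}
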
diff --git a/src/runtime/netpoll_os_test.go b/src/runtime/netpoll_os_test.go
new file mode 100644
index 0000000..b96b9f3
--- /dev/null
+++ b/src/runtime/netpoll_os_test.go
@@ -0,0 +1,28 @@
+package runtime_test
+
+import (
+	"runtime"
+	"sync"
+	"testing"
+)
+
+var wg sync.WaitGroup
+
+func init() {
+	runtime.NetpollGenericInit()
+}
+
+func BenchmarkNetpollBreak(b *testing.B) {
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		for j := 0; j < 10; j++ {
+			wg.Add(1)
+			go func() {
+				runtime.NetpollBreak()
+				wg.Done()
+			}()
+		}
+	}
+	wg.Wait()
+	b.StopTimer()
+}
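
The new benchmark hammers netpollBreak from batches of goroutines to measure the cost of waking the poller; NetpollGenericInit and NetpollBreak are presumably test-only exports of the corresponding runtime internals. Assuming a checked-out Go tree, it can be run with something like:

	go test -run=NONE -bench=NetpollBreak runtime
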
diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go
index ddddb27..d217d5b 100644
--- a/src/runtime/netpoll_solaris.go
+++ b/src/runtime/netpoll_solaris.go
@@ -4,7 +4,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Solaris runtime-integrated network poller.
 //
@@ -71,25 +74,29 @@
 //go:cgo_import_dynamic libc_port_associate port_associate "libc.so"
 //go:cgo_import_dynamic libc_port_dissociate port_dissociate "libc.so"
 //go:cgo_import_dynamic libc_port_getn port_getn "libc.so"
+//go:cgo_import_dynamic libc_port_alert port_alert "libc.so"
 
 //go:linkname libc_port_create libc_port_create
 //go:linkname libc_port_associate libc_port_associate
 //go:linkname libc_port_dissociate libc_port_dissociate
 //go:linkname libc_port_getn libc_port_getn
+//go:linkname libc_port_alert libc_port_alert
 
 var (
 	libc_port_create,
 	libc_port_associate,
 	libc_port_dissociate,
-	libc_port_getn libcFunc
+	libc_port_getn,
+	libc_port_alert libcFunc
+	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
 )
 
 func errno() int32 {
 	return *getg().m.perrno
 }
 
-func fcntl(fd, cmd int32, arg uintptr) int32 {
-	return int32(sysvicall3(&libc_fcntl, uintptr(fd), uintptr(cmd), arg))
+func fcntl(fd, cmd, arg int32) int32 {
+	return int32(sysvicall3(&libc_fcntl, uintptr(fd), uintptr(cmd), uintptr(arg)))
 }
 
 func port_create() int32 {
@@ -108,6 +115,10 @@
 	return int32(sysvicall5(&libc_port_getn, uintptr(port), uintptr(unsafe.Pointer(evs)), uintptr(max), uintptr(unsafe.Pointer(nget)), uintptr(unsafe.Pointer(timeout))))
 }
 
+func port_alert(port int32, flags, events uint32, user uintptr) int32 {
+	return int32(sysvicall4(&libc_port_alert, uintptr(port), uintptr(flags), uintptr(events), user))
+}
+
 var portfd int32 = -1
 
 func netpollinit() {
@@ -121,8 +132,8 @@
 	throw("runtime: netpollinit failed")
 }
 
-func netpolldescriptor() uintptr {
-	return uintptr(portfd)
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return fd == uintptr(portfd)
 }
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -178,27 +189,70 @@
 	unlock(&pd.lock)
 }
 
-// polls for ready network connections
-// returns list of goroutines that become runnable
-func netpoll(block bool) gList {
+// netpollBreak interrupts a port_getn wait.
+func netpollBreak() {
+	if atomic.Cas(&netpollWakeSig, 0, 1) {
+		// Use port_alert to put portfd into alert mode.
+		// This will wake up all threads sleeping in port_getn on portfd,
+		// and cause their calls to port_getn to return immediately.
+		// Further, until portfd is taken out of alert mode,
+		// all calls to port_getn will return immediately.
+		if port_alert(portfd, _PORT_ALERT_UPDATE, _POLLHUP, uintptr(unsafe.Pointer(&portfd))) < 0 {
+			if e := errno(); e != _EBUSY {
+				println("runtime: port_alert failed with", e)
+				throw("runtime: netpoll: port_alert failed")
+			}
+		}
+	}
+}
+
+// netpoll checks for ready network connections.
+// Returns list of goroutines that become runnable.
+// delay < 0: blocks indefinitely
+// delay == 0: does not block, just polls
+// delay > 0: block for up to that many nanoseconds
+func netpoll(delay int64) gList {
 	if portfd == -1 {
 		return gList{}
 	}
 
 	var wait *timespec
-	var zero timespec
-	if !block {
-		wait = &zero
+	var ts timespec
+	if delay < 0 {
+		wait = nil
+	} else if delay == 0 {
+		wait = &ts
+	} else {
+		ts.setNsec(delay)
+		if ts.tv_sec > 1e6 {
+			// An arbitrary cap on how long to wait for a timer.
+			// 1e6 s == ~11.5 days.
+			ts.tv_sec = 1e6
+		}
+		wait = &ts
 	}
 
 	var events [128]portevent
 retry:
 	var n uint32 = 1
-	if port_getn(portfd, &events[0], uint32(len(events)), &n, wait) < 0 {
-		if e := errno(); e != _EINTR {
+	r := port_getn(portfd, &events[0], uint32(len(events)), &n, wait)
+	e := errno()
+	if r < 0 && e == _ETIME && n > 0 {
+		// As per port_getn(3C), an ETIME failure does not preclude the
+		// delivery of some number of events.  Treat a timeout failure
+		// with delivered events as a success.
+		r = 0
+	}
+	if r < 0 {
+		if e != _EINTR && e != _ETIME {
 			print("runtime: port_getn on fd ", portfd, " failed (errno=", e, ")\n")
 			throw("runtime: netpoll failed")
 		}
+		// If a timed sleep was interrupted and there are no events,
+		// just return to recalculate how long we should sleep now.
+		if delay > 0 {
+			return gList{}
+		}
 		goto retry
 	}
 
@@ -206,6 +260,25 @@
 	for i := 0; i < int(n); i++ {
 		ev := &events[i]
 
+		if ev.portev_source == _PORT_SOURCE_ALERT {
+			if ev.portev_events != _POLLHUP || unsafe.Pointer(ev.portev_user) != unsafe.Pointer(&portfd) {
+				throw("runtime: netpoll: bad port_alert wakeup")
+			}
+			if delay != 0 {
+				// Now that a blocking call to netpoll
+				// has seen the alert, take portfd
+				// back out of alert mode.
+				// See the comment in netpollBreak.
+				if port_alert(portfd, 0, 0, 0) < 0 {
+					e := errno()
+					println("runtime: port_alert failed with", e)
+					throw("runtime: netpoll: port_alert failed")
+				}
+				atomic.Store(&netpollWakeSig, 0)
+			}
+			continue
+		}
+
 		if ev.portev_events == 0 {
 			continue
 		}
@@ -242,8 +315,5 @@
 		}
 	}
 
-	if block && toRun.empty() {
-		goto retry
-	}
 	return toRun
 }
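
On Solaris the wakeup primitive is port_alert rather than a pipe: alert mode makes every port_getn return immediately until the blocking poller takes the port back out of alert mode. All the blocking backends guard their primitive with netpollWakeSig, a compare-and-swap from 0 to 1 that ensures at most one wakeup is issued between two acknowledgements by the poller. A small user-level sketch of that deduplication, assuming nothing beyond sync/atomic:

package main

import (
	"fmt"
	"sync/atomic"
)

// wakeSig plays the role of netpollWakeSig: it collapses any number of
// break requests into a single wakeup until the poller acknowledges it.
var wakeSig uint32

var wakeups int

// requestBreak stands in for netpollBreak: only the first caller after an
// acknowledgement performs the expensive wakeup (write to the pipe,
// port_alert, or PostQueuedCompletionStatus in the real backends).
func requestBreak() {
	if atomic.CompareAndSwapUint32(&wakeSig, 0, 1) {
		wakeups++
	}
}

// acknowledge stands in for the blocking poller consuming the wakeup.
func acknowledge() {
	atomic.StoreUint32(&wakeSig, 0)
}

func main() {
	requestBreak()
	requestBreak() // coalesced: no second wakeup
	acknowledge()
	requestBreak() // new wakeup after the poller drained the previous one
	fmt.Println("wakeups:", wakeups) // prints 2
}
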
diff --git a/src/runtime/netpoll_stub.go b/src/runtime/netpoll_stub.go
index f585333..f86f2f6 100644
--- a/src/runtime/netpoll_stub.go
+++ b/src/runtime/netpoll_stub.go
@@ -6,16 +6,53 @@
 
 package runtime
 
+import "runtime/internal/atomic"
+
+var netpollInited uint32
 var netpollWaiters uint32
 
+var netpollStubLock mutex
+var netpollNote note
+
+// netpollBroken, protected by netpollBrokenLock, avoids a double notewakeup.
+var netpollBrokenLock mutex
+var netpollBroken bool
+
+func netpollGenericInit() {
+	atomic.Store(&netpollInited, 1)
+}
+
+func netpollBreak() {
+	lock(&netpollBrokenLock)
+	broken := netpollBroken
+	netpollBroken = true
+	if !broken {
+		notewakeup(&netpollNote)
+	}
+	unlock(&netpollBrokenLock)
+}
+
 // Polls for ready network connections.
 // Returns list of goroutines that become runnable.
-func netpoll(block bool) gList {
+func netpoll(delay int64) gList {
 	// Implementation for platforms that do not support
 	// integrated network poller.
+	if delay != 0 {
+		// This lock ensures that only one goroutine tries to use
+		// the note. It should normally be completely uncontended.
+		lock(&netpollStubLock)
+
+		lock(&netpollBrokenLock)
+		noteclear(&netpollNote)
+		netpollBroken = false
+		unlock(&netpollBrokenLock)
+
+		notetsleep(&netpollNote, delay)
+		unlock(&netpollStubLock)
+	}
 	return gList{}
 }
 
 func netpollinited() bool {
-	return false
+	return atomic.Load(&netpollInited) != 0
 }
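
On platforms without an integrated poller the stub now still honors the delay parameter: a timed note sleep stands in for the real poll, and netpollBreak wakes it at most once per sleep, with the netpollBroken flag preventing a double notewakeup. The channel-based analogue below sketches the same behavior in ordinary Go; it omits the block-forever case (delay < 0) for brevity.

package main

import (
	"fmt"
	"time"
)

// stubPoller mimics the stub netpoll: a timed wait on a buffered wake
// channel stands in for notetsleep, and break requests are coalesced by
// the channel's capacity of one, like the netpollBroken flag.
type stubPoller struct {
	wake chan struct{}
}

func newStubPoller() *stubPoller { return &stubPoller{wake: make(chan struct{}, 1)} }

// breakWait wakes a sleeping poll; extra calls before the sleeper drains
// the channel are dropped.
func (p *stubPoller) breakWait() {
	select {
	case p.wake <- struct{}{}:
	default:
	}
}

// poll sleeps for up to delay nanoseconds (returning immediately when
// delay == 0), or earlier if breakWait is called.
func (p *stubPoller) poll(delay int64) {
	if delay == 0 {
		return
	}
	select {
	case <-p.wake:
	case <-time.After(time.Duration(delay)):
	}
}

func main() {
	p := newStubPoller()
	go func() {
		time.Sleep(10 * time.Millisecond)
		p.breakWait()
	}()
	start := time.Now()
	p.poll(int64(time.Second))
	fmt.Println("woke after", time.Since(start))
}
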
diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go
index 07ef15c..4c1cd26 100644
--- a/src/runtime/netpoll_windows.go
+++ b/src/runtime/netpoll_windows.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -31,7 +32,11 @@
 	qty      uint32
 }
 
-var iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle
+var (
+	iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle
+
+	netpollWakeSig uint32 // used to avoid duplicate calls of netpollBreak
+)
 
 func netpollinit() {
 	iocphandle = stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX)
@@ -41,8 +46,8 @@
 	}
 }
 
-func netpolldescriptor() uintptr {
-	return iocphandle
+func netpollIsPollDescriptor(fd uintptr) bool {
+	return fd == iocphandle
 }
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -61,11 +66,23 @@
 	throw("runtime: unused")
 }
 
-// Polls for completed network IO.
+func netpollBreak() {
+	if atomic.Cas(&netpollWakeSig, 0, 1) {
+		if stdcall4(_PostQueuedCompletionStatus, iocphandle, 0, 0, 0) == 0 {
+			println("runtime: netpoll: PostQueuedCompletionStatus failed (errno=", getlasterror(), ")")
+			throw("runtime: netpoll: PostQueuedCompletionStatus failed")
+		}
+	}
+}
+
+// netpoll checks for ready network connections.
 // Returns list of goroutines that become runnable.
-func netpoll(block bool) gList {
+// delay < 0: blocks indefinitely
+// delay == 0: does not block, just polls
+// delay > 0: block for up to that many nanoseconds
+func netpoll(delay int64) gList {
 	var entries [64]overlappedEntry
-	var wait, qty, key, flags, n, i uint32
+	var wait, qty, flags, n, i uint32
 	var errno int32
 	var op *net_op
 	var toRun gList
@@ -75,74 +92,62 @@
 	if iocphandle == _INVALID_HANDLE_VALUE {
 		return gList{}
 	}
-	wait = 0
-	if block {
+	if delay < 0 {
 		wait = _INFINITE
+	} else if delay == 0 {
+		wait = 0
+	} else if delay < 1e6 {
+		wait = 1
+	} else if delay < 1e15 {
+		wait = uint32(delay / 1e6)
+	} else {
+		// An arbitrary cap on how long to wait for a timer.
+		// 1e9 ms == ~11.5 days.
+		wait = 1e9
 	}
-retry:
-	if _GetQueuedCompletionStatusEx != nil {
-		n = uint32(len(entries) / int(gomaxprocs))
-		if n < 8 {
-			n = 8
-		}
-		if block {
-			mp.blocked = true
-		}
-		if stdcall6(_GetQueuedCompletionStatusEx, iocphandle, uintptr(unsafe.Pointer(&entries[0])), uintptr(n), uintptr(unsafe.Pointer(&n)), uintptr(wait), 0) == 0 {
-			mp.blocked = false
-			errno = int32(getlasterror())
-			if !block && errno == _WAIT_TIMEOUT {
-				return gList{}
-			}
-			println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
-			throw("runtime: netpoll failed")
-		}
+
+	n = uint32(len(entries) / int(gomaxprocs))
+	if n < 8 {
+		n = 8
+	}
+	if delay != 0 {
+		mp.blocked = true
+	}
+	if stdcall6(_GetQueuedCompletionStatusEx, iocphandle, uintptr(unsafe.Pointer(&entries[0])), uintptr(n), uintptr(unsafe.Pointer(&n)), uintptr(wait), 0) == 0 {
 		mp.blocked = false
-		for i = 0; i < n; i++ {
-			op = entries[i].op
+		errno = int32(getlasterror())
+		if errno == _WAIT_TIMEOUT {
+			return gList{}
+		}
+		println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
+		throw("runtime: netpoll failed")
+	}
+	mp.blocked = false
+	for i = 0; i < n; i++ {
+		op = entries[i].op
+		if op != nil {
 			errno = 0
 			qty = 0
 			if stdcall5(_WSAGetOverlappedResult, op.pd.fd, uintptr(unsafe.Pointer(op)), uintptr(unsafe.Pointer(&qty)), 0, uintptr(unsafe.Pointer(&flags))) == 0 {
 				errno = int32(getlasterror())
 			}
 			handlecompletion(&toRun, op, errno, qty)
-		}
-	} else {
-		op = nil
-		errno = 0
-		qty = 0
-		if block {
-			mp.blocked = true
-		}
-		if stdcall5(_GetQueuedCompletionStatus, iocphandle, uintptr(unsafe.Pointer(&qty)), uintptr(unsafe.Pointer(&key)), uintptr(unsafe.Pointer(&op)), uintptr(wait)) == 0 {
-			mp.blocked = false
-			errno = int32(getlasterror())
-			if !block && errno == _WAIT_TIMEOUT {
-				return gList{}
+		} else {
+			atomic.Store(&netpollWakeSig, 0)
+			if delay == 0 {
+				// Forward the notification to the
+				// blocked poller.
+				netpollBreak()
 			}
-			if op == nil {
-				println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")")
-				throw("runtime: netpoll failed")
-			}
-			// dequeued failed IO packet, so report that
 		}
-		mp.blocked = false
-		handlecompletion(&toRun, op, errno, qty)
-	}
-	if block && toRun.empty() {
-		goto retry
 	}
 	return toRun
 }
 
 func handlecompletion(toRun *gList, op *net_op, errno int32, qty uint32) {
-	if op == nil {
-		println("runtime: GetQueuedCompletionStatus returned op == nil")
-		throw("runtime: netpoll failed")
-	}
 	mode := op.mode
 	if mode != 'r' && mode != 'w' {
-		println("runtime: GetQueuedCompletionStatus returned invalid mode=", mode)
+		println("runtime: GetQueuedCompletionStatusEx returned invalid mode=", mode)
 		throw("runtime: netpoll failed")
 	}
 	op.errno = errno
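
The Windows poller drops the GetQueuedCompletionStatus fallback, always uses GetQueuedCompletionStatusEx, and converts the nanosecond delay into the millisecond timeout that API expects; a dequeued entry with a nil op is the wakeup posted by netpollBreak. A sketch of the timeout conversion, with _INFINITE declared locally for illustration:

package main

import "fmt"

const _INFINITE = 0xFFFFFFFF // Windows INFINITE timeout

// waitMilliseconds mirrors the conversion in the Windows netpoll: negative
// means wait forever, sub-millisecond delays round up to 1ms so they still
// block, and very large delays are capped.
func waitMilliseconds(delay int64) uint32 {
	switch {
	case delay < 0:
		return _INFINITE
	case delay == 0:
		return 0
	case delay < 1e6: // less than one millisecond
		return 1
	case delay < 1e15:
		return uint32(delay / 1e6)
	default:
		return 1e9 // ~11.5 days, an arbitrary cap
	}
}

func main() {
	for _, d := range []int64{-1, 0, 500, 2_000_000, 1e16} {
		fmt.Println(d, "->", waitMilliseconds(d))
	}
}
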
diff --git a/src/runtime/os2_aix.go b/src/runtime/os2_aix.go
index 162d93e..31ac6dd 100644
--- a/src/runtime/os2_aix.go
+++ b/src/runtime/os2_aix.go
@@ -6,7 +6,7 @@
 // Pollset syscalls are in netpoll_aix.go.
 // The implementation is based on Solaris and Windows.
 // Each syscall is made by calling its libc symbol using asmcgocall and asmsyscall6
-// asssembly functions.
+// assembly functions.
 
 package runtime
 
@@ -38,6 +38,7 @@
 //go:cgo_import_dynamic libc_madvise madvise "libc.a/shr_64.o"
 //go:cgo_import_dynamic libc_malloc malloc "libc.a/shr_64.o"
 //go:cgo_import_dynamic libc_mmap mmap "libc.a/shr_64.o"
+//go:cgo_import_dynamic libc_mprotect mprotect "libc.a/shr_64.o"
 //go:cgo_import_dynamic libc_munmap munmap "libc.a/shr_64.o"
 //go:cgo_import_dynamic libc_open open "libc.a/shr_64.o"
 //go:cgo_import_dynamic libc_pipe pipe "libc.a/shr_64.o"
@@ -64,6 +65,8 @@
 //go:cgo_import_dynamic libpthread_attr_setstackaddr pthread_attr_setstackaddr "libpthread.a/shr_xpg5_64.o"
 //go:cgo_import_dynamic libpthread_create pthread_create "libpthread.a/shr_xpg5_64.o"
 //go:cgo_import_dynamic libpthread_sigthreadmask sigthreadmask "libpthread.a/shr_xpg5_64.o"
+//go:cgo_import_dynamic libpthread_self pthread_self "libpthread.a/shr_xpg5_64.o"
+//go:cgo_import_dynamic libpthread_kill pthread_kill "libpthread.a/shr_xpg5_64.o"
 
 //go:linkname libc__Errno libc__Errno
 //go:linkname libc_clock_gettime libc_clock_gettime
@@ -75,6 +78,7 @@
 //go:linkname libc_madvise libc_madvise
 //go:linkname libc_malloc libc_malloc
 //go:linkname libc_mmap libc_mmap
+//go:linkname libc_mprotect libc_mprotect
 //go:linkname libc_munmap libc_munmap
 //go:linkname libc_open libc_open
 //go:linkname libc_pipe libc_pipe
@@ -101,6 +105,8 @@
 //go:linkname libpthread_attr_setstackaddr libpthread_attr_setstackaddr
 //go:linkname libpthread_create libpthread_create
 //go:linkname libpthread_sigthreadmask libpthread_sigthreadmask
+//go:linkname libpthread_self libpthread_self
+//go:linkname libpthread_kill libpthread_kill
 
 var (
 	//libc
@@ -114,6 +120,7 @@
 	libc_madvise,
 	libc_malloc,
 	libc_mmap,
+	libc_mprotect,
 	libc_munmap,
 	libc_open,
 	libc_pipe,
@@ -139,7 +146,9 @@
 	libpthread_attr_setdetachstate,
 	libpthread_attr_setstackaddr,
 	libpthread_create,
-	libpthread_sigthreadmask libFunc
+	libpthread_sigthreadmask,
+	libpthread_self,
+	libpthread_kill libFunc
 )
 
 type libFunc uintptr
@@ -390,25 +399,32 @@
 	exit1(code)
 }
 
-func write1(fd, p uintptr, n int32) int32
+func write2(fd, p uintptr, n int32) int32
 
 //go:nosplit
-func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
+func write1(fd uintptr, p unsafe.Pointer, n int32) int32 {
 	_g_ := getg()
 
 	// Check the validity of g because without a g during
 	// newosproc0.
 	if _g_ != nil {
-		r, _ := syscall3(&libc_write, uintptr(fd), uintptr(p), uintptr(n))
+		r, errno := syscall3(&libc_write, uintptr(fd), uintptr(p), uintptr(n))
+		if int32(r) < 0 {
+			return -int32(errno)
+		}
 		return int32(r)
 	}
-	return write1(fd, uintptr(p), n)
+	// Note that in this case we can't return a valid errno value.
+	return write2(fd, uintptr(p), n)
 
 }
 
 //go:nosplit
 func read(fd int32, p unsafe.Pointer, n int32) int32 {
-	r, _ := syscall3(&libc_read, uintptr(fd), uintptr(p), uintptr(n))
+	r, errno := syscall3(&libc_read, uintptr(fd), uintptr(p), uintptr(n))
+	if int32(r) < 0 {
+		return -int32(errno)
+	}
 	return int32(r)
 }
 
@@ -425,9 +441,10 @@
 }
 
 //go:nosplit
-func pipe(fd *int32) int32 {
-	r, _ := syscall1(&libc_pipe, uintptr(unsafe.Pointer(fd)))
-	return int32(r)
+func pipe() (r, w int32, errno int32) {
+	var p [2]int32
+	_, err := syscall1(&libc_pipe, uintptr(noescape(unsafe.Pointer(&p[0]))))
+	return p[0], p[1], int32(err)
 }
 
 // mmap calls the mmap system call.
@@ -445,6 +462,15 @@
 }
 
 //go:nosplit
+func mprotect(addr unsafe.Pointer, n uintptr, prot int32) (unsafe.Pointer, int) {
+	r, err0 := syscall3(&libc_mprotect, uintptr(addr), uintptr(n), uintptr(prot))
+	if r == ^uintptr(0) {
+		return nil, int(err0)
+	}
+	return unsafe.Pointer(r), int(err0)
+}
+
+//go:nosplit
 func munmap(addr unsafe.Pointer, n uintptr) {
 	r, err := syscall2(&libc_munmap, uintptr(addr), uintptr(n))
 	if int32(r) == -1 {
@@ -716,3 +742,14 @@
 	sigprocmask1(uintptr(how), uintptr(unsafe.Pointer(new)), uintptr(unsafe.Pointer(old)))
 
 }
+
+//go:nosplit
+func pthread_self() pthread {
+	r, _ := syscall0(&libpthread_self)
+	return pthread(r)
+}
+
+//go:nosplit
+func signalM(mp *m, sig int) {
+	syscall2(&libpthread_kill, uintptr(pthread(mp.procid)), uintptr(sig))
+}
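
The AIX read and write wrappers now return the negated errno on failure (the raw syscall keeps the name write2, with write1 layered on top), which lets callers such as netpollBreak distinguish a harmless EAGAIN from a genuine error without a second errno fetch. A sketch of that caller-side decoding, using the standard syscall errno values purely for illustration:

package main

import (
	"fmt"
	"syscall"
)

// handleWriteResult interprets the convention adopted in this patch:
// a non-negative result is a byte count, a negative result is -errno.
func handleWriteResult(n int32) error {
	switch {
	case n >= 0:
		return nil // n bytes written
	case n == -int32(syscall.EAGAIN):
		return nil // pipe already full: a wakeup is pending, nothing to do
	case n == -int32(syscall.EINTR):
		return fmt.Errorf("interrupted, retry")
	default:
		return fmt.Errorf("write failed with errno %d", -n)
	}
}

func main() {
	fmt.Println(handleWriteResult(1))
	fmt.Println(handleWriteResult(-int32(syscall.EAGAIN)))
	fmt.Println(handleWriteResult(-int32(syscall.EIO)))
}
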
diff --git a/src/runtime/os2_nacl.go b/src/runtime/os2_nacl.go
deleted file mode 100644
index b84cb18..0000000
--- a/src/runtime/os2_nacl.go
+++ /dev/null
@@ -1,155 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_NSIG    = 32
-	_SI_USER = 1
-
-	// native_client/src/trusted/service_runtime/include/sys/errno.h
-	// The errors are mainly copied from Linux.
-	_EPERM   = 1  /* Operation not permitted */
-	_ENOENT  = 2  /* No such file or directory */
-	_ESRCH   = 3  /* No such process */
-	_EINTR   = 4  /* Interrupted system call */
-	_EIO     = 5  /* I/O error */
-	_ENXIO   = 6  /* No such device or address */
-	_E2BIG   = 7  /* Argument list too long */
-	_ENOEXEC = 8  /* Exec format error */
-	_EBADF   = 9  /* Bad file number */
-	_ECHILD  = 10 /* No child processes */
-	_EAGAIN  = 11 /* Try again */
-	// _ENOMEM is defined in mem_bsd.go for nacl.
-	// _ENOMEM          = 12       /* Out of memory */
-	_EACCES          = 13       /* Permission denied */
-	_EFAULT          = 14       /* Bad address */
-	_EBUSY           = 16       /* Device or resource busy */
-	_EEXIST          = 17       /* File exists */
-	_EXDEV           = 18       /* Cross-device link */
-	_ENODEV          = 19       /* No such device */
-	_ENOTDIR         = 20       /* Not a directory */
-	_EISDIR          = 21       /* Is a directory */
-	_EINVAL          = 22       /* Invalid argument */
-	_ENFILE          = 23       /* File table overflow */
-	_EMFILE          = 24       /* Too many open files */
-	_ENOTTY          = 25       /* Not a typewriter */
-	_EFBIG           = 27       /* File too large */
-	_ENOSPC          = 28       /* No space left on device */
-	_ESPIPE          = 29       /* Illegal seek */
-	_EROFS           = 30       /* Read-only file system */
-	_EMLINK          = 31       /* Too many links */
-	_EPIPE           = 32       /* Broken pipe */
-	_ENAMETOOLONG    = 36       /* File name too long */
-	_ENOSYS          = 38       /* Function not implemented */
-	_EDQUOT          = 122      /* Quota exceeded */
-	_EDOM            = 33       /* Math arg out of domain of func */
-	_ERANGE          = 34       /* Math result not representable */
-	_EDEADLK         = 35       /* Deadlock condition */
-	_ENOLCK          = 37       /* No record locks available */
-	_ENOTEMPTY       = 39       /* Directory not empty */
-	_ELOOP           = 40       /* Too many symbolic links */
-	_ENOMSG          = 42       /* No message of desired type */
-	_EIDRM           = 43       /* Identifier removed */
-	_ECHRNG          = 44       /* Channel number out of range */
-	_EL2NSYNC        = 45       /* Level 2 not synchronized */
-	_EL3HLT          = 46       /* Level 3 halted */
-	_EL3RST          = 47       /* Level 3 reset */
-	_ELNRNG          = 48       /* Link number out of range */
-	_EUNATCH         = 49       /* Protocol driver not attached */
-	_ENOCSI          = 50       /* No CSI structure available */
-	_EL2HLT          = 51       /* Level 2 halted */
-	_EBADE           = 52       /* Invalid exchange */
-	_EBADR           = 53       /* Invalid request descriptor */
-	_EXFULL          = 54       /* Exchange full */
-	_ENOANO          = 55       /* No anode */
-	_EBADRQC         = 56       /* Invalid request code */
-	_EBADSLT         = 57       /* Invalid slot */
-	_EDEADLOCK       = _EDEADLK /* File locking deadlock error */
-	_EBFONT          = 59       /* Bad font file fmt */
-	_ENOSTR          = 60       /* Device not a stream */
-	_ENODATA         = 61       /* No data (for no delay io) */
-	_ETIME           = 62       /* Timer expired */
-	_ENOSR           = 63       /* Out of streams resources */
-	_ENONET          = 64       /* Machine is not on the network */
-	_ENOPKG          = 65       /* Package not installed */
-	_EREMOTE         = 66       /* The object is remote */
-	_ENOLINK         = 67       /* The link has been severed */
-	_EADV            = 68       /* Advertise error */
-	_ESRMNT          = 69       /* Srmount error */
-	_ECOMM           = 70       /* Communication error on send */
-	_EPROTO          = 71       /* Protocol error */
-	_EMULTIHOP       = 72       /* Multihop attempted */
-	_EDOTDOT         = 73       /* Cross mount point (not really error) */
-	_EBADMSG         = 74       /* Trying to read unreadable message */
-	_EOVERFLOW       = 75       /* Value too large for defined data type */
-	_ENOTUNIQ        = 76       /* Given log. name not unique */
-	_EBADFD          = 77       /* f.d. invalid for this operation */
-	_EREMCHG         = 78       /* Remote address changed */
-	_ELIBACC         = 79       /* Can't access a needed shared lib */
-	_ELIBBAD         = 80       /* Accessing a corrupted shared lib */
-	_ELIBSCN         = 81       /* .lib section in a.out corrupted */
-	_ELIBMAX         = 82       /* Attempting to link in too many libs */
-	_ELIBEXEC        = 83       /* Attempting to exec a shared library */
-	_EILSEQ          = 84
-	_EUSERS          = 87
-	_ENOTSOCK        = 88  /* Socket operation on non-socket */
-	_EDESTADDRREQ    = 89  /* Destination address required */
-	_EMSGSIZE        = 90  /* Message too long */
-	_EPROTOTYPE      = 91  /* Protocol wrong type for socket */
-	_ENOPROTOOPT     = 92  /* Protocol not available */
-	_EPROTONOSUPPORT = 93  /* Unknown protocol */
-	_ESOCKTNOSUPPORT = 94  /* Socket type not supported */
-	_EOPNOTSUPP      = 95  /* Operation not supported on transport endpoint */
-	_EPFNOSUPPORT    = 96  /* Protocol family not supported */
-	_EAFNOSUPPORT    = 97  /* Address family not supported by protocol family */
-	_EADDRINUSE      = 98  /* Address already in use */
-	_EADDRNOTAVAIL   = 99  /* Address not available */
-	_ENETDOWN        = 100 /* Network interface is not configured */
-	_ENETUNREACH     = 101 /* Network is unreachable */
-	_ENETRESET       = 102
-	_ECONNABORTED    = 103 /* Connection aborted */
-	_ECONNRESET      = 104 /* Connection reset by peer */
-	_ENOBUFS         = 105 /* No buffer space available */
-	_EISCONN         = 106 /* Socket is already connected */
-	_ENOTCONN        = 107 /* Socket is not connected */
-	_ESHUTDOWN       = 108 /* Can't send after socket shutdown */
-	_ETOOMANYREFS    = 109
-	_ETIMEDOUT       = 110 /* Connection timed out */
-	_ECONNREFUSED    = 111 /* Connection refused */
-	_EHOSTDOWN       = 112 /* Host is down */
-	_EHOSTUNREACH    = 113 /* Host is unreachable */
-	_EALREADY        = 114 /* Socket already connected */
-	_EINPROGRESS     = 115 /* Connection already in progress */
-	_ESTALE          = 116
-	_ENOTSUP         = _EOPNOTSUPP /* Not supported */
-	_ENOMEDIUM       = 123         /* No medium (in tape drive) */
-	_ECANCELED       = 125         /* Operation canceled. */
-	_ELBIN           = 2048        /* Inode is remote (not really error) */
-	_EFTYPE          = 2049        /* Inappropriate file type or format */
-	_ENMFILE         = 2050        /* No more files */
-	_EPROCLIM        = 2051
-	_ENOSHARE        = 2052    /* No such host or network path */
-	_ECASECLASH      = 2053    /* Filename exists with different case */
-	_EWOULDBLOCK     = _EAGAIN /* Operation would block */
-
-	// native_client/src/trusted/service_runtime/include/bits/mman.h.
-	// NOTE: DO NOT USE native_client/src/shared/imc/nacl_imc_c.h.
-	// Those MAP_*values are different from these.
-	_PROT_NONE  = 0x0
-	_PROT_READ  = 0x1
-	_PROT_WRITE = 0x2
-	_PROT_EXEC  = 0x4
-
-	_MAP_SHARED  = 0x1
-	_MAP_PRIVATE = 0x2
-	_MAP_FIXED   = 0x10
-	_MAP_ANON    = 0x20
-
-	_MADV_FREE  = 0
-	_SIGFPE     = 8
-	_FPE_INTDIV = 0
-)
-
-type siginfo struct{}
diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go
index b5a11e8..d6e36fb 100644
--- a/src/runtime/os3_solaris.go
+++ b/src/runtime/os3_solaris.go
@@ -29,6 +29,8 @@
 //go:cgo_import_dynamic libc_pthread_attr_setdetachstate pthread_attr_setdetachstate "libc.so"
 //go:cgo_import_dynamic libc_pthread_attr_setstack pthread_attr_setstack "libc.so"
 //go:cgo_import_dynamic libc_pthread_create pthread_create "libc.so"
+//go:cgo_import_dynamic libc_pthread_self pthread_self "libc.so"
+//go:cgo_import_dynamic libc_pthread_kill pthread_kill "libc.so"
 //go:cgo_import_dynamic libc_raise raise "libc.so"
 //go:cgo_import_dynamic libc_read read "libc.so"
 //go:cgo_import_dynamic libc_select select "libc.so"
@@ -44,6 +46,8 @@
 //go:cgo_import_dynamic libc_sysconf sysconf "libc.so"
 //go:cgo_import_dynamic libc_usleep usleep "libc.so"
 //go:cgo_import_dynamic libc_write write "libc.so"
+//go:cgo_import_dynamic libc_pipe pipe "libc.so"
+//go:cgo_import_dynamic libc_pipe2 pipe2 "libc.so"
 
 //go:linkname libc____errno libc____errno
 //go:linkname libc_clock_gettime libc_clock_gettime
@@ -61,6 +65,8 @@
 //go:linkname libc_pthread_attr_setdetachstate libc_pthread_attr_setdetachstate
 //go:linkname libc_pthread_attr_setstack libc_pthread_attr_setstack
 //go:linkname libc_pthread_create libc_pthread_create
+//go:linkname libc_pthread_self libc_pthread_self
+//go:linkname libc_pthread_kill libc_pthread_kill
 //go:linkname libc_raise libc_raise
 //go:linkname libc_read libc_read
 //go:linkname libc_select libc_select
@@ -76,6 +82,8 @@
 //go:linkname libc_sysconf libc_sysconf
 //go:linkname libc_usleep libc_usleep
 //go:linkname libc_write libc_write
+//go:linkname libc_pipe libc_pipe
+//go:linkname libc_pipe2 libc_pipe2
 
 var (
 	libc____errno,
@@ -94,6 +102,8 @@
 	libc_pthread_attr_setdetachstate,
 	libc_pthread_attr_setstack,
 	libc_pthread_create,
+	libc_pthread_self,
+	libc_pthread_kill,
 	libc_raise,
 	libc_read,
 	libc_sched_yield,
@@ -108,19 +118,13 @@
 	libc_sigprocmask,
 	libc_sysconf,
 	libc_usleep,
-	libc_write libcFunc
+	libc_write,
+	libc_pipe,
+	libc_pipe2 libcFunc
 )
 
 var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
 
-func getncpu() int32 {
-	n := int32(sysconf(__SC_NPROCESSORS_ONLN))
-	if n < 1 {
-		return 1
-	}
-	return n
-}
-
 func getPageSize() uintptr {
 	n := int32(sysconf(__SC_PAGESIZE))
 	if n <= 0 {
@@ -214,6 +218,8 @@
 	asmcgocall(unsafe.Pointer(funcPC(miniterrno)), unsafe.Pointer(&libc____errno))
 
 	minitSignals()
+
+	getg().m.procid = uint64(pthread_self())
 }
 
 // Called from dropm to undo the effect of an minit.
@@ -393,11 +399,16 @@
 	sysvicall2(&libc_munmap, uintptr(addr), uintptr(n))
 }
 
-func nanotime1()
+const (
+	_CLOCK_REALTIME  = 3
+	_CLOCK_MONOTONIC = 4
+)
 
 //go:nosplit
-func nanotime() int64 {
-	return int64(sysvicall0((*libcFunc)(unsafe.Pointer(funcPC(nanotime1)))))
+func nanotime1() int64 {
+	var ts mts
+	sysvicall2(&libc_clock_gettime, _CLOCK_MONOTONIC, uintptr(unsafe.Pointer(&ts)))
+	return ts.tv_sec*1e9 + ts.tv_nsec
 }
 
 //go:nosplit
@@ -429,6 +440,14 @@
 	return int32(sysvicall4(&libc_pthread_create, uintptr(unsafe.Pointer(thread)), uintptr(unsafe.Pointer(attr)), uintptr(fn), uintptr(arg)))
 }
 
+func pthread_self() pthread {
+	return pthread(sysvicall0(&libc_pthread_self))
+}
+
+func signalM(mp *m, sig int) {
+	sysvicall2(&libc_pthread_kill, uintptr(pthread(mp.procid)), uintptr(sig))
+}
+
 //go:nosplit
 //go:nowritebarrierrec
 func raise(sig uint32) /* int32 */ {
@@ -442,7 +461,11 @@
 
 //go:nosplit
 func read(fd int32, buf unsafe.Pointer, nbyte int32) int32 {
-	return int32(sysvicall3(&libc_read, uintptr(fd), uintptr(buf), uintptr(nbyte)))
+	r1, err := sysvicall3Err(&libc_read, uintptr(fd), uintptr(buf), uintptr(nbyte))
+	if c := int32(r1); c >= 0 {
+		return c
+	}
+	return -int32(err)
 }
 
 //go:nosplit
@@ -498,9 +521,44 @@
 	usleep1(µs)
 }
 
+func walltime1() (sec int64, nsec int32) {
+	var ts mts
+	sysvicall2(&libc_clock_gettime, _CLOCK_REALTIME, uintptr(unsafe.Pointer(&ts)))
+	return ts.tv_sec, int32(ts.tv_nsec)
+}
+
 //go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, nbyte int32) int32 {
-	return int32(sysvicall3(&libc_write, uintptr(fd), uintptr(buf), uintptr(nbyte)))
+func write1(fd uintptr, buf unsafe.Pointer, nbyte int32) int32 {
+	r1, err := sysvicall3Err(&libc_write, fd, uintptr(buf), uintptr(nbyte))
+	if c := int32(r1); c >= 0 {
+		return c
+	}
+	return -int32(err)
+}
+
+//go:nosplit
+func pipe() (r, w int32, errno int32) {
+	var p [2]int32
+	_, e := sysvicall1Err(&libc_pipe, uintptr(noescape(unsafe.Pointer(&p))))
+	return p[0], p[1], int32(e)
+}
+
+//go:nosplit
+func pipe2(flags int32) (r, w int32, errno int32) {
+	var p [2]int32
+	_, e := sysvicall2Err(&libc_pipe2, uintptr(noescape(unsafe.Pointer(&p))), uintptr(flags))
+	return p[0], p[1], int32(e)
+}
+
+//go:nosplit
+func closeonexec(fd int32) {
+	fcntl(fd, _F_SETFD, _FD_CLOEXEC)
+}
+
+//go:nosplit
+func setNonblock(fd int32) {
+	flags := fcntl(fd, _F_GETFL, 0)
+	fcntl(fd, _F_SETFL, flags|_O_NONBLOCK)
 }
 
 func osyield1()
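
Solaris (like AIX above) gains pipe/pipe2 wrappers plus closeonexec and setNonblock helpers so the poller's break pipe can be created non-blocking and close-on-exec. The Unix-only sketch below approximates the pipe + fcntl fallback in user space with the standard syscall package; it is not the runtime's nonblockingPipe, just the same idea.

package main

import (
	"fmt"
	"syscall"
)

// nonblockingPipe creates a pipe and marks both ends non-blocking and
// close-on-exec. Platforms with pipe2 can do this in one call; this is
// the fallback path.
func nonblockingPipe() (r, w int, err error) {
	var p [2]int
	if err := syscall.Pipe(p[:]); err != nil {
		return 0, 0, err
	}
	for _, fd := range p {
		if err := syscall.SetNonblock(fd, true); err != nil {
			return 0, 0, err
		}
		syscall.CloseOnExec(fd)
	}
	return p[0], p[1], nil
}

func main() {
	r, w, err := nonblockingPipe()
	fmt.Println(r, w, err)
}
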
diff --git a/src/runtime/os_aix.go b/src/runtime/os_aix.go
index 197869f..9a6b8ae 100644
--- a/src/runtime/os_aix.go
+++ b/src/runtime/os_aix.go
@@ -175,6 +175,7 @@
 func minit() {
 	miniterrno()
 	minitSignals()
+	getg().m.procid = uint64(pthread_self())
 }
 
 func unminit() {
@@ -323,7 +324,7 @@
 )
 
 //go:nosplit
-func nanotime() int64 {
+func nanotime1() int64 {
 	tp := &timespec{}
 	if clock_gettime(_CLOCK_REALTIME, tp) != 0 {
 		throw("syscall clock_gettime failed")
@@ -331,7 +332,7 @@
 	return tp.tv_sec*1000000000 + tp.tv_nsec
 }
 
-func walltime() (sec int64, nsec int32) {
+func walltime1() (sec int64, nsec int32) {
 	ts := &timespec{}
 	if clock_gettime(_CLOCK_REALTIME, ts) != 0 {
 		throw("syscall clock_gettime failed")
@@ -357,3 +358,20 @@
 		cpu.HWCap2 |= cpu.PPC_FEATURE2_ARCH_3_00
 	}
 }
+
+//go:nosplit
+func fcntl(fd, cmd, arg int32) int32 {
+	r, _ := syscall3(&libc_fcntl, uintptr(fd), uintptr(cmd), uintptr(arg))
+	return int32(r)
+}
+
+//go:nosplit
+func closeonexec(fd int32) {
+	fcntl(fd, _F_SETFD, _FD_CLOEXEC)
+}
+
+//go:nosplit
+func setNonblock(fd int32) {
+	flags := fcntl(fd, _F_GETFL, 0)
+	fcntl(fd, _F_SETFL, flags|_O_NONBLOCK)
+}
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 1614b66..01c40b4 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -289,20 +289,21 @@
 // Called to initialize a new m (including the bootstrap m).
 // Called on the new thread, cannot allocate memory.
 func minit() {
-	// The alternate signal stack is buggy on arm and arm64.
+	// The alternate signal stack is buggy on arm64.
 	// The signal handler handles it directly.
-	if GOARCH != "arm" && GOARCH != "arm64" {
+	if GOARCH != "arm64" {
 		minitSignalStack()
 	}
 	minitSignalMask()
+	getg().m.procid = uint64(pthread_self())
 }
 
 // Called from dropm to undo the effect of an minit.
 //go:nosplit
 func unminit() {
-	// The alternate signal stack is buggy on arm and arm64.
+	// The alternate signal stack is buggy on arm64.
 	// See minit.
-	if GOARCH != "arm" && GOARCH != "arm64" {
+	if GOARCH != "arm64" {
 		unminitSignals()
 	}
 }
@@ -406,3 +407,7 @@
 		executablePath = executablePath[len(prefix):]
 	}
 }
+
+func signalM(mp *m, sig int) {
+	pthread_kill(pthread(mp.procid), uint32(sig))
+}
diff --git a/src/runtime/os_darwin_arm.go b/src/runtime/os_darwin_arm.go
deleted file mode 100644
index ee1bd17..0000000
--- a/src/runtime/os_darwin_arm.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-func checkgoarm() {
-	// TODO(minux): FP checks like in os_linux_arm.go.
-
-	// osinit not called yet, so ncpu not set: must use getncpu directly.
-	if getncpu() > 1 && goarm < 7 {
-		print("runtime: this system has multiple CPUs and must use\n")
-		print("atomic synchronization instructions. Recompile using GOARM=7.\n")
-		exit(1)
-	}
-}
-
-//go:nosplit
-func cputicks() int64 {
-	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
-	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
-	return nanotime()
-}
diff --git a/src/runtime/os_darwin_arm64.go b/src/runtime/os_darwin_arm64.go
index 8de132d..b808150 100644
--- a/src/runtime/os_darwin_arm64.go
+++ b/src/runtime/os_darwin_arm64.go
@@ -8,6 +8,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go
index 4fda7ea..6578fcb 100644
--- a/src/runtime/os_dragonfly.go
+++ b/src/runtime/os_dragonfly.go
@@ -38,9 +38,11 @@
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
 
-func raise(sig uint32)
 func raiseproc(sig uint32)
 
+func lwp_gettid() int32
+func lwp_kill(pid, tid int32, sig int)
+
 //go:noescape
 func sys_umtx_sleep(addr *uint32, val, timeout int32) int32
 
@@ -54,6 +56,9 @@
 //go:noescape
 func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32
 func closeonexec(fd int32)
+func setNonblock(fd int32)
+
+func pipe() (r, w int32, errno int32)
 
 const stackSystem = 0
 
@@ -148,7 +153,7 @@
 		start_func: funcPC(lwp_start),
 		arg:        unsafe.Pointer(mp),
 		stack:      uintptr(stk),
-		tid1:       unsafe.Pointer(&mp.procid),
+		tid1:       nil, // minit will record tid
 		tid2:       nil,
 	}
 
@@ -188,10 +193,7 @@
 // Called to initialize a new m (including the bootstrap m).
 // Called on the new thread, cannot allocate memory.
 func minit() {
-	// m.procid is a uint64, but lwp_start writes an int32. Fix it up.
-	_g_ := getg()
-	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
-
+	getg().m.procid = uint64(lwp_gettid())
 	minitSignals()
 }
 
@@ -285,3 +287,17 @@
 		}
 	}
 }
+
+// raise sends a signal to the calling thread.
+//
+// It must be nosplit because it is used by the signal handler before
+// it definitely has a Go stack.
+//
+//go:nosplit
+func raise(sig uint32) {
+	lwp_kill(-1, lwp_gettid(), int(sig))
+}
+
+func signalM(mp *m, sig int) {
+	lwp_kill(-1, int32(mp.procid), sig)
+}
diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go
index cbb72cf..730973a 100644
--- a/src/runtime/os_freebsd.go
+++ b/src/runtime/os_freebsd.go
@@ -26,9 +26,11 @@
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
 
-func raise(sig uint32)
 func raiseproc(sig uint32)
 
+func thr_self() thread
+func thr_kill(tid thread, sig int)
+
 //go:noescape
 func sys_umtx_op(addr *uint32, mode int32, val uint32, uaddr1 uintptr, ut *umtx_time) int32
 
@@ -38,7 +40,11 @@
 
 //go:noescape
 func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32
+
+func pipe() (r, w int32, errno int32)
+func pipe2(flags int32) (r, w int32, errno int32)
 func closeonexec(fd int32)
+func setNonblock(fd int32)
 
 // From FreeBSD's <sys/sysctl.h>
 const (
@@ -194,7 +200,7 @@
 		arg:        unsafe.Pointer(mp),
 		stack_base: mp.g0.stack.lo,
 		stack_size: uintptr(stk) - mp.g0.stack.lo,
-		child_tid:  unsafe.Pointer(&mp.procid),
+		child_tid:  nil, // minit will record tid
 		parent_tid: nil,
 		tls_base:   unsafe.Pointer(&mp.tls[0]),
 		tls_size:   unsafe.Sizeof(mp.tls),
@@ -230,7 +236,7 @@
 		arg:        nil,
 		stack_base: uintptr(stack), //+stacksize?
 		stack_size: stacksize,
-		child_tid:  unsafe.Pointer(&m0.procid),
+		child_tid:  nil, // minit will record tid
 		parent_tid: nil,
 		tls_base:   unsafe.Pointer(&m0.tls[0]),
 		tls_size:   unsafe.Sizeof(m0.tls),
@@ -289,12 +295,7 @@
 // Called to initialize a new m (including the bootstrap m).
 // Called on the new thread, cannot allocate memory.
 func minit() {
-	// m.procid is a uint64, but thr_new writes a uint32 on 32-bit systems.
-	// Fix it up. (Only matters on big-endian, but be clean anyway.)
-	if sys.PtrSize == 4 {
-		_g_ := getg()
-		_g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid)))
-	}
+	getg().m.procid = uint64(thr_self())
 
 	// On FreeBSD before about April 2017 there was a bug such
 	// that calling execve from a thread other than the main
@@ -422,3 +423,17 @@
 // asmSigaction is implemented in assembly.
 //go:noescape
 func asmSigaction(sig uintptr, new, old *sigactiont) int32
+
+// raise sends a signal to the calling thread.
+//
+// It must be nosplit because it is used by the signal handler before
+// it definitely has a Go stack.
+//
+//go:nosplit
+func raise(sig uint32) {
+	thr_kill(thr_self(), int(sig))
+}
+
+func signalM(mp *m, sig int) {
+	thr_kill(thread(mp.procid), sig)
+}
diff --git a/src/runtime/os_freebsd_arm.go b/src/runtime/os_freebsd_arm.go
index 3edd381..3feaa5e 100644
--- a/src/runtime/os_freebsd_arm.go
+++ b/src/runtime/os_freebsd_arm.go
@@ -44,6 +44,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_freebsd_arm64.go b/src/runtime/os_freebsd_arm64.go
new file mode 100644
index 0000000..51ebf9d
--- /dev/null
+++ b/src/runtime/os_freebsd_arm64.go
@@ -0,0 +1,155 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "internal/cpu"
+
+const (
+	hwcap_FP       = 1 << 0
+	hwcap_ASIMD    = 1 << 1
+	hwcap_EVTSTRM  = 1 << 2
+	hwcap_AES      = 1 << 3
+	hwcap_PMULL    = 1 << 4
+	hwcap_SHA1     = 1 << 5
+	hwcap_SHA2     = 1 << 6
+	hwcap_CRC32    = 1 << 7
+	hwcap_ATOMICS  = 1 << 8
+	hwcap_FPHP     = 1 << 9
+	hwcap_ASIMDHP  = 1 << 10
+	hwcap_CPUID    = 1 << 11
+	hwcap_ASIMDRDM = 1 << 12
+	hwcap_JSCVT    = 1 << 13
+	hwcap_FCMA     = 1 << 14
+	hwcap_LRCPC    = 1 << 15
+	hwcap_DCPOP    = 1 << 16
+	hwcap_SHA3     = 1 << 17
+	hwcap_SM3      = 1 << 18
+	hwcap_SM4      = 1 << 19
+	hwcap_ASIMDDP  = 1 << 20
+	hwcap_SHA512   = 1 << 21
+	hwcap_SVE      = 1 << 22
+	hwcap_ASIMDFHM = 1 << 23
+)
+
+func getisar0() uint64
+func getisar1() uint64
+func getpfr0() uint64
+
+// no hwcap support on FreeBSD aarch64, we need to retrieve the info from
+// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1
+func archauxv(tag, val uintptr) {
+	var isar0, isar1, pfr0 uint64
+
+	isar0 = getisar0()
+	isar1 = getisar1()
+	pfr0 = getpfr0()
+
+	// ID_AA64ISAR0_EL1
+	switch extractBits(isar0, 4, 7) {
+	case 1:
+		cpu.HWCap |= hwcap_AES
+	case 2:
+		cpu.HWCap |= hwcap_PMULL | hwcap_AES
+	}
+
+	switch extractBits(isar0, 8, 11) {
+	case 1:
+		cpu.HWCap |= hwcap_SHA1
+	}
+
+	switch extractBits(isar0, 12, 15) {
+	case 1:
+		cpu.HWCap |= hwcap_SHA2
+	case 2:
+		cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512
+	}
+
+	switch extractBits(isar0, 16, 19) {
+	case 1:
+		cpu.HWCap |= hwcap_CRC32
+	}
+
+	switch extractBits(isar0, 20, 23) {
+	case 2:
+		cpu.HWCap |= hwcap_ATOMICS
+	}
+
+	switch extractBits(isar0, 28, 31) {
+	case 1:
+		cpu.HWCap |= hwcap_ASIMDRDM
+	}
+
+	switch extractBits(isar0, 32, 35) {
+	case 1:
+		cpu.HWCap |= hwcap_SHA3
+	}
+
+	switch extractBits(isar0, 36, 39) {
+	case 1:
+		cpu.HWCap |= hwcap_SM3
+	}
+
+	switch extractBits(isar0, 40, 43) {
+	case 1:
+		cpu.HWCap |= hwcap_SM4
+	}
+
+	switch extractBits(isar0, 44, 47) {
+	case 1:
+		cpu.HWCap |= hwcap_ASIMDDP
+	}
+
+	// ID_AA64ISAR1_EL1
+	switch extractBits(isar1, 0, 3) {
+	case 1:
+		cpu.HWCap |= hwcap_DCPOP
+	}
+
+	switch extractBits(isar1, 12, 15) {
+	case 1:
+		cpu.HWCap |= hwcap_JSCVT
+	}
+
+	switch extractBits(isar1, 16, 19) {
+	case 1:
+		cpu.HWCap |= hwcap_FCMA
+	}
+
+	switch extractBits(isar1, 20, 23) {
+	case 1:
+		cpu.HWCap |= hwcap_LRCPC
+	}
+
+	// ID_AA64PFR0_EL1
+	switch extractBits(pfr0, 16, 19) {
+	case 0:
+		cpu.HWCap |= hwcap_FP
+	case 1:
+		cpu.HWCap |= hwcap_FP | hwcap_FPHP
+	}
+
+	switch extractBits(pfr0, 20, 23) {
+	case 0:
+		cpu.HWCap |= hwcap_ASIMD
+	case 1:
+		cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP
+	}
+
+	switch extractBits(pfr0, 32, 35) {
+	case 1:
+		cpu.HWCap |= hwcap_SVE
+	}
+}
+
+func extractBits(data uint64, start, end uint) uint {
+	return (uint)(data>>start) & ((1 << (end - start + 1)) - 1)
+}
+
+//go:nosplit
+func cputicks() int64 {
+	// Currently cputicks() is used in blocking profiler and to seed fastrand().
+	// nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
+	return nanotime()
+}
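
With no HWCAP auxv entries on FreeBSD/arm64, the capability bits are reconstructed by slicing 4-bit fields out of the ID_AA64ISAR0/ISAR1/PFR0 registers. The helper is simple enough to exercise on its own; the register value below is hypothetical.

package main

import "fmt"

// extractBits pulls the field occupying bits [start,end] out of an ID
// register value, as the new FreeBSD/arm64 code does.
func extractBits(data uint64, start, end uint) uint {
	return uint(data>>start) & ((1 << (end - start + 1)) - 1)
}

func main() {
	// Hypothetical ISAR0 value whose AES field (bits 4..7) is 2,
	// meaning both AES and PMULL are implemented.
	isar0 := uint64(0x0000000000000020)
	switch extractBits(isar0, 4, 7) {
	case 1:
		fmt.Println("AES")
	case 2:
		fmt.Println("AES + PMULL")
	default:
		fmt.Println("no AES")
	}
}
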
diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go
index 01efb9b..c6a4992 100644
--- a/src/runtime/os_freebsd_noauxv.go
+++ b/src/runtime/os_freebsd_noauxv.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build freebsd
-// +build !arm
+// +build !arm,!arm64
 
 package runtime
 
diff --git a/src/runtime/os_illumos.go b/src/runtime/os_illumos.go
new file mode 100644
index 0000000..c3c3e4e
--- /dev/null
+++ b/src/runtime/os_illumos.go
@@ -0,0 +1,132 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+//go:cgo_import_dynamic libc_getrctl getrctl "libc.so"
+//go:cgo_import_dynamic libc_rctlblk_get_local_action rctlblk_get_local_action "libc.so"
+//go:cgo_import_dynamic libc_rctlblk_get_local_flags rctlblk_get_local_flags "libc.so"
+//go:cgo_import_dynamic libc_rctlblk_get_value rctlblk_get_value "libc.so"
+//go:cgo_import_dynamic libc_rctlblk_size rctlblk_size "libc.so"
+
+//go:linkname libc_getrctl libc_getrctl
+//go:linkname libc_rctlblk_get_local_action libc_rctlblk_get_local_action
+//go:linkname libc_rctlblk_get_local_flags libc_rctlblk_get_local_flags
+//go:linkname libc_rctlblk_get_value libc_rctlblk_get_value
+//go:linkname libc_rctlblk_size libc_rctlblk_size
+
+var (
+	libc_getrctl,
+	libc_rctlblk_get_local_action,
+	libc_rctlblk_get_local_flags,
+	libc_rctlblk_get_value,
+	libc_rctlblk_size libcFunc
+)
+
+// Return the minimum value seen for the zone CPU cap, or 0 if no cap is
+// detected.
+func getcpucap() uint64 {
+	// The resource control block is an opaque object whose size is only
+	// known to libc.  In practice, given the contents, it is unlikely to
+	// grow beyond 8KB so we'll use a static buffer of that size here.
+	const rblkmaxsize = 8 * 1024
+	if rctlblk_size() > rblkmaxsize {
+		return 0
+	}
+
+	// The "zone.cpu-cap" resource control, as described in
+	// resource_controls(5), "sets a limit on the amount of CPU time that
+	// can be used by a zone.  The unit used is the percentage of a single
+	// CPU that can be used by all user threads in a zone, expressed as an
+	// integer."  A C string of the name must be passed to getrctl(2).
+	name := []byte("zone.cpu-cap\x00")
+
+	// To iterate over the list of values for a particular resource
+	// control, we need two blocks: one for the previously read value and
+	// one for the next value.
+	var rblk0 [rblkmaxsize]byte
+	var rblk1 [rblkmaxsize]byte
+	rblk := &rblk0[0]
+	rblkprev := &rblk1[0]
+
+	var flag uint32 = _RCTL_FIRST
+	var capval uint64 = 0
+
+	for {
+		if getrctl(unsafe.Pointer(&name[0]), unsafe.Pointer(rblkprev), unsafe.Pointer(rblk), flag) != 0 {
+			// The end of the sequence is reported as an ENOENT
+			// failure, but determining the CPU cap is not critical
+			// here.  We'll treat any failure as if it were the end
+			// of sequence.
+			break
+		}
+
+		lflags := rctlblk_get_local_flags(unsafe.Pointer(rblk))
+		action := rctlblk_get_local_action(unsafe.Pointer(rblk))
+		if (lflags&_RCTL_LOCAL_MAXIMAL) == 0 && action == _RCTL_LOCAL_DENY {
+			// This is a finite (not maximal) value representing a
+			// cap (deny) action.
+			v := rctlblk_get_value(unsafe.Pointer(rblk))
+			if capval == 0 || capval > v {
+				capval = v
+			}
+		}
+
+		// Swap the blocks around so that we can fetch the next value
+		t := rblk
+		rblk = rblkprev
+		rblkprev = t
+		flag = _RCTL_NEXT
+	}
+
+	return capval
+}
+
+func getncpu() int32 {
+	n := int32(sysconf(__SC_NPROCESSORS_ONLN))
+	if n < 1 {
+		return 1
+	}
+
+	if cents := int32(getcpucap()); cents > 0 {
+		// Convert from a percentage of CPUs to a number of CPUs,
+		// rounding up to make use of a fractional CPU
+		// e.g., 336% becomes 4 CPUs
+		ncap := (cents + 99) / 100
+		if ncap < n {
+			return ncap
+		}
+	}
+
+	return n
+}
+
+//go:nosplit
+func getrctl(controlname, oldbuf, newbuf unsafe.Pointer, flags uint32) uintptr {
+	return sysvicall4(&libc_getrctl, uintptr(controlname), uintptr(oldbuf), uintptr(newbuf), uintptr(flags))
+}
+
+//go:nosplit
+func rctlblk_get_local_action(buf unsafe.Pointer) uintptr {
+	return sysvicall2(&libc_rctlblk_get_local_action, uintptr(buf), uintptr(0))
+}
+
+//go:nosplit
+func rctlblk_get_local_flags(buf unsafe.Pointer) uintptr {
+	return sysvicall1(&libc_rctlblk_get_local_flags, uintptr(buf))
+}
+
+//go:nosplit
+func rctlblk_get_value(buf unsafe.Pointer) uint64 {
+	return uint64(sysvicall1(&libc_rctlblk_get_value, uintptr(buf)))
+}
+
+//go:nosplit
+func rctlblk_size() uintptr {
+	return sysvicall0(&libc_rctlblk_size)
+}
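
On illumos the zone CPU cap is reported as a percentage of a single CPU, so getncpu rounds it up to whole CPUs and only ever lowers the count reported by sysconf. The rounding in isolation:

package main

import "fmt"

// capCPUs mirrors the arithmetic in getncpu: 336% becomes 4 usable CPUs,
// and the cap never raises the count above the number of online CPUs.
func capCPUs(online, capPercent int32) int32 {
	if capPercent <= 0 {
		return online
	}
	ncap := (capPercent + 99) / 100 // round up to use a fractional CPU
	if ncap < online {
		return ncap
	}
	return online
}

func main() {
	fmt.Println(capCPUs(8, 336)) // 4
	fmt.Println(capCPUs(2, 336)) // 2
	fmt.Println(capCPUs(8, 0))   // 8
}
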
diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go
index ad6db18..ff0ee3a 100644
--- a/src/runtime/os_js.go
+++ b/src/runtime/os_js.go
@@ -12,7 +12,7 @@
 
 func exit(code int32)
 
-func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
+func write1(fd uintptr, p unsafe.Pointer, n int32) int32 {
 	if fd > 2 {
 		throw("runtime.write to fd > 2 is unsupported")
 	}
@@ -131,7 +131,6 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
 
@@ -143,3 +142,9 @@
 
 // gsignalStack is unused on js.
 type gsignalStack struct{}
+
+const preemptMSupported = false
+
+func preemptM(mp *m) {
+	// No threads, so nothing to do.
+}
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index d4a9bd4..7b95ff2 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
 )
@@ -116,6 +117,13 @@
 	_CLONE_NEWUTS         = 0x4000000
 	_CLONE_NEWIPC         = 0x8000000
 
+	// As of QEMU 2.8.0 (5ea2fc84d), user emulation requires all six of these
+	// flags to be set when creating a thread; attempts to share the other
+	// five but leave SYSVSEM unshared will fail with -EINVAL.
+	//
+	// In non-QEMU environments CLONE_SYSVSEM is inconsequential as we do not
+	// use System V semaphores.
+
 	cloneFlags = _CLONE_VM | /* share memory */
 		_CLONE_FS | /* share cwd, etc */
 		_CLONE_FILES | /* share fd table */
@@ -269,13 +277,14 @@
 	if fd < 0 {
 		return 0
 	}
-	n := read(fd, noescape(unsafe.Pointer(&numbuf[0])), int32(len(numbuf)))
+	ptr := noescape(unsafe.Pointer(&numbuf[0]))
+	n := read(fd, ptr, int32(len(numbuf)))
 	closefd(fd)
 	if n <= 0 {
 		return 0
 	}
-	l := n - 1 // remove trailing newline
-	v, ok := atoi(slicebytetostringtmp(numbuf[:l]))
+	n-- // remove trailing newline
+	v, ok := atoi(slicebytetostringtmp((*byte)(ptr), int(n)))
 	if !ok || v < 0 {
 		v = 0
 	}
@@ -289,6 +298,7 @@
 func osinit() {
 	ncpu = getproccount()
 	physHugePageSize = getHugePageSize()
+	osArchInit()
 }
 
 var urandom_dev = []byte("/dev/urandom\x00")
@@ -318,11 +328,20 @@
 	initsig(true)
 }
 
+// gsignalInitQuirk, if non-nil, is called for every allocated gsignal G.
+//
+// TODO(austin): Remove this after Go 1.15 when we remove the
+// mlockGsignal workaround.
+var gsignalInitQuirk func(gsignal *g)
+
 // Called to initialize a new m (including the bootstrap m).
 // Called on the parent thread (main thread in case of bootstrap), can allocate memory.
 func mpreinit(mp *m) {
 	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
 	mp.gsignal.m = mp
+	if gsignalInitQuirk != nil {
+		gsignalInitQuirk(mp.gsignal)
+	}
 }
 
 func gettid() uint32
@@ -332,7 +351,9 @@
 func minit() {
 	minitSignals()
 
-	// for debuggers, in case cgo created the thread
+	// Cgo-created threads and the bootstrap m are missing a
+	// procid. We need this for asynchronous preemption and it's
+	// useful in debuggers.
 	getg().m.procid = uint64(gettid())
 }
 
@@ -372,6 +393,10 @@
 func sched_getaffinity(pid, len uintptr, buf *byte) int32
 func osyield()
 
+func pipe() (r, w int32, errno int32)
+func pipe2(flags int32) (r, w int32, errno int32)
+func setNonblock(fd int32)
+
 //go:nosplit
 //go:nowritebarrierrec
 func setsig(i uint32, fn uintptr) {
@@ -452,3 +477,25 @@
 // rt_sigaction is implemented in assembly.
 //go:noescape
 func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
+
+func getpid() int
+func tgkill(tgid, tid, sig int)
+
+// touchStackBeforeSignal stores an errno value. If non-zero, it means
+// that we should touch the signal stack before sending a signal.
+// This is used on systems that have a bug when the signal stack must
+// be faulted in.  See #35777 and #37436.
+//
+// This is accessed atomically as it is set and read in different threads.
+//
+// TODO(austin): Remove this after Go 1.15 when we remove the
+// mlockGsignal workaround.
+var touchStackBeforeSignal uint32
+
+// signalM sends a signal to mp.
+func signalM(mp *m, sig int) {
+	if atomic.Load(&touchStackBeforeSignal) != 0 {
+		atomic.Cas((*uint32)(unsafe.Pointer(mp.gsignal.stack.hi-4)), 0, 0)
+	}
+	tgkill(getpid(), int(mp.procid), sig)
+}
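
signalM is the Linux building block for asynchronous preemption: it delivers a signal to one specific thread with tgkill(getpid(), tid, sig), optionally touching the signal stack first to work around the kernel bug handled in os_linux_x86.go below. The Linux-only sketch here sends a signal to the calling thread the same way via the standard syscall package; SIGURG is used because its default disposition is to be ignored, and it is the signal the runtime itself uses for preemption.

// +build linux

package main

import (
	"fmt"
	"runtime"
	"syscall"
)

// Send a signal to one specific thread (here, the calling thread) rather
// than to the whole process, as signalM does with tgkill.
func main() {
	runtime.LockOSThread()
	tid := syscall.Gettid()
	pid := syscall.Getpid()
	if err := syscall.Tgkill(pid, tid, syscall.SIGURG); err != nil {
		fmt.Println("tgkill:", err)
		return
	}
	fmt.Println("sent SIGURG to tid", tid)
}
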
diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go
index 207b0e4..b590da7 100644
--- a/src/runtime/os_linux_arm.go
+++ b/src/runtime/os_linux_arm.go
@@ -11,8 +11,6 @@
 	_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
 )
 
-var randomNumber uint32
-
 func checkgoarm() {
 	// On Android, /proc/self/auxv might be unreadable and hwcap won't
 	// reflect the CPU capabilities. Assume that every Android arm device
@@ -34,13 +32,6 @@
 
 func archauxv(tag, val uintptr) {
 	switch tag {
-	case _AT_RANDOM:
-		// sysargs filled in startupRandomData, but that
-		// pointer may not be word aligned, so we must treat
-		// it as a byte array.
-		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
-
 	case _AT_HWCAP:
 		cpu.HWCap = uint(val)
 	case _AT_HWCAP2:
@@ -48,10 +39,11 @@
 	}
 }
 
+func osArchInit() {}
+
 //go:nosplit
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed fastrand().
 	// nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// randomNumber provides better seeding of fastrand.
-	return nanotime() + int64(randomNumber)
+	return nanotime()
 }
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index 2d6f68b..19968dc 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -8,17 +8,8 @@
 
 import "internal/cpu"
 
-var randomNumber uint32
-
 func archauxv(tag, val uintptr) {
 	switch tag {
-	case _AT_RANDOM:
-		// sysargs filled in startupRandomData, but that
-		// pointer may not be word aligned, so we must treat
-		// it as a byte array.
-		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
-
 	case _AT_HWCAP:
 		// arm64 doesn't have a 'cpuid' instruction equivalent and relies on
 		// HWCAP/HWCAP2 bits for hardware capabilities.
@@ -36,10 +27,11 @@
 	}
 }
 
+func osArchInit() {}
+
 //go:nosplit
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed fastrand().
 	// nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// randomNumber provides better seeding of fastrand.
-	return nanotime() + int64(randomNumber)
+	return nanotime()
 }
diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go
index 0d7b84d..4ff66f9 100644
--- a/src/runtime/os_linux_mips64x.go
+++ b/src/runtime/os_linux_mips64x.go
@@ -7,25 +7,22 @@
 
 package runtime
 
-var randomNumber uint32
+import "internal/cpu"
 
 func archauxv(tag, val uintptr) {
 	switch tag {
-	case _AT_RANDOM:
-		// sysargs filled in startupRandomData, but that
-		// pointer may not be word aligned, so we must treat
-		// it as a byte array.
-		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+	case _AT_HWCAP:
+		cpu.HWCap = uint(val)
 	}
 }
 
+func osArchInit() {}
+
 //go:nosplit
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed fastrand().
 	// nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// randomNumber provides better seeding of fastrand.
-	return nanotime() + int64(randomNumber)
+	return nanotime()
 }
 
 const (
diff --git a/src/runtime/os_linux_mipsx.go b/src/runtime/os_linux_mipsx.go
index e0548ec..87962ed 100644
--- a/src/runtime/os_linux_mipsx.go
+++ b/src/runtime/os_linux_mipsx.go
@@ -7,25 +7,16 @@
 
 package runtime
 
-var randomNumber uint32
-
 func archauxv(tag, val uintptr) {
-	switch tag {
-	case _AT_RANDOM:
-		// sysargs filled in startupRandomData, but that
-		// pointer may not be word aligned, so we must treat
-		// it as a byte array.
-		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
-	}
 }
 
+func osArchInit() {}
+
 //go:nosplit
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed fastrand().
 	// nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// randomNumber provides better seeding of fastrand1.
-	return nanotime() + int64(randomNumber)
+	return nanotime()
 }
 
 const (
diff --git a/src/runtime/os_linux_novdso.go b/src/runtime/os_linux_novdso.go
index e54c1c4..155f415 100644
--- a/src/runtime/os_linux_novdso.go
+++ b/src/runtime/os_linux_novdso.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build !386,!amd64,!arm,!arm64,!ppc64,!ppc64le
+// +build !386,!amd64,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le
 
 package runtime
 
diff --git a/src/runtime/os_linux_ppc64x.go b/src/runtime/os_linux_ppc64x.go
index cc79cc4..3aedc23 100644
--- a/src/runtime/os_linux_ppc64x.go
+++ b/src/runtime/os_linux_ppc64x.go
@@ -20,3 +20,5 @@
 		cpu.HWCap2 = uint(val)
 	}
 }
+
+func osArchInit() {}
diff --git a/src/runtime/os_linux_riscv64.go b/src/runtime/os_linux_riscv64.go
new file mode 100644
index 0000000..9be88a5
--- /dev/null
+++ b/src/runtime/os_linux_riscv64.go
@@ -0,0 +1,7 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+func osArchInit() {}
diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go
index 55d35c7..ee18fd1 100644
--- a/src/runtime/os_linux_s390x.go
+++ b/src/runtime/os_linux_s390x.go
@@ -17,3 +17,5 @@
 		cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
 	}
 }
+
+func osArchInit() {}
diff --git a/src/runtime/os_linux_x86.go b/src/runtime/os_linux_x86.go
new file mode 100644
index 0000000..d001e6e
--- /dev/null
+++ b/src/runtime/os_linux_x86.go
@@ -0,0 +1,93 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build 386 amd64
+
+package runtime
+
+import "runtime/internal/atomic"
+
+//go:noescape
+func uname(utsname *new_utsname) int
+
+func mlock(addr, len uintptr) int
+
+func osArchInit() {
+	// Linux 5.2 introduced a bug that can corrupt vector
+	// registers on return from a signal if the signal stack isn't
+	// faulted in:
+	// https://bugzilla.kernel.org/show_bug.cgi?id=205663
+	//
+	// It was fixed in 5.3.15, 5.4.2, and all 5.5 and later
+	// kernels.
+	//
+	// If we're on an affected kernel, work around this issue by
+	// mlocking the top page of every signal stack. This doesn't
+	// help for signal stacks created in C, but there's not much
+	// we can do about that.
+	//
+	// TODO(austin): Remove this in Go 1.15, at which point it
+	// will be unlikely to encounter any of the affected kernels
+	// in the wild.
+
+	var uts new_utsname
+	if uname(&uts) < 0 {
+		throw("uname failed")
+	}
+	// Check for null terminator to ensure gostringnocopy doesn't
+	// walk off the end of the release string.
+	found := false
+	for _, b := range uts.release {
+		if b == 0 {
+			found = true
+			break
+		}
+	}
+	if !found {
+		return
+	}
+	rel := gostringnocopy(&uts.release[0])
+
+	major, minor, patch, ok := parseRelease(rel)
+	if !ok {
+		return
+	}
+
+	if major == 5 && (minor == 2 || minor == 3 && patch < 15 || minor == 4 && patch < 2) {
+		gsignalInitQuirk = mlockGsignal
+		if m0.gsignal != nil {
+			throw("gsignal quirk too late")
+		}
+		throwReportQuirk = throwBadKernel
+	}
+}
+
+func mlockGsignal(gsignal *g) {
+	if atomic.Load(&touchStackBeforeSignal) != 0 {
+		// mlock has already failed, don't try again.
+		return
+	}
+
+	// This mlock call may fail, but we don't report the failure.
+	// Instead, if something goes badly wrong, we rely on prepareSignalM
+	// and throwBadKernel to do further mitigation and to report a problem
+	// to the user if mitigation fails. This is because many
+	// systems have a limit on the total mlock size, and many kernels
+	// that appear to have bad versions are actually patched to avoid the
+	// bug described above. We want Go 1.14 to run on those systems.
+	// See #37436.
+	if errno := mlock(gsignal.stack.hi-physPageSize, physPageSize); errno < 0 {
+		atomic.Store(&touchStackBeforeSignal, uint32(-errno))
+	}
+}
+
+// throwBadKernel is called, via throwReportQuirk, by throw.
+func throwBadKernel() {
+	if errno := atomic.Load(&touchStackBeforeSignal); errno != 0 {
+		println("runtime: note: your Linux kernel may be buggy")
+		println("runtime: note: see https://golang.org/wiki/LinuxKernelSignalVectorBug")
+		println("runtime: note: mlock workaround for kernel bug failed with errno", errno)
+	}
+}
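
The version gate in osArchInit above is the whole policy decision: only kernels 5.2.x, 5.3 before 5.3.15, and 5.4 before 5.4.2 get the mlock workaround. A minimal standalone sketch of that check follows; the affectedKernel helper and the sample versions are illustrative, not part of the runtime.

package main

import "fmt"

// affectedKernel is a hypothetical restatement of the version gate used by
// osArchInit above: Linux 5.2.x, 5.3 before .15, and 5.4 before .2 are
// treated as affected by the vector-register corruption bug.
func affectedKernel(major, minor, patch int) bool {
	return major == 5 && (minor == 2 || minor == 3 && patch < 15 || minor == 4 && patch < 2)
}

func main() {
	for _, v := range [][3]int{{5, 2, 21}, {5, 3, 14}, {5, 3, 15}, {5, 4, 1}, {5, 4, 2}, {5, 5, 0}} {
		fmt.Printf("%d.%d.%d affected: %v\n", v[0], v[1], v[2], affectedKernel(v[0], v[1], v[2]))
	}
}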
diff --git a/src/runtime/os_nacl.go b/src/runtime/os_nacl.go
deleted file mode 100644
index 2b9a1cf..0000000
--- a/src/runtime/os_nacl.go
+++ /dev/null
@@ -1,328 +0,0 @@
-// Copyright 2010 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-type mOS struct {
-	waitsema      int32 // semaphore for parking on locks
-	waitsemacount int32
-	waitsemalock  int32
-}
-
-func nacl_exception_stack(p uintptr, size int32) int32
-func nacl_exception_handler(fn uintptr, arg unsafe.Pointer) int32
-func nacl_sem_create(flag int32) int32
-func nacl_sem_wait(sem int32) int32
-func nacl_sem_post(sem int32) int32
-func nacl_mutex_create(flag int32) int32
-func nacl_mutex_lock(mutex int32) int32
-func nacl_mutex_trylock(mutex int32) int32
-func nacl_mutex_unlock(mutex int32) int32
-func nacl_cond_create(flag int32) int32
-func nacl_cond_wait(cond, n int32) int32
-func nacl_cond_signal(cond int32) int32
-func nacl_cond_broadcast(cond int32) int32
-
-//go:noescape
-func nacl_cond_timed_wait_abs(cond, lock int32, ts *timespec) int32
-func nacl_thread_create(fn uintptr, stk, tls, xx unsafe.Pointer) int32
-
-//go:noescape
-func nacl_nanosleep(ts, extra *timespec) int32
-func nanotime() int64
-func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int)
-func exit(code int32)
-func osyield()
-
-//go:noescape
-func write(fd uintptr, p unsafe.Pointer, n int32) int32
-
-//go:linkname os_sigpipe os.sigpipe
-func os_sigpipe() {
-	throw("too many writes on closed pipe")
-}
-
-func dieFromSignal(sig uint32) {
-	exit(2)
-}
-
-func sigpanic() {
-	g := getg()
-	if !canpanic(g) {
-		throw("unexpected signal during runtime execution")
-	}
-
-	// Native Client only invokes the exception handler for memory faults.
-	g.sig = _SIGSEGV
-	panicmem()
-}
-
-func raiseproc(sig uint32) {
-}
-
-// Stubs so tests can link correctly. These should never be called.
-func open(name *byte, mode, perm int32) int32
-func closefd(fd int32) int32
-func read(fd int32, p unsafe.Pointer, n int32) int32
-
-type sigset struct{}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func sigtramp(ctxt byte)
-
-//go:nosplit
-func msigsave(mp *m) {
-}
-
-//go:nosplit
-func msigrestore(sigmask sigset) {
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func clearSignalHandlers() {
-}
-
-//go:nosplit
-func sigblock() {
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, cannot allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// Initialize signal handling
-	ret := nacl_exception_stack(_g_.m.gsignal.stack.lo, 32*1024)
-	if ret < 0 {
-		print("runtime: nacl_exception_stack: error ", -ret, "\n")
-	}
-
-	ret = nacl_exception_handler(funcPC(sigtramp), nil)
-	if ret < 0 {
-		print("runtime: nacl_exception_handler: error ", -ret, "\n")
-	}
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-}
-
-func osinit() {
-	ncpu = 1
-	getg().m.procid = 2
-	//nacl_exception_handler(funcPC(sigtramp), nil);
-	physPageSize = 65536
-}
-
-func signame(sig uint32) string {
-	if sig >= uint32(len(sigtable)) {
-		return ""
-	}
-	return sigtable[sig].name
-}
-
-//go:nosplit
-func crash() {
-	*(*int32)(nil) = 0
-}
-
-//go:noescape
-func getRandomData([]byte)
-
-func goenvs() {
-	goenvs_unix()
-}
-
-func initsig(preinit bool) {
-}
-
-//go:nosplit
-func usleep(us uint32) {
-	var ts timespec
-
-	ts.tv_sec = int64(us / 1e6)
-	ts.tv_nsec = int32(us%1e6) * 1e3
-	nacl_nanosleep(&ts, nil)
-}
-
-func mstart_nacl()
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m) {
-	stk := unsafe.Pointer(mp.g0.stack.hi)
-	mp.tls[0] = uintptr(unsafe.Pointer(mp.g0))
-	mp.tls[1] = uintptr(unsafe.Pointer(mp))
-	ret := nacl_thread_create(funcPC(mstart_nacl), stk, unsafe.Pointer(&mp.tls[2]), nil)
-	if ret < 0 {
-		print("nacl_thread_create: error ", -ret, "\n")
-		throw("newosproc")
-	}
-}
-
-//go:noescape
-func exitThread(wait *uint32)
-
-//go:nosplit
-func semacreate(mp *m) {
-	if mp.waitsema != 0 {
-		return
-	}
-	systemstack(func() {
-		mu := nacl_mutex_create(0)
-		if mu < 0 {
-			print("nacl_mutex_create: error ", -mu, "\n")
-			throw("semacreate")
-		}
-		c := nacl_cond_create(0)
-		if c < 0 {
-			print("nacl_cond_create: error ", -c, "\n")
-			throw("semacreate")
-		}
-		mp.waitsema = c
-		mp.waitsemalock = mu
-	})
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	var ret int32
-	systemstack(func() {
-		_g_ := getg()
-		if nacl_mutex_lock(_g_.m.waitsemalock) < 0 {
-			throw("semasleep")
-		}
-		var ts timespec
-		if ns >= 0 {
-			end := ns + nanotime()
-			ts.tv_sec = end / 1e9
-			ts.tv_nsec = int32(end % 1e9)
-		}
-		for _g_.m.waitsemacount == 0 {
-			if ns < 0 {
-				if nacl_cond_wait(_g_.m.waitsema, _g_.m.waitsemalock) < 0 {
-					throw("semasleep")
-				}
-			} else {
-				r := nacl_cond_timed_wait_abs(_g_.m.waitsema, _g_.m.waitsemalock, &ts)
-				if r == -_ETIMEDOUT {
-					nacl_mutex_unlock(_g_.m.waitsemalock)
-					ret = -1
-					return
-				}
-				if r < 0 {
-					throw("semasleep")
-				}
-			}
-		}
-
-		_g_.m.waitsemacount = 0
-		nacl_mutex_unlock(_g_.m.waitsemalock)
-		ret = 0
-	})
-	return ret
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	systemstack(func() {
-		if nacl_mutex_lock(mp.waitsemalock) < 0 {
-			throw("semawakeup")
-		}
-		if mp.waitsemacount != 0 {
-			throw("semawakeup")
-		}
-		mp.waitsemacount = 1
-		nacl_cond_signal(mp.waitsema)
-		nacl_mutex_unlock(mp.waitsemalock)
-	})
-}
-
-// This runs on a foreign stack, without an m or a g. No stack split.
-//go:nosplit
-//go:norace
-//go:nowritebarrierrec
-func badsignal(sig uintptr) {
-	cgocallback(unsafe.Pointer(funcPC(badsignalgo)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig), 0)
-}
-
-func badsignalgo(sig uintptr) {
-	if !sigsend(uint32(sig)) {
-		// A foreign thread received the signal sig, and the
-		// Go code does not want to handle it.
-		raisebadsignal(uint32(sig))
-	}
-}
-
-// This runs on a foreign stack, without an m or a g. No stack split.
-//go:nosplit
-func badsignal2() {
-	write(2, unsafe.Pointer(&badsignal1[0]), int32(len(badsignal1)))
-	exit(2)
-}
-
-var badsignal1 = []byte("runtime: signal received on thread not created by Go.\n")
-
-func raisebadsignal(sig uint32) {
-	badsignal2()
-}
-
-func madvise(addr unsafe.Pointer, n uintptr, flags int32) {}
-func munmap(addr unsafe.Pointer, n uintptr)               {}
-func setProcessCPUProfiler(hz int32)                      {}
-func setThreadCPUProfiler(hz int32)                       {}
-func sigdisable(uint32)                                   {}
-func sigenable(uint32)                                    {}
-func sigignore(uint32)                                    {}
-func closeonexec(int32)                                   {}
-
-// gsignalStack is unused on nacl.
-type gsignalStack struct{}
-
-var writelock uint32 // test-and-set spin lock for write
-
-// lastfaketime stores the last faketime value written to fd 1 or 2.
-var lastfaketime int64
-
-// lastfaketimefd stores the fd to which lastfaketime was written.
-//
-// Subsequent writes to the same fd may use the same timestamp,
-// but the timestamp must increase if the fd changes.
-var lastfaketimefd int32
-
-/*
-An attempt at IRT. Doesn't work. See end of sys_nacl_amd64.s.
-
-void (*nacl_irt_query)(void);
-
-int8 nacl_irt_basic_v0_1_str[] = "nacl-irt-basic-0.1";
-void *nacl_irt_basic_v0_1[6]; // exit, gettod, clock, nanosleep, sched_yield, sysconf
-int32 nacl_irt_basic_v0_1_size = sizeof(nacl_irt_basic_v0_1);
-
-int8 nacl_irt_memory_v0_3_str[] = "nacl-irt-memory-0.3";
-void *nacl_irt_memory_v0_3[3]; // mmap, munmap, mprotect
-int32 nacl_irt_memory_v0_3_size = sizeof(nacl_irt_memory_v0_3);
-
-int8 nacl_irt_thread_v0_1_str[] = "nacl-irt-thread-0.1";
-void *nacl_irt_thread_v0_1[3]; // thread_create, thread_exit, thread_nice
-int32 nacl_irt_thread_v0_1_size = sizeof(nacl_irt_thread_v0_1);
-*/
-
-// The following functions are implemented in runtime assembly.
-// Provide a Go declaration to go with its assembly definitions.
-
-//go:linkname syscall_naclWrite syscall.naclWrite
-func syscall_naclWrite(fd int, b []byte) int
-
-//go:linkname syscall_now syscall.now
-func syscall_now() (sec int64, nsec int32)
diff --git a/src/runtime/os_nacl_arm.go b/src/runtime/os_nacl_arm.go
deleted file mode 100644
index 8669ee7..0000000
--- a/src/runtime/os_nacl_arm.go
+++ /dev/null
@@ -1,23 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-func checkgoarm() {
-	// TODO(minux): FP checks like in os_linux_arm.go.
-
-	// NaCl/ARM only supports ARMv7
-	if goarm != 7 {
-		print("runtime: NaCl requires ARMv7. Recompile using GOARM=7.\n")
-		exit(1)
-	}
-}
-
-//go:nosplit
-func cputicks() int64 {
-	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
-	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
-	return nanotime()
-}
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index da024cd..97106c7 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -24,8 +24,6 @@
 
 	// From <sys/lwp.h>
 	_LWP_DETACHED = 0x00000040
-
-	_EAGAIN = 35
 )
 
 type mOS struct {
@@ -49,9 +47,10 @@
 
 func lwp_tramp()
 
-func raise(sig uint32)
 func raiseproc(sig uint32)
 
+func lwp_kill(tid int32, sig int)
+
 //go:noescape
 func getcontext(ctxt unsafe.Pointer)
 
@@ -72,7 +71,11 @@
 
 //go:noescape
 func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32
+
+func pipe() (r, w int32, errno int32)
+func pipe2(flags int32) (r, w int32, errno int32)
 func closeonexec(fd int32)
+func setNonblock(fd int32)
 
 const (
 	_ESRCH     = 3
@@ -362,3 +365,17 @@
 		}
 	}
 }
+
+// raise sends signal to the calling thread.
+//
+// It must be nosplit because it is used by the signal handler before
+// it definitely has a Go stack.
+//
+//go:nosplit
+func raise(sig uint32) {
+	lwp_kill(lwp_self(), int(sig))
+}
+
+func signalM(mp *m, sig int) {
+	lwp_kill(int32(mp.procid), sig)
+}
diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go
index 95603da..b5ec23e 100644
--- a/src/runtime/os_netbsd_arm.go
+++ b/src/runtime/os_netbsd_arm.go
@@ -30,6 +30,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_netbsd_arm64.go b/src/runtime/os_netbsd_arm64.go
index fd81eb7..8d21b0a 100644
--- a/src/runtime/os_netbsd_arm64.go
+++ b/src/runtime/os_netbsd_arm64.go
@@ -19,6 +19,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_only_solaris.go b/src/runtime/os_only_solaris.go
new file mode 100644
index 0000000..e2f5409
--- /dev/null
+++ b/src/runtime/os_only_solaris.go
@@ -0,0 +1,18 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Solaris code that doesn't also apply to illumos.
+
+// +build !illumos
+
+package runtime
+
+func getncpu() int32 {
+	n := int32(sysconf(__SC_NPROCESSORS_ONLN))
+	if n < 1 {
+		return 1
+	}
+
+	return n
+}
diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go
index 2d6334e..b486b83 100644
--- a/src/runtime/os_openbsd.go
+++ b/src/runtime/os_openbsd.go
@@ -42,9 +42,11 @@
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
 
-func raise(sig uint32)
 func raiseproc(sig uint32)
 
+func getthrid() int32
+func thrkill(tid int32, sig int)
+
 //go:noescape
 func tfork(param *tforkt, psize uintptr, mm *m, gg *g, fn uintptr) int32
 
@@ -60,11 +62,14 @@
 
 //go:noescape
 func kevent(kq int32, ch *keventt, nch int32, ev *keventt, nev int32, ts *timespec) int32
+
+func pipe() (r, w int32, errno int32)
+func pipe2(flags int32) (r, w int32, errno int32)
 func closeonexec(fd int32)
+func setNonblock(fd int32)
 
 const (
 	_ESRCH       = 3
-	_EAGAIN      = 35
 	_EWOULDBLOCK = _EAGAIN
 	_ENOTSUP     = 91
 
@@ -190,7 +195,7 @@
 	// rather than at the top of it.
 	param := tforkt{
 		tf_tcb:   unsafe.Pointer(&mp.tls[0]),
-		tf_tid:   (*int32)(unsafe.Pointer(&mp.procid)),
+		tf_tid:   nil, // minit will record tid
 		tf_stack: uintptr(stk) - sys.PtrSize,
 	}
 
@@ -238,10 +243,7 @@
 // Called to initialize a new m (including the bootstrap m).
 // Called on the new thread, can not allocate memory.
 func minit() {
-	// m.procid is a uint64, but tfork writes an int32. Fix it up.
-	_g_ := getg()
-	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
-
+	getg().m.procid = uint64(getthrid())
 	minitSignals()
 }
 
@@ -337,3 +339,11 @@
 		throw("remapping stack memory failed")
 	}
 }
+
+func raise(sig uint32) {
+	thrkill(getthrid(), int(sig))
+}
+
+func signalM(mp *m, sig int) {
+	thrkill(int32(mp.procid), sig)
+}
diff --git a/src/runtime/os_openbsd_arm.go b/src/runtime/os_openbsd_arm.go
index be2e1e9..0a24096 100644
--- a/src/runtime/os_openbsd_arm.go
+++ b/src/runtime/os_openbsd_arm.go
@@ -19,6 +19,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_openbsd_arm64.go b/src/runtime/os_openbsd_arm64.go
index f15a95b..d559a2a 100644
--- a/src/runtime/os_openbsd_arm64.go
+++ b/src/runtime/os_openbsd_arm64.go
@@ -12,7 +12,6 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
 
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index d7ea1ef..2bea105 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -293,7 +293,6 @@
 	ncpu = getproccount()
 	physPageSize = getPageSize()
 	getg().m.procid = getpid()
-	notify(unsafe.Pointer(funcPC(sigtramp)))
 }
 
 //go:nosplit
@@ -311,6 +310,9 @@
 }
 
 func initsig(preinit bool) {
+	if !preinit {
+		notify(unsafe.Pointer(funcPC(sigtramp)))
+	}
 }
 
 //go:nosplit
@@ -328,7 +330,7 @@
 }
 
 //go:nosplit
-func nanotime() int64 {
+func nanotime1() int64 {
 	var scratch int64
 	ns := nsec(&scratch)
 	// TODO(aram): remove hack after I fix _nsec in the pc64 kernel.
@@ -373,7 +375,7 @@
 		return -1
 	}
 	len := findnull(&msg[0])
-	if write(uintptr(fd), unsafe.Pointer(&msg[0]), int32(len)) != int64(len) {
+	if write1(uintptr(fd), unsafe.Pointer(&msg[0]), int32(len)) != int32(len) {
 		closefd(fd)
 		return -1
 	}
@@ -451,8 +453,8 @@
 }
 
 //go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, n int32) int64 {
-	return int64(pwrite(int32(fd), buf, n, -1))
+func write1(fd uintptr, buf unsafe.Pointer, n int32) int32 {
+	return pwrite(int32(fd), buf, n, -1)
 }
 
 var _badsignal = []byte("runtime: signal received on thread not created by Go.\n")
@@ -483,3 +485,11 @@
 	}
 	return sigtable[sig].name
 }
+
+const preemptMSupported = false
+
+func preemptM(mp *m) {
+	// Not currently supported.
+	//
+	// TODO: Use a note like we use signals on POSIX OSes
+}
diff --git a/src/runtime/os_plan9_arm.go b/src/runtime/os_plan9_arm.go
index fdce1e7..f165a34 100644
--- a/src/runtime/os_plan9_arm.go
+++ b/src/runtime/os_plan9_arm.go
@@ -12,6 +12,5 @@
 func cputicks() int64 {
 	// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
 	// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
-	// TODO: need more entropy to better seed fastrand.
 	return nanotime()
 }
diff --git a/src/runtime/os_solaris.go b/src/runtime/os_solaris.go
index 989edb5..89129e5 100644
--- a/src/runtime/os_solaris.go
+++ b/src/runtime/os_solaris.go
@@ -63,6 +63,15 @@
 
 //go:nosplit
 func sysvicall1(fn *libcFunc, a1 uintptr) uintptr {
+	r1, _ := sysvicall1Err(fn, a1)
+	return r1
+}
+
+//go:nosplit
+
+// sysvicall1Err returns both the system call result and the errno value.
+// This is used by sysvicall1 and pipe.
+func sysvicall1Err(fn *libcFunc, a1 uintptr) (r1, err uintptr) {
 	// Leave caller's PC/SP around for traceback.
 	gp := getg()
 	var mp *m
@@ -88,11 +97,21 @@
 	if mp != nil {
 		mp.libcallsp = 0
 	}
-	return libcall.r1
+	return libcall.r1, libcall.err
 }
 
 //go:nosplit
 func sysvicall2(fn *libcFunc, a1, a2 uintptr) uintptr {
+	r1, _ := sysvicall2Err(fn, a1, a2)
+	return r1
+}
+
+//go:nosplit
+//go:cgo_unsafe_args
+
+// sysvicall2Err returns both the system call result and the errno value.
+// This is used by sysvicall2 and pipe2.
+func sysvicall2Err(fn *libcFunc, a1, a2 uintptr) (uintptr, uintptr) {
 	// Leave caller's PC/SP around for traceback.
 	gp := getg()
 	var mp *m
@@ -117,11 +136,21 @@
 	if mp != nil {
 		mp.libcallsp = 0
 	}
-	return libcall.r1
+	return libcall.r1, libcall.err
 }
 
 //go:nosplit
 func sysvicall3(fn *libcFunc, a1, a2, a3 uintptr) uintptr {
+	r1, _ := sysvicall3Err(fn, a1, a2, a3)
+	return r1
+}
+
+//go:nosplit
+//go:cgo_unsafe_args
+
+// sysvicall3Err returns both the system call result and the errno value.
+// This is used by sysvicall3 and write1.
+func sysvicall3Err(fn *libcFunc, a1, a2, a3 uintptr) (r1, err uintptr) {
 	// Leave caller's PC/SP around for traceback.
 	gp := getg()
 	var mp *m
@@ -146,7 +175,7 @@
 	if mp != nil {
 		mp.libcallsp = 0
 	}
-	return libcall.r1
+	return libcall.r1, libcall.err
 }
 
 //go:nosplit
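
The pattern in this file is mechanical: each sysvicallN keeps its old signature but delegates to a new sysvicallNErr variant that also returns libcall.err, so callers such as pipe, pipe2, and write1 can observe errno. A minimal sketch of that wrapper pairing, using illustrative names (callErr/call) rather than runtime functions:

package main

import "fmt"

// callErr stands in for a sysvicallNErr variant: it returns both the primary
// result and an error code. The body is a stub for illustration only.
func callErr(a uintptr) (r1, errno uintptr) {
	// ... perform the libc call, capturing both the return value and errno ...
	return a * 2, 0
}

// call stands in for the original sysvicallN: a thin wrapper that discards
// the error, so existing callers are unchanged.
func call(a uintptr) uintptr {
	r1, _ := callErr(a)
	return r1
}

func main() {
	if r, e := callErr(21); e == 0 {
		fmt.Println("result with errno checked:", r)
	}
	fmt.Println("result, errno ignored:", call(21))
}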
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 074ae0f..a584ada 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -6,6 +6,7 @@
 
 import (
 	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -27,13 +28,15 @@
 //go:cgo_import_dynamic runtime._GetEnvironmentStringsW GetEnvironmentStringsW%0 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetProcAddress GetProcAddress%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetProcessAffinityMask GetProcessAffinityMask%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetQueuedCompletionStatus GetQueuedCompletionStatus%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetQueuedCompletionStatusEx GetQueuedCompletionStatusEx%6 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetStdHandle GetStdHandle%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetSystemDirectoryA GetSystemDirectoryA%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetThreadContext SetThreadContext%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._PostQueuedCompletionStatus PostQueuedCompletionStatus%4 "kernel32.dll"
 //go:cgo_import_dynamic runtime._ResumeThread ResumeThread%1 "kernel32.dll"
 //go:cgo_import_dynamic runtime._SetConsoleCtrlHandler SetConsoleCtrlHandler%2 "kernel32.dll"
 //go:cgo_import_dynamic runtime._SetErrorMode SetErrorMode%1 "kernel32.dll"
@@ -49,6 +52,7 @@
 //go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll"
 //go:cgo_import_dynamic runtime._VirtualQuery VirtualQuery%3 "kernel32.dll"
 //go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WaitForMultipleObjects WaitForMultipleObjects%4 "kernel32.dll"
 //go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll"
 //go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
 
@@ -71,14 +75,16 @@
 	_GetEnvironmentStringsW,
 	_GetProcAddress,
 	_GetProcessAffinityMask,
-	_GetQueuedCompletionStatus,
+	_GetQueuedCompletionStatusEx,
 	_GetStdHandle,
 	_GetSystemDirectoryA,
 	_GetSystemInfo,
 	_GetSystemTimeAsFileTime,
 	_GetThreadContext,
+	_SetThreadContext,
 	_LoadLibraryW,
 	_LoadLibraryA,
+	_PostQueuedCompletionStatus,
 	_QueryPerformanceCounter,
 	_QueryPerformanceFrequency,
 	_ResumeThread,
@@ -96,6 +102,7 @@
 	_VirtualFree,
 	_VirtualQuery,
 	_WaitForSingleObject,
+	_WaitForMultipleObjects,
 	_WriteConsoleW,
 	_WriteFile,
 	_ stdFunction
@@ -104,7 +111,6 @@
 	// We will load syscalls, if available, before using them.
 	_AddDllDirectory,
 	_AddVectoredContinueHandler,
-	_GetQueuedCompletionStatusEx,
 	_LoadLibraryExA,
 	_LoadLibraryExW,
 	_ stdFunction
@@ -139,7 +145,34 @@
 func ctrlhandler()
 
 type mOS struct {
-	waitsema uintptr // semaphore for parking on locks
+	threadLock mutex   // protects "thread" and prevents closing
+	thread     uintptr // thread handle
+
+	waitsema   uintptr // semaphore for parking on locks
+	resumesema uintptr // semaphore to indicate suspend/resume
+
+	// preemptExtLock synchronizes preemptM with entry/exit from
+	// external C code.
+	//
+	// This protects against races between preemptM calling
+	// SuspendThread and external code on this thread calling
+	// ExitProcess. If these happen concurrently, it's possible to
+	// exit the suspending thread and suspend the exiting thread,
+	// leading to deadlock.
+	//
+	// 0 indicates this M is not being preempted or in external
+	// code. Entering external code CASes this from 0 to 1. If
+	// this fails, a preemption is in progress, so the thread must
+	// wait for the preemption. preemptM also CASes this from 0 to
+	// 1. If this fails, the preemption fails (as it would if the
+	// PC weren't in Go code). The value is reset to 0 when
+	// returning from external code or after a preemption is
+	// complete.
+	//
+	// TODO(austin): We may not need this if preemption were more
+	// tightly synchronized on the G/P status and preemption
+	// blocked transition into _Gsyscall/_Psyscall.
+	preemptExtLock uint32
 }
 
 //go:linkname os_sigpipe os.sigpipe
@@ -205,7 +238,6 @@
 	}
 	_AddDllDirectory = windowsFindfunc(k32, []byte("AddDllDirectory\000"))
 	_AddVectoredContinueHandler = windowsFindfunc(k32, []byte("AddVectoredContinueHandler\000"))
-	_GetQueuedCompletionStatusEx = windowsFindfunc(k32, []byte("GetQueuedCompletionStatusEx\000"))
 	_LoadLibraryExA = windowsFindfunc(k32, []byte("LoadLibraryExA\000"))
 	_LoadLibraryExW = windowsFindfunc(k32, []byte("LoadLibraryExW\000"))
 	useLoadLibraryEx = (_LoadLibraryExW != nil && _LoadLibraryExA != nil && _AddDllDirectory != nil)
@@ -258,6 +290,39 @@
 	}
 }
 
+func monitorSuspendResume() {
+	const (
+		_DEVICE_NOTIFY_CALLBACK = 2
+	)
+	type _DEVICE_NOTIFY_SUBSCRIBE_PARAMETERS struct {
+		callback uintptr
+		context  uintptr
+	}
+
+	powrprof := windowsLoadSystemLib([]byte("powrprof.dll\000"))
+	if powrprof == 0 {
+		return // Running on Windows 7, where we don't need it anyway.
+	}
+	powerRegisterSuspendResumeNotification := windowsFindfunc(powrprof, []byte("PowerRegisterSuspendResumeNotification\000"))
+	if powerRegisterSuspendResumeNotification == nil {
+		return // Running on Windows 7, where we don't need it anyway.
+	}
+	var fn interface{} = func(context uintptr, changeType uint32, setting uintptr) uintptr {
+		for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+			if mp.resumesema != 0 {
+				stdcall1(_SetEvent, mp.resumesema)
+			}
+		}
+		return 0
+	}
+	params := _DEVICE_NOTIFY_SUBSCRIBE_PARAMETERS{
+		callback: compileCallback(*efaceOf(&fn), true),
+	}
+	handle := uintptr(0)
+	stdcall3(powerRegisterSuspendResumeNotification, _DEVICE_NOTIFY_CALLBACK,
+		uintptr(unsafe.Pointer(&params)), uintptr(unsafe.Pointer(&handle)))
+}
+
 //go:nosplit
 func getLoadLibrary() uintptr {
 	return uintptr(unsafe.Pointer(_LoadLibraryW))
@@ -377,8 +442,6 @@
 	stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
 }
 
-func nanotime() int64
-
 // useQPCTime controls whether time.now and nanotime use QueryPerformanceCounter.
 // This is only set to 1 when running under Wine.
 var useQPCTime uint8
@@ -488,6 +551,10 @@
 	}
 
 	stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
+
+	// We call this all the way here, late in init, so that malloc works
+	// for the callback function this generates.
+	monitorSuspendResume()
 }
 
 // exiting is set to non-zero when the process is exiting.
@@ -495,12 +562,21 @@
 
 //go:nosplit
 func exit(code int32) {
+	// Disallow thread suspension for preemption. Otherwise,
+	// ExitProcess and SuspendThread can race: SuspendThread
+	// queues a suspension request for this thread, ExitProcess
+	// kills the suspending thread, and then this thread suspends.
+	lock(&suspendLock)
 	atomic.Store(&exiting, 1)
 	stdcall1(_ExitProcess, uintptr(code))
 }
 
+// write1 must be nosplit because it's used as a last resort in
+// functions like badmorestackg0. In such cases, we'll always take the
+// ASCII path.
+//
 //go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, n int32) int32 {
+func write1(fd uintptr, buf unsafe.Pointer, n int32) int32 {
 	const (
 		_STD_OUTPUT_HANDLE = ^uintptr(10) // -11
 		_STD_ERROR_HANDLE  = ^uintptr(11) // -12
@@ -597,6 +673,9 @@
 	return
 }
 
+// walltime1 isn't implemented on Windows, but will never be called.
+func walltime1() (sec int64, nsec int32)
+
 //go:nosplit
 func semasleep(ns int64) int32 {
 	const (
@@ -606,19 +685,32 @@
 		_WAIT_FAILED    = 0xFFFFFFFF
 	)
 
-	// store ms in ns to save stack space
+	var result uintptr
 	if ns < 0 {
-		ns = _INFINITE
+		result = stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(_INFINITE))
 	} else {
-		ns = int64(timediv(ns, 1000000, nil))
-		if ns == 0 {
-			ns = 1
+		start := nanotime()
+		elapsed := int64(0)
+		for {
+			ms := int64(timediv(ns-elapsed, 1000000, nil))
+			if ms == 0 {
+				ms = 1
+			}
+			result = stdcall4(_WaitForMultipleObjects, 2,
+				uintptr(unsafe.Pointer(&[2]uintptr{getg().m.waitsema, getg().m.resumesema})),
+				0, uintptr(ms))
+			if result != _WAIT_OBJECT_0+1 {
+				// Not a suspend/resume event
+				break
+			}
+			elapsed = nanotime() - start
+			if elapsed >= ns {
+				return -1
+			}
 		}
 	}
-
-	result := stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(ns))
 	switch result {
-	case _WAIT_OBJECT_0: //signaled
+	case _WAIT_OBJECT_0: // Signaled
 		return 0
 
 	case _WAIT_TIMEOUT:
@@ -667,6 +759,15 @@
 			throw("runtime.semacreate")
 		})
 	}
+	mp.resumesema = stdcall4(_CreateEventA, 0, 0, 0, 0)
+	if mp.resumesema == 0 {
+		systemstack(func() {
+			print("runtime: createevent failed; errno=", getlasterror(), "\n")
+			throw("runtime.semacreate")
+		})
+		stdcall1(_CloseHandle, mp.waitsema)
+		mp.waitsema = 0
+	}
 }
 
 // May run with m.p==nil, so write barriers are not allowed. This
@@ -705,7 +806,7 @@
 func newosproc0(mp *m, stk unsafe.Pointer) {
 	// TODO: this is completely broken. The args passed to newosproc0 (in asm_amd64.s)
 	// are stacksize and function, not *m and stack.
-	// Check os_linux.go for an implemention that might actually work.
+	// Check os_linux.go for an implementation that might actually work.
 	throw("bad newosproc0")
 }
 
@@ -742,7 +843,11 @@
 func minit() {
 	var thandle uintptr
 	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
-	atomic.Storeuintptr(&getg().m.thread, thandle)
+
+	mp := getg().m
+	lock(&mp.threadLock)
+	mp.thread = thandle
+	unlock(&mp.threadLock)
 
 	// Query the true stack base from the OS. Currently we're
 	// running on a small assumed stack.
@@ -775,9 +880,11 @@
 // Called from dropm to undo the effect of an minit.
 //go:nosplit
 func unminit() {
-	tp := &getg().m.thread
-	stdcall1(_CloseHandle, *tp)
-	*tp = 0
+	mp := getg().m
+	lock(&mp.threadLock)
+	stdcall1(_CloseHandle, mp.thread)
+	mp.thread = 0
+	unlock(&mp.threadLock)
 }
 
 // Calling stdcall on os stack.
@@ -894,6 +1001,8 @@
 	switch _type {
 	case _CTRL_C_EVENT, _CTRL_BREAK_EVENT:
 		s = _SIGINT
+	case _CTRL_CLOSE_EVENT, _CTRL_LOGOFF_EVENT, _CTRL_SHUTDOWN_EVENT:
+		s = _SIGTERM
 	default:
 		return 0
 	}
@@ -901,7 +1010,11 @@
 	if sigsend(s) {
 		return 1
 	}
-	exit(2) // SIGINT, SIGTERM, etc
+	if !islibrary && !isarchive {
+		// Only exit the program if we don't have a DLL.
+		// See https://golang.org/issues/35965.
+		exit(2) // SIGINT, SIGTERM, etc
+	}
 	return 0
 }
 
@@ -914,27 +1027,30 @@
 var profiletimer uintptr
 
 func profilem(mp *m, thread uintptr) {
-	var r *context
-	rbuf := make([]byte, unsafe.Sizeof(*r)+15)
+	// Align Context to 16 bytes.
+	var c *context
+	var cbuf [unsafe.Sizeof(*c) + 15]byte
+	c = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&cbuf[15]))) &^ 15))
 
-	// align Context to 16 bytes
-	r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
-	r.contextflags = _CONTEXT_CONTROL
-	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(r)))
+	c.contextflags = _CONTEXT_CONTROL
+	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(c)))
 
-	var gp *g
+	gp := gFromTLS(mp)
+
+	sigprof(c.ip(), c.sp(), c.lr(), gp, mp)
+}
+
+func gFromTLS(mp *m) *g {
 	switch GOARCH {
-	default:
-		panic("unsupported architecture")
 	case "arm":
 		tls := &mp.tls[0]
-		gp = **((***g)(unsafe.Pointer(tls)))
+		return **((***g)(unsafe.Pointer(tls)))
 	case "386", "amd64":
 		tls := &mp.tls[0]
-		gp = *((**g)(unsafe.Pointer(tls)))
+		return *((**g)(unsafe.Pointer(tls)))
 	}
-
-	sigprof(r.ip(), r.sp(), r.lr(), gp, mp)
+	throw("unsupported architecture")
+	return nil
 }
 
 func profileloop1(param uintptr) uint32 {
@@ -944,17 +1060,25 @@
 		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
 		first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
 		for mp := first; mp != nil; mp = mp.alllink {
-			thread := atomic.Loaduintptr(&mp.thread)
+			lock(&mp.threadLock)
 			// Do not profile threads blocked on Notes,
 			// this includes idle worker threads,
 			// idle timer thread, idle heap scavenger, etc.
-			if thread == 0 || mp.profilehz == 0 || mp.blocked {
+			if mp.thread == 0 || mp.profilehz == 0 || mp.blocked {
+				unlock(&mp.threadLock)
 				continue
 			}
-			// mp may exit between the load above and the
-			// SuspendThread, so be careful.
+			// Acquire our own handle to the thread.
+			var thread uintptr
+			stdcall7(_DuplicateHandle, currentProcess, mp.thread, currentProcess, uintptr(unsafe.Pointer(&thread)), 0, 0, _DUPLICATE_SAME_ACCESS)
+			unlock(&mp.threadLock)
+			// mp may exit between the DuplicateHandle
+			// above and the SuspendThread. The handle
+			// will remain valid, but SuspendThread may
+			// fail.
 			if int32(stdcall1(_SuspendThread, thread)) == -1 {
 				// The thread no longer exists.
+				stdcall1(_CloseHandle, thread)
 				continue
 			}
 			if mp.profilehz != 0 && !mp.blocked {
@@ -963,6 +1087,7 @@
 				profilem(mp, thread)
 			}
 			stdcall1(_ResumeThread, thread)
+			stdcall1(_CloseHandle, thread)
 		}
 	}
 }
@@ -990,3 +1115,140 @@
 	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
 	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
 }
+
+const preemptMSupported = GOARCH != "arm"
+
+// suspendLock protects simultaneous SuspendThread operations from
+// suspending each other.
+var suspendLock mutex
+
+func preemptM(mp *m) {
+	if GOARCH == "arm" {
+		// TODO: Implement call injection
+		return
+	}
+
+	if mp == getg().m {
+		throw("self-preempt")
+	}
+
+	// Synchronize with external code that may try to ExitProcess.
+	if !atomic.Cas(&mp.preemptExtLock, 0, 1) {
+		// External code is running. Fail the preemption
+		// attempt.
+		atomic.Xadd(&mp.preemptGen, 1)
+		return
+	}
+
+	// Acquire our own handle to mp's thread.
+	lock(&mp.threadLock)
+	if mp.thread == 0 {
+		// The M hasn't been minit'd yet (or was just unminit'd).
+		unlock(&mp.threadLock)
+		atomic.Store(&mp.preemptExtLock, 0)
+		atomic.Xadd(&mp.preemptGen, 1)
+		return
+	}
+	var thread uintptr
+	stdcall7(_DuplicateHandle, currentProcess, mp.thread, currentProcess, uintptr(unsafe.Pointer(&thread)), 0, 0, _DUPLICATE_SAME_ACCESS)
+	unlock(&mp.threadLock)
+
+	// Prepare thread context buffer. This must be aligned to 16 bytes.
+	var c *context
+	var cbuf [unsafe.Sizeof(*c) + 15]byte
+	c = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&cbuf[15]))) &^ 15))
+	c.contextflags = _CONTEXT_CONTROL
+
+	// Serialize thread suspension. SuspendThread is asynchronous,
+	// so it's otherwise possible for two threads to suspend each
+	// other and deadlock. We must hold this lock until after
+	// GetThreadContext, since that blocks until the thread is
+	// actually suspended.
+	lock(&suspendLock)
+
+	// Suspend the thread.
+	if int32(stdcall1(_SuspendThread, thread)) == -1 {
+		unlock(&suspendLock)
+		stdcall1(_CloseHandle, thread)
+		atomic.Store(&mp.preemptExtLock, 0)
+		// The thread no longer exists. This shouldn't be
+		// possible, but just acknowledge the request.
+		atomic.Xadd(&mp.preemptGen, 1)
+		return
+	}
+
+	// We have to be very careful between this point and once
+	// we've shown mp is at an async safe-point. This is like a
+	// signal handler in the sense that mp could have been doing
+	// anything when we stopped it, including holding arbitrary
+	// locks.
+
+	// We have to get the thread context before inspecting the M
+	// because SuspendThread only requests a suspend.
+	// GetThreadContext actually blocks until it's suspended.
+	stdcall2(_GetThreadContext, thread, uintptr(unsafe.Pointer(c)))
+
+	unlock(&suspendLock)
+
+	// Does it want a preemption and is it safe to preempt?
+	gp := gFromTLS(mp)
+	if wantAsyncPreempt(gp) {
+		if ok, newpc := isAsyncSafePoint(gp, c.ip(), c.sp(), c.lr()); ok {
+			// Inject call to asyncPreempt
+			targetPC := funcPC(asyncPreempt)
+			switch GOARCH {
+			default:
+				throw("unsupported architecture")
+			case "386", "amd64":
+				// Make it look like the thread called targetPC.
+				sp := c.sp()
+				sp -= sys.PtrSize
+				*(*uintptr)(unsafe.Pointer(sp)) = newpc
+				c.set_sp(sp)
+				c.set_ip(targetPC)
+			}
+
+			stdcall2(_SetThreadContext, thread, uintptr(unsafe.Pointer(c)))
+		}
+	}
+
+	atomic.Store(&mp.preemptExtLock, 0)
+
+	// Acknowledge the preemption.
+	atomic.Xadd(&mp.preemptGen, 1)
+
+	stdcall1(_ResumeThread, thread)
+	stdcall1(_CloseHandle, thread)
+}
+
+// osPreemptExtEnter is called before entering external code that may
+// call ExitProcess.
+//
+// This must be nosplit because it may be called from a syscall with
+// untyped stack slots, so the stack must not be grown or scanned.
+//
+//go:nosplit
+func osPreemptExtEnter(mp *m) {
+	for !atomic.Cas(&mp.preemptExtLock, 0, 1) {
+		// An asynchronous preemption is in progress. It's not
+		// safe to enter external code because it may call
+		// ExitProcess and deadlock with SuspendThread.
+		// Ideally we would do the preemption ourselves, but
+		// can't since there may be untyped syscall arguments
+		// on the stack. Instead, just wait and encourage the
+		// SuspendThread APC to run. The preemption should be
+		// done shortly.
+		osyield()
+	}
+	// Asynchronous preemption is now blocked.
+}
+
+// osPreemptExtExit is called after returning from external code that
+// may call ExitProcess.
+//
+// See osPreemptExtEnter for why this is nosplit.
+//
+//go:nosplit
+func osPreemptExtExit(mp *m) {
+	atomic.Store(&mp.preemptExtLock, 0)
+}
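
Both profilem and preemptM above need a Windows CONTEXT record aligned to 16 bytes and get it by over-allocating a byte buffer and rounding an interior pointer down. A small standalone sketch of that alignment trick, using a placeholder record size instead of the real context type (and noting that the uintptr round-trip is only safe here because the buffer stays live on the stack):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// Over-allocate by 15 bytes, take the address of byte 15, and clear the
	// low four bits: the result is 16-byte aligned and still inside buf.
	const recordSize = 256 // stand-in for unsafe.Sizeof(context{})
	var buf [recordSize + 15]byte
	p := unsafe.Pointer(uintptr(unsafe.Pointer(&buf[15])) &^ 15)
	fmt.Println("aligned:", uintptr(p)%16 == 0)
	fmt.Println("inside buffer:", uintptr(p) >= uintptr(unsafe.Pointer(&buf[0])))
}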
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 5f33cd7..615249f 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -10,6 +10,19 @@
 	"unsafe"
 )
 
+// We have two different ways of doing defers. The older way involves creating a
+// defer record at the time that a defer statement is executing and adding it to a
+// defer chain. This chain is inspected by the deferreturn call at all function
+// exits in order to run the appropriate defer calls. A cheaper way (which we call
+// open-coded defers) is used for functions in which no defer statements occur in
+// loops. In that case, we simply store the defer function/arg information into
+// specific stack slots at the point of each defer statement, as well as setting a
+// bit in a bitmask. At each function exit, we add inline code to directly make
+// the appropriate defer calls based on the bitmask and fn/arg information stored
+// on the stack. During panic/Goexit processing, the appropriate defer calls are
+// made using extra funcdata info that indicates the exact stack slots that
+// contain the bitmask and defer fn/args.
+
 // Check to make sure we can really generate a panic. If the panic
 // was generated from the runtime, or from inside malloc, then convert
 // to a throw of msg.
@@ -203,7 +216,8 @@
 // The compiler turns a defer statement into a call to this.
 //go:nosplit
 func deferproc(siz int32, fn *funcval) { // arguments of fn follow fn
-	if getg().m.curg != getg() {
+	gp := getg()
+	if gp.m.curg != gp {
 		// go code on the system stack can't defer
 		throw("defer on system stack")
 	}
@@ -221,6 +235,8 @@
 	if d._panic != nil {
 		throw("deferproc: d.panic != nil after newdefer")
 	}
+	d.link = gp._defer
+	gp._defer = d
 	d.fn = fn
 	d.pc = callerpc
 	d.sp = sp
@@ -263,19 +279,24 @@
 	// are initialized here.
 	d.started = false
 	d.heap = false
+	d.openDefer = false
 	d.sp = getcallersp()
 	d.pc = getcallerpc()
+	d.framepc = 0
+	d.varp = 0
 	// The lines below implement:
 	//   d.panic = nil
+	//   d.fd = nil
 	//   d.link = gp._defer
 	//   gp._defer = d
-	// But without write barriers. The first two are writes to
+	// But without write barriers. The first three are writes to
 	// the stack so they don't need a write barrier, and furthermore
 	// are to uninitialized memory, so they must not use a write barrier.
-	// The third write does not require a write barrier because we
+	// The fourth write does not require a write barrier because we
 	// explicitly mark all the defer structures, so we don't need to
 	// keep track of pointers to them with a write barrier.
 	*(*uintptr)(unsafe.Pointer(&d._panic)) = 0
+	*(*uintptr)(unsafe.Pointer(&d.fd)) = 0
 	*(*uintptr)(unsafe.Pointer(&d.link)) = uintptr(unsafe.Pointer(gp._defer))
 	*(*uintptr)(unsafe.Pointer(&gp._defer)) = uintptr(unsafe.Pointer(d))
 
@@ -356,7 +377,8 @@
 }
 
 // Allocate a Defer, usually using per-P pool.
-// Each defer must be released with freedefer.
+// Each defer must be released with freedefer.  The defer is not
+// added to any defer chain yet.
 //
 // This must not grow the stack because there may be a frame without
 // stack map information when this is called.
@@ -406,8 +428,6 @@
 	}
 	d.siz = siz
 	d.heap = true
-	d.link = gp._defer
-	gp._defer = d
 	return d
 }
 
@@ -463,8 +483,12 @@
 	// started causing a nosplit stack overflow via typedmemmove.
 	d.siz = 0
 	d.started = false
+	d.openDefer = false
 	d.sp = 0
 	d.pc = 0
+	d.framepc = 0
+	d.varp = 0
+	d.fd = nil
 	// d._panic and d.fn must be nil already.
 	// If not, we would have called freedeferpanic or freedeferfn above,
 	// both of which throw.
@@ -493,9 +517,11 @@
 // to have been called by the caller of deferreturn at the point
 // just before deferreturn was called. The effect is that deferreturn
 // is called again and again until there are no more deferred functions.
-// Cannot split the stack because we reuse the caller's frame to
-// call the deferred function.
-
+//
+// Declared as nosplit, because the function should not be preempted once we start
+// modifying the caller's frame in order to reuse the frame to call the deferred
+// function.
+//
 // The single argument isn't actually used - it just has its address
 // taken so it can be matched against pending defers.
 //go:nosplit
@@ -509,6 +535,15 @@
 	if d.sp != sp {
 		return
 	}
+	if d.openDefer {
+		done := runOpenDeferFrame(gp, d)
+		if !done {
+			throw("unfinished open-coded defers in deferreturn")
+		}
+		gp._defer = d.link
+		freedefer(d)
+		return
+	}
 
 	// Moving arguments around.
 	//
@@ -528,6 +563,12 @@
 	d.fn = nil
 	gp._defer = d.link
 	freedefer(d)
+	// If the defer function pointer is nil, force the seg fault to happen
+	// here rather than in jmpdefer. gentraceback() throws an error if it is
+	// called with a callback on an LR architecture and jmpdefer is on the
+	// stack, because the stack trace can be incorrect in that case - see
+	// issue #8153.
+	_ = fn.fn
 	jmpdefer(fn, uintptr(unsafe.Pointer(&arg0)))
 }
 
@@ -544,6 +585,15 @@
 	// This code is similar to gopanic, see that implementation
 	// for detailed comments.
 	gp := getg()
+
+	// Create a panic object for Goexit, so we can recognize when it might be
+	// bypassed by a recover().
+	var p _panic
+	p.goexit = true
+	p.link = gp._panic
+	gp._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
+
+	addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp()))
 	for {
 		d := gp._defer
 		if d == nil {
@@ -554,13 +604,47 @@
 				d._panic.aborted = true
 				d._panic = nil
 			}
-			d.fn = nil
-			gp._defer = d.link
-			freedefer(d)
-			continue
+			if !d.openDefer {
+				d.fn = nil
+				gp._defer = d.link
+				freedefer(d)
+				continue
+			}
 		}
 		d.started = true
-		reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
+		d._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
+		if d.openDefer {
+			done := runOpenDeferFrame(gp, d)
+			if !done {
+				// We should always run all defers in the frame,
+				// since there is no panic associated with this
+				// defer that can be recovered.
+				throw("unfinished open-coded defers in Goexit")
+			}
+			if p.aborted {
+				// Since our current defer caused a panic and may
+				// have been already freed, just restart scanning
+				// for open-coded defers from this frame again.
+				addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp()))
+			} else {
+				addOneOpenDeferFrame(gp, 0, nil)
+			}
+		} else {
+
+			// Save the pc/sp in reflectcallSave(), so we can "recover" back to this
+			// loop if necessary.
+			reflectcallSave(&p, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz))
+		}
+		if p.aborted {
+			// We had a recursive panic in the defer d we started, and
+			// then did a recover in a defer that was further down the
+			// defer chain than d. In the case of an outstanding Goexit,
+			// we force the recover to return back to this loop. d will
+			// have already been freed if completed, so just continue
+			// immediately to the next defer on the chain.
+			p.aborted = false
+			continue
+		}
 		if gp._defer != d {
 			throw("bad defer entry in Goexit")
 		}
@@ -597,7 +681,12 @@
 func printpanics(p *_panic) {
 	if p.link != nil {
 		printpanics(p.link)
-		print("\t")
+		if !p.link.goexit {
+			print("\t")
+		}
+	}
+	if p.goexit {
+		return
 	}
 	print("panic: ")
 	printany(p.arg)
@@ -607,6 +696,195 @@
 	print("\n")
 }
 
+// addOneOpenDeferFrame scans the stack for the first frame (if any) with
+// open-coded defers and if it finds one, adds a single record to the defer chain
+// for that frame. If sp is non-nil, it starts the stack scan from the frame
+// specified by sp. If sp is nil, it uses the sp from the current defer record
+// (which has just been finished). Hence, it continues the stack scan from the
+// frame of the defer that just finished. It skips any frame that already has an
+// open-coded _defer record, which would have been created from a previous
+// (unrecovered) panic.
+//
+// Note: All entries of the defer chain (including this new open-coded entry) have
+// their pointers (including sp) adjusted properly if the stack moves while
+// running deferred functions. Also, it is safe to pass in the sp arg (which is
+// the direct result of calling getcallersp()), because all pointer variables
+// (including arguments) are adjusted as needed during stack copies.
+func addOneOpenDeferFrame(gp *g, pc uintptr, sp unsafe.Pointer) {
+	var prevDefer *_defer
+	if sp == nil {
+		prevDefer = gp._defer
+		pc = prevDefer.framepc
+		sp = unsafe.Pointer(prevDefer.sp)
+	}
+	systemstack(func() {
+		gentraceback(pc, uintptr(sp), 0, gp, 0, nil, 0x7fffffff,
+			func(frame *stkframe, unused unsafe.Pointer) bool {
+				if prevDefer != nil && prevDefer.sp == frame.sp {
+					// Skip the frame for the previous defer that
+					// we just finished (and was used to set
+					// where we restarted the stack scan)
+					return true
+				}
+				f := frame.fn
+				fd := funcdata(f, _FUNCDATA_OpenCodedDeferInfo)
+				if fd == nil {
+					return true
+				}
+				// Insert the open defer record in the
+				// chain, in order sorted by sp.
+				d := gp._defer
+				var prev *_defer
+				for d != nil {
+					dsp := d.sp
+					if frame.sp < dsp {
+						break
+					}
+					if frame.sp == dsp {
+						if !d.openDefer {
+							throw("duplicated defer entry")
+						}
+						return true
+					}
+					prev = d
+					d = d.link
+				}
+				if frame.fn.deferreturn == 0 {
+					throw("missing deferreturn")
+				}
+
+				maxargsize, _ := readvarintUnsafe(fd)
+				d1 := newdefer(int32(maxargsize))
+				d1.openDefer = true
+				d1._panic = nil
+				// These are the pc/sp to set after we've
+				// run a defer in this frame that did a
+				// recover. We return to a special
+				// deferreturn that runs any remaining
+				// defers and then returns from the
+				// function.
+				d1.pc = frame.fn.entry + uintptr(frame.fn.deferreturn)
+				d1.varp = frame.varp
+				d1.fd = fd
+				// Save the SP/PC associated with current frame,
+				// so we can continue stack trace later if needed.
+				d1.framepc = frame.pc
+				d1.sp = frame.sp
+				d1.link = d
+				if prev == nil {
+					gp._defer = d1
+				} else {
+					prev.link = d1
+				}
+				// Stop stack scanning after adding one open defer record
+				return false
+			},
+			nil, 0)
+	})
+}
+
+// readvarintUnsafe reads the uint32 in varint format starting at fd, and returns the
+// uint32 and a pointer to the byte following the varint.
+//
+// There is a similar function runtime.readvarint, which takes a slice of bytes,
+// rather than an unsafe pointer. These functions are duplicated, because one of
+// the two use cases for the functions would get slower if the functions were
+// combined.
+func readvarintUnsafe(fd unsafe.Pointer) (uint32, unsafe.Pointer) {
+	var r uint32
+	var shift int
+	for {
+		b := *(*uint8)((unsafe.Pointer(fd)))
+		fd = add(fd, unsafe.Sizeof(b))
+		if b < 128 {
+			return r + uint32(b)<<shift, fd
+		}
+		r += ((uint32(b) &^ 128) << shift)
+		shift += 7
+		if shift > 28 {
+			panic("Bad varint")
+		}
+	}
+}
+
+// runOpenDeferFrame runs the active open-coded defers in the frame specified by
+// d. It normally processes all active defers in the frame, but stops immediately
+// if a defer does a successful recover. It returns true if there are no
+// remaining defers to run in the frame.
+func runOpenDeferFrame(gp *g, d *_defer) bool {
+	done := true
+	fd := d.fd
+
+	// Skip the maxargsize
+	_, fd = readvarintUnsafe(fd)
+	deferBitsOffset, fd := readvarintUnsafe(fd)
+	nDefers, fd := readvarintUnsafe(fd)
+	deferBits := *(*uint8)(unsafe.Pointer(d.varp - uintptr(deferBitsOffset)))
+
+	for i := int(nDefers) - 1; i >= 0; i-- {
+		// read the funcdata info for this defer
+		var argWidth, closureOffset, nArgs uint32
+		argWidth, fd = readvarintUnsafe(fd)
+		closureOffset, fd = readvarintUnsafe(fd)
+		nArgs, fd = readvarintUnsafe(fd)
+		if deferBits&(1<<i) == 0 {
+			for j := uint32(0); j < nArgs; j++ {
+				_, fd = readvarintUnsafe(fd)
+				_, fd = readvarintUnsafe(fd)
+				_, fd = readvarintUnsafe(fd)
+			}
+			continue
+		}
+		closure := *(**funcval)(unsafe.Pointer(d.varp - uintptr(closureOffset)))
+		d.fn = closure
+		deferArgs := deferArgs(d)
+		// If there is an interface receiver or method receiver, it is
+		// described/included as the first arg.
+		for j := uint32(0); j < nArgs; j++ {
+			var argOffset, argLen, argCallOffset uint32
+			argOffset, fd = readvarintUnsafe(fd)
+			argLen, fd = readvarintUnsafe(fd)
+			argCallOffset, fd = readvarintUnsafe(fd)
+			memmove(unsafe.Pointer(uintptr(deferArgs)+uintptr(argCallOffset)),
+				unsafe.Pointer(d.varp-uintptr(argOffset)),
+				uintptr(argLen))
+		}
+		deferBits = deferBits &^ (1 << i)
+		*(*uint8)(unsafe.Pointer(d.varp - uintptr(deferBitsOffset))) = deferBits
+		p := d._panic
+		reflectcallSave(p, unsafe.Pointer(closure), deferArgs, argWidth)
+		if p != nil && p.aborted {
+			break
+		}
+		d.fn = nil
+		// These args are just a copy, so can be cleared immediately
+		memclrNoHeapPointers(deferArgs, uintptr(argWidth))
+		if d._panic != nil && d._panic.recovered {
+			done = deferBits == 0
+			break
+		}
+	}
+
+	return done
+}
+
+// reflectcallSave calls reflectcall after saving the caller's pc and sp in the
+// panic record. This allows the runtime to return to the Goexit defer processing
+// loop, in the unusual case where the Goexit may be bypassed by a successful
+// recover.
+func reflectcallSave(p *_panic, fn, arg unsafe.Pointer, argsize uint32) {
+	if p != nil {
+		p.argp = unsafe.Pointer(getargp(0))
+		p.pc = getcallerpc()
+		p.sp = unsafe.Pointer(getcallersp())
+	}
+	reflectcall(nil, fn, arg, argsize, argsize)
+	if p != nil {
+		p.pc = 0
+		p.sp = unsafe.Pointer(nil)
+	}
+}
+
 // The implementation of the predeclared function panic.
 func gopanic(e interface{}) {
 	gp := getg()
@@ -646,6 +924,10 @@
 
 	atomic.Xadd(&runningPanicDefers, 1)
 
+	// By calculating getcallerpc/getcallersp here, we avoid scanning the
+	// gopanic frame (stack scanning is slow...)
+	addOneOpenDeferFrame(gp, getcallerpc(), unsafe.Pointer(getcallersp()))
+
 	for {
 		d := gp._defer
 		if d == nil {
@@ -653,16 +935,23 @@
 		}
 
 		// If defer was started by earlier panic or Goexit (and, since we're back here, that triggered a new panic),
-		// take defer off list. The earlier panic or Goexit will not continue running.
+		// take defer off list. An earlier panic will not continue running, but we will make sure below that an
+		// earlier Goexit does continue running.
 		if d.started {
 			if d._panic != nil {
 				d._panic.aborted = true
 			}
 			d._panic = nil
-			d.fn = nil
-			gp._defer = d.link
-			freedefer(d)
-			continue
+			if !d.openDefer {
+				// For open-coded defers, we need to process the
+				// defer again, in case there are any other defers
+				// to call in the frame (not including the defer
+				// call that caused the panic).
+				d.fn = nil
+				gp._defer = d.link
+				freedefer(d)
+				continue
+			}
 		}
 
 		// Mark defer as started, but keep on list, so that traceback
@@ -675,8 +964,16 @@
 		// will find d in the list and will mark d._panic (this panic) aborted.
 		d._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
 
-		p.argp = unsafe.Pointer(getargp(0))
-		reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
+		done := true
+		if d.openDefer {
+			done = runOpenDeferFrame(gp, d)
+			if done && !d._panic.recovered {
+				addOneOpenDeferFrame(gp, 0, nil)
+			}
+		} else {
+			p.argp = unsafe.Pointer(getargp(0))
+			reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
+		}
 		p.argp = nil
 
 		// reflectcall did not panic. Remove d.
@@ -684,18 +981,63 @@
 			throw("bad defer entry in panic")
 		}
 		d._panic = nil
-		d.fn = nil
-		gp._defer = d.link
 
 		// trigger shrinkage to test stack copy. See stack_test.go:TestStackPanic
 		//GC()
 
 		pc := d.pc
 		sp := unsafe.Pointer(d.sp) // must be pointer so it gets adjusted during stack copy
-		freedefer(d)
+		if done {
+			d.fn = nil
+			gp._defer = d.link
+			freedefer(d)
+		}
 		if p.recovered {
+			gp._panic = p.link
+			if gp._panic != nil && gp._panic.goexit && gp._panic.aborted {
+				// A normal recover would bypass/abort the Goexit.  Instead,
+				// we return to the processing loop of the Goexit.
+				gp.sigcode0 = uintptr(gp._panic.sp)
+				gp.sigcode1 = uintptr(gp._panic.pc)
+				mcall(recovery)
+				throw("bypassed recovery failed") // mcall should not return
+			}
 			atomic.Xadd(&runningPanicDefers, -1)
 
+			if done {
+				// Remove any remaining non-started, open-coded
+				// defer entries after a recover, since the
+				// corresponding defers will be executed normally
+				// (inline). Any such entry will become stale once
+				// we run the corresponding defers inline and exit
+				// the associated stack frame.
+				d := gp._defer
+				var prev *_defer
+				for d != nil {
+					if d.openDefer {
+						if d.started {
+							// This defer is started but we
+							// are in the middle of a
+							// defer-panic-recover inside of
+							// it, so don't remove it or any
+							// further defer entries
+							break
+						}
+						if prev == nil {
+							gp._defer = d.link
+						} else {
+							prev.link = d.link
+						}
+						newd := d.link
+						freedefer(d)
+						d = newd
+					} else {
+						prev = d
+						d = d.link
+					}
+				}
+			}
+
 			gp._panic = p.link
 			// Aborted panics are marked but remain on the g.panic list.
 			// Remove them from the list.
@@ -748,7 +1090,7 @@
 	// If they match, the caller is the one who can recover.
 	gp := getg()
 	p := gp._panic
-	if p != nil && !p.recovered && argp == uintptr(p.argp) {
+	if p != nil && !p.goexit && !p.recovered && argp == uintptr(p.argp) {
 		p.recovered = true
 		return p.arg
 	}
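
The added !p.goexit check keeps the pseudo-panic that represents a Goexit from being treated as recoverable. Seen from user code, a recover that runs only because Goexit is unwinding the goroutine reports nil (a small sketch under the same assumptions as above):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		defer close(done)
		defer func() {
			// No real panic is in flight during Goexit, so recover
			// reports nil, and the goroutine still exits normally.
			fmt.Println("recover during Goexit:", recover()) // <nil>
		}()
		runtime.Goexit()
	}()
	<-done
}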
@@ -803,7 +1145,7 @@
 	}
 
 	// Make the deferproc for this d return again,
-	// this time returning 1.  The calling function will
+	// this time returning 1. The calling function will
 	// jump to the standard return epilogue.
 	gp.sched.sp = sp
 	gp.sched.pc = pc
@@ -941,6 +1283,12 @@
 	}
 }
 
+// throwReportQuirk, if non-nil, is called by throw after dumping the stacks.
+//
+// TODO(austin): Remove this after Go 1.15 when we remove the
+// mlockGsignal workaround.
+var throwReportQuirk func()
+
 var didothers bool
 var deadlock mutex
 
@@ -987,6 +1335,10 @@
 
 	printDebugLog()
 
+	if throwReportQuirk != nil {
+		throwReportQuirk()
+	}
+
 	return docrash
 }
 
diff --git a/src/runtime/panic32.go b/src/runtime/panic32.go
index b89ce9d..aea8401 100644
--- a/src/runtime/panic32.go
+++ b/src/runtime/panic32.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 amd64p32 arm mips mipsle
+// +build 386 arm mips mipsle
 
 package runtime
 
diff --git a/src/runtime/panic_test.go b/src/runtime/panic_test.go
new file mode 100644
index 0000000..b8a300f
--- /dev/null
+++ b/src/runtime/panic_test.go
@@ -0,0 +1,48 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"strings"
+	"testing"
+)
+
+// Test that panics print out the underlying value
+// when the underlying kind is directly printable.
+// Issue: https://golang.org/issues/37531
+func TestPanicWithDirectlyPrintableCustomTypes(t *testing.T) {
+	tests := []struct {
+		name            string
+		wantPanicPrefix string
+	}{
+		{"panicCustomBool", `panic: main.MyBool(true)`},
+		{"panicCustomComplex128", `panic: main.MyComplex128(+3.210000e+001+1.000000e+001i)`},
+		{"panicCustomComplex64", `panic: main.MyComplex64(+1.100000e-001+3.000000e+000i)`},
+		{"panicCustomFloat32", `panic: main.MyFloat32(-9.370000e+001)`},
+		{"panicCustomFloat64", `panic: main.MyFloat64(-9.370000e+001)`},
+		{"panicCustomInt", `panic: main.MyInt(93)`},
+		{"panicCustomInt8", `panic: main.MyInt8(93)`},
+		{"panicCustomInt16", `panic: main.MyInt16(93)`},
+		{"panicCustomInt32", `panic: main.MyInt32(93)`},
+		{"panicCustomInt64", `panic: main.MyInt64(93)`},
+		{"panicCustomString", `panic: main.MyString("Panic")`},
+		{"panicCustomUint", `panic: main.MyUint(93)`},
+		{"panicCustomUint8", `panic: main.MyUint8(93)`},
+		{"panicCustomUint16", `panic: main.MyUint16(93)`},
+		{"panicCustomUint32", `panic: main.MyUint32(93)`},
+		{"panicCustomUint64", `panic: main.MyUint64(93)`},
+		{"panicCustomUintptr", `panic: main.MyUintptr(93)`},
+	}
+
+	for _, tt := range tests {
+		t := t
+		t.Run(tt.name, func(t *testing.T) {
+			output := runTestProg(t, "testprog", tt.name)
+			if !strings.HasPrefix(output, tt.wantPanicPrefix) {
+				t.Fatalf("%q\nis not present in\n%s", tt.wantPanicPrefix, output)
+			}
+		})
+	}
+}
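
The wantPanicPrefix strings above come from testprog helpers (panicCustomBool, panicCustomInt, ...) defined elsewhere in the tree and not shown in this hunk. The behavior under test is, roughly, programs of this shape:

package main

type MyInt int

func main() {
	// With this change the runtime prints the underlying value of a
	// defined type whose kind is directly printable:
	//   panic: main.MyInt(93)
	// which is exactly the prefix the test asserts on.
	panic(MyInt(93))
}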
diff --git a/src/runtime/pprof/internal/profile/encode.go b/src/runtime/pprof/internal/profile/encode.go
deleted file mode 100644
index af31933..0000000
--- a/src/runtime/pprof/internal/profile/encode.go
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package profile
-
-import (
-	"errors"
-	"fmt"
-	"sort"
-)
-
-func (p *Profile) decoder() []decoder {
-	return profileDecoder
-}
-
-// preEncode populates the unexported fields to be used by encode
-// (with suffix X) from the corresponding exported fields. The
-// exported fields are cleared up to facilitate testing.
-func (p *Profile) preEncode() {
-	strings := make(map[string]int)
-	addString(strings, "")
-
-	for _, st := range p.SampleType {
-		st.typeX = addString(strings, st.Type)
-		st.unitX = addString(strings, st.Unit)
-	}
-
-	for _, s := range p.Sample {
-		s.labelX = nil
-		var keys []string
-		for k := range s.Label {
-			keys = append(keys, k)
-		}
-		sort.Strings(keys)
-		for _, k := range keys {
-			vs := s.Label[k]
-			for _, v := range vs {
-				s.labelX = append(s.labelX,
-					Label{
-						keyX: addString(strings, k),
-						strX: addString(strings, v),
-					},
-				)
-			}
-		}
-		var numKeys []string
-		for k := range s.NumLabel {
-			numKeys = append(numKeys, k)
-		}
-		sort.Strings(numKeys)
-		for _, k := range numKeys {
-			vs := s.NumLabel[k]
-			for _, v := range vs {
-				s.labelX = append(s.labelX,
-					Label{
-						keyX: addString(strings, k),
-						numX: v,
-					},
-				)
-			}
-		}
-		s.locationIDX = nil
-		for _, l := range s.Location {
-			s.locationIDX = append(s.locationIDX, l.ID)
-		}
-	}
-
-	for _, m := range p.Mapping {
-		m.fileX = addString(strings, m.File)
-		m.buildIDX = addString(strings, m.BuildID)
-	}
-
-	for _, l := range p.Location {
-		for i, ln := range l.Line {
-			if ln.Function != nil {
-				l.Line[i].functionIDX = ln.Function.ID
-			} else {
-				l.Line[i].functionIDX = 0
-			}
-		}
-		if l.Mapping != nil {
-			l.mappingIDX = l.Mapping.ID
-		} else {
-			l.mappingIDX = 0
-		}
-	}
-	for _, f := range p.Function {
-		f.nameX = addString(strings, f.Name)
-		f.systemNameX = addString(strings, f.SystemName)
-		f.filenameX = addString(strings, f.Filename)
-	}
-
-	p.dropFramesX = addString(strings, p.DropFrames)
-	p.keepFramesX = addString(strings, p.KeepFrames)
-
-	if pt := p.PeriodType; pt != nil {
-		pt.typeX = addString(strings, pt.Type)
-		pt.unitX = addString(strings, pt.Unit)
-	}
-
-	p.stringTable = make([]string, len(strings))
-	for s, i := range strings {
-		p.stringTable[i] = s
-	}
-}
-
-func (p *Profile) encode(b *buffer) {
-	for _, x := range p.SampleType {
-		encodeMessage(b, 1, x)
-	}
-	for _, x := range p.Sample {
-		encodeMessage(b, 2, x)
-	}
-	for _, x := range p.Mapping {
-		encodeMessage(b, 3, x)
-	}
-	for _, x := range p.Location {
-		encodeMessage(b, 4, x)
-	}
-	for _, x := range p.Function {
-		encodeMessage(b, 5, x)
-	}
-	encodeStrings(b, 6, p.stringTable)
-	encodeInt64Opt(b, 7, p.dropFramesX)
-	encodeInt64Opt(b, 8, p.keepFramesX)
-	encodeInt64Opt(b, 9, p.TimeNanos)
-	encodeInt64Opt(b, 10, p.DurationNanos)
-	if pt := p.PeriodType; pt != nil && (pt.typeX != 0 || pt.unitX != 0) {
-		encodeMessage(b, 11, p.PeriodType)
-	}
-	encodeInt64Opt(b, 12, p.Period)
-}
-
-var profileDecoder = []decoder{
-	nil, // 0
-	// repeated ValueType sample_type = 1
-	func(b *buffer, m message) error {
-		x := new(ValueType)
-		pp := m.(*Profile)
-		pp.SampleType = append(pp.SampleType, x)
-		return decodeMessage(b, x)
-	},
-	// repeated Sample sample = 2
-	func(b *buffer, m message) error {
-		x := new(Sample)
-		pp := m.(*Profile)
-		pp.Sample = append(pp.Sample, x)
-		return decodeMessage(b, x)
-	},
-	// repeated Mapping mapping = 3
-	func(b *buffer, m message) error {
-		x := new(Mapping)
-		pp := m.(*Profile)
-		pp.Mapping = append(pp.Mapping, x)
-		return decodeMessage(b, x)
-	},
-	// repeated Location location = 4
-	func(b *buffer, m message) error {
-		x := new(Location)
-		pp := m.(*Profile)
-		pp.Location = append(pp.Location, x)
-		return decodeMessage(b, x)
-	},
-	// repeated Function function = 5
-	func(b *buffer, m message) error {
-		x := new(Function)
-		pp := m.(*Profile)
-		pp.Function = append(pp.Function, x)
-		return decodeMessage(b, x)
-	},
-	// repeated string string_table = 6
-	func(b *buffer, m message) error {
-		err := decodeStrings(b, &m.(*Profile).stringTable)
-		if err != nil {
-			return err
-		}
-		if *&m.(*Profile).stringTable[0] != "" {
-			return errors.New("string_table[0] must be ''")
-		}
-		return nil
-	},
-	// repeated int64 drop_frames = 7
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).dropFramesX) },
-	// repeated int64 keep_frames = 8
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).keepFramesX) },
-	// repeated int64 time_nanos = 9
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).TimeNanos) },
-	// repeated int64 duration_nanos = 10
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).DurationNanos) },
-	// optional string period_type = 11
-	func(b *buffer, m message) error {
-		x := new(ValueType)
-		pp := m.(*Profile)
-		pp.PeriodType = x
-		return decodeMessage(b, x)
-	},
-	// repeated int64 period = 12
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).Period) },
-	// repeated int64 comment = 13
-	func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Profile).commentX) },
-	// int64 defaultSampleType = 14
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).defaultSampleTypeX) },
-}
-
-// postDecode takes the unexported fields populated by decode (with
-// suffix X) and populates the corresponding exported fields.
-// The unexported fields are cleared up to facilitate testing.
-func (p *Profile) postDecode() error {
-	var err error
-
-	mappings := make(map[uint64]*Mapping)
-	for _, m := range p.Mapping {
-		m.File, err = getString(p.stringTable, &m.fileX, err)
-		m.BuildID, err = getString(p.stringTable, &m.buildIDX, err)
-		mappings[m.ID] = m
-	}
-
-	functions := make(map[uint64]*Function)
-	for _, f := range p.Function {
-		f.Name, err = getString(p.stringTable, &f.nameX, err)
-		f.SystemName, err = getString(p.stringTable, &f.systemNameX, err)
-		f.Filename, err = getString(p.stringTable, &f.filenameX, err)
-		functions[f.ID] = f
-	}
-
-	locations := make(map[uint64]*Location)
-	for _, l := range p.Location {
-		l.Mapping = mappings[l.mappingIDX]
-		l.mappingIDX = 0
-		for i, ln := range l.Line {
-			if id := ln.functionIDX; id != 0 {
-				l.Line[i].Function = functions[id]
-				if l.Line[i].Function == nil {
-					return fmt.Errorf("Function ID %d not found", id)
-				}
-				l.Line[i].functionIDX = 0
-			}
-		}
-		locations[l.ID] = l
-	}
-
-	for _, st := range p.SampleType {
-		st.Type, err = getString(p.stringTable, &st.typeX, err)
-		st.Unit, err = getString(p.stringTable, &st.unitX, err)
-	}
-
-	for _, s := range p.Sample {
-		labels := make(map[string][]string)
-		numLabels := make(map[string][]int64)
-		for _, l := range s.labelX {
-			var key, value string
-			key, err = getString(p.stringTable, &l.keyX, err)
-			if l.strX != 0 {
-				value, err = getString(p.stringTable, &l.strX, err)
-				labels[key] = append(labels[key], value)
-			} else {
-				numLabels[key] = append(numLabels[key], l.numX)
-			}
-		}
-		if len(labels) > 0 {
-			s.Label = labels
-		}
-		if len(numLabels) > 0 {
-			s.NumLabel = numLabels
-		}
-		s.Location = nil
-		for _, lid := range s.locationIDX {
-			s.Location = append(s.Location, locations[lid])
-		}
-		s.locationIDX = nil
-	}
-
-	p.DropFrames, err = getString(p.stringTable, &p.dropFramesX, err)
-	p.KeepFrames, err = getString(p.stringTable, &p.keepFramesX, err)
-
-	if pt := p.PeriodType; pt == nil {
-		p.PeriodType = &ValueType{}
-	}
-
-	if pt := p.PeriodType; pt != nil {
-		pt.Type, err = getString(p.stringTable, &pt.typeX, err)
-		pt.Unit, err = getString(p.stringTable, &pt.unitX, err)
-	}
-	for _, i := range p.commentX {
-		var c string
-		c, err = getString(p.stringTable, &i, err)
-		p.Comments = append(p.Comments, c)
-	}
-
-	p.commentX = nil
-	p.DefaultSampleType, err = getString(p.stringTable, &p.defaultSampleTypeX, err)
-	p.stringTable = nil
-	return nil
-}
-
-func (p *ValueType) decoder() []decoder {
-	return valueTypeDecoder
-}
-
-func (p *ValueType) encode(b *buffer) {
-	encodeInt64Opt(b, 1, p.typeX)
-	encodeInt64Opt(b, 2, p.unitX)
-}
-
-var valueTypeDecoder = []decoder{
-	nil, // 0
-	// optional int64 type = 1
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).typeX) },
-	// optional int64 unit = 2
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).unitX) },
-}
-
-func (p *Sample) decoder() []decoder {
-	return sampleDecoder
-}
-
-func (p *Sample) encode(b *buffer) {
-	encodeUint64s(b, 1, p.locationIDX)
-	for _, x := range p.Value {
-		encodeInt64(b, 2, x)
-	}
-	for _, x := range p.labelX {
-		encodeMessage(b, 3, x)
-	}
-}
-
-var sampleDecoder = []decoder{
-	nil, // 0
-	// repeated uint64 location = 1
-	func(b *buffer, m message) error { return decodeUint64s(b, &m.(*Sample).locationIDX) },
-	// repeated int64 value = 2
-	func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Sample).Value) },
-	// repeated Label label = 3
-	func(b *buffer, m message) error {
-		s := m.(*Sample)
-		n := len(s.labelX)
-		s.labelX = append(s.labelX, Label{})
-		return decodeMessage(b, &s.labelX[n])
-	},
-}
-
-func (p Label) decoder() []decoder {
-	return labelDecoder
-}
-
-func (p Label) encode(b *buffer) {
-	encodeInt64Opt(b, 1, p.keyX)
-	encodeInt64Opt(b, 2, p.strX)
-	encodeInt64Opt(b, 3, p.numX)
-}
-
-var labelDecoder = []decoder{
-	nil, // 0
-	// optional int64 key = 1
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).keyX) },
-	// optional int64 str = 2
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).strX) },
-	// optional int64 num = 3
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).numX) },
-}
-
-func (p *Mapping) decoder() []decoder {
-	return mappingDecoder
-}
-
-func (p *Mapping) encode(b *buffer) {
-	encodeUint64Opt(b, 1, p.ID)
-	encodeUint64Opt(b, 2, p.Start)
-	encodeUint64Opt(b, 3, p.Limit)
-	encodeUint64Opt(b, 4, p.Offset)
-	encodeInt64Opt(b, 5, p.fileX)
-	encodeInt64Opt(b, 6, p.buildIDX)
-	encodeBoolOpt(b, 7, p.HasFunctions)
-	encodeBoolOpt(b, 8, p.HasFilenames)
-	encodeBoolOpt(b, 9, p.HasLineNumbers)
-	encodeBoolOpt(b, 10, p.HasInlineFrames)
-}
-
-var mappingDecoder = []decoder{
-	nil, // 0
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).ID) },            // optional uint64 id = 1
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Start) },         // optional uint64 memory_offset = 2
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Limit) },         // optional uint64 memory_limit = 3
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Offset) },        // optional uint64 file_offset = 4
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).fileX) },          // optional int64 filename = 5
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).buildIDX) },       // optional int64 build_id = 6
-	func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFunctions) },    // optional bool has_functions = 7
-	func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFilenames) },    // optional bool has_filenames = 8
-	func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasLineNumbers) },  // optional bool has_line_numbers = 9
-	func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasInlineFrames) }, // optional bool has_inline_frames = 10
-}
-
-func (p *Location) decoder() []decoder {
-	return locationDecoder
-}
-
-func (p *Location) encode(b *buffer) {
-	encodeUint64Opt(b, 1, p.ID)
-	encodeUint64Opt(b, 2, p.mappingIDX)
-	encodeUint64Opt(b, 3, p.Address)
-	for i := range p.Line {
-		encodeMessage(b, 4, &p.Line[i])
-	}
-}
-
-var locationDecoder = []decoder{
-	nil, // 0
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).ID) },         // optional uint64 id = 1;
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).mappingIDX) }, // optional uint64 mapping_id = 2;
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).Address) },    // optional uint64 address = 3;
-	func(b *buffer, m message) error { // repeated Line line = 4
-		pp := m.(*Location)
-		n := len(pp.Line)
-		pp.Line = append(pp.Line, Line{})
-		return decodeMessage(b, &pp.Line[n])
-	},
-}
-
-func (p *Line) decoder() []decoder {
-	return lineDecoder
-}
-
-func (p *Line) encode(b *buffer) {
-	encodeUint64Opt(b, 1, p.functionIDX)
-	encodeInt64Opt(b, 2, p.Line)
-}
-
-var lineDecoder = []decoder{
-	nil, // 0
-	// optional uint64 function_id = 1
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Line).functionIDX) },
-	// optional int64 line = 2
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Line) },
-}
-
-func (p *Function) decoder() []decoder {
-	return functionDecoder
-}
-
-func (p *Function) encode(b *buffer) {
-	encodeUint64Opt(b, 1, p.ID)
-	encodeInt64Opt(b, 2, p.nameX)
-	encodeInt64Opt(b, 3, p.systemNameX)
-	encodeInt64Opt(b, 4, p.filenameX)
-	encodeInt64Opt(b, 5, p.StartLine)
-}
-
-var functionDecoder = []decoder{
-	nil, // 0
-	// optional uint64 id = 1
-	func(b *buffer, m message) error { return decodeUint64(b, &m.(*Function).ID) },
-	// optional int64 function_name = 2
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).nameX) },
-	// optional int64 function_system_name = 3
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).systemNameX) },
-	// repeated int64 filename = 4
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).filenameX) },
-	// optional int64 start_line = 5
-	func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).StartLine) },
-}
-
-func addString(strings map[string]int, s string) int64 {
-	i, ok := strings[s]
-	if !ok {
-		i = len(strings)
-		strings[s] = i
-	}
-	return int64(i)
-}
-
-func getString(strings []string, strng *int64, err error) (string, error) {
-	if err != nil {
-		return "", err
-	}
-	s := int(*strng)
-	if s < 0 || s >= len(strings) {
-		return "", errMalformed
-	}
-	*strng = 0
-	return strings[s], nil
-}
diff --git a/src/runtime/pprof/internal/profile/filter.go b/src/runtime/pprof/internal/profile/filter.go
deleted file mode 100644
index 9cad866..0000000
--- a/src/runtime/pprof/internal/profile/filter.go
+++ /dev/null
@@ -1,158 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Implements methods to filter samples from profiles.
-
-package profile
-
-import "regexp"
-
-// FilterSamplesByName filters the samples in a profile and only keeps
-// samples where at least one frame matches focus but none match ignore.
-// Returns true is the corresponding regexp matched at least one sample.
-func (p *Profile) FilterSamplesByName(focus, ignore, hide *regexp.Regexp) (fm, im, hm bool) {
-	focusOrIgnore := make(map[uint64]bool)
-	hidden := make(map[uint64]bool)
-	for _, l := range p.Location {
-		if ignore != nil && l.matchesName(ignore) {
-			im = true
-			focusOrIgnore[l.ID] = false
-		} else if focus == nil || l.matchesName(focus) {
-			fm = true
-			focusOrIgnore[l.ID] = true
-		}
-		if hide != nil && l.matchesName(hide) {
-			hm = true
-			l.Line = l.unmatchedLines(hide)
-			if len(l.Line) == 0 {
-				hidden[l.ID] = true
-			}
-		}
-	}
-
-	s := make([]*Sample, 0, len(p.Sample))
-	for _, sample := range p.Sample {
-		if focusedAndNotIgnored(sample.Location, focusOrIgnore) {
-			if len(hidden) > 0 {
-				var locs []*Location
-				for _, loc := range sample.Location {
-					if !hidden[loc.ID] {
-						locs = append(locs, loc)
-					}
-				}
-				if len(locs) == 0 {
-					// Remove sample with no locations (by not adding it to s).
-					continue
-				}
-				sample.Location = locs
-			}
-			s = append(s, sample)
-		}
-	}
-	p.Sample = s
-
-	return
-}
-
-// matchesName reports whether the function name or file in the
-// location matches the regular expression.
-func (loc *Location) matchesName(re *regexp.Regexp) bool {
-	for _, ln := range loc.Line {
-		if fn := ln.Function; fn != nil {
-			if re.MatchString(fn.Name) {
-				return true
-			}
-			if re.MatchString(fn.Filename) {
-				return true
-			}
-		}
-	}
-	return false
-}
-
-// unmatchedLines returns the lines in the location that do not match
-// the regular expression.
-func (loc *Location) unmatchedLines(re *regexp.Regexp) []Line {
-	var lines []Line
-	for _, ln := range loc.Line {
-		if fn := ln.Function; fn != nil {
-			if re.MatchString(fn.Name) {
-				continue
-			}
-			if re.MatchString(fn.Filename) {
-				continue
-			}
-		}
-		lines = append(lines, ln)
-	}
-	return lines
-}
-
-// focusedAndNotIgnored looks up a slice of ids against a map of
-// focused/ignored locations. The map only contains locations that are
-// explicitly focused or ignored. Returns whether there is at least
-// one focused location but no ignored locations.
-func focusedAndNotIgnored(locs []*Location, m map[uint64]bool) bool {
-	var f bool
-	for _, loc := range locs {
-		if focus, focusOrIgnore := m[loc.ID]; focusOrIgnore {
-			if focus {
-				// Found focused location. Must keep searching in case there
-				// is an ignored one as well.
-				f = true
-			} else {
-				// Found ignored location. Can return false right away.
-				return false
-			}
-		}
-	}
-	return f
-}
-
-// TagMatch selects tags for filtering
-type TagMatch func(key, val string, nval int64) bool
-
-// FilterSamplesByTag removes all samples from the profile, except
-// those that match focus and do not match the ignore regular
-// expression.
-func (p *Profile) FilterSamplesByTag(focus, ignore TagMatch) (fm, im bool) {
-	samples := make([]*Sample, 0, len(p.Sample))
-	for _, s := range p.Sample {
-		focused, ignored := focusedSample(s, focus, ignore)
-		fm = fm || focused
-		im = im || ignored
-		if focused && !ignored {
-			samples = append(samples, s)
-		}
-	}
-	p.Sample = samples
-	return
-}
-
-// focusedTag checks a sample against focus and ignore regexps.
-// Returns whether the focus/ignore regexps match any tags
-func focusedSample(s *Sample, focus, ignore TagMatch) (fm, im bool) {
-	fm = focus == nil
-	for key, vals := range s.Label {
-		for _, val := range vals {
-			if ignore != nil && ignore(key, val, 0) {
-				im = true
-			}
-			if !fm && focus(key, val, 0) {
-				fm = true
-			}
-		}
-	}
-	for key, vals := range s.NumLabel {
-		for _, val := range vals {
-			if ignore != nil && ignore(key, "", val) {
-				im = true
-			}
-			if !fm && focus(key, "", val) {
-				fm = true
-			}
-		}
-	}
-	return fm, im
-}
diff --git a/src/runtime/pprof/internal/profile/legacy_profile.go b/src/runtime/pprof/internal/profile/legacy_profile.go
deleted file mode 100644
index d69f8de..0000000
--- a/src/runtime/pprof/internal/profile/legacy_profile.go
+++ /dev/null
@@ -1,1266 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file implements parsers to convert legacy profiles into the
-// profile.proto format.
-
-package profile
-
-import (
-	"bufio"
-	"bytes"
-	"fmt"
-	"io"
-	"math"
-	"regexp"
-	"strconv"
-	"strings"
-)
-
-var (
-	countStartRE = regexp.MustCompile(`\A(\w+) profile: total \d+\n\z`)
-	countRE      = regexp.MustCompile(`\A(\d+) @(( 0x[0-9a-f]+)+)\n\z`)
-
-	heapHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] *@ *(heap[_a-z0-9]*)/?(\d*)`)
-	heapSampleRE = regexp.MustCompile(`(-?\d+): *(-?\d+) *\[ *(\d+): *(\d+) *] @([ x0-9a-f]*)`)
-
-	contentionSampleRE = regexp.MustCompile(`(\d+) *(\d+) @([ x0-9a-f]*)`)
-
-	hexNumberRE = regexp.MustCompile(`0x[0-9a-f]+`)
-
-	growthHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ growthz`)
-
-	fragmentationHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ fragmentationz`)
-
-	threadzStartRE = regexp.MustCompile(`--- threadz \d+ ---`)
-	threadStartRE  = regexp.MustCompile(`--- Thread ([[:xdigit:]]+) \(name: (.*)/(\d+)\) stack: ---`)
-
-	procMapsRE = regexp.MustCompile(`([[:xdigit:]]+)-([[:xdigit:]]+)\s+([-rwxp]+)\s+([[:xdigit:]]+)\s+([[:xdigit:]]+):([[:xdigit:]]+)\s+([[:digit:]]+)\s*(\S+)?`)
-
-	briefMapsRE = regexp.MustCompile(`\s*([[:xdigit:]]+)-([[:xdigit:]]+):\s*(\S+)(\s.*@)?([[:xdigit:]]+)?`)
-
-	// LegacyHeapAllocated instructs the heapz parsers to use the
-	// allocated memory stats instead of the default in-use memory. Note
-	// that tcmalloc doesn't provide all allocated memory, only in-use
-	// stats.
-	LegacyHeapAllocated bool
-)
-
-func isSpaceOrComment(line string) bool {
-	trimmed := strings.TrimSpace(line)
-	return len(trimmed) == 0 || trimmed[0] == '#'
-}
-
-// parseGoCount parses a Go count profile (e.g., threadcreate or
-// goroutine) and returns a new Profile.
-func parseGoCount(b []byte) (*Profile, error) {
-	r := bytes.NewBuffer(b)
-
-	var line string
-	var err error
-	for {
-		// Skip past comments and empty lines seeking a real header.
-		line, err = r.ReadString('\n')
-		if err != nil {
-			return nil, err
-		}
-		if !isSpaceOrComment(line) {
-			break
-		}
-	}
-
-	m := countStartRE.FindStringSubmatch(line)
-	if m == nil {
-		return nil, errUnrecognized
-	}
-	profileType := m[1]
-	p := &Profile{
-		PeriodType: &ValueType{Type: profileType, Unit: "count"},
-		Period:     1,
-		SampleType: []*ValueType{{Type: profileType, Unit: "count"}},
-	}
-	locations := make(map[uint64]*Location)
-	for {
-		line, err = r.ReadString('\n')
-		if err != nil {
-			if err == io.EOF {
-				break
-			}
-			return nil, err
-		}
-		if isSpaceOrComment(line) {
-			continue
-		}
-		if strings.HasPrefix(line, "---") {
-			break
-		}
-		m := countRE.FindStringSubmatch(line)
-		if m == nil {
-			return nil, errMalformed
-		}
-		n, err := strconv.ParseInt(m[1], 0, 64)
-		if err != nil {
-			return nil, errMalformed
-		}
-		fields := strings.Fields(m[2])
-		locs := make([]*Location, 0, len(fields))
-		for _, stk := range fields {
-			addr, err := strconv.ParseUint(stk, 0, 64)
-			if err != nil {
-				return nil, errMalformed
-			}
-			// Adjust all frames by -1 to land on the call instruction.
-			addr--
-			loc := locations[addr]
-			if loc == nil {
-				loc = &Location{
-					Address: addr,
-				}
-				locations[addr] = loc
-				p.Location = append(p.Location, loc)
-			}
-			locs = append(locs, loc)
-		}
-		p.Sample = append(p.Sample, &Sample{
-			Location: locs,
-			Value:    []int64{n},
-		})
-	}
-
-	if err = parseAdditionalSections(strings.TrimSpace(line), r, p); err != nil {
-		return nil, err
-	}
-	return p, nil
-}
-
-// remapLocationIDs ensures there is a location for each address
-// referenced by a sample, and remaps the samples to point to the new
-// location ids.
-func (p *Profile) remapLocationIDs() {
-	seen := make(map[*Location]bool, len(p.Location))
-	var locs []*Location
-
-	for _, s := range p.Sample {
-		for _, l := range s.Location {
-			if seen[l] {
-				continue
-			}
-			l.ID = uint64(len(locs) + 1)
-			locs = append(locs, l)
-			seen[l] = true
-		}
-	}
-	p.Location = locs
-}
-
-func (p *Profile) remapFunctionIDs() {
-	seen := make(map[*Function]bool, len(p.Function))
-	var fns []*Function
-
-	for _, l := range p.Location {
-		for _, ln := range l.Line {
-			fn := ln.Function
-			if fn == nil || seen[fn] {
-				continue
-			}
-			fn.ID = uint64(len(fns) + 1)
-			fns = append(fns, fn)
-			seen[fn] = true
-		}
-	}
-	p.Function = fns
-}
-
-// remapMappingIDs matches location addresses with existing mappings
-// and updates them appropriately. This is O(N*M), if this ever shows
-// up as a bottleneck, evaluate sorting the mappings and doing a
-// binary search, which would make it O(N*log(M)).
-func (p *Profile) remapMappingIDs() {
-	if len(p.Mapping) == 0 {
-		return
-	}
-
-	// Some profile handlers will incorrectly set regions for the main
-	// executable if its section is remapped. Fix them through heuristics.
-
-	// Remove the initial mapping if named '/anon_hugepage' and has a
-	// consecutive adjacent mapping.
-	if m := p.Mapping[0]; strings.HasPrefix(m.File, "/anon_hugepage") {
-		if len(p.Mapping) > 1 && m.Limit == p.Mapping[1].Start {
-			p.Mapping = p.Mapping[1:]
-		}
-	}
-
-	// Subtract the offset from the start of the main mapping if it
-	// ends up at a recognizable start address.
-	const expectedStart = 0x400000
-	if m := p.Mapping[0]; m.Start-m.Offset == expectedStart {
-		m.Start = expectedStart
-		m.Offset = 0
-	}
-
-	for _, l := range p.Location {
-		if a := l.Address; a != 0 {
-			for _, m := range p.Mapping {
-				if m.Start <= a && a < m.Limit {
-					l.Mapping = m
-					break
-				}
-			}
-		}
-	}
-
-	// Reset all mapping IDs.
-	for i, m := range p.Mapping {
-		m.ID = uint64(i + 1)
-	}
-}
-
-var cpuInts = []func([]byte) (uint64, []byte){
-	get32l,
-	get32b,
-	get64l,
-	get64b,
-}
-
-func get32l(b []byte) (uint64, []byte) {
-	if len(b) < 4 {
-		return 0, nil
-	}
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24, b[4:]
-}
-
-func get32b(b []byte) (uint64, []byte) {
-	if len(b) < 4 {
-		return 0, nil
-	}
-	return uint64(b[3]) | uint64(b[2])<<8 | uint64(b[1])<<16 | uint64(b[0])<<24, b[4:]
-}
-
-func get64l(b []byte) (uint64, []byte) {
-	if len(b) < 8 {
-		return 0, nil
-	}
-	return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56, b[8:]
-}
-
-func get64b(b []byte) (uint64, []byte) {
-	if len(b) < 8 {
-		return 0, nil
-	}
-	return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56, b[8:]
-}
-
-// ParseTracebacks parses a set of tracebacks and returns a newly
-// populated profile. It will accept any text file and generate a
-// Profile out of it with any hex addresses it can identify, including
-// a process map if it can recognize one. Each sample will include a
-// tag "source" with the addresses recognized in string format.
-func ParseTracebacks(b []byte) (*Profile, error) {
-	r := bytes.NewBuffer(b)
-
-	p := &Profile{
-		PeriodType: &ValueType{Type: "trace", Unit: "count"},
-		Period:     1,
-		SampleType: []*ValueType{
-			{Type: "trace", Unit: "count"},
-		},
-	}
-
-	var sources []string
-	var sloc []*Location
-
-	locs := make(map[uint64]*Location)
-	for {
-		l, err := r.ReadString('\n')
-		if err != nil {
-			if err != io.EOF {
-				return nil, err
-			}
-			if l == "" {
-				break
-			}
-		}
-		if sectionTrigger(l) == memoryMapSection {
-			break
-		}
-		if s, addrs := extractHexAddresses(l); len(s) > 0 {
-			for _, addr := range addrs {
-				// Addresses from stack traces point to the next instruction after
-				// each call. Adjust by -1 to land somewhere on the actual call.
-				addr--
-				loc := locs[addr]
-				if locs[addr] == nil {
-					loc = &Location{
-						Address: addr,
-					}
-					p.Location = append(p.Location, loc)
-					locs[addr] = loc
-				}
-				sloc = append(sloc, loc)
-			}
-
-			sources = append(sources, s...)
-		} else {
-			if len(sources) > 0 || len(sloc) > 0 {
-				addTracebackSample(sloc, sources, p)
-				sloc, sources = nil, nil
-			}
-		}
-	}
-
-	// Add final sample to save any leftover data.
-	if len(sources) > 0 || len(sloc) > 0 {
-		addTracebackSample(sloc, sources, p)
-	}
-
-	if err := p.ParseMemoryMap(r); err != nil {
-		return nil, err
-	}
-	return p, nil
-}
-
-func addTracebackSample(l []*Location, s []string, p *Profile) {
-	p.Sample = append(p.Sample,
-		&Sample{
-			Value:    []int64{1},
-			Location: l,
-			Label:    map[string][]string{"source": s},
-		})
-}
-
-// parseCPU parses a profilez legacy profile and returns a newly
-// populated Profile.
-//
-// The general format for profilez samples is a sequence of words in
-// binary format. The first words are a header with the following data:
-//   1st word -- 0
-//   2nd word -- 3
-//   3rd word -- 0 if a c++ application, 1 if a java application.
-//   4th word -- Sampling period (in microseconds).
-//   5th word -- Padding.
-func parseCPU(b []byte) (*Profile, error) {
-	var parse func([]byte) (uint64, []byte)
-	var n1, n2, n3, n4, n5 uint64
-	for _, parse = range cpuInts {
-		var tmp []byte
-		n1, tmp = parse(b)
-		n2, tmp = parse(tmp)
-		n3, tmp = parse(tmp)
-		n4, tmp = parse(tmp)
-		n5, tmp = parse(tmp)
-
-		if tmp != nil && n1 == 0 && n2 == 3 && n3 == 0 && n4 > 0 && n5 == 0 {
-			b = tmp
-			return cpuProfile(b, int64(n4), parse)
-		}
-	}
-	return nil, errUnrecognized
-}
-
-// cpuProfile returns a new Profile from C++ profilez data.
-// b is the profile bytes after the header, period is the profiling
-// period, and parse is a function to parse 8-byte chunks from the
-// profile in its native endianness.
-func cpuProfile(b []byte, period int64, parse func(b []byte) (uint64, []byte)) (*Profile, error) {
-	p := &Profile{
-		Period:     period * 1000,
-		PeriodType: &ValueType{Type: "cpu", Unit: "nanoseconds"},
-		SampleType: []*ValueType{
-			{Type: "samples", Unit: "count"},
-			{Type: "cpu", Unit: "nanoseconds"},
-		},
-	}
-	var err error
-	if b, _, err = parseCPUSamples(b, parse, true, p); err != nil {
-		return nil, err
-	}
-
-	// If all samples have the same second-to-the-bottom frame, it
-	// strongly suggests that it is an uninteresting artifact of
-	// measurement -- a stack frame pushed by the signal handler. The
-	// bottom frame is always correct as it is picked up from the signal
-	// structure, not the stack. Check if this is the case and if so,
-	// remove.
-	if len(p.Sample) > 1 && len(p.Sample[0].Location) > 1 {
-		allSame := true
-		id1 := p.Sample[0].Location[1].Address
-		for _, s := range p.Sample {
-			if len(s.Location) < 2 || id1 != s.Location[1].Address {
-				allSame = false
-				break
-			}
-		}
-		if allSame {
-			for _, s := range p.Sample {
-				s.Location = append(s.Location[:1], s.Location[2:]...)
-			}
-		}
-	}
-
-	if err := p.ParseMemoryMap(bytes.NewBuffer(b)); err != nil {
-		return nil, err
-	}
-	return p, nil
-}
-
-// parseCPUSamples parses a collection of profilez samples from a
-// profile.
-//
-// profilez samples are a repeated sequence of stack frames of the
-// form:
-//    1st word -- The number of times this stack was encountered.
-//    2nd word -- The size of the stack (StackSize).
-//    3rd word -- The first address on the stack.
-//    ...
-//    StackSize + 2 -- The last address on the stack
-// The last stack trace is of the form:
-//   1st word -- 0
-//   2nd word -- 1
-//   3rd word -- 0
-//
-// Addresses from stack traces may point to the next instruction after
-// each call. Optionally adjust by -1 to land somewhere on the actual
-// call (except for the leaf, which is not a call).
-func parseCPUSamples(b []byte, parse func(b []byte) (uint64, []byte), adjust bool, p *Profile) ([]byte, map[uint64]*Location, error) {
-	locs := make(map[uint64]*Location)
-	for len(b) > 0 {
-		var count, nstk uint64
-		count, b = parse(b)
-		nstk, b = parse(b)
-		if b == nil || nstk > uint64(len(b)/4) {
-			return nil, nil, errUnrecognized
-		}
-		var sloc []*Location
-		addrs := make([]uint64, nstk)
-		for i := 0; i < int(nstk); i++ {
-			addrs[i], b = parse(b)
-		}
-
-		if count == 0 && nstk == 1 && addrs[0] == 0 {
-			// End of data marker
-			break
-		}
-		for i, addr := range addrs {
-			if adjust && i > 0 {
-				addr--
-			}
-			loc := locs[addr]
-			if loc == nil {
-				loc = &Location{
-					Address: addr,
-				}
-				locs[addr] = loc
-				p.Location = append(p.Location, loc)
-			}
-			sloc = append(sloc, loc)
-		}
-		p.Sample = append(p.Sample,
-			&Sample{
-				Value:    []int64{int64(count), int64(count) * p.Period},
-				Location: sloc,
-			})
-	}
-	// Reached the end without finding the EOD marker.
-	return b, locs, nil
-}
-
-// parseHeap parses a heapz legacy or a growthz profile and
-// returns a newly populated Profile.
-func parseHeap(b []byte) (p *Profile, err error) {
-	r := bytes.NewBuffer(b)
-	l, err := r.ReadString('\n')
-	if err != nil {
-		return nil, errUnrecognized
-	}
-
-	sampling := ""
-
-	if header := heapHeaderRE.FindStringSubmatch(l); header != nil {
-		p = &Profile{
-			SampleType: []*ValueType{
-				{Type: "objects", Unit: "count"},
-				{Type: "space", Unit: "bytes"},
-			},
-			PeriodType: &ValueType{Type: "objects", Unit: "bytes"},
-		}
-
-		var period int64
-		if len(header[6]) > 0 {
-			if period, err = strconv.ParseInt(header[6], 10, 64); err != nil {
-				return nil, errUnrecognized
-			}
-		}
-
-		switch header[5] {
-		case "heapz_v2", "heap_v2":
-			sampling, p.Period = "v2", period
-		case "heapprofile":
-			sampling, p.Period = "", 1
-		case "heap":
-			sampling, p.Period = "v2", period/2
-		default:
-			return nil, errUnrecognized
-		}
-	} else if header = growthHeaderRE.FindStringSubmatch(l); header != nil {
-		p = &Profile{
-			SampleType: []*ValueType{
-				{Type: "objects", Unit: "count"},
-				{Type: "space", Unit: "bytes"},
-			},
-			PeriodType: &ValueType{Type: "heapgrowth", Unit: "count"},
-			Period:     1,
-		}
-	} else if header = fragmentationHeaderRE.FindStringSubmatch(l); header != nil {
-		p = &Profile{
-			SampleType: []*ValueType{
-				{Type: "objects", Unit: "count"},
-				{Type: "space", Unit: "bytes"},
-			},
-			PeriodType: &ValueType{Type: "allocations", Unit: "count"},
-			Period:     1,
-		}
-	} else {
-		return nil, errUnrecognized
-	}
-
-	if LegacyHeapAllocated {
-		for _, st := range p.SampleType {
-			st.Type = "alloc_" + st.Type
-		}
-	} else {
-		for _, st := range p.SampleType {
-			st.Type = "inuse_" + st.Type
-		}
-	}
-
-	locs := make(map[uint64]*Location)
-	for {
-		l, err = r.ReadString('\n')
-		if err != nil {
-			if err != io.EOF {
-				return nil, err
-			}
-
-			if l == "" {
-				break
-			}
-		}
-
-		if isSpaceOrComment(l) {
-			continue
-		}
-		l = strings.TrimSpace(l)
-
-		if sectionTrigger(l) != unrecognizedSection {
-			break
-		}
-
-		value, blocksize, addrs, err := parseHeapSample(l, p.Period, sampling)
-		if err != nil {
-			return nil, err
-		}
-		var sloc []*Location
-		for _, addr := range addrs {
-			// Addresses from stack traces point to the next instruction after
-			// each call. Adjust by -1 to land somewhere on the actual call.
-			addr--
-			loc := locs[addr]
-			if locs[addr] == nil {
-				loc = &Location{
-					Address: addr,
-				}
-				p.Location = append(p.Location, loc)
-				locs[addr] = loc
-			}
-			sloc = append(sloc, loc)
-		}
-
-		p.Sample = append(p.Sample, &Sample{
-			Value:    value,
-			Location: sloc,
-			NumLabel: map[string][]int64{"bytes": {blocksize}},
-		})
-	}
-
-	if err = parseAdditionalSections(l, r, p); err != nil {
-		return nil, err
-	}
-	return p, nil
-}
-
-// parseHeapSample parses a single row from a heap profile into a new Sample.
-func parseHeapSample(line string, rate int64, sampling string) (value []int64, blocksize int64, addrs []uint64, err error) {
-	sampleData := heapSampleRE.FindStringSubmatch(line)
-	if len(sampleData) != 6 {
-		return value, blocksize, addrs, fmt.Errorf("unexpected number of sample values: got %d, want 6", len(sampleData))
-	}
-
-	// Use first two values by default; tcmalloc sampling generates the
-	// same value for both, only the older heap-profile collect separate
-	// stats for in-use and allocated objects.
-	valueIndex := 1
-	if LegacyHeapAllocated {
-		valueIndex = 3
-	}
-
-	var v1, v2 int64
-	if v1, err = strconv.ParseInt(sampleData[valueIndex], 10, 64); err != nil {
-		return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
-	}
-	if v2, err = strconv.ParseInt(sampleData[valueIndex+1], 10, 64); err != nil {
-		return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
-	}
-
-	if v1 == 0 {
-		if v2 != 0 {
-			return value, blocksize, addrs, fmt.Errorf("allocation count was 0 but allocation bytes was %d", v2)
-		}
-	} else {
-		blocksize = v2 / v1
-		if sampling == "v2" {
-			v1, v2 = scaleHeapSample(v1, v2, rate)
-		}
-	}
-
-	value = []int64{v1, v2}
-	addrs = parseHexAddresses(sampleData[5])
-
-	return value, blocksize, addrs, nil
-}
-
-// extractHexAddresses extracts hex numbers from a string and returns
-// them, together with their numeric value, in a slice.
-func extractHexAddresses(s string) ([]string, []uint64) {
-	hexStrings := hexNumberRE.FindAllString(s, -1)
-	var ids []uint64
-	for _, s := range hexStrings {
-		if id, err := strconv.ParseUint(s, 0, 64); err == nil {
-			ids = append(ids, id)
-		} else {
-			// Do not expect any parsing failures due to the regexp matching.
-			panic("failed to parse hex value:" + s)
-		}
-	}
-	return hexStrings, ids
-}
-
-// parseHexAddresses parses hex numbers from a string and returns them
-// in a slice.
-func parseHexAddresses(s string) []uint64 {
-	_, ids := extractHexAddresses(s)
-	return ids
-}
-
-// scaleHeapSample adjusts the data from a heapz Sample to
-// account for its probability of appearing in the collected
-// data. heapz profiles are a sampling of the memory allocations
-// requests in a program. We estimate the unsampled value by dividing
-// each collected sample by its probability of appearing in the
-// profile. heapz v2 profiles rely on a poisson process to determine
-// which samples to collect, based on the desired average collection
-// rate R. The probability of a sample of size S to appear in that
-// profile is 1-exp(-S/R).
-func scaleHeapSample(count, size, rate int64) (int64, int64) {
-	if count == 0 || size == 0 {
-		return 0, 0
-	}
-
-	if rate <= 1 {
-		// if rate==1 all samples were collected so no adjustment is needed.
-		// if rate<1 treat as unknown and skip scaling.
-		return count, size
-	}
-
-	avgSize := float64(size) / float64(count)
-	scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
-
-	return int64(float64(count) * scale), int64(float64(size) * scale)
-}
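
To make the formula in the comment above concrete: with sampling rate R and average sample size S, each collected v2 heap sample is scaled up by 1/(1-exp(-S/R)). A hypothetical numeric check (values chosen purely for illustration):

package main

import (
	"fmt"
	"math"
)

func main() {
	const rate = 512 * 1024   // R: average bytes between samples
	const avgSize = 64 * 1024 // S: average size of the sampled allocations
	scale := 1 / (1 - math.Exp(-float64(avgSize)/float64(rate)))
	fmt.Printf("scale ≈ %.2f\n", scale) // ≈ 8.51
}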
-
-// parseContention parses a mutex or contention profile. There are 2 cases:
-// "--- contentionz " for legacy C++ profiles (and backwards compatibility)
-// "--- mutex:" or "--- contention:" for profiles generated by the Go runtime.
-// This code converts the text output from runtime into a *Profile. (In the future
-// the runtime might write a serialized Profile directly making this unnecessary.)
-func parseContention(b []byte) (*Profile, error) {
-	r := bytes.NewBuffer(b)
-	var l string
-	var err error
-	for {
-		// Skip past comments and empty lines seeking a real header.
-		l, err = r.ReadString('\n')
-		if err != nil {
-			return nil, err
-		}
-		if !isSpaceOrComment(l) {
-			break
-		}
-	}
-
-	if strings.HasPrefix(l, "--- contentionz ") {
-		return parseCppContention(r)
-	} else if strings.HasPrefix(l, "--- mutex:") {
-		return parseCppContention(r)
-	} else if strings.HasPrefix(l, "--- contention:") {
-		return parseCppContention(r)
-	}
-	return nil, errUnrecognized
-}
-
-// parseCppContention parses the output from synchronization_profiling.cc
-// for backward compatibility, and the compatible (non-debug) block profile
-// output from the Go runtime.
-func parseCppContention(r *bytes.Buffer) (*Profile, error) {
-	p := &Profile{
-		PeriodType: &ValueType{Type: "contentions", Unit: "count"},
-		Period:     1,
-		SampleType: []*ValueType{
-			{Type: "contentions", Unit: "count"},
-			{Type: "delay", Unit: "nanoseconds"},
-		},
-	}
-
-	var cpuHz int64
-	var l string
-	var err error
-	// Parse text of the form "attribute = value" before the samples.
-	const delimiter = "="
-	for {
-		l, err = r.ReadString('\n')
-		if err != nil {
-			if err != io.EOF {
-				return nil, err
-			}
-
-			if l == "" {
-				break
-			}
-		}
-		if isSpaceOrComment(l) {
-			continue
-		}
-
-		if l = strings.TrimSpace(l); l == "" {
-			continue
-		}
-
-		if strings.HasPrefix(l, "---") {
-			break
-		}
-
-		attr := strings.SplitN(l, delimiter, 2)
-		if len(attr) != 2 {
-			break
-		}
-		key, val := strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1])
-		var err error
-		switch key {
-		case "cycles/second":
-			if cpuHz, err = strconv.ParseInt(val, 0, 64); err != nil {
-				return nil, errUnrecognized
-			}
-		case "sampling period":
-			if p.Period, err = strconv.ParseInt(val, 0, 64); err != nil {
-				return nil, errUnrecognized
-			}
-		case "ms since reset":
-			ms, err := strconv.ParseInt(val, 0, 64)
-			if err != nil {
-				return nil, errUnrecognized
-			}
-			p.DurationNanos = ms * 1000 * 1000
-		case "format":
-			// CPP contentionz profiles don't have format.
-			return nil, errUnrecognized
-		case "resolution":
-			// CPP contentionz profiles don't have resolution.
-			return nil, errUnrecognized
-		case "discarded samples":
-		default:
-			return nil, errUnrecognized
-		}
-	}
-
-	locs := make(map[uint64]*Location)
-	for {
-		if !isSpaceOrComment(l) {
-			if l = strings.TrimSpace(l); strings.HasPrefix(l, "---") {
-				break
-			}
-			value, addrs, err := parseContentionSample(l, p.Period, cpuHz)
-			if err != nil {
-				return nil, err
-			}
-			var sloc []*Location
-			for _, addr := range addrs {
-				// Addresses from stack traces point to the next instruction after
-				// each call. Adjust by -1 to land somewhere on the actual call.
-				addr--
-				loc := locs[addr]
-				if locs[addr] == nil {
-					loc = &Location{
-						Address: addr,
-					}
-					p.Location = append(p.Location, loc)
-					locs[addr] = loc
-				}
-				sloc = append(sloc, loc)
-			}
-			p.Sample = append(p.Sample, &Sample{
-				Value:    value,
-				Location: sloc,
-			})
-		}
-
-		if l, err = r.ReadString('\n'); err != nil {
-			if err != io.EOF {
-				return nil, err
-			}
-			if l == "" {
-				break
-			}
-		}
-	}
-
-	if err = parseAdditionalSections(l, r, p); err != nil {
-		return nil, err
-	}
-
-	return p, nil
-}
-
-// parseContentionSample parses a single row from a contention profile
-// into a new Sample.
-func parseContentionSample(line string, period, cpuHz int64) (value []int64, addrs []uint64, err error) {
-	sampleData := contentionSampleRE.FindStringSubmatch(line)
-	if sampleData == nil {
-		return value, addrs, errUnrecognized
-	}
-
-	v1, err := strconv.ParseInt(sampleData[1], 10, 64)
-	if err != nil {
-		return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
-	}
-	v2, err := strconv.ParseInt(sampleData[2], 10, 64)
-	if err != nil {
-		return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
-	}
-
-	// Unsample values if period and cpuHz are available.
-	// - Delays are scaled to cycles and then to nanoseconds.
-	// - Contentions are scaled to cycles.
-	if period > 0 {
-		if cpuHz > 0 {
-			cpuGHz := float64(cpuHz) / 1e9
-			v1 = int64(float64(v1) * float64(period) / cpuGHz)
-		}
-		v2 = v2 * period
-	}
-
-	value = []int64{v2, v1}
-	addrs = parseHexAddresses(sampleData[3])
-
-	return value, addrs, nil
-}
-
-// parseThread parses a Threadz profile and returns a new Profile.
-func parseThread(b []byte) (*Profile, error) {
-	r := bytes.NewBuffer(b)
-
-	var line string
-	var err error
-	for {
-		// Skip past comments and empty lines seeking a real header.
-		line, err = r.ReadString('\n')
-		if err != nil {
-			return nil, err
-		}
-		if !isSpaceOrComment(line) {
-			break
-		}
-	}
-
-	if m := threadzStartRE.FindStringSubmatch(line); m != nil {
-		// Advance over initial comments until first stack trace.
-		for {
-			line, err = r.ReadString('\n')
-			if err != nil {
-				if err != io.EOF {
-					return nil, err
-				}
-
-				if line == "" {
-					break
-				}
-			}
-			if sectionTrigger(line) != unrecognizedSection || line[0] == '-' {
-				break
-			}
-		}
-	} else if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
-		return nil, errUnrecognized
-	}
-
-	p := &Profile{
-		SampleType: []*ValueType{{Type: "thread", Unit: "count"}},
-		PeriodType: &ValueType{Type: "thread", Unit: "count"},
-		Period:     1,
-	}
-
-	locs := make(map[uint64]*Location)
-	// Recognize each thread and populate profile samples.
-	for sectionTrigger(line) == unrecognizedSection {
-		if strings.HasPrefix(line, "---- no stack trace for") {
-			line = ""
-			break
-		}
-		if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
-			return nil, errUnrecognized
-		}
-
-		var addrs []uint64
-		line, addrs, err = parseThreadSample(r)
-		if err != nil {
-			return nil, errUnrecognized
-		}
-		if len(addrs) == 0 {
-			// We got a --same as previous threads--. Bump counters.
-			if len(p.Sample) > 0 {
-				s := p.Sample[len(p.Sample)-1]
-				s.Value[0]++
-			}
-			continue
-		}
-
-		var sloc []*Location
-		for _, addr := range addrs {
-			// Addresses from stack traces point to the next instruction after
-			// each call. Adjust by -1 to land somewhere on the actual call.
-			addr--
-			loc := locs[addr]
-			if locs[addr] == nil {
-				loc = &Location{
-					Address: addr,
-				}
-				p.Location = append(p.Location, loc)
-				locs[addr] = loc
-			}
-			sloc = append(sloc, loc)
-		}
-
-		p.Sample = append(p.Sample, &Sample{
-			Value:    []int64{1},
-			Location: sloc,
-		})
-	}
-
-	if err = parseAdditionalSections(line, r, p); err != nil {
-		return nil, err
-	}
-
-	return p, nil
-}
-
-// parseThreadSample parses a symbolized or unsymbolized stack trace.
-// Returns the first line after the traceback, the sample (or nil if
-// it hits a 'same-as-previous' marker) and an error.
-func parseThreadSample(b *bytes.Buffer) (nextl string, addrs []uint64, err error) {
-	var l string
-	sameAsPrevious := false
-	for {
-		if l, err = b.ReadString('\n'); err != nil {
-			if err != io.EOF {
-				return "", nil, err
-			}
-			if l == "" {
-				break
-			}
-		}
-		if l = strings.TrimSpace(l); l == "" {
-			continue
-		}
-
-		if strings.HasPrefix(l, "---") {
-			break
-		}
-		if strings.Contains(l, "same as previous thread") {
-			sameAsPrevious = true
-			continue
-		}
-
-		addrs = append(addrs, parseHexAddresses(l)...)
-	}
-
-	if sameAsPrevious {
-		return l, nil, nil
-	}
-	return l, addrs, nil
-}
-
-// parseAdditionalSections parses any additional sections in the
-// profile, ignoring any unrecognized sections.
-func parseAdditionalSections(l string, b *bytes.Buffer, p *Profile) (err error) {
-	for {
-		if sectionTrigger(l) == memoryMapSection {
-			break
-		}
-		// Ignore any unrecognized sections.
-		if l, err := b.ReadString('\n'); err != nil {
-			if err != io.EOF {
-				return err
-			}
-			if l == "" {
-				break
-			}
-		}
-	}
-	return p.ParseMemoryMap(b)
-}
-
-// ParseMemoryMap parses a memory map in the format of
-// /proc/self/maps, and overrides the mappings in the current profile.
-// It renumbers the samples and locations in the profile correspondingly.
-func (p *Profile) ParseMemoryMap(rd io.Reader) error {
-	b := bufio.NewReader(rd)
-
-	var attrs []string
-	var r *strings.Replacer
-	const delimiter = "="
-	for {
-		l, err := b.ReadString('\n')
-		if err != nil {
-			if err != io.EOF {
-				return err
-			}
-			if l == "" {
-				break
-			}
-		}
-		if l = strings.TrimSpace(l); l == "" {
-			continue
-		}
-
-		if r != nil {
-			l = r.Replace(l)
-		}
-		m, err := parseMappingEntry(l)
-		if err != nil {
-			if err == errUnrecognized {
-				// Recognize assignments of the form: attr=value, and replace
-				// $attr with value on subsequent mappings.
-				if attr := strings.SplitN(l, delimiter, 2); len(attr) == 2 {
-					attrs = append(attrs, "$"+strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1]))
-					r = strings.NewReplacer(attrs...)
-				}
-				// Ignore any unrecognized entries
-				continue
-			}
-			return err
-		}
-		if m == nil || (m.File == "" && len(p.Mapping) != 0) {
-			// In some cases the first entry may include the address range
-			// but not the name of the file. It should be followed by
-			// another entry with the name.
-			continue
-		}
-		if len(p.Mapping) == 1 && p.Mapping[0].File == "" {
-			// Update the name if this is the entry following that empty one.
-			p.Mapping[0].File = m.File
-			continue
-		}
-		p.Mapping = append(p.Mapping, m)
-	}
-	p.remapLocationIDs()
-	p.remapFunctionIDs()
-	p.remapMappingIDs()
-	return nil
-}
-
-func parseMappingEntry(l string) (*Mapping, error) {
-	mapping := &Mapping{}
-	var err error
-	if me := procMapsRE.FindStringSubmatch(l); len(me) == 9 {
-		if !strings.Contains(me[3], "x") {
-			// Skip non-executable entries.
-			return nil, nil
-		}
-		if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
-			return nil, errUnrecognized
-		}
-		if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
-			return nil, errUnrecognized
-		}
-		if me[4] != "" {
-			if mapping.Offset, err = strconv.ParseUint(me[4], 16, 64); err != nil {
-				return nil, errUnrecognized
-			}
-		}
-		mapping.File = me[8]
-		return mapping, nil
-	}
-
-	if me := briefMapsRE.FindStringSubmatch(l); len(me) == 6 {
-		if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
-			return nil, errUnrecognized
-		}
-		if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
-			return nil, errUnrecognized
-		}
-		mapping.File = me[3]
-		if me[5] != "" {
-			if mapping.Offset, err = strconv.ParseUint(me[5], 16, 64); err != nil {
-				return nil, errUnrecognized
-			}
-		}
-		return mapping, nil
-	}
-
-	return nil, errUnrecognized
-}
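
For reference, procMapsRE above targets executable entries in the /proc/self/maps format; a line of roughly this shape matches (addresses, device, inode, and path are made up):

00400000-004de000 r-xp 00000000 fd:01 131077                    /usr/bin/example

Only entries whose permission field contains "x" are kept, and the start, limit, and file offset are parsed from the hex fields.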
-
-type sectionType int
-
-const (
-	unrecognizedSection sectionType = iota
-	memoryMapSection
-)
-
-var memoryMapTriggers = []string{
-	"--- Memory map: ---",
-	"MAPPED_LIBRARIES:",
-}
-
-func sectionTrigger(line string) sectionType {
-	for _, trigger := range memoryMapTriggers {
-		if strings.Contains(line, trigger) {
-			return memoryMapSection
-		}
-	}
-	return unrecognizedSection
-}
-
-func (p *Profile) addLegacyFrameInfo() {
-	switch {
-	case isProfileType(p, heapzSampleTypes) ||
-		isProfileType(p, heapzInUseSampleTypes) ||
-		isProfileType(p, heapzAllocSampleTypes):
-		p.DropFrames, p.KeepFrames = allocRxStr, allocSkipRxStr
-	case isProfileType(p, contentionzSampleTypes):
-		p.DropFrames, p.KeepFrames = lockRxStr, ""
-	default:
-		p.DropFrames, p.KeepFrames = cpuProfilerRxStr, ""
-	}
-}
-
-var heapzSampleTypes = []string{"allocations", "size"} // early Go pprof profiles
-var heapzInUseSampleTypes = []string{"inuse_objects", "inuse_space"}
-var heapzAllocSampleTypes = []string{"alloc_objects", "alloc_space"}
-var contentionzSampleTypes = []string{"contentions", "delay"}
-
-func isProfileType(p *Profile, t []string) bool {
-	st := p.SampleType
-	if len(st) != len(t) {
-		return false
-	}
-
-	for i := range st {
-		if st[i].Type != t[i] {
-			return false
-		}
-	}
-	return true
-}
-
-var allocRxStr = strings.Join([]string{
-	// POSIX entry points.
-	`calloc`,
-	`cfree`,
-	`malloc`,
-	`free`,
-	`memalign`,
-	`do_memalign`,
-	`(__)?posix_memalign`,
-	`pvalloc`,
-	`valloc`,
-	`realloc`,
-
-	// TC malloc.
-	`tcmalloc::.*`,
-	`tc_calloc`,
-	`tc_cfree`,
-	`tc_malloc`,
-	`tc_free`,
-	`tc_memalign`,
-	`tc_posix_memalign`,
-	`tc_pvalloc`,
-	`tc_valloc`,
-	`tc_realloc`,
-	`tc_new`,
-	`tc_delete`,
-	`tc_newarray`,
-	`tc_deletearray`,
-	`tc_new_nothrow`,
-	`tc_newarray_nothrow`,
-
-	// Memory-allocation routines on OS X.
-	`malloc_zone_malloc`,
-	`malloc_zone_calloc`,
-	`malloc_zone_valloc`,
-	`malloc_zone_realloc`,
-	`malloc_zone_memalign`,
-	`malloc_zone_free`,
-
-	// Go runtime
-	`runtime\..*`,
-
-	// Other misc. memory allocation routines
-	`BaseArena::.*`,
-	`(::)?do_malloc_no_errno`,
-	`(::)?do_malloc_pages`,
-	`(::)?do_malloc`,
-	`DoSampledAllocation`,
-	`MallocedMemBlock::MallocedMemBlock`,
-	`_M_allocate`,
-	`__builtin_(vec_)?delete`,
-	`__builtin_(vec_)?new`,
-	`__gnu_cxx::new_allocator::allocate`,
-	`__libc_malloc`,
-	`__malloc_alloc_template::allocate`,
-	`allocate`,
-	`cpp_alloc`,
-	`operator new(\[\])?`,
-	`simple_alloc::allocate`,
-}, `|`)
-
-var allocSkipRxStr = strings.Join([]string{
-	// Preserve Go runtime frames that appear in the middle/bottom of
-	// the stack.
-	`runtime\.panic`,
-	`runtime\.reflectcall`,
-	`runtime\.call[0-9]*`,
-}, `|`)
-
-var cpuProfilerRxStr = strings.Join([]string{
-	`ProfileData::Add`,
-	`ProfileData::prof_handler`,
-	`CpuProfiler::prof_handler`,
-	`__pthread_sighandler`,
-	`__restore`,
-}, `|`)
-
-var lockRxStr = strings.Join([]string{
-	`RecordLockProfileData`,
-	`(base::)?RecordLockProfileData.*`,
-	`(base::)?SubmitMutexProfileData.*`,
-	`(base::)?SubmitSpinLockProfileData.*`,
-	`(Mutex::)?AwaitCommon.*`,
-	`(Mutex::)?Unlock.*`,
-	`(Mutex::)?UnlockSlow.*`,
-	`(Mutex::)?ReaderUnlock.*`,
-	`(MutexLock::)?~MutexLock.*`,
-	`(SpinLock::)?Unlock.*`,
-	`(SpinLock::)?SlowUnlock.*`,
-	`(SpinLockHolder::)?~SpinLockHolder.*`,
-}, `|`)
diff --git a/src/runtime/pprof/internal/profile/profile.go b/src/runtime/pprof/internal/profile/profile.go
deleted file mode 100644
index 443accd..0000000
--- a/src/runtime/pprof/internal/profile/profile.go
+++ /dev/null
@@ -1,577 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package profile provides a representation of profile.proto and
-// methods to encode/decode profiles in this format.
-//
-// This package is only for testing runtime/pprof.
-// It is not used by production Go programs.
-package profile
-
-import (
-	"bytes"
-	"compress/gzip"
-	"fmt"
-	"io"
-	"io/ioutil"
-	"regexp"
-	"strings"
-	"time"
-)
-
-// Profile is an in-memory representation of profile.proto.
-type Profile struct {
-	SampleType        []*ValueType
-	DefaultSampleType string
-	Sample            []*Sample
-	Mapping           []*Mapping
-	Location          []*Location
-	Function          []*Function
-	Comments          []string
-
-	DropFrames string
-	KeepFrames string
-
-	TimeNanos     int64
-	DurationNanos int64
-	PeriodType    *ValueType
-	Period        int64
-
-	commentX           []int64
-	dropFramesX        int64
-	keepFramesX        int64
-	stringTable        []string
-	defaultSampleTypeX int64
-}
-
-// ValueType corresponds to Profile.ValueType
-type ValueType struct {
-	Type string // cpu, wall, inuse_space, etc
-	Unit string // seconds, nanoseconds, bytes, etc
-
-	typeX int64
-	unitX int64
-}
-
-// Sample corresponds to Profile.Sample
-type Sample struct {
-	Location []*Location
-	Value    []int64
-	Label    map[string][]string
-	NumLabel map[string][]int64
-
-	locationIDX []uint64
-	labelX      []Label
-}
-
-// Label corresponds to Profile.Label
-type Label struct {
-	keyX int64
-	// Exactly one of the two following values must be set
-	strX int64
-	numX int64 // Integer value for this label
-}
-
-// Mapping corresponds to Profile.Mapping
-type Mapping struct {
-	ID              uint64
-	Start           uint64
-	Limit           uint64
-	Offset          uint64
-	File            string
-	BuildID         string
-	HasFunctions    bool
-	HasFilenames    bool
-	HasLineNumbers  bool
-	HasInlineFrames bool
-
-	fileX    int64
-	buildIDX int64
-}
-
-// Location corresponds to Profile.Location
-type Location struct {
-	ID      uint64
-	Mapping *Mapping
-	Address uint64
-	Line    []Line
-
-	mappingIDX uint64
-}
-
-// Line corresponds to Profile.Line
-type Line struct {
-	Function *Function
-	Line     int64
-
-	functionIDX uint64
-}
-
-// Function corresponds to Profile.Function
-type Function struct {
-	ID         uint64
-	Name       string
-	SystemName string
-	Filename   string
-	StartLine  int64
-
-	nameX       int64
-	systemNameX int64
-	filenameX   int64
-}
-
-// Parse parses a profile and checks for its validity. The input
-// may be a gzip-compressed encoded protobuf or one of many legacy
-// profile formats which may be unsupported in the future.
-func Parse(r io.Reader) (*Profile, error) {
-	orig, err := ioutil.ReadAll(r)
-	if err != nil {
-		return nil, err
-	}
-
-	var p *Profile
-	if len(orig) >= 2 && orig[0] == 0x1f && orig[1] == 0x8b {
-		gz, err := gzip.NewReader(bytes.NewBuffer(orig))
-		if err != nil {
-			return nil, fmt.Errorf("decompressing profile: %v", err)
-		}
-		data, err := ioutil.ReadAll(gz)
-		if err != nil {
-			return nil, fmt.Errorf("decompressing profile: %v", err)
-		}
-		orig = data
-	}
-	if p, err = parseUncompressed(orig); err != nil {
-		if p, err = parseLegacy(orig); err != nil {
-			return nil, fmt.Errorf("parsing profile: %v", err)
-		}
-	}
-
-	if err := p.CheckValid(); err != nil {
-		return nil, fmt.Errorf("malformed profile: %v", err)
-	}
-	return p, nil
-}
-
-var errUnrecognized = fmt.Errorf("unrecognized profile format")
-var errMalformed = fmt.Errorf("malformed profile format")
-
-func parseLegacy(data []byte) (*Profile, error) {
-	parsers := []func([]byte) (*Profile, error){
-		parseCPU,
-		parseHeap,
-		parseGoCount, // goroutine, threadcreate
-		parseThread,
-		parseContention,
-	}
-
-	for _, parser := range parsers {
-		p, err := parser(data)
-		if err == nil {
-			p.setMain()
-			p.addLegacyFrameInfo()
-			return p, nil
-		}
-		if err != errUnrecognized {
-			return nil, err
-		}
-	}
-	return nil, errUnrecognized
-}
-
-func parseUncompressed(data []byte) (*Profile, error) {
-	p := &Profile{}
-	if err := unmarshal(data, p); err != nil {
-		return nil, err
-	}
-
-	if err := p.postDecode(); err != nil {
-		return nil, err
-	}
-
-	return p, nil
-}
-
-var libRx = regexp.MustCompile(`([.]so$|[.]so[._][0-9]+)`)
-
-// setMain scans Mapping entries and guesses which entry is main
-// because legacy profiles don't obey the convention of putting main
-// first.
-func (p *Profile) setMain() {
-	for i := 0; i < len(p.Mapping); i++ {
-		file := strings.TrimSpace(strings.ReplaceAll(p.Mapping[i].File, "(deleted)", ""))
-		if len(file) == 0 {
-			continue
-		}
-		if len(libRx.FindStringSubmatch(file)) > 0 {
-			continue
-		}
-		if strings.HasPrefix(file, "[") {
-			continue
-		}
-		// Swap what we guess is main to position 0.
-		p.Mapping[i], p.Mapping[0] = p.Mapping[0], p.Mapping[i]
-		break
-	}
-}
-
-// Write writes the profile as a gzip-compressed marshaled protobuf.
-func (p *Profile) Write(w io.Writer) error {
-	p.preEncode()
-	b := marshal(p)
-	zw := gzip.NewWriter(w)
-	defer zw.Close()
-	_, err := zw.Write(b)
-	return err
-}
-
-// CheckValid tests whether the profile is valid. Checks include, but are
-// not limited to:
-//   - len(Profile.Sample[n].value) == len(Profile.value_unit)
-//   - Sample.id has a corresponding Profile.Location
-func (p *Profile) CheckValid() error {
-	// Check that sample values are consistent
-	sampleLen := len(p.SampleType)
-	if sampleLen == 0 && len(p.Sample) != 0 {
-		return fmt.Errorf("missing sample type information")
-	}
-	for _, s := range p.Sample {
-		if len(s.Value) != sampleLen {
-			return fmt.Errorf("mismatch: sample has: %d values vs. %d types", len(s.Value), len(p.SampleType))
-		}
-	}
-
-	// Check that all mappings/locations/functions are in the tables
-	// Check that there are no duplicate ids
-	mappings := make(map[uint64]*Mapping, len(p.Mapping))
-	for _, m := range p.Mapping {
-		if m.ID == 0 {
-			return fmt.Errorf("found mapping with reserved ID=0")
-		}
-		if mappings[m.ID] != nil {
-			return fmt.Errorf("multiple mappings with same id: %d", m.ID)
-		}
-		mappings[m.ID] = m
-	}
-	functions := make(map[uint64]*Function, len(p.Function))
-	for _, f := range p.Function {
-		if f.ID == 0 {
-			return fmt.Errorf("found function with reserved ID=0")
-		}
-		if functions[f.ID] != nil {
-			return fmt.Errorf("multiple functions with same id: %d", f.ID)
-		}
-		functions[f.ID] = f
-	}
-	locations := make(map[uint64]*Location, len(p.Location))
-	for _, l := range p.Location {
-		if l.ID == 0 {
-			return fmt.Errorf("found location with reserved id=0")
-		}
-		if locations[l.ID] != nil {
-			return fmt.Errorf("multiple locations with same id: %d", l.ID)
-		}
-		locations[l.ID] = l
-		if m := l.Mapping; m != nil {
-			if m.ID == 0 || mappings[m.ID] != m {
-				return fmt.Errorf("inconsistent mapping %p: %d", m, m.ID)
-			}
-		}
-		for _, ln := range l.Line {
-			if f := ln.Function; f != nil {
-				if f.ID == 0 || functions[f.ID] != f {
-					return fmt.Errorf("inconsistent function %p: %d", f, f.ID)
-				}
-			}
-		}
-	}
-	return nil
-}
-
-// Aggregate merges the locations in the profile into equivalence
-// classes preserving the request attributes. It also updates the
-// samples to point to the merged locations.
-func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address bool) error {
-	for _, m := range p.Mapping {
-		m.HasInlineFrames = m.HasInlineFrames && inlineFrame
-		m.HasFunctions = m.HasFunctions && function
-		m.HasFilenames = m.HasFilenames && filename
-		m.HasLineNumbers = m.HasLineNumbers && linenumber
-	}
-
-	// Aggregate functions
-	if !function || !filename {
-		for _, f := range p.Function {
-			if !function {
-				f.Name = ""
-				f.SystemName = ""
-			}
-			if !filename {
-				f.Filename = ""
-			}
-		}
-	}
-
-	// Aggregate locations
-	if !inlineFrame || !address || !linenumber {
-		for _, l := range p.Location {
-			if !inlineFrame && len(l.Line) > 1 {
-				l.Line = l.Line[len(l.Line)-1:]
-			}
-			if !linenumber {
-				for i := range l.Line {
-					l.Line[i].Line = 0
-				}
-			}
-			if !address {
-				l.Address = 0
-			}
-		}
-	}
-
-	return p.CheckValid()
-}
-
-// Print dumps a text representation of a profile. Intended mainly
-// for debugging purposes.
-func (p *Profile) String() string {
-
-	ss := make([]string, 0, len(p.Sample)+len(p.Mapping)+len(p.Location))
-	if pt := p.PeriodType; pt != nil {
-		ss = append(ss, fmt.Sprintf("PeriodType: %s %s", pt.Type, pt.Unit))
-	}
-	ss = append(ss, fmt.Sprintf("Period: %d", p.Period))
-	if p.TimeNanos != 0 {
-		ss = append(ss, fmt.Sprintf("Time: %v", time.Unix(0, p.TimeNanos)))
-	}
-	if p.DurationNanos != 0 {
-		ss = append(ss, fmt.Sprintf("Duration: %v", time.Duration(p.DurationNanos)))
-	}
-
-	ss = append(ss, "Samples:")
-	var sh1 string
-	for _, s := range p.SampleType {
-		sh1 = sh1 + fmt.Sprintf("%s/%s ", s.Type, s.Unit)
-	}
-	ss = append(ss, strings.TrimSpace(sh1))
-	for _, s := range p.Sample {
-		var sv string
-		for _, v := range s.Value {
-			sv = fmt.Sprintf("%s %10d", sv, v)
-		}
-		sv = sv + ": "
-		for _, l := range s.Location {
-			sv = sv + fmt.Sprintf("%d ", l.ID)
-		}
-		ss = append(ss, sv)
-		const labelHeader = "                "
-		if len(s.Label) > 0 {
-			ls := labelHeader
-			for k, v := range s.Label {
-				ls = ls + fmt.Sprintf("%s:%v ", k, v)
-			}
-			ss = append(ss, ls)
-		}
-		if len(s.NumLabel) > 0 {
-			ls := labelHeader
-			for k, v := range s.NumLabel {
-				ls = ls + fmt.Sprintf("%s:%v ", k, v)
-			}
-			ss = append(ss, ls)
-		}
-	}
-
-	ss = append(ss, "Locations")
-	for _, l := range p.Location {
-		locStr := fmt.Sprintf("%6d: %#x ", l.ID, l.Address)
-		if m := l.Mapping; m != nil {
-			locStr = locStr + fmt.Sprintf("M=%d ", m.ID)
-		}
-		if len(l.Line) == 0 {
-			ss = append(ss, locStr)
-		}
-		for li := range l.Line {
-			lnStr := "??"
-			if fn := l.Line[li].Function; fn != nil {
-				lnStr = fmt.Sprintf("%s %s:%d s=%d",
-					fn.Name,
-					fn.Filename,
-					l.Line[li].Line,
-					fn.StartLine)
-				if fn.Name != fn.SystemName {
-					lnStr = lnStr + "(" + fn.SystemName + ")"
-				}
-			}
-			ss = append(ss, locStr+lnStr)
-			// Do not print location details past the first line
-			locStr = "             "
-		}
-	}
-
-	ss = append(ss, "Mappings")
-	for _, m := range p.Mapping {
-		bits := ""
-		if m.HasFunctions {
-			bits += "[FN]"
-		}
-		if m.HasFilenames {
-			bits += "[FL]"
-		}
-		if m.HasLineNumbers {
-			bits += "[LN]"
-		}
-		if m.HasInlineFrames {
-			bits += "[IN]"
-		}
-		ss = append(ss, fmt.Sprintf("%d: %#x/%#x/%#x %s %s %s",
-			m.ID,
-			m.Start, m.Limit, m.Offset,
-			m.File,
-			m.BuildID,
-			bits))
-	}
-
-	return strings.Join(ss, "\n") + "\n"
-}
-
-// Merge adds profile p adjusted by ratio r into profile p. Profiles
-// must be compatible (same Type and SampleType).
-// TODO(rsilvera): consider normalizing the profiles based on the
-// total samples collected.
-func (p *Profile) Merge(pb *Profile, r float64) error {
-	if err := p.Compatible(pb); err != nil {
-		return err
-	}
-
-	pb = pb.Copy()
-
-	// Keep the largest of the two periods.
-	if pb.Period > p.Period {
-		p.Period = pb.Period
-	}
-
-	p.DurationNanos += pb.DurationNanos
-
-	p.Mapping = append(p.Mapping, pb.Mapping...)
-	for i, m := range p.Mapping {
-		m.ID = uint64(i + 1)
-	}
-	p.Location = append(p.Location, pb.Location...)
-	for i, l := range p.Location {
-		l.ID = uint64(i + 1)
-	}
-	p.Function = append(p.Function, pb.Function...)
-	for i, f := range p.Function {
-		f.ID = uint64(i + 1)
-	}
-
-	if r != 1.0 {
-		for _, s := range pb.Sample {
-			for i, v := range s.Value {
-				s.Value[i] = int64((float64(v) * r))
-			}
-		}
-	}
-	p.Sample = append(p.Sample, pb.Sample...)
-	return p.CheckValid()
-}
-
-// Compatible determines if two profiles can be compared/merged.
-// returns nil if the profiles are compatible; otherwise an error with
-// details on the incompatibility.
-func (p *Profile) Compatible(pb *Profile) error {
-	if !compatibleValueTypes(p.PeriodType, pb.PeriodType) {
-		return fmt.Errorf("incompatible period types %v and %v", p.PeriodType, pb.PeriodType)
-	}
-
-	if len(p.SampleType) != len(pb.SampleType) {
-		return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
-	}
-
-	for i := range p.SampleType {
-		if !compatibleValueTypes(p.SampleType[i], pb.SampleType[i]) {
-			return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
-		}
-	}
-
-	return nil
-}
-
-// HasFunctions determines if all locations in this profile have
-// symbolized function information.
-func (p *Profile) HasFunctions() bool {
-	for _, l := range p.Location {
-		if l.Mapping == nil || !l.Mapping.HasFunctions {
-			return false
-		}
-	}
-	return true
-}
-
-// HasFileLines determines if all locations in this profile have
-// symbolized file and line number information.
-func (p *Profile) HasFileLines() bool {
-	for _, l := range p.Location {
-		if l.Mapping == nil || (!l.Mapping.HasFilenames || !l.Mapping.HasLineNumbers) {
-			return false
-		}
-	}
-	return true
-}
-
-func compatibleValueTypes(v1, v2 *ValueType) bool {
-	if v1 == nil || v2 == nil {
-		return true // No grounds to disqualify.
-	}
-	return v1.Type == v2.Type && v1.Unit == v2.Unit
-}
-
-// Copy makes a fully independent copy of a profile.
-func (p *Profile) Copy() *Profile {
-	p.preEncode()
-	b := marshal(p)
-
-	pp := &Profile{}
-	if err := unmarshal(b, pp); err != nil {
-		panic(err)
-	}
-	if err := pp.postDecode(); err != nil {
-		panic(err)
-	}
-
-	return pp
-}
-
-// Demangler maps symbol names to a human-readable form. This may
-// include C++ demangling and additional simplification. Names that
-// are not demangled may be missing from the resulting map.
-type Demangler func(name []string) (map[string]string, error)
-
-// Demangle attempts to demangle and optionally simplify any function
-// names referenced in the profile. It works on a best-effort basis:
-// it will silently preserve the original names in case of any errors.
-func (p *Profile) Demangle(d Demangler) error {
-	// Collect names to demangle.
-	var names []string
-	for _, fn := range p.Function {
-		names = append(names, fn.SystemName)
-	}
-
-	// Update profile with demangled names.
-	demangled, err := d(names)
-	if err != nil {
-		return err
-	}
-	for _, fn := range p.Function {
-		if dd, ok := demangled[fn.SystemName]; ok {
-			fn.Name = dd
-		}
-	}
-	return nil
-}
-
-// Empty reports whether the profile contains no samples.
-func (p *Profile) Empty() bool {
-	return len(p.Sample) == 0
-}
diff --git a/src/runtime/pprof/internal/profile/profile_test.go b/src/runtime/pprof/internal/profile/profile_test.go
deleted file mode 100644
index e1963f3..0000000
--- a/src/runtime/pprof/internal/profile/profile_test.go
+++ /dev/null
@@ -1,79 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package profile
-
-import (
-	"bytes"
-	"testing"
-)
-
-func TestEmptyProfile(t *testing.T) {
-	var buf bytes.Buffer
-	p, err := Parse(&buf)
-	if err != nil {
-		t.Error("Want no error, got", err)
-	}
-	if p == nil {
-		t.Fatal("Want a valid profile, got <nil>")
-	}
-	if !p.Empty() {
-		t.Errorf("Profile should be empty, got %#v", p)
-	}
-}
-
-func TestParseContention(t *testing.T) {
-	tests := []struct {
-		name    string
-		in      string
-		wantErr bool
-	}{
-		{
-			name: "valid",
-			in: `--- mutex:
-cycles/second=3491920901
-sampling period=1
-43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
-34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
-`,
-		},
-		{
-			name: "valid with comment",
-			in: `--- mutex:
-cycles/second=3491920901
-sampling period=1
-43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
-#	0x45e850	sync.(*Mutex).Unlock+0x80	/go/src/sync/mutex.go:126
-#	0x45f763	sync.(*RWMutex).Unlock+0x83	/go/src/sync/rwmutex.go:125
-#	0x4a2be0	main.main.func3+0x70		/go/src/internal/pprof/profile/a_binary.go:58
-
-34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
-#	0x45e850	sync.(*Mutex).Unlock+0x80	/go/src/sync/mutex.go:126
-#	0x45f763	sync.(*RWMutex).Unlock+0x83	/go/src/sync/rwmutex.go:125
-#	0x4a2b16	main.main.func2+0xd6		/go/src/internal/pprof/profile/a_binary.go:48
-`,
-		},
-		{
-			name:    "empty",
-			in:      `--- mutex:`,
-			wantErr: true,
-		},
-		{
-			name: "invalid header",
-			in: `--- channel:
-43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31`,
-			wantErr: true,
-		},
-	}
-	for _, tc := range tests {
-		_, err := parseContention([]byte(tc.in))
-		if tc.wantErr && err == nil {
-			t.Errorf("parseContention(%q) succeeded unexpectedly", tc.name)
-		}
-		if !tc.wantErr && err != nil {
-			t.Errorf("parseContention(%q) failed unexpectedly: %v", tc.name, err)
-		}
-	}
-
-}
diff --git a/src/runtime/pprof/internal/profile/proto.go b/src/runtime/pprof/internal/profile/proto.go
deleted file mode 100644
index 11d7f9f..0000000
--- a/src/runtime/pprof/internal/profile/proto.go
+++ /dev/null
@@ -1,360 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file is a simple protocol buffer encoder and decoder.
-//
-// A protocol message must implement the message interface:
-//   decoder() []decoder
-//   encode(*buffer)
-//
-// The decode method returns a slice indexed by field number that gives the
-// function to decode that field.
-// The encode method encodes its receiver into the given buffer.
-//
-// The two methods are simple enough to be implemented by hand rather than
-// by using a protocol compiler.
-//
-// See profile.go for examples of messages implementing this interface.
-//
-// There is no support for groups, message sets, or "has" bits.
-
-package profile
-
-import "errors"
-
-type buffer struct {
-	field int
-	typ   int
-	u64   uint64
-	data  []byte
-	tmp   [16]byte
-}
-
-type decoder func(*buffer, message) error
-
-type message interface {
-	decoder() []decoder
-	encode(*buffer)
-}
-
-func marshal(m message) []byte {
-	var b buffer
-	m.encode(&b)
-	return b.data
-}
-
-func encodeVarint(b *buffer, x uint64) {
-	for x >= 128 {
-		b.data = append(b.data, byte(x)|0x80)
-		x >>= 7
-	}
-	b.data = append(b.data, byte(x))
-}
-
-func encodeLength(b *buffer, tag int, len int) {
-	encodeVarint(b, uint64(tag)<<3|2)
-	encodeVarint(b, uint64(len))
-}
-
-func encodeUint64(b *buffer, tag int, x uint64) {
-	// append varint to b.data
-	encodeVarint(b, uint64(tag)<<3|0)
-	encodeVarint(b, x)
-}
-
-func encodeUint64s(b *buffer, tag int, x []uint64) {
-	if len(x) > 2 {
-		// Use packed encoding
-		n1 := len(b.data)
-		for _, u := range x {
-			encodeVarint(b, u)
-		}
-		n2 := len(b.data)
-		encodeLength(b, tag, n2-n1)
-		n3 := len(b.data)
-		copy(b.tmp[:], b.data[n2:n3])
-		copy(b.data[n1+(n3-n2):], b.data[n1:n2])
-		copy(b.data[n1:], b.tmp[:n3-n2])
-		return
-	}
-	for _, u := range x {
-		encodeUint64(b, tag, u)
-	}
-}
-
-func encodeUint64Opt(b *buffer, tag int, x uint64) {
-	if x == 0 {
-		return
-	}
-	encodeUint64(b, tag, x)
-}
-
-func encodeInt64(b *buffer, tag int, x int64) {
-	u := uint64(x)
-	encodeUint64(b, tag, u)
-}
-
-func encodeInt64Opt(b *buffer, tag int, x int64) {
-	if x == 0 {
-		return
-	}
-	encodeInt64(b, tag, x)
-}
-
-func encodeInt64s(b *buffer, tag int, x []int64) {
-	if len(x) > 2 {
-		// Use packed encoding
-		n1 := len(b.data)
-		for _, u := range x {
-			encodeVarint(b, uint64(u))
-		}
-		n2 := len(b.data)
-		encodeLength(b, tag, n2-n1)
-		n3 := len(b.data)
-		copy(b.tmp[:], b.data[n2:n3])
-		copy(b.data[n1+(n3-n2):], b.data[n1:n2])
-		copy(b.data[n1:], b.tmp[:n3-n2])
-		return
-	}
-	for _, u := range x {
-		encodeInt64(b, tag, u)
-	}
-}
-
-func encodeString(b *buffer, tag int, x string) {
-	encodeLength(b, tag, len(x))
-	b.data = append(b.data, x...)
-}
-
-func encodeStrings(b *buffer, tag int, x []string) {
-	for _, s := range x {
-		encodeString(b, tag, s)
-	}
-}
-
-func encodeStringOpt(b *buffer, tag int, x string) {
-	if x == "" {
-		return
-	}
-	encodeString(b, tag, x)
-}
-
-func encodeBool(b *buffer, tag int, x bool) {
-	if x {
-		encodeUint64(b, tag, 1)
-	} else {
-		encodeUint64(b, tag, 0)
-	}
-}
-
-func encodeBoolOpt(b *buffer, tag int, x bool) {
-	if x == false {
-		return
-	}
-	encodeBool(b, tag, x)
-}
-
-func encodeMessage(b *buffer, tag int, m message) {
-	n1 := len(b.data)
-	m.encode(b)
-	n2 := len(b.data)
-	encodeLength(b, tag, n2-n1)
-	n3 := len(b.data)
-	copy(b.tmp[:], b.data[n2:n3])
-	copy(b.data[n1+(n3-n2):], b.data[n1:n2])
-	copy(b.data[n1:], b.tmp[:n3-n2])
-}
-
-func unmarshal(data []byte, m message) (err error) {
-	b := buffer{data: data, typ: 2}
-	return decodeMessage(&b, m)
-}
-
-func le64(p []byte) uint64 {
-	return uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 | uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
-}
-
-func le32(p []byte) uint32 {
-	return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
-}
-
-func decodeVarint(data []byte) (uint64, []byte, error) {
-	var i int
-	var u uint64
-	for i = 0; ; i++ {
-		if i >= 10 || i >= len(data) {
-			return 0, nil, errors.New("bad varint")
-		}
-		u |= uint64(data[i]&0x7F) << uint(7*i)
-		if data[i]&0x80 == 0 {
-			return u, data[i+1:], nil
-		}
-	}
-}
-
-func decodeField(b *buffer, data []byte) ([]byte, error) {
-	x, data, err := decodeVarint(data)
-	if err != nil {
-		return nil, err
-	}
-	b.field = int(x >> 3)
-	b.typ = int(x & 7)
-	b.data = nil
-	b.u64 = 0
-	switch b.typ {
-	case 0:
-		b.u64, data, err = decodeVarint(data)
-		if err != nil {
-			return nil, err
-		}
-	case 1:
-		if len(data) < 8 {
-			return nil, errors.New("not enough data")
-		}
-		b.u64 = le64(data[:8])
-		data = data[8:]
-	case 2:
-		var n uint64
-		n, data, err = decodeVarint(data)
-		if err != nil {
-			return nil, err
-		}
-		if n > uint64(len(data)) {
-			return nil, errors.New("too much data")
-		}
-		b.data = data[:n]
-		data = data[n:]
-	case 5:
-		if len(data) < 4 {
-			return nil, errors.New("not enough data")
-		}
-		b.u64 = uint64(le32(data[:4]))
-		data = data[4:]
-	default:
-		return nil, errors.New("unknown type: " + string(b.typ))
-	}
-
-	return data, nil
-}
-
-func checkType(b *buffer, typ int) error {
-	if b.typ != typ {
-		return errors.New("type mismatch")
-	}
-	return nil
-}
-
-func decodeMessage(b *buffer, m message) error {
-	if err := checkType(b, 2); err != nil {
-		return err
-	}
-	dec := m.decoder()
-	data := b.data
-	for len(data) > 0 {
-		// pull varint field# + type
-		var err error
-		data, err = decodeField(b, data)
-		if err != nil {
-			return err
-		}
-		if b.field >= len(dec) || dec[b.field] == nil {
-			continue
-		}
-		if err := dec[b.field](b, m); err != nil {
-			return err
-		}
-	}
-	return nil
-}
-
-func decodeInt64(b *buffer, x *int64) error {
-	if err := checkType(b, 0); err != nil {
-		return err
-	}
-	*x = int64(b.u64)
-	return nil
-}
-
-func decodeInt64s(b *buffer, x *[]int64) error {
-	if b.typ == 2 {
-		// Packed encoding
-		data := b.data
-		for len(data) > 0 {
-			var u uint64
-			var err error
-
-			if u, data, err = decodeVarint(data); err != nil {
-				return err
-			}
-			*x = append(*x, int64(u))
-		}
-		return nil
-	}
-	var i int64
-	if err := decodeInt64(b, &i); err != nil {
-		return err
-	}
-	*x = append(*x, i)
-	return nil
-}
-
-func decodeUint64(b *buffer, x *uint64) error {
-	if err := checkType(b, 0); err != nil {
-		return err
-	}
-	*x = b.u64
-	return nil
-}
-
-func decodeUint64s(b *buffer, x *[]uint64) error {
-	if b.typ == 2 {
-		data := b.data
-		// Packed encoding
-		for len(data) > 0 {
-			var u uint64
-			var err error
-
-			if u, data, err = decodeVarint(data); err != nil {
-				return err
-			}
-			*x = append(*x, u)
-		}
-		return nil
-	}
-	var u uint64
-	if err := decodeUint64(b, &u); err != nil {
-		return err
-	}
-	*x = append(*x, u)
-	return nil
-}
-
-func decodeString(b *buffer, x *string) error {
-	if err := checkType(b, 2); err != nil {
-		return err
-	}
-	*x = string(b.data)
-	return nil
-}
-
-func decodeStrings(b *buffer, x *[]string) error {
-	var s string
-	if err := decodeString(b, &s); err != nil {
-		return err
-	}
-	*x = append(*x, s)
-	return nil
-}
-
-func decodeBool(b *buffer, x *bool) error {
-	if err := checkType(b, 0); err != nil {
-		return err
-	}
-	if int64(b.u64) == 0 {
-		*x = false
-	} else {
-		*x = true
-	}
-	return nil
-}
diff --git a/src/runtime/pprof/internal/profile/proto_test.go b/src/runtime/pprof/internal/profile/proto_test.go
deleted file mode 100644
index c2613fc..0000000
--- a/src/runtime/pprof/internal/profile/proto_test.go
+++ /dev/null
@@ -1,67 +0,0 @@
-package profile
-
-import (
-	"reflect"
-	"testing"
-)
-
-func TestPackedEncoding(t *testing.T) {
-
-	type testcase struct {
-		uint64s []uint64
-		int64s  []int64
-		encoded []byte
-	}
-	for i, tc := range []testcase{
-		{
-			[]uint64{0, 1, 10, 100, 1000, 10000},
-			[]int64{1000, 0, 1000},
-			[]byte{10, 8, 0, 1, 10, 100, 232, 7, 144, 78, 18, 5, 232, 7, 0, 232, 7},
-		},
-		{
-			[]uint64{10000},
-			nil,
-			[]byte{8, 144, 78},
-		},
-		{
-			nil,
-			[]int64{-10000},
-			[]byte{16, 240, 177, 255, 255, 255, 255, 255, 255, 255, 1},
-		},
-	} {
-		source := &packedInts{tc.uint64s, tc.int64s}
-		if got, want := marshal(source), tc.encoded; !reflect.DeepEqual(got, want) {
-			t.Errorf("failed encode %d, got %v, want %v", i, got, want)
-		}
-
-		dest := new(packedInts)
-		if err := unmarshal(tc.encoded, dest); err != nil {
-			t.Errorf("failed decode %d: %v", i, err)
-			continue
-		}
-		if got, want := dest.uint64s, tc.uint64s; !reflect.DeepEqual(got, want) {
-			t.Errorf("failed decode uint64s %d, got %v, want %v", i, got, want)
-		}
-		if got, want := dest.int64s, tc.int64s; !reflect.DeepEqual(got, want) {
-			t.Errorf("failed decode int64s %d, got %v, want %v", i, got, want)
-		}
-	}
-}
-
-type packedInts struct {
-	uint64s []uint64
-	int64s  []int64
-}
-
-func (u *packedInts) decoder() []decoder {
-	return []decoder{
-		nil,
-		func(b *buffer, m message) error { return decodeUint64s(b, &m.(*packedInts).uint64s) },
-		func(b *buffer, m message) error { return decodeInt64s(b, &m.(*packedInts).int64s) },
-	}
-}
-
-func (u *packedInts) encode(b *buffer) {
-	encodeUint64s(b, 1, u.uint64s)
-	encodeInt64s(b, 2, u.int64s)
-}
diff --git a/src/runtime/pprof/internal/profile/prune.go b/src/runtime/pprof/internal/profile/prune.go
deleted file mode 100644
index 1924fad..0000000
--- a/src/runtime/pprof/internal/profile/prune.go
+++ /dev/null
@@ -1,97 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Implements methods to remove frames from profiles.
-
-package profile
-
-import (
-	"fmt"
-	"regexp"
-)
-
-// Prune removes all nodes beneath a node matching dropRx, and not
-// matching keepRx. If the root node of a Sample matches, the sample
-// will have an empty stack.
-func (p *Profile) Prune(dropRx, keepRx *regexp.Regexp) {
-	prune := make(map[uint64]bool)
-	pruneBeneath := make(map[uint64]bool)
-
-	for _, loc := range p.Location {
-		var i int
-		for i = len(loc.Line) - 1; i >= 0; i-- {
-			if fn := loc.Line[i].Function; fn != nil && fn.Name != "" {
-				funcName := fn.Name
-				// Account for leading '.' on the PPC ELF v1 ABI.
-				if funcName[0] == '.' {
-					funcName = funcName[1:]
-				}
-				if dropRx.MatchString(funcName) {
-					if keepRx == nil || !keepRx.MatchString(funcName) {
-						break
-					}
-				}
-			}
-		}
-
-		if i >= 0 {
-			// Found matching entry to prune.
-			pruneBeneath[loc.ID] = true
-
-			// Remove the matching location.
-			if i == len(loc.Line)-1 {
-				// Matched the top entry: prune the whole location.
-				prune[loc.ID] = true
-			} else {
-				loc.Line = loc.Line[i+1:]
-			}
-		}
-	}
-
-	// Prune locs from each Sample
-	for _, sample := range p.Sample {
-		// Scan from the root to the leaves to find the prune location.
-		// Do not prune frames before the first user frame, to avoid
-		// pruning everything.
-		foundUser := false
-		for i := len(sample.Location) - 1; i >= 0; i-- {
-			id := sample.Location[i].ID
-			if !prune[id] && !pruneBeneath[id] {
-				foundUser = true
-				continue
-			}
-			if !foundUser {
-				continue
-			}
-			if prune[id] {
-				sample.Location = sample.Location[i+1:]
-				break
-			}
-			if pruneBeneath[id] {
-				sample.Location = sample.Location[i:]
-				break
-			}
-		}
-	}
-}
-
-// RemoveUninteresting prunes and elides profiles using built-in
-// tables of uninteresting function names.
-func (p *Profile) RemoveUninteresting() error {
-	var keep, drop *regexp.Regexp
-	var err error
-
-	if p.DropFrames != "" {
-		if drop, err = regexp.Compile("^(" + p.DropFrames + ")$"); err != nil {
-			return fmt.Errorf("failed to compile regexp %s: %v", p.DropFrames, err)
-		}
-		if p.KeepFrames != "" {
-			if keep, err = regexp.Compile("^(" + p.KeepFrames + ")$"); err != nil {
-				return fmt.Errorf("failed to compile regexp %s: %v", p.KeepFrames, err)
-			}
-		}
-		p.Prune(drop, keep)
-	}
-	return nil
-}
diff --git a/src/runtime/pprof/label.go b/src/runtime/pprof/label.go
index 20f9cdb..b614f12 100644
--- a/src/runtime/pprof/label.go
+++ b/src/runtime/pprof/label.go
@@ -6,6 +6,9 @@
 
 import (
 	"context"
+	"fmt"
+	"sort"
+	"strings"
 )
 
 type label struct {
@@ -34,6 +37,23 @@
 // that admits incremental immutable modification more efficiently.
 type labelMap map[string]string
 
+// String satisfies Stringer and returns key, value pairs in a consistent
+// order.
+func (l *labelMap) String() string {
+	if l == nil {
+		return ""
+	}
+	keyVals := make([]string, 0, len(*l))
+
+	for k, v := range *l {
+		keyVals = append(keyVals, fmt.Sprintf("%q:%q", k, v))
+	}
+
+	sort.Strings(keyVals)
+
+	return "{" + strings.Join(keyVals, ", ") + "}"
+}
+
 // WithLabels returns a new context.Context with the given labels added.
 // A label overwrites a prior label with the same key.
 func WithLabels(ctx context.Context, labels LabelSet) context.Context {
@@ -54,17 +74,18 @@
 // Labels takes an even number of strings representing key-value pairs
 // and makes a LabelSet containing them.
 // A label overwrites a prior label with the same key.
-// Currently only CPU profile utilizes labels information.
+// Currently only the CPU and goroutine profiles utilize any labels
+// information.
 // See https://golang.org/issue/23458 for details.
 func Labels(args ...string) LabelSet {
 	if len(args)%2 != 0 {
 		panic("uneven number of arguments to pprof.Labels")
 	}
-	labels := LabelSet{}
+	list := make([]label, 0, len(args)/2)
 	for i := 0; i+1 < len(args); i += 2 {
-		labels.list = append(labels.list, label{key: args[i], value: args[i+1]})
+		list = append(list, label{key: args[i], value: args[i+1]})
 	}
-	return labels
+	return LabelSet{list: list}
 }
 
 // Label returns the value of the label with the given key on ctx, and a boolean indicating
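
A minimal standalone sketch of what the new labelMap.String method produces; the type is duplicated here only so the example compiles on its own, and the label set is an assumed example:

package main

import (
	"fmt"
	"sort"
	"strings"
)

// labelMap mirrors the unexported runtime/pprof type for this sketch only.
type labelMap map[string]string

// String renders labels as sorted, quoted key/value pairs, matching the
// method added above.
func (l *labelMap) String() string {
	if l == nil {
		return ""
	}
	keyVals := make([]string, 0, len(*l))
	for k, v := range *l {
		keyVals = append(keyVals, fmt.Sprintf("%q:%q", k, v))
	}
	sort.Strings(keyVals)
	return "{" + strings.Join(keyVals, ", ") + "}"
}

func main() {
	m := labelMap{"handler": "index", "region": "us"} // assumed example labels
	fmt.Println(m.String())                           // {"handler":"index", "region":"us"}
}
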
diff --git a/src/runtime/pprof/label_test.go b/src/runtime/pprof/label_test.go
index 240445f..fcb00bd 100644
--- a/src/runtime/pprof/label_test.go
+++ b/src/runtime/pprof/label_test.go
@@ -24,7 +24,7 @@
 func (s labelSorter) Less(i, j int) bool { return s[i].key < s[j].key }
 
 func TestContextLabels(t *testing.T) {
-	// Background context starts with no lablels.
+	// Background context starts with no labels.
 	ctx := context.Background()
 	labels := labelsSorted(ctx)
 	if len(labels) != 0 {
@@ -80,3 +80,35 @@
 		t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
 	}
 }
+
+func TestLabelMapStringer(t *testing.T) {
+	for _, tbl := range []struct {
+		m        labelMap
+		expected string
+	}{
+		{
+			m: labelMap{
+				// empty map
+			},
+			expected: "{}",
+		}, {
+			m: labelMap{
+				"foo": "bar",
+			},
+			expected: `{"foo":"bar"}`,
+		}, {
+			m: labelMap{
+				"foo":             "bar",
+				"key1":            "value1",
+				"key2":            "value2",
+				"key3":            "value3",
+				"key4WithNewline": "\nvalue4",
+			},
+			expected: `{"foo":"bar", "key1":"value1", "key2":"value2", "key3":"value3", "key4WithNewline":"\nvalue4"}`,
+		},
+	} {
+		if got := tbl.m.String(); tbl.expected != got {
+			t.Errorf("%#v.String() = %q; want %q", tbl.m, got, tbl.expected)
+		}
+	}
+}
diff --git a/src/runtime/pprof/map.go b/src/runtime/pprof/map.go
index a271ad0..7c75872 100644
--- a/src/runtime/pprof/map.go
+++ b/src/runtime/pprof/map.go
@@ -68,7 +68,8 @@
 	if len(m.freeStk) < len(stk) {
 		m.freeStk = make([]uintptr, 1024)
 	}
-	e.stk = m.freeStk[:len(stk)]
+	// Limit cap to prevent append from clobbering freeStk.
+	e.stk = m.freeStk[:len(stk):len(stk)]
 	m.freeStk = m.freeStk[len(stk):]
 
 	for j := range stk {
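
The full slice expression above is the whole fix; below is a small self-contained sketch (toy data, not the profiler's real structures) of why capping the capacity keeps a later append from writing into the shared free pool:

package main

import "fmt"

func main() {
	free := make([]uintptr, 8)

	// Without a capacity limit, an append on the sub-slice can write
	// into the portion of free handed out to the next entry.
	a := free[:2]
	a = append(a, 99) // clobbers free[2]

	// With the full slice expression the capacity equals the length,
	// so append must allocate a fresh backing array.
	b := free[4:6:6]
	b = append(b, 77) // free[6] is untouched

	fmt.Println(free) // [0 0 99 0 0 0 0 0]
	fmt.Println(b)    // [0 0 77]
}
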
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index 4c14527..f253f07 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -2,11 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !js
+
 package pprof
 
 import (
 	"bytes"
 	"fmt"
+	"internal/profile"
 	"reflect"
 	"regexp"
 	"runtime"
@@ -27,6 +30,10 @@
 	memSink = make([]byte, 2<<20)
 }
 
+func allocateTransient2MInline() {
+	memSink = make([]byte, 2<<20)
+}
+
 type Obj32 struct {
 	link *Obj32
 	pad  [32 - unsafe.Sizeof(uintptr(0))]byte
@@ -71,42 +78,99 @@
 	// Do the interesting allocations.
 	allocateTransient1M()
 	allocateTransient2M()
+	allocateTransient2MInline()
 	allocatePersistent1K()
 	allocateReflect()
 	memSink = nil
 
 	runtime.GC() // materialize stats
-	var buf bytes.Buffer
-	if err := Lookup("heap").WriteTo(&buf, 1); err != nil {
-		t.Fatalf("failed to write heap profile: %v", err)
-	}
 
 	memoryProfilerRun++
 
-	tests := []string{
-		fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:40
-#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:74
+	tests := []struct {
+		stk    []string
+		legacy string
+	}{{
+		stk: []string{"runtime/pprof.allocatePersistent1K", "runtime/pprof.TestMemoryProfiler"},
+		legacy: fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+#	0x[0-9,a-f]+	runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:47
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:82
 `, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
-
-		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:21
-#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:72
+	}, {
+		stk: []string{"runtime/pprof.allocateTransient1M", "runtime/pprof.TestMemoryProfiler"},
+		legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:24
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:79
 `, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
-
-		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:27
-#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:73
+	}, {
+		stk: []string{"runtime/pprof.allocateTransient2M", "runtime/pprof.TestMemoryProfiler"},
+		legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:30
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:80
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
-
-		fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
-#	0x[0-9,a-f]+	runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:48
+	}, {
+		stk: []string{"runtime/pprof.allocateTransient2MInline", "runtime/pprof.TestMemoryProfiler"},
+		legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+#	0x[0-9,a-f]+	runtime/pprof\.allocateTransient2MInline\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:34
+#	0x[0-9,a-f]+	runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:81
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
-	}
+	}, {
+		stk: []string{"runtime/pprof.allocateReflectTransient"},
+		legacy: fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
+#	0x[0-9,a-f]+	runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:55
+`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
+	}}
 
-	for _, test := range tests {
-		if !regexp.MustCompile(test).Match(buf.Bytes()) {
-			t.Fatalf("The entry did not match:\n%v\n\nProfile:\n%v\n", test, buf.String())
+	t.Run("debug=1", func(t *testing.T) {
+		var buf bytes.Buffer
+		if err := Lookup("heap").WriteTo(&buf, 1); err != nil {
+			t.Fatalf("failed to write heap profile: %v", err)
 		}
-	}
+
+		for _, test := range tests {
+			if !regexp.MustCompile(test.legacy).Match(buf.Bytes()) {
+				t.Fatalf("The entry did not match:\n%v\n\nProfile:\n%v\n", test.legacy, buf.String())
+			}
+		}
+	})
+
+	t.Run("proto", func(t *testing.T) {
+		var buf bytes.Buffer
+		if err := Lookup("heap").WriteTo(&buf, 0); err != nil {
+			t.Fatalf("failed to write heap profile: %v", err)
+		}
+		p, err := profile.Parse(&buf)
+		if err != nil {
+			t.Fatalf("failed to parse heap profile: %v", err)
+		}
+		t.Logf("Profile = %v", p)
+
+		stks := stacks(p)
+		for _, test := range tests {
+			if !containsStack(stks, test.stk) {
+				t.Fatalf("No matching stack entry for %q\n\nProfile:\n%v\n", test.stk, p)
+			}
+		}
+
+		if !containsInlinedCall(TestMemoryProfiler, 4<<10) {
+			t.Logf("Can't determine whether allocateTransient2MInline was inlined into TestMemoryProfiler.")
+			return
+		}
+
+		// Check the inlined function location is encoded correctly.
+		for _, loc := range p.Location {
+			inlinedCaller, inlinedCallee := false, false
+			for _, line := range loc.Line {
+				if line.Function.Name == "runtime/pprof.allocateTransient2MInline" {
+					inlinedCallee = true
+				}
+				if inlinedCallee && line.Function.Name == "runtime/pprof.TestMemoryProfiler" {
+					inlinedCaller = true
+				}
+			}
+			if inlinedCallee != inlinedCaller {
+				t.Errorf("want allocateTransient2MInline after TestMemoryProfiler in one location, got separate location entries:\n%v", loc)
+			}
+		}
+	})
 }
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index 74cdd15..d3b7df3 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -28,7 +28,7 @@
 //            if err != nil {
 //                log.Fatal("could not create CPU profile: ", err)
 //            }
-//            defer f.Close()
+//            defer f.Close() // error handling omitted for example
 //            if err := pprof.StartCPUProfile(f); err != nil {
 //                log.Fatal("could not start CPU profile: ", err)
 //            }
@@ -42,7 +42,7 @@
 //            if err != nil {
 //                log.Fatal("could not create memory profile: ", err)
 //            }
-//            defer f.Close()
+//            defer f.Close() // error handling omitted for example
 //            runtime.GC() // get up-to-date statistics
 //            if err := pprof.WriteHeapProfile(f); err != nil {
 //                log.Fatal("could not write memory profile: ", err)
@@ -313,9 +313,11 @@
 // Otherwise, WriteTo returns nil.
 //
 // The debug parameter enables additional output.
-// Passing debug=0 prints only the hexadecimal addresses that pprof needs.
-// Passing debug=1 adds comments translating addresses to function names
-// and line numbers, so that a programmer can read the profile without tools.
+// Passing debug=0 writes the gzip-compressed protocol buffer described
+// in https://github.com/google/pprof/tree/master/proto#overview.
+// Passing debug=1 writes the legacy text format with comments
+// translating addresses to function names and line numbers, so that a
+// programmer can read the profile without tools.
 //
 // The predefined profiles may assign meaning to other debug values;
 // for example, when printing the "goroutine" profile, debug=2 means to
@@ -355,6 +357,7 @@
 
 func (x stackProfile) Len() int              { return len(x) }
 func (x stackProfile) Stack(i int) []uintptr { return x[i] }
+func (x stackProfile) Label(i int) *labelMap { return nil }
 
 // A countProfile is a set of stack traces to be printed as counts
 // grouped by stack trace. There are multiple implementations:
@@ -363,6 +366,7 @@
 type countProfile interface {
 	Len() int
 	Stack(i int) []uintptr
+	Label(i int) *labelMap
 }
 
 // printCountCycleProfile outputs block profile records (for block or mutex profiles)
@@ -386,16 +390,9 @@
 		count, nanosec := scaler(r.Count, float64(r.Cycles)/cpuGHz)
 		values[0] = count
 		values[1] = int64(nanosec)
-		locs = locs[:0]
-		for _, addr := range r.Stack() {
-			// For count profiles, all stack addresses are
-			// return PCs, which is what locForPC expects.
-			l := b.locForPC(addr)
-			if l == 0 { // runtime.goexit
-				continue
-			}
-			locs = append(locs, l)
-		}
+		// For count profiles, all stack addresses are
+		// return PCs, which is what appendLocsForStack expects.
+		locs = b.appendLocsForStack(locs[:0], r.Stack())
 		b.pbSample(values, locs, nil)
 	}
 	b.build()
@@ -407,12 +404,16 @@
 func printCountProfile(w io.Writer, debug int, name string, p countProfile) error {
 	// Build count of each stack.
 	var buf bytes.Buffer
-	key := func(stk []uintptr) string {
+	key := func(stk []uintptr, lbls *labelMap) string {
 		buf.Reset()
 		fmt.Fprintf(&buf, "@")
 		for _, pc := range stk {
 			fmt.Fprintf(&buf, " %#x", pc)
 		}
+		if lbls != nil {
+			buf.WriteString("\n# labels: ")
+			buf.WriteString(lbls.String())
+		}
 		return buf.String()
 	}
 	count := map[string]int{}
@@ -420,7 +421,7 @@
 	var keys []string
 	n := p.Len()
 	for i := 0; i < n; i++ {
-		k := key(p.Stack(i))
+		k := key(p.Stack(i), p.Label(i))
 		if count[k] == 0 {
 			index[k] = i
 			keys = append(keys, k)
@@ -451,17 +452,19 @@
 	var locs []uint64
 	for _, k := range keys {
 		values[0] = int64(count[k])
-		locs = locs[:0]
-		for _, addr := range p.Stack(index[k]) {
-			// For count profiles, all stack addresses are
-			// return PCs, which is what locForPC expects.
-			l := b.locForPC(addr)
-			if l == 0 { // runtime.goexit
-				continue
+		// For count profiles, all stack addresses are
+		// return PCs, which is what appendLocsForStack expects.
+		locs = b.appendLocsForStack(locs[:0], p.Stack(index[k]))
+		idx := index[k]
+		var labels func()
+		if p.Label(idx) != nil {
+			labels = func() {
+				for k, v := range *p.Label(idx) {
+					b.pbLabel(tagSample_Label, k, v, 0)
+				}
 			}
-			locs = append(locs, l)
 		}
-		b.pbSample(values, locs, nil)
+		b.pbSample(values, locs, labels)
 	}
 	b.build()
 	return nil
@@ -642,6 +645,9 @@
 	fmt.Fprintf(w, "# GCCPUFraction = %v\n", s.GCCPUFraction)
 	fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
 
+	// Also flush out MaxRSS on supported platforms.
+	addMaxRSS(w)
+
 	tw.Flush()
 	return b.Flush()
 }
@@ -654,7 +660,12 @@
 
 // writeThreadCreate writes the current runtime ThreadCreateProfile to w.
 func writeThreadCreate(w io.Writer, debug int) error {
-	return writeRuntimeProfile(w, debug, "threadcreate", runtime.ThreadCreateProfile)
+	// Until https://golang.org/issues/6104 is addressed, wrap
+	// ThreadCreateProfile because there's no point in tracking labels when we
+	// don't get any stack-traces.
+	return writeRuntimeProfile(w, debug, "threadcreate", func(p []runtime.StackRecord, _ []unsafe.Pointer) (n int, ok bool) {
+		return runtime.ThreadCreateProfile(p)
+	})
 }
 
 // countGoroutine returns the number of goroutines.
@@ -662,12 +673,15 @@
 	return runtime.NumGoroutine()
 }
 
+// runtime_goroutineProfileWithLabels is defined in runtime/mprof.go
+func runtime_goroutineProfileWithLabels(p []runtime.StackRecord, labels []unsafe.Pointer) (n int, ok bool)
+
 // writeGoroutine writes the current runtime GoroutineProfile to w.
 func writeGoroutine(w io.Writer, debug int) error {
 	if debug >= 2 {
 		return writeGoroutineStacks(w)
 	}
-	return writeRuntimeProfile(w, debug, "goroutine", runtime.GoroutineProfile)
+	return writeRuntimeProfile(w, debug, "goroutine", runtime_goroutineProfileWithLabels)
 }
 
 func writeGoroutineStacks(w io.Writer) error {
@@ -691,7 +705,7 @@
 	return err
 }
 
-func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runtime.StackRecord) (int, bool)) error {
+func writeRuntimeProfile(w io.Writer, debug int, name string, fetch func([]runtime.StackRecord, []unsafe.Pointer) (int, bool)) error {
 	// Find out how many records there are (fetch(nil)),
 	// allocate that many records, and get the data.
 	// There's a race—more records might be added between
@@ -699,13 +713,15 @@
 	// and also try again if we're very unlucky.
 	// The loop should only execute one iteration in the common case.
 	var p []runtime.StackRecord
-	n, ok := fetch(nil)
+	var labels []unsafe.Pointer
+	n, ok := fetch(nil, nil)
 	for {
 		// Allocate room for a slightly bigger profile,
 		// in case a few more entries have been added
 		// since the call to ThreadProfile.
 		p = make([]runtime.StackRecord, n+10)
-		n, ok = fetch(p)
+		labels = make([]unsafe.Pointer, n+10)
+		n, ok = fetch(p, labels)
 		if ok {
 			p = p[0:n]
 			break
@@ -713,13 +729,17 @@
 		// Profile grew; try again.
 	}
 
-	return printCountProfile(w, debug, name, runtimeProfile(p))
+	return printCountProfile(w, debug, name, &runtimeProfile{p, labels})
 }
 
-type runtimeProfile []runtime.StackRecord
+type runtimeProfile struct {
+	stk    []runtime.StackRecord
+	labels []unsafe.Pointer
+}
 
-func (p runtimeProfile) Len() int              { return len(p) }
-func (p runtimeProfile) Stack(i int) []uintptr { return p[i].Stack() }
+func (p *runtimeProfile) Len() int              { return len(p.stk) }
+func (p *runtimeProfile) Stack(i int) []uintptr { return p.stk[i].Stack() }
+func (p *runtimeProfile) Label(i int) *labelMap { return (*labelMap)(p.labels[i]) }
 
 var cpu struct {
 	sync.Mutex
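
A short usage sketch of the behavior these pprof.go changes enable: goroutines running under pprof.Do now surface their labels in the debug=1 goroutine profile (the label key and value here are arbitrary examples):

package main

import (
	"context"
	"os"
	"runtime/pprof"
)

func main() {
	// The current goroutine carries the labels while the callback runs, so
	// its entry in the goroutine profile gains a "# labels: {...}" line.
	pprof.Do(context.Background(), pprof.Labels("worker", "demo"), func(ctx context.Context) {
		pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
	})
}
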
diff --git a/src/runtime/pprof/pprof_norusage.go b/src/runtime/pprof/pprof_norusage.go
new file mode 100644
index 0000000..6fdcc6c
--- /dev/null
+++ b/src/runtime/pprof/pprof_norusage.go
@@ -0,0 +1,15 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !darwin,!linux
+
+package pprof
+
+import (
+	"io"
+)
+
+// addMaxRSS is a stub for platforms that don't support rusage.
+func addMaxRSS(w io.Writer) {
+}
diff --git a/src/runtime/pprof/pprof_rusage.go b/src/runtime/pprof/pprof_rusage.go
new file mode 100644
index 0000000..d42e6ed
--- /dev/null
+++ b/src/runtime/pprof/pprof_rusage.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin linux
+
+package pprof
+
+import (
+	"fmt"
+	"io"
+	"runtime"
+	"syscall"
+)
+
+// addMaxRSS writes the maximum resident set size to w on platforms that support it.
+func addMaxRSS(w io.Writer) {
+	var rssToBytes uintptr
+	switch runtime.GOOS {
+	case "linux", "android":
+		rssToBytes = 1024
+	case "darwin":
+		rssToBytes = 1
+	default:
+		panic("unsupported OS")
+	}
+
+	var rusage syscall.Rusage
+	syscall.Getrusage(0, &rusage)
+	fmt.Fprintf(w, "# MaxRSS = %d\n", uintptr(rusage.Maxrss)*rssToBytes)
+}
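
For reference, a minimal sketch of the syscall addMaxRSS builds on, assuming a Linux host where ru_maxrss is reported in kilobytes:

// +build linux

package main

import (
	"fmt"
	"syscall"
)

func main() {
	var ru syscall.Rusage
	if err := syscall.Getrusage(syscall.RUSAGE_SELF, &ru); err != nil {
		fmt.Println("getrusage:", err)
		return
	}
	// Linux reports Maxrss in kilobytes, matching rssToBytes = 1024 above.
	fmt.Printf("# MaxRSS = %d\n", uintptr(ru.Maxrss)*1024)
}
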
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 5349637..7149bfb 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !nacl,!js
+// +build !js
 
 package pprof
 
@@ -10,6 +10,7 @@
 	"bytes"
 	"context"
 	"fmt"
+	"internal/profile"
 	"internal/testenv"
 	"io"
 	"io/ioutil"
@@ -18,7 +19,6 @@
 	"os/exec"
 	"regexp"
 	"runtime"
-	"runtime/pprof/internal/profile"
 	"strings"
 	"sync"
 	"sync/atomic"
@@ -49,8 +49,12 @@
 // Must not call other functions nor access heap/globals in the loop,
 // otherwise under race detector the samples will be in the race runtime.
 func cpuHog1(x int) int {
+	return cpuHog0(x, 1e5)
+}
+
+func cpuHog0(x, n int) int {
 	foo := x
-	for i := 0; i < 1e5; i++ {
+	for i := 0; i < n; i++ {
 		if foo > 0 {
 			foo *= foo
 		} else {
@@ -100,35 +104,149 @@
 	})
 }
 
+// containsInlinedCall reports whether the function body for the function f is
+// known to contain an inlined function call within the first maxBytes bytes.
+func containsInlinedCall(f interface{}, maxBytes int) bool {
+	_, found := findInlinedCall(f, maxBytes)
+	return found
+}
+
+// findInlinedCall returns the PC of an inlined function call within
+// the function body for the function f if any.
+func findInlinedCall(f interface{}, maxBytes int) (pc uint64, found bool) {
+	fFunc := runtime.FuncForPC(uintptr(funcPC(f)))
+	if fFunc == nil || fFunc.Entry() == 0 {
+		panic("failed to locate function entry")
+	}
+
+	for offset := 0; offset < maxBytes; offset++ {
+		innerPC := fFunc.Entry() + uintptr(offset)
+		inner := runtime.FuncForPC(innerPC)
+		if inner == nil {
+			// No function known for this PC value.
+			// It might simply be misaligned, so keep searching.
+			continue
+		}
+		if inner.Entry() != fFunc.Entry() {
+			// Scanned past f and didn't find any inlined functions.
+			break
+		}
+		if inner.Name() != fFunc.Name() {
+			// This PC has f as its entry-point, but is not f. Therefore, it must be a
+			// function inlined into f.
+			return uint64(innerPC), true
+		}
+	}
+
+	return 0, false
+}
+
 func TestCPUProfileInlining(t *testing.T) {
-	testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
+	if !containsInlinedCall(inlinedCaller, 4<<10) {
+		t.Skip("Can't determine whether inlinedCallee was inlined into inlinedCaller.")
+	}
+
+	p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
 		cpuHogger(inlinedCaller, &salt1, dur)
 	})
+
+	// Check if inlined function locations are encoded correctly. The inlinedCallee and inlinedCaller should be in one location.
+	for _, loc := range p.Location {
+		hasInlinedCallerAfterInlinedCallee, hasInlinedCallee := false, false
+		for _, line := range loc.Line {
+			if line.Function.Name == "runtime/pprof.inlinedCallee" {
+				hasInlinedCallee = true
+			}
+			if hasInlinedCallee && line.Function.Name == "runtime/pprof.inlinedCaller" {
+				hasInlinedCallerAfterInlinedCallee = true
+			}
+		}
+		if hasInlinedCallee != hasInlinedCallerAfterInlinedCallee {
+			t.Fatalf("want inlinedCallee followed by inlinedCaller, got separate Location entries:\n%v", p)
+		}
+	}
 }
 
 func inlinedCaller(x int) int {
-	x = inlinedCallee(x)
+	x = inlinedCallee(x, 1e5)
 	return x
 }
 
-func inlinedCallee(x int) int {
-	// We could just use cpuHog1, but for loops prevent inlining
-	// right now. :(
-	foo := x
-	i := 0
-loop:
-	if foo > 0 {
-		foo *= foo
-	} else {
-		foo *= foo + 1
-	}
-	if i++; i < 1e5 {
-		goto loop
-	}
-	return foo
+func inlinedCallee(x, n int) int {
+	return cpuHog0(x, n)
 }
 
-func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) {
+//go:noinline
+func dumpCallers(pcs []uintptr) {
+	if pcs == nil {
+		return
+	}
+
+	skip := 2 // Callers and dumpCallers
+	runtime.Callers(skip, pcs)
+}
+
+//go:noinline
+func inlinedCallerDump(pcs []uintptr) {
+	inlinedCalleeDump(pcs)
+}
+
+func inlinedCalleeDump(pcs []uintptr) {
+	dumpCallers(pcs)
+}
+
+func TestCPUProfileRecursion(t *testing.T) {
+	p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) {
+		cpuHogger(recursionCaller, &salt1, dur)
+	})
+
+	// check the Location encoding was not confused by recursive calls.
+	for i, loc := range p.Location {
+		recursionFunc := 0
+		for _, line := range loc.Line {
+			if name := line.Function.Name; name == "runtime/pprof.recursionCaller" || name == "runtime/pprof.recursionCallee" {
+				recursionFunc++
+			}
+		}
+		if recursionFunc > 1 {
+			t.Fatalf("want at most one recursionCaller or recursionCallee in one Location, got a violating Location (index: %d):\n%v", i, p)
+		}
+	}
+}
+
+func recursionCaller(x int) int {
+	y := recursionCallee(3, x)
+	return y
+}
+
+func recursionCallee(n, x int) int {
+	if n == 0 {
+		return 1
+	}
+	y := inlinedCallee(x, 1e4)
+	return y * recursionCallee(n-1, x)
+}
+
+func recursionChainTop(x int, pcs []uintptr) {
+	if x < 0 {
+		return
+	}
+	recursionChainMiddle(x, pcs)
+}
+
+func recursionChainMiddle(x int, pcs []uintptr) {
+	recursionChainBottom(x, pcs)
+}
+
+func recursionChainBottom(x int, pcs []uintptr) {
+	// This will be called each time; we only care about the last. We
+	// can't make this conditional or this function won't be inlined.
+	dumpCallers(pcs)
+
+	recursionChainTop(x-1, pcs)
+}
+
+func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile {
 	p, err := profile.Parse(bytes.NewReader(valBytes))
 	if err != nil {
 		t.Fatal(err)
@@ -137,15 +255,16 @@
 		count := uintptr(sample.Value[0])
 		f(count, sample.Location, sample.Label)
 	}
+	return p
 }
 
 // testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need,
-// as interpreted by matches.
-func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) {
+// as interpreted by matches, and returns the parsed profile.
+func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile {
 	switch runtime.GOOS {
 	case "darwin":
 		switch runtime.GOARCH {
-		case "arm", "arm64":
+		case "arm64":
 			// nothing
 		default:
 			out, err := exec.Command("uname", "-a").CombinedOutput()
@@ -195,8 +314,8 @@
 		f(duration)
 		StopCPUProfile()
 
-		if profileOk(t, matches, need, avoid, prof, duration) {
-			return
+		if p, ok := profileOk(t, matches, need, avoid, prof, duration); ok {
+			return p
 		}
 
 		duration *= 2
@@ -217,6 +336,7 @@
 		t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
 	}
 	t.FailNow()
+	return nil
 }
 
 func contains(slice []string, s string) bool {
@@ -242,7 +362,7 @@
 
 type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool
 
-func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
+func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (_ *profile.Profile, ok bool) {
 	ok = true
 
 	// Check that profile is well formed, contains 'need', and does not contain
@@ -251,7 +371,7 @@
 	avoidSamples := make([]uintptr, len(avoid))
 	var samples uintptr
 	var buf bytes.Buffer
-	parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
+	p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
 		fmt.Fprintf(&buf, "%d:", count)
 		fprintStack(&buf, stk)
 		samples += count
@@ -278,7 +398,7 @@
 		// not enough samples due to coarse timer
 		// resolution. Let it go.
 		t.Log("too few samples on Windows (golang.org/issue/10842)")
-		return false
+		return p, false
 	}
 
 	// Check that we got a reasonable number of samples.
@@ -300,7 +420,7 @@
 	}
 
 	if len(need) == 0 {
-		return ok
+		return p, ok
 	}
 
 	var total uintptr
@@ -323,7 +443,7 @@
 			ok = false
 		}
 	}
-	return ok
+	return p, ok
 }
 
 // Fork can hang if preempted with signals frequently enough (see issue 5517).
@@ -857,6 +977,26 @@
 			runtime.Gosched()
 		}
 	}
+	ctx := context.Background()
+
+	// ... and again, with labels this time (just with fewer iterations to keep
+	// sorting deterministic).
+	Do(ctx, Labels("label", "value"), func(context.Context) {
+		for i := 0; i < 89; i++ {
+			switch {
+			case i%10 == 0:
+				go func1(c)
+			case i%2 == 0:
+				go func2(c)
+			default:
+				go func3(c)
+			}
+			// Let goroutines block on channel
+			for j := 0; j < 5; j++ {
+				runtime.Gosched()
+			}
+		}
+	})
 
 	var w bytes.Buffer
 	goroutineProf := Lookup("goroutine")
@@ -865,8 +1005,11 @@
 	goroutineProf.WriteTo(&w, 1)
 	prof := w.String()
 
-	if !containsInOrder(prof, "\n50 @ ", "\n40 @", "\n10 @", "\n1 @") {
-		t.Errorf("expected sorted goroutine counts:\n%s", prof)
+	labels := labelMap{"label": "value"}
+	labelStr := "\n# labels: " + labels.String()
+	if !containsInOrder(prof, "\n50 @ ", "\n44 @", labelStr,
+		"\n40 @", "\n36 @", labelStr, "\n10 @", "\n9 @", labelStr, "\n1 @") {
+		t.Errorf("expected sorted goroutine counts with Labels:\n%s", prof)
 	}
 
 	// Check proto profile
@@ -879,9 +1022,18 @@
 	if err := p.CheckValid(); err != nil {
 		t.Errorf("protobuf profile is invalid: %v", err)
 	}
-	if !containsCounts(p, []int64{50, 40, 10, 1}) {
-		t.Errorf("expected count profile to contain goroutines with counts %v, got %v",
-			[]int64{50, 40, 10, 1}, p)
+	expectedLabels := map[int64]map[string]string{
+		50: map[string]string{},
+		44: map[string]string{"label": "value"},
+		40: map[string]string{},
+		36: map[string]string{"label": "value"},
+		10: map[string]string{},
+		9:  map[string]string{"label": "value"},
+		1:  map[string]string{},
+	}
+	if !containsCountsLabels(p, expectedLabels) {
+		t.Errorf("expected count profile to contain goroutines with counts and labels %v, got %v",
+			expectedLabels, p)
 	}
 
 	close(c)
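For readers unfamiliar with the labels API exercised above, here is a minimal standalone sketch (not part of this patch; the label key, value, and goroutine counts are arbitrary) of how a user program attaches profiler labels with runtime/pprof.Do so that goroutines started inside the callback are grouped under a "# labels: ..." line in the debug=1 goroutine profile:

package main

import (
	"context"
	"os"
	"runtime/pprof"
)

func main() {
	ctx := context.Background()
	// Goroutines started while the labels are in effect inherit them, which is
	// what the test above relies on when it counts labeled goroutines.
	pprof.Do(ctx, pprof.Labels("label", "value"), func(ctx context.Context) {
		for i := 0; i < 4; i++ {
			go func() { select {} }() // blocked goroutines carrying the label
		}
	})
	// debug=1 prints human-readable goroutine groups, including their labels.
	pprof.Lookup("goroutine").WriteTo(os.Stdout, 1)
}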
@@ -900,10 +1052,23 @@
 	return true
 }
 
-func containsCounts(prof *profile.Profile, counts []int64) bool {
+func containsCountsLabels(prof *profile.Profile, countLabels map[int64]map[string]string) bool {
 	m := make(map[int64]int)
-	for _, c := range counts {
+	type nkey struct {
+		count    int64
+		key, val string
+	}
+	n := make(map[nkey]int)
+	for c, kv := range countLabels {
 		m[c]++
+		for k, v := range kv {
+			n[nkey{
+				count: c,
+				key:   k,
+				val:   v,
+			}]++
+
+		}
 	}
 	for _, s := range prof.Sample {
 		// The count is the single value in the sample
@@ -911,12 +1076,26 @@
 			return false
 		}
 		m[s.Value[0]]--
+		for k, vs := range s.Label {
+			for _, v := range vs {
+				n[nkey{
+					count: s.Value[0],
+					key:   k,
+					val:   v,
+				}]--
+			}
+		}
 	}
 	for _, n := range m {
 		if n > 0 {
 			return false
 		}
 	}
+	for _, ncnt := range n {
+		if ncnt != 0 {
+			return false
+		}
+	}
 	return true
 }
 
@@ -1056,3 +1235,222 @@
 		runtime.Stack(buf, true)
 	}
 }
+
+// TestTryAdd tests the cases that are hard to test with real program execution.
+//
+// For example, the current Go compilers may not always inline functions
+// involved in recursion, but that may change in future compilers. This test
+// covers such cases by using fake call sequences and forcing the profile to be
+// built via translateCPUProfile, defined in proto_test.go.
+func TestTryAdd(t *testing.T) {
+	if _, found := findInlinedCall(inlinedCallerDump, 4<<10); !found {
+		t.Skip("Can't determine whether anything was inlined into inlinedCallerDump.")
+	}
+
+	// inlinedCallerDump
+	//   inlinedCalleeDump
+	pcs := make([]uintptr, 2)
+	inlinedCallerDump(pcs)
+	inlinedCallerStack := make([]uint64, 2)
+	for i := range pcs {
+		inlinedCallerStack[i] = uint64(pcs[i])
+	}
+
+	if _, found := findInlinedCall(recursionChainBottom, 4<<10); !found {
+		t.Skip("Can't determine whether anything was inlined into recursionChainBottom.")
+	}
+
+	// recursionChainTop
+	//   recursionChainMiddle
+	//     recursionChainBottom
+	//       recursionChainTop
+	//         recursionChainMiddle
+	//           recursionChainBottom
+	pcs = make([]uintptr, 6)
+	recursionChainTop(1, pcs)
+	recursionStack := make([]uint64, len(pcs))
+	for i := range pcs {
+		recursionStack[i] = uint64(pcs[i])
+	}
+
+	period := int64(2000 * 1000) // 1/500*1e9 nanosec.
+
+	testCases := []struct {
+		name        string
+		input       []uint64          // following the input format assumed by profileBuilder.addCPUData.
+		wantLocs    [][]string        // ordered location entries with function names.
+		wantSamples []*profile.Sample // ordered samples, we care only about Value and the profile location IDs.
+	}{{
+		// Sanity test for a normal, complete stack trace.
+		name: "full_stack_trace",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			5, 0, 50, inlinedCallerStack[0], inlinedCallerStack[1],
+		},
+		wantLocs: [][]string{
+			{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"},
+		},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		name: "bug35538",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			// Fake frame: tryAdd will have inlinedCallerDump
+			// (stack[1]) on the deck when it encounters the next
+			// inline function. It should accept this.
+			7, 0, 10, inlinedCallerStack[0], inlinedCallerStack[1], inlinedCallerStack[0], inlinedCallerStack[1],
+			5, 0, 20, inlinedCallerStack[0], inlinedCallerStack[1],
+		},
+		wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{10, 10 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}}},
+			{Value: []int64{20, 20 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		name: "bug38096",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			// count (data[2]) == 0 && len(stk) == 1 is an overflow
+			// entry. The "stk" entry is actually the count.
+			4, 0, 0, 4242,
+		},
+		wantLocs: [][]string{{"runtime/pprof.lostProfileEvent"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{4242, 4242 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		// If a function is directly called recursively then it must
+		// not be inlined in the caller.
+		//
+		// N.B. We're generating an impossible profile here, with a
+		// recursive inlinedCalleeDump call. This simulates a non-Go
+		// function that looks like an inlined Go function except for
+		// its recursive property. See pcDeck.tryAdd.
+		name: "directly_recursive_func_is_not_inlined",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			5, 0, 30, inlinedCallerStack[0], inlinedCallerStack[0],
+			4, 0, 40, inlinedCallerStack[0],
+		},
+		// inlinedCallerDump shows up here because
+		// runtime_expandFinalInlineFrame adds it to the stack frame.
+		wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump"}, {"runtime/pprof.inlinedCallerDump"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{30, 30 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}, {ID: 2}}},
+			{Value: []int64{40, 40 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}}},
+		},
+	}, {
+		name: "recursion_chain_inline",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			9, 0, 10, recursionStack[0], recursionStack[1], recursionStack[2], recursionStack[3], recursionStack[4], recursionStack[5],
+		},
+		wantLocs: [][]string{
+			{"runtime/pprof.recursionChainBottom"},
+			{
+				"runtime/pprof.recursionChainMiddle",
+				"runtime/pprof.recursionChainTop",
+				"runtime/pprof.recursionChainBottom",
+			},
+			{
+				"runtime/pprof.recursionChainMiddle",
+				"runtime/pprof.recursionChainTop",
+				"runtime/pprof.TestTryAdd", // inlined into the test.
+			},
+		},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{10, 10 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}, {ID: 3}}},
+		},
+	}, {
+		name: "truncated_stack_trace_later",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			5, 0, 50, inlinedCallerStack[0], inlinedCallerStack[1],
+			4, 0, 60, inlinedCallerStack[0],
+		},
+		wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}},
+			{Value: []int64{60, 60 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		name: "truncated_stack_trace_first",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			4, 0, 70, inlinedCallerStack[0],
+			5, 0, 80, inlinedCallerStack[0], inlinedCallerStack[1],
+		},
+		wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}},
+			{Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		// We can recover the inlined caller from a truncated stack.
+		name: "truncated_stack_trace_only",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			4, 0, 70, inlinedCallerStack[0],
+		},
+		wantLocs: [][]string{{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"}},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}},
+		},
+	}, {
+		// The same location is used for duplicated stacks.
+		name: "truncated_stack_trace_twice",
+		input: []uint64{
+			3, 0, 500, // hz = 500. Must match the period.
+			4, 0, 70, inlinedCallerStack[0],
+			// Fake frame: add a fake call to
+			// inlinedCallerDump to prevent this sample
+			// from getting merged into above.
+			5, 0, 80, inlinedCallerStack[1], inlinedCallerStack[0],
+		},
+		wantLocs: [][]string{
+			{"runtime/pprof.inlinedCalleeDump", "runtime/pprof.inlinedCallerDump"},
+			{"runtime/pprof.inlinedCallerDump"},
+		},
+		wantSamples: []*profile.Sample{
+			{Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}},
+			{Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 2}, {ID: 1}}},
+		},
+	}}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			p, err := translateCPUProfile(tc.input)
+			if err != nil {
+				t.Fatalf("translating profile: %v", err)
+			}
+			t.Logf("Profile: %v\n", p)
+
+			// One location entry with all inlined functions.
+			var gotLoc [][]string
+			for _, loc := range p.Location {
+				var names []string
+				for _, line := range loc.Line {
+					names = append(names, line.Function.Name)
+				}
+				gotLoc = append(gotLoc, names)
+			}
+			if got, want := fmtJSON(gotLoc), fmtJSON(tc.wantLocs); got != want {
+				t.Errorf("Got Location = %+v\n\twant %+v", got, want)
+			}
+			// All samples should point to one location.
+			var gotSamples []*profile.Sample
+			for _, sample := range p.Sample {
+				var locs []*profile.Location
+				for _, loc := range sample.Location {
+					locs = append(locs, &profile.Location{ID: loc.ID})
+				}
+				gotSamples = append(gotSamples, &profile.Sample{Value: sample.Value, Location: locs})
+			}
+			if got, want := fmtJSON(gotSamples), fmtJSON(tc.wantSamples); got != want {
+				t.Errorf("Got Samples = %+v\n\twant %+v", got, want)
+			}
+		})
+	}
+}
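For reference, the fake input slices above follow the length-prefixed record layout assumed by profileBuilder.addCPUData: the first record {3, 0, 500} carries the sampling rate in hz, and every sample record is its total length in words, a word the tests leave as zero, the sample count, and then the stack PCs (so a two-PC sample is {5, 0, count, pc0, pc1}, and an overflow record such as the bug38096 case is {4, 0, 0, lostCount}). A helper of roughly this shape could assemble such inputs; it is only a sketch under those assumptions, with a hypothetical name:

// fakeCPUProfileInput assembles a fake CPU profile in the format the tests
// above feed to translateCPUProfile: one header record followed by one
// record per sample stack.
func fakeCPUProfileInput(hz uint64, counts []uint64, stacks [][]uint64) []uint64 {
	input := []uint64{3, 0, hz} // header record: length 3, unused word, hz
	for i, stk := range stacks {
		// Sample record: total length 3+len(stk), unused word, count, PCs.
		input = append(input, uint64(3+len(stk)), 0, counts[i])
		input = append(input, stk...)
	}
	return input
}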
diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go
index 7864dd7..8519af6 100644
--- a/src/runtime/pprof/proto.go
+++ b/src/runtime/pprof/proto.go
@@ -41,9 +41,10 @@
 	pb        protobuf
 	strings   []string
 	stringMap map[string]int
-	locs      map[uintptr]int
-	funcs     map[string]int // Package path-qualified function name to Function.ID
+	locs      map[uintptr]locInfo // location info keyed by the PC that starts its pc sequence.
+	funcs     map[string]int      // Package path-qualified function name to Function.ID
 	mem       []memMap
+	deck      pcDeck
 }
 
 type memMap struct {
@@ -207,15 +208,7 @@
 	b.pb.endMessage(tag, start)
 }
 
-// locForPC returns the location ID for addr.
-// addr must a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call.
-// It may emit to b.pb, so there must be no message encoding in progress.
-func (b *profileBuilder) locForPC(addr uintptr) uint64 {
-	id := uint64(b.locs[addr])
-	if id != 0 {
-		return id
-	}
-
+func allFrames(addr uintptr) ([]runtime.Frame, symbolizeFlag) {
 	// Expand this one address using CallersFrames so we can cache
 	// each expansion. In general, CallersFrames takes a whole
 	// stack, but in this case we know there will be no skips in
@@ -225,7 +218,7 @@
 	if frame.Function == "runtime.goexit" {
 		// Short-circuit if we see runtime.goexit so the loop
 		// below doesn't allocate a useless empty location.
-		return 0
+		return nil, 0
 	}
 
 	symbolizeResult := lookupTried
@@ -238,59 +231,22 @@
 		// a reasonable call PC. This mostly happens in tests.
 		frame.PC = addr - 1
 	}
-
-	// We can't write out functions while in the middle of the
-	// Location message, so record new functions we encounter and
-	// write them out after the Location.
-	type newFunc struct {
-		id         uint64
-		name, file string
-	}
-	newFuncs := make([]newFunc, 0, 8)
-
-	id = uint64(len(b.locs)) + 1
-	b.locs[addr] = int(id)
-	start := b.pb.startMessage()
-	b.pb.uint64Opt(tagLocation_ID, id)
-	b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC))
-	for frame.Function != "runtime.goexit" {
-		// Write out each line in frame expansion.
-		funcID := uint64(b.funcs[frame.Function])
-		if funcID == 0 {
-			funcID = uint64(len(b.funcs)) + 1
-			b.funcs[frame.Function] = int(funcID)
-			newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
-		}
-		b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
-		if !more {
-			break
-		}
+	ret := []runtime.Frame{frame}
+	for frame.Function != "runtime.goexit" && more {
 		frame, more = frames.Next()
+		ret = append(ret, frame)
 	}
-	for i := range b.mem {
-		if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake {
-			b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
+	return ret, symbolizeResult
+}
 
-			m := b.mem[i]
-			m.funcs |= symbolizeResult
-			b.mem[i] = m
-			break
-		}
-	}
-	b.pb.endMessage(tagProfile_Location, start)
+type locInfo struct {
+	// location id assigned by the profileBuilder
+	id uint64
 
-	// Write out functions we found during frame expansion.
-	for _, fn := range newFuncs {
-		start := b.pb.startMessage()
-		b.pb.uint64Opt(tagFunction_ID, fn.id)
-		b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
-		b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
-		b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
-		b.pb.endMessage(tagProfile_Function, start)
-	}
-
-	b.flush()
-	return id
+	// sequence of PCs, including the fake PCs returned by the traceback
+	// to represent inlined functions
+	// https://github.com/golang/go/blob/d6f2f833c93a41ec1c68e49804b8387a06b131c5/src/runtime/traceback.go#L347-L368
+	pcs []uintptr
 }
 
 // newProfileBuilder returns a new profileBuilder.
@@ -305,7 +261,7 @@
 		start:     time.Now(),
 		strings:   []string{""},
 		stringMap: map[string]int{"": 0},
-		locs:      map[uintptr]int{},
+		locs:      map[uintptr]locInfo{},
 		funcs:     map[string]int{},
 	}
 	b.readMapping()
@@ -366,7 +322,10 @@
 			// overflow record
 			count = uint64(stk[0])
 			stk = []uint64{
-				uint64(funcPC(lostProfileEvent)),
+				// gentraceback guarantees that PCs in the
+				// stack can be unconditionally decremented and
+				// still be valid, so we must do the same.
+				uint64(funcPC(lostProfileEvent) + 1),
 			}
 		}
 		b.m.lookup(stk, tag).count += int64(count)
@@ -389,6 +348,7 @@
 
 	values := []int64{0, 0}
 	var locs []uint64
+
 	for e := b.m.all; e != nil; e = e.nextAll {
 		values[0] = e.count
 		values[1] = e.count * b.period
@@ -402,23 +362,8 @@
 			}
 		}
 
-		locs = locs[:0]
-		for i, addr := range e.stk {
-			// Addresses from stack traces point to the
-			// next instruction after each call, except
-			// for the leaf, which points to where the
-			// signal occurred. locForPC expects return
-			// PCs, so increment the leaf address to look
-			// like a return PC.
-			if i == 0 {
-				addr++
-			}
-			l := b.locForPC(addr)
-			if l == 0 { // runtime.goexit
-				continue
-			}
-			locs = append(locs, l)
-		}
+		locs = b.appendLocsForStack(locs[:0], e.stk)
+
 		b.pbSample(values, locs, labels)
 	}
 
@@ -435,6 +380,197 @@
 	b.zw.Close()
 }
 
+// appendLocsForStack appends the location IDs for the given stack trace to the given
+// location ID slice, locs. The addresses in the stack are return PCs or 1 + the PC of
+// an inline marker, as returned by the runtime traceback functions.
+//
+// It may emit to b.pb, so there must be no message encoding in progress.
+func (b *profileBuilder) appendLocsForStack(locs []uint64, stk []uintptr) (newLocs []uint64) {
+	b.deck.reset()
+
+	// The last frame might be truncated. Recover lost inline frames.
+	stk = runtime_expandFinalInlineFrame(stk)
+
+	for len(stk) > 0 {
+		addr := stk[0]
+		if l, ok := b.locs[addr]; ok {
+			// first record the location if there is any pending accumulated info.
+			if id := b.emitLocation(); id > 0 {
+				locs = append(locs, id)
+			}
+
+			// then, record the cached location.
+			locs = append(locs, l.id)
+
+			// Skip the matching pcs.
+			//
+			// Even if stk was truncated due to the stack depth
+			// limit, expandFinalInlineFrame above has already
+			// fixed the truncation, ensuring it is long enough.
+			stk = stk[len(l.pcs):]
+			continue
+		}
+
+		frames, symbolizeResult := allFrames(addr)
+		if len(frames) == 0 { // runtime.goexit.
+			if id := b.emitLocation(); id > 0 {
+				locs = append(locs, id)
+			}
+			stk = stk[1:]
+			continue
+		}
+
+		if added := b.deck.tryAdd(addr, frames, symbolizeResult); added {
+			stk = stk[1:]
+			continue
+		}
+		// add failed because this addr is not inlined with the
+		// existing PCs in the deck. Flush the deck and retry handling
+		// this pc.
+		if id := b.emitLocation(); id > 0 {
+			locs = append(locs, id)
+		}
+
+		// check cache again - previous emitLocation added a new entry
+		if l, ok := b.locs[addr]; ok {
+			locs = append(locs, l.id)
+			stk = stk[len(l.pcs):] // skip the matching pcs.
+		} else {
+			b.deck.tryAdd(addr, frames, symbolizeResult) // must succeed.
+			stk = stk[1:]
+		}
+	}
+	if id := b.emitLocation(); id > 0 { // emit remaining location.
+		locs = append(locs, id)
+	}
+	return locs
+}
+
+// pcDeck is a helper to detect a sequence of inlined functions from
+// a stack trace returned by the runtime.
+//
+// The stack traces returned by runtime's traceback functions are fully
+// expanded (at least for Go functions) and include the fake pcs representing
+// inlined functions. The profile proto expects the inlined functions to be
+// encoded in one Location message.
+// https://github.com/google/pprof/blob/5e965273ee43930341d897407202dd5e10e952cb/proto/profile.proto#L177-L184
+//
+// Runtime does not directly expose whether a frame is for an inlined function
+// and looking up debug info is not ideal, so we use a heuristic to filter
+// the fake pcs and restore the inlined and entry functions. Inlined functions
+// have the following properties:
+//   Frame's Func is nil (note: also true for non-Go functions), and
+//   Frame's Entry matches its entry function frame's Entry (note: could also be true for recursive calls and non-Go functions), and
+//   Frame's Name does not match its entry function frame's name (note: inlined functions cannot be directly recursive).
+//
+// As we read and process the pcs in a stack trace one by one (from leaf to root),
+// we use pcDeck to temporarily hold the observed pcs and their expanded frames
+// until we observe the entry function frame.
+type pcDeck struct {
+	pcs             []uintptr
+	frames          []runtime.Frame
+	symbolizeResult symbolizeFlag
+}
+
+func (d *pcDeck) reset() {
+	d.pcs = d.pcs[:0]
+	d.frames = d.frames[:0]
+	d.symbolizeResult = 0
+}
+
+// tryAdd tries to add the pc and Frames expanded from it (most likely one,
+// since the stack trace is already fully expanded) and the symbolizeResult
+// to the deck. If it fails the caller needs to flush the deck and retry.
+func (d *pcDeck) tryAdd(pc uintptr, frames []runtime.Frame, symbolizeResult symbolizeFlag) (success bool) {
+	if existing := len(d.pcs); existing > 0 {
+		// 'd.frames' are all expanded from one 'pc' and represent all
+		// inlined functions so we check only the last one.
+		newFrame := frames[0]
+		last := d.frames[existing-1]
+		if last.Func != nil { // the last frame can't be inlined. Flush.
+			return false
+		}
+		if last.Entry == 0 || newFrame.Entry == 0 { // Possibly not a Go function. Don't try to merge.
+			return false
+		}
+
+		if last.Entry != newFrame.Entry { // newFrame is for a different function.
+			return false
+		}
+		if last.Function == newFrame.Function { // maybe recursion.
+			return false
+		}
+	}
+	d.pcs = append(d.pcs, pc)
+	d.frames = append(d.frames, frames...)
+	d.symbolizeResult |= symbolizeResult
+	return true
+}
+
+// emitLocation emits the new location and function information recorded in the deck
+// and returns the location ID encoded in the profile protobuf.
+// It emits to b.pb, so there must be no message encoding in progress.
+// It resets the deck.
+func (b *profileBuilder) emitLocation() uint64 {
+	if len(b.deck.pcs) == 0 {
+		return 0
+	}
+	defer b.deck.reset()
+
+	addr := b.deck.pcs[0]
+	firstFrame := b.deck.frames[0]
+
+	// We can't write out functions while in the middle of the
+	// Location message, so record new functions we encounter and
+	// write them out after the Location.
+	type newFunc struct {
+		id         uint64
+		name, file string
+	}
+	newFuncs := make([]newFunc, 0, 8)
+
+	id := uint64(len(b.locs)) + 1
+	b.locs[addr] = locInfo{id: id, pcs: append([]uintptr{}, b.deck.pcs...)}
+
+	start := b.pb.startMessage()
+	b.pb.uint64Opt(tagLocation_ID, id)
+	b.pb.uint64Opt(tagLocation_Address, uint64(firstFrame.PC))
+	for _, frame := range b.deck.frames {
+		// Write out each line in frame expansion.
+		funcID := uint64(b.funcs[frame.Function])
+		if funcID == 0 {
+			funcID = uint64(len(b.funcs)) + 1
+			b.funcs[frame.Function] = int(funcID)
+			newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
+		}
+		b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
+	}
+	for i := range b.mem {
+		if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake {
+			b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
+
+			m := b.mem[i]
+			m.funcs |= b.deck.symbolizeResult
+			b.mem[i] = m
+			break
+		}
+	}
+	b.pb.endMessage(tagProfile_Location, start)
+
+	// Write out functions we found during frame expansion.
+	for _, fn := range newFuncs {
+		start := b.pb.startMessage()
+		b.pb.uint64Opt(tagFunction_ID, fn.id)
+		b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
+		b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
+		b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
+		b.pb.endMessage(tagProfile_Function, start)
+	}
+
+	b.flush()
+	return id
+}
+
 // readMapping reads /proc/self/maps and writes mappings to b.pb.
 // It saves the address ranges of the mappings in b.mem for use
 // when emitting locations.
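The pcDeck heuristic introduced above keys on properties of runtime.Frame that are observable from ordinary user code. A standalone sketch (not part of the patch; whether leaf is actually inlined depends on the compiler) that prints those properties for a captured stack:

package main

import (
	"fmt"
	"runtime"
)

func record(p []uintptr) int { return runtime.Callers(1, p) }

// leaf is trivial, so the compiler may (but is not guaranteed to) inline it
// into caller.
func leaf(p []uintptr) int { return record(p) }

func caller(p []uintptr) int { return leaf(p) }

func main() {
	pcs := make([]uintptr, 16)
	n := caller(pcs)
	frames := runtime.CallersFrames(pcs[:n])
	for {
		f, more := frames.Next()
		// An inlined frame reports Func == nil while sharing Entry with its
		// enclosing physical function; these are the properties the pcDeck
		// comment lists for folding frames into a single Location.
		fmt.Printf("%-40s funcIsNil=%v entry=%#x\n", f.Function, f.Func == nil, f.Entry)
		if !more {
			break
		}
	}
}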
diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go
index bcb4d33..3043d53 100644
--- a/src/runtime/pprof/proto_test.go
+++ b/src/runtime/pprof/proto_test.go
@@ -8,13 +8,13 @@
 	"bytes"
 	"encoding/json"
 	"fmt"
+	"internal/profile"
 	"internal/testenv"
 	"io/ioutil"
 	"os"
 	"os/exec"
 	"reflect"
 	"runtime"
-	"runtime/pprof/internal/profile"
 	"strings"
 	"testing"
 )
@@ -116,9 +116,9 @@
 
 	b := []uint64{
 		3, 0, 500, // hz = 500
-		5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
-		5, 0, 40, uint64(addr2), uint64(addr2 + 2), // 40 samples in addr2
-		5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+		5, 0, 10, uint64(addr1 + 1), uint64(addr1 + 2), // 10 samples in addr1
+		5, 0, 40, uint64(addr2 + 1), uint64(addr2 + 2), // 40 samples in addr2
+		5, 0, 10, uint64(addr1 + 1), uint64(addr1 + 2), // 10 samples in addr1
 	}
 	p, err := translateCPUProfile(b)
 	if err != nil {
@@ -358,6 +358,17 @@
 					continue
 				}
 			}
+
+			if traceback == "Go+C" {
+				// The test code was arranged to have PCs from C and
+				// they are not symbolized.
+				// Check no Location containing those unsymbolized PCs contains multiple lines.
+				for i, loc := range prof.Location {
+					if !symbolized(loc) && len(loc.Line) > 1 {
+						t.Errorf("Location[%d] contains unsymbolized PCs and multiple lines: %v", i, loc)
+					}
+				}
+			}
 		})
 	}
 }
@@ -411,3 +422,16 @@
 		}
 	}
 }
+
+// Make sure the profiler can handle an empty stack trace.
+// See issue 37967.
+func TestEmptyStack(t *testing.T) {
+	b := []uint64{
+		3, 0, 500, // hz = 500
+		3, 0, 10, // 10 samples with an empty stack trace
+	}
+	_, err := translateCPUProfile(b)
+	if err != nil {
+		t.Fatalf("translating profile: %v", err)
+	}
+}
diff --git a/src/runtime/pprof/protomem.go b/src/runtime/pprof/protomem.go
index 1c88aae..fa75a28 100644
--- a/src/runtime/pprof/protomem.go
+++ b/src/runtime/pprof/protomem.go
@@ -27,30 +27,27 @@
 	values := []int64{0, 0, 0, 0}
 	var locs []uint64
 	for _, r := range p {
-		locs = locs[:0]
 		hideRuntime := true
 		for tries := 0; tries < 2; tries++ {
-			for _, addr := range r.Stack() {
-				// For heap profiles, all stack
-				// addresses are return PCs, which is
-				// what locForPC expects.
-				if hideRuntime {
+			stk := r.Stack()
+			// For heap profiles, all stack
+			// addresses are return PCs, which is
+			// what appendLocsForStack expects.
+			if hideRuntime {
+				for i, addr := range stk {
 					if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") {
 						continue
 					}
 					// Found non-runtime. Show any runtime uses above it.
-					hideRuntime = false
+					stk = stk[i:]
+					break
 				}
-				l := b.locForPC(addr)
-				if l == 0 { // runtime.goexit
-					continue
-				}
-				locs = append(locs, l)
 			}
+			locs = b.appendLocsForStack(locs[:0], stk)
 			if len(locs) > 0 {
 				break
 			}
-			hideRuntime = false // try again, and show all frames
+			hideRuntime = false // try again, and show all frames next time.
 		}
 
 		values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
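As a usage note, the heap-profile path above is what runs when a program writes its heap profile in proto format (debug=0). A minimal sketch, not part of the patch, with an arbitrary output file name:

package main

import (
	"os"
	"runtime"
	"runtime/pprof"
)

func main() {
	buf := make([]byte, 1<<20) // allocate something likely to be sampled
	_ = buf
	runtime.GC() // bring the heap profile up to date before writing it
	f, err := os.Create("heap.pb.gz")
	if err != nil {
		panic(err)
	}
	defer f.Close()
	// WriteHeapProfile emits the proto-format profile built by the code above,
	// with runtime-internal frames at the top of each stack hidden unless a
	// stack is entirely runtime-internal.
	if err := pprof.WriteHeapProfile(f); err != nil {
		panic(err)
	}
}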
diff --git a/src/runtime/pprof/protomem_test.go b/src/runtime/pprof/protomem_test.go
index 471b1ae..156f628 100644
--- a/src/runtime/pprof/protomem_test.go
+++ b/src/runtime/pprof/protomem_test.go
@@ -6,8 +6,8 @@
 
 import (
 	"bytes"
+	"internal/profile"
 	"runtime"
-	"runtime/pprof/internal/profile"
 	"testing"
 )
 
diff --git a/src/runtime/pprof/runtime.go b/src/runtime/pprof/runtime.go
index b71bbad..dd2545b 100644
--- a/src/runtime/pprof/runtime.go
+++ b/src/runtime/pprof/runtime.go
@@ -9,6 +9,9 @@
 	"unsafe"
 )
 
+// runtime_expandFinalInlineFrame is defined in runtime/symtab.go.
+func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr
+
 // runtime_setProfLabel is defined in runtime/proflabel.go.
 func runtime_setProfLabel(labels unsafe.Pointer)
 
diff --git a/src/runtime/pprof/testdata/mappingtest/main.go b/src/runtime/pprof/testdata/mappingtest/main.go
index 476b9e8..484b7f9 100644
--- a/src/runtime/pprof/testdata/mappingtest/main.go
+++ b/src/runtime/pprof/testdata/mappingtest/main.go
@@ -17,8 +17,7 @@
 int cpuHogCSalt1 = 0;
 int cpuHogCSalt2 = 0;
 
-void CPUHogCFunction() {
-	int foo = cpuHogCSalt1;
+void CPUHogCFunction0(int foo) {
 	int i;
 	for (i = 0; i < 100000; i++) {
 		if (foo > 0) {
@@ -30,6 +29,10 @@
 	}
 }
 
+void CPUHogCFunction() {
+	CPUHogCFunction0(cpuHogCSalt1);
+}
+
 struct CgoTracebackArg {
 	uintptr_t context;
         uintptr_t sigContext;
@@ -39,8 +42,9 @@
 
 void CollectCgoTraceback(void* parg) {
         struct CgoTracebackArg* arg = (struct CgoTracebackArg*)(parg);
-	arg->buf[0] = (uintptr_t)(CPUHogCFunction);
-	arg->buf[1] = 0;
+	arg->buf[0] = (uintptr_t)(CPUHogCFunction0);
+	arg->buf[1] = (uintptr_t)(CPUHogCFunction);
+	arg->buf[2] = 0;
 };
 */
 import "C"
@@ -81,7 +85,6 @@
 var salt2 int
 
 func cpuHogGoFunction() {
-	// Generates CPU profile samples including a Go call path.
 	for {
 		foo := salt1
 		for i := 0; i < 1e5; i++ {
diff --git a/src/runtime/preempt.go b/src/runtime/preempt.go
new file mode 100644
index 0000000..7618565
--- /dev/null
+++ b/src/runtime/preempt.go
@@ -0,0 +1,481 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Goroutine preemption
+//
+// A goroutine can be preempted at any safe-point. Currently, there
+// are a few categories of safe-points:
+//
+// 1. A blocked safe-point occurs for the duration that a goroutine is
+//    descheduled, blocked on synchronization, or in a system call.
+//
+// 2. Synchronous safe-points occur when a running goroutine checks
+//    for a preemption request.
+//
+// 3. Asynchronous safe-points occur at any instruction in user code
+//    where the goroutine can be safely paused and a conservative
+//    stack and register scan can find stack roots. The runtime can
+//    stop a goroutine at an async safe-point using a signal.
+//
+// At both blocked and synchronous safe-points, a goroutine's CPU
+// state is minimal and the garbage collector has complete information
+// about its entire stack. This makes it possible to deschedule a
+// goroutine with minimal space, and to precisely scan a goroutine's
+// stack.
+//
+// Synchronous safe-points are implemented by overloading the stack
+// bound check in function prologues. To preempt a goroutine at the
+// next synchronous safe-point, the runtime poisons the goroutine's
+// stack bound to a value that will cause the next stack bound check
+// to fail and enter the stack growth implementation, which will
+// detect that it was actually a preemption and redirect to preemption
+// handling.
+//
+// Preemption at asynchronous safe-points is implemented by suspending
+// the thread using an OS mechanism (e.g., signals) and inspecting its
+// state to determine if the goroutine was at an asynchronous
+// safe-point. Since the thread suspension itself is generally
+// asynchronous, it also checks if the running goroutine wants to be
+// preempted, since this could have changed. If all conditions are
+// satisfied, it adjusts the signal context to make it look like the
+// signaled thread just called asyncPreempt and resumes the thread.
+// asyncPreempt spills all registers and enters the scheduler.
+//
+// (An alternative would be to preempt in the signal handler itself.
+// This would let the OS save and restore the register state and the
+// runtime would only need to know how to extract potentially
+// pointer-containing registers from the signal context. However, this
+// would consume an M for every preempted G, and the scheduler itself
+// is not designed to run from a signal handler, as it tends to
+// allocate memory and start threads in the preemption path.)
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Keep in sync with cmd/compile/internal/gc/plive.go:go115ReduceLiveness.
+const go115ReduceLiveness = true
+
+const go115RestartSeq = go115ReduceLiveness && true // enable restartable sequences
+
+type suspendGState struct {
+	g *g
+
+	// dead indicates the goroutine was not suspended because it
+	// is dead. This goroutine could be reused after the dead
+	// state was observed, so the caller must not assume that it
+	// remains dead.
+	dead bool
+
+	// stopped indicates that this suspendG transitioned the G to
+	// _Gwaiting via g.preemptStop and thus is responsible for
+	// readying it when done.
+	stopped bool
+}
+
+// suspendG suspends goroutine gp at a safe-point and returns the
+// state of the suspended goroutine. The caller gets read access to
+// the goroutine until it calls resumeG.
+//
+// It is safe for multiple callers to attempt to suspend the same
+// goroutine at the same time. The goroutine may execute between
+// subsequent successful suspend operations. The current
+// implementation grants exclusive access to the goroutine, and hence
+// multiple callers will serialize. However, the intent is to grant
+// shared read access, so please don't depend on exclusive access.
+//
+// This must be called from the system stack and the user goroutine on
+// the current M (if any) must be in a preemptible state. This
+// prevents deadlocks where two goroutines attempt to suspend each
+// other and both are in non-preemptible states. There are other ways
+// to resolve this deadlock, but this seems simplest.
+//
+// TODO(austin): What if we instead required this to be called from a
+// user goroutine? Then we could deschedule the goroutine while
+// waiting instead of blocking the thread. If two goroutines tried to
+// suspend each other, one of them would win and the other wouldn't
+// complete the suspend until it was resumed. We would have to be
+// careful that they couldn't actually queue up suspend for each other
+// and then both be suspended. This would also avoid the need for a
+// kernel context switch in the synchronous case because we could just
+// directly schedule the waiter. The context switch is unavoidable in
+// the signal case.
+//
+//go:systemstack
+func suspendG(gp *g) suspendGState {
+	if mp := getg().m; mp.curg != nil && readgstatus(mp.curg) == _Grunning {
+		// Since we're on the system stack of this M, the user
+		// G is stuck at an unsafe point. If another goroutine
+		// were to try to preempt m.curg, it could deadlock.
+		throw("suspendG from non-preemptible goroutine")
+	}
+
+	// See https://golang.org/cl/21503 for justification of the yield delay.
+	const yieldDelay = 10 * 1000
+	var nextYield int64
+
+	// Drive the goroutine to a preemption point.
+	stopped := false
+	var asyncM *m
+	var asyncGen uint32
+	var nextPreemptM int64
+	for i := 0; ; i++ {
+		switch s := readgstatus(gp); s {
+		default:
+			if s&_Gscan != 0 {
+				// Someone else is suspending it. Wait
+				// for them to finish.
+				//
+				// TODO: It would be nicer if we could
+				// coalesce suspends.
+				break
+			}
+
+			dumpgstatus(gp)
+			throw("invalid g status")
+
+		case _Gdead:
+			// Nothing to suspend.
+			//
+			// preemptStop may need to be cleared, but
+			// doing that here could race with goroutine
+			// reuse. Instead, goexit0 clears it.
+			return suspendGState{dead: true}
+
+		case _Gcopystack:
+			// The stack is being copied. We need to wait
+			// until this is done.
+
+		case _Gpreempted:
+			// We (or someone else) suspended the G. Claim
+			// ownership of it by transitioning it to
+			// _Gwaiting.
+			if !casGFromPreempted(gp, _Gpreempted, _Gwaiting) {
+				break
+			}
+
+			// We stopped the G, so we have to ready it later.
+			stopped = true
+
+			s = _Gwaiting
+			fallthrough
+
+		case _Grunnable, _Gsyscall, _Gwaiting:
+			// Claim goroutine by setting scan bit.
+			// This may race with execution or readying of gp.
+			// The scan bit keeps it from transitioning state.
+			if !castogscanstatus(gp, s, s|_Gscan) {
+				break
+			}
+
+			// Clear the preemption request. It's safe to
+			// reset the stack guard because we hold the
+			// _Gscan bit and thus own the stack.
+			gp.preemptStop = false
+			gp.preempt = false
+			gp.stackguard0 = gp.stack.lo + _StackGuard
+
+			// The goroutine was already at a safe-point
+			// and we've now locked that in.
+			//
+			// TODO: It would be much better if we didn't
+			// leave it in _Gscan, but instead gently
+			// prevented its scheduling until resumption.
+			// Maybe we only use this to bump a suspended
+			// count and the scheduler skips suspended
+			// goroutines? That wouldn't be enough for
+			// {_Gsyscall,_Gwaiting} -> _Grunning. Maybe
+			// for all those transitions we need to check
+			// suspended and deschedule?
+			return suspendGState{g: gp, stopped: stopped}
+
+		case _Grunning:
+			// Optimization: if there is already a pending preemption request
+			// (from the previous loop iteration), don't bother with the atomics.
+			if gp.preemptStop && gp.preempt && gp.stackguard0 == stackPreempt && asyncM == gp.m && atomic.Load(&asyncM.preemptGen) == asyncGen {
+				break
+			}
+
+			// Temporarily block state transitions.
+			if !castogscanstatus(gp, _Grunning, _Gscanrunning) {
+				break
+			}
+
+			// Request synchronous preemption.
+			gp.preemptStop = true
+			gp.preempt = true
+			gp.stackguard0 = stackPreempt
+
+			// Prepare for asynchronous preemption.
+			asyncM2 := gp.m
+			asyncGen2 := atomic.Load(&asyncM2.preemptGen)
+			needAsync := asyncM != asyncM2 || asyncGen != asyncGen2
+			asyncM = asyncM2
+			asyncGen = asyncGen2
+
+			casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+
+			// Send asynchronous preemption. We do this
+			// after CASing the G back to _Grunning
+			// because preemptM may be synchronous and we
+			// don't want to catch the G just spinning on
+			// its status.
+			if preemptMSupported && debug.asyncpreemptoff == 0 && needAsync {
+				// Rate limit preemptM calls. This is
+				// particularly important on Windows
+				// where preemptM is actually
+				// synchronous and the spin loop here
+				// can lead to live-lock.
+				now := nanotime()
+				if now >= nextPreemptM {
+					nextPreemptM = now + yieldDelay/2
+					preemptM(asyncM)
+				}
+			}
+		}
+
+		// TODO: Don't busy wait. This loop should really only
+		// be a simple read/decide/CAS loop that only fails if
+		// there's an active race. Once the CAS succeeds, we
+		// should queue up the preemption (which will require
+		// it to be reliable in the _Grunning case, not
+		// best-effort) and then sleep until we're notified
+		// that the goroutine is suspended.
+		if i == 0 {
+			nextYield = nanotime() + yieldDelay
+		}
+		if nanotime() < nextYield {
+			procyield(10)
+		} else {
+			osyield()
+			nextYield = nanotime() + yieldDelay/2
+		}
+	}
+}
+
+// resumeG undoes the effects of suspendG, allowing the suspended
+// goroutine to continue from its current safe-point.
+func resumeG(state suspendGState) {
+	if state.dead {
+		// We didn't actually stop anything.
+		return
+	}
+
+	gp := state.g
+	switch s := readgstatus(gp); s {
+	default:
+		dumpgstatus(gp)
+		throw("unexpected g status")
+
+	case _Grunnable | _Gscan,
+		_Gwaiting | _Gscan,
+		_Gsyscall | _Gscan:
+		casfrom_Gscanstatus(gp, s, s&^_Gscan)
+	}
+
+	if state.stopped {
+		// We stopped it, so we need to re-schedule it.
+		ready(gp, 0, true)
+	}
+}
+
+// canPreemptM reports whether mp is in a state that is safe to preempt.
+//
+// It is nosplit because it has nosplit callers.
+//
+//go:nosplit
+func canPreemptM(mp *m) bool {
+	return mp.locks == 0 && mp.mallocing == 0 && mp.preemptoff == "" && mp.p.ptr().status == _Prunning
+}
+
+//go:generate go run mkpreempt.go
+
+// asyncPreempt saves all user registers and calls asyncPreempt2.
+//
+// When stack scanning encounters an asyncPreempt frame, it scans that
+// frame and its parent frame conservatively.
+//
+// asyncPreempt is implemented in assembly.
+func asyncPreempt()
+
+//go:nosplit
+func asyncPreempt2() {
+	gp := getg()
+	gp.asyncSafePoint = true
+	if gp.preemptStop {
+		mcall(preemptPark)
+	} else {
+		mcall(gopreempt_m)
+	}
+	gp.asyncSafePoint = false
+}
+
+// asyncPreemptStack is the bytes of stack space required to inject an
+// asyncPreempt call.
+var asyncPreemptStack = ^uintptr(0)
+
+func init() {
+	f := findfunc(funcPC(asyncPreempt))
+	total := funcMaxSPDelta(f)
+	f = findfunc(funcPC(asyncPreempt2))
+	total += funcMaxSPDelta(f)
+	// Add some overhead for return PCs, etc.
+	asyncPreemptStack = uintptr(total) + 8*sys.PtrSize
+	if asyncPreemptStack > _StackLimit {
+		// We need more than the nosplit limit. This isn't
+		// unsafe, but it may limit asynchronous preemption.
+		//
+		// This may be a problem if we start using more
+		// registers. In that case, we should store registers
+		// in a context object. If we pre-allocate one per P,
+		// asyncPreempt can spill just a few registers to the
+		// stack, then grab its context object and spill into
+		// it. When it enters the runtime, it would allocate a
+		// new context for the P.
+		print("runtime: asyncPreemptStack=", asyncPreemptStack, "\n")
+		throw("async stack too large")
+	}
+}
+
+// wantAsyncPreempt returns whether an asynchronous preemption is
+// queued for gp.
+func wantAsyncPreempt(gp *g) bool {
+	// Check both the G and the P.
+	return (gp.preempt || gp.m.p != 0 && gp.m.p.ptr().preempt) && readgstatus(gp)&^_Gscan == _Grunning
+}
+
+// isAsyncSafePoint reports whether gp at instruction PC is an
+// asynchronous safe point. This indicates that:
+//
+// 1. It's safe to suspend gp and conservatively scan its stack and
+// registers. There are no potentially hidden pointer values and it's
+// not in the middle of an atomic sequence like a write barrier.
+//
+// 2. gp has enough stack space to inject the asyncPreempt call.
+//
+// 3. It's generally safe to interact with the runtime, even if we're
+// in a signal handler stopped here. For example, there are no runtime
+// locks held, so acquiring a runtime lock won't self-deadlock.
+//
+// In some cases the PC is safe for asynchronous preemption but it
+// also needs to adjust the resumption PC. The new PC is returned in
+// the second result.
+func isAsyncSafePoint(gp *g, pc, sp, lr uintptr) (bool, uintptr) {
+	mp := gp.m
+
+	// Only user Gs can have safe-points. We check this first
+	// because it's extremely common that we'll catch mp in the
+	// scheduler processing this G preemption.
+	if mp.curg != gp {
+		return false, 0
+	}
+
+	// Check M state.
+	if mp.p == 0 || !canPreemptM(mp) {
+		return false, 0
+	}
+
+	// Check stack space.
+	if sp < gp.stack.lo || sp-gp.stack.lo < asyncPreemptStack {
+		return false, 0
+	}
+
+	// Check if PC is an unsafe-point.
+	f := findfunc(pc)
+	if !f.valid() {
+		// Not Go code.
+		return false, 0
+	}
+	if (GOARCH == "mips" || GOARCH == "mipsle" || GOARCH == "mips64" || GOARCH == "mips64le") && lr == pc+8 && funcspdelta(f, pc, nil) == 0 {
+		// We probably stopped at a half-executed CALL instruction,
+		// where the LR has been updated but the PC has not. If we preempt
+		// here we'll see a seemingly self-recursive call, which is in
+		// fact not.
+		// This is normally ok, as we use the return address saved on
+		// stack for unwinding, not the LR value. But if this is a
+		// call to morestack, we haven't created the frame, and we'll
+		// use the LR for unwinding, which will be bad.
+		return false, 0
+	}
+	var up int32
+	var startpc uintptr
+	if !go115ReduceLiveness {
+		smi := pcdatavalue(f, _PCDATA_RegMapIndex, pc, nil)
+		if smi == -2 {
+			// Unsafe-point marked by compiler. This includes
+			// atomic sequences (e.g., write barrier) and nosplit
+			// functions (except at calls).
+			return false, 0
+		}
+	} else {
+		up, startpc = pcdatavalue2(f, _PCDATA_UnsafePoint, pc)
+		if up != _PCDATA_UnsafePointSafe {
+			// Unsafe-point marked by compiler. This includes
+			// atomic sequences (e.g., write barrier) and nosplit
+			// functions (except at calls).
+			return false, 0
+		}
+	}
+	if fd := funcdata(f, _FUNCDATA_LocalsPointerMaps); fd == nil || fd == unsafe.Pointer(&no_pointers_stackmap) {
+		// This is assembly code. Don't assume it's
+		// well-formed. We identify assembly code by
+		// checking that it has either no stack map, or
+		// no_pointers_stackmap, which is the stack map
+		// for ones marked as NO_LOCAL_POINTERS.
+		//
+		// TODO: Are there cases that are safe but don't have a
+		// locals pointer map, like empty frame functions?
+		return false, 0
+	}
+	name := funcname(f)
+	if inldata := funcdata(f, _FUNCDATA_InlTree); inldata != nil {
+		inltree := (*[1 << 20]inlinedCall)(inldata)
+		ix := pcdatavalue(f, _PCDATA_InlTreeIndex, pc, nil)
+		if ix >= 0 {
+			name = funcnameFromNameoff(f, inltree[ix].func_)
+		}
+	}
+	if hasPrefix(name, "runtime.") ||
+		hasPrefix(name, "runtime/internal/") ||
+		hasPrefix(name, "reflect.") {
+		// For now we never async preempt the runtime or
+		// anything closely tied to the runtime. Known issues
+		// include: various points in the scheduler ("don't
+		// preempt between here and here"), much of the defer
+		// implementation (untyped info on stack), bulk write
+		// barriers (write barrier check),
+		// reflect.{makeFuncStub,methodValueCall}.
+		//
+		// TODO(austin): We should improve this, or opt things
+		// in incrementally.
+		return false, 0
+	}
+	if go115RestartSeq {
+		switch up {
+		case _PCDATA_Restart1, _PCDATA_Restart2:
+			// Restartable instruction sequence. Back off PC to
+			// the start PC.
+			if startpc == 0 || startpc > pc || pc-startpc > 20 {
+				throw("bad restart PC")
+			}
+			return true, startpc
+		case _PCDATA_RestartAtEntry:
+			// Restart from the function entry at resumption.
+			return true, f.entry
+		}
+	} else {
+		switch up {
+		case _PCDATA_Restart1, _PCDATA_Restart2, _PCDATA_RestartAtEntry:
+			// go115RestartSeq is not enabled. Treat it as an unsafe point.
+			return false, 0
+		}
+	}
+	return true, pc
+}
+
+var no_pointers_stackmap uint64 // defined in assembly, for NO_LOCAL_POINTERS macro
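A user-visible consequence of the machinery in this file: a tight loop with no call sites contains no synchronous safe-points, so before asynchronous preemption it could stall a stop-the-world indefinitely, while with signal-based preemption it can be interrupted at an async safe-point. The standalone sketch below (not part of the patch) is expected to terminate promptly under this scheme and to hang if preemption is disabled with GODEBUG=asyncpreemptoff=1:

package main

import (
	"fmt"
	"runtime"
)

// spin never calls another function, so it has no synchronous safe-points
// from prologue stack-bound checks.
func spin() {
	for i := 0; ; i++ {
	}
}

func main() {
	runtime.GOMAXPROCS(1)
	go spin()
	runtime.Gosched() // hand the only P to the spinning goroutine
	runtime.GC()      // the stop-the-world must preempt spin asynchronously
	fmt.Println("GC finished: the loop was preempted at an async safe-point")
}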
diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s
new file mode 100644
index 0000000..a00ac8f
--- /dev/null
+++ b/src/runtime/preempt_386.s
@@ -0,0 +1,52 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	PUSHFL
+	ADJSP $264
+	NOP SP
+	MOVL AX, 0(SP)
+	MOVL CX, 4(SP)
+	MOVL DX, 8(SP)
+	MOVL BX, 12(SP)
+	MOVL BP, 16(SP)
+	MOVL SI, 20(SP)
+	MOVL DI, 24(SP)
+	FSAVE 28(SP)
+	FLDCW runtime·controlWord64(SB)
+	CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE nosse
+	MOVUPS X0, 136(SP)
+	MOVUPS X1, 152(SP)
+	MOVUPS X2, 168(SP)
+	MOVUPS X3, 184(SP)
+	MOVUPS X4, 200(SP)
+	MOVUPS X5, 216(SP)
+	MOVUPS X6, 232(SP)
+	MOVUPS X7, 248(SP)
+nosse:
+	CALL ·asyncPreempt2(SB)
+	CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
+	JNE nosse2
+	MOVUPS 248(SP), X7
+	MOVUPS 232(SP), X6
+	MOVUPS 216(SP), X5
+	MOVUPS 200(SP), X4
+	MOVUPS 184(SP), X3
+	MOVUPS 168(SP), X2
+	MOVUPS 152(SP), X1
+	MOVUPS 136(SP), X0
+nosse2:
+	FRSTOR 28(SP)
+	MOVL 24(SP), DI
+	MOVL 20(SP), SI
+	MOVL 16(SP), BP
+	MOVL 12(SP), BX
+	MOVL 8(SP), DX
+	MOVL 4(SP), CX
+	MOVL 0(SP), AX
+	ADJSP $-264
+	POPFL
+	RET
diff --git a/src/runtime/preempt_amd64.s b/src/runtime/preempt_amd64.s
new file mode 100644
index 0000000..4765e9f
--- /dev/null
+++ b/src/runtime/preempt_amd64.s
@@ -0,0 +1,84 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	PUSHQ BP
+	MOVQ SP, BP
+	// Save flags before clobbering them
+	PUSHFQ
+	// obj doesn't understand ADD/SUB on SP, but does understand ADJSP
+	ADJSP $368
+	// But vet doesn't know ADJSP, so suppress vet stack checking
+	NOP SP
+	#ifdef GOOS_darwin
+	CMPB internal∕cpu·X86+const_offsetX86HasAVX(SB), $0
+	JE 2(PC)
+	VZEROUPPER
+	#endif
+	MOVQ AX, 0(SP)
+	MOVQ CX, 8(SP)
+	MOVQ DX, 16(SP)
+	MOVQ BX, 24(SP)
+	MOVQ SI, 32(SP)
+	MOVQ DI, 40(SP)
+	MOVQ R8, 48(SP)
+	MOVQ R9, 56(SP)
+	MOVQ R10, 64(SP)
+	MOVQ R11, 72(SP)
+	MOVQ R12, 80(SP)
+	MOVQ R13, 88(SP)
+	MOVQ R14, 96(SP)
+	MOVQ R15, 104(SP)
+	MOVUPS X0, 112(SP)
+	MOVUPS X1, 128(SP)
+	MOVUPS X2, 144(SP)
+	MOVUPS X3, 160(SP)
+	MOVUPS X4, 176(SP)
+	MOVUPS X5, 192(SP)
+	MOVUPS X6, 208(SP)
+	MOVUPS X7, 224(SP)
+	MOVUPS X8, 240(SP)
+	MOVUPS X9, 256(SP)
+	MOVUPS X10, 272(SP)
+	MOVUPS X11, 288(SP)
+	MOVUPS X12, 304(SP)
+	MOVUPS X13, 320(SP)
+	MOVUPS X14, 336(SP)
+	MOVUPS X15, 352(SP)
+	CALL ·asyncPreempt2(SB)
+	MOVUPS 352(SP), X15
+	MOVUPS 336(SP), X14
+	MOVUPS 320(SP), X13
+	MOVUPS 304(SP), X12
+	MOVUPS 288(SP), X11
+	MOVUPS 272(SP), X10
+	MOVUPS 256(SP), X9
+	MOVUPS 240(SP), X8
+	MOVUPS 224(SP), X7
+	MOVUPS 208(SP), X6
+	MOVUPS 192(SP), X5
+	MOVUPS 176(SP), X4
+	MOVUPS 160(SP), X3
+	MOVUPS 144(SP), X2
+	MOVUPS 128(SP), X1
+	MOVUPS 112(SP), X0
+	MOVQ 104(SP), R15
+	MOVQ 96(SP), R14
+	MOVQ 88(SP), R13
+	MOVQ 80(SP), R12
+	MOVQ 72(SP), R11
+	MOVQ 64(SP), R10
+	MOVQ 56(SP), R9
+	MOVQ 48(SP), R8
+	MOVQ 40(SP), DI
+	MOVQ 32(SP), SI
+	MOVQ 24(SP), BX
+	MOVQ 16(SP), DX
+	MOVQ 8(SP), CX
+	MOVQ 0(SP), AX
+	ADJSP $-368
+	POPFQ
+	POPQ BP
+	RET
diff --git a/src/runtime/preempt_arm.s b/src/runtime/preempt_arm.s
new file mode 100644
index 0000000..8f243c0
--- /dev/null
+++ b/src/runtime/preempt_arm.s
@@ -0,0 +1,83 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW.W R14, -188(R13)
+	MOVW R0, 4(R13)
+	MOVW R1, 8(R13)
+	MOVW R2, 12(R13)
+	MOVW R3, 16(R13)
+	MOVW R4, 20(R13)
+	MOVW R5, 24(R13)
+	MOVW R6, 28(R13)
+	MOVW R7, 32(R13)
+	MOVW R8, 36(R13)
+	MOVW R9, 40(R13)
+	MOVW R11, 44(R13)
+	MOVW R12, 48(R13)
+	MOVW CPSR, R0
+	MOVW R0, 52(R13)
+	MOVB ·goarm(SB), R0
+	CMP $6, R0
+	BLT nofp
+	MOVW FPCR, R0
+	MOVW R0, 56(R13)
+	MOVD F0, 60(R13)
+	MOVD F1, 68(R13)
+	MOVD F2, 76(R13)
+	MOVD F3, 84(R13)
+	MOVD F4, 92(R13)
+	MOVD F5, 100(R13)
+	MOVD F6, 108(R13)
+	MOVD F7, 116(R13)
+	MOVD F8, 124(R13)
+	MOVD F9, 132(R13)
+	MOVD F10, 140(R13)
+	MOVD F11, 148(R13)
+	MOVD F12, 156(R13)
+	MOVD F13, 164(R13)
+	MOVD F14, 172(R13)
+	MOVD F15, 180(R13)
+nofp:
+	CALL ·asyncPreempt2(SB)
+	MOVB ·goarm(SB), R0
+	CMP $6, R0
+	BLT nofp2
+	MOVD 180(R13), F15
+	MOVD 172(R13), F14
+	MOVD 164(R13), F13
+	MOVD 156(R13), F12
+	MOVD 148(R13), F11
+	MOVD 140(R13), F10
+	MOVD 132(R13), F9
+	MOVD 124(R13), F8
+	MOVD 116(R13), F7
+	MOVD 108(R13), F6
+	MOVD 100(R13), F5
+	MOVD 92(R13), F4
+	MOVD 84(R13), F3
+	MOVD 76(R13), F2
+	MOVD 68(R13), F1
+	MOVD 60(R13), F0
+	MOVW 56(R13), R0
+	MOVW R0, FPCR
+nofp2:
+	MOVW 52(R13), R0
+	MOVW R0, CPSR
+	MOVW 48(R13), R12
+	MOVW 44(R13), R11
+	MOVW 40(R13), R9
+	MOVW 36(R13), R8
+	MOVW 32(R13), R7
+	MOVW 28(R13), R6
+	MOVW 24(R13), R5
+	MOVW 20(R13), R4
+	MOVW 16(R13), R3
+	MOVW 12(R13), R2
+	MOVW 8(R13), R1
+	MOVW 4(R13), R0
+	MOVW 188(R13), R14
+	MOVW.P 192(R13), R15
+	UNDEF
diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s
new file mode 100644
index 0000000..3c27b52
--- /dev/null
+++ b/src/runtime/preempt_arm64.s
@@ -0,0 +1,147 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD R30, -496(RSP)
+	SUB $496, RSP
+	#ifdef GOOS_linux
+	MOVD R29, -8(RSP)
+	SUB $8, RSP, R29
+	#endif
+	#ifdef GOOS_darwin
+	MOVD R30, (RSP)
+	#endif
+	MOVD R0, 8(RSP)
+	MOVD R1, 16(RSP)
+	MOVD R2, 24(RSP)
+	MOVD R3, 32(RSP)
+	MOVD R4, 40(RSP)
+	MOVD R5, 48(RSP)
+	MOVD R6, 56(RSP)
+	MOVD R7, 64(RSP)
+	MOVD R8, 72(RSP)
+	MOVD R9, 80(RSP)
+	MOVD R10, 88(RSP)
+	MOVD R11, 96(RSP)
+	MOVD R12, 104(RSP)
+	MOVD R13, 112(RSP)
+	MOVD R14, 120(RSP)
+	MOVD R15, 128(RSP)
+	MOVD R16, 136(RSP)
+	MOVD R17, 144(RSP)
+	MOVD R19, 152(RSP)
+	MOVD R20, 160(RSP)
+	MOVD R21, 168(RSP)
+	MOVD R22, 176(RSP)
+	MOVD R23, 184(RSP)
+	MOVD R24, 192(RSP)
+	MOVD R25, 200(RSP)
+	MOVD R26, 208(RSP)
+	MOVD NZCV, R0
+	MOVD R0, 216(RSP)
+	MOVD FPSR, R0
+	MOVD R0, 224(RSP)
+	FMOVD F0, 232(RSP)
+	FMOVD F1, 240(RSP)
+	FMOVD F2, 248(RSP)
+	FMOVD F3, 256(RSP)
+	FMOVD F4, 264(RSP)
+	FMOVD F5, 272(RSP)
+	FMOVD F6, 280(RSP)
+	FMOVD F7, 288(RSP)
+	FMOVD F8, 296(RSP)
+	FMOVD F9, 304(RSP)
+	FMOVD F10, 312(RSP)
+	FMOVD F11, 320(RSP)
+	FMOVD F12, 328(RSP)
+	FMOVD F13, 336(RSP)
+	FMOVD F14, 344(RSP)
+	FMOVD F15, 352(RSP)
+	FMOVD F16, 360(RSP)
+	FMOVD F17, 368(RSP)
+	FMOVD F18, 376(RSP)
+	FMOVD F19, 384(RSP)
+	FMOVD F20, 392(RSP)
+	FMOVD F21, 400(RSP)
+	FMOVD F22, 408(RSP)
+	FMOVD F23, 416(RSP)
+	FMOVD F24, 424(RSP)
+	FMOVD F25, 432(RSP)
+	FMOVD F26, 440(RSP)
+	FMOVD F27, 448(RSP)
+	FMOVD F28, 456(RSP)
+	FMOVD F29, 464(RSP)
+	FMOVD F30, 472(RSP)
+	FMOVD F31, 480(RSP)
+	CALL ·asyncPreempt2(SB)
+	FMOVD 480(RSP), F31
+	FMOVD 472(RSP), F30
+	FMOVD 464(RSP), F29
+	FMOVD 456(RSP), F28
+	FMOVD 448(RSP), F27
+	FMOVD 440(RSP), F26
+	FMOVD 432(RSP), F25
+	FMOVD 424(RSP), F24
+	FMOVD 416(RSP), F23
+	FMOVD 408(RSP), F22
+	FMOVD 400(RSP), F21
+	FMOVD 392(RSP), F20
+	FMOVD 384(RSP), F19
+	FMOVD 376(RSP), F18
+	FMOVD 368(RSP), F17
+	FMOVD 360(RSP), F16
+	FMOVD 352(RSP), F15
+	FMOVD 344(RSP), F14
+	FMOVD 336(RSP), F13
+	FMOVD 328(RSP), F12
+	FMOVD 320(RSP), F11
+	FMOVD 312(RSP), F10
+	FMOVD 304(RSP), F9
+	FMOVD 296(RSP), F8
+	FMOVD 288(RSP), F7
+	FMOVD 280(RSP), F6
+	FMOVD 272(RSP), F5
+	FMOVD 264(RSP), F4
+	FMOVD 256(RSP), F3
+	FMOVD 248(RSP), F2
+	FMOVD 240(RSP), F1
+	FMOVD 232(RSP), F0
+	MOVD 224(RSP), R0
+	MOVD R0, FPSR
+	MOVD 216(RSP), R0
+	MOVD R0, NZCV
+	MOVD 208(RSP), R26
+	MOVD 200(RSP), R25
+	MOVD 192(RSP), R24
+	MOVD 184(RSP), R23
+	MOVD 176(RSP), R22
+	MOVD 168(RSP), R21
+	MOVD 160(RSP), R20
+	MOVD 152(RSP), R19
+	MOVD 144(RSP), R17
+	MOVD 136(RSP), R16
+	MOVD 128(RSP), R15
+	MOVD 120(RSP), R14
+	MOVD 112(RSP), R13
+	MOVD 104(RSP), R12
+	MOVD 96(RSP), R11
+	MOVD 88(RSP), R10
+	MOVD 80(RSP), R9
+	MOVD 72(RSP), R8
+	MOVD 64(RSP), R7
+	MOVD 56(RSP), R6
+	MOVD 48(RSP), R5
+	MOVD 40(RSP), R4
+	MOVD 32(RSP), R3
+	MOVD 24(RSP), R2
+	MOVD 16(RSP), R1
+	MOVD 8(RSP), R0
+	MOVD 496(RSP), R30
+	#ifdef GOOS_linux
+	MOVD -8(RSP), R29
+	#endif
+	MOVD (RSP), R27
+	ADD $512, RSP
+	JMP (R27)
diff --git a/src/runtime/preempt_mips64x.s b/src/runtime/preempt_mips64x.s
new file mode 100644
index 0000000..1e123e8
--- /dev/null
+++ b/src/runtime/preempt_mips64x.s
@@ -0,0 +1,145 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVV R31, -488(R29)
+	SUBV $488, R29
+	MOVV R1, 8(R29)
+	MOVV R2, 16(R29)
+	MOVV R3, 24(R29)
+	MOVV R4, 32(R29)
+	MOVV R5, 40(R29)
+	MOVV R6, 48(R29)
+	MOVV R7, 56(R29)
+	MOVV R8, 64(R29)
+	MOVV R9, 72(R29)
+	MOVV R10, 80(R29)
+	MOVV R11, 88(R29)
+	MOVV R12, 96(R29)
+	MOVV R13, 104(R29)
+	MOVV R14, 112(R29)
+	MOVV R15, 120(R29)
+	MOVV R16, 128(R29)
+	MOVV R17, 136(R29)
+	MOVV R18, 144(R29)
+	MOVV R19, 152(R29)
+	MOVV R20, 160(R29)
+	MOVV R21, 168(R29)
+	MOVV R22, 176(R29)
+	MOVV R24, 184(R29)
+	MOVV R25, 192(R29)
+	MOVV RSB, 200(R29)
+	MOVV HI, R1
+	MOVV R1, 208(R29)
+	MOVV LO, R1
+	MOVV R1, 216(R29)
+	#ifndef GOMIPS64_softfloat
+	MOVV FCR31, R1
+	MOVV R1, 224(R29)
+	MOVD F0, 232(R29)
+	MOVD F1, 240(R29)
+	MOVD F2, 248(R29)
+	MOVD F3, 256(R29)
+	MOVD F4, 264(R29)
+	MOVD F5, 272(R29)
+	MOVD F6, 280(R29)
+	MOVD F7, 288(R29)
+	MOVD F8, 296(R29)
+	MOVD F9, 304(R29)
+	MOVD F10, 312(R29)
+	MOVD F11, 320(R29)
+	MOVD F12, 328(R29)
+	MOVD F13, 336(R29)
+	MOVD F14, 344(R29)
+	MOVD F15, 352(R29)
+	MOVD F16, 360(R29)
+	MOVD F17, 368(R29)
+	MOVD F18, 376(R29)
+	MOVD F19, 384(R29)
+	MOVD F20, 392(R29)
+	MOVD F21, 400(R29)
+	MOVD F22, 408(R29)
+	MOVD F23, 416(R29)
+	MOVD F24, 424(R29)
+	MOVD F25, 432(R29)
+	MOVD F26, 440(R29)
+	MOVD F27, 448(R29)
+	MOVD F28, 456(R29)
+	MOVD F29, 464(R29)
+	MOVD F30, 472(R29)
+	MOVD F31, 480(R29)
+	#endif
+	CALL ·asyncPreempt2(SB)
+	#ifndef GOMIPS64_softfloat
+	MOVD 480(R29), F31
+	MOVD 472(R29), F30
+	MOVD 464(R29), F29
+	MOVD 456(R29), F28
+	MOVD 448(R29), F27
+	MOVD 440(R29), F26
+	MOVD 432(R29), F25
+	MOVD 424(R29), F24
+	MOVD 416(R29), F23
+	MOVD 408(R29), F22
+	MOVD 400(R29), F21
+	MOVD 392(R29), F20
+	MOVD 384(R29), F19
+	MOVD 376(R29), F18
+	MOVD 368(R29), F17
+	MOVD 360(R29), F16
+	MOVD 352(R29), F15
+	MOVD 344(R29), F14
+	MOVD 336(R29), F13
+	MOVD 328(R29), F12
+	MOVD 320(R29), F11
+	MOVD 312(R29), F10
+	MOVD 304(R29), F9
+	MOVD 296(R29), F8
+	MOVD 288(R29), F7
+	MOVD 280(R29), F6
+	MOVD 272(R29), F5
+	MOVD 264(R29), F4
+	MOVD 256(R29), F3
+	MOVD 248(R29), F2
+	MOVD 240(R29), F1
+	MOVD 232(R29), F0
+	MOVV 224(R29), R1
+	MOVV R1, FCR31
+	#endif
+	MOVV 216(R29), R1
+	MOVV R1, LO
+	MOVV 208(R29), R1
+	MOVV R1, HI
+	MOVV 200(R29), RSB
+	MOVV 192(R29), R25
+	MOVV 184(R29), R24
+	MOVV 176(R29), R22
+	MOVV 168(R29), R21
+	MOVV 160(R29), R20
+	MOVV 152(R29), R19
+	MOVV 144(R29), R18
+	MOVV 136(R29), R17
+	MOVV 128(R29), R16
+	MOVV 120(R29), R15
+	MOVV 112(R29), R14
+	MOVV 104(R29), R13
+	MOVV 96(R29), R12
+	MOVV 88(R29), R11
+	MOVV 80(R29), R10
+	MOVV 72(R29), R9
+	MOVV 64(R29), R8
+	MOVV 56(R29), R7
+	MOVV 48(R29), R6
+	MOVV 40(R29), R5
+	MOVV 32(R29), R4
+	MOVV 24(R29), R3
+	MOVV 16(R29), R2
+	MOVV 8(R29), R1
+	MOVV 488(R29), R31
+	MOVV (R29), R23
+	ADDV $496, R29
+	JMP (R23)
diff --git a/src/runtime/preempt_mipsx.s b/src/runtime/preempt_mipsx.s
new file mode 100644
index 0000000..afac33e
--- /dev/null
+++ b/src/runtime/preempt_mipsx.s
@@ -0,0 +1,145 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+// +build mips mipsle
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW R31, -244(R29)
+	SUB $244, R29
+	MOVW R1, 4(R29)
+	MOVW R2, 8(R29)
+	MOVW R3, 12(R29)
+	MOVW R4, 16(R29)
+	MOVW R5, 20(R29)
+	MOVW R6, 24(R29)
+	MOVW R7, 28(R29)
+	MOVW R8, 32(R29)
+	MOVW R9, 36(R29)
+	MOVW R10, 40(R29)
+	MOVW R11, 44(R29)
+	MOVW R12, 48(R29)
+	MOVW R13, 52(R29)
+	MOVW R14, 56(R29)
+	MOVW R15, 60(R29)
+	MOVW R16, 64(R29)
+	MOVW R17, 68(R29)
+	MOVW R18, 72(R29)
+	MOVW R19, 76(R29)
+	MOVW R20, 80(R29)
+	MOVW R21, 84(R29)
+	MOVW R22, 88(R29)
+	MOVW R24, 92(R29)
+	MOVW R25, 96(R29)
+	MOVW R28, 100(R29)
+	MOVW HI, R1
+	MOVW R1, 104(R29)
+	MOVW LO, R1
+	MOVW R1, 108(R29)
+	#ifndef GOMIPS_softfloat
+	MOVW FCR31, R1
+	MOVW R1, 112(R29)
+	MOVF F0, 116(R29)
+	MOVF F1, 120(R29)
+	MOVF F2, 124(R29)
+	MOVF F3, 128(R29)
+	MOVF F4, 132(R29)
+	MOVF F5, 136(R29)
+	MOVF F6, 140(R29)
+	MOVF F7, 144(R29)
+	MOVF F8, 148(R29)
+	MOVF F9, 152(R29)
+	MOVF F10, 156(R29)
+	MOVF F11, 160(R29)
+	MOVF F12, 164(R29)
+	MOVF F13, 168(R29)
+	MOVF F14, 172(R29)
+	MOVF F15, 176(R29)
+	MOVF F16, 180(R29)
+	MOVF F17, 184(R29)
+	MOVF F18, 188(R29)
+	MOVF F19, 192(R29)
+	MOVF F20, 196(R29)
+	MOVF F21, 200(R29)
+	MOVF F22, 204(R29)
+	MOVF F23, 208(R29)
+	MOVF F24, 212(R29)
+	MOVF F25, 216(R29)
+	MOVF F26, 220(R29)
+	MOVF F27, 224(R29)
+	MOVF F28, 228(R29)
+	MOVF F29, 232(R29)
+	MOVF F30, 236(R29)
+	MOVF F31, 240(R29)
+	#endif
+	CALL ·asyncPreempt2(SB)
+	#ifndef GOMIPS_softfloat
+	MOVF 240(R29), F31
+	MOVF 236(R29), F30
+	MOVF 232(R29), F29
+	MOVF 228(R29), F28
+	MOVF 224(R29), F27
+	MOVF 220(R29), F26
+	MOVF 216(R29), F25
+	MOVF 212(R29), F24
+	MOVF 208(R29), F23
+	MOVF 204(R29), F22
+	MOVF 200(R29), F21
+	MOVF 196(R29), F20
+	MOVF 192(R29), F19
+	MOVF 188(R29), F18
+	MOVF 184(R29), F17
+	MOVF 180(R29), F16
+	MOVF 176(R29), F15
+	MOVF 172(R29), F14
+	MOVF 168(R29), F13
+	MOVF 164(R29), F12
+	MOVF 160(R29), F11
+	MOVF 156(R29), F10
+	MOVF 152(R29), F9
+	MOVF 148(R29), F8
+	MOVF 144(R29), F7
+	MOVF 140(R29), F6
+	MOVF 136(R29), F5
+	MOVF 132(R29), F4
+	MOVF 128(R29), F3
+	MOVF 124(R29), F2
+	MOVF 120(R29), F1
+	MOVF 116(R29), F0
+	MOVW 112(R29), R1
+	MOVW R1, FCR31
+	#endif
+	MOVW 108(R29), R1
+	MOVW R1, LO
+	MOVW 104(R29), R1
+	MOVW R1, HI
+	MOVW 100(R29), R28
+	MOVW 96(R29), R25
+	MOVW 92(R29), R24
+	MOVW 88(R29), R22
+	MOVW 84(R29), R21
+	MOVW 80(R29), R20
+	MOVW 76(R29), R19
+	MOVW 72(R29), R18
+	MOVW 68(R29), R17
+	MOVW 64(R29), R16
+	MOVW 60(R29), R15
+	MOVW 56(R29), R14
+	MOVW 52(R29), R13
+	MOVW 48(R29), R12
+	MOVW 44(R29), R11
+	MOVW 40(R29), R10
+	MOVW 36(R29), R9
+	MOVW 32(R29), R8
+	MOVW 28(R29), R7
+	MOVW 24(R29), R6
+	MOVW 20(R29), R5
+	MOVW 16(R29), R4
+	MOVW 12(R29), R3
+	MOVW 8(R29), R2
+	MOVW 4(R29), R1
+	MOVW 244(R29), R31
+	MOVW (R29), R23
+	ADD $248, R29
+	JMP (R23)
diff --git a/src/runtime/preempt_nonwindows.go b/src/runtime/preempt_nonwindows.go
new file mode 100644
index 0000000..3066a15
--- /dev/null
+++ b/src/runtime/preempt_nonwindows.go
@@ -0,0 +1,13 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !windows
+
+package runtime
+
+//go:nosplit
+func osPreemptExtEnter(mp *m) {}
+
+//go:nosplit
+func osPreemptExtExit(mp *m) {}
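
preempt_nonwindows.go above is the no-op half of a per-OS hook pair: Windows supplies real osPreemptExtEnter/osPreemptExtExit bodies in its own file, and every other platform gets these empty stubs. A hedged sketch of the same build-constraint pattern in ordinary code (the package, file names, and featureHook are hypothetical, not runtime identifiers):

// file: feature_windows.go (sketch)
// +build windows

package demo

func featureHook() {
	// platform-specific work would go here
}

// file: feature_other.go (sketch)
// +build !windows

package demo

func featureHook() {} // no-op elsewhere, mirroring osPreemptExtEnter/Exit
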
diff --git a/src/runtime/preempt_ppc64x.s b/src/runtime/preempt_ppc64x.s
new file mode 100644
index 0000000..b2d7e30
--- /dev/null
+++ b/src/runtime/preempt_ppc64x.s
@@ -0,0 +1,147 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+// +build ppc64 ppc64le
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD R31, -488(R1)
+	MOVD LR, R31
+	MOVDU R31, -520(R1)
+	MOVD R3, 40(R1)
+	MOVD R4, 48(R1)
+	MOVD R5, 56(R1)
+	MOVD R6, 64(R1)
+	MOVD R7, 72(R1)
+	MOVD R8, 80(R1)
+	MOVD R9, 88(R1)
+	MOVD R10, 96(R1)
+	MOVD R11, 104(R1)
+	MOVD R14, 112(R1)
+	MOVD R15, 120(R1)
+	MOVD R16, 128(R1)
+	MOVD R17, 136(R1)
+	MOVD R18, 144(R1)
+	MOVD R19, 152(R1)
+	MOVD R20, 160(R1)
+	MOVD R21, 168(R1)
+	MOVD R22, 176(R1)
+	MOVD R23, 184(R1)
+	MOVD R24, 192(R1)
+	MOVD R25, 200(R1)
+	MOVD R26, 208(R1)
+	MOVD R27, 216(R1)
+	MOVD R28, 224(R1)
+	MOVD R29, 232(R1)
+	MOVW CR, R31
+	MOVW R31, 240(R1)
+	MOVD XER, R31
+	MOVD R31, 248(R1)
+	FMOVD F0, 256(R1)
+	FMOVD F1, 264(R1)
+	FMOVD F2, 272(R1)
+	FMOVD F3, 280(R1)
+	FMOVD F4, 288(R1)
+	FMOVD F5, 296(R1)
+	FMOVD F6, 304(R1)
+	FMOVD F7, 312(R1)
+	FMOVD F8, 320(R1)
+	FMOVD F9, 328(R1)
+	FMOVD F10, 336(R1)
+	FMOVD F11, 344(R1)
+	FMOVD F12, 352(R1)
+	FMOVD F13, 360(R1)
+	FMOVD F14, 368(R1)
+	FMOVD F15, 376(R1)
+	FMOVD F16, 384(R1)
+	FMOVD F17, 392(R1)
+	FMOVD F18, 400(R1)
+	FMOVD F19, 408(R1)
+	FMOVD F20, 416(R1)
+	FMOVD F21, 424(R1)
+	FMOVD F22, 432(R1)
+	FMOVD F23, 440(R1)
+	FMOVD F24, 448(R1)
+	FMOVD F25, 456(R1)
+	FMOVD F26, 464(R1)
+	FMOVD F27, 472(R1)
+	FMOVD F28, 480(R1)
+	FMOVD F29, 488(R1)
+	FMOVD F30, 496(R1)
+	FMOVD F31, 504(R1)
+	MOVFL FPSCR, F0
+	FMOVD F0, 512(R1)
+	CALL ·asyncPreempt2(SB)
+	FMOVD 512(R1), F0
+	MOVFL F0, FPSCR
+	FMOVD 504(R1), F31
+	FMOVD 496(R1), F30
+	FMOVD 488(R1), F29
+	FMOVD 480(R1), F28
+	FMOVD 472(R1), F27
+	FMOVD 464(R1), F26
+	FMOVD 456(R1), F25
+	FMOVD 448(R1), F24
+	FMOVD 440(R1), F23
+	FMOVD 432(R1), F22
+	FMOVD 424(R1), F21
+	FMOVD 416(R1), F20
+	FMOVD 408(R1), F19
+	FMOVD 400(R1), F18
+	FMOVD 392(R1), F17
+	FMOVD 384(R1), F16
+	FMOVD 376(R1), F15
+	FMOVD 368(R1), F14
+	FMOVD 360(R1), F13
+	FMOVD 352(R1), F12
+	FMOVD 344(R1), F11
+	FMOVD 336(R1), F10
+	FMOVD 328(R1), F9
+	FMOVD 320(R1), F8
+	FMOVD 312(R1), F7
+	FMOVD 304(R1), F6
+	FMOVD 296(R1), F5
+	FMOVD 288(R1), F4
+	FMOVD 280(R1), F3
+	FMOVD 272(R1), F2
+	FMOVD 264(R1), F1
+	FMOVD 256(R1), F0
+	MOVD 248(R1), R31
+	MOVD R31, XER
+	MOVW 240(R1), R31
+	MOVFL R31, $0xff
+	MOVD 232(R1), R29
+	MOVD 224(R1), R28
+	MOVD 216(R1), R27
+	MOVD 208(R1), R26
+	MOVD 200(R1), R25
+	MOVD 192(R1), R24
+	MOVD 184(R1), R23
+	MOVD 176(R1), R22
+	MOVD 168(R1), R21
+	MOVD 160(R1), R20
+	MOVD 152(R1), R19
+	MOVD 144(R1), R18
+	MOVD 136(R1), R17
+	MOVD 128(R1), R16
+	MOVD 120(R1), R15
+	MOVD 112(R1), R14
+	MOVD 104(R1), R11
+	MOVD 96(R1), R10
+	MOVD 88(R1), R9
+	MOVD 80(R1), R8
+	MOVD 72(R1), R7
+	MOVD 64(R1), R6
+	MOVD 56(R1), R5
+	MOVD 48(R1), R4
+	MOVD 40(R1), R3
+	MOVD 520(R1), R31
+	MOVD R31, LR
+	MOVD 528(R1), R2
+	MOVD 536(R1), R12
+	MOVD (R1), R31
+	MOVD R31, CTR
+	MOVD 32(R1), R31
+	ADD $552, R1
+	JMP (CTR)
diff --git a/src/runtime/preempt_riscv64.s b/src/runtime/preempt_riscv64.s
new file mode 100644
index 0000000..0338c22
--- /dev/null
+++ b/src/runtime/preempt_riscv64.s
@@ -0,0 +1,131 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	MOV X1, -480(X2)
+	ADD $-480, X2
+	MOV X3, 8(X2)
+	MOV X5, 16(X2)
+	MOV X6, 24(X2)
+	MOV X7, 32(X2)
+	MOV X8, 40(X2)
+	MOV X9, 48(X2)
+	MOV X10, 56(X2)
+	MOV X11, 64(X2)
+	MOV X12, 72(X2)
+	MOV X13, 80(X2)
+	MOV X14, 88(X2)
+	MOV X15, 96(X2)
+	MOV X16, 104(X2)
+	MOV X17, 112(X2)
+	MOV X18, 120(X2)
+	MOV X19, 128(X2)
+	MOV X20, 136(X2)
+	MOV X21, 144(X2)
+	MOV X22, 152(X2)
+	MOV X23, 160(X2)
+	MOV X24, 168(X2)
+	MOV X25, 176(X2)
+	MOV X26, 184(X2)
+	MOV X27, 192(X2)
+	MOV X28, 200(X2)
+	MOV X29, 208(X2)
+	MOV X30, 216(X2)
+	MOVD F0, 224(X2)
+	MOVD F1, 232(X2)
+	MOVD F2, 240(X2)
+	MOVD F3, 248(X2)
+	MOVD F4, 256(X2)
+	MOVD F5, 264(X2)
+	MOVD F6, 272(X2)
+	MOVD F7, 280(X2)
+	MOVD F8, 288(X2)
+	MOVD F9, 296(X2)
+	MOVD F10, 304(X2)
+	MOVD F11, 312(X2)
+	MOVD F12, 320(X2)
+	MOVD F13, 328(X2)
+	MOVD F14, 336(X2)
+	MOVD F15, 344(X2)
+	MOVD F16, 352(X2)
+	MOVD F17, 360(X2)
+	MOVD F18, 368(X2)
+	MOVD F19, 376(X2)
+	MOVD F20, 384(X2)
+	MOVD F21, 392(X2)
+	MOVD F22, 400(X2)
+	MOVD F23, 408(X2)
+	MOVD F24, 416(X2)
+	MOVD F25, 424(X2)
+	MOVD F26, 432(X2)
+	MOVD F27, 440(X2)
+	MOVD F28, 448(X2)
+	MOVD F29, 456(X2)
+	MOVD F30, 464(X2)
+	MOVD F31, 472(X2)
+	CALL ·asyncPreempt2(SB)
+	MOVD 472(X2), F31
+	MOVD 464(X2), F30
+	MOVD 456(X2), F29
+	MOVD 448(X2), F28
+	MOVD 440(X2), F27
+	MOVD 432(X2), F26
+	MOVD 424(X2), F25
+	MOVD 416(X2), F24
+	MOVD 408(X2), F23
+	MOVD 400(X2), F22
+	MOVD 392(X2), F21
+	MOVD 384(X2), F20
+	MOVD 376(X2), F19
+	MOVD 368(X2), F18
+	MOVD 360(X2), F17
+	MOVD 352(X2), F16
+	MOVD 344(X2), F15
+	MOVD 336(X2), F14
+	MOVD 328(X2), F13
+	MOVD 320(X2), F12
+	MOVD 312(X2), F11
+	MOVD 304(X2), F10
+	MOVD 296(X2), F9
+	MOVD 288(X2), F8
+	MOVD 280(X2), F7
+	MOVD 272(X2), F6
+	MOVD 264(X2), F5
+	MOVD 256(X2), F4
+	MOVD 248(X2), F3
+	MOVD 240(X2), F2
+	MOVD 232(X2), F1
+	MOVD 224(X2), F0
+	MOV 216(X2), X30
+	MOV 208(X2), X29
+	MOV 200(X2), X28
+	MOV 192(X2), X27
+	MOV 184(X2), X26
+	MOV 176(X2), X25
+	MOV 168(X2), X24
+	MOV 160(X2), X23
+	MOV 152(X2), X22
+	MOV 144(X2), X21
+	MOV 136(X2), X20
+	MOV 128(X2), X19
+	MOV 120(X2), X18
+	MOV 112(X2), X17
+	MOV 104(X2), X16
+	MOV 96(X2), X15
+	MOV 88(X2), X14
+	MOV 80(X2), X13
+	MOV 72(X2), X12
+	MOV 64(X2), X11
+	MOV 56(X2), X10
+	MOV 48(X2), X9
+	MOV 40(X2), X8
+	MOV 32(X2), X7
+	MOV 24(X2), X6
+	MOV 16(X2), X5
+	MOV 8(X2), X3
+	MOV 480(X2), X1
+	MOV (X2), X31
+	ADD $488, X2
+	JMP (X31)
diff --git a/src/runtime/preempt_s390x.s b/src/runtime/preempt_s390x.s
new file mode 100644
index 0000000..ca9e47c
--- /dev/null
+++ b/src/runtime/preempt_s390x.s
@@ -0,0 +1,51 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	IPM R10
+	MOVD R14, -248(R15)
+	ADD $-248, R15
+	MOVW R10, 8(R15)
+	STMG R0, R12, 16(R15)
+	FMOVD F0, 120(R15)
+	FMOVD F1, 128(R15)
+	FMOVD F2, 136(R15)
+	FMOVD F3, 144(R15)
+	FMOVD F4, 152(R15)
+	FMOVD F5, 160(R15)
+	FMOVD F6, 168(R15)
+	FMOVD F7, 176(R15)
+	FMOVD F8, 184(R15)
+	FMOVD F9, 192(R15)
+	FMOVD F10, 200(R15)
+	FMOVD F11, 208(R15)
+	FMOVD F12, 216(R15)
+	FMOVD F13, 224(R15)
+	FMOVD F14, 232(R15)
+	FMOVD F15, 240(R15)
+	CALL ·asyncPreempt2(SB)
+	FMOVD 240(R15), F15
+	FMOVD 232(R15), F14
+	FMOVD 224(R15), F13
+	FMOVD 216(R15), F12
+	FMOVD 208(R15), F11
+	FMOVD 200(R15), F10
+	FMOVD 192(R15), F9
+	FMOVD 184(R15), F8
+	FMOVD 176(R15), F7
+	FMOVD 168(R15), F6
+	FMOVD 160(R15), F5
+	FMOVD 152(R15), F4
+	FMOVD 144(R15), F3
+	FMOVD 136(R15), F2
+	FMOVD 128(R15), F1
+	FMOVD 120(R15), F0
+	LMG 16(R15), R0, R12
+	MOVD 248(R15), R14
+	ADD $256, R15
+	MOVWZ -248(R15), R10
+	TMLH R10, $(3<<12)
+	MOVD -256(R15), R10
+	JMP (R10)
diff --git a/src/runtime/preempt_wasm.s b/src/runtime/preempt_wasm.s
new file mode 100644
index 0000000..0cf57d3
--- /dev/null
+++ b/src/runtime/preempt_wasm.s
@@ -0,0 +1,8 @@
+// Code generated by mkpreempt.go; DO NOT EDIT.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
+	// No async preemption on wasm
+	UNDEF
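
The generated preempt_*.s files above all follow the same shape: spill every register the compiler may be using, call runtime.asyncPreempt2, restore the registers, and resume at the interrupted PC (wasm simply traps, since it has no asynchronous preemption). The user-visible effect is that CPU-bound loops with no call sites can now be preempted. A standalone sketch of that behavior (illustrative program, not part of this patch):

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	runtime.GOMAXPROCS(1)
	go func() {
		for { // no function calls, so no cooperative preemption points
		}
	}()
	time.Sleep(100 * time.Millisecond)
	// Reaching this line relies on the asyncPreempt path above: the
	// runtime preempts the spinning goroutine and hands the single P
	// back to main. On releases without asynchronous preemption this
	// program could spin forever instead.
	fmt.Println("tight loop preempted")
}

Running with GODEBUG=asyncpreemptoff=1 (the knob referenced later in this patch) should restore the cooperative-only behavior.
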
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 93d329d..2399f0a 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -82,6 +82,7 @@
 var (
 	m0           m
 	g0           g
+	mcache0      *mcache
 	raceprocctx0 uintptr
 )
 
@@ -244,13 +245,14 @@
 
 func forcegchelper() {
 	forcegc.g = getg()
+	lockInit(&forcegc.lock, lockRankForcegc)
 	for {
 		lock(&forcegc.lock)
 		if forcegc.idle != 0 {
 			throw("forcegc: phase error")
 		}
 		atomic.Store(&forcegc.idle, 1)
-		goparkunlock(&forcegc.lock, waitReasonForceGGIdle, traceEvGoBlock, 1)
+		goparkunlock(&forcegc.lock, waitReasonForceGCIdle, traceEvGoBlock, 1)
 		// this goroutine is explicitly resumed by sysmon
 		if debug.gctrace > 0 {
 			println("GC forced")
@@ -413,7 +415,7 @@
 // use the result as an address at which to start executing code.
 //go:nosplit
 func funcPC(f interface{}) uintptr {
-	return **(**uintptr)(add(unsafe.Pointer(&f), sys.PtrSize))
+	return *(*uintptr)(efaceOf(&f).data)
 }
 
 // called from assembly
@@ -514,6 +516,9 @@
 	// to guard execution of instructions that can not be assumed to be always supported.
 	x86HasPOPCNT = cpu.X86.HasPOPCNT
 	x86HasSSE41 = cpu.X86.HasSSE41
+	x86HasFMA = cpu.X86.HasFMA
+
+	armHasVFPv4 = cpu.ARM.HasVFPv4
 
 	arm64HasATOMICS = cpu.ARM64.HasATOMICS
 }
@@ -527,6 +532,22 @@
 //
 // The new G calls runtime·main.
 func schedinit() {
+	lockInit(&sched.lock, lockRankSched)
+	lockInit(&sched.sysmonlock, lockRankSysmon)
+	lockInit(&sched.deferlock, lockRankDefer)
+	lockInit(&sched.sudoglock, lockRankSudog)
+	lockInit(&deadlock, lockRankDeadlock)
+	lockInit(&paniclk, lockRankPanic)
+	lockInit(&allglock, lockRankAllg)
+	lockInit(&allpLock, lockRankAllp)
+	lockInit(&reflectOffs.lock, lockRankReflectOffs)
+	lockInit(&finlock, lockRankFin)
+	lockInit(&trace.bufLock, lockRankTraceBuf)
+	lockInit(&trace.stringsLock, lockRankTraceStrings)
+	lockInit(&trace.lock, lockRankTrace)
+	lockInit(&cpuprof.lock, lockRankCpuprof)
+	lockInit(&trace.stackTab.lock, lockRankTraceStackTab)
+
 	// raceinit must be the first call to race detector.
 	// In particular, it must be done before mallocinit below calls racemapshadow.
 	_g_ := getg()
@@ -540,6 +561,7 @@
 	moduledataverify()
 	stackinit()
 	mallocinit()
+	fastrandinit() // must run before mcommoninit
 	mcommoninit(_g_.m)
 	cpuinit()       // must run before alginit
 	alginit()       // maps must not be used before this call
@@ -617,8 +639,8 @@
 	sched.mnext++
 	checkmcount()
 
-	mp.fastrand[0] = 1597334677 * uint32(mp.id)
-	mp.fastrand[1] = uint32(cputicks())
+	mp.fastrand[0] = uint32(int64Hash(uint64(mp.id), fastrandseed))
+	mp.fastrand[1] = uint32(int64Hash(uint64(cputicks()), ^fastrandseed))
 	if mp.fastrand[0]|mp.fastrand[1] == 0 {
 		mp.fastrand[1] = 1
 	}
@@ -643,6 +665,13 @@
 	}
 }
 
+var fastrandseed uintptr
+
+func fastrandinit() {
+	s := (*[unsafe.Sizeof(fastrandseed)]byte)(unsafe.Pointer(&fastrandseed))[:]
+	getRandomData(s)
+}
+
 // Mark gp ready to run.
 func ready(gp *g, traceskip int, next bool) {
 	if trace.enabled {
@@ -662,9 +691,7 @@
 	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
 	casgstatus(gp, _Gwaiting, _Grunnable)
 	runqput(_g_.m.p.ptr(), gp, next)
-	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
-		wakep()
-	}
+	wakep()
 	releasem(mp)
 }
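
fastrandinit above fills the fastrandseed uintptr directly from the OS random source by viewing it as a byte slice. A user-level analogue of that aliasing trick (illustrative only: the runtime uses getRandomData, not crypto/rand):

package main

import (
	"crypto/rand"
	"fmt"
	"unsafe"
)

func main() {
	var seed uintptr
	// View the uintptr as its raw bytes so a byte-oriented random
	// source can fill it in place, as fastrandinit does.
	buf := (*[unsafe.Sizeof(seed)]byte)(unsafe.Pointer(&seed))[:]
	if _, err := rand.Read(buf); err != nil {
		panic(err)
	}
	fmt.Printf("seed = %#x\n", seed)
}
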
 
@@ -707,18 +734,6 @@
 	return atomic.Load(&gp.atomicstatus)
 }
 
-// Ownership of gcscanvalid:
-//
-// If gp is running (meaning status == _Grunning or _Grunning|_Gscan),
-// then gp owns gp.gcscanvalid, and other goroutines must not modify it.
-//
-// Otherwise, a second goroutine can lock the scan state by setting _Gscan
-// in the status bit and then modify gcscanvalid, and then unlock the scan state.
-//
-// Note that the first condition implies an exception to the second:
-// if a second goroutine changes gp's status to _Grunning|_Gscan,
-// that second goroutine still does not have the right to modify gcscanvalid.
-
 // The Gscanstatuses are acting like locks and this releases them.
 // If it proves to be a performance hit we should be able to make these
 // simple atomic stores but for now we are going to throw if
@@ -735,7 +750,8 @@
 	case _Gscanrunnable,
 		_Gscanwaiting,
 		_Gscanrunning,
-		_Gscansyscall:
+		_Gscansyscall,
+		_Gscanpreempted:
 		if newval == oldval&^_Gscan {
 			success = atomic.Cas(&gp.atomicstatus, oldval, newval)
 		}
@@ -745,6 +761,7 @@
 		dumpgstatus(gp)
 		throw("casfrom_Gscanstatus: gp->status is not in scan state")
 	}
+	releaseLockRank(lockRankGscan)
 }
 
 // This will return false if the gp is not in the expected status and the cas fails.
@@ -756,7 +773,12 @@
 		_Gwaiting,
 		_Gsyscall:
 		if newval == oldval|_Gscan {
-			return atomic.Cas(&gp.atomicstatus, oldval, newval)
+			r := atomic.Cas(&gp.atomicstatus, oldval, newval)
+			if r {
+				acquireLockRank(lockRankGscan)
+			}
+			return r
+
 		}
 	}
 	print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
@@ -777,16 +799,8 @@
 		})
 	}
 
-	if oldval == _Grunning && gp.gcscanvalid {
-		// If oldvall == _Grunning, then the actual status must be
-		// _Grunning or _Grunning|_Gscan; either way,
-		// we own gp.gcscanvalid, so it's safe to read.
-		// gp.gcscanvalid must not be true when we are running.
-		systemstack(func() {
-			print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n")
-			throw("casgstatus")
-		})
-	}
+	acquireLockRank(lockRankGscan)
+	releaseLockRank(lockRankGscan)
 
 	// See https://golang.org/cl/21503 for justification of the yield delay.
 	const yieldDelay = 5 * 1000
@@ -798,14 +812,6 @@
 		if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
 			throw("casgstatus: waiting for Gwaiting but is Grunnable")
 		}
-		// Help GC if needed.
-		// if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
-		// 	gp.preemptscan = false
-		// 	systemstack(func() {
-		// 		gcphasework(gp)
-		// 	})
-		// }
-		// But meanwhile just yield.
 		if i == 0 {
 			nextYield = nanotime() + yieldDelay
 		}
@@ -818,9 +824,6 @@
 			nextYield = nanotime() + yieldDelay/2
 		}
 	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
-	}
 }
 
 // casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
@@ -841,109 +844,27 @@
 	}
 }
 
-// scang blocks until gp's stack has been scanned.
-// It might be scanned by scang or it might be scanned by the goroutine itself.
-// Either way, the stack scan has completed when scang returns.
-func scang(gp *g, gcw *gcWork) {
-	// Invariant; we (the caller, markroot for a specific goroutine) own gp.gcscandone.
-	// Nothing is racing with us now, but gcscandone might be set to true left over
-	// from an earlier round of stack scanning (we scan twice per GC).
-	// We use gcscandone to record whether the scan has been done during this round.
-
-	gp.gcscandone = false
-
-	// See https://golang.org/cl/21503 for justification of the yield delay.
-	const yieldDelay = 10 * 1000
-	var nextYield int64
-
-	// Endeavor to get gcscandone set to true,
-	// either by doing the stack scan ourselves or by coercing gp to scan itself.
-	// gp.gcscandone can transition from false to true when we're not looking
-	// (if we asked for preemption), so any time we lock the status using
-	// castogscanstatus we have to double-check that the scan is still not done.
-loop:
-	for i := 0; !gp.gcscandone; i++ {
-		switch s := readgstatus(gp); s {
-		default:
-			dumpgstatus(gp)
-			throw("stopg: invalid status")
-
-		case _Gdead:
-			// No stack.
-			gp.gcscandone = true
-			break loop
-
-		case _Gcopystack:
-		// Stack being switched. Go around again.
-
-		case _Grunnable, _Gsyscall, _Gwaiting:
-			// Claim goroutine by setting scan bit.
-			// Racing with execution or readying of gp.
-			// The scan bit keeps them from running
-			// the goroutine until we're done.
-			if castogscanstatus(gp, s, s|_Gscan) {
-				if !gp.gcscandone {
-					scanstack(gp, gcw)
-					gp.gcscandone = true
-				}
-				restartg(gp)
-				break loop
-			}
-
-		case _Gscanwaiting:
-		// newstack is doing a scan for us right now. Wait.
-
-		case _Grunning:
-			// Goroutine running. Try to preempt execution so it can scan itself.
-			// The preemption handler (in newstack) does the actual scan.
-
-			// Optimization: if there is already a pending preemption request
-			// (from the previous loop iteration), don't bother with the atomics.
-			if gp.preemptscan && gp.preempt && gp.stackguard0 == stackPreempt {
-				break
-			}
-
-			// Ask for preemption and self scan.
-			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
-				if !gp.gcscandone {
-					gp.preemptscan = true
-					gp.preempt = true
-					gp.stackguard0 = stackPreempt
-				}
-				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
-			}
-		}
-
-		if i == 0 {
-			nextYield = nanotime() + yieldDelay
-		}
-		if nanotime() < nextYield {
-			procyield(10)
-		} else {
-			osyield()
-			nextYield = nanotime() + yieldDelay/2
-		}
+// casGToPreemptScan transitions gp from _Grunning to _Gscan|_Gpreempted.
+//
+// TODO(austin): This is the only status operation that both changes
+// the status and locks the _Gscan bit. Rethink this.
+func casGToPreemptScan(gp *g, old, new uint32) {
+	if old != _Grunning || new != _Gscan|_Gpreempted {
+		throw("bad g transition")
 	}
-
-	gp.preemptscan = false // cancel scan request if no longer needed
+	acquireLockRank(lockRankGscan)
+	for !atomic.Cas(&gp.atomicstatus, _Grunning, _Gscan|_Gpreempted) {
+	}
 }
 
-// The GC requests that this routine be moved from a scanmumble state to a mumble state.
-func restartg(gp *g) {
-	s := readgstatus(gp)
-	switch s {
-	default:
-		dumpgstatus(gp)
-		throw("restartg: unexpected status")
-
-	case _Gdead:
-	// ok
-
-	case _Gscanrunnable,
-		_Gscanwaiting,
-		_Gscansyscall:
-		casfrom_Gscanstatus(gp, s, s&^_Gscan)
+// casGFromPreempted attempts to transition gp from _Gpreempted to
+// _Gwaiting. If successful, the caller is responsible for
+// re-scheduling gp.
+func casGFromPreempted(gp *g, old, new uint32) bool {
+	if old != _Gpreempted || new != _Gwaiting {
+		throw("bad g transition")
 	}
+	return atomic.Cas(&gp.atomicstatus, _Gpreempted, _Gwaiting)
 }
 
 // stopTheWorld stops all P's from executing goroutines, interrupting
@@ -962,8 +883,23 @@
 // goroutines.
 func stopTheWorld(reason string) {
 	semacquire(&worldsema)
-	getg().m.preemptoff = reason
-	systemstack(stopTheWorldWithSema)
+	gp := getg()
+	gp.m.preemptoff = reason
+	systemstack(func() {
+		// Mark the goroutine which called stopTheWorld preemptible so its
+		// stack may be scanned.
+		// This lets a mark worker scan us while we try to stop the world
+		// since otherwise we could get in a mutual preemption deadlock.
+		// We must not modify anything on the G stack because a stack shrink
+		// may occur. A stack shrink is otherwise OK though because in order
+		// to return from this function (and to leave the system stack) we
+		// must have preempted all goroutines, including any attempting
+		// to scan our stack, in which case, any stack shrinking will
+		// have already completed by the time we exit.
+		casgstatus(gp, _Grunning, _Gwaiting)
+		stopTheWorldWithSema()
+		casgstatus(gp, _Gwaiting, _Grunning)
+	})
 }
 
 // startTheWorld undoes the effects of stopTheWorld.
@@ -975,10 +911,31 @@
 	getg().m.preemptoff = ""
 }
 
-// Holding worldsema grants an M the right to try to stop the world
-// and prevents gomaxprocs from changing concurrently.
+// stopTheWorldGC has the same effect as stopTheWorld, but blocks
+// until the GC is not running. It also blocks a GC from starting
+// until startTheWorldGC is called.
+func stopTheWorldGC(reason string) {
+	semacquire(&gcsema)
+	stopTheWorld(reason)
+}
+
+// startTheWorldGC undoes the effects of stopTheWorldGC.
+func startTheWorldGC() {
+	startTheWorld()
+	semrelease(&gcsema)
+}
+
+// Holding worldsema grants an M the right to try to stop the world.
 var worldsema uint32 = 1
 
+// Holding gcsema grants the M the right to block a GC, and blocks
+// until the current GC is done. In particular, it prevents gomaxprocs
+// from changing concurrently.
+//
+// TODO(mknyszek): Once gomaxprocs and the execution tracer can handle
+// being changed/enabled during a GC, remove this.
+var gcsema uint32 = 1
+
 // stopTheWorldWithSema is the core implementation of stopTheWorld.
 // The caller is responsible for acquiring worldsema and disabling
 // preemption first and then should stopTheWorldWithSema on the system
@@ -1080,7 +1037,7 @@
 func startTheWorldWithSema(emitTraceEvent bool) int64 {
 	mp := acquirem() // disable preemption because it can be holding p in a local var
 	if netpollinited() {
-		list := netpoll(false) // non-blocking
+		list := netpoll(0) // non-blocking
 		injectglist(&list)
 	}
 	lock(&sched.lock)
@@ -1124,9 +1081,7 @@
 	// Wakeup an additional proc in case we have excessive runnable goroutines
 	// in local queues or in the global queue. If we don't, the proc will park itself.
 	// If we have lots of excessive work, resetspinning will unpark additional procs as necessary.
-	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
-		wakep()
-	}
+	wakep()
 
 	releasem(mp)
 
@@ -1167,7 +1122,8 @@
 	mstart1()
 
 	// Exit this thread.
-	if GOOS == "windows" || GOOS == "solaris" || GOOS == "illumos" || GOOS == "plan9" || GOOS == "darwin" || GOOS == "aix" {
+	switch GOOS {
+	case "windows", "solaris", "illumos", "plan9", "darwin", "aix":
 		// Windows, Solaris, illumos, Darwin, AIX and Plan 9 always system-allocate
 		// the stack, but put it in _g_.stack before mstart,
 		// so the logic above hasn't set osStack yet.
@@ -1266,6 +1222,11 @@
 	// Free the gsignal stack.
 	if m.gsignal != nil {
 		stackfree(m.gsignal.stack)
+		// On some platforms, when calling into VDSO (e.g. nanotime)
+		// we store our g on the gsignal stack, if there is one.
+		// Now the stack is freed, unlink it from the m, so we
+		// won't write to it when calling VDSO code.
+		m.gsignal = nil
 	}
 
 	// Remove m from allm.
@@ -1635,8 +1596,6 @@
 	gp.syscallpc = gp.sched.pc
 	gp.syscallsp = gp.sched.sp
 	gp.stktopsp = gp.sched.sp
-	gp.gcscanvalid = true
-	gp.gcscandone = true
 	// malg returns status as _Gidle. Change to _Gdead before
 	// adding to allg where GC can see it. We use _Gdead to hide
 	// this from tracebacks and stack scans since it isn't a
@@ -1698,6 +1657,7 @@
 
 	// Return mp.curg to dead state.
 	casgstatus(mp.curg, _Gsyscall, _Gdead)
+	mp.curg.preemptStop = false
 	atomic.Xadd(&sched.ngsys, +1)
 
 	// Block signals before unminit.
@@ -1742,8 +1702,7 @@
 	for {
 		old := atomic.Loaduintptr(&extram)
 		if old == locked {
-			yield := osyield
-			yield()
+			osyield()
 			continue
 		}
 		if old == 0 && !nilokay {
@@ -1760,8 +1719,7 @@
 		if atomic.Casuintptr(&extram, old, locked) {
 			return (*m)(unsafe.Pointer(old))
 		}
-		yield := osyield
-		yield()
+		osyield()
 		continue
 	}
 }
@@ -1862,10 +1820,16 @@
 	if GOARCH == "wasm" { // no threads on wasm yet
 		return
 	}
+
+	// Disable preemption to guarantee that the template thread will be
+	// created before a park once haveTemplateThread is set.
+	mp := acquirem()
 	if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) {
+		releasem(mp)
 		return
 	}
 	newm(templateThread, nil)
+	releasem(mp)
 }
 
 // templateThread is a thread in a known-good state that exists solely
@@ -2036,6 +2000,9 @@
 		startm(_p_, false)
 		return
 	}
+	if when := nobarrierWakeTime(_p_); when != 0 {
+		wakeNetPoller(when)
+	}
 	pidleput(_p_)
 	unlock(&sched.lock)
 }
@@ -2043,8 +2010,11 @@
 // Tries to add one more P to execute G's.
 // Called when a G is made runnable (newproc, ready).
 func wakep() {
+	if atomic.Load(&sched.npidle) == 0 {
+		return
+	}
 	// be conservative about spinning threads
-	if !atomic.Cas(&sched.nmspinning, 0, 1) {
+	if atomic.Load(&sched.nmspinning) != 0 || !atomic.Cas(&sched.nmspinning, 0, 1) {
 		return
 	}
 	startm(nil, true)
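
wakep now folds the npidle/nmspinning checks into itself, so the call sites in this file that used to guard it shrink to a bare wakep(). It also loads nmspinning before attempting the CAS, skipping the write when another spinner already exists. A small sketch of that load-before-CAS idiom (tryBecomeSpinner and its package are hypothetical, not runtime code):

package spinner

import "sync/atomic"

// tryBecomeSpinner claims the single "spinning" slot only if a cheap
// atomic load shows it is free, mirroring the check wakep performs
// before its compare-and-swap.
func tryBecomeSpinner(nmspinning *uint32) bool {
	if atomic.LoadUint32(nmspinning) != 0 {
		return false // someone is already spinning; skip the CAS
	}
	return atomic.CompareAndSwapUint32(nmspinning, 0, 1)
}
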
@@ -2137,6 +2107,10 @@
 func execute(gp *g, inheritTime bool) {
 	_g_ := getg()
 
+	// Assign gp.m before entering _Grunning so running Gs have an
+	// M.
+	_g_.m.curg = gp
+	gp.m = _g_.m
 	casgstatus(gp, _Grunnable, _Grunning)
 	gp.waitsince = 0
 	gp.preempt = false
@@ -2144,8 +2118,6 @@
 	if !inheritTime {
 		_g_.m.p.ptr().schedtick++
 	}
-	_g_.m.curg = gp
-	gp.m = _g_.m
 
 	// Check whether the profiler needs to be turned on or off.
 	hz := sched.profilehz
@@ -2166,7 +2138,7 @@
 }
 
 // Finds a runnable goroutine to execute.
-// Tries to steal from other P's, get g from global queue, poll network.
+// Tries to steal from other P's, get g from local or global queue, poll network.
 func findrunnable() (gp *g, inheritTime bool) {
 	_g_ := getg()
 
@@ -2183,6 +2155,9 @@
 	if _p_.runSafePointFn != 0 {
 		runSafePointFn()
 	}
+
+	now, pollUntil, _ := checkTimers(_p_, 0)
+
 	if fingwait && fingwake {
 		if gp := wakefing(); gp != nil {
 			ready(gp, 0, true)
@@ -2215,7 +2190,7 @@
 	// not set lastpoll yet), this thread will do blocking netpoll below
 	// anyway.
 	if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Load64(&sched.lastpoll) != 0 {
-		if list := netpoll(false); !list.empty() { // non-blocking
+		if list := netpoll(0); !list.empty() { // non-blocking
 			gp := list.pop()
 			injectglist(&list)
 			casgstatus(gp, _Gwaiting, _Grunnable)
@@ -2228,12 +2203,7 @@
 
 	// Steal work from other P's.
 	procs := uint32(gomaxprocs)
-	if atomic.Load(&sched.npidle) == procs-1 {
-		// Either GOMAXPROCS=1 or everybody, except for us, is idle already.
-		// New work can appear from returning syscall/cgocall, network or timers.
-		// Neither of that submits to local run queues, so no point in stealing.
-		goto stop
-	}
+	ranTimer := false
 	// If number of spinning M's >= number of busy P's, block.
 	// This is necessary to prevent excessive CPU consumption
 	// when GOMAXPROCS>>1 but the program parallelism is low.
@@ -2250,11 +2220,51 @@
 				goto top
 			}
 			stealRunNextG := i > 2 // first look for ready queues with more than 1 g
-			if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil {
+			p2 := allp[enum.position()]
+			if _p_ == p2 {
+				continue
+			}
+			if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
 				return gp, false
 			}
+
+			// Consider stealing timers from p2.
+			// This call to checkTimers is the only place where
+			// we hold a lock on a different P's timers.
+			// Lock contention can be a problem here, so
+			// initially avoid grabbing the lock if p2 is running
+			// and is not marked for preemption. If p2 is running
+			// and not being preempted we assume it will handle its
+			// own timers.
+			// If we're still looking for work after checking all
+			// the P's, then go ahead and steal from an active P.
+			if i > 2 || (i > 1 && shouldStealTimers(p2)) {
+				tnow, w, ran := checkTimers(p2, now)
+				now = tnow
+				if w != 0 && (pollUntil == 0 || w < pollUntil) {
+					pollUntil = w
+				}
+				if ran {
+					// Running the timers may have
+					// made an arbitrary number of G's
+					// ready and added them to this P's
+					// local run queue. That invalidates
+					// the assumption of runqsteal
+					// that it always has room to add
+					// stolen G's. So check now if there
+					// is a local G to run.
+					if gp, inheritTime := runqget(_p_); gp != nil {
+						return gp, inheritTime
+					}
+					ranTimer = true
+				}
+			}
 		}
 	}
+	if ranTimer {
+		// Running a timer may have made some goroutine ready.
+		goto top
+	}
 
 stop:
 
@@ -2271,11 +2281,25 @@
 		return gp, false
 	}
 
+	delta := int64(-1)
+	if pollUntil != 0 {
+		// checkTimers ensures that pollUntil > now.
+		delta = pollUntil - now
+	}
+
 	// wasm only:
 	// If a callback returned and no other goroutine is awake,
-	// then pause execution until a callback was triggered.
-	if beforeIdle() {
-		// At least one goroutine got woken.
+	// then wake event handler goroutine which pauses execution
+	// until a callback was triggered.
+	gp, otherReady := beforeIdle(delta)
+	if gp != nil {
+		casgstatus(gp, _Gwaiting, _Grunnable)
+		if trace.enabled {
+			traceGoUnpark(gp, 0)
+		}
+		return gp, false
+	}
+	if otherReady {
 		goto top
 	}
 
@@ -2362,21 +2386,35 @@
 	}
 
 	// poll network
-	if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+	if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+		atomic.Store64(&sched.pollUntil, uint64(pollUntil))
 		if _g_.m.p != 0 {
 			throw("findrunnable: netpoll with p")
 		}
 		if _g_.m.spinning {
 			throw("findrunnable: netpoll with spinning")
 		}
-		list := netpoll(true) // block until new work is available
+		if faketime != 0 {
+			// When using fake time, just poll.
+			delta = 0
+		}
+		list := netpoll(delta) // block until new work is available
+		atomic.Store64(&sched.pollUntil, 0)
 		atomic.Store64(&sched.lastpoll, uint64(nanotime()))
-		if !list.empty() {
-			lock(&sched.lock)
-			_p_ = pidleget()
-			unlock(&sched.lock)
-			if _p_ != nil {
-				acquirep(_p_)
+		if faketime != 0 && list.empty() {
+			// Using fake time and nothing is ready; stop M.
+			// When all M's stop, checkdead will call timejump.
+			stopm()
+			goto top
+		}
+		lock(&sched.lock)
+		_p_ = pidleget()
+		unlock(&sched.lock)
+		if _p_ == nil {
+			injectglist(&list)
+		} else {
+			acquirep(_p_)
+			if !list.empty() {
 				gp := list.pop()
 				injectglist(&list)
 				casgstatus(gp, _Gwaiting, _Grunnable)
@@ -2385,7 +2423,16 @@
 				}
 				return gp, false
 			}
-			injectglist(&list)
+			if wasSpinning {
+				_g_.m.spinning = true
+				atomic.Xadd(&sched.nmspinning, 1)
+			}
+			goto top
+		}
+	} else if pollUntil != 0 && netpollinited() {
+		pollerPollUntil := int64(atomic.Load64(&sched.pollUntil))
+		if pollerPollUntil == 0 || pollerPollUntil > pollUntil {
+			netpollBreak()
 		}
 	}
 	stopm()
@@ -2405,7 +2452,7 @@
 		return true
 	}
 	if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 {
-		if list := netpoll(false); !list.empty() {
+		if list := netpoll(0); !list.empty() {
 			injectglist(&list)
 			return true
 		}
@@ -2413,6 +2460,22 @@
 	return false
 }
 
+// wakeNetPoller wakes up the thread sleeping in the network poller,
+// if there is one, and if it isn't going to wake up anyhow before
+// the when argument.
+func wakeNetPoller(when int64) {
+	if atomic.Load64(&sched.lastpoll) == 0 {
+		// In findrunnable we ensure that when polling the pollUntil
+		// field is either zero or the time to which the current
+		// poll is expected to run. This can have a spurious wakeup
+		// but should never miss a wakeup.
+		pollerPollUntil := int64(atomic.Load64(&sched.pollUntil))
+		if pollerPollUntil == 0 || pollerPollUntil > when {
+			netpollBreak()
+		}
+	}
+}
+
 func resetspinning() {
 	_g_ := getg()
 	if !_g_.m.spinning {
@@ -2426,12 +2489,16 @@
 	// M wakeup policy is deliberately somewhat conservative, so check if we
 	// need to wakeup another P here. See "Worker thread parking/unparking"
 	// comment at the top of the file for details.
-	if nmspinning == 0 && atomic.Load(&sched.npidle) > 0 {
-		wakep()
-	}
+	wakep()
 }
 
-// Injects the list of runnable G's into the scheduler and clears glist.
+// injectglist adds each runnable G on the list to some run queue,
+// and clears glist. If there is no current P, they are added to the
+// global queue, and up to npidle M's are started to run them.
+// Otherwise, for each idle P, this adds a G to the global queue
+// and starts an M. Any remaining G's are added to the current P's
+// local run queue.
+// This may temporarily acquire the scheduler lock.
 // Can run concurrently with GC.
 func injectglist(glist *gList) {
 	if glist.empty() {
@@ -2442,18 +2509,52 @@
 			traceGoUnpark(gp, 0)
 		}
 	}
-	lock(&sched.lock)
-	var n int
-	for n = 0; !glist.empty(); n++ {
-		gp := glist.pop()
+
+	// Mark all the goroutines as runnable before we put them
+	// on the run queues.
+	head := glist.head.ptr()
+	var tail *g
+	qsize := 0
+	for gp := head; gp != nil; gp = gp.schedlink.ptr() {
+		tail = gp
+		qsize++
 		casgstatus(gp, _Gwaiting, _Grunnable)
-		globrunqput(gp)
+	}
+
+	// Turn the gList into a gQueue.
+	var q gQueue
+	q.head.set(head)
+	q.tail.set(tail)
+	*glist = gList{}
+
+	startIdle := func(n int) {
+		for ; n != 0 && sched.npidle != 0; n-- {
+			startm(nil, false)
+		}
+	}
+
+	pp := getg().m.p.ptr()
+	if pp == nil {
+		lock(&sched.lock)
+		globrunqputbatch(&q, int32(qsize))
+		unlock(&sched.lock)
+		startIdle(qsize)
+		return
+	}
+
+	lock(&sched.lock)
+	npidle := int(sched.npidle)
+	var n int
+	for n = 0; n < npidle && !q.empty(); n++ {
+		globrunqput(q.pop())
 	}
 	unlock(&sched.lock)
-	for ; n != 0 && sched.npidle != 0; n-- {
-		startm(nil, false)
+	startIdle(n)
+	qsize -= n
+
+	if !q.empty() {
+		runqputbatch(pp, &q, qsize)
 	}
-	*glist = gList{}
 }
 
 // One round of scheduler: find a runnable goroutine and execute it.
@@ -2477,14 +2578,26 @@
 	}
 
 top:
+	pp := _g_.m.p.ptr()
+	pp.preempt = false
+
 	if sched.gcwaiting != 0 {
 		gcstopm()
 		goto top
 	}
-	if _g_.m.p.ptr().runSafePointFn != 0 {
+	if pp.runSafePointFn != 0 {
 		runSafePointFn()
 	}
 
+	// Sanity check: if we are spinning, the run queue should be empty.
+	// Check this before calling checkTimers, as that might call
+	// goready to put a ready goroutine on the local run queue.
+	if _g_.m.spinning && (pp.runnext != 0 || pp.runqhead != pp.runqtail) {
+		throw("schedule: spinning with local work")
+	}
+
+	checkTimers(pp, 0)
+
 	var gp *g
 	var inheritTime bool
 
@@ -2516,9 +2629,8 @@
 	}
 	if gp == nil {
 		gp, inheritTime = runqget(_g_.m.p.ptr())
-		if gp != nil && _g_.m.spinning {
-			throw("schedule: spinning with local work")
-		}
+		// We can see gp != nil here even if the M is spinning,
+		// if checkTimers added a local goroutine via goready.
 	}
 	if gp == nil {
 		gp, inheritTime = findrunnable() // blocks until work is available
@@ -2551,9 +2663,7 @@
 	// If about to schedule a not-normal goroutine (a GCworker or tracereader),
 	// wake a P if there is one.
 	if tryWakeP {
-		if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
-			wakep()
-		}
+		wakep()
 	}
 	if gp.lockedm != 0 {
 		// Hands off own p to the locked m,
@@ -2579,6 +2689,90 @@
 	setGNoWB(&_g_.m.curg, nil)
 }
 
+// checkTimers runs any timers for the P that are ready.
+// If now is not 0 it is the current time.
+// It returns the current time or 0 if it is not known,
+// and the time when the next timer should run or 0 if there is no next timer,
+// and reports whether it ran any timers.
+// If the time when the next timer should run is not 0,
+// it is always larger than the returned time.
+// We pass now in and out to avoid extra calls of nanotime.
+//go:yeswritebarrierrec
+func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
+	// If there are no timers to adjust, and the first timer on
+	// the heap is not yet ready to run, then there is nothing to do.
+	if atomic.Load(&pp.adjustTimers) == 0 {
+		next := int64(atomic.Load64(&pp.timer0When))
+		if next == 0 {
+			return now, 0, false
+		}
+		if now == 0 {
+			now = nanotime()
+		}
+		if now < next {
+			// Next timer is not ready to run.
+			// But keep going if we would clear deleted timers.
+			// This corresponds to the condition below where
+			// we decide whether to call clearDeletedTimers.
+			if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
+				return now, next, false
+			}
+		}
+	}
+
+	lock(&pp.timersLock)
+
+	adjusttimers(pp)
+
+	rnow = now
+	if len(pp.timers) > 0 {
+		if rnow == 0 {
+			rnow = nanotime()
+		}
+		for len(pp.timers) > 0 {
+			// Note that runtimer may temporarily unlock
+			// pp.timersLock.
+			if tw := runtimer(pp, rnow); tw != 0 {
+				if tw > 0 {
+					pollUntil = tw
+				}
+				break
+			}
+			ran = true
+		}
+	}
+
+	// If this is the local P, and there are a lot of deleted timers,
+	// clear them out. We only do this for the local P to reduce
+	// lock contention on timersLock.
+	if pp == getg().m.p.ptr() && int(atomic.Load(&pp.deletedTimers)) > len(pp.timers)/4 {
+		clearDeletedTimers(pp)
+	}
+
+	unlock(&pp.timersLock)
+
+	return rnow, pollUntil, ran
+}
+
+// shouldStealTimers reports whether we should try stealing the timers from p2.
+// We don't steal timers from a running P that is not marked for preemption,
+// on the assumption that it will run its own timers. This reduces
+// contention on the timers lock.
+func shouldStealTimers(p2 *p) bool {
+	if p2.status != _Prunning {
+		return true
+	}
+	mp := p2.m.ptr()
+	if mp == nil || mp.locks > 0 {
+		return false
+	}
+	gp := mp.curg
+	if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
+		return false
+	}
+	return true
+}
+
 func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
 	unlock((*mutex)(lock))
 	return true
@@ -2636,7 +2830,7 @@
 // goschedguarded is a forbidden-states-avoided version of gosched_m
 func goschedguarded_m(gp *g) {
 
-	if gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.p.ptr().status != _Prunning {
+	if !canPreemptM(gp.m) {
 		gogo(&gp.sched) // never return
 	}
 
@@ -2653,6 +2847,50 @@
 	goschedImpl(gp)
 }
 
+// preemptPark parks gp and puts it in _Gpreempted.
+//
+//go:systemstack
+func preemptPark(gp *g) {
+	if trace.enabled {
+		traceGoPark(traceEvGoBlock, 0)
+	}
+	status := readgstatus(gp)
+	if status&^_Gscan != _Grunning {
+		dumpgstatus(gp)
+		throw("bad g status")
+	}
+	gp.waitreason = waitReasonPreempted
+	// Transition from _Grunning to _Gscan|_Gpreempted. We can't
+	// be in _Grunning when we dropg because then we'd be running
+	// without an M, but the moment we're in _Gpreempted,
+	// something could claim this G before we've fully cleaned it
+	// up. Hence, we set the scan bit to lock down further
+	// transitions until we can dropg.
+	casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
+	dropg()
+	casfrom_Gscanstatus(gp, _Gscan|_Gpreempted, _Gpreempted)
+	schedule()
+}
+
+// goyield is like Gosched, but it:
+// - emits a GoPreempt trace event instead of a GoSched trace event
+// - puts the current G on the runq of the current P instead of the globrunq
+func goyield() {
+	checkTimeouts()
+	mcall(goyield_m)
+}
+
+func goyield_m(gp *g) {
+	if trace.enabled {
+		traceGoPreempt()
+	}
+	pp := gp.m.p.ptr()
+	casgstatus(gp, _Grunning, _Grunnable)
+	dropg()
+	runqput(pp, gp, false)
+	schedule()
+}
+
 // Finishes execution of the current goroutine.
 func goexit1() {
 	if raceenabled {
@@ -2676,6 +2914,7 @@
 	locked := gp.lockedm != 0
 	gp.lockedm = 0
 	_g_.m.lockedg = 0
+	gp.preemptStop = false
 	gp.paniconfault = false
 	gp._defer = nil // should be true already but just in case.
 	gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
@@ -2694,9 +2933,6 @@
 		gp.gcAssistBytes = 0
 	}
 
-	// Note that gp's stack scan is now "valid" because it has no
-	// stack.
-	gp.gcscanvalid = true
 	dropg()
 
 	if GOARCH == "wasm" { // no threads yet on wasm
@@ -2835,7 +3071,6 @@
 
 	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
 	_g_.sysblocktraced = true
-	_g_.m.mcache = nil
 	pp := _g_.m.p.ptr()
 	pp.m = 0
 	_g_.m.oldp.set(pp)
@@ -2961,9 +3196,6 @@
 	oldp := _g_.m.oldp.ptr()
 	_g_.m.oldp = 0
 	if exitsyscallfast(oldp) {
-		if _g_.m.mcache == nil {
-			throw("lost mcache")
-		}
 		if trace.enabled {
 			if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
 				systemstack(traceGoStart)
@@ -3014,10 +3246,6 @@
 	// Call the scheduler.
 	mcall(exitsyscall0)
 
-	if _g_.m.mcache == nil {
-		throw("lost mcache")
-	}
-
 	// Scheduler returned, so we're allowed to run now.
 	// Delete the syscallsp information that we left for
 	// the garbage collector during the system call.
@@ -3237,6 +3465,9 @@
 		})
 		newg.stackguard0 = newg.stack.lo + _StackGuard
 		newg.stackguard1 = ^uintptr(0)
+		// Clear the bottom word of the stack. We record g
+		// there on gsignal stack during VDSO on ARM and ARM64.
+		*(*uintptr)(unsafe.Pointer(newg.stack.lo)) = 0
 	}
 	return newg
 }
@@ -3244,23 +3475,44 @@
 // Create a new g running fn with siz bytes of arguments.
 // Put it on the queue of g's waiting to run.
 // The compiler turns a go statement into a call to this.
-// Cannot split the stack because it assumes that the arguments
-// are available sequentially after &fn; they would not be
-// copied if a stack split occurred.
+//
+// The stack layout of this call is unusual: it assumes that the
+// arguments to pass to fn are on the stack sequentially immediately
+// after &fn. Hence, they are logically part of newproc's argument
+// frame, even though they don't appear in its signature (and can't
+// because their types differ between call sites).
+//
+// This must be nosplit because this stack layout means there are
+// untyped arguments in newproc's argument frame. Stack copies won't
+// be able to adjust them and stack splits won't be able to copy them.
+//
 //go:nosplit
 func newproc(siz int32, fn *funcval) {
 	argp := add(unsafe.Pointer(&fn), sys.PtrSize)
 	gp := getg()
 	pc := getcallerpc()
 	systemstack(func() {
-		newproc1(fn, (*uint8)(argp), siz, gp, pc)
+		newg := newproc1(fn, argp, siz, gp, pc)
+
+		_p_ := getg().m.p.ptr()
+		runqput(_p_, newg, true)
+
+		if mainStarted {
+			wakep()
+		}
 	})
 }
 
-// Create a new g running fn with narg bytes of arguments starting
-// at argp. callerpc is the address of the go statement that created
-// this. The new g is put on the queue of g's waiting to run.
-func newproc1(fn *funcval, argp *uint8, narg int32, callergp *g, callerpc uintptr) {
+// Create a new g in state _Grunnable, starting at fn, with narg bytes
+// of arguments starting at argp. callerpc is the address of the go
+// statement that created this. The caller is responsible for adding
+// the new g to the scheduler.
+//
+// This must run on the system stack because it's the continuation of
+// newproc, which cannot split the stack.
+//
+//go:systemstack
+func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerpc uintptr) *g {
 	_g_ := getg()
 
 	if fn == nil {
@@ -3305,7 +3557,7 @@
 		spArg += sys.MinFrameSize
 	}
 	if narg > 0 {
-		memmove(unsafe.Pointer(spArg), unsafe.Pointer(argp), uintptr(narg))
+		memmove(unsafe.Pointer(spArg), argp, uintptr(narg))
 		// This is a stack-to-stack copy. If write barriers
 		// are enabled and the source stack is grey (the
 		// destination is always black), then perform a
@@ -3338,7 +3590,6 @@
 	if isSystemGoroutine(newg, false) {
 		atomic.Xadd(&sched.ngsys, +1)
 	}
-	newg.gcscanvalid = false
 	casgstatus(newg, _Gdead, _Grunnable)
 
 	if _p_.goidcache == _p_.goidcacheend {
@@ -3357,12 +3608,9 @@
 	if trace.enabled {
 		traceGoCreate(newg, newg.startpc)
 	}
-	runqput(_p_, newg, true)
-
-	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && mainStarted {
-		wakep()
-	}
 	releasem(_g_.m)
+
+	return newg
 }
 
 // saveAncestors copies previous ancestors of the given caller g and
@@ -3909,10 +4157,12 @@
 	pp.wbBuf.reset()
 	if pp.mcache == nil {
 		if id == 0 {
-			if getg().m.mcache == nil {
+			if mcache0 == nil {
 				throw("missing mcache?")
 			}
-			pp.mcache = getg().m.mcache // bootstrap
+			// Use the bootstrap mcache0. Only one P will get
+			// mcache0: the one with ID 0.
+			pp.mcache = mcache0
 		} else {
 			pp.mcache = allocmcache()
 		}
@@ -3925,6 +4175,7 @@
 			pp.raceprocctx = raceproccreate()
 		}
 	}
+	lockInit(&pp.timersLock, lockRankTimers)
 }
 
 // destroy releases all of the resources associated with pp and
@@ -3944,6 +4195,23 @@
 		globrunqputhead(pp.runnext.ptr())
 		pp.runnext = 0
 	}
+	if len(pp.timers) > 0 {
+		plocal := getg().m.p.ptr()
+		// The world is stopped, but we acquire timersLock to
+		// protect against sysmon calling timeSleepUntil.
+		// This is the only case where we hold the timersLock of
+		// more than one P, so there are no deadlock concerns.
+		lock(&plocal.timersLock)
+		lock(&pp.timersLock)
+		moveTimers(plocal, pp.timers)
+		pp.timers = nil
+		pp.numTimers = 0
+		pp.adjustTimers = 0
+		pp.deletedTimers = 0
+		atomic.Store64(&pp.timer0When, 0)
+		unlock(&pp.timersLock)
+		unlock(&plocal.timersLock)
+	}
 	// If there's a background worker, make it runnable and put
 	// it on the global queue so it can clean itself up.
 	if gp := pp.gcBgMarkWorker.ptr(); gp != nil {
@@ -3971,11 +4239,34 @@
 		}
 		pp.deferpool[i] = pp.deferpoolbuf[i][:0]
 	}
+	systemstack(func() {
+		for i := 0; i < pp.mspancache.len; i++ {
+			// Safe to call since the world is stopped.
+			mheap_.spanalloc.free(unsafe.Pointer(pp.mspancache.buf[i]))
+		}
+		pp.mspancache.len = 0
+		pp.pcache.flush(&mheap_.pages)
+	})
 	freemcache(pp.mcache)
 	pp.mcache = nil
 	gfpurge(pp)
 	traceProcFree(pp)
 	if raceenabled {
+		if pp.timerRaceCtx != 0 {
+			// The race detector code uses a callback to fetch
+			// the proc context, so arrange for that callback
+			// to see the right thing.
+			// This hack only works because we are the only
+			// thread running.
+			mp := getg().m
+			phold := mp.p.ptr()
+			mp.p.set(pp)
+
+			racectxend(pp.timerRaceCtx)
+			pp.timerRaceCtx = 0
+
+			mp.p.set(phold)
+		}
 		raceprocdestroy(pp.raceprocctx)
 		pp.raceprocctx = 0
 	}
@@ -4052,7 +4343,6 @@
 			_g_.m.p.ptr().m = 0
 		}
 		_g_.m.p = 0
-		_g_.m.mcache = nil
 		p := allp[0]
 		p.m = 0
 		p.status = _Pidle
@@ -4062,6 +4352,9 @@
 		}
 	}
 
+	// g.m.p is now set, so we no longer need mcache0 for bootstrapping.
+	mcache0 = nil
+
 	// release resources from unused P's
 	for i := nprocs; i < old; i++ {
 		p := allp[i]
@@ -4127,7 +4420,7 @@
 func wirep(_p_ *p) {
 	_g_ := getg()
 
-	if _g_.m.p != 0 || _g_.m.mcache != nil {
+	if _g_.m.p != 0 {
 		throw("wirep: already in go")
 	}
 	if _p_.m != 0 || _p_.status != _Pidle {
@@ -4138,7 +4431,6 @@
 		print("wirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
 		throw("wirep: invalid p state")
 	}
-	_g_.m.mcache = _p_.mcache
 	_g_.m.p.set(_p_)
 	_p_.m.set(_g_.m)
 	_p_.status = _Prunning
@@ -4148,19 +4440,18 @@
 func releasep() *p {
 	_g_ := getg()
 
-	if _g_.m.p == 0 || _g_.m.mcache == nil {
+	if _g_.m.p == 0 {
 		throw("releasep: invalid arg")
 	}
 	_p_ := _g_.m.p.ptr()
-	if _p_.m.ptr() != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning {
-		print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n")
+	if _p_.m.ptr() != _g_.m || _p_.status != _Prunning {
+		print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", hex(_p_.m), " p->status=", _p_.status, "\n")
 		throw("releasep: invalid p state")
 	}
 	if trace.enabled {
 		traceProcStop(_g_.m.p.ptr())
 	}
 	_g_.m.p = 0
-	_g_.m.mcache = nil
 	_p_.m = 0
 	_p_.status = _Pidle
 	return _p_
@@ -4226,7 +4517,8 @@
 		}
 		s := readgstatus(gp)
 		switch s &^ _Gscan {
-		case _Gwaiting:
+		case _Gwaiting,
+			_Gpreempted:
 			grunning++
 		case _Grunnable,
 			_Grunning,
@@ -4238,30 +4530,42 @@
 	}
 	unlock(&allglock)
 	if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
+		unlock(&sched.lock) // unlock so that GODEBUG=scheddetail=1 doesn't hang
 		throw("no goroutines (main called runtime.Goexit) - deadlock!")
 	}
 
 	// Maybe jump time forward for playground.
-	gp := timejump()
-	if gp != nil {
-		casgstatus(gp, _Gwaiting, _Grunnable)
-		globrunqput(gp)
-		_p_ := pidleget()
-		if _p_ == nil {
-			throw("checkdead: no p for timer")
+	if faketime != 0 {
+		when, _p_ := timeSleepUntil()
+		if _p_ != nil {
+			faketime = when
+			for pp := &sched.pidle; *pp != 0; pp = &(*pp).ptr().link {
+				if (*pp).ptr() == _p_ {
+					*pp = _p_.link
+					break
+				}
+			}
+			mp := mget()
+			if mp == nil {
+				// There should always be a free M since
+				// nothing is running.
+				throw("checkdead: no m for timer")
+			}
+			mp.nextp.set(_p_)
+			notewakeup(&mp.park)
+			return
 		}
-		mp := mget()
-		if mp == nil {
-			// There should always be a free M since
-			// nothing is running.
-			throw("checkdead: no m for timer")
+	}
+
+	// There are no goroutines running, so we can look at the P's.
+	for _, _p_ := range allp {
+		if len(_p_.timers) > 0 {
+			return
 		}
-		mp.nextp.set(_p_)
-		notewakeup(&mp.park)
-		return
 	}
 
 	getg().m.throwing = -1 // do not dump full stacks
+	unlock(&sched.lock)    // unlock so that GODEBUG=scheddetail=1 doesn't hang
 	throw("all goroutines are asleep - deadlock!")
 }
 
@@ -4294,47 +4598,60 @@
 			delay = 10 * 1000
 		}
 		usleep(delay)
+		now := nanotime()
+		next, _ := timeSleepUntil()
 		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
 			lock(&sched.lock)
 			if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
-				atomic.Store(&sched.sysmonwait, 1)
-				unlock(&sched.lock)
-				// Make wake-up period small enough
-				// for the sampling to be correct.
-				maxsleep := forcegcperiod / 2
-				shouldRelax := true
-				if osRelaxMinNS > 0 {
-					next := timeSleepUntil()
-					now := nanotime()
-					if next-now < osRelaxMinNS {
-						shouldRelax = false
+				if next > now {
+					atomic.Store(&sched.sysmonwait, 1)
+					unlock(&sched.lock)
+					// Make wake-up period small enough
+					// for the sampling to be correct.
+					sleep := forcegcperiod / 2
+					if next-now < sleep {
+						sleep = next - now
 					}
+					shouldRelax := sleep >= osRelaxMinNS
+					if shouldRelax {
+						osRelax(true)
+					}
+					notetsleep(&sched.sysmonnote, sleep)
+					if shouldRelax {
+						osRelax(false)
+					}
+					now = nanotime()
+					next, _ = timeSleepUntil()
+					lock(&sched.lock)
+					atomic.Store(&sched.sysmonwait, 0)
+					noteclear(&sched.sysmonnote)
 				}
-				if shouldRelax {
-					osRelax(true)
-				}
-				notetsleep(&sched.sysmonnote, maxsleep)
-				if shouldRelax {
-					osRelax(false)
-				}
-				lock(&sched.lock)
-				atomic.Store(&sched.sysmonwait, 0)
-				noteclear(&sched.sysmonnote)
 				idle = 0
 				delay = 20
 			}
 			unlock(&sched.lock)
 		}
+		lock(&sched.sysmonlock)
+		{
+			// If we spent a long time blocked on sysmonlock
+			// then we want to update now and next since they're
+			// likely stale.
+			now1 := nanotime()
+			if now1-now > 50*1000 /* 50µs */ {
+				next, _ = timeSleepUntil()
+			}
+			now = now1
+		}
+
 		// trigger libc interceptors if needed
 		if *cgo_yield != nil {
 			asmcgocall(*cgo_yield, nil)
 		}
 		// poll network if not polled for more than 10ms
 		lastpoll := int64(atomic.Load64(&sched.lastpoll))
-		now := nanotime()
 		if netpollinited() && lastpoll != 0 && lastpoll+10*1000*1000 < now {
 			atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
-			list := netpoll(false) // non-blocking - returns list of goroutines
+			list := netpoll(0) // non-blocking - returns list of goroutines
 			if !list.empty() {
 				// Need to decrement number of idle locked M's
 				// (pretending that one more is running) before injectglist.
@@ -4348,6 +4665,16 @@
 				incidlelocked(1)
 			}
 		}
+		if next < now {
+			// There are timers that should have already run,
+			// perhaps because there is an unpreemptible P.
+			// Try to start an M to run them.
+			startm(nil, false)
+		}
+		if atomic.Load(&scavenge.sysmonWake) != 0 {
+			// Kick the scavenger awake if someone requested it.
+			wakeScavenger()
+		}
 		// retake P's blocked in syscalls
 		// and preempt long running G's
 		if retake(now) != 0 {
@@ -4368,6 +4695,7 @@
 			lasttrace = now
 			schedtrace(debug.scheddetail > 0)
 		}
+		unlock(&sched.sysmonlock)
 	}
 }
 
@@ -4496,6 +4824,13 @@
 	// Setting gp->stackguard0 to StackPreempt folds
 	// preemption into the normal stack overflow check.
 	gp.stackguard0 = stackPreempt
+
+	// Request an async preemption of this P.
+	if preemptMSupported && debug.asyncpreemptoff == 0 {
+		_p_.preempt = true
+		preemptM(mp)
+	}
+
 	return true
 }
 
@@ -4524,7 +4859,7 @@
 			if mp != nil {
 				id = mp.id
 			}
-			print("  P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, "\n")
+			print("  P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gFree.n, " timerslen=", len(_p_.timers), "\n")
 		} else {
 			// In non-detailed mode format lengths of per-P run queues as:
 			// [len1 len2 len3 len4]
@@ -4829,6 +5164,38 @@
 	return true
 }
 
+// runqputbatch tries to put all the G's on q on the local runnable queue.
+// If the queue is full, they are put on the global queue; in that case
+// this will temporarily acquire the scheduler lock.
+// Executed only by the owner P.
+func runqputbatch(pp *p, q *gQueue, qsize int) {
+	h := atomic.LoadAcq(&pp.runqhead)
+	t := pp.runqtail
+	n := uint32(0)
+	for !q.empty() && t-h < uint32(len(pp.runq)) {
+		gp := q.pop()
+		pp.runq[t%uint32(len(pp.runq))].set(gp)
+		t++
+		n++
+	}
+	qsize -= int(n)
+
+	if randomizeScheduler {
+		off := func(o uint32) uint32 {
+			return (pp.runqtail + o) % uint32(len(pp.runq))
+		}
+		for i := uint32(1); i < n; i++ {
+			j := fastrandn(i + 1)
+			pp.runq[off(i)], pp.runq[off(j)] = pp.runq[off(j)], pp.runq[off(i)]
+		}
+	}
+
+	atomic.StoreRel(&pp.runqtail, t)
+	if !q.empty() {
+		globrunqputbatch(q, int32(qsize))
+	}
+}
+
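runqputbatch above drains a batch into the P's fixed-size run-queue ring and spills whatever does not fit to the global queue under the scheduler lock. A toy, single-threaded analogue of that shape (all names and sizes here are illustrative, not runtime API):

package main

import "fmt"

// localQueue is a toy analogue of a P's run queue: a small fixed ring
// (the runtime uses 256 slots) plus a shared overflow slice standing in
// for the global run queue.
type localQueue struct {
	ring       [4]int
	head, tail uint32
}

// putBatch drains batch into the ring until it is full, then spills the
// rest to the overflow queue, like runqputbatch above.
func (q *localQueue) putBatch(batch []int, global *[]int) {
	for len(batch) > 0 && q.tail-q.head < uint32(len(q.ring)) {
		q.ring[q.tail%uint32(len(q.ring))] = batch[0]
		q.tail++
		batch = batch[1:]
	}
	*global = append(*global, batch...)
}

func main() {
	var q localQueue
	var global []int
	q.putBatch([]int{1, 2, 3, 4, 5, 6}, &global)
	fmt.Println("local:", q.ring, "global:", global)
}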
 // Get g from local runnable queue.
 // If inheritTime is true, gp should inherit the remaining time in the
 // current time slice. Otherwise, it should start a new time slice.
@@ -5194,6 +5561,7 @@
 }
 
 // An initTask represents the set of initializations that need to be done for a package.
+// Keep in sync with ../../test/initempty.go:initTask
 type initTask struct {
 	// TODO: pack the first 3 fields more tightly?
 	state uintptr // 0 = uninitialized, 1 = in progress, 2 = done
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 6e6272e..de4dec3 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -6,6 +6,8 @@
 
 import (
 	"fmt"
+	"internal/race"
+	"internal/testenv"
 	"math"
 	"net"
 	"runtime"
@@ -356,6 +358,17 @@
 	atomic.StoreUint32(&stop, 1)
 }
 
+func TestAsyncPreempt(t *testing.T) {
+	if !runtime.PreemptMSupported {
+		t.Skip("asynchronous preemption not supported on this platform")
+	}
+	output := runTestProg(t, "testprog", "AsyncPreempt")
+	want := "OK\n"
+	if output != want {
+		t.Fatalf("want %s, got %s\n", want, output)
+	}
+}
+
 func TestGCFairness(t *testing.T) {
 	output := runTestProg(t, "testprog", "GCFairness")
 	want := "OK\n"
@@ -411,6 +424,11 @@
 	if testing.Short() {
 		t.Skip("skipping in -short mode")
 	}
+	if race.Enabled {
+		// The race detector randomizes the scheduler,
+		// which causes this test to fail (#38266).
+		t.Skip("skipping in -race mode")
+	}
 
 	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
 	done := make(chan bool)
@@ -912,6 +930,29 @@
 	}
 }
 
+func TestLockOSThreadTemplateThreadRace(t *testing.T) {
+	testenv.MustHaveGoRun(t)
+
+	exe, err := buildTestProg(t, "testprog")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	iterations := 100
+	if testing.Short() {
+		// Reduce run time to ~100ms, with much lower probability of
+		// catching issues.
+		iterations = 5
+	}
+	for i := 0; i < iterations; i++ {
+		want := "OK\n"
+		output := runBuiltTestProg(t, exe, "LockOSThreadTemplateThreadRace")
+		if output != want {
+			t.Fatalf("run %d: want %q, got %q", i, want, output)
+		}
+	}
+}
+
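TestLockOSThreadTemplateThreadRace stresses template-thread creation racing with a goroutine wired to its OS thread. For orientation, a minimal LockOSThread usage sketch (illustration only):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	done := make(chan struct{})
	go func() {
		runtime.LockOSThread() // this goroutine now owns its OS thread
		defer runtime.UnlockOSThread()
		// Thread-affine work (TLS, certain C libraries, UI loops) goes here.
		close(done)
	}()
	<-done
	fmt.Println("locked-thread work finished")
}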
 // fakeSyscall emulates a system call.
 //go:nosplit
 func fakeSyscall(duration time.Duration) {
@@ -981,3 +1022,61 @@
 func TestGetgThreadSwitch(t *testing.T) {
 	runtime.RunGetgThreadSwitchTest()
 }
+
+// TestNetpollBreak tests that netpollBreak can break a netpoll.
+// This test is not particularly safe since the call to netpoll
+// will pick up any stray files that are ready, but it should work
+// OK as long as it is not run in parallel.
+func TestNetpollBreak(t *testing.T) {
+	if runtime.GOMAXPROCS(0) == 1 {
+		t.Skip("skipping: GOMAXPROCS=1")
+	}
+
+	// Make sure that netpoll is initialized.
+	runtime.NetpollGenericInit()
+
+	start := time.Now()
+	c := make(chan bool, 2)
+	go func() {
+		c <- true
+		runtime.Netpoll(10 * time.Second.Nanoseconds())
+		c <- true
+	}()
+	<-c
+	// Loop because the break might get eaten by the scheduler.
+	// Break twice to break both the netpoll we started and the
+	// scheduler netpoll.
+loop:
+	for {
+		runtime.Usleep(100)
+		runtime.NetpollBreak()
+		runtime.NetpollBreak()
+		select {
+		case <-c:
+			break loop
+		default:
+		}
+	}
+	if dur := time.Since(start); dur > 5*time.Second {
+		t.Errorf("netpollBreak did not interrupt netpoll: slept for: %v", dur)
+	}
+}
+
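netpollBreak, exercised by the test above, lets one thread interrupt another thread's blocked network poll. The closest user-level analogue is selecting on a wake-up channel alongside the event being awaited; a sketch under that framing:

package main

import (
	"fmt"
	"time"
)

func main() {
	work := make(chan int)
	wake := make(chan struct{})
	go func() {
		time.Sleep(50 * time.Millisecond)
		close(wake) // "break" the wait without producing any work
	}()
	select {
	case v := <-work:
		fmt.Println("got work:", v)
	case <-wake:
		fmt.Println("woken early, no work")
	case <-time.After(10 * time.Second):
		fmt.Println("timed out") // mirrors the 10s poll in the test above
	}
}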
+// TestBigGOMAXPROCS tests that setting GOMAXPROCS to a large value
+// doesn't cause a crash at startup. See issue 38474.
+func TestBigGOMAXPROCS(t *testing.T) {
+	t.Parallel()
+	output := runTestProg(t, "testprog", "NonexistentTest", "GOMAXPROCS=1024")
+	// Ignore error conditions on small machines.
+	for _, errstr := range []string{
+		"failed to create new OS thread",
+		"cannot allocate memory",
+	} {
+		if strings.Contains(output, errstr) {
+			t.Skipf("failed to create 1024 threads")
+		}
+	}
+	if !strings.Contains(output, "unknown function: NonexistentTest") {
+		t.Errorf("output:\n%s\nwanted:\nunknown function: NonexistentTest", output)
+	}
+}
diff --git a/src/runtime/race.go b/src/runtime/race.go
index c41e1ba..53910f9 100644
--- a/src/runtime/race.go
+++ b/src/runtime/race.go
@@ -155,15 +155,51 @@
 	}
 }
 
+// raceSymbolizeCode reads ctx.pc and populates the rest of *ctx with
+// information about the code at that pc.
+//
+// The race detector has already subtracted 1 from pcs, so they point to the last
+// byte of call instructions (including calls to runtime.racewrite and friends).
+//
+// If the incoming pc is part of an inlined function, *ctx is populated
+// with information about the inlined function, and on return ctx.pc is set
+// to a pc in the logically containing function. (The race detector should call this
+// function again with that pc.)
+//
+// If the incoming pc is not part of an inlined function, the return pc is unchanged.
 func raceSymbolizeCode(ctx *symbolizeCodeContext) {
-	f := findfunc(ctx.pc)._Func()
+	pc := ctx.pc
+	fi := findfunc(pc)
+	f := fi._Func()
 	if f != nil {
-		file, line := f.FileLine(ctx.pc)
+		file, line := f.FileLine(pc)
 		if line != 0 {
-			ctx.fn = cfuncname(f.funcInfo())
+			if inldata := funcdata(fi, _FUNCDATA_InlTree); inldata != nil {
+				inltree := (*[1 << 20]inlinedCall)(inldata)
+				for {
+					ix := pcdatavalue(fi, _PCDATA_InlTreeIndex, pc, nil)
+					if ix >= 0 {
+						if inltree[ix].funcID == funcID_wrapper {
+							// ignore wrappers
+							// Back up to an instruction in the "caller".
+							pc = f.Entry() + uintptr(inltree[ix].parentPc)
+							continue
+						}
+						ctx.pc = f.Entry() + uintptr(inltree[ix].parentPc) // "caller" pc
+						ctx.fn = cfuncnameFromNameoff(fi, inltree[ix].func_)
+						ctx.line = uintptr(line)
+						ctx.file = &bytes(file)[0] // assume NUL-terminated
+						ctx.off = pc - f.Entry()
+						ctx.res = 1
+						return
+					}
+					break
+				}
+			}
+			ctx.fn = cfuncname(fi)
 			ctx.line = uintptr(line)
 			ctx.file = &bytes(file)[0] // assume NUL-terminated
-			ctx.off = ctx.pc - f.Entry()
+			ctx.off = pc - f.Entry()
 			ctx.res = 1
 			return
 		}
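The inline-tree walk added to raceSymbolizeCode gives inlined calls their own frames in race reports (the midstack-inlining case tested later in output_test.go). In user code, the portable way to get the same expansion is runtime.CallersFrames, shown here only as an aside:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	pcs := make([]uintptr, 16)
	n := runtime.Callers(0, pcs)
	frames := runtime.CallersFrames(pcs[:n])
	for {
		f, more := frames.Next() // expands inlined calls into separate frames
		fmt.Printf("%s\n\t%s:%d\n", f.Function, f.File, f.Line)
		if !more {
			break
		}
	}
}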
@@ -349,7 +385,7 @@
 	if end < firstmoduledata.ebss {
 		end = firstmoduledata.ebss
 	}
-	size := round(end-start, _PageSize)
+	size := alignUp(end-start, _PageSize)
 	racecall(&__tsan_map_shadow, start, size, 0, 0)
 	racedatastart = start
 	racedataend = start + size
@@ -367,6 +403,9 @@
 	// already held it's assumed that the first caller exits the program
 	// so other calls can hang forever without an issue.
 	lock(&raceFiniLock)
+	// We're entering external code that may call ExitProcess on
+	// Windows.
+	osPreemptExtEnter(getg().m)
 	racecall(&__tsan_fini, 0, 0, 0, 0)
 }
 
@@ -424,6 +463,11 @@
 }
 
 //go:nosplit
+func racectxend(racectx uintptr) {
+	racecall(&__tsan_go_end, racectx, 0, 0, 0)
+}
+
+//go:nosplit
 func racewriterangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
 	_g_ := getg()
 	if _g_ != _g_.m.curg {
@@ -471,6 +515,14 @@
 }
 
 //go:nosplit
+func raceacquirectx(racectx uintptr, addr unsafe.Pointer) {
+	if !isvalidaddr(addr) {
+		return
+	}
+	racecall(&__tsan_acquire, racectx, uintptr(addr), 0, 0)
+}
+
+//go:nosplit
 func racerelease(addr unsafe.Pointer) {
 	racereleaseg(getg(), addr)
 }
diff --git a/src/runtime/race/README b/src/runtime/race/README
index be53b4c..65378c8 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -1,13 +1,13 @@
 runtime/race package contains the data race detector runtime library.
 It is based on ThreadSanitizer race detector, that is currently a part of
-the LLVM project (http://llvm.org/git/compiler-rt.git).
+the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt).
 
 To update the .syso files use golang.org/x/build/cmd/racebuild.
 
-race_darwin_amd64.syso built with LLVM fe2c72c59aa7f4afa45e3f65a5d16a374b6cce26 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_freebsd_amd64.syso built with LLVM fe2c72c59aa7f4afa45e3f65a5d16a374b6cce26 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_linux_amd64.syso built with LLVM fe2c72c59aa7f4afa45e3f65a5d16a374b6cce26 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_linux_ppc64le.syso built with LLVM fe2c72c59aa7f4afa45e3f65a5d16a374b6cce26 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_netbsd_amd64.syso built with LLVM fe2c72c59aa7f4afa45e3f65a5d16a374b6cce26 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_windows_amd64.syso built with LLVM ae08a22cc215448aa3ad5a6fb099f6df77e9fa01 and Go 323c85862a7afbde66a3bba0776bf4ba6cd7c030.
-race_linux_arm64.syso built with LLVM 3aa2b775d08f903f804246af10b80a439c16b436 and Go ef2c48659880c7e8a989e6721a21f018790f7793.
+race_darwin_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_freebsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_linux_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_linux_ppc64le.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_netbsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_windows_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
+race_linux_arm64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
index 019ad58..d3e7762 100644
--- a/src/runtime/race/output_test.go
+++ b/src/runtime/race/output_test.go
@@ -24,7 +24,7 @@
 		t.Fatal(err)
 	}
 	defer os.RemoveAll(pkgdir)
-	out, err := exec.Command(testenv.GoToolPath(t), "install", "-race", "-pkgdir="+pkgdir, "-gcflags=all=-l", "testing").CombinedOutput()
+	out, err := exec.Command(testenv.GoToolPath(t), "install", "-race", "-pkgdir="+pkgdir, "testing").CombinedOutput()
 	if err != nil {
 		t.Fatalf("go install -race: %v\n%s", err, out)
 	}
@@ -56,8 +56,8 @@
 		if err := f.Close(); err != nil {
 			t.Fatalf("failed to close file: %v", err)
 		}
-		// Pass -l to the compiler to test stack traces.
-		cmd := exec.Command(testenv.GoToolPath(t), test.run, "-race", "-pkgdir="+pkgdir, "-gcflags=all=-l", src)
+
+		cmd := exec.Command(testenv.GoToolPath(t), test.run, "-race", "-pkgdir="+pkgdir, src)
 		// GODEBUG spoils program output, GOMAXPROCS makes it flaky.
 		for _, env := range os.Environ() {
 			if strings.HasPrefix(env, "GODEBUG=") ||
@@ -218,6 +218,52 @@
   main\.main\.func1\(\)
       .*/main.go:7`},
 
+	// Test for https://golang.org/issue/33309
+	{"midstack_inlining_traceback", "run", "linux", "atexit_sleep_ms=0", `
+package main
+
+var x int
+
+func main() {
+	c := make(chan int)
+	go f(c)
+	x = 1
+	<-c
+}
+
+func f(c chan int) {
+	g(c)
+}
+
+func g(c chan int) {
+	h(c)
+}
+
+func h(c chan int) {
+	c <- x
+}
+`, `==================
+WARNING: DATA RACE
+Read at 0x[0-9,a-f]+ by goroutine [0-9]:
+  main\.h\(\)
+      .+/main\.go:22 \+0x[0-9,a-f]+
+  main\.g\(\)
+      .+/main\.go:18 \+0x[0-9,a-f]+
+  main\.f\(\)
+      .+/main\.go:14 \+0x[0-9,a-f]+
+
+Previous write at 0x[0-9,a-f]+ by main goroutine:
+  main\.main\(\)
+      .+/main\.go:9 \+0x[0-9,a-f]+
+
+Goroutine [0-9] \(running\) created at:
+  main\.main\(\)
+      .+/main\.go:8 \+0x[0-9,a-f]+
+==================
+Found 1 data race\(s\)
+exit status 66
+`},
+
 	// Test for https://golang.org/issue/17190
 	{"external_cgo_thread", "run", "linux", "atexit_sleep_ms=0", `
 package main
diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go
index d298e80..c894de5 100644
--- a/src/runtime/race/race.go
+++ b/src/runtime/race/race.go
@@ -7,7 +7,7 @@
 package race
 
 // This file merely ensures that we link in runtime/cgo in race build,
-// this is turn ensures that runtime uses pthread_create to create threads.
+// this in turn ensures that runtime uses pthread_create to create threads.
 // The prebuilt race runtime lives in race_GOOS_GOARCH.syso.
 // Calls to the runtime are done directly from src/runtime/race.go.
 
diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso
index 0e4017b..d03a593 100644
--- a/src/runtime/race/race_darwin_amd64.syso
+++ b/src/runtime/race/race_darwin_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso
index fcae118..573591c 100644
--- a/src/runtime/race/race_freebsd_amd64.syso
+++ b/src/runtime/race/race_freebsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso
index c18e2a0..255b2e5 100644
--- a/src/runtime/race/race_linux_amd64.syso
+++ b/src/runtime/race/race_linux_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_arm64.syso b/src/runtime/race/race_linux_arm64.syso
index 65bc1ec..f15c599 100644
--- a/src/runtime/race/race_linux_arm64.syso
+++ b/src/runtime/race/race_linux_arm64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_ppc64le.syso b/src/runtime/race/race_linux_ppc64le.syso
index a3609db..2bf5029 100644
--- a/src/runtime/race/race_linux_ppc64le.syso
+++ b/src/runtime/race/race_linux_ppc64le.syso
Binary files differ
diff --git a/src/runtime/race/race_netbsd_amd64.syso b/src/runtime/race/race_netbsd_amd64.syso
index 3937a61..54e276b 100644
--- a/src/runtime/race/race_netbsd_amd64.syso
+++ b/src/runtime/race/race_netbsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso
index 1f1dd17..abaf426 100644
--- a/src/runtime/race/race_windows_amd64.syso
+++ b/src/runtime/race/race_windows_amd64.syso
Binary files differ
diff --git a/src/runtime/race/testdata/chan_test.go b/src/runtime/race/testdata/chan_test.go
index 60e55ed..3e57b82 100644
--- a/src/runtime/race/testdata/chan_test.go
+++ b/src/runtime/race/testdata/chan_test.go
@@ -737,3 +737,29 @@
 	case <-make(chan int):
 	}
 }
+
+// Test that close synchronizes with a read from the empty closed channel.
+// See https://golang.org/issue/36714.
+func TestNoRaceCloseHappensBeforeRead(t *testing.T) {
+	for i := 0; i < 100; i++ {
+		var loc int
+		var write = make(chan struct{})
+		var read = make(chan struct{})
+
+		go func() {
+			select {
+			case <-write:
+				_ = loc
+			default:
+			}
+			close(read)
+		}()
+
+		go func() {
+			loc = 1
+			close(write)
+		}()
+
+		<-read
+	}
+}
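The new test depends on the memory-model guarantee that close(c) happens before a receive that completes because c is closed. A minimal sketch of code relying on that ordering:

package main

import "fmt"

func main() {
	var msg string
	done := make(chan struct{})
	go func() {
		msg = "hello" // write happens before...
		close(done)   // ...the close
	}()
	<-done           // the receive completes because done is closed
	fmt.Println(msg) // guaranteed to observe "hello"
}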
diff --git a/src/runtime/race/testdata/select_test.go b/src/runtime/race/testdata/select_test.go
index 3827867..9a43f9b 100644
--- a/src/runtime/race/testdata/select_test.go
+++ b/src/runtime/race/testdata/select_test.go
@@ -20,7 +20,7 @@
 		x = 1
 		// At least two channels are needed because
 		// otherwise the compiler optimizes select out.
-		// See comment in runtime/select.go:^func selectgoImpl.
+		// See comment in runtime/select.go:^func selectgo.
 		select {
 		case c <- true:
 		case c1 <- true:
diff --git a/src/runtime/race/testdata/slice_test.go b/src/runtime/race/testdata/slice_test.go
index 1ec5243..9009a9a 100644
--- a/src/runtime/race/testdata/slice_test.go
+++ b/src/runtime/race/testdata/slice_test.go
@@ -5,6 +5,7 @@
 package race_test
 
 import (
+	"sync"
 	"testing"
 )
 
@@ -590,3 +591,18 @@
 	_ = x[:1:i]
 	<-done
 }
+
+var saved string
+
+func TestRaceSlice4(t *testing.T) {
+	// See issue 36794.
+	data := []byte("hello there")
+	var wg sync.WaitGroup
+	wg.Add(1)
+	go func() {
+		_ = string(data)
+		wg.Done()
+	}()
+	copy(data, data[2:])
+	wg.Wait()
+}
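TestRaceSlice4 covers issue 36794: string(data) reads the slice's backing array and therefore races with a concurrent copy into it. One straightforward race-free variant, shown as a sketch, simply serializes the two accesses:

package main

import (
	"fmt"
	"sync"
)

func main() {
	data := []byte("hello there")
	var mu sync.Mutex
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		mu.Lock()
		s := string(data) // snapshot of the backing array, taken under the lock
		mu.Unlock()
		fmt.Println(len(s))
	}()
	mu.Lock()
	copy(data, data[2:])
	mu.Unlock()
	wg.Wait()
}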
diff --git a/src/runtime/race/timer_test.go b/src/runtime/race/timer_test.go
new file mode 100644
index 0000000..a6c34a8
--- /dev/null
+++ b/src/runtime/race/timer_test.go
@@ -0,0 +1,33 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build race
+
+package race_test
+
+import (
+	"sync"
+	"testing"
+	"time"
+)
+
+func TestTimers(t *testing.T) {
+	const goroutines = 8
+	var wg sync.WaitGroup
+	wg.Add(goroutines)
+	var mu sync.Mutex
+	for i := 0; i < goroutines; i++ {
+		go func() {
+			defer wg.Done()
+			ticker := time.NewTicker(1)
+			defer ticker.Stop()
+			for c := 0; c < 1000; c++ {
+				<-ticker.C
+				mu.Lock()
+				mu.Unlock()
+			}
+		}()
+	}
+	wg.Wait()
+}
diff --git a/src/runtime/race0.go b/src/runtime/race0.go
index f1d3706..6f26afa 100644
--- a/src/runtime/race0.go
+++ b/src/runtime/race0.go
@@ -29,6 +29,7 @@
 func racewriterangepc(addr unsafe.Pointer, sz, callerpc, pc uintptr)        { throw("race") }
 func raceacquire(addr unsafe.Pointer)                                       { throw("race") }
 func raceacquireg(gp *g, addr unsafe.Pointer)                               { throw("race") }
+func raceacquirectx(racectx uintptr, addr unsafe.Pointer)                   { throw("race") }
 func racerelease(addr unsafe.Pointer)                                       { throw("race") }
 func racereleaseg(gp *g, addr unsafe.Pointer)                               { throw("race") }
 func racereleasemerge(addr unsafe.Pointer)                                  { throw("race") }
@@ -38,3 +39,4 @@
 func racefree(p unsafe.Pointer, sz uintptr)                                 { throw("race") }
 func racegostart(pc uintptr) uintptr                                        { throw("race"); return 0 }
 func racegoend()                                                            { throw("race") }
+func racectxend(racectx uintptr)                                            { throw("race") }
diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s
index 4ed9533..758d543 100644
--- a/src/runtime/race_amd64.s
+++ b/src/runtime/race_amd64.s
@@ -416,9 +416,11 @@
 	// Set g = g0.
 	get_tls(R12)
 	MOVQ	g(R12), R13
-	MOVQ	g_m(R13), R13
-	MOVQ	m_g0(R13), R14
-	MOVQ	R14, g(R12)	// g = m->g0
+	MOVQ	g_m(R13), R14
+	MOVQ	m_g0(R14), R15
+	CMPQ	R13, R15
+	JEQ	noswitch	// branch if already on g0
+	MOVQ	R15, g(R12)	// g = m->g0
 	PUSHQ	RARG1	// func arg
 	PUSHQ	RARG0	// func arg
 	CALL	runtime·racecallback(SB)
@@ -430,6 +432,7 @@
 	MOVQ	g_m(R13), R13
 	MOVQ	m_curg(R13), R14
 	MOVQ	R14, g(R12)	// g = m->curg
+ret:
 	// Restore callee-saved registers.
 	POPQ	R15
 	POPQ	R14
@@ -440,3 +443,12 @@
 	POPQ	BP
 	POPQ	BX
 	RET
+
+noswitch:
+	// already on g0
+	PUSHQ	RARG1	// func arg
+	PUSHQ	RARG0	// func arg
+	CALL	runtime·racecallback(SB)
+	POPQ	R12
+	POPQ	R12
+	JMP	ret
diff --git a/src/runtime/race_arm64.s b/src/runtime/race_arm64.s
index 48c719a..9b909ac 100644
--- a/src/runtime/race_arm64.s
+++ b/src/runtime/race_arm64.s
@@ -421,8 +421,7 @@
 	// First, code below assumes that we are on curg, while raceGetProcCmd
 	// can be executed on g0. Second, it is called frequently, so will
 	// benefit from this fast path.
-	CMP	$0, R0
-	BNE	rest
+	CBNZ	R0, rest
 	MOVD	g, R13
 	load_g
 	MOVD	g_m(g), R0
@@ -434,13 +433,13 @@
 rest:
 	// Save callee-saved registers (Go code won't respect that).
 	// 8(RSP) and 16(RSP) are for args passed through racecallback
-	SUB	$96, RSP
+	SUB	$112, RSP
 	MOVD	LR, 0(RSP)
 	STP	(R19, R20), 24(RSP)
 	STP	(R21, R22), 40(RSP)
 	STP	(R23, R24), 56(RSP)
 	STP	(R25, R26), 72(RSP)
-	MOVD	R27, 88(RSP)
+	STP	(R27,   g), 88(RSP)
 	// Set g = g0.
 	// load_g will clobber R0, Save R0
 	MOVD	R0, R13
@@ -448,7 +447,10 @@
 	// restore R0
 	MOVD	R13, R0
 	MOVD	g_m(g), R13
-	MOVD	m_g0(R13), g
+	MOVD	m_g0(R13), R14
+	CMP	R14, g
+	BEQ	noswitch	// branch if already on g0
+	MOVD	R14, g
 
 	MOVD	R0, 8(RSP)	// func arg
 	MOVD	R1, 16(RSP)	// func arg
@@ -457,15 +459,23 @@
 	// All registers are smashed after Go code, reload.
 	MOVD	g_m(g), R13
 	MOVD	m_curg(R13), g	// g = m->curg
+ret:
 	// Restore callee-saved registers.
 	MOVD	0(RSP), LR
 	LDP	24(RSP), (R19, R20)
 	LDP	40(RSP), (R21, R22)
 	LDP	56(RSP), (R23, R24)
 	LDP	72(RSP), (R25, R26)
-	MOVD	88(RSP), R27
-	ADD	$96, RSP
+	LDP	88(RSP), (R27,   g)
+	ADD	$112, RSP
 	JMP	(LR)
 
+noswitch:
+	// already on g0
+	MOVD	R0, 8(RSP)	// func arg
+	MOVD	R1, 16(RSP)	// func arg
+	BL	runtime·racecallback(SB)
+	JMP	ret
+
 // tls_g, g value for each thread in TLS
 GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8
diff --git a/src/runtime/race_ppc64le.s b/src/runtime/race_ppc64le.s
index 79b8ba2..7421d53 100644
--- a/src/runtime/race_ppc64le.s
+++ b/src/runtime/race_ppc64le.s
@@ -8,6 +8,7 @@
 #include "go_tls.h"
 #include "funcdata.h"
 #include "textflag.h"
+#include "asm_ppc64x.h"
 
 // The following functions allow calling the clang-compiled race runtime directly
 // from Go code without going all the way through cgo.
@@ -101,7 +102,7 @@
 	MOVD    $__tsan_read_range(SB), R8
 	BR	racecalladdr<>(SB)
 
-TEXT    runtime·RaceReadRange(SB), NOSPLIT, $0-24
+TEXT    runtime·RaceReadRange(SB), NOSPLIT, $0-16
 	BR	runtime·racereadrange(SB)
 
 // func runtime·RaceWriteRange(addr, size uintptr)
@@ -467,9 +468,9 @@
 	MOVD	R10, 16(R1)
 	MOVW	CR, R10
 	MOVW	R10, 8(R1)
-	MOVDU   R1, -336(R1) // Allocate frame needed for register save area
+	MOVDU   R1, -336(R1) // Allocate frame needed for outargs and register save area
 
-	MOVD    R14, 40(R1)
+	MOVD    R14, 328(R1)
 	MOVD    R15, 48(R1)
 	MOVD    R16, 56(R1)
 	MOVD    R17, 64(R1)
@@ -506,21 +507,30 @@
 	FMOVD   F30, 312(R1)
 	FMOVD   F31, 320(R1)
 
+	MOVD	R3, FIXED_FRAME+0(R1)
+	MOVD	R4, FIXED_FRAME+8(R1)
+
 	MOVD    runtime·tls_g(SB), R10
 	MOVD    0(R13)(R10*1), g
 
 	MOVD	g_m(g), R7
-	MOVD	m_g0(R7), g // set g = m-> g0
-	MOVD	R3, cmd+0(FP) // can't use R1 here ?? use input args and assumer caller expects those?
-	MOVD	R4, ctx+8(FP) // can't use R1 here ??
+	MOVD	m_g0(R7), R8
+	CMP	g, R8
+	BEQ	noswitch
+
+	MOVD	R8, g // set g = m-> g0
+
 	BL	runtime·racecallback(SB)
+
 	// All registers are clobbered after Go code, reload.
 	MOVD    runtime·tls_g(SB), R10
 	MOVD    0(R13)(R10*1), g
 
 	MOVD	g_m(g), R7
 	MOVD	m_curg(R7), g // restore g = m->curg
-	MOVD    40(R1), R14
+
+ret:
+	MOVD    328(R1), R14
 	MOVD    48(R1), R15
 	MOVD    56(R1), R16
 	MOVD    64(R1), R17
@@ -564,5 +574,9 @@
 	MOVD    R10, LR
 	RET
 
+noswitch:
+	BL      runtime·racecallback(SB)
+	JMP     ret
+
 // tls_g, g value for each thread in TLS
 GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8
diff --git a/src/runtime/rt0_darwin_386.s b/src/runtime/rt0_darwin_386.s
deleted file mode 100644
index a8d3a79..0000000
--- a/src/runtime/rt0_darwin_386.s
+++ /dev/null
@@ -1,17 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-TEXT _rt0_386_darwin(SB),NOSPLIT,$0
-	JMP	_rt0_386(SB)
-
-TEXT _rt0_386_darwin_lib(SB),NOSPLIT,$0
-	JMP	_rt0_386_lib(SB)
-
-TEXT main(SB),NOSPLIT,$0
-	// Remove the return address from the stack.
-	// rt0_go doesn't expect it to be there.
-	ADDL	$4, SP
-	JMP	runtime·rt0_go(SB)
diff --git a/src/runtime/rt0_darwin_arm.s b/src/runtime/rt0_darwin_arm.s
deleted file mode 100644
index 71fbe5f..0000000
--- a/src/runtime/rt0_darwin_arm.s
+++ /dev/null
@@ -1,11 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-TEXT _rt0_arm_darwin(SB),7,$0
-	B	_rt0_asm(SB)
-
-TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$0
-	B	_rt0_arm_lib(SB)
diff --git a/src/runtime/rt0_freebsd_arm64.s b/src/runtime/rt0_freebsd_arm64.s
new file mode 100644
index 0000000..3a348c3
--- /dev/null
+++ b/src/runtime/rt0_freebsd_arm64.s
@@ -0,0 +1,106 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// On FreeBSD argc/argv are passed in R0, not RSP
+TEXT _rt0_arm64_freebsd(SB),NOSPLIT|NOFRAME,$0
+	ADD	$8, R0, R1	// argv
+	MOVD	0(R0), R0	// argc
+	BL	main(SB)
+
+// When building with -buildmode=c-shared, this symbol is called when the shared
+// library is loaded.
+TEXT _rt0_arm64_freebsd_lib(SB),NOSPLIT,$184
+	// Preserve callee-save registers.
+	MOVD R19, 24(RSP)
+	MOVD R20, 32(RSP)
+	MOVD R21, 40(RSP)
+	MOVD R22, 48(RSP)
+	MOVD R23, 56(RSP)
+	MOVD R24, 64(RSP)
+	MOVD R25, 72(RSP)
+	MOVD R26, 80(RSP)
+	MOVD R27, 88(RSP)
+	FMOVD F8, 96(RSP)
+	FMOVD F9, 104(RSP)
+	FMOVD F10, 112(RSP)
+	FMOVD F11, 120(RSP)
+	FMOVD F12, 128(RSP)
+	FMOVD F13, 136(RSP)
+	FMOVD F14, 144(RSP)
+	FMOVD F15, 152(RSP)
+	MOVD g, 160(RSP)
+
+	// Initialize g to nil in case it is used later, e.g. by sigaction in cgo_sigaction.go
+	MOVD	ZR, g
+
+	MOVD	R0, _rt0_arm64_freebsd_lib_argc<>(SB)
+	MOVD	R1, _rt0_arm64_freebsd_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOVD	$runtime·libpreinit(SB), R4
+	BL	(R4)
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVD	_cgo_sys_thread_create(SB), R4
+	CMP	$0, R4
+	BEQ	nocgo
+	MOVD	$_rt0_arm64_freebsd_lib_go(SB), R0
+	MOVD	$0, R1
+	SUB	$16, RSP	// reserve 16 bytes for sp-8 where fp may be saved.
+	BL	(R4)
+	ADD	$16, RSP
+	B	restore
+
+nocgo:
+	MOVD	$0x800000, R0                     // stacksize = 8192KB
+	MOVD	$_rt0_arm64_freebsd_lib_go(SB), R1
+	MOVD	R0, 8(RSP)
+	MOVD	R1, 16(RSP)
+	MOVD	$runtime·newosproc0(SB),R4
+	BL	(R4)
+
+restore:
+	// Restore callee-save registers.
+	MOVD 24(RSP), R19
+	MOVD 32(RSP), R20
+	MOVD 40(RSP), R21
+	MOVD 48(RSP), R22
+	MOVD 56(RSP), R23
+	MOVD 64(RSP), R24
+	MOVD 72(RSP), R25
+	MOVD 80(RSP), R26
+	MOVD 88(RSP), R27
+	FMOVD 96(RSP), F8
+	FMOVD 104(RSP), F9
+	FMOVD 112(RSP), F10
+	FMOVD 120(RSP), F11
+	FMOVD 128(RSP), F12
+	FMOVD 136(RSP), F13
+	FMOVD 144(RSP), F14
+	FMOVD 152(RSP), F15
+	MOVD 160(RSP), g
+	RET
+
+TEXT _rt0_arm64_freebsd_lib_go(SB),NOSPLIT,$0
+	MOVD	_rt0_arm64_freebsd_lib_argc<>(SB), R0
+	MOVD	_rt0_arm64_freebsd_lib_argv<>(SB), R1
+	MOVD	$runtime·rt0_go(SB),R4
+	B       (R4)
+
+DATA _rt0_arm64_freebsd_lib_argc<>(SB)/8, $0
+GLOBL _rt0_arm64_freebsd_lib_argc<>(SB),NOPTR, $8
+DATA _rt0_arm64_freebsd_lib_argv<>(SB)/8, $0
+GLOBL _rt0_arm64_freebsd_lib_argv<>(SB),NOPTR, $8
+
+
+TEXT main(SB),NOSPLIT|NOFRAME,$0
+	MOVD	$runtime·rt0_go(SB), R2
+	BL	(R2)
+exit:
+	MOVD	$0, R0
+	MOVD	$1, R8	// SYS_exit
+	SVC
+	B	exit
diff --git a/src/runtime/rt0_js_wasm.s b/src/runtime/rt0_js_wasm.s
index b22c46e..714582a 100644
--- a/src/runtime/rt0_js_wasm.s
+++ b/src/runtime/rt0_js_wasm.s
@@ -19,7 +19,7 @@
 // R0: argc (i32)
 // R1: argv (i32)
 TEXT wasm_export_run(SB),NOSPLIT,$0
-	MOVD $runtime·wasmStack+m0Stack__size(SB), SP
+	MOVD $runtime·wasmStack+(m0Stack__size-16)(SB), SP
 
 	Get SP
 	Get R0 // argc
diff --git a/src/runtime/rt0_linux_arm64.s b/src/runtime/rt0_linux_arm64.s
index a6bc99d..f48a8d6 100644
--- a/src/runtime/rt0_linux_arm64.s
+++ b/src/runtime/rt0_linux_arm64.s
@@ -44,8 +44,7 @@
 
 	// Create a new thread to do the runtime initialization and return.
 	MOVD	_cgo_sys_thread_create(SB), R4
-	CMP	$0, R4
-	BEQ	nocgo
+	CBZ	R4, nocgo
 	MOVD	$_rt0_arm64_linux_lib_go(SB), R0
 	MOVD	$0, R1
 	SUB	$16, RSP		// reserve 16 bytes for sp-8 where fp may be saved.
diff --git a/src/runtime/rt0_linux_riscv64.s b/src/runtime/rt0_linux_riscv64.s
new file mode 100644
index 0000000..f31f7f7
--- /dev/null
+++ b/src/runtime/rt0_linux_riscv64.s
@@ -0,0 +1,14 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_riscv64_linux(SB),NOSPLIT|NOFRAME,$0
+	MOV	0(X2), A0	// argc
+	ADD	$8, X2, A1	// argv
+	JMP	main(SB)
+
+TEXT main(SB),NOSPLIT|NOFRAME,$0
+	MOV	$runtime·rt0_go(SB), T0
+	JALR	ZERO, T0
diff --git a/src/runtime/rt0_nacl_386.s b/src/runtime/rt0_nacl_386.s
deleted file mode 100644
index 4c99002..0000000
--- a/src/runtime/rt0_nacl_386.s
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// NaCl entry has:
-//	0(FP) - arg block == SP+8
-//	4(FP) - cleanup function pointer, always 0
-//	8(FP) - envc
-//	12(FP) - argc
-//	16(FP) - argv, then 0, then envv, then 0, then auxv
-TEXT _rt0_386_nacl(SB),NOSPLIT,$8
-	MOVL	argc+12(FP), AX
-	LEAL	argv+16(FP), BX
-	MOVL	AX, 0(SP)
-	MOVL	BX, 4(SP)
-	JMP	runtime·rt0_go(SB)
-
-TEXT main(SB),NOSPLIT,$0
-	// Remove the return address from the stack.
-	// rt0_go doesn't expect it to be there.
-	ADDL	$4, SP
-	JMP	runtime·rt0_go(SB)
diff --git a/src/runtime/rt0_nacl_amd64p32.s b/src/runtime/rt0_nacl_amd64p32.s
deleted file mode 100644
index 38583c5..0000000
--- a/src/runtime/rt0_nacl_amd64p32.s
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// NaCl entry on 32-bit x86 has DI pointing at the arg block, which contains:
-//
-//	0(DI) - cleanup function pointer, always 0
-//	4(DI) - envc
-//	8(DI) - argc
-//	12(DI) - argv, then 0, then envv, then 0, then auxv
-// NaCl entry here is almost the same, except that there
-// is no saved caller PC, so 0(FP) is -8(FP) and so on.
-TEXT _rt0_amd64p32_nacl(SB),NOSPLIT,$16
-	MOVL	DI, 0(SP)
-	CALL	runtime·nacl_sysinfo(SB)
-	MOVL	0(SP), DI
-	MOVL	8(DI), AX
-	LEAL	12(DI), BX
-	MOVL	AX, 0(SP)
-	MOVL	BX, 4(SP)
-	CALL	main(SB)
-	INT	$3
-
-TEXT main(SB),NOSPLIT,$0
-	// Uncomment for fake time like on Go Playground.
-	//MOVQ	$1257894000000000000, AX
-	//MOVQ	AX, runtime·faketime(SB)
-	JMP	runtime·rt0_go(SB)
diff --git a/src/runtime/rt0_nacl_arm.s b/src/runtime/rt0_nacl_arm.s
deleted file mode 100644
index a52c0d8..0000000
--- a/src/runtime/rt0_nacl_arm.s
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-// NaCl entry has:
-//	0(FP) - 0
-//	4(FP) - cleanup function pointer, always 0
-//	8(FP) - envc
-//	12(FP) - argc
-//	16(FP) - argv, then 0, then envv, then 0, then auxv
-TEXT _rt0_arm_nacl(SB),NOSPLIT|NOFRAME,$0
-	MOVW	8(R13), R0
-	MOVW	$12(R13), R1
-	B	runtime·rt0_go(SB)
diff --git a/src/runtime/rt0_openbsd_arm64.s b/src/runtime/rt0_openbsd_arm64.s
index ab8ea97..12408f2 100644
--- a/src/runtime/rt0_openbsd_arm64.s
+++ b/src/runtime/rt0_openbsd_arm64.s
@@ -4,6 +4,12 @@
 
 #include "textflag.h"
 
+// See comment in runtime/sys_openbsd_arm64.s re this construction.
+#define	INVOKE_SYSCALL	\
+	SVC;		\
+	NOOP;		\
+	NOOP
+
 TEXT _rt0_arm64_openbsd(SB),NOSPLIT|NOFRAME,$0
 	MOVD	0(RSP), R0	// argc
 	ADD	$8, RSP, R1	// argv
@@ -101,5 +107,5 @@
 exit:
 	MOVD	$0, R0
 	MOVD	$1, R8		// sys_exit
-	SVC
+	INVOKE_SYSCALL
 	B	exit
diff --git a/src/runtime/runtime-gdb.py b/src/runtime/runtime-gdb.py
index 6139f99..8d96dfb 100644
--- a/src/runtime/runtime-gdb.py
+++ b/src/runtime/runtime-gdb.py
@@ -18,6 +18,7 @@
 from __future__ import print_function
 import re
 import sys
+import gdb
 
 print("Loading Go Runtime support.", file=sys.stderr)
 #http://python3porting.com/differences.html
@@ -98,11 +99,11 @@
 #  Pretty Printers
 #
 
-
+# The patterns for matching types are permissive because gdb 8.2 switched to matching on (we think) typedef names instead of C syntax names.
 class StringTypePrinter:
 	"Pretty print Go strings."
 
-	pattern = re.compile(r'^struct string( \*)?$')
+	pattern = re.compile(r'^(struct string( \*)?|string)$')
 
 	def __init__(self, val):
 		self.val = val
@@ -118,7 +119,7 @@
 class SliceTypePrinter:
 	"Pretty print slices."
 
-	pattern = re.compile(r'^struct \[\]')
+	pattern = re.compile(r'^(struct \[\]|\[\])')
 
 	def __init__(self, val):
 		self.val = val
@@ -127,7 +128,10 @@
 		return 'array'
 
 	def to_string(self):
-		return str(self.val.type)[6:]  # skip 'struct '
+		t = str(self.val.type)
+		if (t.startswith("struct ")):
+			return t[len("struct "):]
+		return t
 
 	def children(self):
 		sval = SliceValue(self.val)
@@ -195,7 +199,7 @@
 	to inspect their contents with this pretty printer.
 	"""
 
-	pattern = re.compile(r'^struct hchan<.*>$')
+	pattern = re.compile(r'^chan ')
 
 	def __init__(self, val):
 		self.val = val
@@ -209,7 +213,7 @@
 	def children(self):
 		# see chan.c chanbuf(). et is the type stolen from hchan<T>::recvq->first->elem
 		et = [x.type for x in self.val['recvq']['first'].type.target().fields() if x.name == 'elem'][0]
-		ptr = (self.val.address + 1).cast(et.pointer())
+		ptr = (self.val.address["buf"]).cast(et)
 		for i in range(self.val["qcount"]):
 			j = (self.val["recvx"] + i) % self.val["dataqsiz"]
 			yield ('[{0}]'.format(i), (ptr + j).dereference())
@@ -229,8 +233,6 @@
 	return matcher
 
 goobjfile.pretty_printers.extend([makematcher(var) for var in vars().values() if hasattr(var, 'pattern')])
-
-
 #
 #  Utilities
 #
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index de1bac6..e52bd1c 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -19,6 +19,12 @@
 	"testing"
 )
 
+// NOTE: In some configurations, GDB will segfault when sent a SIGWINCH signal.
+// Some runtime tests send SIGWINCH to the entire process group, so those tests
+// must never run in parallel with GDB tests.
+//
+// See issue 39021 and https://sourceware.org/bugzilla/show_bug.cgi?id=26056.
+
 func checkGdbEnvironment(t *testing.T) {
 	testenv.MustHaveGoBuild(t)
 	switch runtime.GOOS {
@@ -37,6 +43,12 @@
 		}
 	case "freebsd":
 		t.Skip("skipping gdb tests on FreeBSD; see https://golang.org/issue/29508")
+	case "aix":
+		if testing.Short() {
+			t.Skip("skipping gdb tests on AIX; see https://golang.org/issue/35710")
+		}
+	case "plan9":
+		t.Skip("there is no gdb on Plan 9")
 	}
 	if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final {
 		t.Skip("gdb test can fail with GOROOT_FINAL pending")
@@ -66,8 +78,8 @@
 }
 
 func checkGdbPython(t *testing.T) {
-	if runtime.GOOS == "solaris" && testenv.Builder() != "solaris-amd64-smartosbuildlet" {
-		t.Skip("skipping gdb python tests on solaris; see golang.org/issue/20821")
+	if runtime.GOOS == "solaris" || runtime.GOOS == "illumos" {
+		t.Skip("skipping gdb python tests on illumos and solaris; see golang.org/issue/20821")
 	}
 
 	cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
@@ -103,16 +115,26 @@
 var gslice []string
 func main() {
 	mapvar := make(map[string]string, 13)
+	slicemap := make(map[string][]string, 11)
+	chanint := make(chan int, 10)
+	chanstr := make(chan string, 10)
+	chanint <- 99
+	chanint <- 11
+	chanstr <- "spongepants"
+	chanstr <- "squarebob"
 	mapvar["abc"] = "def"
 	mapvar["ghi"] = "jkl"
+	slicemap["a"] = []string{"b", "c", "d"}
+	slicemap["e"] = []string{"f", "g", "h"}
 	strvar := "abc"
 	ptrvar := &strvar
 	slicevar := make([]string, 0, 16)
 	slicevar = append(slicevar, mapvar["abc"])
 	fmt.Println("hi")
 	runtime.KeepAlive(ptrvar)
-	_ = ptrvar
+	_ = ptrvar // set breakpoint here
 	gslice = slicevar
+	fmt.Printf("%v, %v, %v\n", slicemap, <-chanint, <-chanstr)
 	runtime.KeepAlive(mapvar)
 }  // END_OF_PROGRAM
 `
@@ -163,6 +185,16 @@
 
 	src := buf.Bytes()
 
+	// Locate breakpoint line
+	var bp int
+	lines := bytes.Split(src, []byte("\n"))
+	for i, line := range lines {
+		if bytes.Contains(line, []byte("breakpoint")) {
+			bp = i
+			break
+		}
+	}
+
 	err = ioutil.WriteFile(filepath.Join(dir, "main.go"), src, 0644)
 	if err != nil {
 		t.Fatalf("failed to create file: %v", err)
@@ -197,7 +229,7 @@
 	}
 	args = append(args,
 		"-ex", "set python print-stack full",
-		"-ex", "br main.go:15",
+		"-ex", fmt.Sprintf("br main.go:%d", bp),
 		"-ex", "run",
 		"-ex", "echo BEGIN info goroutines\n",
 		"-ex", "info goroutines",
@@ -205,18 +237,24 @@
 		"-ex", "echo BEGIN print mapvar\n",
 		"-ex", "print mapvar",
 		"-ex", "echo END\n",
+		"-ex", "echo BEGIN print slicemap\n",
+		"-ex", "print slicemap",
+		"-ex", "echo END\n",
 		"-ex", "echo BEGIN print strvar\n",
 		"-ex", "print strvar",
 		"-ex", "echo END\n",
+		"-ex", "echo BEGIN print chanint\n",
+		"-ex", "print chanint",
+		"-ex", "echo END\n",
+		"-ex", "echo BEGIN print chanstr\n",
+		"-ex", "print chanstr",
+		"-ex", "echo END\n",
 		"-ex", "echo BEGIN info locals\n",
 		"-ex", "info locals",
 		"-ex", "echo END\n",
 		"-ex", "echo BEGIN goroutine 1 bt\n",
 		"-ex", "goroutine 1 bt",
 		"-ex", "echo END\n",
-		"-ex", "echo BEGIN goroutine 2 bt\n",
-		"-ex", "goroutine 2 bt",
-		"-ex", "echo END\n",
 		"-ex", "echo BEGIN goroutine all bt\n",
 		"-ex", "goroutine all bt",
 		"-ex", "echo END\n",
@@ -228,8 +266,11 @@
 		"-ex", "echo END\n",
 		filepath.Join(dir, "a.exe"),
 	)
-	got, _ := exec.Command("gdb", args...).CombinedOutput()
-	t.Logf("gdb output: %s\n", got)
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
 
 	firstLine := bytes.SplitN(got, []byte("\n"), 2)[0]
 	if string(firstLine) != "Loading Go Runtime support." {
@@ -268,6 +309,23 @@
 		t.Fatalf("print mapvar failed: %s", bl)
 	}
 
+	// The map may print in either of two orders, and spacing may differ.
+	sliceMapSfx1 := `map[string][]string = {["e"] = []string = {"f", "g", "h"}, ["a"] = []string = {"b", "c", "d"}}`
+	sliceMapSfx2 := `map[string][]string = {["a"] = []string = {"b", "c", "d"}, ["e"] = []string = {"f", "g", "h"}}`
+	if bl := strings.ReplaceAll(blocks["print slicemap"], "  ", " "); !strings.HasSuffix(bl, sliceMapSfx1) && !strings.HasSuffix(bl, sliceMapSfx2) {
+		t.Fatalf("print slicemap failed: %s", bl)
+	}
+
+	chanIntSfx := `chan int = {99, 11}`
+	if bl := strings.ReplaceAll(blocks["print chanint"], "  ", " "); !strings.HasSuffix(bl, chanIntSfx) {
+		t.Fatalf("print chanint failed: %s", bl)
+	}
+
+	chanStrSfx := `chan string = {"spongepants", "squarebob"}`
+	if bl := strings.ReplaceAll(blocks["print chanstr"], "  ", " "); !strings.HasSuffix(bl, chanStrSfx) {
+		t.Fatalf("print chanstr failed: %s", bl)
+	}
+
 	strVarRe := regexp.MustCompile(`^\$[0-9]+ = (0x[0-9a-f]+\s+)?"abc"$`)
 	if bl := blocks["print strvar"]; !strVarRe.MatchString(bl) {
 		t.Fatalf("print strvar failed: %s", bl)
@@ -293,7 +351,6 @@
 
 	// Check that the backtraces are well formed.
 	checkCleanBacktrace(t, blocks["goroutine 1 bt"])
-	checkCleanBacktrace(t, blocks["goroutine 2 bt"])
 	checkCleanBacktrace(t, blocks["goroutine 1 bt at the end"])
 
 	btGoroutine1Re := regexp.MustCompile(`(?m)^#0\s+(0x[0-9a-f]+\s+in\s+)?main\.main.+at`)
@@ -301,12 +358,7 @@
 		t.Fatalf("goroutine 1 bt failed: %s", bl)
 	}
 
-	btGoroutine2Re := regexp.MustCompile(`(?m)^#0\s+(0x[0-9a-f]+\s+in\s+)?runtime.+at`)
-	if bl := blocks["goroutine 2 bt"]; !btGoroutine2Re.MatchString(bl) {
-		t.Fatalf("goroutine 2 bt failed: %s", bl)
-	}
-
-	if bl := blocks["goroutine all bt"]; !btGoroutine1Re.MatchString(bl) || !btGoroutine2Re.MatchString(bl) {
+	if bl := blocks["goroutine all bt"]; !btGoroutine1Re.MatchString(bl) {
 		t.Fatalf("goroutine all bt failed: %s", bl)
 	}
 
@@ -381,7 +433,11 @@
 		"-ex", "continue",
 		filepath.Join(dir, "a.exe"),
 	}
-	got, _ := exec.Command("gdb", args...).CombinedOutput()
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
 
 	// Check that the backtrace matches the source code.
 	bt := []string{
@@ -396,8 +452,7 @@
 		s := fmt.Sprintf("#%v.*main\\.%v", i, name)
 		re := regexp.MustCompile(s)
 		if found := re.Find(got) != nil; !found {
-			t.Errorf("could not find '%v' in backtrace", s)
-			t.Fatalf("gdb output:\n%v", string(got))
+			t.Fatalf("could not find '%v' in backtrace", s)
 		}
 	}
 }
@@ -456,7 +511,11 @@
 		"-ex", "info types astruct",
 		filepath.Join(dir, "a.exe"),
 	}
-	got, _ := exec.Command("gdb", args...).CombinedOutput()
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
 
 	sgot := string(got)
 
@@ -470,8 +529,7 @@
 	}
 	for _, name := range types {
 		if !strings.Contains(sgot, name) {
-			t.Errorf("could not find %s in 'info typrs astruct' output", name)
-			t.Fatalf("gdb output:\n%v", sgot)
+			t.Fatalf("could not find %s in 'info types astruct' output", name)
 		}
 	}
 }
@@ -525,12 +583,14 @@
 		"-ex", "print 'runtime._PageSize'",
 		filepath.Join(dir, "a.exe"),
 	}
-	got, _ := exec.Command("gdb", args...).CombinedOutput()
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
 
 	sgot := strings.ReplaceAll(string(got), "\r\n", "\n")
 
-	t.Logf("output %q", sgot)
-
 	if !strings.Contains(sgot, "\n$1 = 42\n$2 = 18446744073709551615\n$3 = -1\n$4 = 1 '\\001'\n$5 = 8192") {
 		t.Fatalf("output mismatch")
 	}
@@ -585,7 +645,11 @@
 		"-ex", "backtrace",
 		filepath.Join(dir, "a.exe"),
 	}
-	got, _ := exec.Command("gdb", args...).CombinedOutput()
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
 
 	// Check that the backtrace matches the source code.
 	bt := []string{
@@ -596,8 +660,91 @@
 		s := fmt.Sprintf("(#.* .* in )?main\\.%v", name)
 		re := regexp.MustCompile(s)
 		if found := re.Find(got) != nil; !found {
-			t.Errorf("could not find '%v' in backtrace", s)
-			t.Fatalf("gdb output:\n%v", string(got))
+			t.Fatalf("could not find '%v' in backtrace", s)
+		}
+	}
+}
+
+const InfCallstackSource = `
+package main
+import "C"
+import "time"
+
+func loop() {
+        for i := 0; i < 1000; i++ {
+                time.Sleep(time.Millisecond*5)
+        }
+}
+
+func main() {
+        go loop()
+        time.Sleep(time.Second * 1)
+}
+`
+
+// TestGdbInfCallstack tests that gdb can unwind the callstack of cgo programs
+// on arm64 platforms without endless frames of function 'crossfunc1'.
+// https://golang.org/issue/37238
+func TestGdbInfCallstack(t *testing.T) {
+	checkGdbEnvironment(t)
+
+	testenv.MustHaveCGO(t)
+	if runtime.GOARCH != "arm64" {
+		t.Skip("skipping infinite callstack test on non-arm64 arches")
+	}
+
+	t.Parallel()
+	checkGdbVersion(t)
+
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// Build the source code.
+	src := filepath.Join(dir, "main.go")
+	err = ioutil.WriteFile(src, []byte(InfCallstackSource), 0644)
+	if err != nil {
+		t.Fatalf("failed to create file: %v", err)
+	}
+	cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", "a.exe", "main.go")
+	cmd.Dir = dir
+	out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("building source %v\n%s", err, out)
+	}
+
+	// Execute gdb commands.
+	// 'setg_gcc' is the first point where we can reproduce the issue with just one 'run' command.
+	args := []string{"-nx", "-batch",
+		"-iex", "add-auto-load-safe-path " + filepath.Join(runtime.GOROOT(), "src", "runtime"),
+		"-ex", "set startup-with-shell off",
+		"-ex", "break setg_gcc",
+		"-ex", "run",
+		"-ex", "backtrace 3",
+		"-ex", "disable 1",
+		"-ex", "continue",
+		filepath.Join(dir, "a.exe"),
+	}
+	got, err := exec.Command("gdb", args...).CombinedOutput()
+	t.Logf("gdb output:\n%s", got)
+	if err != nil {
+		t.Fatalf("gdb exited with error: %v", err)
+	}
+
+	// Check that the backtrace matches.
+	// We check only the 3 innermost frames, since those are certainly present according to gcc_<OS>_arm64.c.
+	bt := []string{
+		`setg_gcc`,
+		`crosscall1`,
+		`threadentry`,
+	}
+	for i, name := range bt {
+		s := fmt.Sprintf("#%v.*%v", i, name)
+		re := regexp.MustCompile(s)
+		if found := re.Find(got) != nil; !found {
+			t.Fatalf("could not find '%v' in backtrace", s)
 		}
 	}
 }
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index ad29818..c65a534 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -312,9 +312,11 @@
 	madvdontneed       int32 // for Linux; issue 28466
 	sbrk               int32
 	scavenge           int32
+	scavtrace          int32
 	scheddetail        int32
 	schedtrace         int32
 	tracebackancestors int32
+	asyncpreemptoff    int32
 }
 
 var dbgvars = []dbgVar{
@@ -331,9 +333,11 @@
 	{"madvdontneed", &debug.madvdontneed},
 	{"sbrk", &debug.sbrk},
 	{"scavenge", &debug.scavenge},
+	{"scavtrace", &debug.scavtrace},
 	{"scheddetail", &debug.scheddetail},
 	{"schedtrace", &debug.schedtrace},
 	{"tracebackancestors", &debug.tracebackancestors},
+	{"asyncpreemptoff", &debug.asyncpreemptoff},
 }
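The new scavtrace and asyncpreemptoff knobs are read from GODEBUG when the process starts, so they are usually set in the environment of a child process. A sketch; the ./worker binary name is hypothetical, while the GODEBUG flag names are the ones registered above:

package main

import (
	"fmt"
	"os"
	"os/exec"
)

func main() {
	cmd := exec.Command("./worker") // hypothetical binary
	cmd.Env = append(os.Environ(), "GODEBUG=asyncpreemptoff=1,scavtrace=1")
	cmd.Stdout, cmd.Stderr = os.Stdout, os.Stderr
	if err := cmd.Run(); err != nil {
		fmt.Fprintln(os.Stderr, "run:", err)
	}
}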
 
 func parsedebugvars() {
@@ -455,11 +459,6 @@
 	}
 }
 
-//go:nosplit
-func gomcache() *mcache {
-	return getg().m.mcache
-}
-
 //go:linkname reflect_typelinks reflect.typelinks
 func reflect_typelinks() ([]unsafe.Pointer, [][]int32) {
 	modules := activeModules()
@@ -484,7 +483,7 @@
 	return unsafe.Pointer((*_type)(rtype).typeOff(typeOff(off)))
 }
 
-// reflect_resolveTextOff resolves an function pointer offset from a base type.
+// reflect_resolveTextOff resolves a function pointer offset from a base type.
 //go:linkname reflect_resolveTextOff reflect.resolveTextOff
 func reflect_resolveTextOff(rtype unsafe.Pointer, off int32) unsafe.Pointer {
 	return (*_type)(rtype).textOff(textOff(off))
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 16c02cd..cffdb0b 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -40,7 +40,7 @@
 
 	// _Grunning means this goroutine may execute user code. The
 	// stack is owned by this goroutine. It is not on a run queue.
-	// It is assigned an M and a P.
+	// It is assigned an M and a P (g.m and g.m.p are valid).
 	_Grunning // 2
 
 	// _Gsyscall means this goroutine is executing a system call.
@@ -78,6 +78,13 @@
 	// stack is owned by the goroutine that put it in _Gcopystack.
 	_Gcopystack // 8
 
+	// _Gpreempted means this goroutine stopped itself for a
+	// suspendG preemption. It is like _Gwaiting, but nothing is
+	// yet responsible for ready()ing it. Some suspendG must CAS
+	// the status to _Gwaiting to take responsibility for
+	// ready()ing this G.
+	_Gpreempted // 9
+
 	// _Gscan combined with one of the above states other than
 	// _Grunning indicates that GC is scanning the stack. The
 	// goroutine is not executing user code and the stack is owned
@@ -89,11 +96,12 @@
 	//
 	// atomicstatus&~Gscan gives the state the goroutine will
 	// return to when the scan completes.
-	_Gscan         = 0x1000
-	_Gscanrunnable = _Gscan + _Grunnable // 0x1001
-	_Gscanrunning  = _Gscan + _Grunning  // 0x1002
-	_Gscansyscall  = _Gscan + _Gsyscall  // 0x1003
-	_Gscanwaiting  = _Gscan + _Gwaiting  // 0x1004
+	_Gscan          = 0x1000
+	_Gscanrunnable  = _Gscan + _Grunnable  // 0x1001
+	_Gscanrunning   = _Gscan + _Grunning   // 0x1002
+	_Gscansyscall   = _Gscan + _Gsyscall   // 0x1003
+	_Gscanwaiting   = _Gscan + _Gwaiting   // 0x1004
+	_Gscanpreempted = _Gscan + _Gpreempted // 0x1009
 )
 
 const (
@@ -150,7 +158,10 @@
 // as fast as spin locks (just a few user-level instructions),
 // but on the contention path they sleep in the kernel.
 // A zeroed Mutex is unlocked (no need to initialize each lock).
+// Initialization is helpful for static lock ranking, but not required.
 type mutex struct {
+	// Empty struct if lock ranking is disabled, otherwise includes the lock rank
+	lockRankStruct
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
 	// Used to be a union, but unions break precise GC.
@@ -338,12 +349,9 @@
 
 	g *g
 
-	// isSelect indicates g is participating in a select, so
-	// g.selectDone must be CAS'd to win the wake-up race.
-	isSelect bool
-	next     *sudog
-	prev     *sudog
-	elem     unsafe.Pointer // data element (may point to stack)
+	next *sudog
+	prev *sudog
+	elem unsafe.Pointer // data element (may point to stack)
 
 	// The following fields are never accessed concurrently.
 	// For channels, waitlink is only accessed by g.
@@ -353,10 +361,15 @@
 	acquiretime int64
 	releasetime int64
 	ticket      uint32
-	parent      *sudog // semaRoot binary tree
-	waitlink    *sudog // g.waiting list or semaRoot
-	waittail    *sudog // semaRoot
-	c           *hchan // channel
+
+	// isSelect indicates g is participating in a select, so
+	// g.selectDone must be CAS'd to win the wake-up race.
+	isSelect bool
+
+	parent   *sudog // semaRoot binary tree
+	waitlink *sudog // g.waiting list or semaRoot
+	waittail *sudog // semaRoot
+	c        *hchan // channel
 }
 
 type libcall struct {
@@ -384,6 +397,12 @@
 	hi uintptr
 }
 
+// heldLockInfo gives info on a held lock and the rank of that lock
+type heldLockInfo struct {
+	lockAddr uintptr
+	rank     lockRank
+}
+
 type g struct {
 	// Stack parameters.
 	// stack describes the actual stack memory: [stack.lo, stack.hi).
@@ -396,31 +415,44 @@
 	stackguard0 uintptr // offset known to liblink
 	stackguard1 uintptr // offset known to liblink
 
-	_panic         *_panic // innermost panic - offset known to liblink
-	_defer         *_defer // innermost defer
-	m              *m      // current m; offset known to arm liblink
-	sched          gobuf
-	syscallsp      uintptr        // if status==Gsyscall, syscallsp = sched.sp to use during gc
-	syscallpc      uintptr        // if status==Gsyscall, syscallpc = sched.pc to use during gc
-	stktopsp       uintptr        // expected sp at top of stack, to check in traceback
-	param          unsafe.Pointer // passed parameter on wakeup
-	atomicstatus   uint32
-	stackLock      uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
-	goid           int64
-	schedlink      guintptr
-	waitsince      int64      // approx time when the g become blocked
-	waitreason     waitReason // if status==Gwaiting
-	preempt        bool       // preemption signal, duplicates stackguard0 = stackpreempt
-	paniconfault   bool       // panic (instead of crash) on unexpected fault address
-	preemptscan    bool       // preempted g does scan for gc
-	gcscandone     bool       // g has scanned stack; protected by _Gscan bit in status
-	gcscanvalid    bool       // false at start of gc cycle, true if G has not run since last scan; TODO: remove?
-	throwsplit     bool       // must not split stack
-	raceignore     int8       // ignore race detection events
-	sysblocktraced bool       // StartTrace has emitted EvGoInSyscall about this goroutine
-	sysexitticks   int64      // cputicks when syscall has returned (for tracing)
-	traceseq       uint64     // trace event sequencer
-	tracelastp     puintptr   // last P emitted an event for this goroutine
+	_panic       *_panic // innermost panic - offset known to liblink
+	_defer       *_defer // innermost defer
+	m            *m      // current m; offset known to arm liblink
+	sched        gobuf
+	syscallsp    uintptr        // if status==Gsyscall, syscallsp = sched.sp to use during gc
+	syscallpc    uintptr        // if status==Gsyscall, syscallpc = sched.pc to use during gc
+	stktopsp     uintptr        // expected sp at top of stack, to check in traceback
+	param        unsafe.Pointer // passed parameter on wakeup
+	atomicstatus uint32
+	stackLock    uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
+	goid         int64
+	schedlink    guintptr
+	waitsince    int64      // approx time when the g become blocked
+	waitreason   waitReason // if status==Gwaiting
+
+	preempt       bool // preemption signal, duplicates stackguard0 = stackpreempt
+	preemptStop   bool // transition to _Gpreempted on preemption; otherwise, just deschedule
+	preemptShrink bool // shrink stack at synchronous safe point
+
+	// asyncSafePoint is set if g is stopped at an asynchronous
+	// safe point. This means there are frames on the stack
+	// without precise pointer information.
+	asyncSafePoint bool
+
+	paniconfault bool // panic (instead of crash) on unexpected fault address
+	gcscandone   bool // g has scanned stack; protected by _Gscan bit in status
+	throwsplit   bool // must not split stack
+	// activeStackChans indicates that there are unlocked channels
+	// pointing into this goroutine's stack. If true, stack
+	// copying needs to acquire channel locks to protect these
+	// areas of the stack.
+	activeStackChans bool
+
+	raceignore     int8     // ignore race detection events
+	sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
+	sysexitticks   int64    // cputicks when syscall has returned (for tracing)
+	traceseq       uint64   // trace event sequencer
+	tracelastp     puintptr // last P emitted an event for this goroutine
 	lockedm        muintptr
 	sig            uint32
 	writebuf       []byte
@@ -489,7 +521,6 @@
 	park          note
 	alllink       *m // on allm
 	schedlink     muintptr
-	mcache        *mcache
 	lockedg       guintptr
 	createstack   [32]uintptr // stack that created this thread.
 	lockedExt     uint32      // tracking for external LockOSThread
@@ -501,8 +532,7 @@
 	waittraceskip int
 	startingtrace bool
 	syscalltick   uint32
-	thread        uintptr // thread handle
-	freelink      *m      // on sched.freem
+	freelink      *m // on sched.freem
 
 	// these are here because they are too large to be on the stack
 	// of low-level NOSPLIT functions.
@@ -515,9 +545,22 @@
 	vdsoSP uintptr // SP for traceback while in VDSO call (0 if not in call)
 	vdsoPC uintptr // PC for traceback while in VDSO call
 
+	// preemptGen counts the number of completed preemption
+	// signals. This is used to detect when a preemption is
+	// requested, but fails. Accessed atomically.
+	preemptGen uint32
+
+	// Whether this is a pending preemption signal on this M.
+	// Accessed atomically.
+	signalPending uint32
+
 	dlogPerM
 
 	mOS
+
+	// Up to 10 locks held by this m, maintained by the lock ranking code.
+	locksHeldLen int
+	locksHeld    [10]heldLockInfo
 }
 
 type p struct {
@@ -529,6 +572,7 @@
 	sysmontick  sysmontick // last tick observed by sysmon
 	m           muintptr   // back-link to associated m (nil if idle)
 	mcache      *mcache
+	pcache      pageCache
 	raceprocctx uintptr
 
 	deferpool    [5][]*_defer // pool of available defer structs of different sizes (see panic.go)
@@ -562,6 +606,17 @@
 	sudogcache []*sudog
 	sudogbuf   [128]*sudog
 
+	// Cache of mspan objects from the heap.
+	mspancache struct {
+		// We need an explicit length here because this field is used
+		// in allocation codepaths where write barriers are not allowed,
+		// and eliminating the write barrier/keeping it eliminated from
+		// slice updates is tricky, moreso than just managing the length
+		// ourselves.
+		len int
+		buf [128]*mspan
+	}
+
 	tracebuf traceBufPtr
 
 	// traceSweep indicates the sweep events should be traced.
@@ -576,6 +631,11 @@
 
 	_ uint32 // Alignment for atomic fields below
 
+	// The when field of the first entry on the timer heap.
+	// This is updated using atomic functions.
+	// This is 0 if the timer heap is empty.
+	timer0When uint64
+
 	// Per-P GC state
 	gcAssistTime         int64    // Nanoseconds in assistAlloc
 	gcFractionalMarkTime int64    // Nanoseconds in fractional mark worker (atomic)
@@ -598,13 +658,44 @@
 
 	runSafePointFn uint32 // if 1, run sched.safePointFn at next safe point
 
+	// Lock for timers. We normally access the timers while running
+	// on this P, but the scheduler can also do it from a different P.
+	timersLock mutex
+
+	// Actions to take at some time. This is used to implement the
+	// standard library's time package.
+	// Must hold timersLock to access.
+	timers []*timer
+
+	// Number of timers in P's heap.
+	// Modified using atomic instructions.
+	numTimers uint32
+
+	// Number of timerModifiedEarlier timers on P's heap.
+	// This should only be modified while holding timersLock,
+	// or while the timer status is in a transient state
+	// such as timerModifying.
+	adjustTimers uint32
+
+	// Number of timerDeleted timers in P's heap.
+	// Modified using atomic instructions.
+	deletedTimers uint32
+
+	// Race context used while executing timer functions.
+	timerRaceCtx uintptr
+
+	// preempt is set to indicate that this P should enter the
+	// scheduler ASAP (regardless of what G is running on it).
+	preempt bool
+
 	pad cpu.CacheLinePad
 }
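The timer fields added to p above come from the Go 1.14 timer rework: each P keeps its own timer heap and runs timer callbacks directly, without a dedicated timer goroutine. The public API is unchanged; a plain time.AfterFunc still behaves as before (sketch for orientation only):

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	var wg sync.WaitGroup
	wg.Add(1)
	time.AfterFunc(20*time.Millisecond, func() {
		fmt.Println("timer fired") // runs from the owning P's timer heap
		wg.Done()
	})
	wg.Wait()
}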
 
 type schedt struct {
 	// accessed atomically. keep at top to ensure alignment on 32-bit systems.
-	goidgen  uint64
-	lastpoll uint64
+	goidgen   uint64
+	lastpoll  uint64 // time of last network poll, 0 if currently polling
+	pollUntil uint64 // time to which current poll is sleeping
 
 	lock mutex
 
@@ -677,6 +768,12 @@
 
 	procresizetime int64 // nanotime() of last change to gomaxprocs
 	totaltime      int64 // ∫gomaxprocs dt up to procresizetime
+
+	// sysmonlock protects sysmon's actions on the runtime.
+	//
+	// Acquire and hold this mutex to block sysmon from interacting
+	// with the rest of the runtime.
+	sysmonlock mutex
 }
 
 // Values for the flags field of a sigTabT.
@@ -701,7 +798,7 @@
 	nameoff int32   // function name
 
 	args        int32  // in/out args size
-	deferreturn uint32 // offset of a deferreturn block from entry, if any.
+	deferreturn uint32 // offset of start of a deferreturn call instruction from entry, if any.
 
 	pcsp      int32
 	pcfile    int32
@@ -726,7 +823,7 @@
 // layout of Itab known to compilers
 // allocated in non-garbage-collected memory
 // Needs to be in sync with
-// ../cmd/compile/internal/gc/reflect.go:/^func.dumptypestructs.
+// ../cmd/compile/internal/gc/reflect.go:/^func.dumptabs.
 type itab struct {
 	inter *interfacetype
 	_type *_type
@@ -774,7 +871,7 @@
 }
 
 // A _defer holds an entry on the list of deferred calls.
-// If you add a field here, add code to clear it in freedefer.
+// If you add a field here, add code to clear it in freedefer and deferProcStack
 // This struct must match the code in cmd/compile/internal/gc/reflect.go:deferstruct
 // and cmd/compile/internal/gc/ssa.go:(*state).call.
 // Some defers will be allocated on the stack and some on the heap.
@@ -785,11 +882,27 @@
 	siz     int32 // includes both arguments and results
 	started bool
 	heap    bool
-	sp      uintptr // sp at time of defer
-	pc      uintptr
-	fn      *funcval
-	_panic  *_panic // panic that is running defer
-	link    *_defer
+	// openDefer indicates that this _defer is for a frame with open-coded
+	// defers. We have only one defer record for the entire frame (which may
+	// currently have 0, 1, or more defers active).
+	openDefer bool
+	sp        uintptr  // sp at time of defer
+	pc        uintptr  // pc at time of defer
+	fn        *funcval // can be nil for open-coded defers
+	_panic    *_panic  // panic that is running defer
+	link      *_defer
+
+	// If openDefer is true, the fields below record values about the stack
+	// frame and associated function that has the open-coded defer(s). sp
+	// above will be the sp for the frame, and pc will be address of the
+	// deferreturn call in the function.
+	fd   unsafe.Pointer // funcdata for the function associated with the frame
+	varp uintptr        // value of varp for the stack frame
+	// framepc is the current pc associated with the stack frame. Together,
+	// with sp above (which is the sp associated with the stack frame),
+	// framepc/sp can be used as pc/sp pair to continue a stack trace via
+	// gentraceback().
+	framepc uintptr
 }
 
 // A _panic holds information about an active panic.
@@ -807,8 +920,11 @@
 	argp      unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink
 	arg       interface{}    // argument to panic
 	link      *_panic        // link to earlier panic
+	pc        uintptr        // where to return to in runtime if this panic is bypassed
+	sp        unsafe.Pointer // where to return to in runtime if this panic is bypassed
 	recovered bool           // whether this panic is over
 	aborted   bool           // the panic was aborted
+	goexit    bool
 }
 
 // stack traces
@@ -863,7 +979,7 @@
 	waitReasonChanReceive                             // "chan receive"
 	waitReasonChanSend                                // "chan send"
 	waitReasonFinalizerWait                           // "finalizer wait"
-	waitReasonForceGGIdle                             // "force gc (idle)"
+	waitReasonForceGCIdle                             // "force gc (idle)"
 	waitReasonSemacquire                              // "semacquire"
 	waitReasonSleep                                   // "sleep"
 	waitReasonSyncCondWait                            // "sync.Cond.Wait"
@@ -871,6 +987,8 @@
 	waitReasonTraceReaderBlocked                      // "trace reader (blocked)"
 	waitReasonWaitForGCCycle                          // "wait for GC cycle"
 	waitReasonGCWorkerIdle                            // "GC worker (idle)"
+	waitReasonPreempted                               // "preempted"
+	waitReasonDebugCall                               // "debug call"
 )
 
 var waitReasonStrings = [...]string{
@@ -891,7 +1009,7 @@
 	waitReasonChanReceive:           "chan receive",
 	waitReasonChanSend:              "chan send",
 	waitReasonFinalizerWait:         "finalizer wait",
-	waitReasonForceGGIdle:           "force gc (idle)",
+	waitReasonForceGCIdle:           "force gc (idle)",
 	waitReasonSemacquire:            "semacquire",
 	waitReasonSleep:                 "sleep",
 	waitReasonSyncCondWait:          "sync.Cond.Wait",
@@ -899,6 +1017,8 @@
 	waitReasonTraceReaderBlocked:    "trace reader (blocked)",
 	waitReasonWaitForGCCycle:        "wait for GC cycle",
 	waitReasonGCWorkerIdle:          "GC worker (idle)",
+	waitReasonPreempted:             "preempted",
+	waitReasonDebugCall:             "debug call",
 }
 
 func (w waitReason) String() string {
@@ -922,7 +1042,7 @@
 	// Information about what cpu features are available.
 	// Packages outside the runtime should not use these
 	// as they are not an external api.
-	// Set on startup in asm_{386,amd64,amd64p32}.s
+	// Set on startup in asm_{386,amd64}.s
 	processorVersionInfo uint32
 	isIntel              bool
 	lfenceBeforeRdtsc    bool
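
Editor's note: the new p fields above move timers from the old global timer goroutine onto each P. The heap lives in p.timers under timersLock, while timer0When mirrors the earliest deadline so the scheduler and sysmon can poll it without taking the lock. The sketch below is a simplified, hypothetical model of that pattern written as ordinary Go (a sorted slice standing in for the heap, an atomic int64 standing in for timer0When); it is an illustration, not runtime code.

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// timers models one P's timer state: a heap (kept as a sorted slice here for
// brevity) guarded by a mutex, plus an atomically published copy of the
// earliest deadline so readers on other Ps can poll it lock-free.
type timers struct {
	earliest  int64      // atomic; 0 means "heap empty" (first field for 64-bit alignment on 32-bit targets)
	mu        sync.Mutex // stands in for timersLock
	deadlines []int64    // stands in for the []*timer heap
}

func (t *timers) add(when int64) {
	t.mu.Lock()
	defer t.mu.Unlock()
	t.deadlines = append(t.deadlines, when)
	// Keep the slice sorted so index 0 is always the earliest deadline.
	for i := len(t.deadlines) - 1; i > 0 && t.deadlines[i] < t.deadlines[i-1]; i-- {
		t.deadlines[i], t.deadlines[i-1] = t.deadlines[i-1], t.deadlines[i]
	}
	atomic.StoreInt64(&t.earliest, t.deadlines[0])
}

// nextWake is what a scheduler loop would call: no lock needed.
func (t *timers) nextWake() (when int64, ok bool) {
	w := atomic.LoadInt64(&t.earliest)
	return w, w != 0
}

func main() {
	var t timers
	t.add(time.Now().Add(50 * time.Millisecond).UnixNano())
	t.add(time.Now().Add(10 * time.Millisecond).UnixNano())
	if when, ok := t.nextWake(); ok {
		fmt.Println("sleep until:", time.Unix(0, when))
	}
}
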
diff --git a/src/runtime/runtime_linux_test.go b/src/runtime/runtime_linux_test.go
index 17d6fbd..cd59368 100644
--- a/src/runtime/runtime_linux_test.go
+++ b/src/runtime/runtime_linux_test.go
@@ -41,11 +41,11 @@
 	}
 }
 
-// Test that error values are negative. Use address 1 (a misaligned
-// pointer) to get -EINVAL.
+// Test that error values are negative.
+// Use a misaligned pointer to get -EINVAL.
 func TestMincoreErrorSign(t *testing.T) {
 	var dst byte
-	v := Mincore(unsafe.Pointer(uintptr(1)), 1, &dst)
+	v := Mincore(Add(unsafe.Pointer(new(int32)), 1), 1, &dst)
 
 	const EINVAL = 0x16
 	if v != -EINVAL {
@@ -54,7 +54,7 @@
 }
 
 func TestEpollctlErrorSign(t *testing.T) {
-	v := Epollctl(-1, 1, -1, unsafe.Pointer(&struct{}{}))
+	v := Epollctl(-1, 1, -1, unsafe.Pointer(&EpollEvent{}))
 
 	const EBADF = 0x09
 	if v != -EBADF {
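
Editor's note: the Mincore test now derives its misaligned pointer from a real allocation (via the runtime package's exported Add test helper) instead of manufacturing unsafe.Pointer(uintptr(1)) out of thin air, which checkptr-style validation rejects. A minimal stand-alone sketch of the same idea, using unsafe.Add (added to the language in a later Go release than this diff targets):

package main

import (
	"fmt"
	"unsafe"
)

func main() {
	// Start from a valid, aligned object so the derived pointer still lies
	// inside an allocated object, then offset it by one byte to make it
	// deliberately misaligned for an int32.
	x := new(int32)
	p := unsafe.Add(unsafe.Pointer(x), 1)
	fmt.Printf("aligned=%p misaligned=%p\n", x, p)
}
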
diff --git a/src/runtime/runtime_mmap_test.go b/src/runtime/runtime_mmap_test.go
index 6741e1d..bb0b747 100644
--- a/src/runtime/runtime_mmap_test.go
+++ b/src/runtime/runtime_mmap_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package runtime_test
 
diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index 5ea9cbd..e5d2d97 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -122,6 +122,21 @@
 	}
 }
 
+func BenchmarkPanicRecover(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		defer3()
+	}
+}
+
+func defer3() {
+	defer func(x, y, z int) {
+		if recover() == nil {
+			panic("failed recover")
+		}
+	}(1, 2, 3)
+	panic("hi")
+}
+
 // golang.org/issue/7063
 func TestStopCPUProfilingWithProfilerOff(t *testing.T) {
 	SetCPUProfileRate(0)
@@ -177,10 +192,11 @@
 	}
 }
 
+// testSetPanicOnFault tests one potentially faulting address.
+// It deliberately constructs and uses an invalid pointer,
+// so mark it as nocheckptr.
+//go:nocheckptr
 func testSetPanicOnFault(t *testing.T, addr uintptr, nfault *int) {
-	if GOOS == "nacl" {
-		t.Skip("nacl doesn't seem to fault on high addresses")
-	}
 	if GOOS == "js" {
 		t.Skip("js does not support catching faults")
 	}
@@ -278,32 +294,6 @@
 	}
 }
 
-func TestBadOpen(t *testing.T) {
-	if GOOS == "windows" || GOOS == "nacl" || GOOS == "js" {
-		t.Skip("skipping OS that doesn't have open/read/write/close")
-	}
-	// make sure we get the correct error code if open fails. Same for
-	// read/write/close on the resulting -1 fd. See issue 10052.
-	nonfile := []byte("/notreallyafile")
-	fd := Open(&nonfile[0], 0, 0)
-	if fd != -1 {
-		t.Errorf("open(\"%s\")=%d, want -1", string(nonfile), fd)
-	}
-	var buf [32]byte
-	r := Read(-1, unsafe.Pointer(&buf[0]), int32(len(buf)))
-	if r != -1 {
-		t.Errorf("read()=%d, want -1", r)
-	}
-	w := Write(^uintptr(0), unsafe.Pointer(&buf[0]), int32(len(buf)))
-	if w != -1 {
-		t.Errorf("write()=%d, want -1", w)
-	}
-	c := Close(-1)
-	if c != -1 {
-		t.Errorf("close()=%d, want -1", c)
-	}
-}
-
 func TestAppendGrowth(t *testing.T) {
 	var x []int64
 	check := func(want int) {
diff --git a/src/runtime/rwmutex.go b/src/runtime/rwmutex.go
index a6da4c9..7713c3f 100644
--- a/src/runtime/rwmutex.go
+++ b/src/runtime/rwmutex.go
@@ -39,7 +39,7 @@
 	if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 {
 		// A writer is pending. Park on the reader queue.
 		systemstack(func() {
-			lock(&rw.rLock)
+			lockWithRank(&rw.rLock, lockRankRwmutexR)
 			if rw.readerPass > 0 {
 				// Writer finished.
 				rw.readerPass -= 1
@@ -67,7 +67,7 @@
 		// A writer is pending.
 		if atomic.Xadd(&rw.readerWait, -1) == 0 {
 			// The last reader unblocks the writer.
-			lock(&rw.rLock)
+			lockWithRank(&rw.rLock, lockRankRwmutexR)
 			w := rw.writer.ptr()
 			if w != nil {
 				notewakeup(&w.park)
@@ -81,12 +81,12 @@
 // lock locks rw for writing.
 func (rw *rwmutex) lock() {
 	// Resolve competition with other writers and stick to our P.
-	lock(&rw.wLock)
+	lockWithRank(&rw.wLock, lockRankRwmutexW)
 	m := getg().m
 	// Announce that there is a pending writer.
 	r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders
 	// Wait for any active readers to complete.
-	lock(&rw.rLock)
+	lockWithRank(&rw.rLock, lockRankRwmutexR)
 	if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 {
 		// Wait for reader to wake us up.
 		systemstack(func() {
@@ -108,7 +108,7 @@
 		throw("unlock of unlocked rwmutex")
 	}
 	// Unblock blocked readers.
-	lock(&rw.rLock)
+	lockWithRank(&rw.rLock, lockRankRwmutexR)
 	for rw.readers.ptr() != nil {
 		reader := rw.readers.ptr()
 		rw.readers = reader.schedlink
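
Editor's note: the lock/lockWithRank swaps above feed the new static lock-ranking checker (built behind a GOEXPERIMENT in this release, as far as I can tell): each runtime mutex gets a rank, the ranks of held locks are recorded in the new m.locksHeld/locksHeldLen fields, and acquiring a lock out of order is flagged as a potential deadlock. The sketch below is a hypothetical, stand-alone illustration of that idea using a simple total order rather than the runtime's partial order; names are mine, not the runtime's.

package main

import (
	"fmt"
	"sync"
)

type rank int

const (
	rankSched rank = iota + 1
	rankTimers
	rankHchan
)

// rankedMutex panics if locks are acquired out of rank order. The held slice,
// passed in by the caller, stands in for the per-M locksHeld array.
type rankedMutex struct {
	mu sync.Mutex
	r  rank
}

func (m *rankedMutex) lock(held *[]rank) {
	for _, h := range *held {
		if m.r <= h {
			panic(fmt.Sprintf("lock ordering violation: acquiring rank %d while holding rank %d", m.r, h))
		}
	}
	m.mu.Lock()
	*held = append(*held, m.r)
}

func (m *rankedMutex) unlock(held *[]rank) {
	m.mu.Unlock()
	*held = (*held)[:len(*held)-1] // assumes LIFO unlock order, as in the runtime
}

func main() {
	var held []rank
	sched := &rankedMutex{r: rankSched}
	timers := &rankedMutex{r: rankTimers}

	sched.lock(&held)
	timers.lock(&held) // ok: rank increases
	timers.unlock(&held)
	sched.unlock(&held)
}
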
diff --git a/src/runtime/select.go b/src/runtime/select.go
index 85be1bc..a069e3e 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -14,7 +14,7 @@
 
 // scase.kind values.
 // Known to compiler.
-// Changes here must also be made in src/cmd/compile/internal/gc/select.go's walkselect.
+// Changes here must also be made in src/cmd/compile/internal/gc/select.go's walkselectcases.
 const (
 	caseNil = iota
 	caseRecv
@@ -75,6 +75,9 @@
 }
 
 func selparkcommit(gp *g, _ unsafe.Pointer) bool {
+	// There are unlocked sudogs that point into gp's stack. Stack
+	// copying must lock the channels of those sudogs.
+	gp.activeStackChans = true
 	// This must not access gp's stack (see gopark). In
 	// particular, it must not access the *hselect. That's okay,
 	// because by the time this is called, gp.waiting has all
@@ -105,8 +108,9 @@
 // selectgo implements the select statement.
 //
 // cas0 points to an array of type [ncases]scase, and order0 points to
-// an array of type [2*ncases]uint16. Both reside on the goroutine's
-// stack (regardless of any escaping in selectgo).
+// an array of type [2*ncases]uint16 where ncases must be <= 65536.
+// Both reside on the goroutine's stack (regardless of any escaping in
+// selectgo).
 //
 // selectgo returns the index of the chosen scase, which matches the
 // ordinal position of its respective select{recv,send,default} call.
@@ -117,6 +121,8 @@
 		print("select: cas0=", cas0, "\n")
 	}
 
+	// NOTE: In order to maintain a lean stack size, the number of scases
+	// is capped at 65536.
 	cas1 := (*[1 << 16]scase)(unsafe.Pointer(cas0))
 	order1 := (*[1 << 17]uint16)(unsafe.Pointer(order0))
 
@@ -311,6 +317,7 @@
 	// wait for someone to wake us up
 	gp.param = nil
 	gopark(selparkcommit, nil, waitReasonSelect, traceEvGoBlockSelect, 1)
+	gp.activeStackChans = false
 
 	sellock(scases, lockorder)
 
@@ -493,8 +500,6 @@
 }
 
 func (c *hchan) sortkey() uintptr {
-	// TODO(khr): if we have a moving garbage collector, we'll need to
-	// change this function.
 	return uintptr(unsafe.Pointer(c))
 }
 
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 30c8959..f94c1aa 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -129,7 +129,7 @@
 		s.acquiretime = t0
 	}
 	for {
-		lock(&root.lock)
+		lockWithRank(&root.lock, lockRankRoot)
 		// Add ourselves to nwait to disable "easy case" in semrelease.
 		atomic.Xadd(&root.nwait, 1)
 		// Check cansemacquire to avoid missed wakeup.
@@ -168,7 +168,7 @@
 	}
 
 	// Harder case: search for a waiter and wake it.
-	lock(&root.lock)
+	lockWithRank(&root.lock, lockRankRoot)
 	if atomic.Load(&root.nwait) == 0 {
 		// The count is already consumed by another goroutine,
 		// so no need to wake up another goroutine.
@@ -180,7 +180,7 @@
 		atomic.Xadd(&root.nwait, -1)
 	}
 	unlock(&root.lock)
-	if s != nil { // May be slow, so unlock first
+	if s != nil { // May be slow or even yield, so unlock first
 		acquiretime := s.acquiretime
 		if acquiretime != 0 {
 			mutexevent(t0-acquiretime, 3+skipframes)
@@ -192,6 +192,25 @@
 			s.ticket = 1
 		}
 		readyWithTime(s, 5+skipframes)
+		if s.ticket == 1 && getg().m.locks == 0 {
+			// Direct G handoff
+			// readyWithTime has added the waiter G as runnext in the
+			// current P; we now call the scheduler so that we start running
+			// the waiter G immediately.
+			// Note that waiter inherits our time slice: this is desirable
+			// to avoid having a highly contended semaphore hog the P
+			// indefinitely. goyield is like Gosched, but it emits a
+			// "preempted" trace event instead and, more importantly, puts
+			// the current G on the local runq instead of the global one.
+			// We only do this in the starving regime (handoff=true), as in
+			// the non-starving case it is possible for a different waiter
+			// to acquire the semaphore while we are yielding/scheduling,
+			// and this would be wasteful. We wait instead to enter starving
+			// regime, and then we start to do direct handoffs of ticket and
+			// P.
+			// See issue 33747 for discussion.
+			goyield()
+		}
 	}
 }
 
@@ -373,19 +392,11 @@
 func (root *semaRoot) rotateLeft(x *sudog) {
 	// p -> (x a (y b c))
 	p := x.parent
-	a, y := x.prev, x.next
-	b, c := y.prev, y.next
+	y := x.next
+	b := y.prev
 
 	y.prev = x
 	x.parent = y
-	y.next = c
-	if c != nil {
-		c.parent = y
-	}
-	x.prev = a
-	if a != nil {
-		a.parent = x
-	}
 	x.next = b
 	if b != nil {
 		b.parent = x
@@ -409,23 +420,15 @@
 func (root *semaRoot) rotateRight(y *sudog) {
 	// p -> (y (x a b) c)
 	p := y.parent
-	x, c := y.prev, y.next
-	a, b := x.prev, x.next
+	x := y.prev
+	b := x.next
 
-	x.prev = a
-	if a != nil {
-		a.parent = x
-	}
 	x.next = y
 	y.parent = x
 	y.prev = b
 	if b != nil {
 		b.parent = y
 	}
-	y.next = c
-	if c != nil {
-		c.parent = y
-	}
 
 	x.parent = p
 	if p == nil {
@@ -483,7 +486,7 @@
 // notifyListAdd was called, it returns immediately. Otherwise, it blocks.
 //go:linkname notifyListWait sync.runtime_notifyListWait
 func notifyListWait(l *notifyList, t uint32) {
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 
 	// Return right away if this ticket has already been notified.
 	if less(t, l.notify) {
@@ -525,7 +528,7 @@
 
 	// Pull the list out into a local variable, waiters will be readied
 	// outside the lock.
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 	s := l.head
 	l.head = nil
 	l.tail = nil
@@ -555,7 +558,7 @@
 		return
 	}
 
-	lock(&l.lock)
+	lockWithRank(&l.lock, lockRankNotifyList)
 
 	// Re-check under the lock if we need to do anything.
 	t := l.notify
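
Editor's note: the semrelease change above implements direct handoff. When the waiter's ticket is 1, readyWithTime has already placed it in runnext, and goyield immediately switches to it, so neither the releasing goroutine nor a third party can barge in and re-acquire the semaphore before the waiter runs. The toy below illustrates "hand the token to a specific waiter" (as opposed to "make it generally available") using plain channels; the structure and names are mine, not the runtime's.

package main

import "fmt"

// A waiter registers a private channel; release hands the token to that
// specific channel instead of a shared one, so no later goroutine can slip
// in between the release and the waiter's wakeup.
func main() {
	type token struct{}
	waiters := make(chan chan token, 1) // queue of registered waiters

	done := make(chan struct{})
	go func() {
		me := make(chan token)
		waiters <- me // enqueue myself (like semacquire adding a sudog)
		<-me          // woken only by a direct handoff
		fmt.Println("waiter received the token first")
		close(done)
	}()

	w := <-waiters // release path: pick the oldest waiter...
	w <- token{}   // ...and hand the token straight to it
	<-done
}
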
diff --git a/src/runtime/sema_test.go b/src/runtime/sema_test.go
new file mode 100644
index 0000000..cf3de0a
--- /dev/null
+++ b/src/runtime/sema_test.go
@@ -0,0 +1,103 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	. "runtime"
+	"sync"
+	"sync/atomic"
+	"testing"
+)
+
+// TestSemaHandoff checks that when semrelease+handoff is
+// requested, the G that releases the semaphore yields its
+// P directly to the first waiter in line.
+// See issue 33747 for discussion.
+func TestSemaHandoff(t *testing.T) {
+	const iter = 10000
+	ok := 0
+	for i := 0; i < iter; i++ {
+		if testSemaHandoff() {
+			ok++
+		}
+	}
+	// As long as two thirds of handoffs are direct, we
+	// consider the test successful. The scheduler is
+	// nondeterministic, so this test checks that we get the
+	// desired outcome in a significant majority of cases.
+	// The actual ratio of direct handoffs is much higher
+	// (>90%) but we use a lower threshold to minimize the
+	// chances that unrelated changes in the runtime will
+	// cause the test to fail or become flaky.
+	if ok < iter*2/3 {
+		t.Fatal("direct handoff < 2/3:", ok, iter)
+	}
+}
+
+func TestSemaHandoff1(t *testing.T) {
+	if GOMAXPROCS(-1) <= 1 {
+		t.Skip("GOMAXPROCS <= 1")
+	}
+	defer GOMAXPROCS(GOMAXPROCS(-1))
+	GOMAXPROCS(1)
+	TestSemaHandoff(t)
+}
+
+func TestSemaHandoff2(t *testing.T) {
+	if GOMAXPROCS(-1) <= 2 {
+		t.Skip("GOMAXPROCS <= 2")
+	}
+	defer GOMAXPROCS(GOMAXPROCS(-1))
+	GOMAXPROCS(2)
+	TestSemaHandoff(t)
+}
+
+func testSemaHandoff() bool {
+	var sema, res uint32
+	done := make(chan struct{})
+
+	// We're testing that the current goroutine is able to yield its time slice
+	// to another goroutine. Stop the current goroutine from migrating to
+	// another CPU where it can win the race (and appear to have not yielded) by
+	// keeping the CPUs slightly busy.
+	var wg sync.WaitGroup
+	for i := 0; i < GOMAXPROCS(-1); i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			for {
+				select {
+				case <-done:
+					return
+				default:
+				}
+				Gosched()
+			}
+		}()
+	}
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		Semacquire(&sema)
+		atomic.CompareAndSwapUint32(&res, 0, 1)
+
+		Semrelease1(&sema, true, 0)
+		close(done)
+	}()
+	for SemNwait(&sema) == 0 {
+		Gosched() // wait for goroutine to block in Semacquire
+	}
+
+	// The crux of the test: we release the semaphore with handoff
+	// and immediately perform a CAS both here and in the waiter; we
+	// want the CAS in the waiter to execute first.
+	Semrelease1(&sema, true, 0)
+	atomic.CompareAndSwapUint32(&res, 0, 2)
+
+	wg.Wait() // wait for goroutines to finish to avoid data races
+
+	return res == 1 // did the waiter run first?
+}
diff --git a/src/runtime/semasleep_test.go b/src/runtime/semasleep_test.go
index f5b4a50..9b371b0 100644
--- a/src/runtime/semasleep_test.go
+++ b/src/runtime/semasleep_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//+build !nacl,!plan9,!windows,!js
+// +build !plan9,!windows,!js
 
 package runtime_test
 
diff --git a/src/runtime/signal_386.go b/src/runtime/signal_386.go
index 143deb9..065aff4 100644
--- a/src/runtime/signal_386.go
+++ b/src/runtime/signal_386.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd
+// +build dragonfly freebsd linux netbsd openbsd
 
 package runtime
 
@@ -37,34 +37,22 @@
 
 // preparePanic sets up the stack to look like a call to sigpanic.
 func (c *sigctxt) preparePanic(sig uint32, gp *g) {
-	if GOOS == "darwin" {
-		// Work around Leopard bug that doesn't set FPE_INTDIV.
-		// Look at instruction to see if it is a divide.
-		// Not necessary in Snow Leopard (si_code will be != 0).
-		if sig == _SIGFPE && gp.sigcode0 == 0 {
-			pc := (*[4]byte)(unsafe.Pointer(gp.sigpc))
-			i := 0
-			if pc[i] == 0x66 { // 16-bit instruction prefix
-				i++
-			}
-			if pc[i] == 0xF6 || pc[i] == 0xF7 {
-				gp.sigcode0 = _FPE_INTDIV
-			}
-		}
-	}
-
 	pc := uintptr(c.eip())
 	sp := uintptr(c.esp())
 
 	if shouldPushSigpanic(gp, pc, *(*uintptr)(unsafe.Pointer(sp))) {
-		// Make it look like the faulting PC called sigpanic.
-		if sys.RegSize > sys.PtrSize {
-			sp -= sys.PtrSize
-			*(*uintptr)(unsafe.Pointer(sp)) = 0
-		}
-		sp -= sys.PtrSize
-		*(*uintptr)(unsafe.Pointer(sp)) = pc
-		c.set_esp(uint32(sp))
+		c.pushCall(funcPC(sigpanic), pc)
+	} else {
+		// Not safe to push the call. Just clobber the frame.
+		c.set_eip(uint32(funcPC(sigpanic)))
 	}
-	c.set_eip(uint32(funcPC(sigpanic)))
+}
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Make it look like we called target at resumePC.
+	sp := uintptr(c.esp())
+	sp -= sys.PtrSize
+	*(*uintptr)(unsafe.Pointer(sp)) = resumePC
+	c.set_esp(uint32(sp))
+	c.set_eip(uint32(targetPC))
 }
diff --git a/src/runtime/signal_amd64x.go b/src/runtime/signal_amd64.go
similarity index 81%
rename from src/runtime/signal_amd64x.go
rename to src/runtime/signal_amd64.go
index 9d59e26..6ab1f75 100644
--- a/src/runtime/signal_amd64x.go
+++ b/src/runtime/signal_amd64.go
@@ -2,8 +2,8 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 amd64p32
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+// +build amd64
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
 
 package runtime
 
@@ -66,14 +66,18 @@
 	sp := uintptr(c.rsp())
 
 	if shouldPushSigpanic(gp, pc, *(*uintptr)(unsafe.Pointer(sp))) {
-		// Make it look the like faulting PC called sigpanic.
-		if sys.RegSize > sys.PtrSize {
-			sp -= sys.PtrSize
-			*(*uintptr)(unsafe.Pointer(sp)) = 0
-		}
-		sp -= sys.PtrSize
-		*(*uintptr)(unsafe.Pointer(sp)) = pc
-		c.set_rsp(uint64(sp))
+		c.pushCall(funcPC(sigpanic), pc)
+	} else {
+		// Not safe to push the call. Just clobber the frame.
+		c.set_rip(uint64(funcPC(sigpanic)))
 	}
-	c.set_rip(uint64(funcPC(sigpanic)))
+}
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Make it look like we called target at resumePC.
+	sp := uintptr(c.rsp())
+	sp -= sys.PtrSize
+	*(*uintptr)(unsafe.Pointer(sp)) = resumePC
+	c.set_rsp(uint64(sp))
+	c.set_rip(uint64(targetPC))
 }
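
Editor's note: on x86 the new pushCall helper simulates a CALL instruction from the signal handler: it decrements SP, stores the resume address where a return address would live, and points the PC at the target (sigpanic here, asyncPreempt for preemption). The toy model below mirrors that mechanic with plain Go data so the frame manipulation is easier to follow; it is an illustration, not runtime code.

package main

import "fmt"

// cpu is a toy machine: a word-addressed stack, a stack index, and a PC.
type cpu struct {
	stack []uintptr
	sp    int
	pc    uintptr
}

// pushCall makes it look as if the code at c.pc had called targetPC:
// push the resume address, then jump to the target.
func (c *cpu) pushCall(targetPC, resumePC uintptr) {
	c.sp--                   // sp -= sys.PtrSize
	c.stack[c.sp] = resumePC // *(*uintptr)(unsafe.Pointer(sp)) = resumePC
	c.pc = targetPC          // set_rip(targetPC)
}

func main() {
	c := &cpu{stack: make([]uintptr, 8), sp: 8, pc: 0x1234}
	c.pushCall(0xa5a5 /* e.g. funcPC(asyncPreempt) */, c.pc /* resume here afterwards */)
	fmt.Printf("pc=%#x return-address-on-stack=%#x\n", c.pc, c.stack[c.sp])
}
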
diff --git a/src/runtime/signal_arm.go b/src/runtime/signal_arm.go
index bb597c5..156d9d3 100644
--- a/src/runtime/signal_arm.go
+++ b/src/runtime/signal_arm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin dragonfly freebsd linux nacl netbsd openbsd
+// +build dragonfly freebsd linux netbsd openbsd
 
 package runtime
 
@@ -62,3 +62,17 @@
 	c.set_r10(uint32(uintptr(unsafe.Pointer(gp))))
 	c.set_pc(uint32(funcPC(sigpanic)))
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra slot is known to gentraceback.
+	sp := c.sp() - 4
+	c.set_sp(sp)
+	*(*uint32)(unsafe.Pointer(uintptr(sp))) = c.lr()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_lr(uint32(resumePC))
+	c.set_pc(uint32(targetPC))
+}
diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go
index 7a3b1cc..3c20139 100644
--- a/src/runtime/signal_arm64.go
+++ b/src/runtime/signal_arm64.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build darwin linux netbsd openbsd
+// +build darwin freebsd linux netbsd openbsd
 
 package runtime
 
@@ -78,3 +78,17 @@
 	c.set_r28(uint64(uintptr(unsafe.Pointer(gp))))
 	c.set_pc(uint64(funcPC(sigpanic)))
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra space is known to gentraceback.
+	sp := c.sp() - 16 // SP needs 16-byte alignment
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_lr(uint64(resumePC))
+	c.set_pc(uint64(targetPC))
+}
diff --git a/src/runtime/signal_darwin_386.go b/src/runtime/signal_darwin_386.go
deleted file mode 100644
index 3dc5334..0000000
--- a/src/runtime/signal_darwin_386.go
+++ /dev/null
@@ -1,64 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-type sigctxt struct {
-	info *siginfo
-	ctxt unsafe.Pointer
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) regs() *regs32 { return &(*ucontext)(c.ctxt).uc_mcontext.ss }
-
-func (c *sigctxt) eax() uint32 { return c.regs().eax }
-func (c *sigctxt) ebx() uint32 { return c.regs().ebx }
-func (c *sigctxt) ecx() uint32 { return c.regs().ecx }
-func (c *sigctxt) edx() uint32 { return c.regs().edx }
-func (c *sigctxt) edi() uint32 { return c.regs().edi }
-func (c *sigctxt) esi() uint32 { return c.regs().esi }
-func (c *sigctxt) ebp() uint32 { return c.regs().ebp }
-func (c *sigctxt) esp() uint32 { return c.regs().esp }
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) eip() uint32 { return c.regs().eip }
-
-func (c *sigctxt) eflags() uint32  { return c.regs().eflags }
-func (c *sigctxt) cs() uint32      { return c.regs().cs }
-func (c *sigctxt) fs() uint32      { return c.regs().fs }
-func (c *sigctxt) gs() uint32      { return c.regs().gs }
-func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
-func (c *sigctxt) sigaddr() uint32 { return c.info.si_addr }
-
-func (c *sigctxt) set_eip(x uint32)     { c.regs().eip = x }
-func (c *sigctxt) set_esp(x uint32)     { c.regs().esp = x }
-func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
-func (c *sigctxt) set_sigaddr(x uint32) { c.info.si_addr = x }
-
-//go:nosplit
-func (c *sigctxt) fixsigcode(sig uint32) {
-	switch sig {
-	case _SIGTRAP:
-		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
-		// leaving no way to distinguish a breakpoint-induced SIGTRAP
-		// from an asynchronous signal SIGTRAP.
-		// They all look breakpoint-induced by default.
-		// Try looking at the code to see if it's a breakpoint.
-		// The assumption is that we're very unlikely to get an
-		// asynchronous SIGTRAP at just the moment that the
-		// PC started to point at unmapped memory.
-		pc := uintptr(c.eip())
-		// OS X will leave the pc just after the INT 3 instruction.
-		// INT 3 is usually 1 byte, but there is a 2-byte form.
-		code := (*[2]byte)(unsafe.Pointer(pc - 2))
-		if code[1] != 0xCC && (code[0] != 0xCD || code[1] != 3) {
-			// SIGTRAP on something other than INT 3.
-			c.set_sigcode(_SI_USER)
-		}
-	}
-}
diff --git a/src/runtime/signal_darwin_arm.go b/src/runtime/signal_darwin_arm.go
deleted file mode 100644
index 9098b10..0000000
--- a/src/runtime/signal_darwin_arm.go
+++ /dev/null
@@ -1,73 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-type sigctxt struct {
-	info *siginfo
-	ctxt unsafe.Pointer
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) regs() *regs32 { return &(*ucontext)(c.ctxt).uc_mcontext.ss }
-
-func (c *sigctxt) r0() uint32  { return c.regs().r[0] }
-func (c *sigctxt) r1() uint32  { return c.regs().r[1] }
-func (c *sigctxt) r2() uint32  { return c.regs().r[2] }
-func (c *sigctxt) r3() uint32  { return c.regs().r[3] }
-func (c *sigctxt) r4() uint32  { return c.regs().r[4] }
-func (c *sigctxt) r5() uint32  { return c.regs().r[5] }
-func (c *sigctxt) r6() uint32  { return c.regs().r[6] }
-func (c *sigctxt) r7() uint32  { return c.regs().r[7] }
-func (c *sigctxt) r8() uint32  { return c.regs().r[8] }
-func (c *sigctxt) r9() uint32  { return c.regs().r[9] }
-func (c *sigctxt) r10() uint32 { return c.regs().r[10] }
-func (c *sigctxt) fp() uint32  { return c.regs().r[11] }
-func (c *sigctxt) ip() uint32  { return c.regs().r[12] }
-func (c *sigctxt) sp() uint32  { return c.regs().sp }
-func (c *sigctxt) lr() uint32  { return c.regs().lr }
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) pc() uint32 { return c.regs().pc }
-
-func (c *sigctxt) cpsr() uint32    { return c.regs().cpsr }
-func (c *sigctxt) fault() uintptr  { return uintptr(c.info.si_addr) }
-func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
-func (c *sigctxt) trap() uint32    { return 0 }
-func (c *sigctxt) error() uint32   { return 0 }
-func (c *sigctxt) oldmask() uint32 { return 0 }
-
-func (c *sigctxt) set_pc(x uint32)  { c.regs().pc = x }
-func (c *sigctxt) set_sp(x uint32)  { c.regs().sp = x }
-func (c *sigctxt) set_lr(x uint32)  { c.regs().lr = x }
-func (c *sigctxt) set_r10(x uint32) { c.regs().r[10] = x }
-
-func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
-func (c *sigctxt) set_sigaddr(x uint32) { c.info.si_addr = x }
-
-//go:nosplit
-func (c *sigctxt) fixsigcode(sig uint32) {
-	switch sig {
-	case _SIGTRAP:
-		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
-		// leaving no way to distinguish a breakpoint-induced SIGTRAP
-		// from an asynchronous signal SIGTRAP.
-		// They all look breakpoint-induced by default.
-		// Try looking at the code to see if it's a breakpoint.
-		// The assumption is that we're very unlikely to get an
-		// asynchronous SIGTRAP at just the moment that the
-		// PC started to point at unmapped memory.
-		pc := uintptr(c.pc())
-		// OS X will leave the pc just after the instruction.
-		code := (*uint32)(unsafe.Pointer(pc - 4))
-		if *code != 0xe7f001f0 {
-			// SIGTRAP on something other than breakpoint.
-			c.set_sigcode(_SI_USER)
-		}
-	}
-}
diff --git a/src/runtime/signal_freebsd_arm64.go b/src/runtime/signal_freebsd_arm64.go
new file mode 100644
index 0000000..159e965
--- /dev/null
+++ b/src/runtime/signal_freebsd_arm64.go
@@ -0,0 +1,66 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) regs() *mcontext { return &(*ucontext)(c.ctxt).uc_mcontext }
+
+func (c *sigctxt) r0() uint64  { return c.regs().mc_gpregs.gp_x[0] }
+func (c *sigctxt) r1() uint64  { return c.regs().mc_gpregs.gp_x[1] }
+func (c *sigctxt) r2() uint64  { return c.regs().mc_gpregs.gp_x[2] }
+func (c *sigctxt) r3() uint64  { return c.regs().mc_gpregs.gp_x[3] }
+func (c *sigctxt) r4() uint64  { return c.regs().mc_gpregs.gp_x[4] }
+func (c *sigctxt) r5() uint64  { return c.regs().mc_gpregs.gp_x[5] }
+func (c *sigctxt) r6() uint64  { return c.regs().mc_gpregs.gp_x[6] }
+func (c *sigctxt) r7() uint64  { return c.regs().mc_gpregs.gp_x[7] }
+func (c *sigctxt) r8() uint64  { return c.regs().mc_gpregs.gp_x[8] }
+func (c *sigctxt) r9() uint64  { return c.regs().mc_gpregs.gp_x[9] }
+func (c *sigctxt) r10() uint64 { return c.regs().mc_gpregs.gp_x[10] }
+func (c *sigctxt) r11() uint64 { return c.regs().mc_gpregs.gp_x[11] }
+func (c *sigctxt) r12() uint64 { return c.regs().mc_gpregs.gp_x[12] }
+func (c *sigctxt) r13() uint64 { return c.regs().mc_gpregs.gp_x[13] }
+func (c *sigctxt) r14() uint64 { return c.regs().mc_gpregs.gp_x[14] }
+func (c *sigctxt) r15() uint64 { return c.regs().mc_gpregs.gp_x[15] }
+func (c *sigctxt) r16() uint64 { return c.regs().mc_gpregs.gp_x[16] }
+func (c *sigctxt) r17() uint64 { return c.regs().mc_gpregs.gp_x[17] }
+func (c *sigctxt) r18() uint64 { return c.regs().mc_gpregs.gp_x[18] }
+func (c *sigctxt) r19() uint64 { return c.regs().mc_gpregs.gp_x[19] }
+func (c *sigctxt) r20() uint64 { return c.regs().mc_gpregs.gp_x[20] }
+func (c *sigctxt) r21() uint64 { return c.regs().mc_gpregs.gp_x[21] }
+func (c *sigctxt) r22() uint64 { return c.regs().mc_gpregs.gp_x[22] }
+func (c *sigctxt) r23() uint64 { return c.regs().mc_gpregs.gp_x[23] }
+func (c *sigctxt) r24() uint64 { return c.regs().mc_gpregs.gp_x[24] }
+func (c *sigctxt) r25() uint64 { return c.regs().mc_gpregs.gp_x[25] }
+func (c *sigctxt) r26() uint64 { return c.regs().mc_gpregs.gp_x[26] }
+func (c *sigctxt) r27() uint64 { return c.regs().mc_gpregs.gp_x[27] }
+func (c *sigctxt) r28() uint64 { return c.regs().mc_gpregs.gp_x[28] }
+func (c *sigctxt) r29() uint64 { return c.regs().mc_gpregs.gp_x[29] }
+func (c *sigctxt) lr() uint64  { return c.regs().mc_gpregs.gp_lr }
+func (c *sigctxt) sp() uint64  { return c.regs().mc_gpregs.gp_sp }
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) pc() uint64 { return c.regs().mc_gpregs.gp_elr }
+
+func (c *sigctxt) fault() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_pc(x uint64)  { c.regs().mc_gpregs.gp_elr = x }
+func (c *sigctxt) set_sp(x uint64)  { c.regs().mc_gpregs.gp_sp = x }
+func (c *sigctxt) set_lr(x uint64)  { c.regs().mc_gpregs.gp_lr = x }
+func (c *sigctxt) set_r28(x uint64) { c.regs().mc_gpregs.gp_x[28] = x }
+
+func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) { c.info.si_addr = x }
diff --git a/src/runtime/signal_linux_riscv64.go b/src/runtime/signal_linux_riscv64.go
new file mode 100644
index 0000000..9f68e5c
--- /dev/null
+++ b/src/runtime/signal_linux_riscv64.go
@@ -0,0 +1,68 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) regs() *sigcontext { return &(*ucontext)(c.ctxt).uc_mcontext }
+
+func (c *sigctxt) ra() uint64  { return c.regs().sc_regs.ra }
+func (c *sigctxt) sp() uint64  { return c.regs().sc_regs.sp }
+func (c *sigctxt) gp() uint64  { return c.regs().sc_regs.gp }
+func (c *sigctxt) tp() uint64  { return c.regs().sc_regs.tp }
+func (c *sigctxt) t0() uint64  { return c.regs().sc_regs.t0 }
+func (c *sigctxt) t1() uint64  { return c.regs().sc_regs.t1 }
+func (c *sigctxt) t2() uint64  { return c.regs().sc_regs.t2 }
+func (c *sigctxt) s0() uint64  { return c.regs().sc_regs.s0 }
+func (c *sigctxt) s1() uint64  { return c.regs().sc_regs.s1 }
+func (c *sigctxt) a0() uint64  { return c.regs().sc_regs.a0 }
+func (c *sigctxt) a1() uint64  { return c.regs().sc_regs.a1 }
+func (c *sigctxt) a2() uint64  { return c.regs().sc_regs.a2 }
+func (c *sigctxt) a3() uint64  { return c.regs().sc_regs.a3 }
+func (c *sigctxt) a4() uint64  { return c.regs().sc_regs.a4 }
+func (c *sigctxt) a5() uint64  { return c.regs().sc_regs.a5 }
+func (c *sigctxt) a6() uint64  { return c.regs().sc_regs.a6 }
+func (c *sigctxt) a7() uint64  { return c.regs().sc_regs.a7 }
+func (c *sigctxt) s2() uint64  { return c.regs().sc_regs.s2 }
+func (c *sigctxt) s3() uint64  { return c.regs().sc_regs.s3 }
+func (c *sigctxt) s4() uint64  { return c.regs().sc_regs.s4 }
+func (c *sigctxt) s5() uint64  { return c.regs().sc_regs.s5 }
+func (c *sigctxt) s6() uint64  { return c.regs().sc_regs.s6 }
+func (c *sigctxt) s7() uint64  { return c.regs().sc_regs.s7 }
+func (c *sigctxt) s8() uint64  { return c.regs().sc_regs.s8 }
+func (c *sigctxt) s9() uint64  { return c.regs().sc_regs.s9 }
+func (c *sigctxt) s10() uint64 { return c.regs().sc_regs.s10 }
+func (c *sigctxt) s11() uint64 { return c.regs().sc_regs.s11 }
+func (c *sigctxt) t3() uint64  { return c.regs().sc_regs.t3 }
+func (c *sigctxt) t4() uint64  { return c.regs().sc_regs.t4 }
+func (c *sigctxt) t5() uint64  { return c.regs().sc_regs.t5 }
+func (c *sigctxt) t6() uint64  { return c.regs().sc_regs.t6 }
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) pc() uint64 { return c.regs().sc_regs.pc }
+
+func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_pc(x uint64) { c.regs().sc_regs.pc = x }
+func (c *sigctxt) set_ra(x uint64) { c.regs().sc_regs.ra = x }
+func (c *sigctxt) set_sp(x uint64) { c.regs().sc_regs.sp = x }
+func (c *sigctxt) set_gp(x uint64) { c.regs().sc_regs.gp = x }
+
+func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) {
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
+}
diff --git a/src/runtime/signal_linux_s390x.go b/src/runtime/signal_linux_s390x.go
index 6892f63..12d5c31 100644
--- a/src/runtime/signal_linux_s390x.go
+++ b/src/runtime/signal_linux_s390x.go
@@ -109,3 +109,17 @@
 	c.set_r13(uint64(uintptr(unsafe.Pointer(gp))))
 	c.set_pc(uint64(funcPC(sigpanic)))
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra slot is known to gentraceback.
+	sp := c.sp() - 8
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_link(uint64(resumePC))
+	c.set_pc(uint64(targetPC))
+}
diff --git a/src/runtime/signal_mips64x.go b/src/runtime/signal_mips64x.go
index 1b96842..040c959 100644
--- a/src/runtime/signal_mips64x.go
+++ b/src/runtime/signal_mips64x.go
@@ -84,3 +84,17 @@
 	c.set_r30(uint64(uintptr(unsafe.Pointer(gp))))
 	c.set_pc(sigpanicPC)
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra slot is known to gentraceback.
+	sp := c.sp() - 8
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_link(uint64(resumePC))
+	c.set_pc(uint64(targetPC))
+}
diff --git a/src/runtime/signal_mipsx.go b/src/runtime/signal_mipsx.go
index e223c28..8c29f59 100644
--- a/src/runtime/signal_mipsx.go
+++ b/src/runtime/signal_mipsx.go
@@ -79,3 +79,17 @@
 	c.set_r30(uint32(uintptr(unsafe.Pointer(gp))))
 	c.set_pc(uint32(funcPC(sigpanic)))
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra slot is known to gentraceback.
+	sp := c.sp() - 4
+	c.set_sp(sp)
+	*(*uint32)(unsafe.Pointer(uintptr(sp))) = c.link()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_link(uint32(resumePC))
+	c.set_pc(uint32(targetPC))
+}
diff --git a/src/runtime/signal_nacl.go b/src/runtime/signal_nacl.go
deleted file mode 100644
index ad321d8..0000000
--- a/src/runtime/signal_nacl.go
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type sigTabT struct {
-	flags int32
-	name  string
-}
-
-var sigtable = [...]sigTabT{
-	/* 0 */ {0, "SIGNONE: no trap"},
-	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
-	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
-	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
-	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
-	/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
-	/* 9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
-	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
-	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
-	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
-	/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
-	/* 16 */ {_SigNotify + _SigIgn, "SIGURG: urgent condition on socket"},
-	/* 17 */ {0, "SIGSTOP: stop"},
-	/* 18 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTSTP: keyboard stop"},
-	/* 19 */ {_SigNotify + _SigDefault + _SigIgn, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify + _SigIgn, "SIGCHLD: child status has changed"},
-	/* 21 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTIN: background read from tty"},
-	/* 22 */ {_SigNotify + _SigDefault + _SigIgn, "SIGTTOU: background write to tty"},
-	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
-	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
-	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
-	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
-	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
-	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
-	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
-	/* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"},
-}
diff --git a/src/runtime/signal_nacl_386.go b/src/runtime/signal_nacl_386.go
deleted file mode 100644
index 1a30a89..0000000
--- a/src/runtime/signal_nacl_386.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-type sigctxt struct {
-	info *siginfo
-	ctxt unsafe.Pointer
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) regs() *excregs386 { return &(*exccontext)(c.ctxt).regs }
-
-func (c *sigctxt) eax() uint32 { return c.regs().eax }
-func (c *sigctxt) ebx() uint32 { return c.regs().ebx }
-func (c *sigctxt) ecx() uint32 { return c.regs().ecx }
-func (c *sigctxt) edx() uint32 { return c.regs().edx }
-func (c *sigctxt) edi() uint32 { return c.regs().edi }
-func (c *sigctxt) esi() uint32 { return c.regs().esi }
-func (c *sigctxt) ebp() uint32 { return c.regs().ebp }
-func (c *sigctxt) esp() uint32 { return c.regs().esp }
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) eip() uint32 { return c.regs().eip }
-
-func (c *sigctxt) eflags() uint32  { return c.regs().eflags }
-func (c *sigctxt) cs() uint32      { return ^uint32(0) }
-func (c *sigctxt) fs() uint32      { return ^uint32(0) }
-func (c *sigctxt) gs() uint32      { return ^uint32(0) }
-func (c *sigctxt) sigcode() uint32 { return ^uint32(0) }
-func (c *sigctxt) sigaddr() uint32 { return 0 }
-
-func (c *sigctxt) set_eip(x uint32)     { c.regs().eip = x }
-func (c *sigctxt) set_esp(x uint32)     { c.regs().esp = x }
-func (c *sigctxt) set_sigcode(x uint32) {}
-func (c *sigctxt) set_sigaddr(x uint32) {}
diff --git a/src/runtime/signal_nacl_amd64p32.go b/src/runtime/signal_nacl_amd64p32.go
deleted file mode 100644
index 81bbdc5..0000000
--- a/src/runtime/signal_nacl_amd64p32.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-func nacl_sysinfo(di uint32) // cross-assembly-file call; declared for vet
-
-type sigctxt struct {
-	info *siginfo
-	ctxt unsafe.Pointer
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) regs() *excregsamd64 {
-	return &(*exccontext)(c.ctxt).regs
-}
-
-func (c *sigctxt) rax() uint64 { return c.regs().rax }
-func (c *sigctxt) rbx() uint64 { return c.regs().rbx }
-func (c *sigctxt) rcx() uint64 { return c.regs().rcx }
-func (c *sigctxt) rdx() uint64 { return c.regs().rdx }
-func (c *sigctxt) rdi() uint64 { return c.regs().rdi }
-func (c *sigctxt) rsi() uint64 { return c.regs().rsi }
-func (c *sigctxt) rbp() uint64 { return c.regs().rbp }
-func (c *sigctxt) rsp() uint64 { return c.regs().rsp }
-func (c *sigctxt) r8() uint64  { return c.regs().r8 }
-func (c *sigctxt) r9() uint64  { return c.regs().r9 }
-func (c *sigctxt) r10() uint64 { return c.regs().r10 }
-func (c *sigctxt) r11() uint64 { return c.regs().r11 }
-func (c *sigctxt) r12() uint64 { return c.regs().r12 }
-func (c *sigctxt) r13() uint64 { return c.regs().r13 }
-func (c *sigctxt) r14() uint64 { return c.regs().r14 }
-func (c *sigctxt) r15() uint64 { return c.regs().r15 }
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) rip() uint64 { return c.regs().rip }
-
-func (c *sigctxt) rflags() uint64  { return uint64(c.regs().rflags) }
-func (c *sigctxt) cs() uint64      { return ^uint64(0) }
-func (c *sigctxt) fs() uint64      { return ^uint64(0) }
-func (c *sigctxt) gs() uint64      { return ^uint64(0) }
-func (c *sigctxt) sigcode() uint64 { return ^uint64(0) }
-func (c *sigctxt) sigaddr() uint64 { return 0 }
-
-func (c *sigctxt) set_rip(x uint64)     { c.regs().rip = x }
-func (c *sigctxt) set_rsp(x uint64)     { c.regs().rsp = x }
-func (c *sigctxt) set_sigcode(x uint64) {}
-func (c *sigctxt) set_sigaddr(x uint64) {}
diff --git a/src/runtime/signal_nacl_arm.go b/src/runtime/signal_nacl_arm.go
deleted file mode 100644
index b831232..0000000
--- a/src/runtime/signal_nacl_arm.go
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-type sigctxt struct {
-	info *siginfo
-	ctxt unsafe.Pointer
-}
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) regs() *excregsarm { return &(*exccontext)(c.ctxt).regs }
-
-func (c *sigctxt) r0() uint32  { return c.regs().r0 }
-func (c *sigctxt) r1() uint32  { return c.regs().r1 }
-func (c *sigctxt) r2() uint32  { return c.regs().r2 }
-func (c *sigctxt) r3() uint32  { return c.regs().r3 }
-func (c *sigctxt) r4() uint32  { return c.regs().r4 }
-func (c *sigctxt) r5() uint32  { return c.regs().r5 }
-func (c *sigctxt) r6() uint32  { return c.regs().r6 }
-func (c *sigctxt) r7() uint32  { return c.regs().r7 }
-func (c *sigctxt) r8() uint32  { return c.regs().r8 }
-func (c *sigctxt) r9() uint32  { return c.regs().r9 }
-func (c *sigctxt) r10() uint32 { return c.regs().r10 }
-func (c *sigctxt) fp() uint32  { return c.regs().r11 }
-func (c *sigctxt) ip() uint32  { return c.regs().r12 }
-func (c *sigctxt) sp() uint32  { return c.regs().sp }
-func (c *sigctxt) lr() uint32  { return c.regs().lr }
-
-//go:nosplit
-//go:nowritebarrierrec
-func (c *sigctxt) pc() uint32 { return c.regs().pc }
-
-func (c *sigctxt) cpsr() uint32    { return c.regs().cpsr }
-func (c *sigctxt) fault() uintptr  { return ^uintptr(0) }
-func (c *sigctxt) trap() uint32    { return ^uint32(0) }
-func (c *sigctxt) error() uint32   { return ^uint32(0) }
-func (c *sigctxt) oldmask() uint32 { return ^uint32(0) }
-
-func (c *sigctxt) sigcode() uint32 { return 0 }
-func (c *sigctxt) sigaddr() uint32 { return 0 }
-
-func (c *sigctxt) set_pc(x uint32)  { c.regs().pc = x }
-func (c *sigctxt) set_sp(x uint32)  { c.regs().sp = x }
-func (c *sigctxt) set_lr(x uint32)  { c.regs().lr = x }
-func (c *sigctxt) set_r10(x uint32) { c.regs().r10 = x }
-
-func (c *sigctxt) set_sigcode(x uint32) {}
-func (c *sigctxt) set_sigaddr(x uint32) {}
diff --git a/src/runtime/signal_ppc64x.go b/src/runtime/signal_ppc64x.go
index cac1a23..5de93a3 100644
--- a/src/runtime/signal_ppc64x.go
+++ b/src/runtime/signal_ppc64x.go
@@ -85,3 +85,27 @@
 	c.set_r12(uint64(funcPC(sigpanic)))
 	c.set_pc(uint64(funcPC(sigpanic)))
 }
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra space is known to gentraceback.
+	sp := c.sp() - sys.MinFrameSize
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+	// In PIC mode, we'll set up (i.e. clobber) R2 on function
+	// entry. Save it ahead of time.
+	// In PIC mode it requires that R12 point to the function entry,
+	// so we'll set it up when pushing the call. Save it ahead
+	// of time as well.
+	// 8(SP) and 16(SP) are unused space in the reserved
+	// MinFrameSize (32) bytes.
+	*(*uint64)(unsafe.Pointer(uintptr(sp) + 8)) = c.r2()
+	*(*uint64)(unsafe.Pointer(uintptr(sp) + 16)) = c.r12()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_link(uint64(resumePC))
+	c.set_r12(uint64(targetPC))
+	c.set_pc(uint64(targetPC))
+}
diff --git a/src/runtime/signal_riscv64.go b/src/runtime/signal_riscv64.go
new file mode 100644
index 0000000..93363a4
--- /dev/null
+++ b/src/runtime/signal_riscv64.go
@@ -0,0 +1,93 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux,riscv64
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+func dumpregs(c *sigctxt) {
+	print("ra  ", hex(c.ra()), "\t")
+	print("sp  ", hex(c.sp()), "\n")
+	print("gp  ", hex(c.gp()), "\t")
+	print("tp  ", hex(c.tp()), "\n")
+	print("t0  ", hex(c.t0()), "\t")
+	print("t1  ", hex(c.t1()), "\n")
+	print("t2  ", hex(c.t2()), "\t")
+	print("s0  ", hex(c.s0()), "\n")
+	print("s1  ", hex(c.s1()), "\t")
+	print("a0  ", hex(c.a0()), "\n")
+	print("a1  ", hex(c.a1()), "\t")
+	print("a2  ", hex(c.a2()), "\n")
+	print("a3  ", hex(c.a3()), "\t")
+	print("a4  ", hex(c.a4()), "\n")
+	print("a5  ", hex(c.a5()), "\t")
+	print("a6  ", hex(c.a6()), "\n")
+	print("a7  ", hex(c.a7()), "\t")
+	print("s2  ", hex(c.s2()), "\n")
+	print("s3  ", hex(c.s3()), "\t")
+	print("s4  ", hex(c.s4()), "\n")
+	print("s5  ", hex(c.s5()), "\t")
+	print("s6  ", hex(c.s6()), "\n")
+	print("s7  ", hex(c.s7()), "\t")
+	print("s8  ", hex(c.s8()), "\n")
+	print("s9  ", hex(c.s9()), "\t")
+	print("s10 ", hex(c.s10()), "\n")
+	print("s11 ", hex(c.s11()), "\t")
+	print("t3  ", hex(c.t3()), "\n")
+	print("t4  ", hex(c.t4()), "\t")
+	print("t5  ", hex(c.t5()), "\n")
+	print("t6  ", hex(c.t6()), "\t")
+	print("pc  ", hex(c.pc()), "\n")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func (c *sigctxt) sigpc() uintptr { return uintptr(c.pc()) }
+
+func (c *sigctxt) sigsp() uintptr { return uintptr(c.sp()) }
+func (c *sigctxt) siglr() uintptr { return uintptr(c.ra()) }
+func (c *sigctxt) fault() uintptr { return uintptr(c.sigaddr()) }
+
+// preparePanic sets up the stack to look like a call to sigpanic.
+func (c *sigctxt) preparePanic(sig uint32, gp *g) {
+	// We arrange RA, and pc to pretend the panicking
+	// function calls sigpanic directly.
+	// Always save RA to stack so that panics in leaf
+	// functions are correctly handled. This smashes
+	// the stack frame but we're not going back there
+	// anyway.
+	sp := c.sp() - sys.PtrSize
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.ra()
+
+	pc := gp.sigpc
+
+	if shouldPushSigpanic(gp, pc, uintptr(c.ra())) {
+		// Make it look like the faulting PC called sigpanic.
+		c.set_ra(uint64(pc))
+	}
+
+	// In case we are panicking from external C code
+	c.set_gp(uint64(uintptr(unsafe.Pointer(gp))))
+	c.set_pc(uint64(funcPC(sigpanic)))
+}
+
+func (c *sigctxt) pushCall(targetPC, resumePC uintptr) {
+	// Push the LR to stack, as we'll clobber it in order to
+	// push the call. The function being pushed is responsible
+	// for restoring the LR and setting the SP back.
+	// This extra slot is known to gentraceback.
+	sp := c.sp() - sys.PtrSize
+	c.set_sp(sp)
+	*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.ra()
+	// Set up PC and LR to pretend the function being signaled
+	// calls targetPC at resumePC.
+	c.set_ra(uint64(resumePC))
+	c.set_pc(uint64(targetPC))
+}
diff --git a/src/runtime/signal_sighandler.go b/src/runtime/signal_sighandler.go
deleted file mode 100644
index bec4653..0000000
--- a/src/runtime/signal_sighandler.go
+++ /dev/null
@@ -1,154 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build aix darwin dragonfly freebsd linux nacl netbsd openbsd solaris
-
-package runtime
-
-import (
-	"unsafe"
-)
-
-// crashing is the number of m's we have waited for when implementing
-// GOTRACEBACK=crash when a signal is received.
-var crashing int32
-
-// testSigtrap is used by the runtime tests. If non-nil, it is called
-// on SIGTRAP. If it returns true, the normal behavior on SIGTRAP is
-// suppressed.
-var testSigtrap func(info *siginfo, ctxt *sigctxt, gp *g) bool
-
-// sighandler is invoked when a signal occurs. The global g will be
-// set to a gsignal goroutine and we will be running on the alternate
-// signal stack. The parameter g will be the value of the global g
-// when the signal occurred. The sig, info, and ctxt parameters are
-// from the system signal handler: they are the parameters passed when
-// the SA is passed to the sigaction system call.
-//
-// The garbage collector may have stopped the world, so write barriers
-// are not allowed.
-//
-//go:nowritebarrierrec
-func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
-	_g_ := getg()
-	c := &sigctxt{info, ctxt}
-
-	if sig == _SIGPROF {
-		sigprof(c.sigpc(), c.sigsp(), c.siglr(), gp, _g_.m)
-		return
-	}
-
-	if sig == _SIGTRAP && testSigtrap != nil && testSigtrap(info, (*sigctxt)(noescape(unsafe.Pointer(c))), gp) {
-		return
-	}
-
-	flags := int32(_SigThrow)
-	if sig < uint32(len(sigtable)) {
-		flags = sigtable[sig].flags
-	}
-	if flags&_SigPanic != 0 && gp.throwsplit {
-		// We can't safely sigpanic because it may grow the
-		// stack. Abort in the signal handler instead.
-		flags = (flags &^ _SigPanic) | _SigThrow
-	}
-	if isAbortPC(c.sigpc()) {
-		// On many architectures, the abort function just
-		// causes a memory fault. Don't turn that into a panic.
-		flags = _SigThrow
-	}
-	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
-		// The signal is going to cause a panic.
-		// Arrange the stack so that it looks like the point
-		// where the signal occurred made a call to the
-		// function sigpanic. Then set the PC to sigpanic.
-
-		// Have to pass arguments out of band since
-		// augmenting the stack frame would break
-		// the unwinding code.
-		gp.sig = sig
-		gp.sigcode0 = uintptr(c.sigcode())
-		gp.sigcode1 = uintptr(c.fault())
-		gp.sigpc = c.sigpc()
-
-		c.preparePanic(sig, gp)
-		return
-	}
-
-	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
-		if sigsend(sig) {
-			return
-		}
-	}
-
-	if c.sigcode() == _SI_USER && signal_ignored(sig) {
-		return
-	}
-
-	if flags&_SigKill != 0 {
-		dieFromSignal(sig)
-	}
-
-	if flags&_SigThrow == 0 {
-		return
-	}
-
-	_g_.m.throwing = 1
-	_g_.m.caughtsig.set(gp)
-
-	if crashing == 0 {
-		startpanic_m()
-	}
-
-	if sig < uint32(len(sigtable)) {
-		print(sigtable[sig].name, "\n")
-	} else {
-		print("Signal ", sig, "\n")
-	}
-
-	print("PC=", hex(c.sigpc()), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n")
-	if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
-		print("signal arrived during cgo execution\n")
-		gp = _g_.m.lockedg.ptr()
-	}
-	print("\n")
-
-	level, _, docrash := gotraceback()
-	if level > 0 {
-		goroutineheader(gp)
-		tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp)
-		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
-			// tracebackothers on original m skipped this one; trace it now.
-			goroutineheader(_g_.m.curg)
-			traceback(^uintptr(0), ^uintptr(0), 0, _g_.m.curg)
-		} else if crashing == 0 {
-			tracebackothers(gp)
-			print("\n")
-		}
-		dumpregs(c)
-	}
-
-	if docrash {
-		crashing++
-		if crashing < mcount()-int32(extraMCount) {
-			// There are other m's that need to dump their stacks.
-			// Relay SIGQUIT to the next m by sending it to the current process.
-			// All m's that have already received SIGQUIT have signal masks blocking
-			// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
-			// When the last m receives the SIGQUIT, it will fall through to the call to
-			// crash below. Just in case the relaying gets botched, each m involved in
-			// the relay sleeps for 5 seconds and then does the crash/exit itself.
-			// In expected operation, the last m has received the SIGQUIT and run
-			// crash/exit and the process is gone, all long before any of the
-			// 5-second sleeps have finished.
-			print("\n-----\n\n")
-			raiseproc(_SIGQUIT)
-			usleep(5 * 1000 * 1000)
-		}
-		crash()
-	}
-
-	printDebugLog()
-
-	exit(2)
-}
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index ad51dc1..5aedbf7 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -38,6 +38,38 @@
 	_SIG_IGN uintptr = 1
 )
 
+// sigPreempt is the signal used for non-cooperative preemption.
+//
+// There's no good way to choose this signal, but there are some
+// heuristics:
+//
+// 1. It should be a signal that's passed-through by debuggers by
+// default. On Linux, this is SIGALRM, SIGURG, SIGCHLD, SIGIO,
+// SIGVTALRM, SIGPROF, and SIGWINCH, plus some glibc-internal signals.
+//
+// 2. It shouldn't be used internally by libc in mixed Go/C binaries
+// because libc may assume it's the only thing that can handle these
+// signals. For example SIGCANCEL or SIGSETXID.
+//
+// 3. It should be a signal that can happen spuriously without
+// consequences. For example, SIGALRM is a bad choice because the
+// signal handler can't tell if it was caused by the real process
+// alarm or not (arguably this means the signal is broken, but I
+// digress). SIGUSR1 and SIGUSR2 are also bad because those are often
+// used in meaningful ways by applications.
+//
+// 4. We need to deal with platforms without real-time signals (like
+// macOS), so those are out.
+//
+// We use SIGURG because it meets all of these criteria, is extremely
+// unlikely to be used by an application for its "real" meaning (both
+// because out-of-band data is basically unused and because SIGURG
+// doesn't report which socket has the condition, making it pretty
+// useless), and even if it is, the application has to be ready for
+// spurious SIGURG. SIGIO wouldn't be a bad choice either, but is more
+// likely to be used for real.
+const sigPreempt = _SIGURG
+
 // Stores the signal handlers registered before Go installed its own.
 // These signal handlers will be invoked in cases where Go doesn't want to
 // handle a particular signal (e.g., signal occurred on a non-Go thread).
@@ -242,10 +274,26 @@
 		}
 	} else {
 		// If the Go signal handler should be disabled by default,
-		// disable it if it is enabled.
+		// switch back to the signal handler that was installed
+		// when we enabled profiling. We don't try to handle the case
+		// of a program that changes the SIGPROF handler while Go
+		// profiling is enabled.
+		//
+		// If no signal handler was installed before, then start
+		// ignoring SIGPROF signals. We do this, rather than change
+		// to SIG_DFL, because there may be a pending SIGPROF
+		// signal that has not yet been delivered to some other thread.
+		// If we change to SIG_DFL here, the program will crash
+		// when that SIGPROF is delivered. We assume that programs
+		// that use profiling don't want to crash on a stray SIGPROF.
+		// See issue 19320.
 		if !sigInstallGoHandler(_SIGPROF) {
 			if atomic.Cas(&handlingSig[_SIGPROF], 1, 0) {
-				setsig(_SIGPROF, atomic.Loaduintptr(&fwdSig[_SIGPROF]))
+				h := atomic.Loaduintptr(&fwdSig[_SIGPROF])
+				if h == _SIG_DFL {
+					h = _SIG_IGN
+				}
+				setsig(_SIGPROF, h)
 			}
 		}
 	}
@@ -274,6 +322,78 @@
 	dieFromSignal(_SIGPIPE)
 }
 
+// doSigPreempt handles a preemption signal on gp.
+func doSigPreempt(gp *g, ctxt *sigctxt) {
+	// Check if this G wants to be preempted and is safe to
+	// preempt.
+	if wantAsyncPreempt(gp) {
+		if ok, newpc := isAsyncSafePoint(gp, ctxt.sigpc(), ctxt.sigsp(), ctxt.siglr()); ok {
+			// Adjust the PC and inject a call to asyncPreempt.
+			ctxt.pushCall(funcPC(asyncPreempt), newpc)
+		}
+	}
+
+	// Acknowledge the preemption.
+	atomic.Xadd(&gp.m.preemptGen, 1)
+	atomic.Store(&gp.m.signalPending, 0)
+}
+
+const preemptMSupported = true
+
+// preemptM sends a preemption request to mp. This request may be
+// handled asynchronously and may be coalesced with other requests to
+// the M. When the request is received, if the running G or P are
+// marked for preemption and the goroutine is at an asynchronous
+// safe-point, it will preempt the goroutine. It always atomically
+// increments mp.preemptGen after handling a preemption request.
+func preemptM(mp *m) {
+	if GOOS == "darwin" && GOARCH == "arm64" && !iscgo {
+		// On darwin, we use libc calls, and cgo is required on ARM64
+		// so we have TLS set up to save/restore G during C calls. If cgo is
+		// absent, we cannot save/restore G in TLS, and if a signal is
+		// received during C execution we cannot get the G. Therefore don't
+		// send signals.
+		// This can only happen in the go_bootstrap program (otherwise cgo is
+		// required).
+		return
+	}
+	if atomic.Cas(&mp.signalPending, 0, 1) {
+		// If multiple threads are preempting the same M, they may send many
+		// signals to it such that it hardly makes progress, causing a
+		// live-lock problem. Apparently this could happen on darwin. See
+		// issue #37741.
+		// Only send a signal if there isn't already one pending.
+		signalM(mp, sigPreempt)
+	}
+}
+
+// sigFetchG fetches the value of G safely when running in a signal handler.
+// On some architectures, the g value may be clobbered when running in a VDSO.
+// See issue #32912.
+//
+//go:nosplit
+func sigFetchG(c *sigctxt) *g {
+	switch GOARCH {
+	case "arm", "arm64":
+		if !iscgo && inVDSOPage(c.sigpc()) {
+			// When using cgo, we save the g on TLS and load it from there
+			// in sigtramp. Just use that.
+			// Otherwise, before making a VDSO call we save the g to the
+			// bottom of the signal stack. Fetch from there.
+			// TODO: in efence mode, stack is sysAlloc'd, so this wouldn't
+			// work.
+			sp := getcallersp()
+			s := spanOf(sp)
+			if s != nil && s.state.get() == mSpanManual && s.base() < sp && sp < s.limit {
+				gp := *(**g)(unsafe.Pointer(s.base()))
+				return gp
+			}
+			return nil
+		}
+	}
+	return getg()
+}
+
 // sigtrampgo is called from the signal handler function, sigtramp,
 // written in assembly code.
 // This is called by the signal handler, and the world may be stopped.
@@ -289,56 +409,34 @@
 	if sigfwdgo(sig, info, ctx) {
 		return
 	}
-	g := getg()
+	c := &sigctxt{info, ctx}
+	g := sigFetchG(c)
+	setg(g)
 	if g == nil {
-		c := &sigctxt{info, ctx}
 		if sig == _SIGPROF {
 			sigprofNonGoPC(c.sigpc())
 			return
 		}
+		if sig == sigPreempt && preemptMSupported && debug.asyncpreemptoff == 0 {
+			// This is probably a signal from preemptM sent
+			// while executing Go code but received while
+			// executing non-Go code.
+			// We got past sigfwdgo, so we know that there is
+			// no non-Go signal handler for sigPreempt.
+			// The default behavior for sigPreempt is to ignore
+			// the signal, so badsignal will be a no-op anyway.
+			return
+		}
 		c.fixsigcode(sig)
 		badsignal(uintptr(sig), c)
 		return
 	}
 
 	// If some non-Go code called sigaltstack, adjust.
-	setStack := false
 	var gsignalStack gsignalStack
-	sp := uintptr(unsafe.Pointer(&sig))
-	if sp < g.m.gsignal.stack.lo || sp >= g.m.gsignal.stack.hi {
-		if sp >= g.m.g0.stack.lo && sp < g.m.g0.stack.hi {
-			// The signal was delivered on the g0 stack.
-			// This can happen when linked with C code
-			// using the thread sanitizer, which collects
-			// signals then delivers them itself by calling
-			// the signal handler directly when C code,
-			// including C code called via cgo, calls a
-			// TSAN-intercepted function such as malloc.
-			st := stackt{ss_size: g.m.g0.stack.hi - g.m.g0.stack.lo}
-			setSignalstackSP(&st, g.m.g0.stack.lo)
-			setGsignalStack(&st, &gsignalStack)
-			g.m.gsignal.stktopsp = getcallersp()
-			setStack = true
-		} else {
-			var st stackt
-			sigaltstack(nil, &st)
-			if st.ss_flags&_SS_DISABLE != 0 {
-				setg(nil)
-				needm(0)
-				noSignalStack(sig)
-				dropm()
-			}
-			stsp := uintptr(unsafe.Pointer(st.ss_sp))
-			if sp < stsp || sp >= stsp+st.ss_size {
-				setg(nil)
-				needm(0)
-				sigNotOnStack(sig)
-				dropm()
-			}
-			setGsignalStack(&st, &gsignalStack)
-			g.m.gsignal.stktopsp = getcallersp()
-			setStack = true
-		}
+	setStack := adjustSignalStack(sig, g.m, &gsignalStack)
+	if setStack {
+		g.m.gsignal.stktopsp = getcallersp()
 	}
 
 	setg(g.m.gsignal)
@@ -347,7 +445,6 @@
 		signalDuringFork(sig)
 	}
 
-	c := &sigctxt{info, ctx}
 	c.fixsigcode(sig)
 	sighandler(sig, info, ctx, g)
 	setg(g)
@@ -356,6 +453,235 @@
 	}
 }
 
+// adjustSignalStack adjusts the current stack guard based on the
+// stack pointer that is actually in use while handling a signal.
+// We do this in case some non-Go code called sigaltstack.
+// This reports whether the stack was adjusted, and if so stores the old
+// signal stack in *gsigstack.
+//go:nosplit
+func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool {
+	sp := uintptr(unsafe.Pointer(&sig))
+	if sp >= mp.gsignal.stack.lo && sp < mp.gsignal.stack.hi {
+		return false
+	}
+
+	if sp >= mp.g0.stack.lo && sp < mp.g0.stack.hi {
+		// The signal was delivered on the g0 stack.
+		// This can happen when linked with C code
+		// using the thread sanitizer, which collects
+		// signals then delivers them itself by calling
+		// the signal handler directly when C code,
+		// including C code called via cgo, calls a
+		// TSAN-intercepted function such as malloc.
+		st := stackt{ss_size: mp.g0.stack.hi - mp.g0.stack.lo}
+		setSignalstackSP(&st, mp.g0.stack.lo)
+		setGsignalStack(&st, gsigStack)
+		return true
+	}
+
+	var st stackt
+	sigaltstack(nil, &st)
+	if st.ss_flags&_SS_DISABLE != 0 {
+		setg(nil)
+		needm(0)
+		noSignalStack(sig)
+		dropm()
+	}
+	stsp := uintptr(unsafe.Pointer(st.ss_sp))
+	if sp < stsp || sp >= stsp+st.ss_size {
+		setg(nil)
+		needm(0)
+		sigNotOnStack(sig)
+		dropm()
+	}
+	setGsignalStack(&st, gsigStack)
+	return true
+}
+
+// crashing is the number of m's we have waited for when implementing
+// GOTRACEBACK=crash when a signal is received.
+var crashing int32
+
+// testSigtrap and testSigusr1 are used by the runtime tests. If
+// non-nil, they are called on SIGTRAP and SIGUSR1 respectively. If the
+// hook returns true, the normal behavior on that signal is suppressed.
+var testSigtrap func(info *siginfo, ctxt *sigctxt, gp *g) bool
+var testSigusr1 func(gp *g) bool
+
+// sighandler is invoked when a signal occurs. The global g will be
+// set to a gsignal goroutine and we will be running on the alternate
+// signal stack. The parameter g will be the value of the global g
+// when the signal occurred. The sig, info, and ctxt parameters are
+// from the system signal handler: they are the parameters the kernel
+// passes to the handler registered with the sigaction system call.
+//
+// The garbage collector may have stopped the world, so write barriers
+// are not allowed.
+//
+//go:nowritebarrierrec
+func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
+	_g_ := getg()
+	c := &sigctxt{info, ctxt}
+
+	if sig == _SIGPROF {
+		sigprof(c.sigpc(), c.sigsp(), c.siglr(), gp, _g_.m)
+		return
+	}
+
+	if sig == _SIGTRAP && testSigtrap != nil && testSigtrap(info, (*sigctxt)(noescape(unsafe.Pointer(c))), gp) {
+		return
+	}
+
+	if sig == _SIGUSR1 && testSigusr1 != nil && testSigusr1(gp) {
+		return
+	}
+
+	if sig == sigPreempt {
+		// Might be a preemption signal.
+		doSigPreempt(gp, c)
+		// Even if this was definitely a preemption signal, it
+		// may have been coalesced with another signal, so we
+		// still let it through to the application.
+	}
+
+	flags := int32(_SigThrow)
+	if sig < uint32(len(sigtable)) {
+		flags = sigtable[sig].flags
+	}
+	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 && gp.throwsplit {
+		// We can't safely sigpanic because it may grow the
+		// stack. Abort in the signal handler instead.
+		flags = _SigThrow
+	}
+	if isAbortPC(c.sigpc()) {
+		// On many architectures, the abort function just
+		// causes a memory fault. Don't turn that into a panic.
+		flags = _SigThrow
+	}
+	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
+		// The signal is going to cause a panic.
+		// Arrange the stack so that it looks like the point
+		// where the signal occurred made a call to the
+		// function sigpanic. Then set the PC to sigpanic.
+
+		// Have to pass arguments out of band since
+		// augmenting the stack frame would break
+		// the unwinding code.
+		gp.sig = sig
+		gp.sigcode0 = uintptr(c.sigcode())
+		gp.sigcode1 = uintptr(c.fault())
+		gp.sigpc = c.sigpc()
+
+		c.preparePanic(sig, gp)
+		return
+	}
+
+	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
+		if sigsend(sig) {
+			return
+		}
+	}
+
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
+	if flags&_SigKill != 0 {
+		dieFromSignal(sig)
+	}
+
+	// _SigThrow means that we should exit now.
+	// If we get here with _SigPanic, it means that the signal
+	// was sent to us by a program (c.sigcode() == _SI_USER);
+	// in that case, if we didn't handle it in sigsend, we exit now.
+	if flags&(_SigThrow|_SigPanic) == 0 {
+		return
+	}
+
+	_g_.m.throwing = 1
+	_g_.m.caughtsig.set(gp)
+
+	if crashing == 0 {
+		startpanic_m()
+	}
+
+	if sig < uint32(len(sigtable)) {
+		print(sigtable[sig].name, "\n")
+	} else {
+		print("Signal ", sig, "\n")
+	}
+
+	print("PC=", hex(c.sigpc()), " m=", _g_.m.id, " sigcode=", c.sigcode(), "\n")
+	if _g_.m.lockedg != 0 && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
+		print("signal arrived during cgo execution\n")
+		gp = _g_.m.lockedg.ptr()
+	}
+	if sig == _SIGILL {
+		// It would be nice to know how long the instruction is.
+		// Unfortunately, that's complicated to do in general (mostly for x86
+		// and s390x, but other archs have non-standard instruction lengths also).
+		// Opt to print 16 bytes, which covers most instructions.
+		const maxN = 16
+		n := uintptr(maxN)
+		// We have to be careful, though. If we're near the end of
+		// a page and the following page isn't mapped, we could
+		// segfault. So make sure we don't straddle a page (even though
+		// that could lead to printing an incomplete instruction).
+		// We're assuming here we can read at least the page containing the PC.
+		// I suppose it is possible that the page is mapped executable but not readable?
+		pc := c.sigpc()
+		if n > physPageSize-pc%physPageSize {
+			n = physPageSize - pc%physPageSize
+		}
+		print("instruction bytes:")
+		b := (*[maxN]byte)(unsafe.Pointer(pc))
+		for i := uintptr(0); i < n; i++ {
+			print(" ", hex(b[i]))
+		}
+		println()
+	}
+	print("\n")
+
+	level, _, docrash := gotraceback()
+	if level > 0 {
+		goroutineheader(gp)
+		tracebacktrap(c.sigpc(), c.sigsp(), c.siglr(), gp)
+		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
+			// tracebackothers on original m skipped this one; trace it now.
+			goroutineheader(_g_.m.curg)
+			traceback(^uintptr(0), ^uintptr(0), 0, _g_.m.curg)
+		} else if crashing == 0 {
+			tracebackothers(gp)
+			print("\n")
+		}
+		dumpregs(c)
+	}
+
+	if docrash {
+		crashing++
+		if crashing < mcount()-int32(extraMCount) {
+			// There are other m's that need to dump their stacks.
+			// Relay SIGQUIT to the next m by sending it to the current process.
+			// All m's that have already received SIGQUIT have signal masks blocking
+			// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
+			// When the last m receives the SIGQUIT, it will fall through to the call to
+			// crash below. Just in case the relaying gets botched, each m involved in
+			// the relay sleeps for 5 seconds and then does the crash/exit itself.
+			// In expected operation, the last m has received the SIGQUIT and run
+			// crash/exit and the process is gone, all long before any of the
+			// 5-second sleeps have finished.
+			print("\n-----\n\n")
+			raiseproc(_SIGQUIT)
+			usleep(5 * 1000 * 1000)
+		}
+		crash()
+	}
+
+	printDebugLog()
+
+	exit(2)
+}
+
 // sigpanic turns a synchronous signal into a run-time panic.
 // If the signal handler sees a synchronous panic, it arranges the
 // stack to look like the function where the signal occurred called
@@ -588,11 +914,22 @@
 	throw("signal received during fork")
 }
 
+var badginsignalMsg = "fatal: bad g in signal handler\n"
+
 // This runs on a foreign stack, without an m or a g. No stack split.
 //go:nosplit
 //go:norace
 //go:nowritebarrierrec
 func badsignal(sig uintptr, c *sigctxt) {
+	if !iscgo && !cgoHasExtraM {
+		// There is no extra M. needm will not be able to grab
+		// an M. Instead of hanging, just crash.
+		// Cannot call split-stack function as there is no G.
+		s := stringStructOf(&badginsignalMsg)
+		write(2, s.str, int32(s.len))
+		exit(2)
+		*(*uintptr)(unsafe.Pointer(uintptr(123))) = 2
+	}
 	needm(0)
 	if !sigsend(uint32(sig)) {
 		// A foreign thread received the signal sig, and the
@@ -636,6 +973,13 @@
 		return true
 	}
 
+	// This function and its caller sigtrampgo assume SIGPIPE is delivered on the
+	// originating thread. This property does not hold on macOS (golang.org/issue/33384),
+	// so we have no choice but to ignore SIGPIPE.
+	if GOOS == "darwin" && sig == _SIGPIPE {
+		return true
+	}
+
 	// If there is no handler to forward to, no need to forward.
 	if fwdFn == _SIG_DFL {
 		return false
@@ -650,9 +994,10 @@
 		return false
 	}
 	// Determine if the signal occurred inside Go code. We test that:
-	//   (1) we were in a goroutine (i.e., m.curg != nil), and
-	//   (2) we weren't in CGO.
-	g := getg()
+	//   (1) we weren't in VDSO page,
+	//   (2) we were in a goroutine (i.e., m.curg != nil), and
+	//   (3) we weren't in CGO.
+	g := sigFetchG(c)
 	if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo {
 		return false
 	}
@@ -724,13 +1069,15 @@
 // stack to the gsignal stack. If the alternate signal stack is set
 // for the thread (the case when a non-Go thread sets the alternate
 // signal stack and then calls a Go function) then set the gsignal
-// stack to the alternate signal stack. Record which choice was made
-// in newSigstack, so that it can be undone in unminit.
+// stack to the alternate signal stack. We also set the alternate
+// signal stack to the gsignal stack if cgo is not used (regardless
+// of whether it is already set). Record which choice was made in
+// newSigstack, so that it can be undone in unminit.
 func minitSignalStack() {
 	_g_ := getg()
 	var st stackt
 	sigaltstack(nil, &st)
-	if st.ss_flags&_SS_DISABLE != 0 {
+	if st.ss_flags&_SS_DISABLE != 0 || !iscgo {
 		signalstack(&_g_.m.gsignal.stack)
 		_g_.m.newSigstack = true
 	} else {
@@ -845,7 +1192,7 @@
 	sigaltstack(&st, nil)
 }
 
-// setsigsegv is used on darwin/arm{,64} to fake a segmentation fault.
+// setsigsegv is used on darwin/arm64 to fake a segmentation fault.
 //
 // This is exported via linkname to assembly in runtime/cgo.
 //
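Because the runtime now uses SIGURG for asynchronous preemption (see the sigPreempt comment in this file) and deliberately lets the signal through to the application, a program that subscribes to SIGURG must treat every delivery as possibly spurious. A minimal Unix-only sketch of that, independent of this patch:

	package main

	import (
		"fmt"
		"os"
		"os/signal"
		"syscall"
	)

	func main() {
		// The runtime may raise SIGURG on its own threads to preempt
		// goroutines, so a SIGURG seen here may have nothing to do with
		// out-of-band socket data.
		ch := make(chan os.Signal, 1)
		signal.Notify(ch, syscall.SIGURG)
		for range ch {
			// Re-check the real condition instead of assuming the
			// signal is meaningful.
			fmt.Println("SIGURG received; possibly from runtime preemption")
		}
	}
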
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index 3fc1ec5..d123276 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -129,7 +129,14 @@
 	// make the trace look like a call to runtime·sigpanic instead.
 	// (Otherwise the trace will end at runtime·sigpanic and we
 	// won't get to see who faulted.)
-	if r.ip() != 0 {
+	// Also don't push a sigpanic frame if the faulting PC
+	// is the entry of asyncPreempt. In this case, we suspended
+	// the thread right between the fault and the exception handler
+	// starting to run, and we have pushed an asyncPreempt call.
+	// The exception is not from asyncPreempt, so don't push a
+	// sigpanic call to make it look like it is. Instead, just
+	// overwrite the PC. (See issue #35773)
+	if r.ip() != 0 && r.ip() != funcPC(asyncPreempt) {
 		sp := unsafe.Pointer(r.sp())
 		sp = add(sp, ^(unsafe.Sizeof(uintptr(0)) - 1)) // sp--
 		r.set_sp(uintptr(sp))
@@ -171,6 +178,12 @@
 //
 //go:nosplit
 func lastcontinuehandler(info *exceptionrecord, r *context, gp *g) int32 {
+	if islibrary || isarchive {
+		// A Go DLL/archive has been loaded in a non-Go program.
+		// If the exception does not originate from Go, the Go runtime
+		// should not take responsibility for crashing the process.
+		return _EXCEPTION_CONTINUE_SEARCH
+	}
 	if testingWER {
 		return _EXCEPTION_CONTINUE_SEARCH
 	}
diff --git a/src/runtime/signal_windows_test.go b/src/runtime/signal_windows_test.go
new file mode 100644
index 0000000..f998571
--- /dev/null
+++ b/src/runtime/signal_windows_test.go
@@ -0,0 +1,152 @@
+// +build windows
+
+package runtime_test
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"internal/testenv"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"syscall"
+	"testing"
+)
+
+func TestVectoredHandlerDontCrashOnLibrary(t *testing.T) {
+	if *flagQuick {
+		t.Skip("-quick")
+	}
+	if runtime.GOARCH != "amd64" {
+		t.Skip("this test can only run on windows/amd64")
+	}
+	testenv.MustHaveGoBuild(t)
+	testenv.MustHaveExecPath(t, "gcc")
+	testprog.Lock()
+	defer testprog.Unlock()
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// build go dll
+	dll := filepath.Join(dir, "testwinlib.dll")
+	cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlib/main.go")
+	out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build go library: %s\n%s", err, out)
+	}
+
+	// build c program
+	exe := filepath.Join(dir, "test.exe")
+	cmd = exec.Command("gcc", "-L"+dir, "-I"+dir, "-ltestwinlib", "-o", exe, "testdata/testwinlib/main.c")
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build c exe: %s\n%s", err, out)
+	}
+
+	// run test program
+	cmd = exec.Command(exe)
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failure while running executable: %s\n%s", err, out)
+	}
+	expectedOutput := "exceptionCount: 1\ncontinueCount: 1\n"
+	// cleaning output
+	cleanedOut := strings.ReplaceAll(string(out), "\r\n", "\n")
+	if cleanedOut != expectedOutput {
+		t.Errorf("expected output %q, got %q", expectedOutput, cleanedOut)
+	}
+}
+
+func sendCtrlBreak(pid int) error {
+	kernel32, err := syscall.LoadDLL("kernel32.dll")
+	if err != nil {
+		return fmt.Errorf("LoadDLL: %v\n", err)
+	}
+	generateEvent, err := kernel32.FindProc("GenerateConsoleCtrlEvent")
+	if err != nil {
+		return fmt.Errorf("FindProc: %v\n", err)
+	}
+	result, _, err := generateEvent.Call(syscall.CTRL_BREAK_EVENT, uintptr(pid))
+	if result == 0 {
+		return fmt.Errorf("GenerateConsoleCtrlEvent: %v\n", err)
+	}
+	return nil
+}
+
+// TestLibraryCtrlHandler tests that a Go DLL allows the calling program to handle console control events.
+// See https://golang.org/issues/35965.
+func TestLibraryCtrlHandler(t *testing.T) {
+	if *flagQuick {
+		t.Skip("-quick")
+	}
+	if runtime.GOARCH != "amd64" {
+		t.Skip("this test can only run on windows/amd64")
+	}
+	testenv.MustHaveGoBuild(t)
+	testenv.MustHaveExecPath(t, "gcc")
+	testprog.Lock()
+	defer testprog.Unlock()
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// build go dll
+	dll := filepath.Join(dir, "dummy.dll")
+	cmd := exec.Command(testenv.GoToolPath(t), "build", "-o", dll, "--buildmode", "c-shared", "testdata/testwinlibsignal/dummy.go")
+	out, err := testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build go library: %s\n%s", err, out)
+	}
+
+	// build c program
+	exe := filepath.Join(dir, "test.exe")
+	cmd = exec.Command("gcc", "-o", exe, "testdata/testwinlibsignal/main.c")
+	out, err = testenv.CleanCmdEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build c exe: %s\n%s", err, out)
+	}
+
+	// run test program
+	cmd = exec.Command(exe)
+	var stderr bytes.Buffer
+	cmd.Stderr = &stderr
+	outPipe, err := cmd.StdoutPipe()
+	if err != nil {
+		t.Fatalf("Failed to create stdout pipe: %v", err)
+	}
+	outReader := bufio.NewReader(outPipe)
+
+	cmd.SysProcAttr = &syscall.SysProcAttr{
+		CreationFlags: syscall.CREATE_NEW_PROCESS_GROUP,
+	}
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("Start failed: %v", err)
+	}
+
+	errCh := make(chan error, 1)
+	go func() {
+		if line, err := outReader.ReadString('\n'); err != nil {
+			errCh <- fmt.Errorf("could not read stdout: %v", err)
+		} else if strings.TrimSpace(line) != "ready" {
+			errCh <- fmt.Errorf("unexpected message: %v", line)
+		} else {
+			errCh <- sendCtrlBreak(cmd.Process.Pid)
+		}
+	}()
+
+	if err := <-errCh; err != nil {
+		t.Fatal(err)
+	}
+	if err := cmd.Wait(); err != nil {
+		t.Fatalf("Program exited with error: %v\n%s", err, &stderr)
+	}
+}
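The tests above build c-shared DLLs from files under testdata that are not included in this diff. For context, a minimal c-shared Go library of the kind being built looks roughly like the sketch below; the exported name is illustrative, not the actual testdata contents.

	// Build with: go build -buildmode=c-shared -o dummy.dll dummy.go
	package main

	import "C"

	//export Dummy
	func Dummy() int32 {
		// Exported so the C host program has something to call; merely
		// loading the DLL already starts the Go runtime in-process, which
		// is what the exception- and ctrl-handler tests rely on.
		return 42
	}

	func main() {} // required by -buildmode=c-shared, never called
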
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index b2ebb2b..3bf07cb 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -192,16 +192,13 @@
 //go:linkname signal_enable os/signal.signal_enable
 func signal_enable(s uint32) {
 	if !sig.inuse {
-		// The first call to signal_enable is for us
-		// to use for initialization. It does not pass
-		// signal information in m.
+		// This is the first call to signal_enable. Initialize.
 		sig.inuse = true // enable reception of signals; cannot disable
 		if GOOS == "darwin" {
 			sigNoteSetup(&sig.note)
-			return
+		} else {
+			noteclear(&sig.note)
 		}
-		noteclear(&sig.note)
-		return
 	}
 
 	if s >= uint32(len(sig.wanted)*32) {
diff --git a/src/runtime/sigqueue_plan9.go b/src/runtime/sigqueue_plan9.go
index 934742a..d5fe8f8 100644
--- a/src/runtime/sigqueue_plan9.go
+++ b/src/runtime/sigqueue_plan9.go
@@ -134,12 +134,9 @@
 //go:linkname signal_enable os/signal.signal_enable
 func signal_enable(s uint32) {
 	if !sig.inuse {
-		// The first call to signal_enable is for us
-		// to use for initialization. It does not pass
-		// signal information in m.
+		// This is the first call to signal_enable. Initialize.
 		sig.inuse = true // enable reception of signals; cannot disable
 		noteclear(&sig.note)
-		return
 	}
 }
 
diff --git a/src/runtime/sizeof_test.go b/src/runtime/sizeof_test.go
index 830055e..736e848 100644
--- a/src/runtime/sizeof_test.go
+++ b/src/runtime/sizeof_test.go
@@ -2,8 +2,6 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !nacl
-
 package runtime_test
 
 import (
@@ -23,7 +21,8 @@
 		_32bit uintptr     // size on 32bit platforms
 		_64bit uintptr     // size on 64bit platforms
 	}{
-		{runtime.G{}, 216, 376}, // g, but exported for testing
+		{runtime.G{}, 216, 376},   // g, but exported for testing
+		{runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
 	}
 
 	for _, tt := range tests {
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 79cfc69..0418ace 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -16,7 +16,7 @@
 	cap   int
 }
 
-// An notInHeapSlice is a slice backed by go:notinheap memory.
+// A notInHeapSlice is a slice backed by go:notinheap memory.
 type notInHeapSlice struct {
 	array *notInHeap
 	len   int
@@ -31,6 +31,55 @@
 	panic(errorString("makeslice: cap out of range"))
 }
 
+// makeslicecopy allocates a slice of "tolen" elements of type "et",
+// then copies "fromlen" elements of type "et" into that new allocation from "from".
+func makeslicecopy(et *_type, tolen int, fromlen int, from unsafe.Pointer) unsafe.Pointer {
+	var tomem, copymem uintptr
+	if uintptr(tolen) > uintptr(fromlen) {
+		var overflow bool
+		tomem, overflow = math.MulUintptr(et.size, uintptr(tolen))
+		if overflow || tomem > maxAlloc || tolen < 0 {
+			panicmakeslicelen()
+		}
+		copymem = et.size * uintptr(fromlen)
+	} else {
+		// fromlen is a known good length that is equal to or greater than tolen,
+		// thereby making tolen a good slice length too as from and to slices have the
+		// same element width.
+		tomem = et.size * uintptr(tolen)
+		copymem = tomem
+	}
+
+	var to unsafe.Pointer
+	if et.ptrdata == 0 {
+		to = mallocgc(tomem, nil, false)
+		if copymem < tomem {
+			memclrNoHeapPointers(add(to, copymem), tomem-copymem)
+		}
+	} else {
+		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
+		to = mallocgc(tomem, et, true)
+		if copymem > 0 && writeBarrier.enabled {
+			// Only shade the pointers in from since we know the destination slice
+			// only contains nil pointers because it has been cleared during alloc.
+			bulkBarrierPreWriteSrcOnly(uintptr(to), uintptr(from), copymem)
+		}
+	}
+
+	if raceenabled {
+		callerpc := getcallerpc()
+		pc := funcPC(makeslicecopy)
+		racereadrangepc(from, copymem, callerpc, pc)
+	}
+	if msanenabled {
+		msanread(from, copymem)
+	}
+
+	memmove(to, from, copymem)
+
+	return to
+}
+
 func makeslice(et *_type, len, cap int) unsafe.Pointer {
 	mem, overflow := math.MulUintptr(et.size, uintptr(cap))
 	if overflow || mem > maxAlloc || len < 0 || len > cap {
@@ -182,7 +231,7 @@
 		if lenmem > 0 && writeBarrier.enabled {
 			// Only shade the pointers in old.array since we know the destination slice p
 			// only contains nil pointers because it has been cleared during alloc.
-			bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem)
+			bulkBarrierPreWriteSrcOnly(uintptr(p), uintptr(old.array), lenmem-et.size+et.ptrdata)
 		}
 	}
 	memmove(p, old.array, lenmem)
@@ -194,14 +243,14 @@
 	return x&(x-1) == 0
 }
 
-func slicecopy(to, fm slice, width uintptr) int {
-	if fm.len == 0 || to.len == 0 {
+func slicecopy(toPtr unsafe.Pointer, toLen int, fmPtr unsafe.Pointer, fmLen int, width uintptr) int {
+	if fmLen == 0 || toLen == 0 {
 		return 0
 	}
 
-	n := fm.len
-	if to.len < n {
-		n = to.len
+	n := fmLen
+	if toLen < n {
+		n = toLen
 	}
 
 	if width == 0 {
@@ -211,43 +260,43 @@
 	if raceenabled {
 		callerpc := getcallerpc()
 		pc := funcPC(slicecopy)
-		racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc)
-		racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc)
+		racereadrangepc(fmPtr, uintptr(n*int(width)), callerpc, pc)
+		racewriterangepc(toPtr, uintptr(n*int(width)), callerpc, pc)
 	}
 	if msanenabled {
-		msanwrite(to.array, uintptr(n*int(width)))
-		msanread(fm.array, uintptr(n*int(width)))
+		msanread(fmPtr, uintptr(n*int(width)))
+		msanwrite(toPtr, uintptr(n*int(width)))
 	}
 
 	size := uintptr(n) * width
 	if size == 1 { // common case worth about 2x to do here
 		// TODO: is this still worth it with new memmove impl?
-		*(*byte)(to.array) = *(*byte)(fm.array) // known to be a byte pointer
+		*(*byte)(toPtr) = *(*byte)(fmPtr) // known to be a byte pointer
 	} else {
-		memmove(to.array, fm.array, size)
+		memmove(toPtr, fmPtr, size)
 	}
 	return n
 }
 
-func slicestringcopy(to []byte, fm string) int {
-	if len(fm) == 0 || len(to) == 0 {
+func slicestringcopy(toPtr *byte, toLen int, fm string) int {
+	if len(fm) == 0 || toLen == 0 {
 		return 0
 	}
 
 	n := len(fm)
-	if len(to) < n {
-		n = len(to)
+	if toLen < n {
+		n = toLen
 	}
 
 	if raceenabled {
 		callerpc := getcallerpc()
 		pc := funcPC(slicestringcopy)
-		racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc)
+		racewriterangepc(unsafe.Pointer(toPtr), uintptr(n), callerpc, pc)
 	}
 	if msanenabled {
-		msanwrite(unsafe.Pointer(&to[0]), uintptr(n))
+		msanwrite(unsafe.Pointer(toPtr), uintptr(n))
 	}
 
-	memmove(unsafe.Pointer(&to[0]), stringStructOf(&fm).str, uintptr(n))
+	memmove(unsafe.Pointer(toPtr), stringStructOf(&fm).str, uintptr(n))
 	return n
 }
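makeslicecopy above is the runtime half of a compiler rewrite: a make followed immediately by a copy of the whole source may be lowered into one call, skipping the redundant zeroing of the destination. A sketch of the source-level pattern this targets (whether a given build applies the rewrite is up to the compiler):

	// Without the rewrite this allocates a zeroed slice and then
	// overwrites it; with makeslicecopy both steps are combined.
	func clone(src []byte) []byte {
		dst := make([]byte, len(src))
		copy(dst, src)
		return dst
	}
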
diff --git a/src/runtime/slice_test.go b/src/runtime/slice_test.go
index 0463fc7..e963a43 100644
--- a/src/runtime/slice_test.go
+++ b/src/runtime/slice_test.go
@@ -10,6 +10,84 @@
 
 const N = 20
 
+func BenchmarkMakeSliceCopy(b *testing.B) {
+	const length = 32
+	var bytes = make([]byte, 8*length)
+	var ints = make([]int, length)
+	var ptrs = make([]*byte, length)
+	b.Run("mallocmove", func(b *testing.B) {
+		b.Run("Byte", func(b *testing.B) {
+			var x []byte
+			for i := 0; i < b.N; i++ {
+				x = make([]byte, len(bytes))
+				copy(x, bytes)
+			}
+		})
+		b.Run("Int", func(b *testing.B) {
+			var x []int
+			for i := 0; i < b.N; i++ {
+				x = make([]int, len(ints))
+				copy(x, ints)
+			}
+		})
+		b.Run("Ptr", func(b *testing.B) {
+			var x []*byte
+			for i := 0; i < b.N; i++ {
+				x = make([]*byte, len(ptrs))
+				copy(x, ptrs)
+			}
+
+		})
+	})
+	b.Run("makecopy", func(b *testing.B) {
+		b.Run("Byte", func(b *testing.B) {
+			var x []byte
+			for i := 0; i < b.N; i++ {
+				x = make([]byte, 8*length)
+				copy(x, bytes)
+			}
+		})
+		b.Run("Int", func(b *testing.B) {
+			var x []int
+			for i := 0; i < b.N; i++ {
+				x = make([]int, length)
+				copy(x, ints)
+			}
+		})
+		b.Run("Ptr", func(b *testing.B) {
+			var x []*byte
+			for i := 0; i < b.N; i++ {
+				x = make([]*byte, length)
+				copy(x, ptrs)
+			}
+
+		})
+	})
+	b.Run("nilappend", func(b *testing.B) {
+		b.Run("Byte", func(b *testing.B) {
+			var x []byte
+			for i := 0; i < b.N; i++ {
+				x = append([]byte(nil), bytes...)
+				_ = x
+			}
+		})
+		b.Run("Int", func(b *testing.B) {
+			var x []int
+			for i := 0; i < b.N; i++ {
+				x = append([]int(nil), ints...)
+				_ = x
+			}
+		})
+		b.Run("Ptr", func(b *testing.B) {
+			var x []*byte
+			for i := 0; i < b.N; i++ {
+				x = append([]*byte(nil), ptrs...)
+				_ = x
+			}
+		})
+	})
+}
+
 type (
 	struct24 struct{ a, b, c int64 }
 	struct32 struct{ a, b, c, d int64 }
diff --git a/src/runtime/softfloat64.go b/src/runtime/softfloat64.go
index 8fde0fe..13bee6c 100644
--- a/src/runtime/softfloat64.go
+++ b/src/runtime/softfloat64.go
@@ -13,7 +13,7 @@
 	expbits64  uint = 11
 	bias64          = -1<<(expbits64-1) + 1
 
-	nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1
+	nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload
 	inf64 uint64 = (1<<expbits64 - 1) << mantbits64
 	neg64 uint64 = 1 << (expbits64 + mantbits64)
 
@@ -21,7 +21,7 @@
 	expbits32  uint = 8
 	bias32          = -1<<(expbits32-1) + 1
 
-	nan32 uint32 = (1<<expbits32-1)<<mantbits32 + 1
+	nan32 uint32 = (1<<expbits32-1)<<mantbits32 + 1<<(mantbits32-1) // quiet NaN, 0 payload
 	inf32 uint32 = (1<<expbits32 - 1) << mantbits32
 	neg32 uint32 = 1 << (expbits32 + mantbits32)
 )
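The new nan64/nan32 values set the quiet bit (the most significant mantissa bit) with a zero payload, instead of the old payload-of-1 pattern. For float64 that works out to 0x7ff8000000000000, which a standalone check confirms is a NaN:

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		// exponent all ones, quiet bit set, payload zero
		const nan64 = (1<<11-1)<<52 + 1<<51
		fmt.Printf("%#x NaN=%v\n", uint64(nan64), math.IsNaN(math.Float64frombits(nan64)))
		// prints: 0x7ff8000000000000 NaN=true
	}
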
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 7ae3eee..52e5417 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"internal/cpu"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -90,7 +91,7 @@
 
 	// The stack guard is a pointer this many bytes above the
 	// bottom of the stack.
-	_StackGuard = 880*sys.StackGuardMultiplier + _StackSystem
+	_StackGuard = 928*sys.StackGuardMultiplier + _StackSystem
 
 	// After a stack split check the SP is allowed to be this
 	// many bytes below the stack guard. This saves an instruction
@@ -137,9 +138,16 @@
 // Stacks are assigned an order according to size.
 //     order = log_2(size/FixedStack)
 // There is a free list for each order.
-// TODO: one lock per order?
-var stackpool [_NumStackOrders]mSpanList
-var stackpoolmu mutex
+var stackpool [_NumStackOrders]struct {
+	item stackpoolItem
+	_    [cpu.CacheLinePadSize - unsafe.Sizeof(stackpoolItem{})%cpu.CacheLinePadSize]byte
+}
+
+//go:notinheap
+type stackpoolItem struct {
+	mu   mutex
+	span mSpanList
+}
 
 // Global pool of large stack spans.
 var stackLarge struct {
@@ -152,10 +160,12 @@
 		throw("cache size must be a multiple of page size")
 	}
 	for i := range stackpool {
-		stackpool[i].init()
+		stackpool[i].item.span.init()
+		lockInit(&stackpool[i].item.mu, lockRankStackpool)
 	}
 	for i := range stackLarge.free {
 		stackLarge.free[i].init()
+		lockInit(&stackLarge.lock, lockRankStackLarge)
 	}
 }
 
@@ -170,10 +180,11 @@
 }
 
 // Allocates a stack from the free pool. Must be called with
-// stackpoolmu held.
+// stackpool[order].item.mu held.
 func stackpoolalloc(order uint8) gclinkptr {
-	list := &stackpool[order]
+	list := &stackpool[order].item.span
 	s := list.first
+	lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
 	if s == nil {
 		// no free stacks. Allocate another span worth.
 		s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse)
@@ -208,15 +219,15 @@
 	return x
 }
 
-// Adds stack x to the free pool. Must be called with stackpoolmu held.
+// Adds stack x to the free pool. Must be called with stackpool[order].item.mu held.
 func stackpoolfree(x gclinkptr, order uint8) {
 	s := spanOfUnchecked(uintptr(x))
-	if s.state != mSpanManual {
+	if s.state.get() != mSpanManual {
 		throw("freeing stack not in a stack span")
 	}
 	if s.manualFreeList.ptr() == nil {
 		// s will now have a free stack
-		stackpool[order].insert(s)
+		stackpool[order].item.span.insert(s)
 	}
 	x.ptr().next = s.manualFreeList
 	s.manualFreeList = x
@@ -237,7 +248,7 @@
 		//    pointer into a free span.
 		//
 		// By not freeing, we prevent step #4 until GC is done.
-		stackpool[order].remove(s)
+		stackpool[order].item.span.remove(s)
 		s.manualFreeList = 0
 		osStackFree(s)
 		mheap_.freeManual(s, &memstats.stacks_inuse)
@@ -257,14 +268,14 @@
 	// Grab half of the allowed capacity (to prevent thrashing).
 	var list gclinkptr
 	var size uintptr
-	lock(&stackpoolmu)
+	lock(&stackpool[order].item.mu)
 	for size < _StackCacheSize/2 {
 		x := stackpoolalloc(order)
 		x.ptr().next = list
 		list = x
 		size += _FixedStack << order
 	}
-	unlock(&stackpoolmu)
+	unlock(&stackpool[order].item.mu)
 	c.stackcache[order].list = list
 	c.stackcache[order].size = size
 }
@@ -276,14 +287,14 @@
 	}
 	x := c.stackcache[order].list
 	size := c.stackcache[order].size
-	lock(&stackpoolmu)
+	lock(&stackpool[order].item.mu)
 	for size > _StackCacheSize/2 {
 		y := x.ptr().next
 		stackpoolfree(x, order)
 		x = y
 		size -= _FixedStack << order
 	}
-	unlock(&stackpoolmu)
+	unlock(&stackpool[order].item.mu)
 	c.stackcache[order].list = x
 	c.stackcache[order].size = size
 }
@@ -293,8 +304,8 @@
 	if stackDebug >= 1 {
 		print("stackcache clear\n")
 	}
-	lock(&stackpoolmu)
 	for order := uint8(0); order < _NumStackOrders; order++ {
+		lock(&stackpool[order].item.mu)
 		x := c.stackcache[order].list
 		for x.ptr() != nil {
 			y := x.ptr().next
@@ -303,8 +314,8 @@
 		}
 		c.stackcache[order].list = 0
 		c.stackcache[order].size = 0
+		unlock(&stackpool[order].item.mu)
 	}
-	unlock(&stackpoolmu)
 }
 
 // stackalloc allocates an n byte stack.
@@ -329,7 +340,7 @@
 	}
 
 	if debug.efence != 0 || stackFromSystem != 0 {
-		n = uint32(round(uintptr(n), physPageSize))
+		n = uint32(alignUp(uintptr(n), physPageSize))
 		v := sysAlloc(uintptr(n), &memstats.stacks_sys)
 		if v == nil {
 			throw("out of memory (stackalloc)")
@@ -349,16 +360,16 @@
 			n2 >>= 1
 		}
 		var x gclinkptr
-		c := thisg.m.mcache
-		if stackNoCache != 0 || c == nil || thisg.m.preemptoff != "" {
-			// c == nil can happen in the guts of exitsyscall or
-			// procresize. Just get a stack from the global pool.
+		if stackNoCache != 0 || thisg.m.p == 0 || thisg.m.preemptoff != "" {
+			// thisg.m.p == 0 can happen in the guts of exitsyscall
+			// or procresize. Just get a stack from the global pool.
 			// Also don't touch stackcache during gc
 			// as it's flushed concurrently.
-			lock(&stackpoolmu)
+			lock(&stackpool[order].item.mu)
 			x = stackpoolalloc(order)
-			unlock(&stackpoolmu)
+			unlock(&stackpool[order].item.mu)
 		} else {
+			c := thisg.m.p.ptr().mcache
 			x = c.stackcache[order].list
 			if x.ptr() == nil {
 				stackcacherefill(c, order)
@@ -381,6 +392,8 @@
 		}
 		unlock(&stackLarge.lock)
 
+		lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
+
 		if s == nil {
 			// Allocate a new stack from the heap.
 			s = mheap_.allocManual(npage, &memstats.stacks_inuse)
@@ -444,12 +457,12 @@
 			n2 >>= 1
 		}
 		x := gclinkptr(v)
-		c := gp.m.mcache
-		if stackNoCache != 0 || c == nil || gp.m.preemptoff != "" {
-			lock(&stackpoolmu)
+		if stackNoCache != 0 || gp.m.p == 0 || gp.m.preemptoff != "" {
+			lock(&stackpool[order].item.mu)
 			stackpoolfree(x, order)
-			unlock(&stackpoolmu)
+			unlock(&stackpool[order].item.mu)
 		} else {
+			c := gp.m.p.ptr().mcache
 			if c.stackcache[order].size >= _StackCacheSize {
 				stackcacherelease(c, order)
 			}
@@ -459,7 +472,7 @@
 		}
 	} else {
 		s := spanOfUnchecked(uintptr(v))
-		if s.state != mSpanManual {
+		if s.state.get() != mSpanManual {
 			println(hex(s.base()), v)
 			throw("bad span state")
 		}
@@ -619,7 +632,7 @@
 		print("    adjusting ", funcname(f), " frame=[", hex(frame.sp), ",", hex(frame.fp), "] pc=", hex(frame.pc), " continpc=", hex(frame.continpc), "\n")
 	}
 	if f.funcID == funcID_systemstack_switch {
-		// A special routine at the bottom of stack of a goroutine that does an systemstack call.
+		// A special routine at the bottom of stack of a goroutine that does a systemstack call.
 		// We will allow it to be copied even though we don't
 		// have full GC info for it (because it is written in asm).
 		return true
@@ -728,6 +741,8 @@
 		adjustpointer(adjinfo, unsafe.Pointer(&d.sp))
 		adjustpointer(adjinfo, unsafe.Pointer(&d._panic))
 		adjustpointer(adjinfo, unsafe.Pointer(&d.link))
+		adjustpointer(adjinfo, unsafe.Pointer(&d.varp))
+		adjustpointer(adjinfo, unsafe.Pointer(&d.fd))
 	}
 
 	// Adjust defer argument blocks the same way we adjust active stack frames.
@@ -776,14 +791,19 @@
 	}
 
 	// Lock channels to prevent concurrent send/receive.
-	// It's important that we *only* do this for async
-	// copystack; otherwise, gp may be in the middle of
-	// putting itself on wait queues and this would
-	// self-deadlock.
 	var lastc *hchan
 	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
 		if sg.c != lastc {
-			lock(&sg.c.lock)
+			// There is a ranking cycle here between gscan bit and
+			// hchan locks. Normally, we only allow acquiring hchan
+			// locks and then getting a gscan bit. In this case, we
+			// already have the gscan bit. We allow acquiring hchan
+			// locks here as a special case, since a deadlock can't
+			// happen because the G involved must already be
+			// suspended. So, we get a special hchan lock rank here
+			// that is lower than gscan, but doesn't allow acquiring
+			// any other locks other than hchan.
+			lockWithRank(&sg.c.lock, lockRankHchanLeaf)
 		}
 		lastc = sg.c
 	}
@@ -816,12 +836,7 @@
 
 // Copies gp's stack to a new stack of a different size.
 // Caller must have changed gp status to Gcopystack.
-//
-// If sync is true, this is a self-triggered stack growth and, in
-// particular, no other G may be writing to gp's stack (e.g., via a
-// channel operation). If sync is false, copystack protects against
-// concurrent channel operations.
-func copystack(gp *g, newsize uintptr, sync bool) {
+func copystack(gp *g, newsize uintptr) {
 	if gp.syscallsp != 0 {
 		throw("stack growth not allowed in system call")
 	}
@@ -847,15 +862,16 @@
 
 	// Adjust sudogs, synchronizing with channel ops if necessary.
 	ncopy := used
-	if sync {
+	if !gp.activeStackChans {
 		adjustsudogs(gp, &adjinfo)
 	} else {
-		// sudogs can point in to the stack. During concurrent
-		// shrinking, these areas may be written to. Find the
-		// highest such pointer so we can handle everything
-		// there and below carefully. (This shouldn't be far
-		// from the bottom of the stack, so there's little
-		// cost in handling everything below it carefully.)
+		// sudogs may be pointing in to the stack and gp has
+		// released channel locks, so other goroutines could
+		// be writing to gp's stack. Find the highest such
+		// pointer so we can handle everything there and below
+		// carefully. (This shouldn't be far from the bottom
+		// of the stack, so there's little cost in handling
+		// everything below it carefully.)
 		adjinfo.sghi = findsghi(gp, old)
 
 		// Synchronize with channel ops and copy the part of
@@ -906,7 +922,7 @@
 // Stack growth is multiplicative, for constant amortized cost.
 //
 // g->atomicstatus will be Grunning or Gscanrunning upon entry.
-// If the GC is trying to stop this g then it will set preemptscan to true.
+// If the scheduler is trying to stop this g, then it will set preemptStop.
 //
 // This must be nowritebarrierrec because it can be called as part of
 // stack growth from other nowritebarrierrec functions, but the
@@ -973,7 +989,7 @@
 	// it needs a lock held by the goroutine), that small preemption turns
 	// into a real deadlock.
 	if preempt {
-		if thisg.m.locks != 0 || thisg.m.mallocing != 0 || thisg.m.preemptoff != "" || thisg.m.p.ptr().status != _Prunning {
+		if !canPreemptM(thisg.m) {
 			// Let the goroutine keep running for now.
 			// gp->preempt is set, so it will be preempted next time.
 			gp.stackguard0 = gp.stack.lo + _StackGuard
@@ -1007,42 +1023,39 @@
 		if thisg.m.p == 0 && thisg.m.locks == 0 {
 			throw("runtime: g is running but p is not")
 		}
-		// Synchronize with scang.
-		casgstatus(gp, _Grunning, _Gwaiting)
-		if gp.preemptscan {
-			for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) {
-				// Likely to be racing with the GC as
-				// it sees a _Gwaiting and does the
-				// stack scan. If so, gcworkdone will
-				// be set and gcphasework will simply
-				// return.
-			}
-			if !gp.gcscandone {
-				// gcw is safe because we're on the
-				// system stack.
-				gcw := &gp.m.p.ptr().gcw
-				scanstack(gp, gcw)
-				gp.gcscandone = true
-			}
-			gp.preemptscan = false
-			gp.preempt = false
-			casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
-			// This clears gcscanvalid.
-			casgstatus(gp, _Gwaiting, _Grunning)
-			gp.stackguard0 = gp.stack.lo + _StackGuard
-			gogo(&gp.sched) // never return
+
+		if gp.preemptShrink {
+			// We're at a synchronous safe point now, so
+			// do the pending stack shrink.
+			gp.preemptShrink = false
+			shrinkstack(gp)
+		}
+
+		if gp.preemptStop {
+			preemptPark(gp) // never returns
 		}
 
 		// Act like goroutine called runtime.Gosched.
-		casgstatus(gp, _Gwaiting, _Grunning)
 		gopreempt_m(gp) // never return
 	}
 
 	// Allocate a bigger segment and move the stack.
 	oldsize := gp.stack.hi - gp.stack.lo
 	newsize := oldsize * 2
+
+	// Make sure we grow at least as much as needed to fit the new frame.
+	// (This is just an optimization - the caller of morestack will
+	// recheck the bounds on return.)
+	if f := findfunc(gp.sched.pc); f.valid() {
+		max := uintptr(funcMaxSPDelta(f))
+		for newsize-oldsize < max+_StackGuard {
+			newsize *= 2
+		}
+	}
+
 	if newsize > maxstacksize {
 		print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n")
+		print("runtime: sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n")
 		throw("stack overflow")
 	}
 
@@ -1052,7 +1065,7 @@
 
 	// The concurrent GC will not scan the stack while we are doing the copy since
 	// the gp is in a Gcopystack status.
-	copystack(gp, newsize, true)
+	copystack(gp, newsize)
 	if stackDebug >= 1 {
 		print("stack grow done\n")
 	}
@@ -1077,16 +1090,46 @@
 	gostartcall(gobuf, fn, unsafe.Pointer(fv))
 }
 
+// isShrinkStackSafe returns whether it's safe to attempt to shrink
+// gp's stack. Shrinking the stack is only safe when we have precise
+// pointer maps for all frames on the stack.
+func isShrinkStackSafe(gp *g) bool {
+	// We can't copy the stack if we're in a syscall.
+	// The syscall might have pointers into the stack and
+	// often we don't have precise pointer maps for the innermost
+	// frames.
+	//
+	// We also can't copy the stack if we're at an asynchronous
+	// safe-point because we don't have precise pointer maps for
+	// all frames.
+	return gp.syscallsp == 0 && !gp.asyncSafePoint
+}
+
 // Maybe shrink the stack being used by gp.
-// Called at garbage collection time.
-// gp must be stopped, but the world need not be.
+//
+// gp must be stopped and we must own its stack. It may be in
+// _Grunning, but only if this is our own user G.
 func shrinkstack(gp *g) {
-	gstatus := readgstatus(gp)
 	if gp.stack.lo == 0 {
 		throw("missing stack in shrinkstack")
 	}
-	if gstatus&_Gscan == 0 {
-		throw("bad status in shrinkstack")
+	if s := readgstatus(gp); s&_Gscan == 0 {
+		// We don't own the stack via _Gscan. We could still
+		// own it if this is our own user G and we're on the
+		// system stack.
+		if !(gp == getg().m.curg && getg() != getg().m.curg && s == _Grunning) {
+			// We don't own the stack.
+			throw("bad status in shrinkstack")
+		}
+	}
+	if !isShrinkStackSafe(gp) {
+		throw("shrinkstack at bad time")
+	}
+	// Check for self-shrinks while in a libcall. These may have
+	// pointers into the stack disguised as uintptrs, but these
+	// code paths should all be nosplit.
+	if gp == getg().m.curg && gp.m.libcallsp != 0 {
+		throw("shrinking stack in libcall")
 	}
 
 	if debug.gcshrinkstackoff > 0 {
@@ -1116,29 +1159,20 @@
 		return
 	}
 
-	// We can't copy the stack if we're in a syscall.
-	// The syscall might have pointers into the stack.
-	if gp.syscallsp != 0 {
-		return
-	}
-	if sys.GoosWindows != 0 && gp.m != nil && gp.m.libcallsp != 0 {
-		return
-	}
-
 	if stackDebug > 0 {
 		print("shrinking stack ", oldsize, "->", newsize, "\n")
 	}
 
-	copystack(gp, newsize, false)
+	copystack(gp, newsize)
 }
 
 // freeStackSpans frees unused stack spans at the end of GC.
 func freeStackSpans() {
-	lock(&stackpoolmu)
 
 	// Scan stack pools for empty stack spans.
 	for order := range stackpool {
-		list := &stackpool[order]
+		lock(&stackpool[order].item.mu)
+		list := &stackpool[order].item.span
 		for s := list.first; s != nil; {
 			next := s.next
 			if s.allocCount == 0 {
@@ -1149,10 +1183,9 @@
 			}
 			s = next
 		}
+		unlock(&stackpool[order].item.mu)
 	}
 
-	unlock(&stackpoolmu)
-
 	// Free large stack spans.
 	lock(&stackLarge.lock)
 	for i := range stackLarge.free {
@@ -1203,29 +1236,8 @@
 		minsize = sys.MinFrameSize
 	}
 	if size > minsize {
-		var stkmap *stackmap
 		stackid := pcdata
-		if f.funcID != funcID_debugCallV1 {
-			stkmap = (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
-		} else {
-			// debugCallV1's stack map is the register map
-			// at its call site.
-			callerPC := frame.lr
-			caller := findfunc(callerPC)
-			if !caller.valid() {
-				println("runtime: debugCallV1 called by unknown caller", hex(callerPC))
-				throw("bad debugCallV1")
-			}
-			stackid = int32(-1)
-			if callerPC != caller.entry {
-				callerPC--
-				stackid = pcdatavalue(caller, _PCDATA_RegMapIndex, callerPC, cache)
-			}
-			if stackid == -1 {
-				stackid = 0 // in prologue
-			}
-			stkmap = (*stackmap)(funcdata(caller, _FUNCDATA_RegPointerMaps))
-		}
+		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
 		if stkmap == nil || stkmap.n <= 0 {
 			print("runtime: frame ", funcname(f), " untyped locals ", hex(frame.varp-size), "+", hex(size), "\n")
 			throw("missing stackmap")
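The stackpool change near the top of this file splits the old global stackpoolmu into one lock per size order, padding each entry so neighbouring orders never share a cache line. A user-level sketch of the same padding idiom, assuming 64-byte cache lines (the runtime gets the real value from internal/cpu):

	package pool

	import (
		"sync"
		"unsafe"
	)

	const cacheLineSize = 64 // assumed; the runtime uses cpu.CacheLinePadSize

	type item struct {
		mu   sync.Mutex
		free []uintptr
	}

	// padded rounds each item up to a cache-line multiple so that locking
	// one order cannot false-share with its neighbours in the array.
	type padded struct {
		item item
		_    [cacheLineSize - unsafe.Sizeof(item{})%cacheLineSize]byte
	}

	// One entry per stack order, mirroring the runtime's stackpool array.
	var pools [8]padded
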
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index 143d3a9..adfc653 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -599,9 +599,6 @@
 	return pc[:Callers(0, pc)]
 }
 
-// The noinline prevents this function from being inlined
-// into a wrapper. TODO: remove this when issue 28640 is fixed.
-//go:noinline
 func (s structWithMethod) stack() string {
 	buf := make([]byte, 4<<10)
 	return string(buf[:Stack(buf, false)])
diff --git a/src/runtime/string.go b/src/runtime/string.go
index d198f73..0515b56 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -6,6 +6,7 @@
 
 import (
 	"internal/bytealg"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -70,40 +71,47 @@
 	return concatstrings(buf, a[:])
 }
 
+// slicebytetostring converts a byte slice to a string.
+// It is inserted by the compiler into generated code.
+// ptr is a pointer to the first element of the slice;
+// n is the length of the slice.
 // Buf is a fixed-size buffer for the result,
 // it is not nil if the result does not escape.
-func slicebytetostring(buf *tmpBuf, b []byte) (str string) {
-	l := len(b)
-	if l == 0 {
+func slicebytetostring(buf *tmpBuf, ptr *byte, n int) (str string) {
+	if n == 0 {
 		// Turns out to be a relatively common case.
 		// Consider that you want to parse out data between parens in "foo()bar",
 		// you find the indices and convert the subslice to string.
 		return ""
 	}
 	if raceenabled {
-		racereadrangepc(unsafe.Pointer(&b[0]),
-			uintptr(l),
+		racereadrangepc(unsafe.Pointer(ptr),
+			uintptr(n),
 			getcallerpc(),
 			funcPC(slicebytetostring))
 	}
 	if msanenabled {
-		msanread(unsafe.Pointer(&b[0]), uintptr(l))
+		msanread(unsafe.Pointer(ptr), uintptr(n))
 	}
-	if l == 1 {
-		stringStructOf(&str).str = unsafe.Pointer(&staticbytes[b[0]])
+	if n == 1 {
+		p := unsafe.Pointer(&staticuint64s[*ptr])
+		if sys.BigEndian {
+			p = add(p, 7)
+		}
+		stringStructOf(&str).str = p
 		stringStructOf(&str).len = 1
 		return
 	}
 
 	var p unsafe.Pointer
-	if buf != nil && len(b) <= len(buf) {
+	if buf != nil && n <= len(buf) {
 		p = unsafe.Pointer(buf)
 	} else {
-		p = mallocgc(uintptr(len(b)), nil, false)
+		p = mallocgc(uintptr(n), nil, false)
 	}
 	stringStructOf(&str).str = p
-	stringStructOf(&str).len = len(b)
-	memmove(p, (*(*slice)(unsafe.Pointer(&b))).array, uintptr(len(b)))
+	stringStructOf(&str).len = n
+	memmove(p, unsafe.Pointer(ptr), uintptr(n))
 	return
 }
 
@@ -118,7 +126,7 @@
 func rawstringtmp(buf *tmpBuf, l int) (s string, b []byte) {
 	if buf != nil && l <= len(buf) {
 		b = buf[:l]
-		s = slicebytetostringtmp(b)
+		s = slicebytetostringtmp(&b[0], len(b))
 	} else {
 		s, b = rawstring(l)
 	}
@@ -139,17 +147,19 @@
 //   where k is []byte, T1 to Tn is a nesting of struct and array literals.
 // - Used for "<"+string(b)+">" concatenation where b is []byte.
 // - Used for string(b)=="foo" comparison where b is []byte.
-func slicebytetostringtmp(b []byte) string {
-	if raceenabled && len(b) > 0 {
-		racereadrangepc(unsafe.Pointer(&b[0]),
-			uintptr(len(b)),
+func slicebytetostringtmp(ptr *byte, n int) (str string) {
+	if raceenabled && n > 0 {
+		racereadrangepc(unsafe.Pointer(ptr),
+			uintptr(n),
 			getcallerpc(),
 			funcPC(slicebytetostringtmp))
 	}
-	if msanenabled && len(b) > 0 {
-		msanread(unsafe.Pointer(&b[0]), uintptr(len(b)))
+	if msanenabled && n > 0 {
+		msanread(unsafe.Pointer(ptr), uintptr(n))
 	}
-	return *(*string)(unsafe.Pointer(&b))
+	stringStructOf(&str).str = unsafe.Pointer(ptr)
+	stringStructOf(&str).len = n
+	return
 }
 
 func stringtoslicebyte(buf *tmpBuf, s string) []byte {
@@ -231,16 +241,10 @@
 }
 
 func intstring(buf *[4]byte, v int64) (s string) {
-	if v >= 0 && v < runeSelf {
-		stringStructOf(&s).str = unsafe.Pointer(&staticbytes[v])
-		stringStructOf(&s).len = 1
-		return
-	}
-
 	var b []byte
 	if buf != nil {
 		b = buf[:]
-		s = slicebytetostringtmp(b)
+		s = slicebytetostringtmp(&b[0], len(b))
 	} else {
 		s, b = rawstring(4)
 	}
@@ -495,3 +499,37 @@
 	b[n2] = 0 // for luck
 	return s[:n2]
 }
+
+// parseRelease parses a dot-separated version number. It follows the
+// semver syntax, but allows the minor and patch versions to be
+// elided.
+func parseRelease(rel string) (major, minor, patch int, ok bool) {
+	// Strip anything after a dash or plus.
+	for i := 0; i < len(rel); i++ {
+		if rel[i] == '-' || rel[i] == '+' {
+			rel = rel[:i]
+			break
+		}
+	}
+
+	next := func() (int, bool) {
+		for i := 0; i < len(rel); i++ {
+			if rel[i] == '.' {
+				ver, ok := atoi(rel[:i])
+				rel = rel[i+1:]
+				return ver, ok
+			}
+		}
+		ver, ok := atoi(rel)
+		rel = ""
+		return ver, ok
+	}
+	if major, ok = next(); !ok || rel == "" {
+		return
+	}
+	if minor, ok = next(); !ok || rel == "" {
+		return
+	}
+	patch, ok = next()
+	return
+}
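The one-byte string case in slicebytetostring above now points into staticuint64s and adds 7 on big-endian machines, because the low-order byte of a uint64 sits at the highest address there. A small endianness check, independent of this patch, showing why:

	package main

	import (
		"fmt"
		"unsafe"
	)

	func main() {
		var x uint64 = 'A' // 0x41, like an entry of staticuint64s
		b := (*[8]byte)(unsafe.Pointer(&x))
		// Little-endian machines keep 0x41 at offset 0, big-endian at
		// offset 7, which is the same 7 that slicebytetostring adds
		// when sys.BigEndian is set.
		fmt.Println("offset 0:", b[0], "offset 7:", b[7])
	}
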
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index a1716fa..b9ac667 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -282,7 +282,7 @@
 func TestIntString(t *testing.T) {
 	// Non-escaping result of intstring.
 	s := ""
-	for i := 0; i < 4; i++ {
+	for i := rune(0); i < 4; i++ {
 		s += string(i+'0') + string(i+'0'+1)
 	}
 	if want := "01122334"; s != want {
@@ -291,7 +291,7 @@
 
 	// Escaping result of intstring.
 	var a [4]string
-	for i := 0; i < 4; i++ {
+	for i := rune(0); i < 4; i++ {
 		a[i] = string(i + '0')
 	}
 	s = a[0] + a[1] + a[2] + a[3]
@@ -454,3 +454,34 @@
 		}
 	}
 }
+
+type parseReleaseTest struct {
+	in                  string
+	major, minor, patch int
+}
+
+var parseReleaseTests = []parseReleaseTest{
+	{"", -1, -1, -1},
+	{"x", -1, -1, -1},
+	{"5", 5, 0, 0},
+	{"5.12", 5, 12, 0},
+	{"5.12-x", 5, 12, 0},
+	{"5.12.1", 5, 12, 1},
+	{"5.12.1-x", 5, 12, 1},
+	{"5.12.1.0", 5, 12, 1},
+	{"5.20496382327982653440", -1, -1, -1},
+}
+
+func TestParseRelease(t *testing.T) {
+	for _, test := range parseReleaseTests {
+		major, minor, patch, ok := runtime.ParseRelease(test.in)
+		if !ok {
+			major, minor, patch = -1, -1, -1
+		}
+		if test.major != major || test.minor != minor || test.patch != patch {
+			t.Errorf("parseRelease(%q) = (%v, %v, %v) want (%v, %v, %v)",
+				test.in, major, minor, patch,
+				test.major, test.minor, test.patch)
+		}
+	}
+}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 26aaf22..2c6f027 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -83,7 +83,17 @@
 }
 
 // memmove copies n bytes from "from" to "to".
-// in memmove_*.s
+//
+// memmove ensures that any pointer in "from" is written to "to" with
+// an indivisible write, so that racy reads cannot observe a
+// half-written pointer. This is necessary to prevent the garbage
+// collector from observing invalid pointers, and differs from memmove
+// in unmanaged languages. However, memmove is only required to do
+// this if "from" and "to" may contain pointers, which can only be the
+// case if "from", "to", and "n" are all word-aligned.
+//
+// Implementations are in memmove_*.s.
+//
 //go:noescape
 func memmove(to, from unsafe.Pointer, n uintptr)
 
@@ -290,11 +300,23 @@
 
 func systemstack_switch()
 
-// round n up to a multiple of a.  a must be a power of 2.
-func round(n, a uintptr) uintptr {
+// alignUp rounds n up to a multiple of a. a must be a power of 2.
+func alignUp(n, a uintptr) uintptr {
 	return (n + a - 1) &^ (a - 1)
 }
 
+// alignDown rounds n down to a multiple of a. a must be a power of 2.
+func alignDown(n, a uintptr) uintptr {
+	return n &^ (a - 1)
+}
+
+// divRoundUp returns ceil(n / a).
+func divRoundUp(n, a uintptr) uintptr {
+	// a is generally a power of two. This will get inlined and
+	// the compiler will optimize the division.
+	return (n + a - 1) / a
+}
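
As a quick illustration (not part of the patch): alignUp and alignDown rely on a being a power of two, so that a-1 is a contiguous low-bit mask that can be cleared with &^, while divRoundUp works for any positive a. A minimal standalone program exercising the three helpers:

package main

import "fmt"

// a must be a power of two for alignUp/alignDown, matching the runtime helpers.
func alignUp(n, a uintptr) uintptr    { return (n + a - 1) &^ (a - 1) }
func alignDown(n, a uintptr) uintptr  { return n &^ (a - 1) }
func divRoundUp(n, a uintptr) uintptr { return (n + a - 1) / a } // any a > 0

func main() {
	for _, n := range []uintptr{13, 16, 17} {
		fmt.Println(n, "->", alignUp(n, 8), alignDown(n, 8), divRoundUp(n, 8))
		// 13 -> 16 8 2
		// 16 -> 16 16 2
		// 17 -> 24 16 3
	}
}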
+
 // checkASM reports whether assembly runtime checks have passed.
 func checkASM() bool
 
diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go
index 57134f7..4a1a5cc 100644
--- a/src/runtime/stubs2.go
+++ b/src/runtime/stubs2.go
@@ -5,7 +5,6 @@
 // +build !plan9
 // +build !solaris
 // +build !windows
-// +build !nacl
 // +build !js
 // +build !darwin
 // +build !aix
@@ -14,14 +13,19 @@
 
 import "unsafe"
 
+// read calls the read system call.
+// It returns a non-negative number of bytes read or a negative errno value.
 func read(fd int32, p unsafe.Pointer, n int32) int32
+
 func closefd(fd int32) int32
 
 func exit(code int32)
 func usleep(usec uint32)
 
+// write calls the write system call.
+// It returns a non-negative number of bytes written or a negative errno value.
 //go:noescape
-func write(fd uintptr, p unsafe.Pointer, n int32) int32
+func write1(fd uintptr, p unsafe.Pointer, n int32) int32
 
 //go:noescape
 func open(name *byte, mode, perm int32) int32
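
The new doc comments spell out the convention the read and write1 stubs share: a non-negative return is a byte count, a negative return is -errno. A Unix-only standalone sketch of the same convention (fakeWrite is a hypothetical stand-in, not the runtime function):

package main

import (
	"fmt"
	"syscall"
)

// fakeWrite stands in for the runtime's write1: it returns a byte count
// on success and a negative errno value on failure.
func fakeWrite(fd int, p []byte) int32 {
	n, err := syscall.Write(fd, p)
	if err != nil {
		if errno, ok := err.(syscall.Errno); ok {
			return -int32(errno)
		}
		return -int32(syscall.EIO)
	}
	return int32(n)
}

func main() {
	r := fakeWrite(-1, []byte("x")) // invalid fd to force an error
	if r < 0 {
		fmt.Println("errno:", syscall.Errno(-r)) // typically EBADF
	} else {
		fmt.Println("wrote", r, "bytes")
	}
}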
diff --git a/src/runtime/stubs3.go b/src/runtime/stubs3.go
index a9ff689..95eecc7 100644
--- a/src/runtime/stubs3.go
+++ b/src/runtime/stubs3.go
@@ -4,12 +4,10 @@
 
 // +build !plan9
 // +build !solaris
-// +build !windows
-// +build !nacl
 // +build !freebsd
 // +build !darwin
 // +build !aix
 
 package runtime
 
-func nanotime() int64
+func nanotime1() int64
diff --git a/src/runtime/stubs32.go b/src/runtime/stubs32.go
index 149560f..a7f52f6 100644
--- a/src/runtime/stubs32.go
+++ b/src/runtime/stubs32.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 arm amd64p32 mips mipsle
+// +build 386 arm mips mipsle
 
 package runtime
 
diff --git a/src/runtime/stubs_amd64.go b/src/runtime/stubs_amd64.go
new file mode 100644
index 0000000..8c14bc2
--- /dev/null
+++ b/src/runtime/stubs_amd64.go
@@ -0,0 +1,37 @@
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// Called from compiled code; declared for vet; do NOT call from Go.
+func gcWriteBarrierCX()
+func gcWriteBarrierDX()
+func gcWriteBarrierBX()
+func gcWriteBarrierBP()
+func gcWriteBarrierSI()
+func gcWriteBarrierR8()
+func gcWriteBarrierR9()
+
+// stackcheck checks that SP is in range [g->stack.lo, g->stack.hi).
+func stackcheck()
+
+// Called from assembly only; declared for go vet.
+func settls() // argument in DI
+
+// Retpolines, used by -spectre=ret flag in cmd/asm, cmd/compile.
+func retpolineAX()
+func retpolineCX()
+func retpolineDX()
+func retpolineBX()
+func retpolineBP()
+func retpolineSI()
+func retpolineDI()
+func retpolineR8()
+func retpolineR9()
+func retpolineR10()
+func retpolineR11()
+func retpolineR12()
+func retpolineR13()
+func retpolineR14()
+func retpolineR15()
diff --git a/src/runtime/stubs_amd64x.go b/src/runtime/stubs_amd64x.go
deleted file mode 100644
index e7a1be8..0000000
--- a/src/runtime/stubs_amd64x.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64 amd64p32
-
-package runtime
-
-// stackcheck checks that SP is in range [g->stack.lo, g->stack.hi).
-func stackcheck()
-
-// Called from assembly only; declared for go vet.
-func settls() // argument in DI
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index c2f32e0..1e86662 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -148,6 +148,62 @@
 	return
 }
 
+// runtime_expandFinalInlineFrame expands the final pc in stk to include all
+// "callers" if pc is inline.
+//
+//go:linkname runtime_expandFinalInlineFrame runtime/pprof.runtime_expandFinalInlineFrame
+func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr {
+	if len(stk) == 0 {
+		return stk
+	}
+	pc := stk[len(stk)-1]
+	tracepc := pc - 1
+
+	f := findfunc(tracepc)
+	if !f.valid() {
+		// Not a Go function.
+		return stk
+	}
+
+	inldata := funcdata(f, _FUNCDATA_InlTree)
+	if inldata == nil {
+		// Nothing inline in f.
+		return stk
+	}
+
+	// Treat the previous func as normal. We haven't actually checked, but
+	// since this pc was included in the stack, we know it shouldn't be
+	// elided.
+	lastFuncID := funcID_normal
+
+	// Remove pc from stk; we'll re-add it below.
+	stk = stk[:len(stk)-1]
+
+	// See inline expansion in gentraceback.
+	var cache pcvalueCache
+	inltree := (*[1 << 20]inlinedCall)(inldata)
+	for {
+		ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, &cache)
+		if ix < 0 {
+			break
+		}
+		if inltree[ix].funcID == funcID_wrapper && elideWrapperCalling(lastFuncID) {
+			// ignore wrappers
+		} else {
+			stk = append(stk, pc)
+		}
+		lastFuncID = inltree[ix].funcID
+		// Back up to an instruction in the "caller".
+		tracepc = f.entry + uintptr(inltree[ix].parentPc)
+		pc = tracepc + 1
+	}
+
+	// N.B. we want to keep the last parentPC which is not inline.
+	stk = append(stk, pc)
+
+	return stk
+}
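
For context (not part of the patch): runtime_expandFinalInlineFrame gives runtime/pprof, for the final PC of a profile stack, the same kind of inline expansion that user code normally gets through runtime.CallersFrames. A hedged user-level illustration follows; whether leaf is actually inlined depends on the compiler's inlining budget, so the exact frames printed may vary.

package main

import (
	"fmt"
	"runtime"
)

// leaf is small enough that the compiler may inline it into main; any
// inlined frames still appear when the PCs are expanded via CallersFrames.
func leaf() []uintptr {
	pc := make([]uintptr, 16)
	n := runtime.Callers(0, pc)
	return pc[:n]
}

func main() {
	frames := runtime.CallersFrames(leaf())
	for {
		f, more := frames.Next()
		fmt.Println(f.Function, f.Line)
		if !more {
			break
		}
	}
}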
+
 // expandCgoFrames expands frame information for pc, known to be
 // a non-Go function, using the cgoSymbolizer hook. expandCgoFrames
 // returns nil if pc could not be expanded.
@@ -212,19 +268,39 @@
 //
 // See funcdata.h and ../cmd/internal/objabi/funcdata.go.
 const (
-	_PCDATA_RegMapIndex   = 0
+	_PCDATA_RegMapIndex   = 0 // if !go115ReduceLiveness
+	_PCDATA_UnsafePoint   = 0 // if go115ReduceLiveness
 	_PCDATA_StackMapIndex = 1
 	_PCDATA_InlTreeIndex  = 2
 
-	_FUNCDATA_ArgsPointerMaps   = 0
-	_FUNCDATA_LocalsPointerMaps = 1
-	_FUNCDATA_RegPointerMaps    = 2
-	_FUNCDATA_StackObjects      = 3
-	_FUNCDATA_InlTree           = 4
+	_FUNCDATA_ArgsPointerMaps    = 0
+	_FUNCDATA_LocalsPointerMaps  = 1
+	_FUNCDATA_RegPointerMaps     = 2 // if !go115ReduceLiveness
+	_FUNCDATA_StackObjects       = 3
+	_FUNCDATA_InlTree            = 4
+	_FUNCDATA_OpenCodedDeferInfo = 5
 
 	_ArgsSizeUnknown = -0x80000000
 )
 
+const (
+	// PCDATA_UnsafePoint values.
+	_PCDATA_UnsafePointSafe   = -1 // Safe for async preemption
+	_PCDATA_UnsafePointUnsafe = -2 // Unsafe for async preemption
+
+	// _PCDATA_Restart1(2) apply to a sequence of instructions; if an
+	// async preemption happens within the sequence, we should back off
+	// the PC to the start of the sequence when resuming.
+	// We need two so we can distinguish the start/end of the sequence
+	// in case two sequences are next to each other.
+	_PCDATA_Restart1 = -3
+	_PCDATA_Restart2 = -4
+
+	// Like _PCDATA_Restart1, but back to function entry if async
+	// preempted.
+	_PCDATA_RestartAtEntry = -5
+)
+
 // A FuncID identifies particular functions that need to be treated
 // specially by the runtime.
 // Note that in some situations involving plugins, there may be multiple
@@ -253,6 +329,8 @@
 	funcID_debugCallV1
 	funcID_gopanic
 	funcID_panicwrap
+	funcID_handleAsyncEvent
+	funcID_asyncPreempt
 	funcID_wrapper // any autogenerated code (hash/eq algorithms, method wrappers, etc.)
 )
 
@@ -485,8 +563,8 @@
 // given program counter address, or else nil.
 //
 // If pc represents multiple functions because of inlining, it returns
-// the a *Func describing the innermost function, but with an entry
-// of the outermost function.
+// the *Func describing the innermost function, but with an entry of
+// the outermost function.
 func FuncForPC(pc uintptr) *Func {
 	f := findfunc(pc)
 	if !f.valid() {
@@ -611,7 +689,15 @@
 			idx++
 		}
 	}
-	return funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff])), datap}
+	funcoff := datap.ftab[idx].funcoff
+	if funcoff == ^uintptr(0) {
+		// With multiple text sections, there may be functions inserted by the external
+		// linker that are not known by Go. This means there may be holes in the PC
+		// range covered by the func table. The invalid funcoff value indicates a hole.
+		// See also cmd/link/internal/ld/pcln.go:pclntab
+		return funcInfo{}
+	}
+	return funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[funcoff])), datap}
 }
 
 type pcvalueCache struct {
@@ -634,9 +720,11 @@
 	return (targetpc / sys.PtrSize) % uintptr(len(pcvalueCache{}.entries))
 }
 
-func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
+// Returns the PCData value, and the PC where this value starts.
+// TODO: the start PC is returned only when cache is nil.
+func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) (int32, uintptr) {
 	if off == 0 {
-		return -1
+		return -1, 0
 	}
 
 	// Check the cache. This speeds up walks of deep stacks, which
@@ -655,7 +743,7 @@
 			// fail in the first clause.
 			ent := &cache.entries[x][i]
 			if ent.off == off && ent.targetpc == targetpc {
-				return ent.val
+				return ent.val, 0
 			}
 		}
 	}
@@ -665,11 +753,12 @@
 			print("runtime: no module data for ", hex(f.entry), "\n")
 			throw("no module data")
 		}
-		return -1
+		return -1, 0
 	}
 	datap := f.datap
 	p := datap.pclntable[off:]
 	pc := f.entry
+	prevpc := pc
 	val := int32(-1)
 	for {
 		var ok bool
@@ -696,14 +785,15 @@
 				}
 			}
 
-			return val
+			return val, prevpc
 		}
+		prevpc = pc
 	}
 
 	// If there was a table, it should have covered all program counters.
 	// If not, something is wrong.
 	if panicking != 0 || !strict {
-		return -1
+		return -1, 0
 	}
 
 	print("runtime: invalid pc-encoded table f=", funcname(f), " pc=", hex(pc), " targetpc=", hex(targetpc), " tab=", p, "\n")
@@ -721,7 +811,7 @@
 	}
 
 	throw("invalid runtime symbol table")
-	return -1
+	return -1, 0
 }
 
 func cfuncname(f funcInfo) *byte {
@@ -735,13 +825,15 @@
 	return gostringnocopy(cfuncname(f))
 }
 
-func funcnameFromNameoff(f funcInfo, nameoff int32) string {
-	datap := f.datap
+func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte {
 	if !f.valid() {
-		return ""
+		return nil
 	}
-	cstr := &datap.pclntable[nameoff]
-	return gostringnocopy(cstr)
+	return &f.datap.pclntable[nameoff]
+}
+
+func funcnameFromNameoff(f funcInfo, nameoff int32) string {
+	return gostringnocopy(cfuncnameFromNameoff(f, nameoff))
 }
 
 func funcfile(f funcInfo, fileno int32) string {
@@ -757,9 +849,9 @@
 	if !f.valid() {
 		return "?", 0
 	}
-	fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
-	line = pcvalue(f, f.pcln, targetpc, nil, strict)
-	if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
+	fileno, _ := pcvalue(f, f.pcfile, targetpc, nil, strict)
+	line, _ = pcvalue(f, f.pcln, targetpc, nil, strict)
+	if fileno == -1 || line == -1 || int(fileno) >= len(datap.filetab) {
 		// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
 		return "?", 0
 	}
@@ -772,13 +864,32 @@
 }
 
 func funcspdelta(f funcInfo, targetpc uintptr, cache *pcvalueCache) int32 {
-	x := pcvalue(f, f.pcsp, targetpc, cache, true)
+	x, _ := pcvalue(f, f.pcsp, targetpc, cache, true)
 	if x&(sys.PtrSize-1) != 0 {
 		print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
 	}
 	return x
 }
 
+// funcMaxSPDelta returns the maximum spdelta at any point in f.
+func funcMaxSPDelta(f funcInfo) int32 {
+	datap := f.datap
+	p := datap.pclntable[f.pcsp:]
+	pc := f.entry
+	val := int32(-1)
+	max := int32(0)
+	for {
+		var ok bool
+		p, ok = step(p, &pc, &val, pc == f.entry)
+		if !ok {
+			return max
+		}
+		if val > max {
+			max = val
+		}
+	}
+}
+
 func pcdatastart(f funcInfo, table int32) int32 {
 	return *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
 }
@@ -787,14 +898,25 @@
 	if table < 0 || table >= f.npcdata {
 		return -1
 	}
-	return pcvalue(f, pcdatastart(f, table), targetpc, cache, true)
+	r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, true)
+	return r
 }
 
 func pcdatavalue1(f funcInfo, table int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
 	if table < 0 || table >= f.npcdata {
 		return -1
 	}
-	return pcvalue(f, pcdatastart(f, table), targetpc, cache, strict)
+	r, _ := pcvalue(f, pcdatastart(f, table), targetpc, cache, strict)
+	return r
+}
+
+// Like pcdatavalue, but also returns the start PC of this PCData value.
+// It doesn't take a cache.
+func pcdatavalue2(f funcInfo, table int32, targetpc uintptr) (int32, uintptr) {
+	if table < 0 || table >= f.npcdata {
+		return -1, 0
+	}
+	return pcvalue(f, pcdatastart(f, table), targetpc, nil, true)
 }
 
 func funcdata(f funcInfo, i uint8) unsafe.Pointer {
diff --git a/src/runtime/sys_aix_ppc64.s b/src/runtime/sys_aix_ppc64.s
index 75f4178..a56d043 100644
--- a/src/runtime/sys_aix_ppc64.s
+++ b/src/runtime/sys_aix_ppc64.s
@@ -258,8 +258,8 @@
 	CSYSCALL()
 	RET
 
-// Runs on OS stack, called from runtime·write.
-TEXT runtime·write1(SB),NOSPLIT,$0-28
+// Runs on OS stack, called from runtime·write1.
+TEXT runtime·write2(SB),NOSPLIT,$0-28
 	MOVD	fd+0(FP), R3
 	MOVD	p+8(FP), R4
 	MOVW	n+16(FP), R5
diff --git a/src/runtime/sys_darwin.go b/src/runtime/sys_darwin.go
index 376f76d..28c500a 100644
--- a/src/runtime/sys_darwin.go
+++ b/src/runtime/sys_darwin.go
@@ -60,18 +60,29 @@
 //go:nosplit
 //go:cgo_unsafe_args
 func syscall_syscall(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
+	entersyscall()
 	libcCall(unsafe.Pointer(funcPC(syscall)), unsafe.Pointer(&fn))
 	exitsyscall()
 	return
 }
 func syscall()
 
+//go:linkname syscall_syscallX syscall.syscallX
+//go:nosplit
+//go:cgo_unsafe_args
+func syscall_syscallX(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
+	entersyscallblock()
+	libcCall(unsafe.Pointer(funcPC(syscallX)), unsafe.Pointer(&fn))
+	exitsyscall()
+	return
+}
+func syscallX()
+
 //go:linkname syscall_syscall6 syscall.syscall6
 //go:nosplit
 //go:cgo_unsafe_args
 func syscall_syscall6(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
+	entersyscall()
 	libcCall(unsafe.Pointer(funcPC(syscall6)), unsafe.Pointer(&fn))
 	exitsyscall()
 	return
@@ -82,7 +93,7 @@
 //go:nosplit
 //go:cgo_unsafe_args
 func syscall_syscall6X(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
+	entersyscall()
 	libcCall(unsafe.Pointer(funcPC(syscall6X)), unsafe.Pointer(&fn))
 	exitsyscall()
 	return
@@ -93,7 +104,7 @@
 //go:nosplit
 //go:cgo_unsafe_args
 func syscall_syscallPtr(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
+	entersyscall()
 	libcCall(unsafe.Pointer(funcPC(syscallPtr)), unsafe.Pointer(&fn))
 	exitsyscall()
 	return
@@ -116,6 +127,19 @@
 	return
 }
 
+// syscallNoErr is used in crypto/x509 to call into Security.framework and CF.
+
+//go:linkname crypto_x509_syscall crypto/x509/internal/macOS.syscall
+//go:nosplit
+//go:cgo_unsafe_args
+func crypto_x509_syscall(fn, a1, a2, a3, a4, a5, a6 uintptr) (r1 uintptr) {
+	entersyscall()
+	libcCall(unsafe.Pointer(funcPC(syscallNoErr)), unsafe.Pointer(&fn))
+	exitsyscall()
+	return
+}
+func syscallNoErr()
+
 // The *_trampoline functions convert from the Go calling convention to the C calling convention
 // and then call the underlying libc function.  They are defined in sys_darwin_$ARCH.s.
 
@@ -162,6 +186,18 @@
 }
 func pthread_self_trampoline()
 
+//go:nosplit
+//go:cgo_unsafe_args
+func pthread_kill(t pthread, sig uint32) {
+	libcCall(unsafe.Pointer(funcPC(pthread_kill_trampoline)), unsafe.Pointer(&t))
+	return
+}
+func pthread_kill_trampoline()
+
+// mmap is used to do low-level memory allocation via mmap. Don't allow stack
+// splits, since this function (used by sysAlloc) is called in a lot of low-level
+// parts of the runtime and callers often assume it won't acquire any locks.
+//go:nosplit
 func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (unsafe.Pointer, int) {
 	args := struct {
 		addr            unsafe.Pointer
@@ -230,7 +266,7 @@
 
 //go:nosplit
 //go:cgo_unsafe_args
-func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
+func write1(fd uintptr, p unsafe.Pointer, n int32) int32 {
 	return libcCall(unsafe.Pointer(funcPC(write_trampoline)), unsafe.Pointer(&fd))
 }
 func write_trampoline()
@@ -244,7 +280,7 @@
 
 //go:nosplit
 //go:cgo_unsafe_args
-func nanotime() int64 {
+func nanotime1() int64 {
 	var r struct {
 		t            int64  // raw timer
 		numer, denom uint32 // conversion factors. nanoseconds = t * numer / denom.
@@ -266,7 +302,7 @@
 
 //go:nosplit
 //go:cgo_unsafe_args
-func walltime() (int64, int32) {
+func walltime1() (int64, int32) {
 	var t timeval
 	libcCall(unsafe.Pointer(funcPC(walltime_trampoline)), unsafe.Pointer(&t))
 	return int64(t.tv_sec), 1000 * t.tv_usec
@@ -415,6 +451,8 @@
 //go:cgo_import_dynamic libc_pthread_attr_getstacksize pthread_attr_getstacksize "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_attr_setdetachstate pthread_attr_setdetachstate "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_create pthread_create "/usr/lib/libSystem.B.dylib"
+//go:cgo_import_dynamic libc_pthread_self pthread_self "/usr/lib/libSystem.B.dylib"
+//go:cgo_import_dynamic libc_pthread_kill pthread_kill "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_exit exit "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_raise raise "/usr/lib/libSystem.B.dylib"
 
@@ -452,6 +490,8 @@
 //go:cgo_import_dynamic libc_pthread_cond_timedwait_relative_np pthread_cond_timedwait_relative_np "/usr/lib/libSystem.B.dylib"
 //go:cgo_import_dynamic libc_pthread_cond_signal pthread_cond_signal "/usr/lib/libSystem.B.dylib"
 
-// Magic incantation to get libSystem actually dynamically linked.
+// Magic incantation to get libSystem and friends actually dynamically linked.
 // TODO: Why does the code require this?  See cmd/link/internal/ld/go.go
 //go:cgo_import_dynamic _ _ "/usr/lib/libSystem.B.dylib"
+//go:cgo_import_dynamic _ _ "/System/Library/Frameworks/Security.framework/Versions/A/Security"
+//go:cgo_import_dynamic _ _ "/System/Library/Frameworks/CoreFoundation.framework/Versions/A/CoreFoundation"
diff --git a/src/runtime/sys_darwin_32.go b/src/runtime/sys_darwin_32.go
deleted file mode 100644
index f126be8..0000000
--- a/src/runtime/sys_darwin_32.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build darwin
-// +build 386 arm
-
-package runtime
-
-import "unsafe"
-
-//go:linkname syscall_syscall9 syscall.syscall9
-//go:nosplit
-//go:cgo_unsafe_args
-func syscall_syscall9(fn, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
-	libcCall(unsafe.Pointer(funcPC(syscall9)), unsafe.Pointer(&fn))
-	exitsyscall()
-	return
-}
-func syscall9()
diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s
deleted file mode 100644
index e653c54..0000000
--- a/src/runtime/sys_darwin_386.s
+++ /dev/null
@@ -1,914 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// System calls and other sys.stuff for 386, Darwin
-// System calls are implemented in libSystem, this file contains
-// trampolines that convert from Go to C calling convention.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "textflag.h"
-
-// Exit the entire program (like C exit)
-TEXT runtime·exit_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP   	// allocate space for callee args (must be 8 mod 16)
-	MOVL	16(SP), CX	// arg ptr
-	MOVL	0(CX), AX	// arg 1 exit status
-	MOVL	AX, 0(SP)
-	CALL	libc_exit(SB)
-	MOVL	$0xf1, 0xf1  // crash
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·open_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 name
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 mode
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 perm
-	MOVL	AX, 8(SP)
-	CALL	libc_open(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·close_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX		// arg 1 fd
-	MOVL	AX, 0(SP)
-	CALL	libc_close(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·read_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 fd
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 buf
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 count
-	MOVL	AX, 8(SP)
-	CALL	libc_read(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·write_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 fd
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 buf
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 count
-	MOVL	AX, 8(SP)
-	CALL	libc_write(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pipe_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), AX		// arg 1 pipefd
-	MOVL	AX, 0(SP)
-	CALL	libc_pipe(SB)
-	TESTL	AX, AX
-	JEQ	3(PC)
-	CALL	libc_error(SB)		// return negative errno value
-	NEGL	AX
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 addr
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 len
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 prot
-	MOVL	AX, 8(SP)
-	MOVL	12(CX), AX		// arg 4 flags
-	MOVL	AX, 12(SP)
-	MOVL	16(CX), AX		// arg 5 fid
-	MOVL	AX, 16(SP)
-	MOVL	20(CX), AX		// arg 6 offset
-	MOVL	AX, 20(SP)
-	CALL	libc_mmap(SB)
-	XORL	DX, DX
-	CMPL	AX, $-1
-	JNE	ok
-	CALL	libc_error(SB)
-	MOVL	(AX), DX		// errno
-	XORL	AX, AX
-ok:
-	MOVL	32(SP), CX
-	MOVL	AX, 24(CX)		// result pointer
-	MOVL	DX, 28(CX)		// errno
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·madvise_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 addr
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 len
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 advice
-	MOVL	AX, 8(SP)
-	CALL	libc_madvise(SB)
-	// ignore failure - maybe pages are locked
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·munmap_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX		// arg 1 addr
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 len
-	MOVL	AX, 4(SP)
-	CALL	libc_munmap(SB)
-	TESTL	AX, AX
-	JEQ	2(PC)
-	MOVL	$0xf1, 0xf1  // crash
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 mode
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 new
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 old
-	MOVL	AX, 8(SP)
-	CALL	libc_setitimer(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), AX
-	MOVL	AX, 0(SP)	// *timeval
-	MOVL	$0, 4(SP)	// no timezone needed
-	CALL	libc_gettimeofday(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size)
-
-TEXT runtime·nanotime_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8+(machTimebaseInfo__size+15)/16*16, SP
-	CALL	libc_mach_absolute_time(SB)
-	MOVL	16+(machTimebaseInfo__size+15)/16*16(SP), CX
-	MOVL	AX, 0(CX)
-	MOVL	DX, 4(CX)
-	MOVL	timebase<>+machTimebaseInfo_denom(SB), DI // atomic read
-	MOVL	timebase<>+machTimebaseInfo_numer(SB), SI
-	TESTL	DI, DI
-	JNE	initialized
-
-	LEAL	4(SP), AX
-	MOVL	AX, 0(SP)
-	CALL	libc_mach_timebase_info(SB)
-	MOVL	4+machTimebaseInfo_numer(SP), SI
-	MOVL	4+machTimebaseInfo_denom(SP), DI
-
-	MOVL	SI, timebase<>+machTimebaseInfo_numer(SB)
-	MOVL	DI, AX
-	XCHGL	AX, timebase<>+machTimebaseInfo_denom(SB) // atomic write
-	MOVL	16+(machTimebaseInfo__size+15)/16*16(SP), CX
-
-initialized:
-	MOVL	SI, 8(CX)
-	MOVL	DI, 12(CX)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·sigaction_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 sig
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 new
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 old
-	MOVL	AX, 8(SP)
-	CALL	libc_sigaction(SB)
-	TESTL	AX, AX
-	JEQ	2(PC)
-	MOVL	$0xf1, 0xf1  // crash
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 how
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 new
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 old
-	MOVL	AX, 8(SP)
-	CALL	libc_pthread_sigmask(SB)
-	TESTL	AX, AX
-	JEQ	2(PC)
-	MOVL	$0xf1, 0xf1  // crash
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX		// arg 1 new
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 old
-	MOVL	AX, 4(SP)
-	CALL	libc_sigaltstack(SB)
-	TESTL	AX, AX
-	JEQ	2(PC)
-	MOVL	$0xf1, 0xf1  // crash
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·raiseproc_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	CALL	libc_getpid(SB)
-	MOVL	AX, 0(SP)	// arg 1 pid
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX
-	MOVL	AX, 4(SP)	// arg 2 signal
-	CALL	libc_kill(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
-	MOVL	fn+0(FP), AX
-	MOVL	sig+4(FP), BX
-	MOVL	info+8(FP), CX
-	MOVL	ctx+12(FP), DX
-	MOVL	SP, SI
-	SUBL	$32, SP
-	ANDL	$~15, SP	// align stack: handler might be a C function
-	MOVL	BX, 0(SP)
-	MOVL	CX, 4(SP)
-	MOVL	DX, 8(SP)
-	MOVL	SI, 12(SP)	// save SI: handler might be a Go function
-	CALL	AX
-	MOVL	12(SP), AX
-	MOVL	AX, SP
-	RET
-
-// Sigtramp's job is to call the actual signal handler.
-// It is called with the C calling convention, and calls out
-// to sigtrampgo with the Go calling convention.
-TEXT runtime·sigtramp(SB),NOSPLIT,$0
-	SUBL	$28, SP
-
-	// Save callee-save registers.
-	MOVL	BP, 12(SP)
-	MOVL	BX, 16(SP)
-	MOVL	SI, 20(SP)
-	MOVL	DI, 24(SP)
-
-	MOVL	32(SP), AX
-	MOVL	AX, 0(SP)	// arg 1 signal number
-	MOVL	36(SP), AX
-	MOVL	AX, 4(SP)	// arg 2 siginfo
-	MOVL	40(SP), AX
-	MOVL	AX, 8(SP)	// arg 3 ctxt
-	CALL	runtime·sigtrampgo(SB)
-
-	// Restore callee-save registers.
-	MOVL	12(SP), BP
-	MOVL	16(SP), BX
-	MOVL	20(SP), SI
-	MOVL	24(SP), DI
-
-	ADDL	$28, SP
-	RET
-
-TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
-	JMP	runtime·sigtramp(SB)
-
-TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 usec
-	MOVL	AX, 0(SP)
-	CALL	libc_usleep(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// func setldt(entry int, address int, limit int)
-TEXT runtime·setldt(SB),NOSPLIT,$32
-	// Nothing to do on Darwin, pthread already set thread-local storage up.
-	RET
-
-TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 mib
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 miblen
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 out
-	MOVL	AX, 8(SP)
-	MOVL	12(CX), AX		// arg 4 size
-	MOVL	AX, 12(SP)
-	MOVL	16(CX), AX		// arg 5 dst
-	MOVL	AX, 16(SP)
-	MOVL	20(CX), AX		// arg 6 ndst
-	MOVL	AX, 20(SP)
-	CALL	libc_sysctl(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	CALL	libc_kqueue(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 kq
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 ch
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 nch
-	MOVL	AX, 8(SP)
-	MOVL	12(CX), AX		// arg 4 ev
-	MOVL	AX, 12(SP)
-	MOVL	16(CX), AX		// arg 5 nev
-	MOVL	AX, 16(SP)
-	MOVL	20(CX), AX		// arg 6 ts
-	MOVL	AX, 20(SP)
-	CALL	libc_kevent(SB)
-	CMPL	AX, $-1
-	JNE	ok
-	CALL	libc_error(SB)
-	MOVL	(AX), AX		// errno
-	NEGL	AX			// caller wants it as a negative error code
-ok:
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX		// arg 1 fd
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX		// arg 2 cmd
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX		// arg 3 arg
-	MOVL	AX, 8(SP)
-	CALL	libc_fcntl(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// mstart_stub is the first function executed on a new thread started by pthread_create.
-// It just does some low-level setup and then calls mstart.
-// Note: called with the C calling convention.
-TEXT runtime·mstart_stub(SB),NOSPLIT,$0
-	// The value at SP+4 points to the m.
-	// We are already on m's g0 stack.
-
-	// Save callee-save registers.
-	SUBL	$16, SP
-	MOVL	BP, 0(SP)
-	MOVL	BX, 4(SP)
-	MOVL	SI, 8(SP)
-	MOVL	DI, 12(SP)
-
-	MOVL	SP, AX       // hide argument read from vet (vet thinks this function is using the Go calling convention)
-	MOVL	20(AX), DI   // m
-	MOVL	m_g0(DI), DX // g
-
-	// Initialize TLS entry.
-	// See cmd/link/internal/ld/sym.go:computeTLSOffset.
-	MOVL	DX, 0x18(GS)
-
-	// Someday the convention will be D is always cleared.
-	CLD
-
-	CALL	runtime·mstart(SB)
-
-	// Restore callee-save registers.
-	MOVL	0(SP), BP
-	MOVL	4(SP), BX
-	MOVL	8(SP), SI
-	MOVL	12(SP), DI
-
-	// Go is all done with this OS thread.
-	// Tell pthread everything is ok (we never join with this thread, so
-	// the value here doesn't really matter).
-	XORL	AX, AX
-
-	ADDL	$16, SP
-	RET
-
-TEXT runtime·pthread_attr_init_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 attr
-	MOVL	AX, 0(SP)
-	CALL	libc_pthread_attr_init(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_attr_getstacksize_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 attr
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 size
-	MOVL	AX, 4(SP)
-	CALL	libc_pthread_attr_getstacksize(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_attr_setdetachstate_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 attr
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 state
-	MOVL	AX, 4(SP)
-	CALL	libc_pthread_attr_setdetachstate(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_create_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	LEAL	16(SP), AX	// arg "0" &threadid (which we throw away)
-	MOVL	AX, 0(SP)
-	MOVL	0(CX), AX	// arg 1 attr
-	MOVL	AX, 4(SP)
-	MOVL	4(CX), AX	// arg 2 start
-	MOVL	AX, 8(SP)
-	MOVL	8(CX), AX	// arg 3 arg
-	MOVL	AX, 12(SP)
-	CALL	libc_pthread_create(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·raise_trampoline(SB),NOSPLIT,$0
-	PUSHL   BP
-	MOVL    SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL    0(CX), AX	// arg 1 sig
-	MOVL	AX, 0(SP)
-	CALL    libc_raise(SB)
-	MOVL    BP, SP
-	POPL    BP
-	RET
-
-TEXT runtime·pthread_mutex_init_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 mutex
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 attr
-	MOVL	AX, 4(SP)
-	CALL	libc_pthread_mutex_init(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_mutex_lock_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 mutex
-	MOVL	AX, 0(SP)
-	CALL	libc_pthread_mutex_lock(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_mutex_unlock_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 mutex
-	MOVL	AX, 0(SP)
-	CALL	libc_pthread_mutex_unlock(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_cond_init_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 cond
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 attr
-	MOVL	AX, 4(SP)
-	CALL	libc_pthread_cond_init(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_cond_wait_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 cond
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 mutex
-	MOVL	AX, 4(SP)
-	CALL	libc_pthread_cond_wait(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_cond_timedwait_relative_np_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	0(CX), AX	// arg 1 cond
-	MOVL	AX, 0(SP)
-	MOVL	4(CX), AX	// arg 2 mutex
-	MOVL	AX, 4(SP)
-	MOVL	8(CX), AX	// arg 3 timeout
-	MOVL	AX, 8(SP)
-	CALL	libc_pthread_cond_timedwait_relative_np(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-TEXT runtime·pthread_cond_signal_trampoline(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$8, SP
-	MOVL	16(SP), CX
-	MOVL	0(CX), AX	// arg 1 cond
-	MOVL	AX, 0(SP)
-	CALL	libc_pthread_cond_signal(SB)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// syscall calls a function in libc on behalf of the syscall package.
-// syscall takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	(0*4)(CX), AX // fn
-	MOVL	(1*4)(CX), DX // a1
-	MOVL	DX, 0(SP)
-	MOVL	(2*4)(CX), DX // a2
-	MOVL	DX, 4(SP)
-	MOVL	(3*4)(CX), DX // a3
-	MOVL	DX, 8(SP)
-
-	CALL	AX
-
-	MOVL	32(SP), CX
-	MOVL	AX, (4*4)(CX) // r1
-	MOVL	DX, (5*4)(CX) // r2
-
-	// Standard libc functions return -1 on error
-	// and set errno.
-	CMPL	AX, $-1
-	JNE	ok
-
-	// Get error code from libc.
-	CALL	libc_error(SB)
-	MOVL	(AX), AX
-	MOVL	32(SP), CX
-	MOVL	AX, (6*4)(CX) // err
-
-ok:
-	XORL	AX, AX        // no error (it's ignored anyway)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// syscallPtr is like syscall except the libc function reports an
-// error by returning NULL and setting errno.
-TEXT runtime·syscallPtr(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	(0*4)(CX), AX // fn
-	MOVL	(1*4)(CX), DX // a1
-	MOVL	DX, 0(SP)
-	MOVL	(2*4)(CX), DX // a2
-	MOVL	DX, 4(SP)
-	MOVL	(3*4)(CX), DX // a3
-	MOVL	DX, 8(SP)
-
-	CALL	AX
-
-	MOVL	32(SP), CX
-	MOVL	AX, (4*4)(CX) // r1
-	MOVL	DX, (5*4)(CX) // r2
-
-	// syscallPtr libc functions return NULL on error
-	// and set errno.
-	TESTL	AX, AX
-	JNE	ok
-
-	// Get error code from libc.
-	CALL	libc_error(SB)
-	MOVL	(AX), AX
-	MOVL	32(SP), CX
-	MOVL	AX, (6*4)(CX) // err
-
-ok:
-	XORL	AX, AX        // no error (it's ignored anyway)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// syscall6 calls a function in libc on behalf of the syscall package.
-// syscall6 takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall6 must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall6(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	(0*4)(CX), AX // fn
-	MOVL	(1*4)(CX), DX // a1
-	MOVL	DX, 0(SP)
-	MOVL	(2*4)(CX), DX // a2
-	MOVL	DX, 4(SP)
-	MOVL	(3*4)(CX), DX // a3
-	MOVL	DX, 8(SP)
-	MOVL	(4*4)(CX), DX // a4
-	MOVL	DX, 12(SP)
-	MOVL	(5*4)(CX), DX // a5
-	MOVL	DX, 16(SP)
-	MOVL	(6*4)(CX), DX // a6
-	MOVL	DX, 20(SP)
-
-	CALL	AX
-
-	MOVL	32(SP), CX
-	MOVL	AX, (7*4)(CX) // r1
-	MOVL	DX, (8*4)(CX) // r2
-
-	// Standard libc functions return -1 on error
-	// and set errno.
-	CMPL	AX, $-1
-	JNE	ok
-
-	// Get error code from libc.
-	CALL	libc_error(SB)
-	MOVL	(AX), AX
-	MOVL	32(SP), CX
-	MOVL	AX, (9*4)(CX) // err
-
-ok:
-	XORL	AX, AX        // no error (it's ignored anyway)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// syscall6X calls a function in libc on behalf of the syscall package.
-// syscall6X takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall6X must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall6X(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$24, SP
-	MOVL	32(SP), CX
-	MOVL	(0*4)(CX), AX // fn
-	MOVL	(1*4)(CX), DX // a1
-	MOVL	DX, 0(SP)
-	MOVL	(2*4)(CX), DX // a2
-	MOVL	DX, 4(SP)
-	MOVL	(3*4)(CX), DX // a3
-	MOVL	DX, 8(SP)
-	MOVL	(4*4)(CX), DX // a4
-	MOVL	DX, 12(SP)
-	MOVL	(5*4)(CX), DX // a5
-	MOVL	DX, 16(SP)
-	MOVL	(6*4)(CX), DX // a6
-	MOVL	DX, 20(SP)
-
-	CALL	AX
-
-	MOVL	32(SP), CX
-	MOVL	AX, (7*4)(CX) // r1
-	MOVL	DX, (8*4)(CX) // r2
-
-	// Standard libc functions return -1 on error
-	// and set errno.
-	CMPL	AX, $-1
-	JNE	ok
-	CMPL	DX, $-1
-	JNE	ok
-
-	// Get error code from libc.
-	CALL	libc_error(SB)
-	MOVL	(AX), AX
-	MOVL	32(SP), CX
-	MOVL	AX, (9*4)(CX) // err
-
-ok:
-	XORL	AX, AX        // no error (it's ignored anyway)
-	MOVL	BP, SP
-	POPL	BP
-	RET
-
-// syscall9 calls a function in libc on behalf of the syscall package.
-// syscall9 takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	a7    uintptr
-//	a8    uintptr
-//	a9    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall9 must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall9(SB),NOSPLIT,$0
-	PUSHL	BP
-	MOVL	SP, BP
-	SUBL	$40, SP
-	MOVL	48(SP), CX
-	MOVL	(0*4)(CX), AX // fn
-	MOVL	(1*4)(CX), DX // a1
-	MOVL	DX, 0(SP)
-	MOVL	(2*4)(CX), DX // a2
-	MOVL	DX, 4(SP)
-	MOVL	(3*4)(CX), DX // a3
-	MOVL	DX, 8(SP)
-	MOVL	(4*4)(CX), DX // a4
-	MOVL	DX, 12(SP)
-	MOVL	(5*4)(CX), DX // a5
-	MOVL	DX, 16(SP)
-	MOVL	(6*4)(CX), DX // a6
-	MOVL	DX, 20(SP)
-	MOVL	(7*4)(CX), DX // a7
-	MOVL	DX, 24(SP)
-	MOVL	(8*4)(CX), DX // a8
-	MOVL	DX, 28(SP)
-	MOVL	(9*4)(CX), DX // a9
-	MOVL	DX, 32(SP)
-
-	CALL	AX
-
-	MOVL	48(SP), CX
-	MOVL	AX, (10*4)(CX) // r1
-	MOVL	DX, (11*4)(CX) // r2
-
-	// Standard libc functions return -1 on error
-	// and set errno.
-	CMPL	AX, $-1
-	JNE	ok
-
-	// Get error code from libc.
-	CALL	libc_error(SB)
-	MOVL	(AX), AX
-	MOVL	48(SP), CX
-	MOVL	AX, (12*4)(CX) // err
-
-ok:
-	XORL	AX, AX        // no error (it's ignored anyway)
-	MOVL	BP, SP
-	POPL	BP
-	RET
diff --git a/src/runtime/sys_darwin_64.go b/src/runtime/sys_darwin_64.go
deleted file mode 100644
index 07b0bb5..0000000
--- a/src/runtime/sys_darwin_64.go
+++ /dev/null
@@ -1,21 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build darwin
-// +build amd64 arm64
-
-package runtime
-
-import "unsafe"
-
-//go:linkname syscall_syscallX syscall.syscallX
-//go:nosplit
-//go:cgo_unsafe_args
-func syscall_syscallX(fn, a1, a2, a3 uintptr) (r1, r2, err uintptr) {
-	entersyscallblock()
-	libcCall(unsafe.Pointer(funcPC(syscallX)), unsafe.Pointer(&fn))
-	exitsyscall()
-	return
-}
-func syscallX()
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index 87c8db8..825852d 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -46,6 +46,12 @@
 	MOVL	16(DI), DX		// arg 3 count
 	MOVL	0(DI), DI		// arg 1 fd
 	CALL	libc_read(SB)
+	TESTL	AX, AX
+	JGE	noerr
+	CALL	libc_error(SB)
+	MOVL	(AX), AX
+	NEGL	AX			// caller expects negative errno value
+noerr:
 	POPQ	BP
 	RET
 
@@ -56,6 +62,12 @@
 	MOVL	16(DI), DX		// arg 3 count
 	MOVQ	0(DI), DI		// arg 1 fd
 	CALL	libc_write(SB)
+	TESTL	AX, AX
+	JGE	noerr
+	CALL	libc_error(SB)
+	MOVL	(AX), AX
+	NEGL	AX			// caller expects negative errno value
+noerr:
 	POPQ	BP
 	RET
 
@@ -554,6 +566,24 @@
 	POPQ	BP
 	RET
 
+TEXT runtime·pthread_self_trampoline(SB),NOSPLIT,$0
+	PUSHQ	BP
+	MOVQ	SP, BP
+	MOVQ	DI, BX		// BX is caller-save
+	CALL	libc_pthread_self(SB)
+	MOVQ	AX, 0(BX)	// return value
+	POPQ	BP
+	RET
+
+TEXT runtime·pthread_kill_trampoline(SB),NOSPLIT,$0
+	PUSHQ	BP
+	MOVQ	SP, BP
+	MOVQ	8(DI), SI	// arg 2 sig
+	MOVQ	0(DI), DI	// arg 1 thread
+	CALL	libc_pthread_kill(SB)
+	POPQ	BP
+	RET
+
 // syscall calls a function in libc on behalf of the syscall package.
 // syscall takes a pointer to a struct like:
 // struct {
@@ -795,3 +825,29 @@
 	MOVQ	BP, SP
 	POPQ	BP
 	RET
+
+// syscallNoErr is like syscall6 but does not check for errors, and
+// only returns one value, for use with standard C ABI library functions.
+TEXT runtime·syscallNoErr(SB),NOSPLIT,$0
+	PUSHQ	BP
+	MOVQ	SP, BP
+	SUBQ	$16, SP
+	MOVQ	(0*8)(DI), R11// fn
+	MOVQ	(2*8)(DI), SI // a2
+	MOVQ	(3*8)(DI), DX // a3
+	MOVQ	(4*8)(DI), CX // a4
+	MOVQ	(5*8)(DI), R8 // a5
+	MOVQ	(6*8)(DI), R9 // a6
+	MOVQ	DI, (SP)
+	MOVQ	(1*8)(DI), DI // a1
+	XORL	AX, AX	      // vararg: say "no float args"
+
+	CALL	R11
+
+	MOVQ	(SP), DI
+	MOVQ	AX, (7*8)(DI) // r1
+
+	XORL	AX, AX        // no error (it's ignored anyway)
+	MOVQ	BP, SP
+	POPQ	BP
+	RET
diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s
deleted file mode 100644
index 996f802..0000000
--- a/src/runtime/sys_darwin_arm.s
+++ /dev/null
@@ -1,589 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// System calls and other sys.stuff for ARM, Darwin
-// System calls are implemented in libSystem, this file contains
-// trampolines that convert from Go to C calling convention.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "textflag.h"
-
-TEXT notok<>(SB),NOSPLIT,$0
-	MOVW	$0, R8
-	MOVW	R8, (R8)
-	B		0(PC)
-
-TEXT runtime·open_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 mode
-	MOVW	8(R0), R2	// arg 3 perm
-	MOVW	0(R0), R0	// arg 1 name
-	BL	libc_open(SB)
-	RET
-
-TEXT runtime·close_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 fd
-	BL	libc_close(SB)
-	RET
-
-TEXT runtime·write_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 buf
-	MOVW	8(R0), R2	// arg 3 count
-	MOVW	0(R0), R0	// arg 1 fd
-	BL	libc_write(SB)
-	RET
-
-TEXT runtime·read_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 buf
-	MOVW	8(R0), R2	// arg 3 count
-	MOVW	0(R0), R0	// arg 1 fd
-	BL	libc_read(SB)
-	RET
-
-TEXT runtime·pipe_trampoline(SB),NOSPLIT,$0
-	BL	libc_pipe(SB)	// pointer already in R0
-	CMP	$0, R0
-	BEQ	3(PC)
-	BL	libc_error(SB)	// return negative errno value
-	RSB	$0, R0, R0
-	RET
-
-TEXT runtime·exit_trampoline(SB),NOSPLIT|NOFRAME,$0
-	MOVW	0(R0), R0	// arg 0 code
-	BL libc_exit(SB)
-	MOVW	$1234, R0
-	MOVW	$1002, R1
-	MOVW	R0, (R1)	// fail hard
-
-TEXT runtime·raiseproc_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R8	// signal
-	BL	libc_getpid(SB)
-	// arg 1 pid already in R0 from getpid
-	MOVW	R8, R1	// arg 2 signal
-	BL	libc_kill(SB)
-	RET
-
-TEXT runtime·mmap_trampoline(SB),NOSPLIT,$0
-	MOVW	R0, R8
-	MOVW	0(R8), R0	// arg 1 addr
-	MOVW	4(R8), R1	// arg 2 len
-	MOVW	8(R8), R2	// arg 3 prot
-	MOVW	12(R8), R3	// arg 4 flags
-	MOVW	16(R8), R4	// arg 5 fid
-	MOVW	20(R8), R5	// arg 6 offset
-	MOVW	$0, R6	// off_t is uint64_t
-	// Only R0-R3 are used for arguments, the rest
-	// go on the stack.
-	MOVM.DB.W [R4-R6], (R13)
-	BL	libc_mmap(SB)
-	ADD $12, R13
-	MOVW	$0, R1
-	MOVW	$-1, R2
-	CMP	R0, R2
-	BNE ok
-	BL	libc_error(SB)
-	MOVW	(R0), R1
-	MOVW	$0, R0
-ok:
-	MOVW	R0, 24(R8)	// ret 1 addr
-	MOVW	R1, 28(R8)	// ret 2 err
-	RET
-
-TEXT runtime·munmap_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 len
-	MOVW	0(R0), R0	// arg 1 addr
-	BL libc_munmap(SB)
-	MOVW	$-1, R2
-	CMP	R0, R2
-	BL.EQ	notok<>(SB)
-	RET
-
-TEXT runtime·madvise_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 len
-	MOVW	8(R0), R2	// arg 3 advice
-	MOVW	0(R0), R0	// arg 1 addr
-	BL	libc_madvise(SB)
-	MOVW	$-1, R2
-	CMP	R0, R2
-	BL.EQ	notok<>(SB)
-	RET
-
-TEXT runtime·setitimer_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 new
-	MOVW	8(R0), R2	// arg 3 old
-	MOVW	0(R0), R0	// arg 1 which
-	BL	libc_setitimer(SB)
-	RET
-
-TEXT runtime·walltime_trampoline(SB),NOSPLIT,$0
-	// R0 already has *timeval
-	MOVW	$0, R1 // no timezone needed
-	BL	libc_gettimeofday(SB)
-	RET
-
-GLOBL timebase<>(SB),NOPTR,$(machTimebaseInfo__size)
-
-TEXT runtime·nanotime_trampoline(SB),NOSPLIT,$0
-	MOVW	R0, R8
-	BL	libc_mach_absolute_time(SB)
-	MOVW	R0, 0(R8)
-	MOVW	R1, 4(R8)
-	MOVW	timebase<>+machTimebaseInfo_numer(SB), R6
-	MOVW	$timebase<>+machTimebaseInfo_denom(SB), R5
-	MOVW	(R5), R7
-	DMB	MB_ISH	// memory barrier for atomic read
-	CMP	$0, R7
-	BNE	initialized
-
-	SUB	$(machTimebaseInfo__size+7)/8*8, R13
-	MOVW	R13, R0
-	BL	libc_mach_timebase_info(SB)
-	MOVW	machTimebaseInfo_numer(R13), R6
-	MOVW	machTimebaseInfo_denom(R13), R7
-	ADD	$(machTimebaseInfo__size+7)/8*8, R13
-
-	MOVW	R6, timebase<>+machTimebaseInfo_numer(SB)
-	MOVW	$timebase<>+machTimebaseInfo_denom(SB), R5
-	DMB	MB_ISH	// memory barrier for atomic write
-	MOVW	R7, (R5)
-	DMB	MB_ISH
-
-initialized:
-	MOVW	R6, 8(R8)
-	MOVW	R7, 12(R8)
-	RET
-
-TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
-	MOVW	sig+4(FP), R0
-	MOVW	info+8(FP), R1
-	MOVW	ctx+12(FP), R2
-	MOVW	fn+0(FP), R11
-	MOVW	R13, R4
-	SUB	$24, R13
-	BIC	$0x7, R13 // alignment for ELF ABI
-	BL	(R11)
-	MOVW	R4, R13
-	RET
-
-TEXT runtime·sigtramp(SB),NOSPLIT,$0
-	// Reserve space for callee-save registers and arguments.
-	SUB	$40, R13
-
-	MOVW	R4, 16(R13)
-	MOVW	R5, 20(R13)
-	MOVW	R6, 24(R13)
-	MOVW	R7, 28(R13)
-	MOVW	R8, 32(R13)
-	MOVW	R11, 36(R13)
-
-	// Save arguments.
-	MOVW	R0, 4(R13)	// sig
-	MOVW	R1, 8(R13)	// info
-	MOVW	R2, 12(R13)	// ctx
-
-	// this might be called in external code context,
-	// where g is not set.
-	MOVB	runtime·iscgo(SB), R0
-	CMP 	$0, R0
-	BL.NE	runtime·load_g(SB)
-
-	MOVW	R13, R6
-	CMP	$0, g
-	BEQ nog
-
-	// iOS always use the main stack to run the signal handler.
-	// We need to switch to gsignal ourselves.
-	MOVW	g_m(g), R11
-	MOVW	m_gsignal(R11), R5
-	MOVW	(g_stack+stack_hi)(R5), R6
-
-nog:
-	// Restore arguments.
-	MOVW	4(R13), R0
-	MOVW	8(R13), R1
-	MOVW	12(R13), R2
-
-	// Reserve space for args and the stack pointer on the
-	// gsignal stack.
-	SUB $24, R6
-	// Save stack pointer.
-	MOVW	R13, R4
-	MOVW	R4, 16(R6)
-	// Switch to gsignal stack.
-	MOVW	R6, R13
-
-	// Call sigtrampgo
-	MOVW	R0, 4(R13)
-	MOVW	R1, 8(R13)
-	MOVW	R2, 12(R13)
-	BL	runtime·sigtrampgo(SB)
-
-	// Switch to old stack.
-	MOVW	16(R13), R5
-	MOVW	R5, R13
-
-	// Restore callee-save registers.
-	MOVW	16(R13), R4
-	MOVW	20(R13), R5
-	MOVW	24(R13), R6
-	MOVW	28(R13), R7
-	MOVW	32(R13), R8
-	MOVW	36(R13), R11
-
-	ADD	$40, R13
-
-	RET
-
-TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
-	JMP	runtime·sigtramp(SB)
-
-TEXT runtime·sigprocmask_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 new
-	MOVW	8(R0), R2	// arg 3 old
-	MOVW	0(R0), R0	// arg 1 how
-	BL	libc_pthread_sigmask(SB)
-	CMP	$0, R0
-	BL.NE	notok<>(SB)
-	RET
-
-TEXT runtime·sigaction_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 new
-	MOVW	8(R0), R2	// arg 3 old
-	MOVW	0(R0), R0	// arg 1 how
-	BL	libc_sigaction(SB)
-	RET
-
-TEXT runtime·usleep_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 usec
-	BL libc_usleep(SB)
-	RET
-
-TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
-	B	runtime·armPublicationBarrier(SB)
-
-TEXT runtime·sysctl_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 miblen
-	MOVW	8(R0), R2	// arg 3 out
-	MOVW	12(R0), R3	// arg 4 size
-	MOVW	16(R0), R4	// arg 5 dst
-	MOVW	20(R0), R5	// arg 6 ndst
-	MOVW	0(R0), R0	// arg 1 mib
-	// Only R0-R3 are used for arguments, the rest
-	// go on the stack.
-	MOVM.DB.W [R4-R5], (R13)
-	BL	libc_sysctl(SB)
-	ADD $(2*4), R13
-	RET
-
-TEXT runtime·kqueue_trampoline(SB),NOSPLIT,$0
-	BL	libc_kqueue(SB)
-	RET
-
-// int32 runtime·kevent(int kq, Kevent *changelist, int nchanges, Kevent *eventlist, int events, Timespec *timeout)
-TEXT runtime·kevent_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 keventss
-	MOVW	8(R0), R2	// arg 3 nch
-	MOVW	12(R0), R3	// arg 4 ev
-	MOVW	16(R0), R4	// arg 5 nev
-	MOVW	20(R0), R5	// arg 6 ts
-	MOVW	0(R0), R0	// arg 1 kq
-	// Only R0-R3 are used for arguments, the rest
-	// go on the stack.
-	MOVM.DB.W [R4-R5], (R13)
-	BL	libc_kevent(SB)
-	ADD	$(2*4), R13
-	MOVW	$-1, R2
-	CMP	R0, R2
-	BNE	ok
-	BL	libc_error(SB)
-	MOVW	(R0), R0	// errno
-	RSB	$0, R0, R0	// caller wants it as a negative error code
-ok:
-	RET
-
-TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 cmd
-	MOVW	8(R0), R2	// arg 3 arg
-	MOVW	0(R0), R0	// arg 1 fd
-	BL	libc_fcntl(SB)
-	RET
-
-// sigaltstack is not supported on iOS, so our sigtramp has
-// to do the stack switch ourselves.
-TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0
-	MOVW	$43, R0
-	BL	libc_exit(SB)
-	RET
-
-// Thread related functions
-// Note: On darwin/arm, the runtime always use runtime/cgo to
-// create threads, so all thread related functions will just exit with a
-// unique status.
-
-TEXT runtime·mstart_stub(SB),NOSPLIT,$0
-	MOVW	$44, R0
-	BL	libc_exit(SB)
-	RET
-
-TEXT runtime·pthread_attr_init_trampoline(SB),NOSPLIT,$0
-	MOVW	$45, R0
-	BL	libc_exit(SB)
-	RET
-
-TEXT runtime·pthread_attr_getstacksize_trampoline(SB),NOSPLIT,$0
-	MOVW	$46, R0
-	BL	libc_exit(SB)
-	RET
-
-TEXT runtime·pthread_attr_setdetachstate_trampoline(SB),NOSPLIT,$0
-	MOVW	$47, R0
-	BL	libc_exit(SB)
-	RET
-
-TEXT runtime·pthread_create_trampoline(SB),NOSPLIT,$0
-	MOVW	$48, R0
-	BL	libc_exit(SB)
-	RET
-
-TEXT runtime·raise_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 sig
-	BL	libc_raise(SB)
-	RET
-
-TEXT runtime·pthread_mutex_init_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 attr
-	MOVW	0(R0), R0	// arg 1 mutex
-	BL	libc_pthread_mutex_init(SB)
-	RET
-
-TEXT runtime·pthread_mutex_lock_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 mutex
-	BL	libc_pthread_mutex_lock(SB)
-	RET
-
-TEXT runtime·pthread_mutex_unlock_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 mutex
-	BL	libc_pthread_mutex_unlock(SB)
-	RET
-
-TEXT runtime·pthread_cond_init_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 attr
-	MOVW	0(R0), R0	// arg 1 cond
-	BL	libc_pthread_cond_init(SB)
-	RET
-
-TEXT runtime·pthread_cond_wait_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 mutex
-	MOVW	0(R0), R0	// arg 1 cond
-	BL	libc_pthread_cond_wait(SB)
-	RET
-
-TEXT runtime·pthread_cond_timedwait_relative_np_trampoline(SB),NOSPLIT,$0
-	MOVW	4(R0), R1	// arg 2 mutex
-	MOVW	8(R0), R2	// arg 3 timeout
-	MOVW	0(R0), R0	// arg 1 cond
-	BL	libc_pthread_cond_timedwait_relative_np(SB)
-	RET
-
-TEXT runtime·pthread_cond_signal_trampoline(SB),NOSPLIT,$0
-	MOVW	0(R0), R0	// arg 1 cond
-	BL	libc_pthread_cond_signal(SB)
-	RET
-
-// syscall calls a function in libc on behalf of the syscall package.
-// syscall takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall(SB),NOSPLIT,$0
-	MOVW.W	R0, -4(R13)	// push structure pointer
-	MOVW	0(R0), R12	// fn
-	MOVW	8(R0), R1	// a2
-	MOVW	12(R0), R2	// a3
-	MOVW	4(R0), R0	// a1
-	BL	(R12)
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 16(R2)	// save r1
-	MOVW	R1, 20(R2)	// save r2
-	MOVW	$-1, R3
-	CMP	R0, R3
-	BNE	ok
-	MOVW.W	R2, -4(R13)	// push structure pointer
-	BL	libc_error(SB)
-	MOVW	(R0), R0
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 24(R2)	// save err
-ok:
-	RET
-
-// syscallPtr is like syscall except the libc function reports an
-// error by returning NULL and setting errno.
-TEXT runtime·syscallPtr(SB),NOSPLIT,$0
-	MOVW.W	R0, -4(R13)	// push structure pointer
-	MOVW	0(R0), R12	// fn
-	MOVW	8(R0), R1	// a2
-	MOVW	12(R0), R2	// a3
-	MOVW	4(R0), R0	// a1
-	BL	(R12)
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 16(R2)	// save r1
-	MOVW	R1, 20(R2)	// save r2
-	MOVW	$0, R3
-	CMP	R0, R3
-	BNE	ok
-	MOVW.W	R2, -4(R13)	// push structure pointer
-	BL	libc_error(SB)
-	MOVW	(R0), R0
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 24(R2)	// save err
-ok:
-	RET
-
-// syscall6 calls a function in libc on behalf of the syscall package.
-// syscall6 takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall6 must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall6(SB),NOSPLIT,$0
-	MOVW.W	R0, -4(R13)	// push structure pointer
-	MOVW	0(R0), R12	// fn
-	MOVW	24(R0), R1	// a6
-	MOVW.W	R1, -4(R13)
-	MOVW	20(R0), R1	// a5
-	MOVW.W	R1, -4(R13)
-	MOVW	8(R0), R1	// a2
-	MOVW	12(R0), R2	// a3
-	MOVW	16(R0), R3	// a4
-	MOVW	4(R0), R0	// a1
-	BL	(R12)
-	ADD	$8, R13
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 28(R2)	// save r1
-	MOVW	R1, 32(R2)	// save r2
-	MOVW	$-1, R3
-	CMP	R0, R3
-	BNE	ok
-	MOVW.W	R2, -4(R13)	// push structure pointer
-	BL	libc_error(SB)
-	MOVW	(R0), R0
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 36(R2)	// save err
-ok:
-	RET
-
-// syscall6X calls a function in libc on behalf of the syscall package.
-// syscall6X takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall6X must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall6X(SB),NOSPLIT,$0
-	MOVW.W	R0, -4(R13)	// push structure pointer
-	MOVW	0(R0), R12	// fn
-	MOVW	24(R0), R1	// a6
-	MOVW.W	R1, -4(R13)
-	MOVW	20(R0), R1	// a5
-	MOVW.W	R1, -4(R13)
-	MOVW	8(R0), R1	// a2
-	MOVW	12(R0), R2	// a3
-	MOVW	16(R0), R3	// a4
-	MOVW	4(R0), R0	// a1
-	BL	(R12)
-	ADD	$8, R13
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 28(R2)	// save r1
-	MOVW	R1, 32(R2)	// save r2
-	MOVW	$-1, R3
-	CMP	R0, R3
-	BNE	ok
-	CMP	R1, R3
-	BNE	ok
-	MOVW.W	R2, -4(R13)	// push structure pointer
-	BL	libc_error(SB)
-	MOVW	(R0), R0
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 36(R2)	// save err
-ok:
-	RET
-
-// syscall9 calls a function in libc on behalf of the syscall package.
-// syscall9 takes a pointer to a struct like:
-// struct {
-//	fn    uintptr
-//	a1    uintptr
-//	a2    uintptr
-//	a3    uintptr
-//	a4    uintptr
-//	a5    uintptr
-//	a6    uintptr
-//	a7    uintptr
-//	a8    uintptr
-//	a9    uintptr
-//	r1    uintptr
-//	r2    uintptr
-//	err   uintptr
-// }
-// syscall9 must be called on the g0 stack with the
-// C calling convention (use libcCall).
-TEXT runtime·syscall9(SB),NOSPLIT,$0
-	MOVW.W	R0, -4(R13)	// push structure pointer
-	MOVW	0(R0), R12	// fn
-	MOVW	36(R0), R1	// a9
-	MOVW.W	R1, -4(R13)
-	MOVW	32(R0), R1	// a8
-	MOVW.W	R1, -4(R13)
-	MOVW	28(R0), R1	// a7
-	MOVW.W	R1, -4(R13)
-	MOVW	24(R0), R1	// a6
-	MOVW.W	R1, -4(R13)
-	MOVW	20(R0), R1	// a5
-	MOVW.W	R1, -4(R13)
-	MOVW	8(R0), R1	// a2
-	MOVW	12(R0), R2	// a3
-	MOVW	16(R0), R3	// a4
-	MOVW	4(R0), R0	// a1
-	BL	(R12)
-	ADD	$20, R13
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 40(R2)	// save r1
-	MOVW	R1, 44(R2)	// save r2
-	MOVW	$-1, R3
-	CMP	R0, R3
-	BNE	ok
-	MOVW.W	R2, -4(R13)	// push structure pointer
-	BL	libc_error(SB)
-	MOVW	(R0), R0
-	MOVW.P	4(R13), R2	// pop structure pointer
-	MOVW	R0, 48(R2)	// save err
-ok:
-	RET
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index ac3ca74..585d4f2 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -35,6 +35,13 @@
 	MOVW	16(R0), R2	// arg 3 count
 	MOVW	0(R0), R0	// arg 1 fd
 	BL	libc_write(SB)
+	MOVD	$-1, R1
+	CMP	R0, R1
+	BNE	noerr
+	BL	libc_error(SB)
+	MOVW	(R0), R0
+	NEG	R0, R0		// caller expects negative errno value
+noerr:
 	RET
 
 TEXT runtime·read_trampoline(SB),NOSPLIT,$0
@@ -42,6 +49,13 @@
 	MOVW	16(R0), R2	// arg 3 count
 	MOVW	0(R0), R0	// arg 1 fd
 	BL	libc_read(SB)
+	MOVD	$-1, R1
+	CMP	R0, R1
+	BNE	noerr
+	BL	libc_error(SB)
+	MOVW	(R0), R0
+	NEG	R0, R0		// caller expects negative errno value
+noerr:
 	RET
 
 TEXT runtime·pipe_trampoline(SB),NOSPLIT,$0
@@ -457,6 +471,18 @@
 	BL	libc_pthread_cond_signal(SB)
 	RET
 
+TEXT runtime·pthread_self_trampoline(SB),NOSPLIT,$0
+	MOVD	R0, R19		// R19 is callee-save
+	BL	libc_pthread_self(SB)
+	MOVD	R0, 0(R19)	// return value
+	RET
+
+TEXT runtime·pthread_kill_trampoline(SB),NOSPLIT,$0
+	MOVD	8(R0), R1	// arg 2 sig
+	MOVD	0(R0), R0	// arg 1 thread
+	BL	libc_pthread_kill(SB)
+	RET
+
 // syscall calls a function in libc on behalf of the syscall package.
 // syscall takes a pointer to a struct like:
 // struct {
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index b771850..580633a 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -104,27 +104,46 @@
 	MOVL	$3, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVL	$42, AX
+	SYSCALL
+	JCC	pipeok
+	MOVL	$-1,r+0(FP)
+	MOVL	$-1,w+4(FP)
+	MOVL	AX, errno+8(FP)
+	RET
+pipeok:
+	MOVL	AX, r+0(FP)
+	MOVL	DX, w+4(FP)
+	MOVL	$0, errno+8(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-8
 	MOVQ	fd+0(FP), DI		// arg 1 fd
 	MOVQ	p+8(FP), SI		// arg 2 buf
 	MOVL	n+16(FP), DX		// arg 3 count
 	MOVL	$4, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
+TEXT runtime·lwp_gettid(SB),NOSPLIT,$0-4
 	MOVL	$496, AX	// lwp_gettid
 	SYSCALL
-	MOVQ	$-1, DI		// arg 1 - pid
-	MOVQ	AX, SI		// arg 2 - tid
-	MOVL	sig+0(FP), DX	// arg 3 - signum
+	MOVL	AX, ret+0(FP)
+	RET
+
+TEXT runtime·lwp_kill(SB),NOSPLIT,$0-16
+	MOVL	pid+0(FP), DI	// arg 1 - pid
+	MOVL	tid+4(FP), SI	// arg 2 - tid
+	MOVQ	sig+8(FP), DX	// arg 3 - signum
 	MOVL	$497, AX	// lwp_kill
 	SYSCALL
 	RET
@@ -146,8 +165,8 @@
 	SYSCALL
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVL	$232, AX // clock_gettime
 	MOVQ	$0, DI  	// CLOCK_REALTIME
 	LEAQ	8(SP), SI
@@ -160,7 +179,7 @@
 	MOVL	DX, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB), NOSPLIT, $32
+TEXT runtime·nanotime1(SB), NOSPLIT, $32
 	MOVL	$232, AX
 	MOVQ	$4, DI  	// CLOCK_MONOTONIC
 	LEAQ	8(SP), SI
@@ -371,3 +390,18 @@
 	MOVL	$92, AX		// fcntl
 	SYSCALL
 	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL    fd+0(FP), DI  // fd
+	MOVQ    $3, SI  // F_GETFL
+	MOVQ    $0, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	MOVL	fd+0(FP), DI // fd
+	MOVQ	$4, SI // F_SETFL
+	MOVQ	$4, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	RET
diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s
index 35f357a..c346e71 100644
--- a/src/runtime/sys_freebsd_386.s
+++ b/src/runtime/sys_freebsd_386.s
@@ -93,29 +93,54 @@
 	MOVL	$3, AX
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-4
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$8-12
+	MOVL	$42, AX
+	INT	$0x80
+	JAE	ok
+	MOVL	$0, r+0(FP)
+	MOVL	$0, w+4(FP)
+	MOVL	AX, errno+8(FP)
+	RET
+ok:
+	MOVL	AX, r+0(FP)
+	MOVL	DX, w+4(FP)
+	MOVL	$0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$12-16
+	MOVL	$542, AX
+	LEAL	r+4(FP), BX
+	MOVL	BX, 4(SP)
+	MOVL	flags+0(FP), BX
+	MOVL	BX, 8(SP)
+	INT	$0x80
+	MOVL	AX, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-4
 	MOVL	$4, AX
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
-	// thr_self(&8(SP))
-	LEAL	8(SP), AX
+TEXT runtime·thr_self(SB),NOSPLIT,$8-4
+	// thr_self(&0(FP))
+	LEAL	ret+0(FP), AX
 	MOVL	AX, 4(SP)
 	MOVL	$432, AX
 	INT	$0x80
-	// thr_kill(self, SIGPIPE)
-	MOVL	8(SP), AX
-	MOVL	AX, 4(SP)
-	MOVL	sig+0(FP), AX
-	MOVL	AX, 8(SP)
+	RET
+
+TEXT runtime·thr_kill(SB),NOSPLIT,$-4
+	// thr_kill(tid, sig)
 	MOVL	$433, AX
 	INT	$0x80
 	RET
@@ -412,6 +437,23 @@
 	NEGL	AX
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$16-4
+	MOVL	$92, AX // fcntl
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$3, 8(SP) // F_GETFL
+	MOVL	$0, 12(SP)
+	INT	$0x80
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$4, 8(SP) // F_SETFL
+	ORL	$4, AX // O_NONBLOCK
+	MOVL	AX, 12(SP)
+	MOVL	$92, AX // fcntl
+	INT	$0x80
+	RET
+
 // func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
 TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-28
 	MOVL	$487, AX
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index 55959b3..010b2ec 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -93,29 +93,56 @@
 	MOVL	$3, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVL	$42, AX
+	SYSCALL
+	JCC	ok
+	MOVL	$0, r+0(FP)
+	MOVL	$0, w+4(FP)
+	MOVL	AX, errno+8(FP)
+	RET
+ok:
+	MOVL	AX, r+0(FP)
+	MOVL	DX, w+4(FP)
+	MOVL	$0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-20
+	LEAQ	r+8(FP), DI
+	MOVL	flags+0(FP), SI
+	MOVL	$542, AX
+	SYSCALL
+	MOVL	AX, errno+16(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-8
 	MOVQ	fd+0(FP), DI		// arg 1 fd
 	MOVQ	p+8(FP), SI		// arg 2 buf
 	MOVL	n+16(FP), DX		// arg 3 count
 	MOVL	$4, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
-	// thr_self(&8(SP))
-	LEAQ	8(SP), DI	// arg 1 &8(SP)
+TEXT runtime·thr_self(SB),NOSPLIT,$0-8
+	// thr_self(&0(FP))
+	LEAQ	ret+0(FP), DI	// arg 1
 	MOVL	$432, AX
 	SYSCALL
-	// thr_kill(self, SIGPIPE)
-	MOVQ	8(SP), DI	// arg 1 id
-	MOVL	sig+0(FP), SI	// arg 2
+	RET
+
+TEXT runtime·thr_kill(SB),NOSPLIT,$0-16
+	// thr_kill(tid, sig)
+	MOVQ	tid+0(FP), DI	// arg 1 id
+	MOVQ	sig+8(FP), SI	// arg 2 sig
 	MOVL	$433, AX
 	SYSCALL
 	RET
@@ -447,6 +474,21 @@
 	SYSCALL
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL    fd+0(FP), DI  // fd
+	MOVQ    $3, SI  // F_GETFL
+	MOVQ    $0, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	MOVL	fd+0(FP), DI // fd
+	MOVQ	$4, SI // F_SETFL
+	MOVQ	$4, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	RET
+
 // func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
 TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-44
 	MOVQ	level+0(FP), DI
diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s
index f347b9f..1e12f9c 100644
--- a/src/runtime/sys_freebsd_arm.s
+++ b/src/runtime/sys_freebsd_arm.s
@@ -20,6 +20,7 @@
 #define SYS_close (SYS_BASE + 6)
 #define SYS_getpid (SYS_BASE + 20)
 #define SYS_kill (SYS_BASE + 37)
+#define SYS_pipe (SYS_BASE + 42)
 #define SYS_sigaltstack (SYS_BASE + 53)
 #define SYS_munmap (SYS_BASE + 73)
 #define SYS_madvise (SYS_BASE + 75)
@@ -40,6 +41,7 @@
 #define SYS_thr_new (SYS_BASE + 455)
 #define SYS_mmap (SYS_BASE + 477)
 #define SYS_cpuset_getaffinity (SYS_BASE + 487)
+#define SYS_pipe2 (SYS_BASE + 542)
 
 TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
 	MOVW addr+0(FP), R0
@@ -115,17 +117,43 @@
 	MOVW n+8(FP), R2	// arg 3 count
 	MOVW $SYS_read, R7
 	SWI $0
-	MOVW.CS	$-1, R0
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVW	$SYS_pipe, R7
+	SWI	$0
+	BCC	ok
+	MOVW	$0, R1
+	MOVW	R1, r+0(FP)
+	MOVW	R1, w+4(FP)
+	MOVW	R0, errno+8(FP)
+	RET
+ok:
+	MOVW	R0, r+0(FP)
+	MOVW	R1, w+4(FP)
+	MOVW	$0, R1
+	MOVW	R1, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVW	$r+4(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVW	$SYS_pipe2, R7
+	SWI	$0
+	MOVW	R0, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0
 	MOVW fd+0(FP), R0	// arg 1 fd
 	MOVW p+4(FP), R1	// arg 2 buf
 	MOVW n+8(FP), R2	// arg 3 count
 	MOVW $SYS_write, R7
 	SWI $0
-	MOVW.CS	$-1, R0
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
@@ -137,14 +165,17 @@
 	MOVW	R0, ret+4(FP)
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$8
-	// thr_self(&4(R13))
-	MOVW $4(R13), R0 // arg 1 &4(R13)
+TEXT runtime·thr_self(SB),NOSPLIT,$0-4
+	// thr_self(&0(FP))
+	MOVW $ret+0(FP), R0 // arg 1
 	MOVW $SYS_thr_self, R7
 	SWI $0
-	// thr_kill(self, SIGPIPE)
-	MOVW 4(R13), R0	// arg 1 id
-	MOVW sig+0(FP), R1	// arg 2 - signal
+	RET
+
+TEXT runtime·thr_kill(SB),NOSPLIT,$0-8
+	// thr_kill(tid, sig)
+	MOVW tid+0(FP), R0	// arg 1 id
+	MOVW sig+4(FP), R1	// arg 2 signal
 	MOVW $SYS_thr_kill, R7
 	SWI $0
 	RET
@@ -215,7 +246,11 @@
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$12
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+	// Reserve space for callee-save registers and arguments.
+	MOVM.DB.W [R4-R11], (R13)
+	SUB	$16, R13
+
 	// this might be called in external code context,
 	// where g is not set.
 	// first save R0, because runtime·load_g will clobber it
@@ -227,6 +262,11 @@
 	MOVW	R1, 8(R13)
 	MOVW	R2, 12(R13)
 	BL	runtime·sigtrampgo(SB)
+
+	// Restore callee-save registers.
+	ADD	$16, R13
+	MOVM.IA.W (R13), [R4-R11]
+
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$16
@@ -371,6 +411,20 @@
 	SWI $0
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$3, R1	// F_GETFL
+	MOVW	$0, R2
+	MOVW	$SYS_fcntl, R7
+	SWI	$0
+	ORR	$0x4, R0, R2	// O_NONBLOCK
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$4, R1	// F_SETFL
+	MOVW	$SYS_fcntl, R7
+	SWI	$0
+	RET
+
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s
new file mode 100644
index 0000000..2330f2f
--- /dev/null
+++ b/src/runtime/sys_freebsd_arm64.s
@@ -0,0 +1,538 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// System calls and other sys.stuff for arm64, FreeBSD
+// /usr/src/sys/kern/syscalls.master for syscall numbers.
+//
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define CLOCK_REALTIME		0
+#define CLOCK_MONOTONIC		4
+#define FD_CLOEXEC		1
+#define F_SETFD			2
+#define F_GETFL			3
+#define F_SETFL			4
+#define O_NONBLOCK		4
+
+#define SYS_exit		1
+#define SYS_read		3
+#define SYS_write		4
+#define SYS_open		5
+#define SYS_close		6
+#define SYS_getpid		20
+#define SYS_kill		37
+#define SYS_sigaltstack		53
+#define SYS_munmap		73
+#define SYS_madvise		75
+#define SYS_setitimer		83
+#define SYS_fcntl		92
+#define SYS___sysctl		202
+#define SYS_nanosleep		240
+#define SYS_clock_gettime	232
+#define SYS_sched_yield		331
+#define SYS_sigprocmask		340
+#define SYS_kqueue		362
+#define SYS_kevent		363
+#define SYS_sigaction		416
+#define SYS_thr_exit		431
+#define SYS_thr_self		432
+#define SYS_thr_kill		433
+#define SYS__umtx_op		454
+#define SYS_thr_new		455
+#define SYS_mmap		477
+#define SYS_cpuset_getaffinity	487
+#define SYS_pipe2 		542
+
+TEXT emptyfunc<>(SB),0,$0-0
+	RET
+
+// func sys_umtx_op(addr *uint32, mode int32, val uint32, uaddr1 uintptr, ut *umtx_time) int32
+TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
+	MOVD	addr+0(FP), R0
+	MOVW	mode+8(FP), R1
+	MOVW	val+12(FP), R2
+	MOVD	uaddr1+16(FP), R3
+	MOVD	ut+24(FP), R4
+	MOVD	$SYS__umtx_op, R8
+	SVC
+	MOVW	R0, ret+32(FP)
+	RET
+
+// func thr_new(param *thrparam, size int32) int32
+TEXT runtime·thr_new(SB),NOSPLIT,$0
+	MOVD	param+0(FP), R0
+	MOVW	size+8(FP), R1
+	MOVD	$SYS_thr_new, R8
+	SVC
+	MOVW	R0, ret+16(FP)
+	RET
+
+// func thr_start()
+TEXT runtime·thr_start(SB),NOSPLIT,$0
+	// set up g
+	MOVD	m_g0(R0), g
+	MOVD	R0, g_m(g)
+	BL	emptyfunc<>(SB)	 // fault if stack check is wrong
+	BL	runtime·mstart(SB)
+
+	MOVD	$2, R8	// crash (not reached)
+	MOVD	R8, (R8)
+	RET
+
+// func exit(code int32)
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), R0
+	MOVD	$SYS_exit, R8
+	SVC
+	MOVD	$0, R0
+	MOVD	R0, (R0)
+
+// func exitThread(wait *uint32)
+TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
+	MOVD	wait+0(FP), R0
+	// We're done using the stack.
+	MOVW	$0, R1
+	STLRW	R1, (R0)
+	MOVW	$0, R0
+	MOVD	$SYS_thr_exit, R8
+	SVC
+	JMP	0(PC)
+
+// func open(name *byte, mode, perm int32) int32
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	name+0(FP), R0
+	MOVW	mode+8(FP), R1
+	MOVW	perm+12(FP), R2
+	MOVD	$SYS_open, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+16(FP)
+	RET
+
+// func closefd(fd int32) int32
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	fd+0(FP), R0
+	MOVD	$SYS_close, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+8(FP)
+	RET
+
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVD	$r+0(FP), R0
+	MOVW	$0, R1
+	MOVD	$SYS_pipe2, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0
+ok:
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	$r+8(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVD	$SYS_pipe2, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0
+ok:
+	MOVW	R0, errno+16(FP)
+	RET
+
+// func write1(fd uintptr, p unsafe.Pointer, n int32) int32
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
+	MOVD	fd+0(FP), R0
+	MOVD	p+8(FP), R1
+	MOVW	n+16(FP), R2
+	MOVD	$SYS_write, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0		// caller expects negative errno
+ok:
+	MOVW	R0, ret+24(FP)
+	RET
+
+// func read(fd int32, p unsafe.Pointer, n int32) int32
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	fd+0(FP), R0
+	MOVD	p+8(FP), R1
+	MOVW	n+16(FP), R2
+	MOVD	$SYS_read, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0		// caller expects negative errno
+ok:
+	MOVW	R0, ret+24(FP)
+	RET
+
+// func usleep(usec uint32)
+TEXT runtime·usleep(SB),NOSPLIT,$24-4
+	MOVWU	usec+0(FP), R3
+	MOVD	R3, R5
+	MOVW	$1000000, R4
+	UDIV	R4, R3
+	MOVD	R3, 8(RSP)
+	MUL	R3, R4
+	SUB	R4, R5
+	MOVW	$1000, R4
+	MUL	R4, R5
+	MOVD	R5, 16(RSP)
+
+	// nanosleep(&ts, 0)
+	ADD	$8, RSP, R0
+	MOVD	$0, R1
+	MOVD	$SYS_nanosleep, R8
+	SVC
+	RET
+
+// func thr_self() thread
+TEXT runtime·thr_self(SB),NOSPLIT,$8-8
+	MOVD	$ptr-8(SP), R0	// arg 1 &8(SP)
+	MOVD	$SYS_thr_self, R8
+	SVC
+	MOVD	ptr-8(SP), R0
+	MOVD	R0, ret+0(FP)
+	RET
+
+// func thr_kill(t thread, sig int)
+TEXT runtime·thr_kill(SB),NOSPLIT,$0-16
+	MOVD	tid+0(FP), R0	// arg 1 pid
+	MOVD	sig+8(FP), R1	// arg 2 sig
+	MOVD	$SYS_thr_kill, R8
+	SVC
+	RET
+
+// func raiseproc(sig uint32)
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
+	MOVD	$SYS_getpid, R8
+	SVC
+	MOVW	sig+0(FP), R1
+	MOVD	$SYS_kill, R8
+	SVC
+	RET
+
+// func setitimer(mode int32, new, old *itimerval)
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	mode+0(FP), R0
+	MOVD	new+8(FP), R1
+	MOVD	old+16(FP), R2
+	MOVD	$SYS_setitimer, R8
+	SVC
+	RET
+
+// func fallback_walltime() (sec int64, nsec int32)
+TEXT runtime·fallback_walltime(SB),NOSPLIT,$24-12
+	MOVW	$CLOCK_REALTIME, R0
+	MOVD	$8(RSP), R1
+	MOVD	$SYS_clock_gettime, R8
+	SVC
+	MOVD	8(RSP), R0	// sec
+	MOVW	16(RSP), R1	// nsec
+	MOVD	R0, sec+0(FP)
+	MOVW	R1, nsec+8(FP)
+	RET
+
+// func fallback_nanotime() int64
+TEXT runtime·fallback_nanotime(SB),NOSPLIT,$24-8
+	MOVD	$CLOCK_MONOTONIC, R0
+	MOVD	$8(RSP), R1
+	MOVD	$SYS_clock_gettime, R8
+	SVC
+	MOVD	8(RSP), R0	// sec
+	MOVW	16(RSP), R2	// nsec
+
+	// sec is in R0, nsec in R2
+	// return nsec in R2
+	MOVD	$1000000000, R3
+	MUL	R3, R0
+	ADD	R2, R0
+
+	MOVD	R0, ret+0(FP)
+	RET
+
+// func asmSigaction(sig uintptr, new, old *sigactiont) int32
+TEXT runtime·asmSigaction(SB),NOSPLIT|NOFRAME,$0
+	MOVD	sig+0(FP), R0		// arg 1 sig
+	MOVD	new+8(FP), R1		// arg 2 act
+	MOVD	old+16(FP), R2		// arg 3 oact
+	MOVD	$SYS_sigaction, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+24(FP)
+	RET
+
+// func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), R0
+	MOVD	info+16(FP), R1
+	MOVD	ctx+24(FP), R2
+	MOVD	fn+0(FP), R11
+	BL	(R11)
+	RET
+
+// func sigtramp()
+TEXT runtime·sigtramp(SB),NOSPLIT,$192
+	// Save callee-save registers in the case of signal forwarding.
+	// Please refer to https://golang.org/issue/31827 .
+	MOVD	R19, 8*4(RSP)
+	MOVD	R20, 8*5(RSP)
+	MOVD	R21, 8*6(RSP)
+	MOVD	R22, 8*7(RSP)
+	MOVD	R23, 8*8(RSP)
+	MOVD	R24, 8*9(RSP)
+	MOVD	R25, 8*10(RSP)
+	MOVD	R26, 8*11(RSP)
+	MOVD	R27, 8*12(RSP)
+	MOVD	g, 8*13(RSP)
+	MOVD	R29, 8*14(RSP)
+	FMOVD	F8, 8*15(RSP)
+	FMOVD	F9, 8*16(RSP)
+	FMOVD	F10, 8*17(RSP)
+	FMOVD	F11, 8*18(RSP)
+	FMOVD	F12, 8*19(RSP)
+	FMOVD	F13, 8*20(RSP)
+	FMOVD	F14, 8*21(RSP)
+	FMOVD	F15, 8*22(RSP)
+
+	// this might be called in external code context,
+	// where g is not set.
+	// first save R0, because runtime·load_g will clobber it
+	MOVW	R0, 8(RSP)
+	MOVBU	runtime·iscgo(SB), R0
+	CMP	$0, R0
+	BEQ	2(PC)
+	BL	runtime·load_g(SB)
+
+	MOVD	R1, 16(RSP)
+	MOVD	R2, 24(RSP)
+	MOVD	$runtime·sigtrampgo(SB), R0
+	BL	(R0)
+
+	// Restore callee-save registers.
+	MOVD	8*4(RSP), R19
+	MOVD	8*5(RSP), R20
+	MOVD	8*6(RSP), R21
+	MOVD	8*7(RSP), R22
+	MOVD	8*8(RSP), R23
+	MOVD	8*9(RSP), R24
+	MOVD	8*10(RSP), R25
+	MOVD	8*11(RSP), R26
+	MOVD	8*12(RSP), R27
+	MOVD	8*13(RSP), g
+	MOVD	8*14(RSP), R29
+	FMOVD	8*15(RSP), F8
+	FMOVD	8*16(RSP), F9
+	FMOVD	8*17(RSP), F10
+	FMOVD	8*18(RSP), F11
+	FMOVD	8*19(RSP), F12
+	FMOVD	8*20(RSP), F13
+	FMOVD	8*21(RSP), F14
+	FMOVD	8*22(RSP), F15
+
+	RET
+
+// func mmap(addr uintptr, n uintptr, prot int, flags int, fd int, off int64) (ret uintptr, err error)
+TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R0
+	MOVD	n+8(FP), R1
+	MOVW	prot+16(FP), R2
+	MOVW	flags+20(FP), R3
+	MOVW	fd+24(FP), R4
+	MOVW	off+28(FP), R5
+	MOVD	$SYS_mmap, R8
+	SVC
+	BCS	fail
+	MOVD	R0, p+32(FP)
+	MOVD	$0, err+40(FP)
+	RET
+fail:
+	MOVD	$0, p+32(FP)
+	MOVD	R0, err+40(FP)
+	RET
+
+// func munmap(addr uintptr, n uintptr) (err error)
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R0
+	MOVD	n+8(FP), R1
+	MOVD	$SYS_munmap, R8
+	SVC
+	BCS	fail
+	RET
+fail:
+	MOVD	$0, R0
+	MOVD	R0, (R0)	// crash
+
+// func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R0
+	MOVD	n+8(FP), R1
+	MOVW	flags+16(FP), R2
+	MOVD	$SYS_madvise, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+24(FP)
+	RET
+
+// func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
+TEXT runtime·sysctl(SB),NOSPLIT,$0
+	MOVD	mib+0(FP), R0
+	MOVD	miblen+8(FP), R1
+	MOVD	out+16(FP), R2
+	MOVD	size+24(FP), R3
+	MOVD	dst+32(FP), R4
+	MOVD	ndst+40(FP), R5
+	MOVD	$SYS___sysctl, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0
+ok:
+	MOVW	R0, ret+48(FP)
+	RET
+
+// func sigaltstack(new, old *stackt)
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
+	MOVD	new+0(FP), R0
+	MOVD	old+8(FP), R1
+	MOVD	$SYS_sigaltstack, R8
+	SVC
+	BCS	fail
+	RET
+fail:
+	MOVD	$0, R0
+	MOVD	R0, (R0)	// crash
+
+// func osyield()
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
+	MOVD	$SYS_sched_yield, R8
+	SVC
+	RET
+
+// func sigprocmask(how int32, new, old *sigset)
+TEXT runtime·sigprocmask(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	how+0(FP), R0
+	MOVD	new+8(FP), R1
+	MOVD	old+16(FP), R2
+	MOVD	$SYS_sigprocmask, R8
+	SVC
+	BCS	fail
+	RET
+fail:
+	MOVD	$0, R0
+	MOVD	R0, (R0)	// crash
+
+// func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+TEXT runtime·cpuset_getaffinity(SB),NOSPLIT|NOFRAME,$0-44
+	MOVD	level+0(FP), R0
+	MOVD	which+8(FP), R1
+	MOVD	id+16(FP), R2
+	MOVD	size+24(FP), R3
+	MOVD	mask+32(FP), R4
+	MOVD	$SYS_cpuset_getaffinity, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+40(FP)
+	RET
+
+// func kqueue() int32
+TEXT runtime·kqueue(SB),NOSPLIT|NOFRAME,$0
+	MOVD $SYS_kqueue, R8
+	SVC
+	BCC	ok
+	MOVW	$-1, R0
+ok:
+	MOVW	R0, ret+0(FP)
+	RET
+
+// func kevent(kq int, ch unsafe.Pointer, nch int, ev unsafe.Pointer, nev int, ts *Timespec) (n int, err error)
+TEXT runtime·kevent(SB),NOSPLIT,$0
+	MOVW	kq+0(FP), R0
+	MOVD	ch+8(FP), R1
+	MOVW	nch+16(FP), R2
+	MOVD	ev+24(FP), R3
+	MOVW	nev+32(FP), R4
+	MOVD	ts+40(FP), R5
+	MOVD	$SYS_kevent, R8
+	SVC
+	BCC	ok
+	NEG	R0, R0
+ok:
+	MOVW	R0, ret+48(FP)
+	RET
+
+// func closeonexec(fd int32)
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
+	MOVW	fd+0(FP), R0
+	MOVD	$F_SETFD, R1
+	MOVD	$FD_CLOEXEC, R2
+	MOVD	$SYS_fcntl, R8
+	SVC
+	RET
+
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW	fd+0(FP), R0
+	MOVD	$F_GETFL, R1
+	MOVD	$0, R2
+	MOVD	$SYS_fcntl, R8
+	SVC
+	ORR	$O_NONBLOCK, R0, R2
+	MOVW	fd+0(FP), R0
+	MOVW	$F_SETFL, R1
+	MOVW	$SYS_fcntl, R7
+	SVC
+	RET
+
+// func getCntxct(physical bool) uint32
+TEXT runtime·getCntxct(SB),NOSPLIT,$0
+	MOVB	physical+0(FP), R0
+	CMP	$0, R0
+	BEQ	3(PC)
+
+	// get CNTPCT (Physical Count Register) into R0
+	MRS	CNTPCT_EL0, R0 // SIGILL
+	B	2(PC)
+
+	// get CNTVCT (Virtual Count Register) into R0
+	MRS	CNTVCT_EL0, R0
+
+	MOVW	R0, ret+8(FP)
+	RET
+
+// func getisar0() uint64
+TEXT runtime·getisar0(SB),NOSPLIT,$0
+	// get Instruction Set Attributes 0 into R0
+	MRS	ID_AA64ISAR0_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
+
+// func getisar1() uint64
+TEXT runtime·getisar1(SB),NOSPLIT,$0
+	// get Instruction Set Attributes 1 into R0
+	MRS	ID_AA64ISAR1_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
+
+// func getpfr0() uint64
+TEXT runtime·getpfr0(SB),NOSPLIT,$0
+	// get Processor Feature Register 0 into R0
+	MRS	ID_AA64PFR0_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 72c43bd..1b28098 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -32,12 +32,15 @@
 #define SYS_getpid		20
 #define SYS_access		33
 #define SYS_kill		37
+#define SYS_pipe		42
 #define SYS_brk 		45
 #define SYS_fcntl		55
 #define SYS_munmap		91
 #define SYS_socketcall		102
 #define SYS_setittimer		104
 #define SYS_clone		120
+#define SYS_uname		122
+#define SYS_mlock		150
 #define SYS_sched_yield 	158
 #define SYS_nanosleep		162
 #define SYS_rt_sigreturn	173
@@ -58,6 +61,7 @@
 #define SYS_clock_gettime	265
 #define SYS_tgkill		270
 #define SYS_epoll_create1	329
+#define SYS_pipe2		331
 
 TEXT runtime·exit(SB),NOSPLIT,$0
 	MOVL	$SYS_exit_group, AX
@@ -107,15 +111,12 @@
 	MOVL	AX, ret+4(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$0
+TEXT runtime·write1(SB),NOSPLIT,$0
 	MOVL	$SYS_write, AX
 	MOVL	fd+0(FP), BX
 	MOVL	p+4(FP), CX
 	MOVL	n+8(FP), DX
 	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	2(PC)
-	MOVL	$-1, AX
 	MOVL	AX, ret+12(FP)
 	RET
 
@@ -125,12 +126,26 @@
 	MOVL	p+4(FP), CX
 	MOVL	n+8(FP), DX
 	INVOKE_SYSCALL
-	CMPL	AX, $0xfffff001
-	JLS	2(PC)
-	MOVL	$-1, AX
 	MOVL	AX, ret+12(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVL	$SYS_pipe, AX
+	LEAL	r+0(FP), BX
+	INVOKE_SYSCALL
+	MOVL	AX, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVL	$SYS_pipe2, AX
+	LEAL	r+4(FP), BX
+	MOVL	flags+0(FP), CX
+	INVOKE_SYSCALL
+	MOVL	AX, errno+12(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$8
 	MOVL	$0, DX
 	MOVL	usec+0(FP), AX
@@ -175,6 +190,20 @@
 	INVOKE_SYSCALL
 	RET
 
+TEXT ·getpid(SB),NOSPLIT,$0-4
+	MOVL	$SYS_getpid, AX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT,$0
+	MOVL	$SYS_tgkill, AX
+	MOVL	tgid+0(FP), BX
+	MOVL	tid+4(FP), CX
+	MOVL	sig+8(FP), DX
+	INVOKE_SYSCALL
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT,$0-12
 	MOVL	$SYS_setittimer, AX
 	MOVL	mode+0(FP), BX
@@ -192,8 +221,8 @@
 	MOVL	AX, ret+12(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $0-12
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $0-12
 	// We don't know how much stack space the VDSO code will need,
 	// so switch to g0.
 
@@ -204,9 +233,9 @@
 	MOVL	g_m(AX), SI // SI unchanged by C code.
 
 	// Set vdsoPC and vdsoSP for SIGPROF traceback.
-	MOVL	0(SP), DX
-	MOVL	DX, m_vdsoPC(SI)
-	LEAL	sec+0(SP), DX
+	LEAL	sec+0(FP), DX
+	MOVL	-4(DX), CX
+	MOVL	CX, m_vdsoPC(SI)
 	MOVL	DX, m_vdsoSP(SI)
 
 	CMPL	AX, m_curg(SI)	// Only switch if on curg.
@@ -257,7 +286,7 @@
 
 // int64 nanotime(void) so really
 // void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB), NOSPLIT, $0-8
+TEXT runtime·nanotime1(SB), NOSPLIT, $0-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
 	MOVL	SP, BP	// Save old SP; BP unchanged by C code.
@@ -267,9 +296,9 @@
 	MOVL	g_m(AX), SI // SI unchanged by C code.
 
 	// Set vdsoPC and vdsoSP for SIGPROF traceback.
-	MOVL	0(SP), DX
-	MOVL	DX, m_vdsoPC(SI)
-	LEAL	ret+0(SP), DX
+	LEAL	ret+0(FP), DX
+	MOVL	-4(DX), CX
+	MOVL	CX, m_vdsoPC(SI)
 	MOVL	DX, m_vdsoSP(SI)
 
 	CMPL	AX, m_curg(SI)	// Only switch if on curg.
@@ -695,6 +724,21 @@
 	INVOKE_SYSCALL
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL	$SYS_fcntl, AX
+	MOVL	fd+0(FP), BX // fd
+	MOVL	$3, CX // F_GETFL
+	MOVL	$0, DX
+	INVOKE_SYSCALL
+	MOVL	fd+0(FP), BX // fd
+	MOVL	$4, CX // F_SETFL
+	MOVL	$0x800, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$SYS_fcntl, AX
+	INVOKE_SYSCALL
+	RET
+
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0
 	MOVL	$SYS_access, AX
@@ -734,3 +778,20 @@
 	INVOKE_SYSCALL
 	MOVL	AX, ret+0(FP)
 	RET
+
+// func uname(utsname *new_utsname) int
+TEXT ·uname(SB),NOSPLIT,$0-8
+	MOVL    $SYS_uname, AX
+	MOVL    utsname+0(FP), BX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+4(FP)
+	RET
+
+// func mlock(addr, len uintptr) int
+TEXT ·mlock(SB),NOSPLIT,$0-12
+	MOVL    $SYS_mlock, AX
+	MOVL    addr+0(FP), BX
+	MOVL    len+4(FP), CX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+8(FP)
+	RET
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index 5c300f5..58d3bc5 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -21,6 +21,7 @@
 #define SYS_rt_sigaction	13
 #define SYS_rt_sigprocmask	14
 #define SYS_rt_sigreturn	15
+#define SYS_pipe		22
 #define SYS_sched_yield 	24
 #define SYS_mincore		27
 #define SYS_madvise		28
@@ -32,8 +33,10 @@
 #define SYS_clone		56
 #define SYS_exit		60
 #define SYS_kill		62
+#define SYS_uname		63
 #define SYS_fcntl		72
 #define SYS_sigaltstack 	131
+#define SYS_mlock		149
 #define SYS_arch_prctl		158
 #define SYS_gettid		186
 #define SYS_futex		202
@@ -46,6 +49,7 @@
 #define SYS_faccessat		269
 #define SYS_epoll_pwait		281
 #define SYS_epoll_create1	291
+#define SYS_pipe2		293
 
 TEXT runtime·exit(SB),NOSPLIT,$0-4
 	MOVL	code+0(FP), DI
@@ -89,15 +93,12 @@
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$0-28
+TEXT runtime·write1(SB),NOSPLIT,$0-28
 	MOVQ	fd+0(FP), DI
 	MOVQ	p+8(FP), SI
 	MOVL	n+16(FP), DX
 	MOVL	$SYS_write, AX
 	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	2(PC)
-	MOVL	$-1, AX
 	MOVL	AX, ret+24(FP)
 	RET
 
@@ -107,12 +108,26 @@
 	MOVL	n+16(FP), DX
 	MOVL	$SYS_read, AX
 	SYSCALL
-	CMPQ	AX, $0xfffffffffffff001
-	JLS	2(PC)
-	MOVL	$-1, AX
 	MOVL	AX, ret+24(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	LEAQ	r+0(FP), DI
+	MOVL	$SYS_pipe, AX
+	SYSCALL
+	MOVL	AX, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-20
+	LEAQ	r+8(FP), DI
+	MOVL	flags+0(FP), SI
+	MOVL	$SYS_pipe2, AX
+	SYSCALL
+	MOVL	AX, errno+16(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$16
 	MOVL	$0, DX
 	MOVL	usec+0(FP), AX
@@ -158,6 +173,20 @@
 	SYSCALL
 	RET
 
+TEXT ·getpid(SB),NOSPLIT,$0-8
+	MOVL	$SYS_getpid, AX
+	SYSCALL
+	MOVQ	AX, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT,$0
+	MOVQ	tgid+0(FP), DI
+	MOVQ	tid+8(FP), SI
+	MOVQ	sig+16(FP), DX
+	MOVL	$SYS_tgkill, AX
+	SYSCALL
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT,$0-24
 	MOVL	mode+0(FP), DI
 	MOVQ	new+8(FP), SI
@@ -175,8 +204,9 @@
 	MOVL	AX, ret+24(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$0-12
+// func walltime1() (sec int64, nsec int32)
+// non-zero frame-size means bp is saved and restored
+TEXT runtime·walltime1(SB),NOSPLIT,$8-12
 	// We don't know how much stack space the VDSO code will need,
 	// so switch to g0.
 	// In particular, a kernel configured with CONFIG_OPTIMIZE_INLINING=n
@@ -191,9 +221,9 @@
 	MOVQ	g_m(AX), BX // BX unchanged by C code.
 
 	// Set vdsoPC and vdsoSP for SIGPROF traceback.
-	MOVQ	0(SP), DX
-	MOVQ	DX, m_vdsoPC(BX)
-	LEAQ	sec+0(SP), DX
+	LEAQ	sec+0(FP), DX
+	MOVQ	-8(DX), CX
+	MOVQ	CX, m_vdsoPC(BX)
 	MOVQ	DX, m_vdsoSP(BX)
 
 	CMPQ	AX, m_curg(BX)	// Only switch if on curg.
@@ -233,7 +263,9 @@
 	MOVL	DX, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+// func nanotime1() int64
+// non-zero frame-size means bp is saved and restored
+TEXT runtime·nanotime1(SB),NOSPLIT,$8-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
 	MOVQ	SP, BP	// Save old SP; BP unchanged by C code.
@@ -243,9 +275,9 @@
 	MOVQ	g_m(AX), BX // BX unchanged by C code.
 
 	// Set vdsoPC and vdsoSP for SIGPROF traceback.
-	MOVQ	0(SP), DX
-	MOVQ	DX, m_vdsoPC(BX)
-	LEAQ	ret+0(SP), DX
+	LEAQ	ret+0(FP), DX
+	MOVQ	-8(DX), CX
+	MOVQ	CX, m_vdsoPC(BX)
 	MOVQ	DX, m_vdsoSP(BX)
 
 	CMPQ	AX, m_curg(BX)	// Only switch if on curg.
@@ -682,6 +714,20 @@
 	SYSCALL
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL    fd+0(FP), DI  // fd
+	MOVQ    $3, SI  // F_GETFL
+	MOVQ    $0, DX
+	MOVL	$SYS_fcntl, AX
+	SYSCALL
+	MOVL	fd+0(FP), DI // fd
+	MOVQ	$4, SI // F_SETFL
+	MOVQ	$0x800, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$SYS_fcntl, AX
+	SYSCALL
+	RET
 
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0
@@ -723,3 +769,20 @@
 	SYSCALL
 	MOVQ	AX, ret+0(FP)
 	RET
+
+// func uname(utsname *new_utsname) int
+TEXT ·uname(SB),NOSPLIT,$0-16
+	MOVQ    utsname+0(FP), DI
+	MOVL    $SYS_uname, AX
+	SYSCALL
+	MOVQ	AX, ret+8(FP)
+	RET
+
+// func mlock(addr, len uintptr) int
+TEXT ·mlock(SB),NOSPLIT,$0-24
+	MOVQ    addr+0(FP), DI
+	MOVQ    len+8(FP), SI
+	MOVL    $SYS_mlock, AX
+	SYSCALL
+	MOVQ	AX, ret+16(FP)
+	RET
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 9c73984..e103da5 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -23,6 +23,7 @@
 #define SYS_close (SYS_BASE + 6)
 #define SYS_getpid (SYS_BASE + 20)
 #define SYS_kill (SYS_BASE + 37)
+#define SYS_pipe (SYS_BASE + 42)
 #define SYS_clone (SYS_BASE + 120)
 #define SYS_rt_sigreturn (SYS_BASE + 173)
 #define SYS_rt_sigaction (SYS_BASE + 174)
@@ -45,6 +46,7 @@
 #define SYS_epoll_ctl (SYS_BASE + 251)
 #define SYS_epoll_wait (SYS_BASE + 252)
 #define SYS_epoll_create1 (SYS_BASE + 357)
+#define SYS_pipe2 (SYS_BASE + 359)
 #define SYS_fcntl (SYS_BASE + 55)
 #define SYS_access (SYS_BASE + 33)
 #define SYS_connect (SYS_BASE + 283)
@@ -75,15 +77,12 @@
 	MOVW	R0, ret+4(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$0
+TEXT runtime·write1(SB),NOSPLIT,$0
 	MOVW	fd+0(FP), R0
 	MOVW	p+4(FP), R1
 	MOVW	n+8(FP), R2
 	MOVW	$SYS_write, R7
 	SWI	$0
-	MOVW	$0xfffff001, R1
-	CMP	R1, R0
-	MOVW.HI	$-1, R0
 	MOVW	R0, ret+12(FP)
 	RET
 
@@ -93,12 +92,26 @@
 	MOVW	n+8(FP), R2
 	MOVW	$SYS_read, R7
 	SWI	$0
-	MOVW	$0xfffff001, R1
-	CMP	R1, R0
-	MOVW.HI	$-1, R0
 	MOVW	R0, ret+12(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVW	$r+0(FP), R0
+	MOVW	$SYS_pipe, R7
+	SWI	$0
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVW	$r+4(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVW	$SYS_pipe2, R7
+	SWI	$0
+	MOVW	R0, errno+12(FP)
+	RET
+
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0
 	MOVW	code+0(FP), R0
 	MOVW	$SYS_exit_group, R7
@@ -159,6 +172,20 @@
 	SWI	$0
 	RET
 
+TEXT ·getpid(SB),NOSPLIT,$0-4
+	MOVW	$SYS_getpid, R7
+	SWI	$0
+	MOVW	R0, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT,$0-12
+	MOVW	tgid+0(FP), R0
+	MOVW	tid+4(FP), R1
+	MOVW	sig+8(FP), R2
+	MOVW	$SYS_tgkill, R7
+	SWI	$0
+	RET
+
 TEXT runtime·mmap(SB),NOSPLIT,$0
 	MOVW	addr+0(FP), R0
 	MOVW	n+4(FP), R1
@@ -215,7 +242,7 @@
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·walltime(SB),NOSPLIT,$0-12
+TEXT runtime·walltime1(SB),NOSPLIT,$0-12
 	// We don't know how much stack space the VDSO code will need,
 	// so switch to g0.
 
@@ -242,11 +269,38 @@
 
 	MOVW	$CLOCK_REALTIME, R0
 	MOVW	$8(R13), R1	// timespec
-	MOVW	runtime·vdsoClockgettimeSym(SB), R11
-	CMP	$0, R11
+	MOVW	runtime·vdsoClockgettimeSym(SB), R2
+	CMP	$0, R2
 	B.EQ	fallback
 
-	BL	(R11)
+	// Store g on gsignal's stack, so if we receive a signal
+	// during VDSO code we can find the g.
+	// If we don't have a signal stack, we won't receive signal,
+	// so don't bother saving g.
+	// When using cgo, we already saved g on TLS, also don't save
+	// g here.
+	// Also don't save g if we are already on the signal stack.
+	// We won't get a nested signal.
+	MOVB	runtime·iscgo(SB), R6
+	CMP	$0, R6
+	BNE	nosaveg
+	MOVW	m_gsignal(R5), R6          // g.m.gsignal
+	CMP	$0, R6
+	BEQ	nosaveg
+	CMP	g, R6
+	BEQ	nosaveg
+	MOVW	(g_stack+stack_lo)(R6), R6 // g.m.gsignal.stack.lo
+	MOVW	g, (R6)
+
+	BL	(R2)
+
+	MOVW	$0, R1
+	MOVW	R1, (R6) // clear g slot, R6 is unchanged by C code
+
+	JMP	finish
+
+nosaveg:
+	BL	(R2)
 	JMP	finish
 
 fallback:
@@ -266,8 +320,8 @@
 	MOVW	R2, nsec+8(FP)
 	RET
 
-// int64 nanotime(void)
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+// int64 nanotime1(void)
+TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
 	// Switch to g0 stack. See comment above in runtime·walltime.
 
 	// Save old SP. Use R13 instead of SP to avoid linker rewriting the offsets.
@@ -293,11 +347,38 @@
 
 	MOVW	$CLOCK_MONOTONIC, R0
 	MOVW	$8(R13), R1	// timespec
-	MOVW	runtime·vdsoClockgettimeSym(SB), R11
-	CMP	$0, R11
+	MOVW	runtime·vdsoClockgettimeSym(SB), R2
+	CMP	$0, R2
 	B.EQ	fallback
 
-	BL	(R11)
+	// Store g on gsignal's stack, so if we receive a signal
+	// during VDSO code we can find the g.
+	// If we don't have a signal stack, we won't receive signal,
+	// so don't bother saving g.
+	// When using cgo, we already saved g on TLS, also don't save
+	// g here.
+	// Also don't save g if we are already on the signal stack.
+	// We won't get a nested signal.
+	MOVB	runtime·iscgo(SB), R6
+	CMP	$0, R6
+	BNE	nosaveg
+	MOVW	m_gsignal(R5), R6          // g.m.gsignal
+	CMP	$0, R6
+	BEQ	nosaveg
+	CMP	g, R6
+	BEQ	nosaveg
+	MOVW	(g_stack+stack_lo)(R6), R6 // g.m.gsignal.stack.lo
+	MOVW	g, (R6)
+
+	BL	(R2)
+
+	MOVW	$0, R1
+	MOVW	R1, (R6) // clear g slot, R6 is unchanged by C code
+
+	JMP	finish
+
+nosaveg:
+	BL	(R2)
 	JMP	finish
 
 fallback:
@@ -434,7 +515,11 @@
 	MOVW	R4, R13
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$12
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+	// Reserve space for callee-save registers and arguments.
+	MOVM.DB.W [R4-R11], (R13)
+	SUB	$16, R13
+
 	// this might be called in external code context,
 	// where g is not set.
 	// first save R0, because runtime·load_g will clobber it
@@ -447,6 +532,11 @@
 	MOVW	R2, 12(R13)
 	MOVW  	$runtime·sigtrampgo(SB), R11
 	BL	(R11)
+
+	// Restore callee-save registers.
+	ADD	$16, R13
+	MOVM.IA.W (R13), [R4-R11]
+
 	RET
 
 TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
@@ -567,6 +657,20 @@
 	SWI	$0
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$3, R1	// F_GETFL
+	MOVW	$0, R2
+	MOVW	$SYS_fcntl, R7
+	SWI	$0
+	ORR	$0x800, R0, R2	// O_NONBLOCK
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$4, R1	// F_SETFL
+	MOVW	$SYS_fcntl, R7
+	SWI	$0
+	RET
+
 // b __kuser_get_tls @ 0xffff0fe0
 TEXT runtime·read_tls_fallback(SB),NOSPLIT|NOFRAME,$0
 	MOVW	$0xffff0fe0, R0
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index 2835b6c..b23e3b9 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -20,6 +20,7 @@
 #define SYS_write		64
 #define SYS_openat		56
 #define SYS_close		57
+#define SYS_pipe2		59
 #define SYS_fcntl		25
 #define SYS_nanosleep		101
 #define SYS_mmap		222
@@ -91,16 +92,12 @@
 	MOVW	R0, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
 	MOVD	fd+0(FP), R0
 	MOVD	p+8(FP), R1
 	MOVW	n+16(FP), R2
 	MOVD	$SYS_write, R8
 	SVC
-	CMN	$4095, R0
-	BCC	done
-	MOVW	$-1, R0
-done:
 	MOVW	R0, ret+24(FP)
 	RET
 
@@ -110,13 +107,27 @@
 	MOVW	n+16(FP), R2
 	MOVD	$SYS_read, R8
 	SVC
-	CMN	$4095, R0
-	BCC	done
-	MOVW	$-1, R0
-done:
 	MOVW	R0, ret+24(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVD	$r+0(FP), R0
+	MOVW	$0, R1
+	MOVW	$SYS_pipe2, R8
+	SVC
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	$r+8(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVW	$SYS_pipe2, R8
+	SVC
+	MOVW	R0, errno+16(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$24-4
 	MOVWU	usec+0(FP), R3
 	MOVD	R3, R5
@@ -164,6 +175,20 @@
 	SVC
 	RET
 
+TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8
+	MOVD	$SYS_getpid, R8
+	SVC
+	MOVD	R0, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT,$0-24
+	MOVD	tgid+0(FP), R0
+	MOVD	tid+8(FP), R1
+	MOVD	sig+16(FP), R2
+	MOVD	$SYS_tgkill, R8
+	SVC
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	mode+0(FP), R0
 	MOVD	new+8(FP), R1
@@ -181,8 +206,8 @@
 	MOVW	R0, ret+24(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$24-12
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$24-12
 	MOVD	RSP, R20	// R20 is unchanged by C code
 	MOVD	RSP, R1
 
@@ -207,6 +232,31 @@
 	MOVW	$CLOCK_REALTIME, R0
 	MOVD	runtime·vdsoClockgettimeSym(SB), R2
 	CBZ	R2, fallback
+
+	// Store g on gsignal's stack, so if we receive a signal
+	// during VDSO code we can find the g.
+	// If we don't have a signal stack, we won't receive signal,
+	// so don't bother saving g.
+	// When using cgo, we already saved g on TLS, also don't save
+	// g here.
+	// Also don't save g if we are already on the signal stack.
+	// We won't get a nested signal.
+	MOVBU	runtime·iscgo(SB), R22
+	CBNZ	R22, nosaveg
+	MOVD	m_gsignal(R21), R22          // g.m.gsignal
+	CBZ	R22, nosaveg
+	CMP	g, R22
+	BEQ	nosaveg
+	MOVD	(g_stack+stack_lo)(R22), R22 // g.m.gsignal.stack.lo
+	MOVD	g, (R22)
+
+	BL	(R2)
+
+	MOVD	ZR, (R22)  // clear g slot, R22 is unchanged by C code
+
+	B	finish
+
+nosaveg:
 	BL	(R2)
 	B	finish
 
@@ -225,7 +275,7 @@
 	MOVW	R5, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$24-8
+TEXT runtime·nanotime1(SB),NOSPLIT,$24-8
 	MOVD	RSP, R20	// R20 is unchanged by C code
 	MOVD	RSP, R1
 
@@ -250,6 +300,31 @@
 	MOVW	$CLOCK_MONOTONIC, R0
 	MOVD	runtime·vdsoClockgettimeSym(SB), R2
 	CBZ	R2, fallback
+
+	// Store g on gsignal's stack, so if we receive a signal
+	// during VDSO code we can find the g.
+	// If we don't have a signal stack, we won't receive signal,
+	// so don't bother saving g.
+	// When using cgo, we already saved g on TLS, also don't save
+	// g here.
+	// Also don't save g if we are already on the signal stack.
+	// We won't get a nested signal.
+	MOVBU	runtime·iscgo(SB), R22
+	CBNZ	R22, nosaveg
+	MOVD	m_gsignal(R21), R22          // g.m.gsignal
+	CBZ	R22, nosaveg
+	CMP	g, R22
+	BEQ	nosaveg
+	MOVD	(g_stack+stack_lo)(R22), R22 // g.m.gsignal.stack.lo
+	MOVD	g, (R22)
+
+	BL	(R2)
+
+	MOVD	ZR, (R22)  // clear g slot, R22 is unchanged by C code
+
+	B	finish
+
+nosaveg:
 	BL	(R2)
 	B	finish
 
@@ -344,8 +419,7 @@
 	// first save R0, because runtime·load_g will clobber it
 	MOVW	R0, 8(RSP)
 	MOVBU	runtime·iscgo(SB), R0
-	CMP	$0, R0
-	BEQ	2(PC)
+	CBZ	R0, 2(PC)
 	BL	runtime·load_g(SB)
 
 	MOVD	R1, 16(RSP)
@@ -605,6 +679,21 @@
 	SVC
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R0 // fd
+	MOVD	$3, R1	// F_GETFL
+	MOVD	$0, R2
+	MOVD	$SYS_fcntl, R8
+	SVC
+	MOVD	$0x800, R2 // O_NONBLOCK
+	ORR	R0, R2
+	MOVW	fd+0(FP), R0 // fd
+	MOVD	$4, R1	// F_SETFL
+	MOVD	$SYS_fcntl, R8
+	SVC
+	RET
+
 // int access(const char *name, int mode)
 TEXT runtime·access(SB),NOSPLIT,$0-20
 	MOVD	$AT_FDCWD, R0
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
index 33ed105..6668a0f 100644
--- a/src/runtime/sys_linux_mips64x.s
+++ b/src/runtime/sys_linux_mips64x.s
@@ -21,7 +21,7 @@
 #define SYS_close		5003
 #define SYS_getpid		5038
 #define SYS_kill		5060
-#define SYS_fcntl		5080
+#define SYS_fcntl		5070
 #define SYS_mmap		5009
 #define SYS_munmap		5011
 #define SYS_setitimer		5036
@@ -46,6 +46,7 @@
 #define SYS_clock_gettime	5222
 #define SYS_epoll_create1	5285
 #define SYS_brk			5012
+#define SYS_pipe2		5287
 
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	code+0(FP), R4
@@ -88,14 +89,14 @@
 	MOVW	R2, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
 	MOVV	fd+0(FP), R4
 	MOVV	p+8(FP), R5
 	MOVW	n+16(FP), R6
 	MOVV	$SYS_write, R2
 	SYSCALL
 	BEQ	R7, 2(PC)
-	MOVW	$-1, R2
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+24(FP)
 	RET
 
@@ -106,10 +107,32 @@
 	MOVV	$SYS_read, R2
 	SYSCALL
 	BEQ	R7, 2(PC)
-	MOVW	$-1, R2
+	SUBVU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+24(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVV	$r+0(FP), R4
+	MOVV	R0, R5
+	MOVV	$SYS_pipe2, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
+	MOVW	R2, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOVV	$r+8(FP), R4
+	MOVW	flags+0(FP), R5
+	MOVV	$SYS_pipe2, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBVU	R2, R0, R2	// caller expects negative errno
+	MOVW	R2, errno+16(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$16-4
 	MOVWU	usec+0(FP), R3
 	MOVV	R3, R5
@@ -158,6 +181,20 @@
 	SYSCALL
 	RET
 
+TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8
+	MOVV	$SYS_getpid, R2
+	SYSCALL
+	MOVV	R2, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24
+	MOVV	tgid+0(FP), R4
+	MOVV	tid+8(FP), R5
+	MOVV	sig+16(FP), R6
+	MOVV	$SYS_tgkill, R2
+	SYSCALL
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	mode+0(FP), R4
 	MOVV	new+8(FP), R5
@@ -176,25 +213,90 @@
 	MOVW	R2, ret+24(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$16
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$16
+	MOVV	R29, R16	// R16 is unchanged by C code
+	MOVV	R29, R1
+
+	MOVV	g_m(g), R17	// R17 = m
+
+	// Set vdsoPC and vdsoSP for SIGPROF traceback.
+	MOVV	R31, m_vdsoPC(R17)
+	MOVV	R29, m_vdsoSP(R17)
+
+	MOVV	m_curg(R17), R4
+	MOVV	g, R5
+	BNE	R4, R5, noswitch
+
+	MOVV	m_g0(R17), R4
+	MOVV	(g_sched+gobuf_sp)(R4), R1	// Set SP to g0 stack
+
+noswitch:
+	SUBV	$16, R1
+	AND	$~15, R1	// Align for C code
+	MOVV	R1, R29
+
 	MOVW	$0, R4 // CLOCK_REALTIME
 	MOVV	$0(R29), R5
-	MOVV	$SYS_clock_gettime, R2
-	SYSCALL
+
+	MOVV	runtime·vdsoClockgettimeSym(SB), R25
+	BEQ	R25, fallback
+
+	JAL	(R25)
+
+finish:
 	MOVV	0(R29), R3	// sec
 	MOVV	8(R29), R5	// nsec
+
+	MOVV	R16, R29	// restore SP
+	MOVV	R0, m_vdsoSP(R17)	// clear vdsoSP
+
 	MOVV	R3, sec+0(FP)
 	MOVW	R5, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$16
-	MOVW	$1, R4 // CLOCK_MONOTONIC
-	MOVV	$0(R29), R5
+fallback:
 	MOVV	$SYS_clock_gettime, R2
 	SYSCALL
+	JMP finish
+
+TEXT runtime·nanotime1(SB),NOSPLIT,$16
+	MOVV	R29, R16	// R16 is unchanged by C code
+	MOVV	R29, R1
+
+	MOVV	g_m(g), R17	// R17 = m
+
+	// Set vdsoPC and vdsoSP for SIGPROF traceback.
+	MOVV	R31, m_vdsoPC(R17)
+	MOVV	R29, m_vdsoSP(R17)
+
+	MOVV	m_curg(R17), R4
+	MOVV	g, R5
+	BNE	R4, R5, noswitch
+
+	MOVV	m_g0(R17), R4
+	MOVV	(g_sched+gobuf_sp)(R4), R1	// Set SP to g0 stack
+
+noswitch:
+	SUBV	$16, R1
+	AND	$~15, R1	// Align for C code
+	MOVV	R1, R29
+
+	MOVW	$1, R4 // CLOCK_MONOTONIC
+	MOVV	$0(R29), R5
+
+	MOVV	runtime·vdsoClockgettimeSym(SB), R25
+	BEQ	R25, fallback
+
+	JAL	(R25)
+
+finish:
 	MOVV	0(R29), R3	// sec
 	MOVV	8(R29), R5	// nsec
+
+	MOVV	R16, R29	// restore SP
+	MOVV	R0, m_vdsoSP(R17)	// clear vdsoSP
+
 	// sec is in R3, nsec in R5
 	// return nsec in R3
 	MOVV	$1000000000, R4
@@ -204,6 +306,11 @@
 	MOVV	R3, ret+0(FP)
 	RET
 
+fallback:
+	MOVV	$SYS_clock_gettime, R2
+	SYSCALL
+	JMP	finish
+
 TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
 	MOVW	how+0(FP), R4
 	MOVV	new+8(FP), R5
@@ -454,6 +561,21 @@
 	SYSCALL
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R4 // fd
+	MOVV	$3, R5	// F_GETFL
+	MOVV	$0, R6
+	MOVV	$SYS_fcntl, R2
+	SYSCALL
+	MOVW	$0x80, R6 // O_NONBLOCK
+	OR	R2, R6
+	MOVW	fd+0(FP), R4 // fd
+	MOVV	$4, R5	// F_SETFL
+	MOVV	$SYS_fcntl, R2
+	SYSCALL
+	RET
+
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s
index 6e539fb..fab2ab3 100644
--- a/src/runtime/sys_linux_mipsx.s
+++ b/src/runtime/sys_linux_mipsx.s
@@ -20,6 +20,7 @@
 #define SYS_close		4006
 #define SYS_getpid		4020
 #define SYS_kill		4037
+#define SYS_pipe		4042
 #define SYS_brk			4045
 #define SYS_fcntl		4055
 #define SYS_mmap		4090
@@ -44,6 +45,7 @@
 #define SYS_clock_gettime	4263
 #define SYS_tgkill		4266
 #define SYS_epoll_create1	4326
+#define SYS_pipe2		4328
 
 TEXT runtime·exit(SB),NOSPLIT,$0-4
 	MOVW	code+0(FP), R4
@@ -86,14 +88,14 @@
 	MOVW	R2, ret+4(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$0-16
+TEXT runtime·write1(SB),NOSPLIT,$0-16
 	MOVW	fd+0(FP), R4
 	MOVW	p+4(FP), R5
 	MOVW	n+8(FP), R6
 	MOVW	$SYS_write, R2
 	SYSCALL
 	BEQ	R7, 2(PC)
-	MOVW	$-1, R2
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+12(FP)
 	RET
 
@@ -104,10 +106,38 @@
 	MOVW	$SYS_read, R2
 	SYSCALL
 	BEQ	R7, 2(PC)
-	MOVW	$-1, R2
+	SUBU	R2, R0, R2	// caller expects negative errno
 	MOVW	R2, ret+12(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVW	$SYS_pipe, R2
+	SYSCALL
+	BEQ	R7, pipeok
+	MOVW	$-1, R1
+	MOVW	R1, r+0(FP)
+	MOVW	R1, w+4(FP)
+	SUBU	R2, R0, R2	// caller expects negative errno
+	MOVW	R2, errno+8(FP)
+	RET
+pipeok:
+	MOVW	R2, r+0(FP)
+	MOVW	R3, w+4(FP)
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVW	$r+4(FP), R4
+	MOVW	flags+0(FP), R5
+	MOVW	$SYS_pipe2, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	SUBU	R2, R0, R2	// caller expects negative errno
+	MOVW	R2, errno+12(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$28-4
 	MOVW	usec+0(FP), R3
 	MOVW	R3, R5
@@ -156,6 +186,20 @@
 	SYSCALL
 	RET
 
+TEXT ·getpid(SB),NOSPLIT,$0-4
+	MOVW	$SYS_getpid, R2
+	SYSCALL
+	MOVW	R2, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT,$0-12
+	MOVW	tgid+0(FP), R4
+	MOVW	tid+4(FP), R5
+	MOVW	sig+8(FP), R6
+	MOVW	$SYS_tgkill, R2
+	SYSCALL
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT,$0-12
 	MOVW	mode+0(FP), R4
 	MOVW	new+4(FP), R5
@@ -174,8 +218,8 @@
 	MOVW	R2, ret+12(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$8-12
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$8-12
 	MOVW	$0, R4	// CLOCK_REALTIME
 	MOVW	$4(R29), R5
 	MOVW	$SYS_clock_gettime, R2
@@ -193,7 +237,7 @@
 	MOVW	R5, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$8-8
+TEXT runtime·nanotime1(SB),NOSPLIT,$8-8
 	MOVW	$1, R4	// CLOCK_MONOTONIC
 	MOVW	$4(R29), R5
 	MOVW	$SYS_clock_gettime, R2
@@ -487,6 +531,21 @@
 	SYSCALL
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW	fd+0(FP), R4 // fd
+	MOVW	$3, R5	// F_GETFL
+	MOVW	$0, R6
+	MOVW	$SYS_fcntl, R2
+	SYSCALL
+	MOVW	$0x80, R6 // O_NONBLOCK
+	OR	R2, R6
+	MOVW	fd+0(FP), R4 // fd
+	MOVW	$4, R5	// F_SETFL
+	MOVW	$SYS_fcntl, R2
+	SYSCALL
+	RET
+
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index 13d2315..8629fe3 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -21,6 +21,7 @@
 #define SYS_close		  6
 #define SYS_getpid		 20
 #define SYS_kill		 37
+#define SYS_pipe		 42
 #define SYS_brk			 45
 #define SYS_fcntl		 55
 #define SYS_mmap		 90
@@ -45,6 +46,7 @@
 #define SYS_clock_gettime	246
 #define SYS_tgkill		250
 #define SYS_epoll_create1	315
+#define SYS_pipe2		317
 
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	code+0(FP), R3
@@ -80,13 +82,13 @@
 	MOVW	R3, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
 	MOVD	fd+0(FP), R3
 	MOVD	p+8(FP), R4
 	MOVW	n+16(FP), R5
 	SYSCALL	$SYS_write
 	BVC	2(PC)
-	MOVW	$-1, R3
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+24(FP)
 	RET
 
@@ -96,10 +98,25 @@
 	MOVW	n+16(FP), R5
 	SYSCALL	$SYS_read
 	BVC	2(PC)
-	MOVW	$-1, R3
+	NEG	R3	// caller expects negative errno
 	MOVW	R3, ret+24(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	ADD	$FIXED_FRAME, R1, R3
+	SYSCALL	$SYS_pipe
+	MOVW	R3, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	ADD	$FIXED_FRAME+8, R1, R3
+	MOVW	flags+0(FP), R4
+	SYSCALL	$SYS_pipe2
+	MOVW	R3, errno+16(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$16-4
 	MOVW	usec+0(FP), R3
 	MOVD	R3, R5
@@ -139,6 +156,18 @@
 	SYSCALL	$SYS_kill
 	RET
 
+TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8
+	SYSCALL $SYS_getpid
+	MOVD	R3, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24
+	MOVD	tgid+0(FP), R3
+	MOVD	tid+8(FP), R4
+	MOVD	sig+16(FP), R5
+	SYSCALL $SYS_tgkill
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	mode+0(FP), R3
 	MOVD	new+8(FP), R4
@@ -155,8 +184,8 @@
 	MOVW	R3, ret+24(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$16
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$16
 	MOVD	R1, R15		// R15 is unchanged by C code
 	MOVD	g_m(g), R21	// R21 = m
 
@@ -203,7 +232,7 @@
 	MOVD	40(R1), R5
 	JMP	finish
 
-TEXT runtime·nanotime(SB),NOSPLIT,$16
+TEXT runtime·nanotime1(SB),NOSPLIT,$16
 	MOVD	$1, R3		// CLOCK_MONOTONIC
 
 	MOVD	R1, R15		// R15 is unchanged by C code
@@ -251,7 +280,7 @@
 	ADD	$32, R1, R4
 	SYSCALL $SYS_clock_gettime
 	MOVD	32(R1), R3
-	MOVD	48(R1), R5
+	MOVD	40(R1), R5
 	JMP	finish
 
 TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
@@ -612,6 +641,18 @@
 	SYSCALL	$SYS_fcntl
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R3 // fd
+	MOVD	$3, R4	// F_GETFL
+	MOVD	$0, R5
+	SYSCALL	$SYS_fcntl
+	OR	$0x800, R3, R5 // O_NONBLOCK
+	MOVW	fd+0(FP), R3 // fd
+	MOVD	$4, R4	// F_SETFL
+	SYSCALL	$SYS_fcntl
+	RET
+
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_linux_riscv64.s b/src/runtime/sys_linux_riscv64.s
new file mode 100644
index 0000000..626ab39
--- /dev/null
+++ b/src/runtime/sys_linux_riscv64.s
@@ -0,0 +1,515 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//
+// System calls and other sys.stuff for riscv64, Linux
+//
+
+#include "textflag.h"
+#include "go_asm.h"
+
+#define AT_FDCWD -100
+
+#define SYS_brk			214
+#define SYS_clock_gettime	113
+#define SYS_clone		220
+#define SYS_close		57
+#define SYS_connect		203
+#define SYS_epoll_create1	20
+#define SYS_epoll_ctl		21
+#define SYS_epoll_pwait		22
+#define SYS_exit		93
+#define SYS_exit_group		94
+#define SYS_faccessat		48
+#define SYS_fcntl		25
+#define SYS_futex		98
+#define SYS_getpid		172
+#define SYS_getrlimit		163
+#define SYS_gettid		178
+#define SYS_gettimeofday	169
+#define SYS_kill		129
+#define SYS_madvise		233
+#define SYS_mincore		232
+#define SYS_mmap		222
+#define SYS_munmap		215
+#define SYS_nanosleep		101
+#define SYS_openat		56
+#define SYS_pipe2		59
+#define SYS_pselect6		72
+#define SYS_read		63
+#define SYS_rt_sigaction	134
+#define SYS_rt_sigprocmask	135
+#define SYS_rt_sigreturn	139
+#define SYS_sched_getaffinity	123
+#define SYS_sched_yield		124
+#define SYS_setitimer		103
+#define SYS_sigaltstack		132
+#define SYS_socket		198
+#define SYS_tgkill		131
+#define SYS_tkill		130
+#define SYS_write		64
+
+// func exit(code int32)
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), A0
+	MOV	$SYS_exit_group, A7
+	ECALL
+	RET
+
+// func exitThread(wait *uint32)
+TEXT runtime·exitThread(SB),NOSPLIT|NOFRAME,$0-8
+	MOV	wait+0(FP), A0
+	// We're done using the stack.
+	FENCE
+	MOVW	ZERO, (A0)
+	FENCE
+	MOV	$0, A0	// exit code
+	MOV	$SYS_exit, A7
+	ECALL
+	JMP	0(PC)
+
+// func open(name *byte, mode, perm int32) int32
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
+	MOV	$AT_FDCWD, A0
+	MOV	name+0(FP), A1
+	MOVW	mode+8(FP), A2
+	MOVW	perm+12(FP), A3
+	MOV	$SYS_openat, A7
+	ECALL
+	MOV	$-4096, T0
+	BGEU	T0, A0, 2(PC)
+	MOV	$-1, A0
+	MOVW	A0, ret+16(FP)
+	RET
+
+// func closefd(fd int32) int32
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	fd+0(FP), A0
+	MOV	$SYS_close, A7
+	ECALL
+	MOV	$-4096, T0
+	BGEU	T0, A0, 2(PC)
+	MOV	$-1, A0
+	MOVW	A0, ret+8(FP)
+	RET
+
+// func write1(fd uintptr, p unsafe.Pointer, n int32) int32
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
+	MOV	fd+0(FP), A0
+	MOV	p+8(FP), A1
+	MOVW	n+16(FP), A2
+	MOV	$SYS_write, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func read(fd int32, p unsafe.Pointer, n int32) int32
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	fd+0(FP), A0
+	MOV	p+8(FP), A1
+	MOVW	n+16(FP), A2
+	MOV	$SYS_read, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOV	$r+0(FP), A0
+	MOV	ZERO, A1
+	MOV	$SYS_pipe2, A7
+	ECALL
+	MOVW	A0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOV	$r+8(FP), A0
+	MOVW	flags+0(FP), A1
+	MOV	$SYS_pipe2, A7
+	ECALL
+	MOVW	A0, errno+16(FP)
+	RET
+
+// func getrlimit(kind int32, limit unsafe.Pointer) int32
+TEXT runtime·getrlimit(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	kind+0(FP), A0
+	MOV	limit+8(FP), A1
+	MOV	$SYS_getrlimit, A7
+	ECALL
+	MOVW	A0, ret+16(FP)
+	RET
+
+// func usleep(usec uint32)
+TEXT runtime·usleep(SB),NOSPLIT,$24-4
+	MOVWU	usec+0(FP), A0
+	MOV	$1000, A1
+	MUL	A1, A0, A0
+	MOV	$1000000000, A1
+	DIV	A1, A0, A2
+	MOV	A2, 8(X2)
+	REM	A1, A0, A3
+	MOV	A3, 16(X2)
+	ADD	$8, X2, A0
+	MOV	ZERO, A1
+	MOV	$SYS_nanosleep, A7
+	ECALL
+	RET
+
+// func gettid() uint32
+TEXT runtime·gettid(SB),NOSPLIT,$0-4
+	MOV	$SYS_gettid, A7
+	ECALL
+	MOVW	A0, ret+0(FP)
+	RET
+
+// func raise(sig uint32)
+TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0
+	MOV	$SYS_gettid, A7
+	ECALL
+	// arg 1 tid - already in A0
+	MOVW	sig+0(FP), A1	// arg 2
+	MOV	$SYS_tkill, A7
+	ECALL
+	RET
+
+// func raiseproc(sig uint32)
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
+	MOV	$SYS_getpid, A7
+	ECALL
+	// arg 1 pid - already in A0
+	MOVW	sig+0(FP), A1	// arg 2
+	MOV	$SYS_kill, A7
+	ECALL
+	RET
+
+// func getpid() int
+TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8
+	MOV	$SYS_getpid, A7
+	ECALL
+	MOV	A0, ret+0(FP)
+	RET
+
+// func tgkill(tgid, tid, sig int)
+TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24
+	MOV	tgid+0(FP), A0
+	MOV	tid+8(FP), A1
+	MOV	sig+16(FP), A2
+	MOV	$SYS_tgkill, A7
+	ECALL
+	RET
+
+// func setitimer(mode int32, new, old *itimerval)
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	mode+0(FP), A0
+	MOV	new+8(FP), A1
+	MOV	old+16(FP), A2
+	MOV	$SYS_setitimer, A7
+	ECALL
+	RET
+
+// func mincore(addr unsafe.Pointer, n uintptr, dst *byte) int32
+TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOV	dst+16(FP), A2
+	MOV	$SYS_mincore, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$24-12
+	MOV	$0, A0 // CLOCK_REALTIME
+	MOV	$8(X2), A1
+	MOV	$SYS_clock_gettime, A7
+	ECALL
+	MOV	8(X2), T0	// sec
+	MOV	16(X2), T1	// nsec
+	MOV	T0, sec+0(FP)
+	MOVW	T1, nsec+8(FP)
+	RET
+
+// func nanotime1() int64
+TEXT runtime·nanotime1(SB),NOSPLIT,$24-8
+	MOV	$1, A0 // CLOCK_MONOTONIC
+	MOV	$8(X2), A1
+	MOV	$SYS_clock_gettime, A7
+	ECALL
+	MOV	8(X2), T0	// sec
+	MOV	16(X2), T1	// nsec
+	// sec is in T0, nsec in T1
+	// return nsec in T0
+	MOV	$1000000000, T2
+	MUL	T2, T0
+	ADD	T1, T0
+	MOV	T0, ret+0(FP)
+	RET
+
+// func rtsigprocmask(how int32, new, old *sigset, size int32)
+TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	how+0(FP), A0
+	MOV	new+8(FP), A1
+	MOV	old+16(FP), A2
+	MOVW	size+24(FP), A3
+	MOV	$SYS_rt_sigprocmask, A7
+	ECALL
+	MOV	$-4096, T0
+	BLTU	A0, T0, 2(PC)
+	WORD	$0	// crash
+	RET
+
+// func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
+TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
+	MOV	sig+0(FP), A0
+	MOV	new+8(FP), A1
+	MOV	old+16(FP), A2
+	MOV	size+24(FP), A3
+	MOV	$SYS_rt_sigaction, A7
+	ECALL
+	MOVW	A0, ret+32(FP)
+	RET
+
+// func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), A0
+	MOV	info+16(FP), A1
+	MOV	ctx+24(FP), A2
+	MOV	fn+0(FP), T1
+	JALR	RA, T1
+	RET
+
+// func sigtramp(signo, ureg, ctxt unsafe.Pointer)
+TEXT runtime·sigtramp(SB),NOSPLIT,$64
+	MOVW	A0, 8(X2)
+	MOV	A1, 16(X2)
+	MOV	A2, 24(X2)
+
+	// this might be called in external code context,
+	// where g is not set.
+	MOVBU	runtime·iscgo(SB), A0
+	BEQ	A0, ZERO, 2(PC)
+	CALL	runtime·load_g(SB)
+
+	MOV	$runtime·sigtrampgo(SB), A0
+	JALR	RA, A0
+	RET
+
+// func cgoSigtramp()
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	MOV	$runtime·sigtramp(SB), T1
+	JALR	ZERO, T1
+
+// func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) (p unsafe.Pointer, err int)
+TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOVW	prot+16(FP), A2
+	MOVW	flags+20(FP), A3
+	MOVW	fd+24(FP), A4
+	MOVW	off+28(FP), A5
+	MOV	$SYS_mmap, A7
+	ECALL
+	MOV	$-4096, T0
+	BGEU	T0, A0, 5(PC)
+	SUB	A0, ZERO, A0
+	MOV	ZERO, p+32(FP)
+	MOV	A0, err+40(FP)
+	RET
+ok:
+	MOV	A0, p+32(FP)
+	MOV	ZERO, err+40(FP)
+	RET
+
+// func munmap(addr unsafe.Pointer, n uintptr)
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOV	$SYS_munmap, A7
+	ECALL
+	MOV	$-4096, T0
+	BLTU	A0, T0, 2(PC)
+	WORD	$0	// crash
+	RET
+
+// func madvise(addr unsafe.Pointer, n uintptr, flags int32)
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOV	n+8(FP), A1
+	MOVW	flags+16(FP), A2
+	MOV	$SYS_madvise, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
+TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
+	MOV	addr+0(FP), A0
+	MOVW	op+8(FP), A1
+	MOVW	val+12(FP), A2
+	MOV	ts+16(FP), A3
+	MOV	addr2+24(FP), A4
+	MOVW	val3+32(FP), A5
+	MOV	$SYS_futex, A7
+	ECALL
+	MOVW	A0, ret+40(FP)
+	RET
+
+// func clone(flags int32, stk, mp, gp, fn unsafe.Pointer) int32
+TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), A0
+	MOV	stk+8(FP), A1
+
+	// Copy mp, gp, fn off parent stack for use by child.
+	MOV	mp+16(FP), T0
+	MOV	gp+24(FP), T1
+	MOV	fn+32(FP), T2
+
+	MOV	T0, -8(A1)
+	MOV	T1, -16(A1)
+	MOV	T2, -24(A1)
+	MOV	$1234, T0
+	MOV	T0, -32(A1)
+
+	MOV	$SYS_clone, A7
+	ECALL
+
+	// In parent, return.
+	BEQ	ZERO, A0, child
+	MOVW	ZERO, ret+40(FP)
+	RET
+
+child:
+	// In child, on new stack.
+	MOV	-32(X2), T0
+	MOV	$1234, A0
+	BEQ	A0, T0, good
+	WORD	$0	// crash
+
+good:
+	// Initialize m->procid to Linux tid
+	MOV	$SYS_gettid, A7
+	ECALL
+
+	MOV	-24(X2), T2	// fn
+	MOV	-16(X2), T1	// g
+	MOV	-8(X2), T0	// m
+
+	BEQ	ZERO, T0, nog
+	BEQ	ZERO, T1, nog
+
+	MOV	A0, m_procid(T0)
+
+	// In child, set up new stack
+	MOV	T0, g_m(T1)
+	MOV	T1, g
+
+nog:
+	// Call fn
+	JALR	RA, T2
+
+	// It shouldn't return.  If it does, exit this thread.
+	MOV	$111, A0
+	MOV	$SYS_exit, A7
+	ECALL
+	JMP	-3(PC)	// keep exiting
+
+// func sigaltstack(new, old *stackt)
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
+	MOV	new+0(FP), A0
+	MOV	old+8(FP), A1
+	MOV	$SYS_sigaltstack, A7
+	ECALL
+	MOV	$-4096, T0
+	BLTU	A0, T0, 2(PC)
+	WORD	$0	// crash
+	RET
+
+// func osyield()
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
+	MOV	$SYS_sched_yield, A7
+	ECALL
+	RET
+
+// func sched_getaffinity(pid, len uintptr, buf *uintptr) int32
+TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
+	MOV	pid+0(FP), A0
+	MOV	len+8(FP), A1
+	MOV	buf+16(FP), A2
+	MOV	$SYS_sched_getaffinity, A7
+	ECALL
+	MOV	A0, ret+24(FP)
+	RET
+
+// func epollcreate(size int32) int32
+TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
+	MOV	$0, A0
+	MOV	$SYS_epoll_create1, A7
+	ECALL
+	MOVW	A0, ret+8(FP)
+	RET
+
+// func epollcreate1(flags int32) int32
+TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), A0
+	MOV	$SYS_epoll_create1, A7
+	ECALL
+	MOVW	A0, ret+8(FP)
+	RET
+
+// func epollctl(epfd, op, fd int32, ev *epollevent) int32
+TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), A0
+	MOVW	op+4(FP), A1
+	MOVW	fd+8(FP), A2
+	MOV	ev+16(FP), A3
+	MOV	$SYS_epoll_ctl, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func epollwait(epfd int32, ev *epollevent, nev, timeout int32) int32
+TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), A0
+	MOV	ev+8(FP), A1
+	MOVW	nev+16(FP), A2
+	MOVW	timeout+20(FP), A3
+	MOV	$0, A4
+	MOV	$SYS_epoll_pwait, A7
+	ECALL
+	MOVW	A0, ret+24(FP)
+	RET
+
+// func closeonexec(int32)
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
+	MOVW	fd+0(FP), A0  // fd
+	MOV	$2, A1	// F_SETFD
+	MOV	$1, A2	// FD_CLOEXEC
+	MOV	$SYS_fcntl, A7
+	ECALL
+	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), A0 // fd
+	MOV	$3, A1	// F_GETFL
+	MOV	$0, A2
+	MOV	$SYS_fcntl, A7
+	ECALL
+	MOV	$0x800, A2 // O_NONBLOCK
+	OR	A0, A2
+	MOVW	fd+0(FP), A0 // fd
+	MOV	$4, A1	// F_SETFL
+	MOV	$SYS_fcntl, A7
+	ECALL
+	RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
+	// Implemented as brk(NULL).
+	MOV	$0, A0
+	MOV	$SYS_brk, A7
+	ECALL
+	MOVW	A0, ret+0(FP)
+	RET
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
index 58b36df..c15a1d5 100644
--- a/src/runtime/sys_linux_s390x.s
+++ b/src/runtime/sys_linux_s390x.s
@@ -16,6 +16,7 @@
 #define SYS_close                 6
 #define SYS_getpid               20
 #define SYS_kill                 37
+#define SYS_pipe		 42
 #define SYS_brk			 45
 #define SYS_fcntl                55
 #define SYS_mmap                 90
@@ -39,6 +40,7 @@
 #define SYS_epoll_ctl           250
 #define SYS_epoll_wait          251
 #define SYS_clock_gettime       260
+#define SYS_pipe2		325
 #define SYS_epoll_create1       327
 
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
@@ -80,15 +82,12 @@
 	MOVW	R2, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0-28
 	MOVD	fd+0(FP), R2
 	MOVD	p+8(FP), R3
 	MOVW	n+16(FP), R4
 	MOVW	$SYS_write, R1
 	SYSCALL
-	MOVD	$-4095, R3
-	CMPUBLT	R2, R3, 2(PC)
-	MOVW	$-1, R2
 	MOVW	R2, ret+24(FP)
 	RET
 
@@ -98,12 +97,26 @@
 	MOVW	n+16(FP), R4
 	MOVW	$SYS_read, R1
 	SYSCALL
-	MOVD	$-4095, R3
-	CMPUBLT	R2, R3, 2(PC)
-	MOVW	$-1, R2
 	MOVW	R2, ret+24(FP)
 	RET
 
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVD	$r+0(FP), R2
+	MOVW	$SYS_pipe, R1
+	SYSCALL
+	MOVW	R2, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	$r+8(FP), R2
+	MOVW	flags+0(FP), R3
+	MOVW	$SYS_pipe2, R1
+	SYSCALL
+	MOVW	R2, errno+16(FP)
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$16-4
 	MOVW	usec+0(FP), R2
 	MOVD	R2, R4
@@ -150,6 +163,20 @@
 	SYSCALL
 	RET
 
+TEXT ·getpid(SB),NOSPLIT|NOFRAME,$0-8
+	MOVW	$SYS_getpid, R1
+	SYSCALL
+	MOVD	R2, ret+0(FP)
+	RET
+
+TEXT ·tgkill(SB),NOSPLIT|NOFRAME,$0-24
+	MOVD	tgid+0(FP), R2
+	MOVD	tid+8(FP), R3
+	MOVD	sig+16(FP), R4
+	MOVW	$SYS_tgkill, R1
+	SYSCALL
+	RET
+
 TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	mode+0(FP), R2
 	MOVD	new+8(FP), R3
@@ -167,8 +194,8 @@
 	MOVW	R2, ret+24(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$16
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$16
 	MOVW	$0, R2 // CLOCK_REALTIME
 	MOVD	$tp-16(SP), R3
 	MOVW	$SYS_clock_gettime, R1
@@ -179,7 +206,7 @@
 	MOVW	R3, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$16
+TEXT runtime·nanotime1(SB),NOSPLIT,$16
 	MOVW	$1, R2 // CLOCK_MONOTONIC
 	MOVD	$tp-16(SP), R3
 	MOVW	$SYS_clock_gettime, R1
@@ -441,6 +468,21 @@
 	SYSCALL
 	RET
 
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R2 // fd
+	MOVD	$3, R3	// F_GETFL
+	XOR	R4, R4
+	MOVW	$SYS_fcntl, R1
+	SYSCALL
+	MOVD	$0x800, R4 // O_NONBLOCK
+	OR	R2, R4
+	MOVW	fd+0(FP), R2 // fd
+	MOVD	$4, R3	// F_SETFL
+	MOVW	$SYS_fcntl, R1
+	SYSCALL
+	RET
+
 // func sbrk0() uintptr
 TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
 	// Implemented as brk(NULL).
diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s
deleted file mode 100644
index 8460aab..0000000
--- a/src/runtime/sys_nacl_386.s
+++ /dev/null
@@ -1,374 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "textflag.h"
-#include "syscall_nacl.h"
-
-#define NACL_SYSCALL(code) \
-	MOVL $(0x10000 + ((code)<<5)), AX; CALL AX
-
-TEXT runtime·exit(SB),NOSPLIT,$4
-	MOVL code+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_exit)
-	JMP 0(PC)
-
-// func exitThread(wait *uint32)
-TEXT runtime·exitThread(SB),NOSPLIT,$4-4
-	MOVL wait+0(FP), AX
-	// SYS_thread_exit will clear *wait when the stack is free.
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_thread_exit)
-	JMP 0(PC)
-
-TEXT runtime·open(SB),NOSPLIT,$12
-	MOVL name+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL mode+4(FP), AX
-	MOVL AX, 4(SP)
-	MOVL perm+8(FP), AX
-	MOVL AX, 8(SP)
-	NACL_SYSCALL(SYS_open)
-	MOVL AX, ret+12(FP)
-	RET
-
-TEXT runtime·closefd(SB),NOSPLIT,$4
-	MOVL fd+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_close)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·read(SB),NOSPLIT,$12
-	MOVL fd+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL p+4(FP), AX
-	MOVL AX, 4(SP)
-	MOVL n+8(FP), AX
-	MOVL AX, 8(SP)
-	NACL_SYSCALL(SYS_read)
-	MOVL AX, ret+12(FP)
-	RET
-
-TEXT syscall·naclWrite(SB), NOSPLIT, $16-16
-	MOVL arg1+0(FP), DI
-	MOVL arg2+4(FP), SI
-	MOVL arg3+8(FP), DX
-	MOVL DI, 0(SP)
-	MOVL SI, 4(SP)
-	MOVL DX, 8(SP)
-	CALL runtime·write(SB)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·write(SB),NOSPLIT,$12
-	MOVL fd+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL p+4(FP), AX
-	MOVL AX, 4(SP)
-	MOVL n+8(FP), AX
-	MOVL AX, 8(SP)
-	NACL_SYSCALL(SYS_write)
-	MOVL AX, ret+12(FP)
-	RET
-
-TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$8
-	MOVL p+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL size+4(FP), AX
-	MOVL AX, 4(SP)
-	NACL_SYSCALL(SYS_exception_stack)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$8
-	MOVL fn+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL arg+4(FP), AX
-	MOVL AX, 4(SP)
-	NACL_SYSCALL(SYS_exception_handler)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_sem_create(SB),NOSPLIT,$4
-	MOVL flag+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_sem_create)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$4
-	MOVL sem+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_sem_wait)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_sem_post(SB),NOSPLIT,$4
-	MOVL sem+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_sem_post)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$4
-	MOVL flag+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_mutex_create)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$4
-	MOVL mutex+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_mutex_lock)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_trylock(SB),NOSPLIT,$4
-	MOVL mutex+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_mutex_trylock)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$4
-	MOVL mutex+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_mutex_unlock)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_create(SB),NOSPLIT,$4
-	MOVL flag+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_cond_create)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$8
-	MOVL cond+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL n+4(FP), AX
-	MOVL AX, 4(SP)
-	NACL_SYSCALL(SYS_cond_wait)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$4
-	MOVL cond+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_cond_signal)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$4
-	MOVL cond+0(FP), AX
-	MOVL AX, 0(SP)
-	NACL_SYSCALL(SYS_cond_broadcast)
-	MOVL AX, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$12
-	MOVL cond+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL lock+4(FP), AX
-	MOVL AX, 4(SP)
-	MOVL ts+8(FP), AX
-	MOVL AX, 8(SP)
-	NACL_SYSCALL(SYS_cond_timed_wait_abs)
-	MOVL AX, ret+12(FP)
-	RET
-
-TEXT runtime·nacl_thread_create(SB),NOSPLIT,$16
-	MOVL fn+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL stk+4(FP), AX
-	MOVL AX, 4(SP)
-	MOVL tls+8(FP), AX
-	MOVL AX, 8(SP)
-	MOVL xx+12(FP), AX
-	MOVL AX, 12(SP)
-	NACL_SYSCALL(SYS_thread_create)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·mstart_nacl(SB),NOSPLIT,$0
-	JMP runtime·mstart(SB)
-
-TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$8
-	MOVL ts+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL extra+4(FP), AX
-	MOVL AX, 4(SP)
-	NACL_SYSCALL(SYS_nanosleep)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·osyield(SB),NOSPLIT,$0
-	NACL_SYSCALL(SYS_sched_yield)
-	RET
-
-TEXT runtime·mmap(SB),NOSPLIT,$32
-	MOVL	addr+0(FP), AX
-	MOVL	AX, 0(SP)
-	MOVL	n+4(FP), AX
-	MOVL	AX, 4(SP)
-	MOVL	prot+8(FP), AX
-	MOVL	AX, 8(SP)
-	MOVL	flags+12(FP), AX
-	MOVL	AX, 12(SP)
-	MOVL	fd+16(FP), AX
-	MOVL	AX, 16(SP)
-	MOVL	off+20(FP), AX
-	MOVL	AX, 24(SP)
-	MOVL	$0, 28(SP)
-	LEAL	24(SP), AX
-	MOVL	AX, 20(SP)
-	NACL_SYSCALL(SYS_mmap)
-	CMPL	AX, $-4095
-	JNA	ok
-	NEGL	AX
-	MOVL	$0, p+24(FP)
-	MOVL	AX, err+28(FP)
-	RET
-ok:
-	MOVL	AX, p+24(FP)
-	MOVL	$0, err+28(FP)
-	RET
-
-TEXT runtime·walltime(SB),NOSPLIT,$20
-	MOVL $0, 0(SP) // real time clock
-	LEAL 8(SP), AX
-	MOVL AX, 4(SP) // timespec
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVL 8(SP), AX // low 32 sec
-	MOVL 12(SP), CX // high 32 sec
-	MOVL 16(SP), BX // nsec
-
-	// sec is in AX, nsec in BX
-	MOVL	AX, sec_lo+0(FP)
-	MOVL	CX, sec_hi+4(FP)
-	MOVL	BX, nsec+8(FP)
-	RET
-
-TEXT syscall·now(SB),NOSPLIT,$0
-	JMP runtime·walltime(SB)
-
-TEXT runtime·nanotime(SB),NOSPLIT,$20
-	MOVL $0, 0(SP) // real time clock
-	LEAL 8(SP), AX
-	MOVL AX, 4(SP) // timespec
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVL 8(SP), AX // low 32 sec
-	MOVL 16(SP), BX // nsec
-
-	// sec is in AX, nsec in BX
-	// convert to DX:AX nsec
-	MOVL	$1000000000, CX
-	MULL	CX
-	ADDL	BX, AX
-	ADCL	$0, DX
-
-	MOVL	AX, ret_lo+0(FP)
-	MOVL	DX, ret_hi+4(FP)
-	RET
-
-TEXT runtime·setldt(SB),NOSPLIT,$8
-	MOVL	base+4(FP), BX
-	ADDL	$0x8, BX
-	MOVL	BX, 0(SP)
-	NACL_SYSCALL(SYS_tls_init)
-	RET
-
-TEXT runtime·sigtramp(SB),NOSPLIT,$0
-	get_tls(CX)
-
-	// check that g exists
-	MOVL	g(CX), DI
-	CMPL	DI, $0
-	JNE	6(PC)
-	MOVL	$11, BX
-	MOVL	$0, 0(SP)
-	MOVL	$runtime·badsignal(SB), AX
-	CALL	AX
-	JMP 	ret
-
-	// save g
-	NOP	SP	// tell vet SP changed - stop checking offsets
-	MOVL	DI, 20(SP)
-
-	// g = m->gsignal
-	MOVL	g_m(DI), BX
-	MOVL	m_gsignal(BX), BX
-	MOVL	BX, g(CX)
-
-	// copy arguments for sighandler
-	MOVL	$11, 0(SP) // signal
-	MOVL	$0, 4(SP) // siginfo
-	LEAL	8(SP), AX
-	MOVL	AX, 8(SP) // context
-	MOVL	DI, 12(SP) // g
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), BX
-	MOVL	BX, g(CX)
-
-ret:
-	// Enable exceptions again.
-	NACL_SYSCALL(SYS_exception_clear_flag)
-
-	// NaCl has abdicated its traditional operating system responsibility
-	// and declined to implement 'sigreturn'. Instead the only way to return
-	// to the execution of our program is to restore the registers ourselves.
-	// Unfortunately, that is impossible to do with strict fidelity, because
-	// there is no way to do the final update of PC that ends the sequence
-	// without either (1) jumping to a register, in which case the register ends
-	// holding the PC value instead of its intended value or (2) storing the PC
-	// on the stack and using RET, which imposes the requirement that SP is
-	// valid and that is okay to smash the word below it. The second would
-	// normally be the lesser of the two evils, except that on NaCl, the linker
-	// must rewrite RET into "POP reg; AND $~31, reg; JMP reg", so either way
-	// we are going to lose a register as a result of the incoming signal.
-	// Similarly, there is no way to restore EFLAGS; the usual way is to use
-	// POPFL, but NaCl rejects that instruction. We could inspect the bits and
-	// execute a sequence of instructions designed to recreate those flag
-	// settings, but that's a lot of work.
-	//
-	// Thankfully, Go's signal handlers never try to return directly to the
-	// executing code, so all the registers and EFLAGS are dead and can be
-	// smashed. The only registers that matter are the ones that are setting
-	// up for the simulated call that the signal handler has created.
-	// Today those registers are just PC and SP, but in case additional registers
-	// are relevant in the future (for example DX is the Go func context register)
-	// we restore as many registers as possible.
-	//
-	// We smash BP, because that's what the linker smashes during RET.
-	//
-	LEAL	72(SP), BP
-	MOVL	0(BP), AX
-	MOVL	4(BP), CX
-	MOVL	8(BP), DX
-	MOVL	12(BP), BX
-	MOVL	16(BP), SP
-	// 20(BP) is saved BP, never to be seen again
-	MOVL	24(BP), SI
-	MOVL	28(BP), DI
-	// 36(BP) is saved EFLAGS, never to be seen again
-	MOVL	32(BP), BP // saved PC
-	JMP	BP
-
-// func getRandomData([]byte)
-TEXT runtime·getRandomData(SB),NOSPLIT,$8-12
-	MOVL arg_base+0(FP), AX
-	MOVL AX, 0(SP)
-	MOVL arg_len+4(FP), AX
-	MOVL AX, 4(SP)
-	NACL_SYSCALL(SYS_get_random_bytes)
-	RET
diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s
deleted file mode 100644
index 9f4f69c..0000000
--- a/src/runtime/sys_nacl_amd64p32.s
+++ /dev/null
@@ -1,482 +0,0 @@
-// Copyright 2013 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "textflag.h"
-#include "syscall_nacl.h"
-
-#define NACL_SYSCALL(code) \
-	MOVL $(0x10000 + ((code)<<5)), AX; CALL AX
-
-TEXT runtime·settls(SB),NOSPLIT,$0
-	MOVL	DI, TLS // really BP
-	RET
-
-TEXT runtime·exit(SB),NOSPLIT,$0
-	MOVL code+0(FP), DI
-	NACL_SYSCALL(SYS_exit)
-	RET
-
-// func exitThread(wait *uint32)
-TEXT runtime·exitThread(SB),NOSPLIT,$0-4
-	MOVL wait+0(FP), DI
-	// SYS_thread_exit will clear *wait when the stack is free.
-	NACL_SYSCALL(SYS_thread_exit)
-	JMP 0(PC)
-
-TEXT runtime·open(SB),NOSPLIT,$0
-	MOVL name+0(FP), DI
-	MOVL mode+4(FP), SI
-	MOVL perm+8(FP), DX
-	NACL_SYSCALL(SYS_open)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·closefd(SB),NOSPLIT,$0
-	MOVL fd+0(FP), DI
-	NACL_SYSCALL(SYS_close)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·read(SB),NOSPLIT,$0
-	MOVL fd+0(FP), DI
-	MOVL p+4(FP), SI
-	MOVL n+8(FP), DX
-	NACL_SYSCALL(SYS_read)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT syscall·naclWrite(SB), NOSPLIT, $24-20
-	MOVL arg1+0(FP), DI
-	MOVL arg2+4(FP), SI
-	MOVL arg3+8(FP), DX
-	MOVL DI, 0(SP)
-	MOVL SI, 4(SP)
-	MOVL DX, 8(SP)
-	CALL runtime·write(SB)
-	MOVL 16(SP), AX
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·write(SB),NOSPLIT,$16-20
-	// If using fake time and writing to stdout or stderr,
-	// emit playback header before actual data.
-	MOVQ runtime·faketime(SB), AX
-	CMPQ AX, $0
-	JEQ write
-	MOVL fd+0(FP), DI
-	CMPL DI, $1
-	JEQ playback
-	CMPL DI, $2
-	JEQ playback
-
-write:
-	// Ordinary write.
-	MOVL fd+0(FP), DI
-	MOVL p+4(FP), SI
-	MOVL n+8(FP), DX
-	NACL_SYSCALL(SYS_write)
-	MOVL	AX, ret+16(FP)
-	RET
-
-	// Write with playback header.
-	// First, lock to avoid interleaving writes.
-playback:
-	MOVL $1, BX
-	XCHGL	runtime·writelock(SB), BX
-	CMPL BX, $0
-	JNE playback
-
-	MOVQ runtime·lastfaketime(SB), CX
-	MOVL runtime·lastfaketimefd(SB), BX
-	CMPL DI, BX
-	JE samefd
-
-	// If the current fd doesn't match the fd of the previous write,
-	// ensure that the timestamp is strictly greater. That way, we can
-	// recover the original order even if we read the fds separately.
-	INCQ CX
-	MOVL DI, runtime·lastfaketimefd(SB)
-
-samefd:
-	CMPQ AX, CX
-	CMOVQLT CX, AX
-	MOVQ AX, runtime·lastfaketime(SB)
-
-	// Playback header: 0 0 P B <8-byte time> <4-byte data length>
-	MOVL $(('B'<<24) | ('P'<<16)), 0(SP)
-	BSWAPQ AX
-	MOVQ AX, 4(SP)
-	MOVL n+8(FP), DX
-	BSWAPL DX
-	MOVL DX, 12(SP)
-	MOVL fd+0(FP), DI
-	MOVL SP, SI
-	MOVL $16, DX
-	NACL_SYSCALL(SYS_write)
-
-	// Write actual data.
-	MOVL fd+0(FP), DI
-	MOVL p+4(FP), SI
-	MOVL n+8(FP), DX
-	NACL_SYSCALL(SYS_write)
-
-	// Unlock.
-	MOVL	$0, runtime·writelock(SB)
-
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$0
-	MOVL p+0(FP), DI
-	MOVL size+4(FP), SI
-	NACL_SYSCALL(SYS_exception_stack)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$0
-	MOVL fn+0(FP), DI
-	MOVL arg+4(FP), SI
-	NACL_SYSCALL(SYS_exception_handler)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_sem_create(SB),NOSPLIT,$0
-	MOVL flag+0(FP), DI
-	NACL_SYSCALL(SYS_sem_create)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$0
-	MOVL sem+0(FP), DI
-	NACL_SYSCALL(SYS_sem_wait)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_sem_post(SB),NOSPLIT,$0
-	MOVL sem+0(FP), DI
-	NACL_SYSCALL(SYS_sem_post)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$0
-	MOVL flag+0(FP), DI
-	NACL_SYSCALL(SYS_mutex_create)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$0
-	MOVL mutex+0(FP), DI
-	NACL_SYSCALL(SYS_mutex_lock)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_mutex_trylock(SB),NOSPLIT,$0
-	MOVL mutex+0(FP), DI
-	NACL_SYSCALL(SYS_mutex_trylock)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$0
-	MOVL mutex+0(FP), DI
-	NACL_SYSCALL(SYS_mutex_unlock)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_create(SB),NOSPLIT,$0
-	MOVL flag+0(FP), DI
-	NACL_SYSCALL(SYS_cond_create)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$0
-	MOVL cond+0(FP), DI
-	MOVL n+4(FP), SI
-	NACL_SYSCALL(SYS_cond_wait)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$0
-	MOVL cond+0(FP), DI
-	NACL_SYSCALL(SYS_cond_signal)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$0
-	MOVL cond+0(FP), DI
-	NACL_SYSCALL(SYS_cond_broadcast)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$0
-	MOVL cond+0(FP), DI
-	MOVL lock+4(FP), SI
-	MOVL ts+8(FP), DX
-	NACL_SYSCALL(SYS_cond_timed_wait_abs)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·nacl_thread_create(SB),NOSPLIT,$0
-	MOVL fn+0(FP), DI
-	MOVL stk+4(FP), SI
-	MOVL tls+8(FP), DX
-	MOVL xx+12(FP), CX
-	NACL_SYSCALL(SYS_thread_create)
-	MOVL AX, ret+16(FP)
-	RET
-
-TEXT runtime·mstart_nacl(SB),NOSPLIT,$0
-	NACL_SYSCALL(SYS_tls_get)
-	SUBL	$8, AX
-	MOVL	AX, TLS
-	JMP runtime·mstart(SB)
-
-TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$0
-	MOVL ts+0(FP), DI
-	MOVL extra+4(FP), SI
-	NACL_SYSCALL(SYS_nanosleep)
-	MOVL AX, ret+8(FP)
-	RET
-
-TEXT runtime·osyield(SB),NOSPLIT,$0
-	NACL_SYSCALL(SYS_sched_yield)
-	RET
-
-TEXT runtime·mmap(SB),NOSPLIT,$8
-	MOVL addr+0(FP), DI
-	MOVL n+4(FP), SI
-	MOVL prot+8(FP), DX
-	MOVL flags+12(FP), CX
-	MOVL fd+16(FP), R8
-	MOVL off+20(FP), AX
-	MOVQ AX, 0(SP)
-	MOVL SP, R9
-	NACL_SYSCALL(SYS_mmap)
-	CMPL AX, $-4095
-	JNA ok
-	NEGL AX
-	MOVL	$0, p+24(FP)
-	MOVL	AX, err+28(FP)
-	RET
-ok:
-	MOVL	AX, p+24(FP)
-	MOVL	$0, err+28(FP)
-	RET
-
-TEXT runtime·walltime(SB),NOSPLIT,$16
-	MOVQ runtime·faketime(SB), AX
-	CMPQ AX, $0
-	JEQ realtime
-	MOVQ $0, DX
-	MOVQ $1000000000, CX
-	DIVQ CX
-	MOVQ AX, sec+0(FP)
-	MOVL DX, nsec+8(FP)
-	RET
-realtime:
-	MOVL $0, DI // real time clock
-	LEAL 0(SP), AX
-	MOVL AX, SI // timespec
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVL 0(SP), AX // low 32 sec
-	MOVL 4(SP), CX // high 32 sec
-	MOVL 8(SP), BX // nsec
-
-	// sec is in AX, nsec in BX
-	MOVL	AX, sec_lo+0(FP)
-	MOVL	CX, sec_hi+4(FP)
-	MOVL	BX, nsec+8(FP)
-	RET
-
-TEXT syscall·now(SB),NOSPLIT,$0
-	JMP runtime·walltime(SB)
-
-TEXT runtime·nanotime(SB),NOSPLIT,$16
-	MOVQ runtime·faketime(SB), AX
-	CMPQ AX, $0
-	JEQ 3(PC)
-	MOVQ	AX, ret+0(FP)
-	RET
-	MOVL $0, DI // real time clock
-	LEAL 0(SP), AX
-	MOVL AX, SI // timespec
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVQ 0(SP), AX // sec
-	MOVL 8(SP), DX // nsec
-
-	// sec is in AX, nsec in DX
-	// return nsec in AX
-	IMULQ	$1000000000, AX
-	ADDQ	DX, AX
-	MOVQ	AX, ret+0(FP)
-	RET
-
-TEXT runtime·sigtramp(SB),NOSPLIT,$80
-	// restore TLS register at time of execution,
-	// in case it's been smashed.
-	// the TLS register is really BP, but for consistency
-	// with non-NaCl systems it is referred to here as TLS.
-	// NOTE: Cannot use SYS_tls_get here (like we do in mstart_nacl),
-	// because the main thread never calls tls_set.
-	LEAL ctxt+0(FP), AX
-	MOVL	(16*4+5*8)(AX), AX
-	MOVL	AX, TLS
-
-	// check that g exists
-	get_tls(CX)
-	MOVL	g(CX), DI
-
-	CMPL	DI, $0
-	JEQ	nog
-
-	// save g
-	MOVL	DI, 20(SP)
-
-	// g = m->gsignal
-	MOVL	g_m(DI), BX
-	MOVL	m_gsignal(BX), BX
-	MOVL	BX, g(CX)
-
-//JMP debughandler
-
-	// copy arguments for sighandler
-	MOVL	$11, 0(SP) // signal
-	MOVL	$0, 4(SP) // siginfo
-	LEAL	ctxt+0(FP), AX
-	MOVL	AX, 8(SP) // context
-	MOVL	DI, 12(SP) // g
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), BX
-	MOVL	BX, g(CX)
-
-	// Enable exceptions again.
-	NACL_SYSCALL(SYS_exception_clear_flag)
-
-	// Restore registers as best we can. Impossible to do perfectly.
-	// See comment in sys_nacl_386.s for extended rationale.
-	LEAL	ctxt+0(FP), SI
-	ADDL	$64, SI
-	MOVQ	0(SI), AX
-	MOVQ	8(SI), CX
-	MOVQ	16(SI), DX
-	MOVQ	24(SI), BX
-	MOVL	32(SI), SP	// MOVL for SP sandboxing
-	// 40(SI) is saved BP aka TLS, already restored above
-	// 48(SI) is saved SI, never to be seen again
-	MOVQ	56(SI), DI
-	MOVQ	64(SI), R8
-	MOVQ	72(SI), R9
-	MOVQ	80(SI), R10
-	MOVQ	88(SI), R11
-	MOVQ	96(SI), R12
-	MOVQ	104(SI), R13
-	MOVQ	112(SI), R14
-	// 120(SI) is R15, which is owned by Native Client and must not be modified
-	MOVQ	128(SI), SI // saved PC
-	// 136(SI) is saved EFLAGS, never to be seen again
-	JMP	SI
-
-//debughandler:
-	//// print basic information
-	//LEAL	ctxt+0(FP), DI
-	//MOVL	$runtime·sigtrampf(SB), AX
-	//MOVL	AX, 0(SP)
-	//MOVQ	(16*4+16*8)(DI), BX // rip
-	//MOVQ	BX, 8(SP)
-	//MOVQ	(16*4+0*8)(DI), BX // rax
-	//MOVQ	BX, 16(SP)
-	//MOVQ	(16*4+1*8)(DI), BX // rcx
-	//MOVQ	BX, 24(SP)
-	//MOVQ	(16*4+2*8)(DI), BX // rdx
-	//MOVQ	BX, 32(SP)
-	//MOVQ	(16*4+3*8)(DI), BX // rbx
-	//MOVQ	BX, 40(SP)
-	//MOVQ	(16*4+7*8)(DI), BX // rdi
-	//MOVQ	BX, 48(SP)
-	//MOVQ	(16*4+15*8)(DI), BX // r15
-	//MOVQ	BX, 56(SP)
-	//MOVQ	(16*4+4*8)(DI), BX // rsp
-	//MOVQ	0(BX), BX
-	//MOVQ	BX, 64(SP)
-	//CALL	runtime·printf(SB)
-	//
-	//LEAL	ctxt+0(FP), DI
-	//MOVQ	(16*4+16*8)(DI), BX // rip
-	//MOVL	BX, 0(SP)
-	//MOVQ	(16*4+4*8)(DI), BX // rsp
-	//MOVL	BX, 4(SP)
-	//MOVL	$0, 8(SP)	// lr
-	//get_tls(CX)
-	//MOVL	g(CX), BX
-	//MOVL	BX, 12(SP)	// gp
-	//CALL	runtime·traceback(SB)
-
-notls:
-	MOVL	0, AX
-	RET
-
-nog:
-	MOVL	0, AX
-	RET
-
-// cannot do real signal handling yet, because gsignal has not been allocated.
-MOVL $1, DI; NACL_SYSCALL(SYS_exit)
-
-// func getRandomData([]byte)
-TEXT runtime·getRandomData(SB),NOSPLIT,$0-12
-	MOVL arg_base+0(FP), DI
-	MOVL arg_len+4(FP), SI
-	NACL_SYSCALL(SYS_get_random_bytes)
-	RET
-
-TEXT runtime·nacl_sysinfo(SB),NOSPLIT,$16
-/*
-	MOVL	di+0(FP), DI
-	LEAL	12(DI), BX
-	MOVL	8(DI), AX
-	ADDL	4(DI), AX
-	ADDL	$2, AX
-	LEAL	(BX)(AX*4), BX
-	MOVL	BX, runtime·nacl_irt_query(SB)
-auxloop:
-	MOVL	0(BX), DX
-	CMPL	DX, $0
-	JNE	2(PC)
-	RET
-	CMPL	DX, $32
-	JEQ	auxfound
-	ADDL	$8, BX
-	JMP	auxloop
-auxfound:
-	MOVL	4(BX), BX
-	MOVL	BX, runtime·nacl_irt_query(SB)
-
-	LEAL	runtime·nacl_irt_basic_v0_1_str(SB), DI
-	LEAL	runtime·nacl_irt_basic_v0_1(SB), SI
-	MOVL	runtime·nacl_irt_basic_v0_1_size(SB), DX
-	MOVL	runtime·nacl_irt_query(SB), BX
-	CALL	BX
-
-	LEAL	runtime·nacl_irt_memory_v0_3_str(SB), DI
-	LEAL	runtime·nacl_irt_memory_v0_3(SB), SI
-	MOVL	runtime·nacl_irt_memory_v0_3_size(SB), DX
-	MOVL	runtime·nacl_irt_query(SB), BX
-	CALL	BX
-
-	LEAL	runtime·nacl_irt_thread_v0_1_str(SB), DI
-	LEAL	runtime·nacl_irt_thread_v0_1(SB), SI
-	MOVL	runtime·nacl_irt_thread_v0_1_size(SB), DX
-	MOVL	runtime·nacl_irt_query(SB), BX
-	CALL	BX
-
-	// TODO: Once we have a NaCl SDK with futex syscall support,
-	// try switching to futex syscalls and here load the
-	// nacl-irt-futex-0.1 table.
-*/
-	RET
diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s
deleted file mode 100644
index 9020168..0000000
--- a/src/runtime/sys_nacl_arm.s
+++ /dev/null
@@ -1,312 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "go_asm.h"
-#include "go_tls.h"
-#include "textflag.h"
-#include "syscall_nacl.h"
-
-#define NACL_SYSCALL(code) \
-	MOVW	$(0x10000 + ((code)<<5)), R8; BL (R8)
-
-TEXT runtime·exit(SB),NOSPLIT,$0
-	MOVW	code+0(FP), R0
-	NACL_SYSCALL(SYS_exit)
-	RET
-
-// func exitThread(wait *uint32)
-TEXT runtime·exitThread(SB),NOSPLIT,$4-4
-	MOVW wait+0(FP), R0
-	// SYS_thread_exit will clear *wait when the stack is free.
-	NACL_SYSCALL(SYS_thread_exit)
-	JMP 0(PC)
-
-TEXT runtime·open(SB),NOSPLIT,$0
-	MOVW	name+0(FP), R0
-	MOVW	name+0(FP), R1
-	MOVW	name+0(FP), R2
-	NACL_SYSCALL(SYS_open)
-	MOVW	R0, ret+12(FP)
-	RET
-
-TEXT runtime·closefd(SB),NOSPLIT,$0
-	MOVW	fd+0(FP), R0
-	NACL_SYSCALL(SYS_close)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·read(SB),NOSPLIT,$0
-	MOVW	fd+0(FP), R0
-	MOVW	p+4(FP), R1
-	MOVW	n+8(FP), R2
-	NACL_SYSCALL(SYS_read)
-	MOVW	R0, ret+12(FP)
-	RET
-
-// func naclWrite(fd int, b []byte) int
-TEXT syscall·naclWrite(SB),NOSPLIT,$0
-	MOVW	arg1+0(FP), R0
-	MOVW	arg2+4(FP), R1
-	MOVW	arg3+8(FP), R2
-	NACL_SYSCALL(SYS_write)
-	MOVW	R0, ret+16(FP)
-	RET
-
-TEXT runtime·write(SB),NOSPLIT,$0
-	MOVW	fd+0(FP), R0
-	MOVW	p+4(FP), R1
-	MOVW	n+8(FP), R2
-	NACL_SYSCALL(SYS_write)
-	MOVW	R0, ret+12(FP)
-	RET
-
-TEXT runtime·nacl_exception_stack(SB),NOSPLIT,$0
-	MOVW	p+0(FP), R0
-	MOVW	size+4(FP), R1
-	NACL_SYSCALL(SYS_exception_stack)
-	MOVW	R0, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_exception_handler(SB),NOSPLIT,$0
-	MOVW	fn+0(FP), R0
-	MOVW	arg+4(FP), R1
-	NACL_SYSCALL(SYS_exception_handler)
-	MOVW	R0, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_sem_create(SB),NOSPLIT,$0
-	MOVW	flag+0(FP), R0
-	NACL_SYSCALL(SYS_sem_create)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_sem_wait(SB),NOSPLIT,$0
-	MOVW	sem+0(FP), R0
-	NACL_SYSCALL(SYS_sem_wait)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_sem_post(SB),NOSPLIT,$0
-	MOVW	sem+0(FP), R0
-	NACL_SYSCALL(SYS_sem_post)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_create(SB),NOSPLIT,$0
-	MOVW	flag+0(FP), R0
-	NACL_SYSCALL(SYS_mutex_create)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_lock(SB),NOSPLIT,$0
-	MOVW	mutex+0(FP), R0
-	NACL_SYSCALL(SYS_mutex_lock)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_trylock(SB),NOSPLIT,$0
-	MOVW	mutex+0(FP), R0
-	NACL_SYSCALL(SYS_mutex_trylock)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_mutex_unlock(SB),NOSPLIT,$0
-	MOVW	mutex+0(FP), R0
-	NACL_SYSCALL(SYS_mutex_unlock)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_create(SB),NOSPLIT,$0
-	MOVW	flag+0(FP), R0
-	NACL_SYSCALL(SYS_cond_create)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_wait(SB),NOSPLIT,$0
-	MOVW	cond+0(FP), R0
-	MOVW	n+4(FP), R1
-	NACL_SYSCALL(SYS_cond_wait)
-	MOVW	R0, ret+8(FP)
-	RET
-
-TEXT runtime·nacl_cond_signal(SB),NOSPLIT,$0
-	MOVW	cond+0(FP), R0
-	NACL_SYSCALL(SYS_cond_signal)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_broadcast(SB),NOSPLIT,$0
-	MOVW	cond+0(FP), R0
-	NACL_SYSCALL(SYS_cond_broadcast)
-	MOVW	R0, ret+4(FP)
-	RET
-
-TEXT runtime·nacl_cond_timed_wait_abs(SB),NOSPLIT,$0
-	MOVW	cond+0(FP), R0
-	MOVW	lock+4(FP), R1
-	MOVW	ts+8(FP), R2
-	NACL_SYSCALL(SYS_cond_timed_wait_abs)
-	MOVW	R0, ret+12(FP)
-	RET
-
-TEXT runtime·nacl_thread_create(SB),NOSPLIT,$0
-	MOVW	fn+0(FP), R0
-	MOVW	stk+4(FP), R1
-	MOVW	tls+8(FP), R2
-	MOVW	xx+12(FP), R3
-	NACL_SYSCALL(SYS_thread_create)
-	MOVW	R0, ret+16(FP)
-	RET
-
-TEXT runtime·mstart_nacl(SB),NOSPLIT,$0
-	MOVW	0(R9), R0 // TLS
-	MOVW	-8(R0), R1 // g
-	MOVW	-4(R0), R2 // m
-	MOVW	R2, g_m(R1)
-	MOVW	R1, g
-	B runtime·mstart(SB)
-
-TEXT runtime·nacl_nanosleep(SB),NOSPLIT,$0
-	MOVW	ts+0(FP), R0
-	MOVW	extra+4(FP), R1
-	NACL_SYSCALL(SYS_nanosleep)
-	MOVW	R0, ret+8(FP)
-	RET
-
-TEXT runtime·osyield(SB),NOSPLIT,$0
-	NACL_SYSCALL(SYS_sched_yield)
-	RET
-
-TEXT runtime·mmap(SB),NOSPLIT,$8
-	MOVW	addr+0(FP), R0
-	MOVW	n+4(FP), R1
-	MOVW	prot+8(FP), R2
-	MOVW	flags+12(FP), R3
-	MOVW	fd+16(FP), R4
-	// arg6:offset should be passed as a pointer (to int64)
-	MOVW	off+20(FP), R5
-	MOVW	R5, 4(R13)
-	MOVW	$0, R6
-	MOVW	R6, 8(R13)
-	MOVW	$4(R13), R5
-	MOVM.DB.W [R4,R5], (R13) // arg5 and arg6 are passed on stack
-	NACL_SYSCALL(SYS_mmap)
-	MOVM.IA.W (R13), [R4, R5]
-	CMP	$-4095, R0
-	MOVW	$0, R1
-	RSB.HI	$0, R0
-	MOVW.HI	R0, R1		// if error, put in R1
-	MOVW.HI	$0, R0
-	MOVW	R0, p+24(FP)
-	MOVW	R1, err+28(FP)
-	RET
-
-TEXT runtime·walltime(SB),NOSPLIT,$16
-	MOVW	$0, R0 // real time clock
-	MOVW	$4(R13), R1
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVW	4(R13), R0 // low 32-bit sec
-	MOVW	8(R13), R1 // high 32-bit sec
-	MOVW	12(R13), R2 // nsec
-	MOVW	R0, sec_lo+0(FP)
-	MOVW	R1, sec_hi+4(FP)
-	MOVW	R2, nsec+8(FP)
-	RET
-
-TEXT syscall·now(SB),NOSPLIT,$0
-	B runtime·walltime(SB)
-
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB),NOSPLIT,$16
-	MOVW	$0, R0 // real time clock
-	MOVW	$4(R13), R1
-	NACL_SYSCALL(SYS_clock_gettime)
-	MOVW	4(R13), R0 // low 32-bit sec
-	MOVW	8(R13), R1 // high 32-bit sec (ignored for now)
-	MOVW	12(R13), R2 // nsec
-	MOVW	$1000000000, R3
-	MULLU	R0, R3, (R1, R0)
-	MOVW	$0, R4
-	ADD.S	R2, R0
-	ADC	R4, R1
-	MOVW	R0, ret_lo+0(FP)
-	MOVW	R1, ret_hi+4(FP)
-	RET
-
-TEXT runtime·sigtramp(SB),NOSPLIT,$80
-	// load g from thread context
-	MOVW	$ctxt+-4(FP), R0
-	MOVW	(16*4+10*4)(R0), g
-
-	// check that g exists
-	CMP	$0, g
-	BNE 	4(PC)
-	MOVW  	$runtime·badsignal2(SB), R11
-	BL	(R11)
-	RET
-
-	// save g
-	MOVW	g, R3
-	MOVW	g, 20(R13)
-
-	// g = m->gsignal
-	MOVW	g_m(g), R8
-	MOVW	m_gsignal(R8), g
-
-	// copy arguments for call to sighandler
-	MOVW	$11, R0
-	MOVW	R0, 4(R13) // signal
-	MOVW	$0, R0
-	MOVW	R0, 8(R13) // siginfo
-	MOVW	$ctxt+-4(FP), R0
-	MOVW	R0, 12(R13) // context
-	MOVW	R3, 16(R13) // g
-
-	BL	runtime·sighandler(SB)
-
-	// restore g
-	MOVW	20(R13), g
-
-	// Enable exceptions again.
-	NACL_SYSCALL(SYS_exception_clear_flag)
-
-	// Restore registers as best we can. Impossible to do perfectly.
-	// See comment in sys_nacl_386.s for extended rationale.
-	MOVW	$ctxt+-4(FP), R1
-	ADD	$64, R1
-	MOVW	(0*4)(R1), R0
-	MOVW	(2*4)(R1), R2
-	MOVW	(3*4)(R1), R3
-	MOVW	(4*4)(R1), R4
-	MOVW	(5*4)(R1), R5
-	MOVW	(6*4)(R1), R6
-	MOVW	(7*4)(R1), R7
-	MOVW	(8*4)(R1), R8
-	// cannot write to R9
-	MOVW	(10*4)(R1), g
-	MOVW	(11*4)(R1), R11
-	MOVW	(12*4)(R1), R12
-	MOVW	(13*4)(R1), R13
-	MOVW	(14*4)(R1), R14
-	MOVW	(15*4)(R1), R1
-	B	(R1)
-
-nog:
-	MOVW	$0, R0
-	RET
-
-// func getRandomData([]byte)
-TEXT runtime·getRandomData(SB),NOSPLIT,$0-12
-	MOVW arg_base+0(FP), R0
-	MOVW arg_len+4(FP), R1
-	NACL_SYSCALL(SYS_get_random_bytes)
-	RET
-
-// Likewise, this is only valid for ARMv7+, but that's okay.
-TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
-	B	runtime·armPublicationBarrier(SB)
-
-TEXT runtime·read_tls_fallback(SB),NOSPLIT|NOFRAME,$0
-	WORD $0xe7fedef0 // NACL_INSTR_ARM_ABORT_NOW (UDF #0xEDE0)
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
index c14ecfb..d0c470c 100644
--- a/src/runtime/sys_netbsd_386.s
+++ b/src/runtime/sys_netbsd_386.s
@@ -83,15 +83,41 @@
 	MOVL	$SYS_read, AX
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-4
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVL	$42, AX
+	INT	$0x80
+	JCC	pipeok
+	MOVL	$-1, r+0(FP)
+	MOVL	$-1, w+4(FP)
+	MOVL	AX, errno+8(FP)
+	RET
+pipeok:
+	MOVL	AX, r+0(FP)
+	MOVL	DX, w+4(FP)
+	MOVL	$0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$12-16
+	MOVL	$453, AX
+	LEAL	r+4(FP), BX
+	MOVL	BX, 4(SP)
+	MOVL	flags+0(FP), BX
+	MOVL	BX, 8(SP)
+	INT	$0x80
+	MOVL	AX, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-4
 	MOVL	$SYS_write, AX
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
@@ -114,12 +140,11 @@
 	INT	$0x80
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$12
-	MOVL	$SYS__lwp_self, AX
-	INT	$0x80
+TEXT runtime·lwp_kill(SB),NOSPLIT,$12-8
 	MOVL	$0, 0(SP)
+	MOVL	tid+0(FP), AX
 	MOVL	AX, 4(SP)		// arg 1 - target
-	MOVL	sig+0(FP), AX
+	MOVL	sig+4(FP), AX
 	MOVL	AX, 8(SP)		// arg 2 - signo
 	MOVL	$SYS__lwp_kill, AX
 	INT	$0x80
@@ -181,8 +206,8 @@
 	INT	$0x80
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	LEAL	12(SP), BX
 	MOVL	$CLOCK_REALTIME, 4(SP)	// arg 1 - clock_id
 	MOVL	BX, 8(SP)		// arg 2 - tp
@@ -198,9 +223,9 @@
 	MOVL	BX, nsec+8(FP)
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB),NOSPLIT,$32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB),NOSPLIT,$32
 	LEAL	12(SP), BX
 	MOVL	$CLOCK_MONOTONIC, 4(SP)	// arg 1 - clock_id
 	MOVL	BX, 8(SP)		// arg 2 - tp
@@ -455,3 +480,20 @@
 	JAE	2(PC)
 	NEGL	AX
 	RET
+
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$16-4
+	MOVL	$92, AX // fcntl
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$3, 8(SP) // F_GETFL
+	MOVL	$0, 12(SP)
+	INT	$0x80
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$4, 8(SP) // F_SETFL
+	ORL	$4, AX // O_NONBLOCK
+	MOVL	AX, 12(SP)
+	MOVL	$92, AX // fcntl
+	INT	$0x80
+	RET
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index 5fc47ae..dc9bd12 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -154,18 +154,42 @@
 	MOVL	$SYS_read, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVL	$42, AX
+	SYSCALL
+	JCC	pipeok
+	MOVL	$-1, r+0(FP)
+	MOVL	$-1, w+4(FP)
+	MOVL	AX, errno+8(FP)
+	RET
+pipeok:
+	MOVL	AX, r+0(FP)
+	MOVL	DX, w+4(FP)
+	MOVL	$0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-20
+	LEAQ	r+8(FP), DI
+	MOVL	flags+0(FP), SI
+	MOVL	$453, AX
+	SYSCALL
+	MOVL	AX, errno+16(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-8
 	MOVQ	fd+0(FP), DI		// arg 1 - fd
 	MOVQ	p+8(FP), SI		// arg 2 - buf
 	MOVL	n+16(FP), DX		// arg 3 - nbyte
 	MOVL	$SYS_write, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
@@ -185,11 +209,9 @@
 	SYSCALL
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
-	MOVL	$SYS__lwp_self, AX
-	SYSCALL
-	MOVQ	AX, DI			// arg 1 - target
-	MOVL	sig+0(FP), SI		// arg 2 - signo
+TEXT runtime·lwp_kill(SB),NOSPLIT,$0-16
+	MOVL	tid+0(FP), DI		// arg 1 - target
+	MOVQ	sig+8(FP), SI		// arg 2 - signo
 	MOVL	$SYS__lwp_kill, AX
 	SYSCALL
 	RET
@@ -211,8 +233,8 @@
 	SYSCALL
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVQ	$CLOCK_REALTIME, DI	// arg 1 - clock_id
 	LEAQ	8(SP), SI		// arg 2 - tp
 	MOVL	$SYS___clock_gettime50, AX
@@ -225,7 +247,7 @@
 	MOVL	DX, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$32
+TEXT runtime·nanotime1(SB),NOSPLIT,$32
 	MOVQ	$CLOCK_MONOTONIC, DI	// arg 1 - clock_id
 	LEAQ	8(SP), SI		// arg 2 - tp
 	MOVL	$SYS___clock_gettime50, AX
@@ -429,3 +451,18 @@
 	MOVL	$SYS_fcntl, AX
 	SYSCALL
 	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL    fd+0(FP), DI  // fd
+	MOVQ    $3, SI  // F_GETFL
+	MOVQ    $0, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	MOVL	fd+0(FP), DI // fd
+	MOVQ	$4, SI // F_SETFL
+	MOVQ	$4, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	RET
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index c32259b..678dea5 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -92,16 +92,40 @@
 	MOVW p+4(FP), R1
 	MOVW n+8(FP), R2
 	SWI $SYS_read
-	MOVW.CS	$-1, R0
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	SWI $0xa0002a
+	BCC pipeok
+	MOVW $-1,R2
+	MOVW R2, r+0(FP)
+	MOVW R2, w+4(FP)
+	MOVW R0, errno+8(FP)
+	RET
+pipeok:
+	MOVW $0, R2
+	MOVW R0, r+0(FP)
+	MOVW R1, w+4(FP)
+	MOVW R2, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVW $r+4(FP), R0
+	MOVW flags+0(FP), R1
+	SWI $0xa001c5
+	MOVW R0, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0
 	MOVW	fd+0(FP), R0	// arg 1 - fd
 	MOVW	p+4(FP), R1	// arg 2 - buf
 	MOVW	n+8(FP), R2	// arg 3 - nbyte
 	SWI $SYS_write
-	MOVW.CS	$-1, R0
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
@@ -169,9 +193,9 @@
 	SWI $SYS___nanosleep50
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
-	SWI	$SYS__lwp_self	// the returned R0 is arg 1
-	MOVW	sig+0(FP), R1	// arg 2 - signal
+TEXT runtime·lwp_kill(SB),NOSPLIT,$0-8
+	MOVW	tid+0(FP), R0	// arg 1 - tid
+	MOVW	sig+4(FP), R1	// arg 2 - signal
 	SWI	$SYS__lwp_kill
 	RET
 
@@ -188,8 +212,8 @@
 	SWI $SYS___setitimer50
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVW $0, R0	// CLOCK_REALTIME
 	MOVW $8(R13), R1
 	SWI $SYS___clock_gettime50
@@ -203,9 +227,9 @@
 	MOVW R2, nsec+8(FP)
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB), NOSPLIT, $32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB), NOSPLIT, $32
 	MOVW $3, R0 // CLOCK_MONOTONIC
 	MOVW $8(R13), R1
 	SWI $SYS___clock_gettime50
@@ -276,7 +300,11 @@
 	MOVW	R4, R13
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$12
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+	// Reserve space for callee-save registers and arguments.
+	MOVM.DB.W [R4-R11], (R13)
+	SUB	$16, R13
+
 	// this might be called in external code context,
 	// where g is not set.
 	// first save R0, because runtime·load_g will clobber it
@@ -288,6 +316,11 @@
 	MOVW	R1, 8(R13)
 	MOVW	R2, 12(R13)
 	BL	runtime·sigtrampgo(SB)
+
+	// Restore callee-save registers.
+	ADD	$16, R13
+	MOVM.IA.W (R13), [R4-R11]
+
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$12
@@ -385,6 +418,18 @@
 	SWI $SYS_fcntl
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW fd+0(FP), R0	// fd
+	MOVW $3, R1	// F_GETFL
+	MOVW $0, R2
+	SWI $0xa0005c	// sys_fcntl
+	ORR $0x4, R0, R2	// O_NONBLOCK
+	MOVW fd+0(FP), R0	// fd
+	MOVW $4, R1	// F_SETFL
+	SWI $0xa0005c	// sys_fcntl
+	RET
+
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_netbsd_arm64.s b/src/runtime/sys_netbsd_arm64.s
index 57ded53..e70be0f 100644
--- a/src/runtime/sys_netbsd_arm64.s
+++ b/src/runtime/sys_netbsd_arm64.s
@@ -14,6 +14,9 @@
 #define CLOCK_MONOTONIC		3
 #define FD_CLOEXEC		1
 #define F_SETFD			2
+#define F_GETFL			3
+#define F_SETFL			4
+#define O_NONBLOCK		4
 
 #define SYS_exit			1
 #define SYS_read			3
@@ -43,6 +46,7 @@
 #define SYS___clock_gettime50		427
 #define SYS___nanosleep50		430
 #define SYS___kevent50			435
+#define SYS_pipe2			453
 #define SYS_openat			468
 #define SYS____lwp_park60		478
 
@@ -141,18 +145,45 @@
 	MOVW	n+16(FP), R2		// arg 3 - count
 	SVC	$SYS_read
 	BCC	ok
-	MOVW	$-1, R0
+	NEG	R0, R0
 ok:
 	MOVW	R0, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	$0, R0
+	SVC	$SYS_pipe2
+	BCC	pipeok
+	MOVW	$-1,R1
+	MOVW	R1, r+0(FP)
+	MOVW	R1, w+4(FP)
+	NEG	R0, R0
+	MOVW	R0, errno+8(FP)
+	RET
+pipeok:
+	MOVW	R0, r+0(FP)
+	MOVW	R1, w+4(FP)
+	MOVW	ZR, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	ADD	$8, RSP, R0
+	MOVW	flags+0(FP), R1
+	SVC	$SYS_pipe2
+	BCC	2(PC)
+	NEG	R0, R0
+	MOVW	R0, errno+16(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-8
 	MOVD	fd+0(FP), R0		// arg 1 - fd
 	MOVD	p+8(FP), R1		// arg 2 - buf
 	MOVW	n+16(FP), R2		// arg 3 - nbyte
 	SVC	$SYS_write
 	BCC	ok
-	MOVW	$-1, R0
+	NEG	R0, R0
 ok:
 	MOVW	R0, ret+24(FP)
 	RET
@@ -174,10 +205,9 @@
 	SVC	$SYS___nanosleep50
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
-	SVC	$SYS__lwp_self
-					// arg 1 - target (lwp_self)
-	MOVW	sig+0(FP), R1		// arg 2 - signo
+TEXT runtime·lwp_kill(SB),NOSPLIT,$0-16
+	MOVW	tid+0(FP), R0		// arg 1 - target
+	MOVD	sig+8(FP), R1		// arg 2 - signo
 	SVC	$SYS__lwp_kill
 	RET
 
@@ -195,8 +225,8 @@
 	SVC	$SYS___setitimer50
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVW	$CLOCK_REALTIME, R0	// arg 1 - clock_id
 	MOVD	$8(RSP), R1		// arg 2 - tp
 	SVC	$SYS___clock_gettime50
@@ -209,9 +239,9 @@
 	MOVW	R1, nsec+8(FP)
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB), NOSPLIT, $32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB), NOSPLIT, $32
 	MOVD	$CLOCK_MONOTONIC, R0	// arg 1 - clock_id
 	MOVD	$8(RSP), R1		// arg 2 - tp
 	SVC	$SYS___clock_gettime50
@@ -431,3 +461,16 @@
 	MOVW	$FD_CLOEXEC, R2
 	SVC	$SYS_fcntl
 	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R0		// arg 1 - fd
+	MOVD	$F_GETFL, R1		// arg 2 - cmd
+	MOVD	$0, R2			// arg 3
+	SVC	$SYS_fcntl
+	MOVD	$O_NONBLOCK, R2
+	EOR	R0, R2			// arg 3 - flags
+	MOVW	fd+0(FP), R0		// arg 1 - fd
+	MOVD	$F_SETFL, R1		// arg 2 - cmd
+	SVC	$SYS_fcntl
+	RET
diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s
index 6457e37..24fbfd6 100644
--- a/src/runtime/sys_openbsd_386.s
+++ b/src/runtime/sys_openbsd_386.s
@@ -46,15 +46,35 @@
 	MOVL	$3, AX
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-4
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$8-12
+	MOVL	$263, AX
+	LEAL	r+0(FP), BX
+	MOVL	BX, 4(SP)
+	INT	$0x80
+	MOVL	AX, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$12-16
+	MOVL	$101, AX
+	LEAL	r+4(FP), BX
+	MOVL	BX, 4(SP)
+	MOVL	flags+0(FP), BX
+	MOVL	BX, 8(SP)
+	INT	$0x80
+	MOVL	AX, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-4
 	MOVL	$4, AX			// sys_write
 	INT	$0x80
 	JAE	2(PC)
-	MOVL	$-1, AX
+	NEGL	AX			// caller expects negative errno
 	MOVL	AX, ret+12(FP)
 	RET
 
@@ -77,12 +97,17 @@
 	INT	$0x80
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
+TEXT runtime·getthrid(SB),NOSPLIT,$0-4
 	MOVL	$299, AX		// sys_getthrid
 	INT	$0x80
+	MOVL	AX, ret+0(FP)
+	RET
+
+TEXT runtime·thrkill(SB),NOSPLIT,$16-8
 	MOVL	$0, 0(SP)
+	MOVL	tid+0(FP), AX
 	MOVL	AX, 4(SP)		// arg 1 - tid
-	MOVL	sig+0(FP), AX
+	MOVL	sig+4(FP), AX
 	MOVL	AX, 8(SP)		// arg 2 - signum
 	MOVL	$0, 12(SP)		// arg 3 - tcb
 	MOVL	$119, AX		// sys_thrkill
@@ -145,8 +170,8 @@
 	INT	$0x80
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	LEAL	12(SP), BX
 	MOVL	$0, 4(SP)		// arg 1 - clock_id
 	MOVL	BX, 8(SP)		// arg 2 - tp
@@ -162,9 +187,9 @@
 	MOVL	BX, nsec+8(FP)
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB),NOSPLIT,$32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB),NOSPLIT,$32
 	LEAL	12(SP), BX
 	MOVL	CLOCK_MONOTONIC, 4(SP)	// arg 1 - clock_id
 	MOVL	BX, 8(SP)		// arg 2 - tp
@@ -416,4 +441,21 @@
 	NEGL	AX
 	RET
 
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$16-4
+	MOVL	$92, AX // fcntl
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$3, 8(SP) // F_GETFL
+	MOVL	$0, 12(SP)
+	INT	$0x80
+	MOVL	fd+0(FP), BX // fd
+	MOVL	BX, 4(SP)
+	MOVL	$4, 8(SP) // F_SETFL
+	ORL	$4, AX // O_NONBLOCK
+	MOVL	AX, 12(SP)
+	MOVL	$92, AX // fcntl
+	INT	$0x80
+	RET
+
 GLOBL runtime·tlsoffset(SB),NOPTR,$4
diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s
index d5c030d..37d70ab 100644
--- a/src/runtime/sys_openbsd_amd64.s
+++ b/src/runtime/sys_openbsd_amd64.s
@@ -123,18 +123,35 @@
 	MOVL	$3, AX
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	LEAQ	r+0(FP), DI
+	MOVL	$263, AX
+	SYSCALL
+	MOVL	AX, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-20
+	LEAQ	r+8(FP), DI
+	MOVL	flags+0(FP), SI
+	MOVL	$101, AX
+	SYSCALL
+	MOVL	AX, errno+16(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT,$-8
 	MOVQ	fd+0(FP), DI		// arg 1 - fd
 	MOVQ	p+8(FP), SI		// arg 2 - buf
 	MOVL	n+16(FP), DX		// arg 3 - nbyte
 	MOVL	$4, AX			// sys_write
 	SYSCALL
 	JCC	2(PC)
-	MOVL	$-1, AX
+	NEGQ	AX			// caller expects negative errno
 	MOVL	AX, ret+24(FP)
 	RET
 
@@ -154,11 +171,15 @@
 	SYSCALL
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$16
+TEXT runtime·getthrid(SB),NOSPLIT,$0-4
 	MOVL	$299, AX		// sys_getthrid
 	SYSCALL
-	MOVQ	AX, DI			// arg 1 - tid
-	MOVL	sig+0(FP), SI		// arg 2 - signum
+	MOVL	AX, ret+0(FP)
+	RET
+
+TEXT runtime·thrkill(SB),NOSPLIT,$0-16
+	MOVL	tid+0(FP), DI		// arg 1 - tid
+	MOVQ	sig+8(FP), SI		// arg 2 - signum
 	MOVQ	$0, DX			// arg 3 - tcb
 	MOVL	$119, AX		// sys_thrkill
 	SYSCALL
@@ -181,8 +202,8 @@
 	SYSCALL
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVQ	$0, DI			// arg 1 - clock_id
 	LEAQ	8(SP), SI		// arg 2 - tp
 	MOVL	$87, AX			// sys_clock_gettime
@@ -195,7 +216,7 @@
 	MOVL	DX, nsec+8(FP)
 	RET
 
-TEXT runtime·nanotime(SB),NOSPLIT,$24
+TEXT runtime·nanotime1(SB),NOSPLIT,$24
 	MOVQ	CLOCK_MONOTONIC, DI	// arg 1 - clock_id
 	LEAQ	8(SP), SI		// arg 2 - tp
 	MOVL	$87, AX			// sys_clock_gettime
@@ -378,3 +399,18 @@
 	MOVL	$92, AX		// fcntl
 	SYSCALL
 	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVL    fd+0(FP), DI  // fd
+	MOVQ    $3, SI  // F_GETFL
+	MOVQ    $0, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	MOVL	fd+0(FP), DI // fd
+	MOVQ	$4, SI // F_SETFL
+	MOVQ	$4, DX // O_NONBLOCK
+	ORL	AX, DX
+	MOVL	$92, AX // fcntl
+	SYSCALL
+	RET
diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s
index 69c3ded..9e18ce0 100644
--- a/src/runtime/sys_openbsd_arm.s
+++ b/src/runtime/sys_openbsd_arm.s
@@ -13,11 +13,23 @@
 #define CLOCK_REALTIME	$0
 #define	CLOCK_MONOTONIC	$3
 
+// With OpenBSD 6.7 onwards, an armv7 syscall returns two instructions
+// after the SWI instruction, to allow for a speculative execution
+// barrier to be placed after the SWI without impacting performance.
+// For now use hardware no-ops as this works with both older and newer
+// kernels. After OpenBSD 6.8 is released this should be changed to
+// speculation barriers.
+#define NOOP	MOVW    R0, R0
+#define	INVOKE_SYSCALL	\
+	SWI	$0;	\
+	NOOP;		\
+	NOOP
+
 // Exit the entire program (like C exit)
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0
 	MOVW	code+0(FP), R0	// arg 1 - status
 	MOVW	$1, R12			// sys_exit
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$0, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	RET
@@ -26,7 +38,7 @@
 TEXT runtime·exitThread(SB),NOSPLIT,$0-4
 	MOVW	wait+0(FP), R0		// arg 1 - notdead
 	MOVW	$302, R12		// sys___threxit
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$1, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	JMP	0(PC)
@@ -36,7 +48,7 @@
 	MOVW	mode+4(FP), R1		// arg 2 - mode
 	MOVW	perm+8(FP), R2		// arg 3 - perm
 	MOVW	$5, R12			// sys_open
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$-1, R0
 	MOVW	R0, ret+12(FP)
 	RET
@@ -44,7 +56,7 @@
 TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0
 	MOVW	fd+0(FP), R0		// arg 1 - fd
 	MOVW	$6, R12			// sys_close
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$-1, R0
 	MOVW	R0, ret+4(FP)
 	RET
@@ -54,18 +66,35 @@
 	MOVW	p+4(FP), R1		// arg 2 - buf
 	MOVW	n+8(FP), R2		// arg 3 - nbyte
 	MOVW	$3, R12			// sys_read
-	SWI	$0
-	MOVW.CS	$-1, R0
+	INVOKE_SYSCALL
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT,$0-12
+	MOVW	$r+0(FP), R0
+	MOVW	$263, R12
+	INVOKE_SYSCALL
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT,$0-16
+	MOVW	$r+4(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVW	$101, R12
+	INVOKE_SYSCALL
+	MOVW	R0, errno+12(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0
 	MOVW	fd+0(FP), R0		// arg 1 - fd
 	MOVW	p+4(FP), R1		// arg 2 - buf
 	MOVW	n+8(FP), R2		// arg 3 - nbyte
 	MOVW	$4, R12			// sys_write
-	SWI	$0
-	MOVW.CS	$-1, R0
+	INVOKE_SYSCALL
+	RSB.CS	$0, R0		// caller expects negative errno
 	MOVW	R0, ret+12(FP)
 	RET
 
@@ -82,26 +111,30 @@
 	MOVW	$4(R13), R0		// arg 1 - rqtp
 	MOVW	$0, R1			// arg 2 - rmtp
 	MOVW	$91, R12		// sys_nanosleep
-	SWI	$0
+	INVOKE_SYSCALL
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$12
+TEXT runtime·getthrid(SB),NOSPLIT,$0-4
 	MOVW	$299, R12		// sys_getthrid
-	SWI	$0
-					// arg 1 - tid, already in R0
-	MOVW	sig+0(FP), R1		// arg 2 - signum
+	INVOKE_SYSCALL
+	MOVW	R0, ret+0(FP)
+	RET
+
+TEXT runtime·thrkill(SB),NOSPLIT,$0-8
+	MOVW	tid+0(FP), R0		// arg 1 - tid
+	MOVW	sig+4(FP), R1		// arg 2 - signum
 	MOVW	$0, R2			// arg 3 - tcb
 	MOVW	$119, R12		// sys_thrkill
-	SWI	$0
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·raiseproc(SB),NOSPLIT,$12
-	MOVW	$20, R12
-	SWI	$0			// sys_getpid
+	MOVW	$20, R12		// sys_getpid
+	INVOKE_SYSCALL
 					// arg 1 - pid, already in R0
 	MOVW	sig+0(FP), R1		// arg 2 - signum
 	MOVW	$122, R12		// sys_kill
-	SWI	$0
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$16
@@ -119,7 +152,7 @@
 	MOVW	R7, 16(R13)		// high 32 bits
 	ADD	$4, R13
 	MOVW	$197, R12		// sys_mmap
-	SWI	$0
+	INVOKE_SYSCALL
 	SUB	$4, R13
 	MOVW	$0, R1
 	MOVW.CS	R0, R1			// if error, move to R1
@@ -132,7 +165,7 @@
 	MOVW	addr+0(FP), R0		// arg 1 - addr
 	MOVW	n+4(FP), R1		// arg 2 - len
 	MOVW	$73, R12		// sys_munmap
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$0, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	RET
@@ -142,7 +175,7 @@
 	MOVW	n+4(FP), R1		// arg 2 - len
 	MOVW	flags+8(FP), R2		// arg 2 - flags
 	MOVW	$75, R12		// sys_madvise
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$-1, R0
 	MOVW	R0, ret+12(FP)
 	RET
@@ -152,15 +185,15 @@
 	MOVW	new+4(FP), R1		// arg 2 - new value
 	MOVW	old+8(FP), R2		// arg 3 - old value
 	MOVW	$69, R12		// sys_setitimer
-	SWI	$0
+	INVOKE_SYSCALL
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVW	CLOCK_REALTIME, R0	// arg 1 - clock_id
 	MOVW	$8(R13), R1		// arg 2 - tp
 	MOVW	$87, R12		// sys_clock_gettime
-	SWI	$0
+	INVOKE_SYSCALL
 
 	MOVW	8(R13), R0		// sec - l32
 	MOVW	12(R13), R1		// sec - h32
@@ -172,13 +205,13 @@
 
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB),NOSPLIT,$32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB),NOSPLIT,$32
 	MOVW	CLOCK_MONOTONIC, R0	// arg 1 - clock_id
 	MOVW	$8(R13), R1		// arg 2 - tp
 	MOVW	$87, R12		// sys_clock_gettime
-	SWI	$0
+	INVOKE_SYSCALL
 
 	MOVW	8(R13), R0		// sec - l32
 	MOVW	12(R13), R4		// sec - h32
@@ -199,7 +232,7 @@
 	MOVW	new+4(FP), R1		// arg 2 - new sigaction
 	MOVW	old+8(FP), R2		// arg 3 - old sigaction
 	MOVW	$46, R12		// sys_sigaction
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$3, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	RET
@@ -208,7 +241,7 @@
 	MOVW	how+0(FP), R0		// arg 1 - mode
 	MOVW	new+4(FP), R1		// arg 2 - new
 	MOVW	$48, R12		// sys_sigprocmask
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$3, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	MOVW	R0, ret+8(FP)
@@ -226,7 +259,11 @@
 	MOVW	R4, R13
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$12
+TEXT runtime·sigtramp(SB),NOSPLIT,$0
+	// Reserve space for callee-save registers and arguments.
+	MOVM.DB.W [R4-R11], (R13)
+	SUB	$16, R13
+
 	// If called from an external code context, g will not be set.
 	// Save R0, since runtime·load_g will clobber it.
 	MOVW	R0, 4(R13)		// signum
@@ -237,6 +274,11 @@
 	MOVW	R1, 8(R13)
 	MOVW	R2, 12(R13)
 	BL	runtime·sigtrampgo(SB)
+
+	// Restore callee-save registers.
+	ADD	$16, R13
+	MOVM.IA.W (R13), [R4-R11]
+
 	RET
 
 // int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void));
@@ -250,7 +292,7 @@
 	MOVW	param+0(FP), R0		// arg 1 - param
 	MOVW	psize+4(FP), R1		// arg 2 - psize
 	MOVW	$8, R12			// sys___tfork
-	SWI	$0
+	INVOKE_SYSCALL
 
 	// Return if syscall failed.
 	B.CC	4(PC)
@@ -283,14 +325,14 @@
 	MOVW	new+0(FP), R0		// arg 1 - new sigaltstack
 	MOVW	old+4(FP), R1		// arg 2 - old sigaltstack
 	MOVW	$288, R12		// sys_sigaltstack
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW.CS	$0, R8			// crash on syscall failure
 	MOVW.CS	R8, (R8)
 	RET
 
 TEXT runtime·osyield(SB),NOSPLIT,$0
 	MOVW	$298, R12		// sys_sched_yield
-	SWI	$0
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·thrsleep(SB),NOSPLIT,$4
@@ -302,7 +344,7 @@
 	MOVW	R4, 4(R13)
 	ADD	$4, R13
 	MOVW	$94, R12		// sys___thrsleep
-	SWI	$0
+	INVOKE_SYSCALL
 	SUB	$4, R13
 	MOVW	R0, ret+20(FP)
 	RET
@@ -311,7 +353,7 @@
 	MOVW	ident+0(FP), R0		// arg 1 - ident
 	MOVW	n+4(FP), R1		// arg 2 - n
 	MOVW	$301, R12		// sys___thrwakeup
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVW	R0, ret+8(FP)
 	RET
 
@@ -326,7 +368,7 @@
 	MOVW	R5, 8(R13)
 	ADD	$4, R13
 	MOVW	$202, R12		// sys___sysctl
-	SWI	$0
+	INVOKE_SYSCALL
 	SUB	$4, R13
 	MOVW.CC	$0, R0
 	RSB.CS	$0, R0
@@ -336,7 +378,7 @@
 // int32 runtime·kqueue(void);
 TEXT runtime·kqueue(SB),NOSPLIT,$0
 	MOVW	$269, R12		// sys_kqueue
-	SWI	$0
+	INVOKE_SYSCALL
 	RSB.CS	$0, R0
 	MOVW	R0, ret+0(FP)
 	RET
@@ -353,7 +395,7 @@
 	MOVW	R5, 8(R13)
 	ADD	$4, R13
 	MOVW	$72, R12		// sys_kevent
-	SWI	$0
+	INVOKE_SYSCALL
 	RSB.CS	$0, R0
 	SUB	$4, R13
 	MOVW	R0, ret+24(FP)
@@ -365,7 +407,21 @@
 	MOVW	$2, R1			// arg 2 - cmd (F_SETFD)
 	MOVW	$1, R2			// arg 3 - arg (FD_CLOEXEC)
 	MOVW	$92, R12		// sys_fcntl
-	SWI	$0
+	INVOKE_SYSCALL
+	RET
+
+// func runtime·setNonblock(fd int32)
+TEXT runtime·setNonblock(SB),NOSPLIT,$0-4
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$3, R1	// F_GETFL
+	MOVW	$0, R2
+	MOVW	$92, R12
+	INVOKE_SYSCALL
+	ORR	$0x4, R0, R2	// O_NONBLOCK
+	MOVW	fd+0(FP), R0	// fd
+	MOVW	$4, R1	// F_SETFL
+	MOVW	$92, R12
+	INVOKE_SYSCALL
 	RET
 
 TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
@@ -374,6 +430,6 @@
 TEXT runtime·read_tls_fallback(SB),NOSPLIT|NOFRAME,$0
 	MOVM.WP	[R1, R2, R3, R12], (R13)
 	MOVW	$330, R12		// sys___get_tcb
-	SWI	$0
+	INVOKE_SYSCALL
 	MOVM.IAW (R13), [R1, R2, R3, R12]
 	RET
diff --git a/src/runtime/sys_openbsd_arm64.s b/src/runtime/sys_openbsd_arm64.s
index 52bed4b..621b1b1 100644
--- a/src/runtime/sys_openbsd_arm64.s
+++ b/src/runtime/sys_openbsd_arm64.s
@@ -13,11 +13,22 @@
 #define CLOCK_REALTIME	$0
 #define	CLOCK_MONOTONIC	$3
 
+// From OpenBSD 6.7 onwards, an arm64 syscall returns two instructions
+// after the SVC instruction, to allow for a speculative execution
+// barrier to be placed after the SVC without impacting performance.
+// For now use hardware no-ops as this works with both older and newer
+// kernels. After OpenBSD 6.8 is released this should be changed to
+// speculation barriers.
+#define	INVOKE_SYSCALL	\
+	SVC;		\
+	NOOP;		\
+	NOOP
+
 // Exit the entire program (like C exit)
 TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0
 	MOVW	code+0(FP), R0		// arg 1 - status
 	MOVD	$1, R8			// sys_exit
-	SVC
+	INVOKE_SYSCALL
 	BCC	3(PC)
 	MOVD	$0, R0			// crash on syscall failure
 	MOVD	R0, (R0)
@@ -27,7 +38,7 @@
 TEXT runtime·exitThread(SB),NOSPLIT,$0
 	MOVD	wait+0(FP), R0		// arg 1 - notdead
 	MOVD	$302, R8		// sys___threxit
-	SVC
+	INVOKE_SYSCALL
 	MOVD	$0, R0			// crash on syscall failure
 	MOVD	R0, (R0)
 	JMP	0(PC)
@@ -37,7 +48,7 @@
 	MOVW	mode+8(FP), R1		// arg 2 - mode
 	MOVW	perm+12(FP), R2		// arg 3 - perm
 	MOVD	$5, R8			// sys_open
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	MOVW	$-1, R0
 	MOVW	R0, ret+16(FP)
@@ -46,7 +57,7 @@
 TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0
 	MOVW	fd+0(FP), R0		// arg 1 - fd
 	MOVD	$6, R8			// sys_close
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	MOVW	$-1, R0
 	MOVW	R0, ret+8(FP)
@@ -57,20 +68,42 @@
 	MOVD	p+8(FP), R1		// arg 2 - buf
 	MOVW	n+16(FP), R2		// arg 3 - nbyte
 	MOVD	$3, R8			// sys_read
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
-	MOVW	$-1, R0
+	NEG	R0, R0
 	MOVW	R0, ret+24(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0
-	MOVW	fd+0(FP), R0		// arg 1 - fd
+// func pipe() (r, w int32, errno int32)
+TEXT runtime·pipe(SB),NOSPLIT|NOFRAME,$0-12
+	MOVD	$r+0(FP), R0
+	MOVW	$0, R1
+	MOVD	$101, R8		// sys_pipe2
+	INVOKE_SYSCALL
+	BCC	2(PC)
+	NEG	R0, R0
+	MOVW	R0, errno+8(FP)
+	RET
+
+// func pipe2(flags int32) (r, w int32, errno int32)
+TEXT runtime·pipe2(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	$r+8(FP), R0
+	MOVW	flags+0(FP), R1
+	MOVD	$101, R8		// sys_pipe2
+	INVOKE_SYSCALL
+	BCC	2(PC)
+	NEG	R0, R0
+	MOVW	R0, errno+16(FP)
+	RET
+
+TEXT runtime·write1(SB),NOSPLIT|NOFRAME,$0
+	MOVD	fd+0(FP), R0		// arg 1 - fd
 	MOVD	p+8(FP), R1		// arg 2 - buf
 	MOVW	n+16(FP), R2		// arg 3 - nbyte
 	MOVD	$4, R8			// sys_write
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
-	MOVW	$-1, R0
+	NEG	R0, R0
 	MOVW	R0, ret+24(FP)
 	RET
 
@@ -89,26 +122,30 @@
 	ADD	$8, RSP, R0		// arg 1 - rqtp
 	MOVD	$0, R1			// arg 2 - rmtp
 	MOVD	$91, R8			// sys_nanosleep
-	SVC
+	INVOKE_SYSCALL
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$0
+TEXT runtime·getthrid(SB),NOSPLIT,$0-4
 	MOVD	$299, R8		// sys_getthrid
-	SVC
-					// arg 1 - tid, already in R0
-	MOVW	sig+0(FP), R1		// arg 2 - signum
+	INVOKE_SYSCALL
+	MOVW	R0, ret+0(FP)
+	RET
+
+TEXT runtime·thrkill(SB),NOSPLIT,$0-16
+	MOVW	tid+0(FP), R0		// arg 1 - tid
+	MOVD	sig+8(FP), R1		// arg 2 - signum
 	MOVW	$0, R2			// arg 3 - tcb
 	MOVD	$119, R8		// sys_thrkill
-	SVC
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·raiseproc(SB),NOSPLIT,$0
 	MOVD	$20, R8			// sys_getpid
-	SVC
+	INVOKE_SYSCALL
 					// arg 1 - pid, already in R0
 	MOVW	sig+0(FP), R1		// arg 2 - signum
 	MOVD	$122, R8		// sys_kill
-	SVC
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
@@ -120,7 +157,7 @@
 	MOVW	$0, R5			// arg 6 - pad
 	MOVW	off+28(FP), R6		// arg 7 - offset
 	MOVD	$197, R8		// sys_mmap
-	SVC
+	INVOKE_SYSCALL
 	MOVD	$0, R1
 	BCC	3(PC)
 	MOVD	R0, R1			// if error, move to R1
@@ -133,7 +170,7 @@
 	MOVD	addr+0(FP), R0		// arg 1 - addr
 	MOVD	n+8(FP), R1		// arg 2 - len
 	MOVD	$73, R8			// sys_munmap
-	SVC
+	INVOKE_SYSCALL
 	BCC	3(PC)
 	MOVD	$0, R0			// crash on syscall failure
 	MOVD	R0, (R0)
@@ -144,7 +181,7 @@
 	MOVD	n+8(FP), R1		// arg 2 - len
 	MOVW	flags+16(FP), R2	// arg 2 - flags
 	MOVD	$75, R8			// sys_madvise
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	MOVW	$-1, R0
 	MOVW	R0, ret+24(FP)
@@ -155,15 +192,15 @@
 	MOVD	new+8(FP), R1		// arg 2 - new value
 	MOVD	old+16(FP), R2		// arg 3 - old value
 	MOVD	$69, R8			// sys_setitimer
-	SVC
+	INVOKE_SYSCALL
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB), NOSPLIT, $32
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB), NOSPLIT, $32
 	MOVW	CLOCK_REALTIME, R0	// arg 1 - clock_id
 	MOVD	$8(RSP), R1		// arg 2 - tp
 	MOVD	$87, R8			// sys_clock_gettime
-	SVC
+	INVOKE_SYSCALL
 
 	MOVD	8(RSP), R0		// sec
 	MOVD	16(RSP), R1		// nsec
@@ -172,13 +209,13 @@
 
 	RET
 
-// int64 nanotime(void) so really
-// void nanotime(int64 *nsec)
-TEXT runtime·nanotime(SB),NOSPLIT,$32
+// int64 nanotime1(void) so really
+// void nanotime1(int64 *nsec)
+TEXT runtime·nanotime1(SB),NOSPLIT,$32
 	MOVW	CLOCK_MONOTONIC, R0	// arg 1 - clock_id
 	MOVD	$8(RSP), R1		// arg 2 - tp
 	MOVD	$87, R8			// sys_clock_gettime
-	SVC
+	INVOKE_SYSCALL
 
 	MOVW	8(RSP), R3		// sec
 	MOVW	16(RSP), R5		// nsec
@@ -194,7 +231,7 @@
 	MOVD	new+8(FP), R1		// arg 2 - new sigaction
 	MOVD	old+16(FP), R2		// arg 3 - old sigaction
 	MOVD	$46, R8			// sys_sigaction
-	SVC
+	INVOKE_SYSCALL
 	BCC	3(PC)
 	MOVD	$3, R0			// crash on syscall failure
 	MOVD	R0, (R0)
@@ -204,7 +241,7 @@
 	MOVW	how+0(FP), R0		// arg 1 - mode
 	MOVW	new+4(FP), R1		// arg 2 - new
 	MOVD	$48, R8			// sys_sigprocmask
-	SVC
+	INVOKE_SYSCALL
 	BCC	3(PC)
 	MOVD	$3, R8			// crash on syscall failure
 	MOVD	R8, (R8)
@@ -288,7 +325,7 @@
 	MOVD	param+0(FP), R0		// arg 1 - param
 	MOVD	psize+8(FP), R1		// arg 2 - psize
 	MOVD	$8, R8			// sys___tfork
-	SVC
+	INVOKE_SYSCALL
 
 	// Return if syscall failed.
 	BCC	4(PC)
@@ -318,7 +355,7 @@
 	MOVD	new+0(FP), R0		// arg 1 - new sigaltstack
 	MOVD	old+8(FP), R1		// arg 2 - old sigaltstack
 	MOVD	$288, R8		// sys_sigaltstack
-	SVC
+	INVOKE_SYSCALL
 	BCC	3(PC)
 	MOVD	$0, R8			// crash on syscall failure
 	MOVD	R8, (R8)
@@ -326,7 +363,7 @@
 
 TEXT runtime·osyield(SB),NOSPLIT,$0
 	MOVD	$298, R8		// sys_sched_yield
-	SVC
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·thrsleep(SB),NOSPLIT,$0
@@ -336,7 +373,7 @@
 	MOVD	lock+24(FP), R3		// arg 4 - lock
 	MOVD	abort+32(FP), R4	// arg 5 - abort
 	MOVD	$94, R8			// sys___thrsleep
-	SVC
+	INVOKE_SYSCALL
 	MOVW	R0, ret+40(FP)
 	RET
 
@@ -344,7 +381,7 @@
 	MOVD	ident+0(FP), R0		// arg 1 - ident
 	MOVW	n+8(FP), R1		// arg 2 - n
 	MOVD	$301, R8		// sys___thrwakeup
-	SVC
+	INVOKE_SYSCALL
 	MOVW	R0, ret+16(FP)
 	RET
 
@@ -356,7 +393,7 @@
 	MOVD	dst+32(FP), R4		// arg 5 - dest
 	MOVD	ndst+40(FP), R5		// arg 6 - newlen
 	MOVD	$202, R8		// sys___sysctl
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	NEG	R0, R0
 	MOVW	R0, ret+48(FP)
@@ -365,7 +402,7 @@
 // int32 runtime·kqueue(void);
 TEXT runtime·kqueue(SB),NOSPLIT,$0
 	MOVD	$269, R8		// sys_kqueue
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	NEG	R0, R0
 	MOVW	R0, ret+0(FP)
@@ -380,7 +417,7 @@
 	MOVW	nev+32(FP), R4		// arg 5 - nevents
 	MOVD	ts+40(FP), R5		// arg 6 - timeout
 	MOVD	$72, R8			// sys_kevent
-	SVC
+	INVOKE_SYSCALL
 	BCC	2(PC)
 	NEG	R0, R0
 	MOVW	R0, ret+48(FP)
@@ -392,5 +429,20 @@
 	MOVD	$2, R1			// arg 2 - cmd (F_SETFD)
 	MOVD	$1, R2			// arg 3 - arg (FD_CLOEXEC)
 	MOVD	$92, R8			// sys_fcntl
-	SVC
+	INVOKE_SYSCALL
+	RET
+
+// func runtime·setNonblock(int32 fd)
+TEXT runtime·setNonblock(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	fd+0(FP), R0		// arg 1 - fd
+	MOVD	$3, R1			// arg 2 - cmd (F_GETFL)
+	MOVD	$0, R2			// arg 3
+	MOVD	$92, R8			// sys_fcntl
+	INVOKE_SYSCALL
+	MOVD	$4, R2			// O_NONBLOCK
+	ORR	R0, R2			// arg 3 - flags
+	MOVW	fd+0(FP), R0		// arg 1 - fd
+	MOVD	$4, R1			// arg 2 - cmd (F_SETFL)
+	MOVD	$92, R8			// sys_fcntl
+	INVOKE_SYSCALL
 	RET
diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s
index a7fb9fe..f9969f6 100644
--- a/src/runtime/sys_plan9_386.s
+++ b/src/runtime/sys_plan9_386.s
@@ -102,9 +102,9 @@
 	MOVL	$-1, ret_hi+8(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$8-12
-	CALL	runtime·nanotime(SB)
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+	CALL	runtime·nanotime1(SB)
 	MOVL	0(SP), AX
 	MOVL	4(SP), DX
 
diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s
index a73c33f..383622b 100644
--- a/src/runtime/sys_plan9_amd64.s
+++ b/src/runtime/sys_plan9_amd64.s
@@ -88,9 +88,9 @@
 	MOVQ	AX, ret+8(FP)
 	RET
 
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$8-12
-	CALL	runtime·nanotime(SB)
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$8-12
+	CALL	runtime·nanotime1(SB)
 	MOVQ	0(SP), AX
 
 	// generated code for
diff --git a/src/runtime/sys_plan9_arm.s b/src/runtime/sys_plan9_arm.s
index b82e6c6..9fbe305 100644
--- a/src/runtime/sys_plan9_arm.s
+++ b/src/runtime/sys_plan9_arm.s
@@ -138,8 +138,8 @@
 	MOVW	R0, ret_hi+8(FP)
 	RET
 
-// time.now() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$12-12
+// func walltime1() (sec int64, nsec int32)
+TEXT runtime·walltime1(SB),NOSPLIT,$12-12
 	// use nsec system call to get current time in nanoseconds
 	MOVW	$sysnsec_lo-8(SP), R0	// destination addr
 	MOVW	R0,res-12(SP)
diff --git a/src/runtime/sys_riscv64.go b/src/runtime/sys_riscv64.go
new file mode 100644
index 0000000..e710840
--- /dev/null
+++ b/src/runtime/sys_riscv64.go
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// adjust Gobuf as if it executed a call to fn with context ctxt
+// and then did an immediate Gosave.
+func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
+	if buf.lr != 0 {
+		throw("invalid use of gostartcall")
+	}
+	buf.lr = buf.pc
+	buf.pc = uintptr(fn)
+	buf.ctxt = ctxt
+}
diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s
index ead8c8d..05fd187 100644
--- a/src/runtime/sys_solaris_amd64.s
+++ b/src/runtime/sys_solaris_amd64.s
@@ -29,26 +29,6 @@
 	MOVQ	AX,	(m_mOS+mOS_perrno)(BX)
 	RET
 
-// int64 runtime·nanotime1(void);
-//
-// clock_gettime(3c) wrapper because Timespec is too large for
-// runtime·nanotime stack.
-//
-// Called using runtime·sysvicall6 from os_solaris.c:/nanotime.
-// NOT USING GO CALLING CONVENTION.
-TEXT runtime·nanotime1(SB),NOSPLIT,$0
-	// need space for the timespec argument.
-	SUBQ	$64, SP	// 16 bytes will do, but who knows in the future?
-	MOVQ	$3, DI	// CLOCK_REALTIME from <sys/time_impl.h>
-	MOVQ	SP, SI
-	LEAQ	libc_clock_gettime(SB), AX
-	CALL	AX
-	MOVQ	(SP), AX	// tv_sec from struct timespec
-	IMULQ	$1000000000, AX	// multiply into nanoseconds
-	ADDQ	8(SP), AX	// tv_nsec, offset should be stable.
-	ADDQ	$64, SP
-	RET
-
 // pipe(3c) wrapper that returns fds in AX, DX.
 // NOT USING GO CALLING CONVENTION.
 TEXT runtime·pipe1(SB),NOSPLIT,$0
@@ -338,23 +318,3 @@
 	LEAQ	libc_sched_yield(SB), AX
 	CALL	AX
 	RET
-
-// func walltime() (sec int64, nsec int32)
-TEXT runtime·walltime(SB),NOSPLIT,$8-12
-	CALL	runtime·nanotime(SB)
-	MOVQ	0(SP), AX
-
-	// generated code for
-	//	func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
-	// adapted to reduce duplication
-	MOVQ	AX, CX
-	MOVQ	$1360296554856532783, AX
-	MULQ	CX
-	ADDQ	CX, DX
-	RCRQ	$1, DX
-	SHRQ	$29, DX
-	MOVQ	DX, sec+0(FP)
-	IMULQ	$1000000000, DX
-	SUBQ	DX, CX
-	MOVL	CX, nsec+8(FP)
-	RET
diff --git a/src/runtime/sys_wasm.s b/src/runtime/sys_wasm.s
index d7bab92..e7a6570 100644
--- a/src/runtime/sys_wasm.s
+++ b/src/runtime/sys_wasm.s
@@ -17,10 +17,9 @@
 		Get R2
 		I32Const $1
 		I32Sub
-		Set R2
+		Tee R2
 
 		// n == 0
-		Get R2
 		I32Eqz
 		If
 			Return
@@ -54,10 +53,9 @@
 		Get R1
 		I32Const $1
 		I32Sub
-		Set R1
+		Tee R1
 
 		// n == 0
-		Get R1
 		I32Eqz
 		If
 			Return
@@ -101,7 +99,7 @@
 	End
 
 	Get R0
-	F64Const $9223372036854775807.
+	F64Const $0x7ffffffffffffc00p0 // Maximum truncated representation of 0x7fffffffffffffff
 	F64Gt
 	If
 		I64Const $0x8000000000000000
@@ -109,7 +107,7 @@
 	End
 
 	Get R0
-	F64Const $-9223372036854775808.
+	F64Const $-0x7ffffffffffffc00p0 // Minimum truncated representation of -0x8000000000000000
 	F64Lt
 	If
 		I64Const $0x8000000000000000
@@ -130,7 +128,7 @@
 	End
 
 	Get R0
-	F64Const $18446744073709551615.
+	F64Const $0xfffffffffffff800p0 // Maximum truncated representation of 0xffffffffffffffff
 	F64Gt
 	If
 		I64Const $0x8000000000000000
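Background on the three F64Const changes above: float64 carries a 53-bit significand, so adjacent representable values near 2^63 are 2^10 apart and near 2^64 they are 2^11 apart. The old decimal literals rounded up to exactly 2^63 (respectively 2^64), which is already out of range for the target integer type; the new hex constants are the largest float64 values that still truncate safely, i.e. 2^63 - 2^10 = 0x7ffffffffffffc00 for int64 and 2^64 - 2^11 = 0xfffffffffffff800 for uint64. A quick Go check of the int64 bound (a standalone sketch, not part of the patch):

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		// Step down one ULP from 2^63 to get the largest float64 below it.
		maxTrunc := math.Nextafter(float64(1<<63), 0)
		fmt.Printf("%#x\n", uint64(maxTrunc))        // 0x7ffffffffffffc00
		fmt.Println(uint64(maxTrunc) == 1<<63-1<<10) // true
	}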
@@ -171,6 +169,10 @@
 	I32Store ret+8(FP)
 	RET
 
+TEXT ·resetMemoryDataView(SB), NOSPLIT, $0
+	CallImport
+	RET
+
 TEXT ·wasmExit(SB), NOSPLIT, $0
 	CallImport
 	RET
@@ -179,11 +181,11 @@
 	CallImport
 	RET
 
-TEXT ·nanotime(SB), NOSPLIT, $0
+TEXT ·nanotime1(SB), NOSPLIT, $0
 	CallImport
 	RET
 
-TEXT ·walltime(SB), NOSPLIT, $0
+TEXT ·walltime1(SB), NOSPLIT, $0
 	CallImport
 	RET
 
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index 761da8e..9e1f409 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -444,7 +444,7 @@
 #define time_hi1 4
 #define time_hi2 8
 
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
 	CMPB	runtime·useQPCTime(SB), $0
 	JNE	useQPC
 loop:
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index 2aea8ea..6c8eecd 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -62,6 +62,10 @@
 	// Return result.
 	POPQ	CX
 	MOVQ	AX, libcall_r1(CX)
+	// Floating point return values are returned in XMM0. Set r2 to this
+	// value in case this call returned a floating point value. For details,
+	// see https://docs.microsoft.com/en-us/cpp/build/x64-calling-convention
+	MOVQ    X0, libcall_r2(CX)
 
 	// GetLastError().
 	MOVQ	0x30(GS), DI
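The libcall_r2 change above is what the new TestFloatReturn test later in this patch exercises: after a call through syscall.(*Proc).Call, the second return value now carries the raw XMM0 bits, so a C float or double result can be decoded with math.Float32frombits or math.Float64frombits. A hedged usage sketch (the DLL name and export are invented for illustration):

	// +build windows

	package main

	import (
		"fmt"
		"math"
		"syscall"
	)

	func main() {
		dll := syscall.MustLoadDLL("mymath.dll") // hypothetical DLL
		defer dll.Release()
		proc := dll.MustFindProc("half") // double half(double x)

		_, r2, _ := proc.Call(uintptr(math.Float64bits(3.0)))
		fmt.Println(math.Float64frombits(uint64(r2))) // expected: 1.5
	}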
@@ -473,7 +477,7 @@
 #define time_hi1 4
 #define time_hi2 8
 
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
 	CMPB	runtime·useQPCTime(SB), $0
 	JNE	useQPC
 	MOVQ	$_INTERRUPT_TIME, DI
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index 8f8af0a..256b5ff 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -495,7 +495,7 @@
 #define time_hi1 4
 #define time_hi2 8
 
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+TEXT runtime·nanotime1(SB),NOSPLIT,$0-8
 	MOVW	$0, R0
 	MOVB	runtime·useQPCTime(SB), R0
 	CMP	$0, R0
diff --git a/src/runtime/sys_x86.go b/src/runtime/sys_x86.go
index 2b4ed8b..f917cb8 100644
--- a/src/runtime/sys_x86.go
+++ b/src/runtime/sys_x86.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build amd64 amd64p32 386
+// +build amd64 386
 
 package runtime
 
diff --git a/src/runtime/syscall_nacl.h b/src/runtime/syscall_nacl.h
deleted file mode 100644
index 5ee75ab..0000000
--- a/src/runtime/syscall_nacl.h
+++ /dev/null
@@ -1,84 +0,0 @@
-// Code generated by mknacl.sh; DO NOT EDIT.
-#define SYS_null 1
-#define SYS_nameservice 2
-#define SYS_dup 8
-#define SYS_dup2 9
-#define SYS_open 10
-#define SYS_close 11
-#define SYS_read 12
-#define SYS_write 13
-#define SYS_lseek 14
-#define SYS_stat 16
-#define SYS_fstat 17
-#define SYS_chmod 18
-#define SYS_isatty 19
-#define SYS_brk 20
-#define SYS_mmap 21
-#define SYS_munmap 22
-#define SYS_getdents 23
-#define SYS_mprotect 24
-#define SYS_list_mappings 25
-#define SYS_exit 30
-#define SYS_getpid 31
-#define SYS_sched_yield 32
-#define SYS_sysconf 33
-#define SYS_gettimeofday 40
-#define SYS_clock 41
-#define SYS_nanosleep 42
-#define SYS_clock_getres 43
-#define SYS_clock_gettime 44
-#define SYS_mkdir 45
-#define SYS_rmdir 46
-#define SYS_chdir 47
-#define SYS_getcwd 48
-#define SYS_unlink 49
-#define SYS_imc_makeboundsock 60
-#define SYS_imc_accept 61
-#define SYS_imc_connect 62
-#define SYS_imc_sendmsg 63
-#define SYS_imc_recvmsg 64
-#define SYS_imc_mem_obj_create 65
-#define SYS_imc_socketpair 66
-#define SYS_mutex_create 70
-#define SYS_mutex_lock 71
-#define SYS_mutex_trylock 72
-#define SYS_mutex_unlock 73
-#define SYS_cond_create 74
-#define SYS_cond_wait 75
-#define SYS_cond_signal 76
-#define SYS_cond_broadcast 77
-#define SYS_cond_timed_wait_abs 79
-#define SYS_thread_create 80
-#define SYS_thread_exit 81
-#define SYS_tls_init 82
-#define SYS_thread_nice 83
-#define SYS_tls_get 84
-#define SYS_second_tls_set 85
-#define SYS_second_tls_get 86
-#define SYS_exception_handler 87
-#define SYS_exception_stack 88
-#define SYS_exception_clear_flag 89
-#define SYS_sem_create 100
-#define SYS_sem_wait 101
-#define SYS_sem_post 102
-#define SYS_sem_get_value 103
-#define SYS_dyncode_create 104
-#define SYS_dyncode_modify 105
-#define SYS_dyncode_delete 106
-#define SYS_test_infoleak 109
-#define SYS_test_crash 110
-#define SYS_test_syscall_1 111
-#define SYS_test_syscall_2 112
-#define SYS_futex_wait_abs 120
-#define SYS_futex_wake 121
-#define SYS_pread 130
-#define SYS_pwrite 131
-#define SYS_truncate 140
-#define SYS_lstat 141
-#define SYS_link 142
-#define SYS_rename 143
-#define SYS_symlink 144
-#define SYS_access 145
-#define SYS_readlink 146
-#define SYS_utimes 147
-#define SYS_get_random_bytes 150
diff --git a/src/runtime/syscall_solaris.go b/src/runtime/syscall_solaris.go
index 3538180..0945169 100644
--- a/src/runtime/syscall_solaris.go
+++ b/src/runtime/syscall_solaris.go
@@ -16,7 +16,6 @@
 	libc_gethostname,
 	libc_getpid,
 	libc_ioctl,
-	libc_pipe,
 	libc_setgid,
 	libc_setgroups,
 	libc_setsid,
@@ -143,6 +142,9 @@
 		args: uintptr(unsafe.Pointer(&flags)),
 	}
 	asmcgocall(unsafe.Pointer(&asmsysvicall6x), unsafe.Pointer(&call))
+	if int(call.r1) != -1 {
+		call.err = 0
+	}
 	return call.r1, call.err
 }
 
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 722a73d..0e2fcfb 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -74,16 +74,18 @@
 		argsize += uintptrSize
 	}
 
-	lock(&cbs.lock)
-	defer unlock(&cbs.lock)
+	lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack.
 
 	n := cbs.n
 	for i := 0; i < n; i++ {
 		if cbs.ctxt[i].gobody == fn.data && cbs.ctxt[i].isCleanstack() == cleanstack {
-			return callbackasmAddr(i)
+			r := callbackasmAddr(i)
+			unlock(&cbs.lock)
+			return r
 		}
 	}
 	if n >= cb_max {
+		unlock(&cbs.lock)
 		throw("too many callback functions")
 	}
 
@@ -99,7 +101,9 @@
 	cbs.ctxt[n] = c
 	cbs.n++
 
-	return callbackasmAddr(n)
+	r := callbackasmAddr(n)
+	unlock(&cbs.lock)
+	return r
 }
 
 const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
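The compileCallback change above drops the deferred unlock because, as the new comment notes, this code is used from the system stack; every return path therefore unlocks explicitly instead. The resulting shape, restated with ordinary sync primitives and invented names (a sketch, not the runtime's code):

	package cbcache

	import "sync"

	var (
		mu     sync.Mutex
		cached []uintptr
	)

	// lookupOrAdd returns fn's slot, registering it on first use. With no
	// defer available, each early return unlocks before leaving.
	func lookupOrAdd(fn uintptr) uintptr {
		mu.Lock()
		for _, c := range cached {
			if c == fn {
				mu.Unlock()
				return c
			}
		}
		cached = append(cached, fn)
		mu.Unlock()
		return fn
	}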
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 5335c12..2e74546 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -725,6 +725,82 @@
 	}
 }
 
+func TestFloatReturn(t *testing.T) {
+	if _, err := exec.LookPath("gcc"); err != nil {
+		t.Skip("skipping test: gcc is missing")
+	}
+	if runtime.GOARCH != "amd64" {
+		t.Skipf("skipping test: GOARCH=%s", runtime.GOARCH)
+	}
+
+	const src = `
+#include <stdint.h>
+#include <windows.h>
+
+float cfuncFloat(uintptr_t a, double b, float c, double d) {
+	if (a == 1 && b == 2.2 && c == 3.3f && d == 4.4e44) {
+		return 1.5f;
+	}
+	return 0;
+}
+
+double cfuncDouble(uintptr_t a, double b, float c, double d) {
+	if (a == 1 && b == 2.2 && c == 3.3f && d == 4.4e44) {
+		return 2.5;
+	}
+	return 0;
+}
+`
+	tmpdir, err := ioutil.TempDir("", "TestFloatReturn")
+	if err != nil {
+		t.Fatal("TempDir failed: ", err)
+	}
+	defer os.RemoveAll(tmpdir)
+
+	srcname := "mydll.c"
+	err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	outname := "mydll.dll"
+	cmd := exec.Command("gcc", "-shared", "-s", "-Werror", "-o", outname, srcname)
+	cmd.Dir = tmpdir
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build dll: %v - %v", err, string(out))
+	}
+	dllpath := filepath.Join(tmpdir, outname)
+
+	dll := syscall.MustLoadDLL(dllpath)
+	defer dll.Release()
+
+	proc := dll.MustFindProc("cfuncFloat")
+
+	_, r, err := proc.Call(
+		1,
+		uintptr(math.Float64bits(2.2)),
+		uintptr(math.Float32bits(3.3)),
+		uintptr(math.Float64bits(4.4e44)),
+	)
+	fr := math.Float32frombits(uint32(r))
+	if fr != 1.5 {
+		t.Errorf("got %f want 1.5 (err=%v)", fr, err)
+	}
+
+	proc = dll.MustFindProc("cfuncDouble")
+
+	_, r, err = proc.Call(
+		1,
+		uintptr(math.Float64bits(2.2)),
+		uintptr(math.Float32bits(3.3)),
+		uintptr(math.Float64bits(4.4e44)),
+	)
+	dr := math.Float64frombits(uint64(r))
+	if dr != 2.5 {
+		t.Errorf("got %f want 2.5 (err=%v)", dr, err)
+	}
+}
+
 func TestTimeBeginPeriod(t *testing.T) {
 	const TIMERR_NOERROR = 0
 	if *runtime.TimeBeginPeriodRetValue != TIMERR_NOERROR {
diff --git a/src/runtime/testdata/testfaketime/faketime.go b/src/runtime/testdata/testfaketime/faketime.go
new file mode 100644
index 0000000..1fb15eb
--- /dev/null
+++ b/src/runtime/testdata/testfaketime/faketime.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Test faketime support. This is its own test program because we have
+// to build it with custom build tags and hence want to minimize
+// dependencies.
+
+package main
+
+import (
+	"os"
+	"time"
+)
+
+func main() {
+	println("line 1")
+	// Stream switch, increments time
+	os.Stdout.WriteString("line 2\n")
+	os.Stdout.WriteString("line 3\n")
+	// Stream switch, increments time
+	os.Stderr.WriteString("line 4\n")
+	// Time jump
+	time.Sleep(1 * time.Second)
+	os.Stdout.WriteString("line 5\n")
+	// Print the current time.
+	os.Stdout.WriteString(time.Now().UTC().Format(time.RFC3339))
+}
diff --git a/src/runtime/testdata/testprog/checkptr.go b/src/runtime/testdata/testprog/checkptr.go
new file mode 100644
index 0000000..45e6fb1
--- /dev/null
+++ b/src/runtime/testdata/testprog/checkptr.go
@@ -0,0 +1,43 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "unsafe"
+
+func init() {
+	register("CheckPtrAlignmentNoPtr", CheckPtrAlignmentNoPtr)
+	register("CheckPtrAlignmentPtr", CheckPtrAlignmentPtr)
+	register("CheckPtrArithmetic", CheckPtrArithmetic)
+	register("CheckPtrSize", CheckPtrSize)
+	register("CheckPtrSmall", CheckPtrSmall)
+}
+
+func CheckPtrAlignmentNoPtr() {
+	var x [2]int64
+	p := unsafe.Pointer(&x[0])
+	sink2 = (*int64)(unsafe.Pointer(uintptr(p) + 1))
+}
+
+func CheckPtrAlignmentPtr() {
+	var x [2]int64
+	p := unsafe.Pointer(&x[0])
+	sink2 = (**int64)(unsafe.Pointer(uintptr(p) + 1))
+}
+
+func CheckPtrArithmetic() {
+	var x int
+	i := uintptr(unsafe.Pointer(&x))
+	sink2 = (*int)(unsafe.Pointer(i))
+}
+
+func CheckPtrSize() {
+	p := new(int64)
+	sink2 = p
+	sink2 = (*[100]int64)(unsafe.Pointer(p))
+}
+
+func CheckPtrSmall() {
+	sink2 = unsafe.Pointer(uintptr(1))
+}
diff --git a/src/runtime/testdata/testprog/deadlock.go b/src/runtime/testdata/testprog/deadlock.go
index 5f0d120..105d6a5 100644
--- a/src/runtime/testdata/testprog/deadlock.go
+++ b/src/runtime/testdata/testprog/deadlock.go
@@ -22,6 +22,9 @@
 	register("StackOverflow", StackOverflow)
 	register("ThreadExhaustion", ThreadExhaustion)
 	register("RecursivePanic", RecursivePanic)
+	register("RecursivePanic2", RecursivePanic2)
+	register("RecursivePanic3", RecursivePanic3)
+	register("RecursivePanic4", RecursivePanic4)
 	register("GoexitExit", GoexitExit)
 	register("GoNil", GoNil)
 	register("MainGoroutineID", MainGoroutineID)
@@ -29,6 +32,8 @@
 	register("GoexitInPanic", GoexitInPanic)
 	register("PanicAfterGoexit", PanicAfterGoexit)
 	register("RecoveredPanicAfterGoexit", RecoveredPanicAfterGoexit)
+	register("RecoverBeforePanicAfterGoexit", RecoverBeforePanicAfterGoexit)
+	register("RecoverBeforePanicAfterGoexit2", RecoverBeforePanicAfterGoexit2)
 	register("PanicTraceback", PanicTraceback)
 	register("GoschedInPanic", GoschedInPanic)
 	register("SyscallInPanic", SyscallInPanic)
@@ -111,6 +116,50 @@
 	panic("again")
 }
 
+// Same as RecursivePanic, but do the first recover and the second panic in
+// separate defers, and make sure they are executed in the correct order.
+func RecursivePanic2() {
+	func() {
+		defer func() {
+			fmt.Println(recover())
+		}()
+		var x [8192]byte
+		func(x [8192]byte) {
+			defer func() {
+				panic("second panic")
+			}()
+			defer func() {
+				fmt.Println(recover())
+			}()
+			panic("first panic")
+		}(x)
+	}()
+	panic("third panic")
+}
+
+// Make sure that the first panic finished as a panic, even though the second
+// panic was recovered
+func RecursivePanic3() {
+	defer func() {
+		defer func() {
+			recover()
+		}()
+		panic("second panic")
+	}()
+	panic("first panic")
+}
+
+// Test case where a single defer recovers one panic but starts another panic. If
+// the second panic is never recovered, then the recovered first panic will still
+// appear on the panic stack (labeled '[recovered]') and the runtime stack.
+func RecursivePanic4() {
+	defer func() {
+		recover()
+		panic("second panic")
+	}()
+	panic("first panic")
+}
+
 func GoexitExit() {
 	println("t1")
 	go func() {
@@ -202,6 +251,50 @@
 	runtime.Goexit()
 }
 
+func RecoverBeforePanicAfterGoexit() {
+	// 1. defer a function that recovers
+	// 2. defer a function that panics
+	// 3. call goexit
+	// Goexit runs the #2 defer. Its panic
+	// is caught by the #1 defer.  For Goexit, we explicitly
+	// resume execution in the Goexit loop, instead of resuming
+	// execution in the caller (which would make the Goexit disappear!)
+	defer func() {
+		r := recover()
+		if r == nil {
+			panic("bad recover")
+		}
+	}()
+	defer func() {
+		panic("hello")
+	}()
+	runtime.Goexit()
+}
+
+func RecoverBeforePanicAfterGoexit2() {
+	for i := 0; i < 2; i++ {
+		defer func() {
+		}()
+	}
+	// 1. defer a function that recovers
+	// 2. defer a function that panics
+	// 3. call goexit
+	// Goexit runs the #2 defer. Its panic
+	// is caught by the #1 defer.  For Goexit, we explicitly
+	// resume execution in the Goexit loop, instead of resuming
+	// execution in the caller (which would make the Goexit disappear!)
+	defer func() {
+		r := recover()
+		if r == nil {
+			panic("bad recover")
+		}
+	}()
+	defer func() {
+		panic("hello")
+	}()
+	runtime.Goexit()
+}
+
 func PanicTraceback() {
 	pt1()
 }
diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go
index cca9c45..74732cd 100644
--- a/src/runtime/testdata/testprog/gc.go
+++ b/src/runtime/testdata/testprog/gc.go
@@ -11,6 +11,7 @@
 	"runtime/debug"
 	"sync/atomic"
 	"time"
+	"unsafe"
 )
 
 func init() {
@@ -19,6 +20,7 @@
 	register("GCSys", GCSys)
 	register("GCPhys", GCPhys)
 	register("DeferLiveness", DeferLiveness)
+	register("GCZombie", GCZombie)
 }
 
 func GCSys() {
@@ -147,9 +149,24 @@
 		size    = 4 << 20
 		split   = 64 << 10
 		objects = 2
+
+		// The page cache could hide 64 8-KiB pages from the scavenger today.
+		maxPageCache = (8 << 10) * 64
+
+		// Reduce GOMAXPROCS down to 4 if it's greater. We need to bound the amount
+		// of memory held in the page cache because the scavenger can't reach it.
+		// The page cache will hold at most maxPageCache of memory per-P, so this
+		// bounds the amount of memory hidden from the scavenger to 4*maxPageCache
+		// at most.
+		maxProcs = 4
 	)
 	// Set GOGC so that this test operates under consistent assumptions.
 	debug.SetGCPercent(100)
+	procs := runtime.GOMAXPROCS(-1)
+	if procs > maxProcs {
+		defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(maxProcs))
+		procs = runtime.GOMAXPROCS(-1)
+	}
 	// Save objects which we want to survive, and condemn objects which we don't.
 	// Note that we condemn objects in this way and release them all at once in
 	// order to avoid having the GC start freeing up these objects while the loop
@@ -197,10 +214,22 @@
 	// Since the runtime should scavenge the entirety of the remaining holes,
 	// theoretically there should be no more free and unscavenged memory. However due
 	// to other allocations that happen during this test we may still see some physical
-	// memory over-use. 10% here is an arbitrary but very conservative threshold which
-	// should easily account for any other allocations this test may have done.
+	// memory over-use.
 	overuse := (float64(heapBacked) - float64(stats.HeapAlloc)) / float64(stats.HeapAlloc)
-	if overuse <= 0.10 {
+	// Compute the threshold.
+	//
+	// In theory, this threshold should just be zero, but that's not possible in practice.
+	// Firstly, the runtime's page cache can hide up to maxPageCache of free memory from the
+	// scavenger per P. To account for this, we increase the threshold by the ratio of the
+	// total amount the runtime could hide from the scavenger to the amount of memory we expect
+	// to be able to scavenge here, which is (size-split)*objects. This computation is the crux of
+	// the GOMAXPROCS cap above; if GOMAXPROCS is too high the threshold just becomes 100%+ since
+	// the amount of memory being allocated is fixed. Then we add 5% to account for noise, such as
+	// other allocations this test may have performed that we don't explicitly account for. The
+	// baseline threshold here is around 11% for GOMAXPROCS=1, capping out at around 30% for
+	// GOMAXPROCS=4.
+	threshold := 0.05 + float64(procs)*maxPageCache/float64((size-split)*objects)
+	if overuse <= threshold {
 		fmt.Println("OK")
 		return
 	}
@@ -210,8 +239,8 @@
 	// In the context of this test, this indicates a large amount of
 	// fragmentation with physical pages that are otherwise unused but not
 	// returned to the OS.
-	fmt.Printf("exceeded physical memory overuse threshold of 10%%: %3.2f%%\n"+
-		"(alloc: %d, goal: %d, sys: %d, rel: %d, objs: %d)\n", overuse*100,
+	fmt.Printf("exceeded physical memory overuse threshold of %3.2f%%: %3.2f%%\n"+
+		"(alloc: %d, goal: %d, sys: %d, rel: %d, objs: %d)\n", threshold*100, overuse*100,
 		stats.HeapAlloc, stats.NextGC, stats.HeapSys, stats.HeapReleased, len(saved))
 	runtime.KeepAlive(saved)
 }
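Plugging the test's constants into the new threshold formula reproduces the figures quoted in the comment, which is a useful sanity check (a standalone sketch, not part of the patch):

	package main

	import "fmt"

	func main() {
		const (
			size         = 4 << 20
			split        = 64 << 10
			objects      = 2
			maxPageCache = (8 << 10) * 64 // 64 pages of 8 KiB per P
		)
		for _, procs := range []int{1, 4} {
			threshold := 0.05 + float64(procs)*maxPageCache/float64((size-split)*objects)
			fmt.Printf("GOMAXPROCS=%d: threshold = %.1f%%\n", procs, threshold*100)
		}
		// Output:
		// GOMAXPROCS=1: threshold = 11.3%
		// GOMAXPROCS=4: threshold = 30.4%
	}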
@@ -237,3 +266,37 @@
 func escape(x interface{}) { sink2 = x; sink2 = nil }
 
 var sink2 interface{}
+
+// Test zombie object detection and reporting.
+func GCZombie() {
+	// Allocate several objects of unusual size (so free slots are
+	// unlikely to all be re-allocated by the runtime).
+	const size = 190
+	const count = 8192 / size
+	keep := make([]*byte, 0, (count+1)/2)
+	free := make([]uintptr, 0, (count+1)/2)
+	zombies := make([]*byte, 0, len(free))
+	for i := 0; i < count; i++ {
+		obj := make([]byte, size)
+		p := &obj[0]
+		if i%2 == 0 {
+			keep = append(keep, p)
+		} else {
+			free = append(free, uintptr(unsafe.Pointer(p)))
+		}
+	}
+
+	// Free the unreferenced objects.
+	runtime.GC()
+
+	// Bring the free objects back to life.
+	for _, p := range free {
+		zombies = append(zombies, (*byte)(unsafe.Pointer(p)))
+	}
+
+	// GC should detect the zombie objects.
+	runtime.GC()
+	println("failed")
+	runtime.KeepAlive(keep)
+	runtime.KeepAlive(zombies)
+}
diff --git a/src/runtime/testdata/testprog/lockosthread.go b/src/runtime/testdata/testprog/lockosthread.go
index fd3123e..e9d7fdb 100644
--- a/src/runtime/testdata/testprog/lockosthread.go
+++ b/src/runtime/testdata/testprog/lockosthread.go
@@ -7,6 +7,7 @@
 import (
 	"os"
 	"runtime"
+	"sync"
 	"time"
 )
 
@@ -30,6 +31,7 @@
 		runtime.LockOSThread()
 	})
 	register("LockOSThreadAvoidsStatePropagation", LockOSThreadAvoidsStatePropagation)
+	register("LockOSThreadTemplateThreadRace", LockOSThreadTemplateThreadRace)
 }
 
 func LockOSThreadMain() {
@@ -195,3 +197,50 @@
 	runtime.UnlockOSThread()
 	println("OK")
 }
+
+func LockOSThreadTemplateThreadRace() {
+	// This test attempts to reproduce the race described in
+	// golang.org/issue/38931. To do so, we must have a stop-the-world
+	// (achieved via ReadMemStats) racing with two LockOSThread calls.
+	//
+	// While this test attempts to line up the timing, it is only expected
+	// to fail (and thus hang) around 2% of the time if the race is
+	// present.
+
+	// Ensure enough Ps to actually run everything in parallel. Though on
+	// <4 core machines, we are still at the whim of the kernel scheduler.
+	runtime.GOMAXPROCS(4)
+
+	go func() {
+		// Stop the world; race with LockOSThread below.
+		var m runtime.MemStats
+		for {
+			runtime.ReadMemStats(&m)
+		}
+	}()
+
+	// Try to synchronize both LockOSThreads.
+	start := time.Now().Add(10 * time.Millisecond)
+
+	var wg sync.WaitGroup
+	wg.Add(2)
+
+	for i := 0; i < 2; i++ {
+		go func() {
+			for time.Now().Before(start) {
+			}
+
+			// Add work to the local runq to trigger early startm
+			// in handoffp.
+			go func() {}()
+
+			runtime.LockOSThread()
+			runtime.Gosched() // add a preemption point.
+			wg.Done()
+		}()
+	}
+
+	wg.Wait()
+	// If both LockOSThreads completed then we did not hit the race.
+	println("OK")
+}
diff --git a/src/runtime/testdata/testprog/numcpu_freebsd.go b/src/runtime/testdata/testprog/numcpu_freebsd.go
index 42ee154..aff36ec 100644
--- a/src/runtime/testdata/testprog/numcpu_freebsd.go
+++ b/src/runtime/testdata/testprog/numcpu_freebsd.go
@@ -85,7 +85,13 @@
 	if err != nil {
 		return nil, fmt.Errorf("fail to execute '%s': %s", cmdline, err)
 	}
-	pos := bytes.IndexRune(output, ':')
+	pos := bytes.IndexRune(output, '\n')
+	if pos == -1 {
+		return nil, fmt.Errorf("invalid output from '%s', '\\n' not found: %s", cmdline, output)
+	}
+	output = output[0:pos]
+
+	pos = bytes.IndexRune(output, ':')
 	if pos == -1 {
 		return nil, fmt.Errorf("invalid output from '%s', ':' not found: %s", cmdline, output)
 	}
diff --git a/src/runtime/testdata/testprog/panicprint.go b/src/runtime/testdata/testprog/panicprint.go
new file mode 100644
index 0000000..c8deabe
--- /dev/null
+++ b/src/runtime/testdata/testprog/panicprint.go
@@ -0,0 +1,111 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+type MyBool bool
+type MyComplex128 complex128
+type MyComplex64 complex64
+type MyFloat32 float32
+type MyFloat64 float64
+type MyInt int
+type MyInt8 int8
+type MyInt16 int16
+type MyInt32 int32
+type MyInt64 int64
+type MyString string
+type MyUint uint
+type MyUint8 uint8
+type MyUint16 uint16
+type MyUint32 uint32
+type MyUint64 uint64
+type MyUintptr uintptr
+
+func panicCustomComplex64() {
+	panic(MyComplex64(0.11 + 3i))
+}
+
+func panicCustomComplex128() {
+	panic(MyComplex128(32.1 + 10i))
+}
+
+func panicCustomString() {
+	panic(MyString("Panic"))
+}
+
+func panicCustomBool() {
+	panic(MyBool(true))
+}
+
+func panicCustomInt() {
+	panic(MyInt(93))
+}
+
+func panicCustomInt8() {
+	panic(MyInt8(93))
+}
+
+func panicCustomInt16() {
+	panic(MyInt16(93))
+}
+
+func panicCustomInt32() {
+	panic(MyInt32(93))
+}
+
+func panicCustomInt64() {
+	panic(MyInt64(93))
+}
+
+func panicCustomUint() {
+	panic(MyUint(93))
+}
+
+func panicCustomUint8() {
+	panic(MyUint8(93))
+}
+
+func panicCustomUint16() {
+	panic(MyUint16(93))
+}
+
+func panicCustomUint32() {
+	panic(MyUint32(93))
+}
+
+func panicCustomUint64() {
+	panic(MyUint64(93))
+}
+
+func panicCustomUintptr() {
+	panic(MyUintptr(93))
+}
+
+func panicCustomFloat64() {
+	panic(MyFloat64(-93.70))
+}
+
+func panicCustomFloat32() {
+	panic(MyFloat32(-93.70))
+}
+
+func init() {
+	register("panicCustomComplex64", panicCustomComplex64)
+	register("panicCustomComplex128", panicCustomComplex128)
+	register("panicCustomBool", panicCustomBool)
+	register("panicCustomFloat32", panicCustomFloat32)
+	register("panicCustomFloat64", panicCustomFloat64)
+	register("panicCustomInt", panicCustomInt)
+	register("panicCustomInt8", panicCustomInt8)
+	register("panicCustomInt16", panicCustomInt16)
+	register("panicCustomInt32", panicCustomInt32)
+	register("panicCustomInt64", panicCustomInt64)
+	register("panicCustomString", panicCustomString)
+	register("panicCustomUint", panicCustomUint)
+	register("panicCustomUint8", panicCustomUint8)
+	register("panicCustomUint16", panicCustomUint16)
+	register("panicCustomUint32", panicCustomUint32)
+	register("panicCustomUint64", panicCustomUint64)
+	register("panicCustomUintptr", panicCustomUintptr)
+}
diff --git a/src/runtime/testdata/testprog/preempt.go b/src/runtime/testdata/testprog/preempt.go
new file mode 100644
index 0000000..1c74d0e
--- /dev/null
+++ b/src/runtime/testdata/testprog/preempt.go
@@ -0,0 +1,71 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"runtime"
+	"runtime/debug"
+	"sync/atomic"
+)
+
+func init() {
+	register("AsyncPreempt", AsyncPreempt)
+}
+
+func AsyncPreempt() {
+	// Run with just 1 GOMAXPROCS so the runtime is required to
+	// use scheduler preemption.
+	runtime.GOMAXPROCS(1)
+	// Disable GC so we have complete control of what we're testing.
+	debug.SetGCPercent(-1)
+
+	// Start a goroutine with no sync safe-points.
+	var ready, ready2 uint32
+	go func() {
+		for {
+			atomic.StoreUint32(&ready, 1)
+			dummy()
+			dummy()
+		}
+	}()
+	// Also start one with a frameless function.
+	// This is an especially interesting case for
+	// LR machines.
+	go func() {
+		atomic.AddUint32(&ready2, 1)
+		frameless()
+	}()
+	// Also test empty infinite loop.
+	go func() {
+		atomic.AddUint32(&ready2, 1)
+		for {
+		}
+	}()
+
+	// Wait for the goroutine to stop passing through sync
+	// safe-points.
+	for atomic.LoadUint32(&ready) == 0 || atomic.LoadUint32(&ready2) < 2 {
+		runtime.Gosched()
+	}
+
+	// Run a GC, which will have to stop the goroutine for STW and
+	// for stack scanning. If this doesn't work, the test will
+	// deadlock and timeout.
+	runtime.GC()
+
+	println("OK")
+}
+
+//go:noinline
+func frameless() {
+	for i := int64(0); i < 1<<62; i++ {
+		out += i * i * i * i * i * 12345
+	}
+}
+
+var out int64
+
+//go:noinline
+func dummy() {}
diff --git a/src/runtime/testdata/testprog/signal.go b/src/runtime/testdata/testprog/signal.go
index 2ccbada..417e105 100644
--- a/src/runtime/testdata/testprog/signal.go
+++ b/src/runtime/testdata/testprog/signal.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !windows,!plan9,!nacl
+// +build !windows,!plan9
 
 package main
 
diff --git a/src/runtime/testdata/testprog/vdso.go b/src/runtime/testdata/testprog/vdso.go
new file mode 100644
index 0000000..ef92f48
--- /dev/null
+++ b/src/runtime/testdata/testprog/vdso.go
@@ -0,0 +1,55 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Invoke signal handler in the VDSO context (see issue 32912).
+
+package main
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"runtime/pprof"
+	"time"
+)
+
+func init() {
+	register("SignalInVDSO", signalInVDSO)
+}
+
+func signalInVDSO() {
+	f, err := ioutil.TempFile("", "timeprofnow")
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	if err := pprof.StartCPUProfile(f); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	t0 := time.Now()
+	t1 := t0
+	// We should get a profiling signal 100 times a second,
+	// so running for 1 second should be sufficient.
+	for t1.Sub(t0) < time.Second {
+		t1 = time.Now()
+	}
+
+	pprof.StopCPUProfile()
+
+	name := f.Name()
+	if err := f.Close(); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	if err := os.Remove(name); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	fmt.Println("success")
+}
diff --git a/src/runtime/testdata/testprogcgo/eintr.go b/src/runtime/testdata/testprogcgo/eintr.go
new file mode 100644
index 0000000..791ff1b
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/eintr.go
@@ -0,0 +1,246 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <errno.h>
+#include <signal.h>
+#include <string.h>
+
+static int clearRestart(int sig) {
+	struct sigaction sa;
+
+	memset(&sa, 0, sizeof sa);
+	if (sigaction(sig, NULL, &sa) < 0) {
+		return errno;
+	}
+	sa.sa_flags &=~ SA_RESTART;
+	if (sigaction(sig, &sa, NULL) < 0) {
+		return errno;
+	}
+	return 0;
+}
+*/
+import "C"
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+	"io"
+	"io/ioutil"
+	"log"
+	"net"
+	"os"
+	"os/exec"
+	"sync"
+	"syscall"
+	"time"
+)
+
+func init() {
+	register("EINTR", EINTR)
+	register("Block", Block)
+}
+
+// Test various operations when a signal handler is installed without
+// the SA_RESTART flag. This tests that the os and net APIs handle EINTR.
+func EINTR() {
+	if errno := C.clearRestart(C.int(syscall.SIGURG)); errno != 0 {
+		log.Fatal(syscall.Errno(errno))
+	}
+	if errno := C.clearRestart(C.int(syscall.SIGWINCH)); errno != 0 {
+		log.Fatal(syscall.Errno(errno))
+	}
+	if errno := C.clearRestart(C.int(syscall.SIGCHLD)); errno != 0 {
+		log.Fatal(syscall.Errno(errno))
+	}
+
+	var wg sync.WaitGroup
+	testPipe(&wg)
+	testNet(&wg)
+	testExec(&wg)
+	wg.Wait()
+	fmt.Println("OK")
+}
+
+// spin does CPU bound spinning and allocating for a millisecond,
+// to get a SIGURG.
+//go:noinline
+func spin() (float64, []byte) {
+	stop := time.Now().Add(time.Millisecond)
+	r1 := 0.0
+	r2 := make([]byte, 200)
+	for time.Now().Before(stop) {
+		for i := 1; i < 1e6; i++ {
+			r1 += r1 / float64(i)
+			r2 = append(r2, bytes.Repeat([]byte{byte(i)}, 100)...)
+			r2 = r2[100:]
+		}
+	}
+	return r1, r2
+}
+
+// winch sends a few SIGWINCH signals to the process.
+func winch() {
+	ticker := time.NewTicker(100 * time.Microsecond)
+	defer ticker.Stop()
+	pid := syscall.Getpid()
+	for n := 10; n > 0; n-- {
+		syscall.Kill(pid, syscall.SIGWINCH)
+		<-ticker.C
+	}
+}
+
+// sendSomeSignals triggers a few SIGURG and SIGWINCH signals.
+func sendSomeSignals() {
+	done := make(chan struct{})
+	go func() {
+		spin()
+		close(done)
+	}()
+	winch()
+	<-done
+}
+
+// testPipe tests pipe operations.
+func testPipe(wg *sync.WaitGroup) {
+	r, w, err := os.Pipe()
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err := syscall.SetNonblock(int(r.Fd()), false); err != nil {
+		log.Fatal(err)
+	}
+	if err := syscall.SetNonblock(int(w.Fd()), false); err != nil {
+		log.Fatal(err)
+	}
+	wg.Add(2)
+	go func() {
+		defer wg.Done()
+		defer w.Close()
+		// Spin before calling Write so that the first ReadFull
+		// in the other goroutine will likely be interrupted
+		// by a signal.
+		sendSomeSignals()
+		// This Write will likely be interrupted by a signal
+		// as the other goroutine spins in the middle of reading.
+		// We write enough data that we should always fill the
+		// pipe buffer and need multiple write system calls.
+		if _, err := w.Write(bytes.Repeat([]byte{0}, 2<<20)); err != nil {
+			log.Fatal(err)
+		}
+	}()
+	go func() {
+		defer wg.Done()
+		defer r.Close()
+		b := make([]byte, 1<<20)
+		// This ReadFull will likely be interrupted by a signal,
+		// as the other goroutine spins before writing anything.
+		if _, err := io.ReadFull(r, b); err != nil {
+			log.Fatal(err)
+		}
+		// Spin after reading half the data so that the Write
+		// in the other goroutine will likely be interrupted
+		// before it completes.
+		sendSomeSignals()
+		if _, err := io.ReadFull(r, b); err != nil {
+			log.Fatal(err)
+		}
+	}()
+}
+
+// testNet tests network operations.
+func testNet(wg *sync.WaitGroup) {
+	ln, err := net.Listen("tcp4", "127.0.0.1:0")
+	if err != nil {
+		if errors.Is(err, syscall.EAFNOSUPPORT) || errors.Is(err, syscall.EPROTONOSUPPORT) {
+			return
+		}
+		log.Fatal(err)
+	}
+	wg.Add(2)
+	go func() {
+		defer wg.Done()
+		defer ln.Close()
+		c, err := ln.Accept()
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer c.Close()
+		cf, err := c.(*net.TCPConn).File()
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer cf.Close()
+		if err := syscall.SetNonblock(int(cf.Fd()), false); err != nil {
+			log.Fatal(err)
+		}
+		// See comments in testPipe.
+		sendSomeSignals()
+		if _, err := cf.Write(bytes.Repeat([]byte{0}, 2<<20)); err != nil {
+			log.Fatal(err)
+		}
+	}()
+	go func() {
+		defer wg.Done()
+		sendSomeSignals()
+		c, err := net.Dial("tcp", ln.Addr().String())
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer c.Close()
+		cf, err := c.(*net.TCPConn).File()
+		if err != nil {
+			log.Fatal(err)
+		}
+		defer cf.Close()
+		if err := syscall.SetNonblock(int(cf.Fd()), false); err != nil {
+			log.Fatal(err)
+		}
+		// See comments in testPipe.
+		b := make([]byte, 1<<20)
+		if _, err := io.ReadFull(cf, b); err != nil {
+			log.Fatal(err)
+		}
+		sendSomeSignals()
+		if _, err := io.ReadFull(cf, b); err != nil {
+			log.Fatal(err)
+		}
+	}()
+}
+
+func testExec(wg *sync.WaitGroup) {
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		cmd := exec.Command(os.Args[0], "Block")
+		stdin, err := cmd.StdinPipe()
+		if err != nil {
+			log.Fatal(err)
+		}
+		cmd.Stderr = new(bytes.Buffer)
+		cmd.Stdout = cmd.Stderr
+		if err := cmd.Start(); err != nil {
+			log.Fatal(err)
+		}
+
+		go func() {
+			sendSomeSignals()
+			stdin.Close()
+		}()
+
+		if err := cmd.Wait(); err != nil {
+			log.Fatalf("%v:\n%s", err, cmd.Stdout)
+		}
+	}()
+}
+
+// Block blocks until stdin is closed.
+func Block() {
+	io.Copy(ioutil.Discard, os.Stdin)
+}
diff --git a/src/runtime/testdata/testprogcgo/numgoroutine.go b/src/runtime/testdata/testprogcgo/numgoroutine.go
index 12fda49..5bdfe52 100644
--- a/src/runtime/testdata/testprogcgo/numgoroutine.go
+++ b/src/runtime/testdata/testprogcgo/numgoroutine.go
@@ -41,13 +41,6 @@
 	// Test that there are just the expected number of goroutines
 	// running. Specifically, test that the spare M's goroutine
 	// doesn't show up.
-	//
-	// On non-Windows platforms there's a signal handling thread
-	// started by os/signal.init in addition to the main
-	// goroutine.
-	if runtime.GOOS != "windows" {
-		baseGoroutines = 1
-	}
 	if _, ok := checkNumGoroutine("first", 1+baseGoroutines); !ok {
 		return
 	}
diff --git a/src/runtime/testdata/testprogcgo/segv.go b/src/runtime/testdata/testprogcgo/segv.go
new file mode 100644
index 0000000..3237a8c
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/segv.go
@@ -0,0 +1,56 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+// static void nop() {}
+import "C"
+
+import (
+	"syscall"
+	"time"
+)
+
+func init() {
+	register("Segv", Segv)
+	register("SegvInCgo", SegvInCgo)
+}
+
+var Sum int
+
+func Segv() {
+	c := make(chan bool)
+	go func() {
+		close(c)
+		for i := 0; ; i++ {
+			Sum += i
+		}
+	}()
+
+	<-c
+
+	syscall.Kill(syscall.Getpid(), syscall.SIGSEGV)
+
+	// Give the OS time to deliver the signal.
+	time.Sleep(time.Second)
+}
+
+func SegvInCgo() {
+	c := make(chan bool)
+	go func() {
+		close(c)
+		for {
+			C.nop()
+		}
+	}()
+
+	<-c
+
+	syscall.Kill(syscall.Getpid(), syscall.SIGSEGV)
+
+	// Give the OS time to deliver the signal.
+	time.Sleep(time.Second)
+}
diff --git a/src/runtime/testdata/testprognet/signal.go b/src/runtime/testdata/testprognet/signal.go
index a1559fe..4d2de79 100644
--- a/src/runtime/testdata/testprognet/signal.go
+++ b/src/runtime/testdata/testprognet/signal.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !windows,!plan9,!nacl
+// +build !windows,!plan9
 
 // This is in testprognet instead of testprog because testprog
 // must not import anything (like net, but also like os/signal)
diff --git a/src/runtime/testdata/testwinlib/main.c b/src/runtime/testdata/testwinlib/main.c
new file mode 100644
index 0000000..e84a32f
--- /dev/null
+++ b/src/runtime/testdata/testwinlib/main.c
@@ -0,0 +1,57 @@
+#include <stdio.h>
+#include <windows.h>
+#include "testwinlib.h"
+
+int exceptionCount;
+int continueCount;
+LONG WINAPI customExceptionHandlder(struct _EXCEPTION_POINTERS *ExceptionInfo)
+{
+    if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_BREAKPOINT)
+    {
+        exceptionCount++;
+        // prepare context to resume execution
+        CONTEXT *c = ExceptionInfo->ContextRecord;
+        c->Rip = *(ULONG_PTR *)c->Rsp;
+        c->Rsp += 8;
+        return EXCEPTION_CONTINUE_EXECUTION;
+    }
+    return EXCEPTION_CONTINUE_SEARCH;
+}
+LONG WINAPI customContinueHandlder(struct _EXCEPTION_POINTERS *ExceptionInfo)
+{
+    if (ExceptionInfo->ExceptionRecord->ExceptionCode == EXCEPTION_BREAKPOINT)
+    {
+        continueCount++;
+        return EXCEPTION_CONTINUE_EXECUTION;
+    }
+    return EXCEPTION_CONTINUE_SEARCH;
+}
+
+void throwFromC()
+{
+    DebugBreak();
+}
+int main()
+{
+    // simulate a "lazily" attached debugger, by calling some go code before attaching the exception/continue handler
+    Dummy();
+    exceptionCount = 0;
+    continueCount = 0;
+    void *exceptionHandlerHandle = AddVectoredExceptionHandler(0, customExceptionHandlder);
+    if (NULL == exceptionHandlerHandle)
+    {
+        printf("cannot add vectored exception handler\n");
+        return 2;
+    }
+    void *continueHandlerHandle = AddVectoredContinueHandler(0, customContinueHandlder);
+    if (NULL == continueHandlerHandle)
+    {
+        printf("cannot add vectored continue handler\n");
+        return 2;
+    }
+    CallMeBack(throwFromC);
+    RemoveVectoredContinueHandler(continueHandlerHandle);
+    RemoveVectoredExceptionHandler(exceptionHandlerHandle);
+    printf("exceptionCount: %d\ncontinueCount: %d\n", exceptionCount, continueCount);
+    return 0;
+}
\ No newline at end of file
diff --git a/src/runtime/testdata/testwinlib/main.go b/src/runtime/testdata/testwinlib/main.go
new file mode 100644
index 0000000..400eaa1
--- /dev/null
+++ b/src/runtime/testdata/testwinlib/main.go
@@ -0,0 +1,28 @@
+// +build windows,cgo
+
+package main
+
+// #include <windows.h>
+// typedef void(*callmeBackFunc)();
+// static void bridgeCallback(callmeBackFunc callback) {
+//	callback();
+//}
+import "C"
+
+// CallMeBack call backs C code.
+//export CallMeBack
+func CallMeBack(callback C.callmeBackFunc) {
+	C.bridgeCallback(callback)
+}
+
+// Dummy is called by the C code before registering the exception/continue handlers simulating a debugger.
+// This makes sure that the Go runtime's lastcontinuehandler is reached before the C continue handler and thus
+// validates that it does not crash the program before another handler could take an action.
+// The idea here is to reproduce what happens when you attach a debugger to a running program.
+// It also simulates the behavior of the .NET debugger, which registers its exception/continue handlers lazily.
+//export Dummy
+func Dummy() int {
+	return 42
+}
+
+func main() {}
diff --git a/src/runtime/testdata/testwinlibsignal/dummy.go b/src/runtime/testdata/testwinlibsignal/dummy.go
new file mode 100644
index 0000000..82dfd91
--- /dev/null
+++ b/src/runtime/testdata/testwinlibsignal/dummy.go
@@ -0,0 +1,10 @@
+// +build windows
+
+package main
+
+//export Dummy
+func Dummy() int {
+	return 42
+}
+
+func main() {}
diff --git a/src/runtime/testdata/testwinlibsignal/main.c b/src/runtime/testdata/testwinlibsignal/main.c
new file mode 100644
index 0000000..1787fef
--- /dev/null
+++ b/src/runtime/testdata/testwinlibsignal/main.c
@@ -0,0 +1,50 @@
+#include <windows.h>
+#include <stdio.h>
+
+HANDLE waitForCtrlBreakEvent;
+
+BOOL WINAPI CtrlHandler(DWORD fdwCtrlType)
+{
+    switch (fdwCtrlType)
+    {
+    case CTRL_BREAK_EVENT:
+        SetEvent(waitForCtrlBreakEvent);
+        return TRUE;
+    default:
+        return FALSE;
+    }
+}
+
+int main(void)
+{
+    waitForCtrlBreakEvent = CreateEvent(NULL, TRUE, FALSE, NULL);
+    if (!waitForCtrlBreakEvent) {
+        fprintf(stderr, "ERROR: Could not create event");
+        return 1;
+    }
+
+    if (!SetConsoleCtrlHandler(CtrlHandler, TRUE))
+    {
+        fprintf(stderr, "ERROR: Could not set control handler");
+        return 1;
+    }
+
+    // The library must be loaded after the SetConsoleCtrlHandler call
+    // so that the library handler registers after the main program.
+    // This way the library handler gets called first.
+    HMODULE dummyDll = LoadLibrary("dummy.dll");
+    if (!dummyDll) {
+        fprintf(stderr, "ERROR: Could not load dummy.dll");
+        return 1;
+    }
+
+    printf("ready\n");
+    fflush(stdout);
+
+    if (WaitForSingleObject(waitForCtrlBreakEvent, 5000) != WAIT_OBJECT_0) {
+        fprintf(stderr, "FAILURE: No signal received");
+        return 1;
+    }
+
+    return 0;
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index 28a4722..fdb5066 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -7,17 +7,18 @@
 package runtime
 
 import (
-	"internal/cpu"
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
 // Package time knows the layout of this structure.
 // If this struct changes, adjust ../time/sleep.go:/runtimeTimer.
-// For GOOS=nacl, package syscall knows the layout of this structure.
-// If this struct changes, adjust ../syscall/net_nacl.go:/runtimeTimer.
 type timer struct {
-	tb *timersBucket // the bucket the timer lives in
-	i  int           // heap index
+	// If this timer is on a heap, which P's heap it is on.
+	// puintptr rather than *p to match uintptr in the versions
+	// of this struct defined in other packages.
+	pp puintptr
 
 	// Timer wakes up at when, and then at when+period, ... (period > 0 only)
 	// each time calling f(arg, now) in the timer goroutine, so f must be
@@ -27,52 +28,141 @@
 	f      func(interface{}, uintptr)
 	arg    interface{}
 	seq    uintptr
+
+	// What to set the when field to in timerModifiedXX status.
+	nextwhen int64
+
+	// The status field holds one of the values below.
+	status uint32
 }
 
-// timersLen is the length of timers array.
+// Code outside this file has to be careful in using a timer value.
 //
-// Ideally, this would be set to GOMAXPROCS, but that would require
-// dynamic reallocation
+// The pp, status, and nextwhen fields may only be used by code in this file.
 //
-// The current value is a compromise between memory usage and performance
-// that should cover the majority of GOMAXPROCS values used in the wild.
-const timersLen = 64
-
-// timers contains "per-P" timer heaps.
+// Code that creates a new timer value can set the when, period, f,
+// arg, and seq fields.
+// A new timer value may be passed to addtimer (called by time.startTimer).
+// After doing that no fields may be touched.
 //
-// Timers are queued into timersBucket associated with the current P,
-// so each P may work with its own timers independently of other P instances.
+// An active timer (one that has been passed to addtimer) may be
+// passed to deltimer (time.stopTimer), after which it is no longer an
+// active timer. It is an inactive timer.
+// In an inactive timer the period, f, arg, and seq fields may be modified,
+// but not the when field.
+// It's OK to just drop an inactive timer and let the GC collect it.
+// It's not OK to pass an inactive timer to addtimer.
+// Only newly allocated timer values may be passed to addtimer.
 //
-// Each timersBucket may be associated with multiple P
-// if GOMAXPROCS > timersLen.
-var timers [timersLen]struct {
-	timersBucket
+// An active timer may be passed to modtimer. No fields may be touched.
+// It remains an active timer.
+//
+// An inactive timer may be passed to resettimer to turn into an
+// active timer with an updated when field.
+// It's OK to pass a newly allocated timer value to resettimer.
+//
+// Timer operations are addtimer, deltimer, modtimer, resettimer,
+// cleantimers, adjusttimers, and runtimer.
+//
+// We don't permit calling addtimer/deltimer/modtimer/resettimer simultaneously,
+// but adjusttimers and runtimer can be called at the same time as any of those.
+//
+// Active timers live in heaps attached to P, in the timers field.
+// Inactive timers live there too temporarily, until they are removed.
+//
+// addtimer:
+//   timerNoStatus   -> timerWaiting
+//   anything else   -> panic: invalid value
+// deltimer:
+//   timerWaiting         -> timerModifying -> timerDeleted
+//   timerModifiedEarlier -> timerModifying -> timerDeleted
+//   timerModifiedLater   -> timerModifying -> timerDeleted
+//   timerNoStatus        -> do nothing
+//   timerDeleted         -> do nothing
+//   timerRemoving        -> do nothing
+//   timerRemoved         -> do nothing
+//   timerRunning         -> wait until status changes
+//   timerMoving          -> wait until status changes
+//   timerModifying       -> wait until status changes
+// modtimer:
+//   timerWaiting    -> timerModifying -> timerModifiedXX
+//   timerModifiedXX -> timerModifying -> timerModifiedYY
+//   timerNoStatus   -> timerModifying -> timerWaiting
+//   timerRemoved    -> timerModifying -> timerWaiting
+//   timerDeleted    -> timerModifying -> timerModifiedXX
+//   timerRunning    -> wait until status changes
+//   timerMoving     -> wait until status changes
+//   timerRemoving   -> wait until status changes
+//   timerModifying  -> wait until status changes
+// cleantimers (looks in P's timer heap):
+//   timerDeleted    -> timerRemoving -> timerRemoved
+//   timerModifiedXX -> timerMoving -> timerWaiting
+// adjusttimers (looks in P's timer heap):
+//   timerDeleted    -> timerRemoving -> timerRemoved
+//   timerModifiedXX -> timerMoving -> timerWaiting
+// runtimer (looks in P's timer heap):
+//   timerNoStatus   -> panic: uninitialized timer
+//   timerWaiting    -> timerWaiting or
+//   timerWaiting    -> timerRunning -> timerNoStatus or
+//   timerWaiting    -> timerRunning -> timerWaiting
+//   timerModifying  -> wait until status changes
+//   timerModifiedXX -> timerMoving -> timerWaiting
+//   timerDeleted    -> timerRemoving -> timerRemoved
+//   timerRunning    -> panic: concurrent runtimer calls
+//   timerRemoved    -> panic: inconsistent timer heap
+//   timerRemoving   -> panic: inconsistent timer heap
+//   timerMoving     -> panic: inconsistent timer heap
 
-	// The padding should eliminate false sharing
-	// between timersBucket values.
-	pad [cpu.CacheLinePadSize - unsafe.Sizeof(timersBucket{})%cpu.CacheLinePadSize]byte
-}
+// Values for the timer status field.
+const (
+	// Timer has no status set yet.
+	timerNoStatus = iota
 
-func (t *timer) assignBucket() *timersBucket {
-	id := uint8(getg().m.p.ptr().id) % timersLen
-	t.tb = &timers[id].timersBucket
-	return t.tb
-}
+	// Waiting for timer to fire.
+	// The timer is in some P's heap.
+	timerWaiting
 
-//go:notinheap
-type timersBucket struct {
-	lock         mutex
-	gp           *g
-	created      bool
-	sleeping     bool
-	rescheduling bool
-	sleepUntil   int64
-	waitnote     note
-	t            []*timer
-}
+	// Running the timer function.
+	// A timer will only have this status briefly.
+	timerRunning
 
-// nacl fake time support - time in nanoseconds since 1970
-var faketime int64
+	// The timer is deleted and should be removed.
+	// It should not be run, but it is still in some P's heap.
+	timerDeleted
+
+	// The timer is being removed.
+	// The timer will only have this status briefly.
+	timerRemoving
+
+	// The timer has been stopped.
+	// It is not in any P's heap.
+	timerRemoved
+
+	// The timer is being modified.
+	// The timer will only have this status briefly.
+	timerModifying
+
+	// The timer has been modified to an earlier time.
+	// The new when value is in the nextwhen field.
+	// The timer is in some P's heap, possibly in the wrong place.
+	timerModifiedEarlier
+
+	// The timer has been modified to the same or a later time.
+	// The new when value is in the nextwhen field.
+	// The timer is in some P's heap, possibly in the wrong place.
+	timerModifiedLater
+
+	// The timer has been modified and is being moved.
+	// The timer will only have this status briefly.
+	timerMoving
+)
+
+// maxWhen is the maximum value for timer's when field.
+const maxWhen = 1<<63 - 1
+
+// verifyTimers can be set to true to add debugging checks that the
+// timer heaps are valid.
+const verifyTimers = false
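
The status values above are driven entirely by compare-and-swap loops: stable states such as timerWaiting or timerDeleted are reached only through a brief transient state such as timerModifying, so exactly one goroutine wins each transition. Below is a minimal, self-contained sketch of that protocol using simplified stand-in states rather than the runtime's own constants; it is illustrative only.

	package main

	import (
		"fmt"
		"runtime"
		"sync/atomic"
	)

	const (
		stWaiting   uint32 = iota // stable: timer is live
		stModifying               // transient: one goroutine owns the timer
		stDeleted                 // stable: timer is logically removed
	)

	// tryDelete mimics the timerWaiting -> timerModifying -> timerDeleted
	// transition and reports whether this caller performed the deletion.
	func tryDelete(status *uint32) bool {
		for {
			switch s := atomic.LoadUint32(status); s {
			case stWaiting:
				if atomic.CompareAndSwapUint32(status, s, stModifying) {
					atomic.StoreUint32(status, stDeleted)
					return true
				}
			case stDeleted:
				return false // someone else got there first
			case stModifying:
				runtime.Gosched() // another goroutine is mid-transition; retry
			}
		}
	}

	func main() {
		status := stWaiting
		fmt.Println(tryDelete(&status)) // true
		fmt.Println(tryDelete(&status)) // false
	}

The deltimer and modtimer functions later in this file follow the same shape, with osyield() in place of Gosched() while another goroutine holds a transient state.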
 
 // Package time APIs.
 // Godoc uses the comments in package time, not these.
@@ -92,17 +182,20 @@
 		t = new(timer)
 		gp.timer = t
 	}
-	*t = timer{}
-	t.when = nanotime() + ns
 	t.f = goroutineReady
 	t.arg = gp
-	tb := t.assignBucket()
-	lock(&tb.lock)
-	if !tb.addtimerLocked(t) {
-		unlock(&tb.lock)
-		badTimer()
-	}
-	goparkunlock(&tb.lock, waitReasonSleep, traceEvGoSleep, 2)
+	t.nextwhen = nanotime() + ns
+	gopark(resetForSleep, unsafe.Pointer(t), waitReasonSleep, traceEvGoSleep, 1)
+}
+
+// resetForSleep is called after the goroutine is parked for timeSleep.
+// We can't call resettimer in timeSleep itself because if this is a short
+// sleep and there are many goroutines then the P can wind up running the
+// timer function, goroutineReady, before the goroutine has been parked.
+func resetForSleep(gp *g, ut unsafe.Pointer) bool {
+	t := (*timer)(ut)
+	resettimer(t, t.nextwhen)
+	return true
 }
 
 // startTimer adds t to the timer heap.
@@ -114,13 +207,29 @@
 	addtimer(t)
 }
 
-// stopTimer removes t from the timer heap if it is there.
-// It returns true if t was removed, false if t wasn't even there.
+// stopTimer stops a timer.
+// It reports whether t was stopped before being run.
 //go:linkname stopTimer time.stopTimer
 func stopTimer(t *timer) bool {
 	return deltimer(t)
 }
 
+// resetTimer resets an inactive timer, adding it to the heap.
+//go:linkname resetTimer time.resetTimer
+// Reports whether the timer was modified before it was run.
+func resetTimer(t *timer, when int64) bool {
+	if raceenabled {
+		racerelease(unsafe.Pointer(t))
+	}
+	return resettimer(t, when)
+}
+
+// modTimer modifies an existing timer.
+//go:linkname modTimer time.modTimer
+func modTimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) {
+	modtimer(t, when, period, f, arg, seq)
+}
+
 // Go runtime.
 
 // Ready the goroutine arg.
@@ -128,251 +237,808 @@
 	goready(arg.(*g), 0)
 }
 
+// addtimer adds a timer to the current P.
+// This should only be called with a newly created timer.
+// That avoids the risk of changing the when field of a timer in some P's heap,
+// which could cause the heap to become unsorted.
 func addtimer(t *timer) {
-	tb := t.assignBucket()
-	lock(&tb.lock)
-	ok := tb.addtimerLocked(t)
-	unlock(&tb.lock)
-	if !ok {
-		badTimer()
-	}
-}
-
-// Add a timer to the heap and start or kick timerproc if the new timer is
-// earlier than any of the others.
-// Timers are locked.
-// Returns whether all is well: false if the data structure is corrupt
-// due to user-level races.
-func (tb *timersBucket) addtimerLocked(t *timer) bool {
-	// when must never be negative; otherwise timerproc will overflow
+	// when must never be negative; otherwise runtimer will overflow
 	// during its delta calculation and never expire other runtime timers.
 	if t.when < 0 {
-		t.when = 1<<63 - 1
+		t.when = maxWhen
 	}
-	t.i = len(tb.t)
-	tb.t = append(tb.t, t)
-	if !siftupTimer(tb.t, t.i) {
-		return false
+	if t.status != timerNoStatus {
+		throw("addtimer called with initialized timer")
 	}
-	if t.i == 0 {
-		// siftup moved to top: new earliest deadline.
-		if tb.sleeping && tb.sleepUntil > t.when {
-			tb.sleeping = false
-			notewakeup(&tb.waitnote)
-		}
-		if tb.rescheduling {
-			tb.rescheduling = false
-			goready(tb.gp, 0)
-		}
-		if !tb.created {
-			tb.created = true
-			go timerproc(tb)
-		}
-	}
-	return true
+	t.status = timerWaiting
+
+	when := t.when
+
+	pp := getg().m.p.ptr()
+	lock(&pp.timersLock)
+	cleantimers(pp)
+	doaddtimer(pp, t)
+	unlock(&pp.timersLock)
+
+	wakeNetPoller(when)
 }
 
-// Delete timer t from the heap.
-// Do not need to update the timerproc: if it wakes up early, no big deal.
+// doaddtimer adds t to the current P's heap.
+// The caller must have locked the timers for pp.
+func doaddtimer(pp *p, t *timer) {
+	// Timers rely on the network poller, so make sure the poller
+	// has started.
+	if netpollInited == 0 {
+		netpollGenericInit()
+	}
+
+	if t.pp != 0 {
+		throw("doaddtimer: P already set in timer")
+	}
+	t.pp.set(pp)
+	i := len(pp.timers)
+	pp.timers = append(pp.timers, t)
+	siftupTimer(pp.timers, i)
+	if t == pp.timers[0] {
+		atomic.Store64(&pp.timer0When, uint64(t.when))
+	}
+	atomic.Xadd(&pp.numTimers, 1)
+}
+
+// deltimer deletes the timer t. It may be on some other P, so we can't
+// actually remove it from the timers heap. We can only mark it as deleted.
+// It will be removed in due course by the P whose heap it is on.
+// Reports whether the timer was removed before it was run.
 func deltimer(t *timer) bool {
-	if t.tb == nil {
-		// t.tb can be nil if the user created a timer
-		// directly, without invoking startTimer e.g
-		//    time.Ticker{C: c}
-		// In this case, return early without any deletion.
-		// See Issue 21874.
-		return false
-	}
-
-	tb := t.tb
-
-	lock(&tb.lock)
-	removed, ok := tb.deltimerLocked(t)
-	unlock(&tb.lock)
-	if !ok {
-		badTimer()
-	}
-	return removed
-}
-
-func (tb *timersBucket) deltimerLocked(t *timer) (removed, ok bool) {
-	// t may not be registered anymore and may have
-	// a bogus i (typically 0, if generated by Go).
-	// Verify it before proceeding.
-	i := t.i
-	last := len(tb.t) - 1
-	if i < 0 || i > last || tb.t[i] != t {
-		return false, true
-	}
-	if i != last {
-		tb.t[i] = tb.t[last]
-		tb.t[i].i = i
-	}
-	tb.t[last] = nil
-	tb.t = tb.t[:last]
-	ok = true
-	if i != last {
-		if !siftupTimer(tb.t, i) {
-			ok = false
-		}
-		if !siftdownTimer(tb.t, i) {
-			ok = false
-		}
-	}
-	return true, ok
-}
-
-func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) {
-	tb := t.tb
-
-	lock(&tb.lock)
-	_, ok := tb.deltimerLocked(t)
-	if ok {
-		t.when = when
-		t.period = period
-		t.f = f
-		t.arg = arg
-		t.seq = seq
-		ok = tb.addtimerLocked(t)
-	}
-	unlock(&tb.lock)
-	if !ok {
-		badTimer()
-	}
-}
-
-// Timerproc runs the time-driven events.
-// It sleeps until the next event in the tb heap.
-// If addtimer inserts a new earlier event, it wakes timerproc early.
-func timerproc(tb *timersBucket) {
-	tb.gp = getg()
 	for {
-		lock(&tb.lock)
-		tb.sleeping = false
-		now := nanotime()
-		delta := int64(-1)
-		for {
-			if len(tb.t) == 0 {
-				delta = -1
-				break
-			}
-			t := tb.t[0]
-			delta = t.when - now
-			if delta > 0 {
-				break
-			}
-			ok := true
-			if t.period > 0 {
-				// leave in heap but adjust next time to fire
-				t.when += t.period * (1 + -delta/t.period)
-				if !siftdownTimer(tb.t, 0) {
-					ok = false
+		switch s := atomic.Load(&t.status); s {
+		case timerWaiting, timerModifiedLater:
+			// Prevent preemption while the timer is in timerModifying.
+			// This could lead to a self-deadlock. See #38070.
+			mp := acquirem()
+			if atomic.Cas(&t.status, s, timerModifying) {
+				// Must fetch t.pp before changing status,
+				// as cleantimers in another goroutine
+				// can clear t.pp of a timerDeleted timer.
+				tpp := t.pp.ptr()
+				if !atomic.Cas(&t.status, timerModifying, timerDeleted) {
+					badTimer()
 				}
+				releasem(mp)
+				atomic.Xadd(&tpp.deletedTimers, 1)
+				// Timer was not yet run.
+				return true
 			} else {
-				// remove from heap
-				last := len(tb.t) - 1
-				if last > 0 {
-					tb.t[0] = tb.t[last]
-					tb.t[0].i = 0
-				}
-				tb.t[last] = nil
-				tb.t = tb.t[:last]
-				if last > 0 {
-					if !siftdownTimer(tb.t, 0) {
-						ok = false
-					}
-				}
-				t.i = -1 // mark as removed
+				releasem(mp)
 			}
-			f := t.f
-			arg := t.arg
-			seq := t.seq
-			unlock(&tb.lock)
-			if !ok {
+		case timerModifiedEarlier:
+			// Prevent preemption while the timer is in timerModifying.
+			// This could lead to a self-deadlock. See #38070.
+			mp := acquirem()
+			if atomic.Cas(&t.status, s, timerModifying) {
+				// Must fetch t.pp before setting status
+				// to timerDeleted.
+				tpp := t.pp.ptr()
+				atomic.Xadd(&tpp.adjustTimers, -1)
+				if !atomic.Cas(&t.status, timerModifying, timerDeleted) {
+					badTimer()
+				}
+				releasem(mp)
+				atomic.Xadd(&tpp.deletedTimers, 1)
+				// Timer was not yet run.
+				return true
+			} else {
+				releasem(mp)
+			}
+		case timerDeleted, timerRemoving, timerRemoved:
+			// Timer was already run.
+			return false
+		case timerRunning, timerMoving:
+			// The timer is being run or moved, by a different P.
+			// Wait for it to complete.
+			osyield()
+		case timerNoStatus:
+			// Removing timer that was never added or
+			// has already been run. Also see issue 21874.
+			return false
+		case timerModifying:
+			// Simultaneous calls to deltimer and modtimer.
+			// Wait for the other call to complete.
+			osyield()
+		default:
+			badTimer()
+		}
+	}
+}
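
For orientation, a hedged usage sketch of how deltimer's result surfaces to user code: time.(*Timer).Stop reaches deltimer via the time.stopTimer linkname above, and a false result means the timer may already have fired, so its channel may need draining.

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		t := time.NewTimer(50 * time.Millisecond)
		if !t.Stop() {
			// Timer already fired; drain the channel so it can be reused.
			<-t.C
		}
		fmt.Println("timer stopped or drained")
	}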
+
+// dodeltimer removes timer i from the current P's heap.
+// We are locked on the P when this is called.
+// The caller must have locked the timers for pp.
+func dodeltimer(pp *p, i int) {
+	if t := pp.timers[i]; t.pp.ptr() != pp {
+		throw("dodeltimer: wrong P")
+	} else {
+		t.pp = 0
+	}
+	last := len(pp.timers) - 1
+	if i != last {
+		pp.timers[i] = pp.timers[last]
+	}
+	pp.timers[last] = nil
+	pp.timers = pp.timers[:last]
+	if i != last {
+		// Moving to i may have moved the last timer to a new parent,
+		// so sift up to preserve the heap guarantee.
+		siftupTimer(pp.timers, i)
+		siftdownTimer(pp.timers, i)
+	}
+	if i == 0 {
+		updateTimer0When(pp)
+	}
+	atomic.Xadd(&pp.numTimers, -1)
+}
+
+// dodeltimer0 removes timer 0 from the current P's heap.
+// We are locked on the P when this is called.
+// The caller must have locked the timers for pp.
+func dodeltimer0(pp *p) {
+	if t := pp.timers[0]; t.pp.ptr() != pp {
+		throw("dodeltimer0: wrong P")
+	} else {
+		t.pp = 0
+	}
+	last := len(pp.timers) - 1
+	if last > 0 {
+		pp.timers[0] = pp.timers[last]
+	}
+	pp.timers[last] = nil
+	pp.timers = pp.timers[:last]
+	if last > 0 {
+		siftdownTimer(pp.timers, 0)
+	}
+	updateTimer0When(pp)
+	atomic.Xadd(&pp.numTimers, -1)
+}
+
+// modtimer modifies an existing timer.
+// This is called by the netpoll code or time.Ticker.Reset.
+// Reports whether the timer was modified before it was run.
+func modtimer(t *timer, when, period int64, f func(interface{}, uintptr), arg interface{}, seq uintptr) bool {
+	if when < 0 {
+		when = maxWhen
+	}
+
+	status := uint32(timerNoStatus)
+	wasRemoved := false
+	var pending bool
+	var mp *m
+loop:
+	for {
+		switch status = atomic.Load(&t.status); status {
+		case timerWaiting, timerModifiedEarlier, timerModifiedLater:
+			// Prevent preemption while the timer is in timerModifying.
+			// This could lead to a self-deadlock. See #38070.
+			mp = acquirem()
+			if atomic.Cas(&t.status, status, timerModifying) {
+				pending = true // timer not yet run
+				break loop
+			}
+			releasem(mp)
+		case timerNoStatus, timerRemoved:
+			// Prevent preemption while the timer is in timerModifying.
+			// This could lead to a self-deadlock. See #38070.
+			mp = acquirem()
+
+			// Timer was already run and t is no longer in a heap.
+			// Act like addtimer.
+			if atomic.Cas(&t.status, status, timerModifying) {
+				wasRemoved = true
+				pending = false // timer already run or stopped
+				break loop
+			}
+			releasem(mp)
+		case timerDeleted:
+			// Prevent preemption while the timer is in timerModifying.
+			// This could lead to a self-deadlock. See #38070.
+			mp = acquirem()
+			if atomic.Cas(&t.status, status, timerModifying) {
+				atomic.Xadd(&t.pp.ptr().deletedTimers, -1)
+				pending = false // timer already stopped
+				break loop
+			}
+			releasem(mp)
+		case timerRunning, timerRemoving, timerMoving:
+			// The timer is being run or moved, by a different P.
+			// Wait for it to complete.
+			osyield()
+		case timerModifying:
+			// Multiple simultaneous calls to modtimer.
+			// Wait for the other call to complete.
+			osyield()
+		default:
+			badTimer()
+		}
+	}
+
+	t.period = period
+	t.f = f
+	t.arg = arg
+	t.seq = seq
+
+	if wasRemoved {
+		t.when = when
+		pp := getg().m.p.ptr()
+		lock(&pp.timersLock)
+		doaddtimer(pp, t)
+		unlock(&pp.timersLock)
+		if !atomic.Cas(&t.status, timerModifying, timerWaiting) {
+			badTimer()
+		}
+		releasem(mp)
+		wakeNetPoller(when)
+	} else {
+		// The timer is in some other P's heap, so we can't change
+		// the when field. If we did, the other P's heap would
+		// be out of order. So we put the new when value in the
+		// nextwhen field, and let the other P set the when field
+		// when it is prepared to resort the heap.
+		t.nextwhen = when
+
+		newStatus := uint32(timerModifiedLater)
+		if when < t.when {
+			newStatus = timerModifiedEarlier
+		}
+
+		// Update the adjustTimers field.  Subtract one if we
+		// are removing a timerModifiedEarlier, add one if we
+		// are adding a timerModifiedEarlier.
+		adjust := int32(0)
+		if status == timerModifiedEarlier {
+			adjust--
+		}
+		if newStatus == timerModifiedEarlier {
+			adjust++
+		}
+		if adjust != 0 {
+			atomic.Xadd(&t.pp.ptr().adjustTimers, adjust)
+		}
+
+		// Set the new status of the timer.
+		if !atomic.Cas(&t.status, timerModifying, newStatus) {
+			badTimer()
+		}
+		releasem(mp)
+
+		// If the new status is earlier, wake up the poller.
+		if newStatus == timerModifiedEarlier {
+			wakeNetPoller(when)
+		}
+	}
+
+	return pending
+}
+
+// resettimer resets the time when a timer should fire.
+// If used for an inactive timer, the timer will become active.
+// This should be called instead of addtimer if the timer value has been,
+// or may have been, used previously.
+// Reports whether the timer was modified before it was run.
+func resettimer(t *timer, when int64) bool {
+	return modtimer(t, when, t.period, t.f, t.arg, t.seq)
+}
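
A plausible usage sketch from the user side, assuming the path suggested by the time.resetTimer linkname earlier in this file: time.(*Timer).Reset turns an inactive timer back into an active one, which is exactly the resettimer case above.

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		t := time.NewTimer(time.Hour)
		if !t.Stop() {
			<-t.C // drain if it already fired
		}
		t.Reset(10 * time.Millisecond) // inactive timer becomes active again
		<-t.C
		fmt.Println("fired after reset")
	}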
+
+// cleantimers cleans up the head of the timer queue. This speeds up
+// programs that create and delete timers; leaving them in the heap
+// slows down addtimer.
+// The caller must have locked the timers for pp.
+func cleantimers(pp *p) {
+	gp := getg()
+	for {
+		if len(pp.timers) == 0 {
+			return
+		}
+
+		// This loop can theoretically run for a while, and because
+		// it is holding timersLock it cannot be preempted.
+		// If someone is trying to preempt us, just return.
+		// We can clean the timers later.
+		if gp.preemptStop {
+			return
+		}
+
+		t := pp.timers[0]
+		if t.pp.ptr() != pp {
+			throw("cleantimers: bad p")
+		}
+		switch s := atomic.Load(&t.status); s {
+		case timerDeleted:
+			if !atomic.Cas(&t.status, s, timerRemoving) {
+				continue
+			}
+			dodeltimer0(pp)
+			if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
 				badTimer()
 			}
-			if raceenabled {
-				raceacquire(unsafe.Pointer(t))
+			atomic.Xadd(&pp.deletedTimers, -1)
+		case timerModifiedEarlier, timerModifiedLater:
+			if !atomic.Cas(&t.status, s, timerMoving) {
+				continue
 			}
-			f(arg, seq)
-			lock(&tb.lock)
+			// Now we can change the when field.
+			t.when = t.nextwhen
+			// Move t to the right position.
+			dodeltimer0(pp)
+			doaddtimer(pp, t)
+			if s == timerModifiedEarlier {
+				atomic.Xadd(&pp.adjustTimers, -1)
+			}
+			if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+				badTimer()
+			}
+		default:
+			// Head of timers does not need adjustment.
+			return
 		}
-		if delta < 0 || faketime > 0 {
-			// No timers left - put goroutine to sleep.
-			tb.rescheduling = true
-			goparkunlock(&tb.lock, waitReasonTimerGoroutineIdle, traceEvGoBlock, 1)
+	}
+}
+
+// moveTimers moves a slice of timers to pp. The slice has been taken
+// from a different P.
+// This is currently called when the world is stopped, but the caller
+// is expected to have locked the timers for pp.
+func moveTimers(pp *p, timers []*timer) {
+	for _, t := range timers {
+	loop:
+		for {
+			switch s := atomic.Load(&t.status); s {
+			case timerWaiting:
+				t.pp = 0
+				doaddtimer(pp, t)
+				break loop
+			case timerModifiedEarlier, timerModifiedLater:
+				if !atomic.Cas(&t.status, s, timerMoving) {
+					continue
+				}
+				t.when = t.nextwhen
+				t.pp = 0
+				doaddtimer(pp, t)
+				if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+					badTimer()
+				}
+				break loop
+			case timerDeleted:
+				if !atomic.Cas(&t.status, s, timerRemoved) {
+					continue
+				}
+				t.pp = 0
+				// We no longer need this timer in the heap.
+				break loop
+			case timerModifying:
+				// Loop until the modification is complete.
+				osyield()
+			case timerNoStatus, timerRemoved:
+				// We should not see these status values in a timers heap.
+				badTimer()
+			case timerRunning, timerRemoving, timerMoving:
+				// Some other P thinks it owns this timer,
+				// which should not happen.
+				badTimer()
+			default:
+				badTimer()
+			}
+		}
+	}
+}
+
+// adjusttimers looks through the timers in the current P's heap for
+// any timers that have been modified to run earlier, and puts them in
+// the correct place in the heap. While looking for those timers,
+// it also moves timers that have been modified to run later,
+// and removes deleted timers. The caller must have locked the timers for pp.
+func adjusttimers(pp *p) {
+	if len(pp.timers) == 0 {
+		return
+	}
+	if atomic.Load(&pp.adjustTimers) == 0 {
+		if verifyTimers {
+			verifyTimerHeap(pp)
+		}
+		return
+	}
+	var moved []*timer
+loop:
+	for i := 0; i < len(pp.timers); i++ {
+		t := pp.timers[i]
+		if t.pp.ptr() != pp {
+			throw("adjusttimers: bad p")
+		}
+		switch s := atomic.Load(&t.status); s {
+		case timerDeleted:
+			if atomic.Cas(&t.status, s, timerRemoving) {
+				dodeltimer(pp, i)
+				if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+					badTimer()
+				}
+				atomic.Xadd(&pp.deletedTimers, -1)
+				// Look at this heap position again.
+				i--
+			}
+		case timerModifiedEarlier, timerModifiedLater:
+			if atomic.Cas(&t.status, s, timerMoving) {
+				// Now we can change the when field.
+				t.when = t.nextwhen
+				// Take t off the heap, and hold onto it.
+				// We don't add it back yet because the
+				// heap manipulation could cause our
+				// loop to skip some other timer.
+				dodeltimer(pp, i)
+				moved = append(moved, t)
+				if s == timerModifiedEarlier {
+					if n := atomic.Xadd(&pp.adjustTimers, -1); int32(n) <= 0 {
+						break loop
+					}
+				}
+				// Look at this heap position again.
+				i--
+			}
+		case timerNoStatus, timerRunning, timerRemoving, timerRemoved, timerMoving:
+			badTimer()
+		case timerWaiting:
+			// OK, nothing to do.
+		case timerModifying:
+			// Check again after modification is complete.
+			osyield()
+			i--
+		default:
+			badTimer()
+		}
+	}
+
+	if len(moved) > 0 {
+		addAdjustedTimers(pp, moved)
+	}
+
+	if verifyTimers {
+		verifyTimerHeap(pp)
+	}
+}
+
+// addAdjustedTimers adds any timers we adjusted in adjusttimers
+// back to the timer heap.
+func addAdjustedTimers(pp *p, moved []*timer) {
+	for _, t := range moved {
+		doaddtimer(pp, t)
+		if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+			badTimer()
+		}
+	}
+}
+
+// nobarrierWakeTime looks at P's timers and returns the time when we
+// should wake up the netpoller. It returns 0 if there are no timers.
+// This function is invoked when dropping a P, and must run without
+// any write barriers. Therefore, if there are any timers that need
+// to be moved earlier, it conservatively returns the current time.
+// The netpoller M will wake up and adjust timers before sleeping again.
+//go:nowritebarrierrec
+func nobarrierWakeTime(pp *p) int64 {
+	if atomic.Load(&pp.adjustTimers) > 0 {
+		return nanotime()
+	} else {
+		return int64(atomic.Load64(&pp.timer0When))
+	}
+}
+
+// runtimer examines the first timer in timers. If it is ready based on now,
+// it runs the timer and removes or updates it.
+// Returns 0 if it ran a timer, -1 if there are no more timers, or the time
+// when the first timer should run.
+// The caller must have locked the timers for pp.
+// If a timer is run, this will temporarily unlock the timers.
+//go:systemstack
+func runtimer(pp *p, now int64) int64 {
+	for {
+		t := pp.timers[0]
+		if t.pp.ptr() != pp {
+			throw("runtimer: bad p")
+		}
+		switch s := atomic.Load(&t.status); s {
+		case timerWaiting:
+			if t.when > now {
+				// Not ready to run.
+				return t.when
+			}
+
+			if !atomic.Cas(&t.status, s, timerRunning) {
+				continue
+			}
+			// Note that runOneTimer may temporarily unlock
+			// pp.timersLock.
+			runOneTimer(pp, t, now)
+			return 0
+
+		case timerDeleted:
+			if !atomic.Cas(&t.status, s, timerRemoving) {
+				continue
+			}
+			dodeltimer0(pp)
+			if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+				badTimer()
+			}
+			atomic.Xadd(&pp.deletedTimers, -1)
+			if len(pp.timers) == 0 {
+				return -1
+			}
+
+		case timerModifiedEarlier, timerModifiedLater:
+			if !atomic.Cas(&t.status, s, timerMoving) {
+				continue
+			}
+			t.when = t.nextwhen
+			dodeltimer0(pp)
+			doaddtimer(pp, t)
+			if s == timerModifiedEarlier {
+				atomic.Xadd(&pp.adjustTimers, -1)
+			}
+			if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+				badTimer()
+			}
+
+		case timerModifying:
+			// Wait for modification to complete.
+			osyield()
+
+		case timerNoStatus, timerRemoved:
+			// Should not see a new or inactive timer on the heap.
+			badTimer()
+		case timerRunning, timerRemoving, timerMoving:
+			// These should only be set when timers are locked,
+			// and we didn't do it.
+			badTimer()
+		default:
+			badTimer()
+		}
+	}
+}
+
+// runOneTimer runs a single timer.
+// The caller must have locked the timers for pp.
+// This will temporarily unlock the timers while running the timer function.
+//go:systemstack
+func runOneTimer(pp *p, t *timer, now int64) {
+	if raceenabled {
+		ppcur := getg().m.p.ptr()
+		if ppcur.timerRaceCtx == 0 {
+			ppcur.timerRaceCtx = racegostart(funcPC(runtimer) + sys.PCQuantum)
+		}
+		raceacquirectx(ppcur.timerRaceCtx, unsafe.Pointer(t))
+	}
+
+	f := t.f
+	arg := t.arg
+	seq := t.seq
+
+	if t.period > 0 {
+		// Leave in heap but adjust next time to fire.
+		delta := t.when - now
+		t.when += t.period * (1 + -delta/t.period)
+		siftdownTimer(pp.timers, 0)
+		if !atomic.Cas(&t.status, timerRunning, timerWaiting) {
+			badTimer()
+		}
+		updateTimer0When(pp)
+	} else {
+		// Remove from heap.
+		dodeltimer0(pp)
+		if !atomic.Cas(&t.status, timerRunning, timerNoStatus) {
+			badTimer()
+		}
+	}
+
+	if raceenabled {
+		// Temporarily use the current P's racectx for g0.
+		gp := getg()
+		if gp.racectx != 0 {
+			throw("runOneTimer: unexpected racectx")
+		}
+		gp.racectx = gp.m.p.ptr().timerRaceCtx
+	}
+
+	unlock(&pp.timersLock)
+
+	f(arg, seq)
+
+	lock(&pp.timersLock)
+
+	if raceenabled {
+		gp := getg()
+		gp.racectx = 0
+	}
+}
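
A worked example of the periodic rescheduling arithmetic above: t.when += t.period * (1 + -delta/t.period) advances the deadline past now by a whole number of periods even when the timer fires late. The nextWhen helper here is a stand-alone illustration, not runtime code.

	package main

	import "fmt"

	func nextWhen(when, period, now int64) int64 {
		delta := when - now // <= 0 when the timer is due or late
		return when + period*(1+-delta/period)
	}

	func main() {
		// Scheduled at 100, period 100, but not run until 350:
		// delta = -250, -delta/period = 2, so when advances by 3 periods to 400.
		fmt.Println(nextWhen(100, 100, 350)) // 400
	}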
+
+// clearDeletedTimers removes all deleted timers from the P's timer heap.
+// This is used to avoid clogging up the heap if the program
+// starts a lot of long-running timers and then stops them.
+// For example, this can happen via context.WithTimeout.
+//
+// This is the only function that walks through the entire timer heap,
+// other than moveTimers which only runs when the world is stopped.
+//
+// The caller must have locked the timers for pp.
+func clearDeletedTimers(pp *p) {
+	cdel := int32(0)
+	cearlier := int32(0)
+	to := 0
+	changedHeap := false
+	timers := pp.timers
+nextTimer:
+	for _, t := range timers {
+		for {
+			switch s := atomic.Load(&t.status); s {
+			case timerWaiting:
+				if changedHeap {
+					timers[to] = t
+					siftupTimer(timers, to)
+				}
+				to++
+				continue nextTimer
+			case timerModifiedEarlier, timerModifiedLater:
+				if atomic.Cas(&t.status, s, timerMoving) {
+					t.when = t.nextwhen
+					timers[to] = t
+					siftupTimer(timers, to)
+					to++
+					changedHeap = true
+					if !atomic.Cas(&t.status, timerMoving, timerWaiting) {
+						badTimer()
+					}
+					if s == timerModifiedEarlier {
+						cearlier++
+					}
+					continue nextTimer
+				}
+			case timerDeleted:
+				if atomic.Cas(&t.status, s, timerRemoving) {
+					t.pp = 0
+					cdel++
+					if !atomic.Cas(&t.status, timerRemoving, timerRemoved) {
+						badTimer()
+					}
+					changedHeap = true
+					continue nextTimer
+				}
+			case timerModifying:
+				// Loop until modification complete.
+				osyield()
+			case timerNoStatus, timerRemoved:
+				// We should not see these status values in a timer heap.
+				badTimer()
+			case timerRunning, timerRemoving, timerMoving:
+				// Some other P thinks it owns this timer,
+				// which should not happen.
+				badTimer()
+			default:
+				badTimer()
+			}
+		}
+	}
+
+	// Set remaining slots in timers slice to nil,
+	// so that the timer values can be garbage collected.
+	for i := to; i < len(timers); i++ {
+		timers[i] = nil
+	}
+
+	atomic.Xadd(&pp.deletedTimers, -cdel)
+	atomic.Xadd(&pp.numTimers, -cdel)
+	atomic.Xadd(&pp.adjustTimers, -cearlier)
+
+	timers = timers[:to]
+	pp.timers = timers
+	updateTimer0When(pp)
+
+	if verifyTimers {
+		verifyTimerHeap(pp)
+	}
+}
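
An illustrative (not prescriptive) sketch of the workload this sweep targets: many long-duration timers started and then stopped, here via context.WithTimeout, where each stop only marks the timer deleted until clearDeletedTimers or the normal heap maintenance removes it.

	package main

	import (
		"context"
		"time"
	)

	func main() {
		for i := 0; i < 1000; i++ {
			// Each WithTimeout starts a long-running timer under the hood.
			_, cancel := context.WithTimeout(context.Background(), time.Hour)
			cancel() // stops the timer; it lingers in the heap as deleted
		}
		time.Sleep(10 * time.Millisecond)
	}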
+
+// verifyTimerHeap verifies that the timer heap is in a valid state.
+// This is only for debugging, and is only called if verifyTimers is true.
+// The caller must have locked the timers.
+func verifyTimerHeap(pp *p) {
+	for i, t := range pp.timers {
+		if i == 0 {
+			// First timer has no parent.
 			continue
 		}
-		// At least one timer pending. Sleep until then.
-		tb.sleeping = true
-		tb.sleepUntil = now + delta
-		noteclear(&tb.waitnote)
-		unlock(&tb.lock)
-		notetsleepg(&tb.waitnote, delta)
+
+		// The heap is 4-ary. See siftupTimer and siftdownTimer.
+		p := (i - 1) / 4
+		if t.when < pp.timers[p].when {
+			print("bad timer heap at ", i, ": ", p, ": ", pp.timers[p].when, ", ", i, ": ", t.when, "\n")
+			throw("bad timer heap")
+		}
+	}
+	if numTimers := int(atomic.Load(&pp.numTimers)); len(pp.timers) != numTimers {
+		println("timer heap len", len(pp.timers), "!= numTimers", numTimers)
+		throw("bad timer heap len")
 	}
 }
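
A small sketch of the 4-ary heap indexing this check relies on (and that siftupTimer/siftdownTimer maintain): the parent of slot i is (i-1)/4 and its children are 4i+1 through 4i+4. The helpers below are illustrative only.

	package main

	import "fmt"

	func parent(i int) int { return (i - 1) / 4 }

	func children(i int) []int {
		return []int{4*i + 1, 4*i + 2, 4*i + 3, 4*i + 4}
	}

	func main() {
		fmt.Println(parent(7))   // 1
		fmt.Println(children(1)) // [5 6 7 8]
	}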
 
-func timejump() *g {
-	if faketime == 0 {
-		return nil
+// updateTimer0When sets the P's timer0When field.
+// The caller must have locked the timers for pp.
+func updateTimer0When(pp *p) {
+	if len(pp.timers) == 0 {
+		atomic.Store64(&pp.timer0When, 0)
+	} else {
+		atomic.Store64(&pp.timer0When, uint64(pp.timers[0].when))
 	}
-
-	for i := range timers {
-		lock(&timers[i].lock)
-	}
-	gp := timejumpLocked()
-	for i := range timers {
-		unlock(&timers[i].lock)
-	}
-
-	return gp
 }
 
-func timejumpLocked() *g {
-	// Determine a timer bucket with minimum when.
-	var minT *timer
-	for i := range timers {
-		tb := &timers[i]
-		if !tb.created || len(tb.t) == 0 {
+// timeSleepUntil returns the time when the next timer should fire,
+// and the P that holds the timer heap that that timer is on.
+// This is only called by sysmon and checkdead.
+func timeSleepUntil() (int64, *p) {
+	next := int64(maxWhen)
+	var pret *p
+
+	// Prevent allp slice changes. This is like retake.
+	lock(&allpLock)
+	for _, pp := range allp {
+		if pp == nil {
+			// This can happen if procresize has grown
+			// allp but not yet created new Ps.
 			continue
 		}
-		t := tb.t[0]
-		if minT == nil || t.when < minT.when {
-			minT = t
+
+		c := atomic.Load(&pp.adjustTimers)
+		if c == 0 {
+			w := int64(atomic.Load64(&pp.timer0When))
+			if w != 0 && w < next {
+				next = w
+				pret = pp
+			}
+			continue
 		}
-	}
-	if minT == nil || minT.when <= faketime {
-		return nil
-	}
 
-	faketime = minT.when
-	tb := minT.tb
-	if !tb.rescheduling {
-		return nil
-	}
-	tb.rescheduling = false
-	return tb.gp
-}
-
-func timeSleepUntil() int64 {
-	next := int64(1<<63 - 1)
-
-	// Determine minimum sleepUntil across all the timer buckets.
-	//
-	// The function can not return a precise answer,
-	// as another timer may pop in as soon as timers have been unlocked.
-	// So lock the timers one by one instead of all at once.
-	for i := range timers {
-		tb := &timers[i]
-
-		lock(&tb.lock)
-		if tb.sleeping && tb.sleepUntil < next {
-			next = tb.sleepUntil
+		lock(&pp.timersLock)
+		for _, t := range pp.timers {
+			switch s := atomic.Load(&t.status); s {
+			case timerWaiting:
+				if t.when < next {
+					next = t.when
+				}
+			case timerModifiedEarlier, timerModifiedLater:
+				if t.nextwhen < next {
+					next = t.nextwhen
+				}
+				if s == timerModifiedEarlier {
+					c--
+				}
+			}
+			// The timers are sorted, so we only have to check
+			// the first timer for each P, unless there are
+			// some timerModifiedEarlier timers. The number
+			// of timerModifiedEarlier timers is in the adjustTimers
+			// field, used to initialize c, above.
+			//
+			// We don't worry about cases like timerModifying.
+			// New timers can show up at any time,
+			// so this function is necessarily imprecise.
+			// Do a signed check here since we aren't
+			// synchronizing the read of pp.adjustTimers
+			// with the check of a timer status.
+			if int32(c) <= 0 {
+				break
+			}
 		}
-		unlock(&tb.lock)
+		unlock(&pp.timersLock)
 	}
+	unlock(&allpLock)
 
-	return next
+	return next, pret
 }
 
 // Heap maintenance algorithms.
@@ -382,13 +1048,10 @@
 // it will cause the program to crash with a mysterious
 // "panic holding locks" message. Instead, we panic while not
 // holding a lock.
-// The races can occur despite the bucket locks because assignBucket
-// itself is called without locks, so racy calls can cause a timer to
-// change buckets while executing these functions.
 
-func siftupTimer(t []*timer, i int) bool {
+func siftupTimer(t []*timer, i int) {
 	if i >= len(t) {
-		return false
+		badTimer()
 	}
 	when := t[i].when
 	tmp := t[i]
@@ -398,20 +1061,17 @@
 			break
 		}
 		t[i] = t[p]
-		t[i].i = i
 		i = p
 	}
 	if tmp != t[i] {
 		t[i] = tmp
-		t[i].i = i
 	}
-	return true
 }
 
-func siftdownTimer(t []*timer, i int) bool {
+func siftdownTimer(t []*timer, i int) {
 	n := len(t)
 	if i >= n {
-		return false
+		badTimer()
 	}
 	when := t[i].when
 	tmp := t[i]
@@ -441,14 +1101,11 @@
 			break
 		}
 		t[i] = t[c]
-		t[i].i = i
 		i = c
 	}
 	if tmp != t[i] {
 		t[i] = tmp
-		t[i].i = i
 	}
-	return true
 }
 
 // badTimer is called if the timer data structures have been corrupted,
@@ -456,5 +1113,5 @@
 // panicking due to invalid slice access while holding locks.
 // See issue #25686.
 func badTimer() {
-	panic(errorString("racy use of timers"))
+	throw("timer data corruption")
 }
diff --git a/src/runtime/time_fake.go b/src/runtime/time_fake.go
new file mode 100644
index 0000000..c64d299
--- /dev/null
+++ b/src/runtime/time_fake.go
@@ -0,0 +1,100 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build faketime
+// +build !windows
+
+// Faketime isn't currently supported on Windows. This would require:
+//
+// 1. Shadowing time_now, which is implemented in assembly on Windows.
+//    Since that's exported directly to the time package from runtime
+//    assembly, this would involve moving it from sys_windows_*.s into
+//    its own assembly files build-tagged with !faketime and using the
+//    implementation of time_now from timestub.go in faketime mode.
+//
+// 2. Modifying syscall.Write to call syscall.faketimeWrite,
+//    translating the Stdout and Stderr handles into FDs 1 and 2.
+//    (See CL 192739 PS 3.)
+
+package runtime
+
+import "unsafe"
+
+// faketime is the simulated time in nanoseconds since 1970 for the
+// playground.
+var faketime int64 = 1257894000000000000
+
+var faketimeState struct {
+	lock mutex
+
+	// lastfaketime is the last faketime value written to fd 1 or 2.
+	lastfaketime int64
+
+	// lastfd is the fd to which lastfaketime was written.
+	//
+	// Subsequent writes to the same fd may use the same
+	// timestamp, but the timestamp must increase if the fd
+	// changes.
+	lastfd uintptr
+}
+
+//go:nosplit
+func nanotime() int64 {
+	return faketime
+}
+
+func walltime() (sec int64, nsec int32) {
+	return faketime / 1000000000, int32(faketime % 1000000000)
+}
+
+func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
+	if !(fd == 1 || fd == 2) {
+		// Do an ordinary write.
+		return write1(fd, p, n)
+	}
+
+	// Write with the playback header.
+
+	// First, lock to avoid interleaving writes.
+	lock(&faketimeState.lock)
+
+	// If the current fd doesn't match the fd of the previous write,
+	// ensure that the timestamp is strictly greater. That way, we can
+	// recover the original order even if we read the fds separately.
+	t := faketimeState.lastfaketime
+	if fd != faketimeState.lastfd {
+		t++
+		faketimeState.lastfd = fd
+	}
+	if faketime > t {
+		t = faketime
+	}
+	faketimeState.lastfaketime = t
+
+	// Playback header: 0 0 P B <8-byte time> <4-byte data length> (big endian)
+	var buf [4 + 8 + 4]byte
+	buf[2] = 'P'
+	buf[3] = 'B'
+	tu := uint64(t)
+	buf[4] = byte(tu >> (7 * 8))
+	buf[5] = byte(tu >> (6 * 8))
+	buf[6] = byte(tu >> (5 * 8))
+	buf[7] = byte(tu >> (4 * 8))
+	buf[8] = byte(tu >> (3 * 8))
+	buf[9] = byte(tu >> (2 * 8))
+	buf[10] = byte(tu >> (1 * 8))
+	buf[11] = byte(tu >> (0 * 8))
+	nu := uint32(n)
+	buf[12] = byte(nu >> (3 * 8))
+	buf[13] = byte(nu >> (2 * 8))
+	buf[14] = byte(nu >> (1 * 8))
+	buf[15] = byte(nu >> (0 * 8))
+	write1(fd, unsafe.Pointer(&buf[0]), int32(len(buf)))
+
+	// Write actual data.
+	res := write1(fd, p, n)
+
+	unlock(&faketimeState.lock)
+	return res
+}
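
An equivalent sketch of the framing written above, using encoding/binary instead of manual shifts: two zero bytes, the magic "PB", an 8-byte big-endian timestamp, and a 4-byte big-endian payload length, matching what parseFakeTime in time_test.go expects. The frameHeader helper is illustrative, not runtime code.

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	func frameHeader(t uint64, n uint32) []byte {
		var buf [4 + 8 + 4]byte
		buf[2] = 'P'
		buf[3] = 'B'
		binary.BigEndian.PutUint64(buf[4:], t)
		binary.BigEndian.PutUint32(buf[12:], n)
		return buf[:]
	}

	func main() {
		hdr := frameHeader(1257894000000000000, 7)
		fmt.Printf("% x\n", hdr)
	}
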
diff --git a/src/runtime/time_nofake.go b/src/runtime/time_nofake.go
new file mode 100644
index 0000000..1912a94
--- /dev/null
+++ b/src/runtime/time_nofake.go
@@ -0,0 +1,31 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !faketime
+
+package runtime
+
+import "unsafe"
+
+// faketime is the simulated time in nanoseconds since 1970 for the
+// playground.
+//
+// Zero means not to use faketime.
+var faketime int64
+
+//go:nosplit
+func nanotime() int64 {
+	return nanotime1()
+}
+
+func walltime() (sec int64, nsec int32) {
+	return walltime1()
+}
+
+// write must be nosplit on Windows (see write1)
+//
+//go:nosplit
+func write(fd uintptr, p unsafe.Pointer, n int32) int32 {
+	return write1(fd, p, n)
+}
diff --git a/src/runtime/time_test.go b/src/runtime/time_test.go
new file mode 100644
index 0000000..bf29561
--- /dev/null
+++ b/src/runtime/time_test.go
@@ -0,0 +1,93 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"internal/testenv"
+	"os/exec"
+	"reflect"
+	"runtime"
+	"testing"
+)
+
+func TestFakeTime(t *testing.T) {
+	if runtime.GOOS == "windows" {
+		t.Skip("faketime not supported on windows")
+	}
+
+	t.Parallel()
+
+	exe, err := buildTestProg(t, "testfaketime", "-tags=faketime")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var stdout, stderr bytes.Buffer
+	cmd := exec.Command(exe)
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+
+	err = testenv.CleanCmdEnv(cmd).Run()
+	if err != nil {
+		t.Fatalf("exit status: %v\n%s", err, stderr.String())
+	}
+
+	t.Logf("raw stdout: %q", stdout.String())
+	t.Logf("raw stderr: %q", stdout.String())
+
+	f1, err1 := parseFakeTime(stdout.Bytes())
+	if err1 != nil {
+		t.Fatal(err1)
+	}
+	f2, err2 := parseFakeTime(stderr.Bytes())
+	if err2 != nil {
+		t.Fatal(err2)
+	}
+
+	const time0 = 1257894000000000000
+	got := [][]fakeTimeFrame{f1, f2}
+	var want = [][]fakeTimeFrame{{
+		{time0 + 1, "line 2\n"},
+		{time0 + 1, "line 3\n"},
+		{time0 + 1e9, "line 5\n"},
+		{time0 + 1e9, "2009-11-10T23:00:01Z"},
+	}, {
+		{time0, "line 1\n"},
+		{time0 + 2, "line 4\n"},
+	}}
+	if !reflect.DeepEqual(want, got) {
+		t.Fatalf("want %v, got %v", want, got)
+	}
+}
+
+type fakeTimeFrame struct {
+	time uint64
+	data string
+}
+
+func parseFakeTime(x []byte) ([]fakeTimeFrame, error) {
+	var frames []fakeTimeFrame
+	for len(x) != 0 {
+		if len(x) < 4+8+4 {
+			return nil, errors.New("truncated header")
+		}
+		const magic = "\x00\x00PB"
+		if string(x[:len(magic)]) != magic {
+			return nil, errors.New("bad magic")
+		}
+		x = x[len(magic):]
+		time := binary.BigEndian.Uint64(x)
+		x = x[8:]
+		dlen := binary.BigEndian.Uint32(x)
+		x = x[4:]
+		data := string(x[:dlen])
+		x = x[dlen:]
+		frames = append(frames, fakeTimeFrame{time, data})
+	}
+	return frames, nil
+}
diff --git a/src/runtime/timestub2.go b/src/runtime/timestub2.go
index 00c2c55..6d73aab 100644
--- a/src/runtime/timestub2.go
+++ b/src/runtime/timestub2.go
@@ -6,7 +6,8 @@
 // +build !windows
 // +build !freebsd
 // +build !aix
+// +build !solaris
 
 package runtime
 
-func walltime() (sec int64, nsec int32)
+func walltime1() (sec int64, nsec int32)
diff --git a/src/runtime/tls_arm.s b/src/runtime/tls_arm.s
index 350089a..e42de8d 100644
--- a/src/runtime/tls_arm.s
+++ b/src/runtime/tls_arm.s
@@ -17,14 +17,11 @@
 // Note: both functions will clobber R0 and R11 and
 // can be called from 5c ABI code.
 
-// On android and darwin, runtime.tls_g is a normal variable.
+// On android, runtime.tls_g is a normal variable.
 // TLS offset is computed in x_cgo_inittls.
 #ifdef GOOS_android
 #define TLSG_IS_VARIABLE
 #endif
-#ifdef GOOS_darwin
-#define TLSG_IS_VARIABLE
-#endif
 
 // save_g saves the g register into pthread-provided
 // thread-local memory, so that we can call externally compiled
@@ -33,11 +30,6 @@
 //       runtime.mcall assumes this function only clobbers R0 and R11.
 // Returns with g in R0.
 TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0
-#ifdef GOOS_nacl
-	// nothing to do as nacl/arm does not use TLS at all.
-	MOVW	g, R0 // preserve R0 across call to setg<>
-	RET
-#else
 	// If the host does not support MRC the linker will replace it with
 	// a call to runtime.read_tls_fallback which jumps to __kuser_get_tls.
 	// The replacement function saves LR in R11 over the call to read_tls_fallback.
@@ -48,16 +40,11 @@
 	MOVW	g, 0(R0)
 	MOVW	g, R0 // preserve R0 across call to setg<>
 	RET
-#endif
 
 // load_g loads the g register from pthread-provided
 // thread-local memory, for use after calling externally compiled
 // ARM code that overwrote those registers.
 TEXT runtime·load_g(SB),NOSPLIT,$0
-#ifdef GOOS_nacl
-	// nothing to do as nacl/arm does not use TLS at all.
-	RET
-#else
 	// See save_g
 	MRC	15, 0, R0, C13, C0, 3 // fetch TLS base pointer
 	BIC $3, R0 // Darwin/ARM might return unaligned pointer
@@ -65,7 +52,6 @@
 	ADD	R11, R0
 	MOVW	0(R0), g
 	RET
-#endif
 
 // This is called from rt0_go, which runs on the system stack
 // using the initial stack allocated by the OS.
@@ -78,7 +64,6 @@
 // Declare a dummy word ($4, not $0) to make sure the
 // frame is 8 bytes and stays 8-byte-aligned.
 TEXT runtime·_initcgo(SB),NOSPLIT,$4
-#ifndef GOOS_nacl
 	// if there is an _cgo_init, call it.
 	MOVW	_cgo_init(SB), R4
 	CMP	$0, R4
@@ -93,7 +78,6 @@
 	MOVW	$setg_gcc<>(SB), R1 	// arg 1: setg
 	MOVW	g, R0 			// arg 0: G
 	BL	(R4) // will clobber R0-R3
-#endif
 nocgo:
 	RET
 
diff --git a/src/runtime/tls_arm64.h b/src/runtime/tls_arm64.h
index 27f517c..f60f4f6 100644
--- a/src/runtime/tls_arm64.h
+++ b/src/runtime/tls_arm64.h
@@ -20,6 +20,11 @@
 #define MRS_TPIDR_R0 WORD $0xd53bd060 // MRS TPIDRRO_EL0, R0
 #endif
 
+#ifdef GOOS_freebsd
+#define TPIDR TPIDR_EL0
+#define MRS_TPIDR_R0 WORD $0xd53bd040 // MRS TPIDR_EL0, R0
+#endif
+
 #ifdef GOOS_netbsd
 #define TPIDR TPIDRRO_EL0
 #define MRS_TPIDR_R0 WORD $0xd53bd040 // MRS TPIDRRO_EL0, R0
diff --git a/src/runtime/tls_arm64.s b/src/runtime/tls_arm64.s
index fb8627d..999914d 100644
--- a/src/runtime/tls_arm64.s
+++ b/src/runtime/tls_arm64.s
@@ -10,8 +10,7 @@
 
 TEXT runtime·load_g(SB),NOSPLIT,$0
 	MOVB	runtime·iscgo(SB), R0
-	CMP	$0, R0
-	BEQ	nocgo
+	CBZ	R0, nocgo
 
 	MRS_TPIDR_R0
 #ifdef GOOS_darwin
@@ -27,8 +26,7 @@
 
 TEXT runtime·save_g(SB),NOSPLIT,$0
 	MOVB	runtime·iscgo(SB), R0
-	CMP	$0, R0
-	BEQ	nocgo
+	CBZ	R0, nocgo
 
 	MRS_TPIDR_R0
 #ifdef GOOS_darwin
diff --git a/src/runtime/tls_riscv64.s b/src/runtime/tls_riscv64.s
new file mode 100644
index 0000000..8386980
--- /dev/null
+++ b/src/runtime/tls_riscv64.s
@@ -0,0 +1,18 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// If !iscgo, this is a no-op.
+//
+// NOTE: mcall() assumes this clobbers only R23 (REGTMP).
+// FIXME: cgo
+TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
+	RET
+
+TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
+	RET
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 08e92d2..169b650 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -54,7 +54,7 @@
 	traceEvGoInSyscall       = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
 	traceEvHeapAlloc         = 33 // memstats.heap_live change [timestamp, heap_alloc]
 	traceEvNextGC            = 34 // memstats.next_gc change [timestamp, next_gc]
-	traceEvTimerGoroutine    = 35 // denotes timer goroutine [timer goroutine id]
+	traceEvTimerGoroutine    = 35 // not currently used; previously denoted timer goroutine [timer goroutine id]
 	traceEvFutileWakeup      = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
 	traceEvString            = 37 // string dictionary entry [ID, length, string]
 	traceEvGoStartLocal      = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
@@ -84,7 +84,7 @@
 	// and ppc64le.
 	// Tracing won't work reliably for architectures where cputicks is emulated
 	// by nanotime, so the value doesn't matter for those architectures.
-	traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64|sys.GoarchAmd64p32)
+	traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64)
 	// Maximum number of PCs in a single stack trace.
 	// Since events contain only stack id rather than whole stack trace,
 	// we can allow quite large values here.
@@ -180,9 +180,15 @@
 // Most clients should use the runtime/trace package or the testing package's
 // -test.trace flag instead of calling StartTrace directly.
 func StartTrace() error {
-	// Stop the world, so that we can take a consistent snapshot
+	// Stop the world so that we can take a consistent snapshot
 	// of all goroutines at the beginning of the trace.
-	stopTheWorld("start tracing")
+	// Do not stop the world during GC so we ensure we always see
+	// a consistent view of GC-related events (e.g. a start is always
+	// paired with an end).
+	stopTheWorldGC("start tracing")
+
+	// Prevent sysmon from running any code that could generate events.
+	lock(&sched.sysmonlock)
 
 	// We are in stop-the-world, but syscalls can finish and write to trace concurrently.
 	// Exitsyscall could check trace.enabled long before and then suddenly wake up
@@ -193,7 +199,8 @@
 
 	if trace.enabled || trace.shutdown {
 		unlock(&trace.bufLock)
-		startTheWorld()
+		unlock(&sched.sysmonlock)
+		startTheWorldGC()
 		return errorString("tracing is already enabled")
 	}
 
@@ -264,7 +271,9 @@
 
 	unlock(&trace.bufLock)
 
-	startTheWorld()
+	unlock(&sched.sysmonlock)
+
+	startTheWorldGC()
 	return nil
 }
 
@@ -273,14 +282,18 @@
 func StopTrace() {
 	// Stop the world so that we can collect the trace buffers from all p's below,
 	// and also to avoid races with traceEvent.
-	stopTheWorld("stop tracing")
+	stopTheWorldGC("stop tracing")
+
+	// See the comment in StartTrace.
+	lock(&sched.sysmonlock)
 
 	// See the comment in StartTrace.
 	lock(&trace.bufLock)
 
 	if !trace.enabled {
 		unlock(&trace.bufLock)
-		startTheWorld()
+		unlock(&sched.sysmonlock)
+		startTheWorldGC()
 		return
 	}
 
@@ -317,7 +330,9 @@
 	trace.shutdown = true
 	unlock(&trace.bufLock)
 
-	startTheWorld()
+	unlock(&sched.sysmonlock)
+
+	startTheWorldGC()
 
 	// The world is started but we've set trace.shutdown, so new tracing can't start.
 	// Wait for the trace reader to flush pending buffers and stop.
@@ -413,13 +428,6 @@
 		var data []byte
 		data = append(data, traceEvFrequency|0<<traceArgCountShift)
 		data = traceAppend(data, uint64(freq))
-		for i := range timers {
-			tb := &timers[i]
-			if tb.gp != nil {
-				data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift)
-				data = traceAppend(data, uint64(tb.gp.goid))
-			}
-		}
 		// This will emit a bunch of full buffers, we will pick them up
 		// on the next iteration.
 		trace.stackTab.dump()
@@ -873,6 +881,7 @@
 
 	tab.mem.drop()
 	*tab = traceStackTable{}
+	lockInit(&((*tab).lock), lockRankTraceStackTab)
 }
 
 type traceFrame struct {
@@ -929,7 +938,7 @@
 
 // alloc allocates n-byte block.
 func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer {
-	n = round(n, sys.PtrSize)
+	n = alignUp(n, sys.PtrSize)
 	if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) {
 		if n > uintptr(len(a.head.ptr().data)) {
 			throw("trace: alloc too large")
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
index 62c06e6..cfc0419 100644
--- a/src/runtime/trace/trace_stack_test.go
+++ b/src/runtime/trace/trace_stack_test.go
@@ -233,6 +233,7 @@
 		}},
 		{trace.EvGomaxprocs, []frame{
 			{"runtime.startTheWorld", 0}, // this is when the current gomaxprocs is logged.
+			{"runtime.startTheWorldGC", 0},
 			{"runtime.GOMAXPROCS", 0},
 			{"runtime/trace_test.TestTraceSymbolize", 0},
 			{"testing.tRunner", 0},
@@ -251,6 +252,7 @@
 			{trace.EvGoSysCall, []frame{
 				{"syscall.read", 0},
 				{"syscall.Read", 0},
+				{"internal/poll.ignoringEINTR", 0},
 				{"internal/poll.(*FD).Read", 0},
 				{"os.(*File).read", 0},
 				{"os.(*File).Read", 0},
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index ef48c9f..944c847 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -26,8 +26,8 @@
 // takes up only 4 bytes on the stack, while on 64-bit systems it takes up 8 bytes.
 // Typically this is ptrSize.
 //
-// As an exception, amd64p32 has ptrSize == 4 but the CALL instruction still
-// stores an 8-byte return PC onto the stack. To accommodate this, we use regSize
+// As an exception, amd64p32 had ptrSize == 4 but the CALL instruction still
+// stored an 8-byte return PC onto the stack. To accommodate this, we used regSize
 // as the size of the architecture-pushed return PC.
 //
 // usesLR is defined below in terms of minFrameSize, which is defined in
@@ -340,7 +340,20 @@
 			pc := frame.pc
 			// back up to CALL instruction to read inlining info (same logic as below)
 			tracepc := pc
-			if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry && !waspanic {
+			// Normally, pc is a return address. In that case, we want to look up
+			// file/line information using pc-1, because that is the pc of the
+			// call instruction (more precisely, the last byte of the call instruction).
+			// Callers expect the pc buffer to contain return addresses and do the
+			// same -1 themselves, so we keep pc unchanged.
+			// When the pc is from a signal (e.g. profiler or segv) then we want
+			// to look up file/line information using pc, and we store pc+1 in the
+			// pc buffer so callers can unconditionally subtract 1 before looking up.
+			// See issue 34123.
+			// The pc can be at function entry when the frame is initialized without
+			// actually running code, like runtime.mstart.
+			if (n == 0 && flags&_TraceTrap != 0) || waspanic || pc == f.entry {
+				pc++
+			} else {
 				tracepc--
 			}
 
@@ -462,6 +475,7 @@
 		}
 
 		waspanic = f.funcID == funcID_sigpanic
+		injectedCall := waspanic || f.funcID == funcID_asyncPreempt
 
 		// Do not unwind past the bottom of the stack.
 		if !flr.valid() {
@@ -477,8 +491,8 @@
 		frame.argmap = nil
 
 		// On link register architectures, sighandler saves the LR on stack
-		// before faking a call to sigpanic.
-		if usesLR && waspanic {
+		// before faking a call.
+		if usesLR && injectedCall {
 			x := *(*uintptr)(unsafe.Pointer(frame.sp))
 			frame.sp += sys.MinFrameSize
 			if GOARCH == "arm64" {
@@ -860,6 +874,7 @@
 	_Gwaiting:   "waiting",
 	_Gdead:      "dead",
 	_Gcopystack: "copystack",
+	_Gpreempted: "preempted",
 }
 
 func goroutineheader(gp *g) {
@@ -997,8 +1012,8 @@
 
 // isSystemGoroutine reports whether the goroutine g must be omitted
 // in stack dumps and deadlock detector. This is any goroutine that
-// starts at a runtime.* entry point, except for runtime.main and
-// sometimes runtime.runfinq.
+// starts at a runtime.* entry point, except for runtime.main,
+// runtime.handleAsyncEvent (wasm only) and sometimes runtime.runfinq.
 //
 // If fixed is true, any goroutine that can vary between user and
 // system (that is, the finalizer goroutine) is considered a user
@@ -1009,7 +1024,7 @@
 	if !f.valid() {
 		return false
 	}
-	if f.funcID == funcID_runtime_main {
+	if f.funcID == funcID_runtime_main || f.funcID == funcID_handleAsyncEvent {
 		return false
 	}
 	if f.funcID == funcID_runfinq {
diff --git a/src/runtime/treap_test.go b/src/runtime/treap_test.go
deleted file mode 100644
index 110f51c..0000000
--- a/src/runtime/treap_test.go
+++ /dev/null
@@ -1,270 +0,0 @@
-// Copyright 2019 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime_test
-
-import (
-	"fmt"
-	"runtime"
-	"testing"
-)
-
-var spanDesc = map[uintptr]struct {
-	pages uintptr
-	scav  bool
-}{
-	0xc0000000: {2, false},
-	0xc0006000: {1, false},
-	0xc0010000: {8, false},
-	0xc0022000: {7, false},
-	0xc0034000: {4, true},
-	0xc0040000: {5, false},
-	0xc0050000: {5, true},
-	0xc0060000: {5000, false},
-}
-
-// Wrap the Treap one more time because go:notinheap doesn't
-// actually follow a structure across package boundaries.
-//
-//go:notinheap
-type treap struct {
-	runtime.Treap
-}
-
-func maskMatchName(mask, match runtime.TreapIterType) string {
-	return fmt.Sprintf("%0*b-%0*b", runtime.TreapIterBits, uint8(mask), runtime.TreapIterBits, uint8(match))
-}
-
-func TestTreapFilter(t *testing.T) {
-	var iterTypes = [...]struct {
-		mask, match runtime.TreapIterType
-		filter      runtime.TreapIterFilter // expected filter
-	}{
-		{0, 0, 0xf},
-		{runtime.TreapIterScav, 0, 0x5},
-		{runtime.TreapIterScav, runtime.TreapIterScav, 0xa},
-		{runtime.TreapIterScav | runtime.TreapIterHuge, runtime.TreapIterHuge, 0x4},
-		{runtime.TreapIterScav | runtime.TreapIterHuge, 0, 0x1},
-		{0, runtime.TreapIterScav, 0x0},
-	}
-	for _, it := range iterTypes {
-		t.Run(maskMatchName(it.mask, it.match), func(t *testing.T) {
-			if f := runtime.TreapFilter(it.mask, it.match); f != it.filter {
-				t.Fatalf("got %#x, want %#x", f, it.filter)
-			}
-		})
-	}
-}
-
-// This test ensures that the treap implementation in the runtime
-// maintains all stated invariants after different sequences of
-// insert, removeSpan, find, and erase. Invariants specific to the
-// treap data structure are checked implicitly: after each mutating
-// operation, treap-related invariants are checked for the entire
-// treap.
-func TestTreap(t *testing.T) {
-	// Set up a bunch of spans allocated into mheap_.
-	// Also, derive a set of typeCounts of each type of span
-	// according to runtime.TreapIterType so we can verify against
-	// them later.
-	spans := make([]runtime.Span, 0, len(spanDesc))
-	typeCounts := [1 << runtime.TreapIterBits][1 << runtime.TreapIterBits]int{}
-	for base, de := range spanDesc {
-		s := runtime.AllocSpan(base, de.pages, de.scav)
-		defer s.Free()
-		spans = append(spans, s)
-
-		for i := runtime.TreapIterType(0); i < 1<<runtime.TreapIterBits; i++ {
-			for j := runtime.TreapIterType(0); j < 1<<runtime.TreapIterBits; j++ {
-				if s.MatchesIter(i, j) {
-					typeCounts[i][j]++
-				}
-			}
-		}
-	}
-	t.Run("TypeCountsSanity", func(t *testing.T) {
-		// Just sanity check type counts for a few values.
-		check := func(mask, match runtime.TreapIterType, count int) {
-			tc := typeCounts[mask][match]
-			if tc != count {
-				name := maskMatchName(mask, match)
-				t.Fatalf("failed a sanity check for mask/match %s counts: got %d, wanted %d", name, tc, count)
-			}
-		}
-		check(0, 0, len(spanDesc))
-		check(runtime.TreapIterScav, 0, 6)
-		check(runtime.TreapIterScav, runtime.TreapIterScav, 2)
-	})
-	t.Run("Insert", func(t *testing.T) {
-		tr := treap{}
-		// Test just a very basic insert/remove for sanity.
-		tr.Insert(spans[0])
-		tr.RemoveSpan(spans[0])
-	})
-	t.Run("FindTrivial", func(t *testing.T) {
-		tr := treap{}
-		// Test just a very basic find operation for sanity.
-		tr.Insert(spans[0])
-		i := tr.Find(1)
-		if i.Span() != spans[0] {
-			t.Fatal("found unknown span in treap")
-		}
-		tr.RemoveSpan(spans[0])
-	})
-	t.Run("FindFirstFit", func(t *testing.T) {
-		// Run this 10 times, recreating the treap each time.
-		// Because of the non-deterministic structure of a treap,
-		// we'll be able to test different structures this way.
-		for i := 0; i < 10; i++ {
-			tr := runtime.Treap{}
-			for _, s := range spans {
-				tr.Insert(s)
-			}
-			i := tr.Find(5)
-			if i.Span().Base() != 0xc0010000 {
-				t.Fatalf("expected span at lowest address which could fit 5 pages, instead found span at %x", i.Span().Base())
-			}
-			for _, s := range spans {
-				tr.RemoveSpan(s)
-			}
-		}
-	})
-	t.Run("Iterate", func(t *testing.T) {
-		for mask := runtime.TreapIterType(0); mask < 1<<runtime.TreapIterBits; mask++ {
-			for match := runtime.TreapIterType(0); match < 1<<runtime.TreapIterBits; match++ {
-				iterName := maskMatchName(mask, match)
-				t.Run(iterName, func(t *testing.T) {
-					t.Run("StartToEnd", func(t *testing.T) {
-						// Ensure progressing an iterator actually goes over the whole treap
-						// from the start and that it iterates over the elements in order.
-						// Furthermore, ensure that it only iterates over the relevant parts
-						// of the treap.
-						// Finally, ensures that Start returns a valid iterator.
-						tr := treap{}
-						for _, s := range spans {
-							tr.Insert(s)
-						}
-						nspans := 0
-						lastBase := uintptr(0)
-						for i := tr.Start(mask, match); i.Valid(); i = i.Next() {
-							nspans++
-							if lastBase > i.Span().Base() {
-								t.Fatalf("not iterating in correct order: encountered base %x before %x", lastBase, i.Span().Base())
-							}
-							lastBase = i.Span().Base()
-							if !i.Span().MatchesIter(mask, match) {
-								t.Fatalf("found non-matching span while iteration over mask/match %s: base %x", iterName, i.Span().Base())
-							}
-						}
-						if nspans != typeCounts[mask][match] {
-							t.Fatal("failed to iterate forwards over full treap")
-						}
-						for _, s := range spans {
-							tr.RemoveSpan(s)
-						}
-					})
-					t.Run("EndToStart", func(t *testing.T) {
-						// See StartToEnd tests.
-						tr := treap{}
-						for _, s := range spans {
-							tr.Insert(s)
-						}
-						nspans := 0
-						lastBase := ^uintptr(0)
-						for i := tr.End(mask, match); i.Valid(); i = i.Prev() {
-							nspans++
-							if lastBase < i.Span().Base() {
-								t.Fatalf("not iterating in correct order: encountered base %x before %x", lastBase, i.Span().Base())
-							}
-							lastBase = i.Span().Base()
-							if !i.Span().MatchesIter(mask, match) {
-								t.Fatalf("found non-matching span while iteration over mask/match %s: base %x", iterName, i.Span().Base())
-							}
-						}
-						if nspans != typeCounts[mask][match] {
-							t.Fatal("failed to iterate backwards over full treap")
-						}
-						for _, s := range spans {
-							tr.RemoveSpan(s)
-						}
-					})
-				})
-			}
-		}
-		t.Run("Prev", func(t *testing.T) {
-			// Test the iterator invariant that i.prev().next() == i.
-			tr := treap{}
-			for _, s := range spans {
-				tr.Insert(s)
-			}
-			i := tr.Start(0, 0).Next().Next()
-			p := i.Prev()
-			if !p.Valid() {
-				t.Fatal("i.prev() is invalid")
-			}
-			if p.Next().Span() != i.Span() {
-				t.Fatal("i.prev().next() != i")
-			}
-			for _, s := range spans {
-				tr.RemoveSpan(s)
-			}
-		})
-		t.Run("Next", func(t *testing.T) {
-			// Test the iterator invariant that i.next().prev() == i.
-			tr := treap{}
-			for _, s := range spans {
-				tr.Insert(s)
-			}
-			i := tr.Start(0, 0).Next().Next()
-			n := i.Next()
-			if !n.Valid() {
-				t.Fatal("i.next() is invalid")
-			}
-			if n.Prev().Span() != i.Span() {
-				t.Fatal("i.next().prev() != i")
-			}
-			for _, s := range spans {
-				tr.RemoveSpan(s)
-			}
-		})
-	})
-	t.Run("EraseOne", func(t *testing.T) {
-		// Test that erasing one iterator correctly retains
-		// all relationships between elements.
-		tr := treap{}
-		for _, s := range spans {
-			tr.Insert(s)
-		}
-		i := tr.Start(0, 0).Next().Next().Next()
-		s := i.Span()
-		n := i.Next()
-		p := i.Prev()
-		tr.Erase(i)
-		if n.Prev().Span() != p.Span() {
-			t.Fatal("p, n := i.Prev(), i.Next(); n.prev() != p after i was erased")
-		}
-		if p.Next().Span() != n.Span() {
-			t.Fatal("p, n := i.Prev(), i.Next(); p.next() != n after i was erased")
-		}
-		tr.Insert(s)
-		for _, s := range spans {
-			tr.RemoveSpan(s)
-		}
-	})
-	t.Run("EraseAll", func(t *testing.T) {
-		// Test that erasing iterators actually removes nodes from the treap.
-		tr := treap{}
-		for _, s := range spans {
-			tr.Insert(s)
-		}
-		for i := tr.Start(0, 0); i.Valid(); {
-			n := i.Next()
-			tr.Erase(i)
-			i = n
-		}
-		if size := tr.Size(); size != 0 {
-			t.Fatalf("should have emptied out treap, %d spans left", size)
-		}
-	})
-}
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 660b45e..52b6cb3 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -14,26 +14,31 @@
 //	cmd/compile/internal/gc/reflect.go
 //	cmd/link/internal/ld/decodesym.go
 //	reflect/type.go
+//	internal/reflectlite/type.go
 type tflag uint8
 
 const (
-	tflagUncommon  tflag = 1 << 0
-	tflagExtraStar tflag = 1 << 1
-	tflagNamed     tflag = 1 << 2
+	tflagUncommon      tflag = 1 << 0
+	tflagExtraStar     tflag = 1 << 1
+	tflagNamed         tflag = 1 << 2
+	tflagRegularMemory tflag = 1 << 3 // equal and hash can treat values of this type as a single region of t.size bytes
 )
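A minimal sketch of how a tflag bit such as the new tflagRegularMemory is typically tested; the helper name is hypothetical and not part of this patch:

// isRegularMemory reports whether equal and hash may treat values of t as a
// single flat region of t.size bytes.
func isRegularMemory(t *_type) bool {
	return t.tflag&tflagRegularMemory != 0
}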
 
 // Needs to be in sync with ../cmd/link/internal/ld/decodesym.go:/^func.commonsize,
 // ../cmd/compile/internal/gc/reflect.go:/^func.dcommontype and
 // ../reflect/type.go:/^type.rtype.
+// ../internal/reflectlite/type.go:/^type.rtype.
 type _type struct {
 	size       uintptr
 	ptrdata    uintptr // size of memory prefix holding all pointers
 	hash       uint32
 	tflag      tflag
 	align      uint8
-	fieldalign uint8
+	fieldAlign uint8
 	kind       uint8
-	alg        *typeAlg
+	// function for comparing objects of this type
+	// (ptr to object A, ptr to object B) -> ==?
+	equal func(unsafe.Pointer, unsafe.Pointer) bool
 	// gcdata stores the GC type data for the garbage collector.
 	// If the KindGCProg bit is set in kind, gcdata is a GC program.
 	// Otherwise it is a ptrmask bitmap. See mbitmap.go for details.
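With the *typeAlg indirection gone, call sites read the comparison function straight off the type descriptor. A minimal sketch of that pattern (the wrapper name is hypothetical; the nil check mirrors how the runtime rejects uncomparable types):

// typesEqual compares two values of type t, panicking for types whose
// comparison the compiler left undefined (equal == nil).
func typesEqual(t *_type, p, q unsafe.Pointer) bool {
	if t.equal == nil {
		panic(errorString("comparing uncomparable type " + t.string()))
	}
	return t.equal(p, q)
}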
@@ -287,7 +292,7 @@
 		for i := range md.textsectmap {
 			sectaddr := md.textsectmap[i].vaddr
 			sectlen := md.textsectmap[i].length
-			if uintptr(off) >= sectaddr && uintptr(off) <= sectaddr+sectlen {
+			if uintptr(off) >= sectaddr && uintptr(off) < sectaddr+sectlen {
 				res = md.textsectmap[i].baseaddr + uintptr(off) - uintptr(md.textsectmap[i].vaddr)
 				break
 			}
@@ -358,10 +363,12 @@
 }
 
 type maptype struct {
-	typ        _type
-	key        *_type
-	elem       *_type
-	bucket     *_type // internal type representing a hash bucket
+	typ    _type
+	key    *_type
+	elem   *_type
+	bucket *_type // internal type representing a hash bucket
+	// function for hashing keys (ptr to key, seed) -> hash
+	hasher     func(unsafe.Pointer, uintptr) uintptr
 	keysize    uint8  // size of key slot
 	elemsize   uint8  // size of elem slot
 	bucketsize uint16 // size of bucket
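The new hasher field lets map code hash keys without going through the per-type alg table. A minimal sketch of the call pattern (the wrapper is hypothetical; the signature matches the comment on the field):

// mapKeyHash hashes one key with the map's per-type hasher and a seed,
// following the (ptr to key, seed) -> hash shape documented above.
func mapKeyHash(t *maptype, key unsafe.Pointer, seed uintptr) uintptr {
	return t.hasher(key, seed)
}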
@@ -497,6 +504,16 @@
 	return pkgPathName.name()
 }
 
+func (n name) isBlank() bool {
+	if n.bytes == nil {
+		return false
+	}
+	if n.nameLen() != 1 {
+		return false
+	}
+	return *n.data(3) == '_'
+}
+
 // typelinksinit scans the types from extra modules and builds the
 // moduledata typemap used to de-duplicate type pointers.
 func typelinksinit() {
diff --git a/src/runtime/utf8.go b/src/runtime/utf8.go
index 6bf5965..52b7576 100644
--- a/src/runtime/utf8.go
+++ b/src/runtime/utf8.go
@@ -7,7 +7,7 @@
 // Numbers fundamental to the encoding.
 const (
 	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
-	runeSelf  = 0x80         // characters below Runeself are represented as themselves in a single byte.
+	runeSelf  = 0x80         // characters below runeSelf are represented as themselves in a single byte.
 	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
 )
 
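The renamed constant documents the single-byte fast path; a tiny sketch of the check it enables (the helper is hypothetical):

// firstByteIsWholeRune reports whether s begins with a rune encoded as
// itself in a single byte, i.e. a byte below runeSelf (0x80).
func firstByteIsWholeRune(s string) bool {
	return len(s) > 0 && s[0] < runeSelf
}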
diff --git a/src/runtime/vdso_elf64.go b/src/runtime/vdso_elf64.go
index 7c9bd96..6ded9d6 100644
--- a/src/runtime/vdso_elf64.go
+++ b/src/runtime/vdso_elf64.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build amd64 arm64 ppc64 ppc64le
+// +build amd64 arm64 mips64 mips64le ppc64 ppc64le
 
 package runtime
 
diff --git a/src/runtime/vdso_freebsd.go b/src/runtime/vdso_freebsd.go
index 4e58919..122cc8b 100644
--- a/src/runtime/vdso_freebsd.go
+++ b/src/runtime/vdso_freebsd.go
@@ -97,7 +97,7 @@
 func fallback_walltime() (sec int64, nsec int32)
 
 //go:nosplit
-func nanotime() int64 {
+func nanotime1() int64 {
 	bt := vdsoClockGettime(_CLOCK_MONOTONIC)
 	if bt == zeroBintime {
 		return fallback_nanotime()
@@ -105,7 +105,7 @@
 	return int64((1e9 * uint64(bt.sec)) + ((1e9 * uint64(bt.frac>>32)) >> 32))
 }
 
-func walltime() (sec int64, nsec int32) {
+func walltime1() (sec int64, nsec int32) {
 	bt := vdsoClockGettime(_CLOCK_REALTIME)
 	if bt == zeroBintime {
 		return fallback_walltime()
diff --git a/src/runtime/vdso_freebsd_arm64.go b/src/runtime/vdso_freebsd_arm64.go
new file mode 100644
index 0000000..7d9f62d
--- /dev/null
+++ b/src/runtime/vdso_freebsd_arm64.go
@@ -0,0 +1,21 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_VDSO_TH_ALGO_ARM_GENTIM = 1
+)
+
+func getCntxct(physical bool) uint32
+
+//go:nosplit
+func (th *vdsoTimehands) getTimecounter() (uint32, bool) {
+	switch th.algo {
+	case _VDSO_TH_ALGO_ARM_GENTIM:
+		return getCntxct(false), true
+	default:
+		return 0, false
+	}
+}
diff --git a/src/runtime/vdso_in_none.go b/src/runtime/vdso_in_none.go
index f2d6bb5..7f4019c 100644
--- a/src/runtime/vdso_in_none.go
+++ b/src/runtime/vdso_in_none.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build linux,!386,!amd64,!arm,!arm64,!ppc64,!ppc64le !linux
+// +build linux,!386,!amd64,!arm,!arm64,!mips64,!mips64le,!ppc64,!ppc64le !linux
 
 package runtime
 
diff --git a/src/runtime/vdso_linux.go b/src/runtime/vdso_linux.go
index 71ba4ce..6e29424 100644
--- a/src/runtime/vdso_linux.go
+++ b/src/runtime/vdso_linux.go
@@ -3,7 +3,7 @@
 // license that can be found in the LICENSE file.
 
 // +build linux
-// +build 386 amd64 arm arm64 ppc64 ppc64le
+// +build 386 amd64 arm arm64 mips64 mips64le ppc64 ppc64le
 
 package runtime
 
@@ -281,6 +281,7 @@
 }
 
 // inVDSOPage reports whether PC is on the VDSO page.
+//go:nosplit
 func inVDSOPage(pc uintptr) bool {
 	for _, k := range vdsoSymbolKeys {
 		if *k.ptr != 0 {
diff --git a/src/runtime/vdso_linux_mips64x.go b/src/runtime/vdso_linux_mips64x.go
new file mode 100644
index 0000000..3a0f947
--- /dev/null
+++ b/src/runtime/vdso_linux_mips64x.go
@@ -0,0 +1,28 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+package runtime
+
+const (
+	// vdsoArrayMax is the byte-size of a maximally sized array on this architecture.
+	// See cmd/compile/internal/mips64/galign.go arch.MAXWIDTH initialization.
+	vdsoArrayMax = 1<<50 - 1
+)
+
+// See man 7 vdso: mips.
+var vdsoLinuxVersion = vdsoVersionKey{"LINUX_2.6", 0x3ae75f6}
+
+// The symbol name is not __kernel_clock_gettime as suggested by the manpage;
+// according to the Linux source code it should be __vdso_clock_gettime instead.
+var vdsoSymbolKeys = []vdsoSymbolKey{
+	{"__vdso_clock_gettime", 0xd35ec75, 0x6e43a318, &vdsoClockgettimeSym},
+}
+
+// initialize to fall back to syscall
+var (
+	vdsoClockgettimeSym uintptr = 0
+)
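The two numeric constants in vdsoSymbolKeys are the precomputed ELF SysV and GNU hashes of the symbol name, so the vDSO lookup code can walk the hash tables without hashing at runtime. A sketch of the standard algorithms, assuming the conventional definitions (these helpers are illustrative and not part of the runtime):

// elfSysVHash implements the classic ELF .hash function.
func elfSysVHash(name string) uint32 {
	var h uint32
	for i := 0; i < len(name); i++ {
		h = h<<4 + uint32(name[i])
		g := h & 0xf0000000
		if g != 0 {
			h ^= g >> 24
		}
		h &^= g
	}
	return h
}

// elfGNUHash implements the .gnu.hash function (djb2 over the symbol name).
func elfGNUHash(name string) uint32 {
	h := uint32(5381)
	for i := 0; i < len(name); i++ {
		h = h*33 + uint32(name[i])
	}
	return h
}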
diff --git a/src/runtime/vlop_386.s b/src/runtime/vlop_386.s
index 3387c51..b478ff8 100644
--- a/src/runtime/vlop_386.s
+++ b/src/runtime/vlop_386.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/vlop-386.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/vlop-386.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/vlop-386.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index 41d2858..9e19938 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -1,5 +1,5 @@
 // Inferno's libkern/vlop-arm.s
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/vlop-arm.s
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/vlop-arm.s
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -40,9 +40,7 @@
 #define Ra	R11
 
 // Be careful: Ra == R11 will be used by the linker for synthesized instructions.
-// Note: this function does not have a frame. If it ever needs a frame,
-// the RET instruction will clobber R12 on nacl, and the compiler's register
-// allocator needs to know.
+// Note: this function does not have a frame.
 TEXT runtime·udiv(SB),NOSPLIT|NOFRAME,$0
 	MOVBU	internal∕cpu·ARM+const_offsetARMHasIDIVA(SB), Ra
 	CMP	$0, Ra
diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go
index f790d3b..38e0b32 100644
--- a/src/runtime/vlrt.go
+++ b/src/runtime/vlrt.go
@@ -1,5 +1,5 @@
 // Inferno's libkern/vlrt-arm.c
-// https://bitbucket.org/inferno-os/inferno-os/src/default/libkern/vlrt-arm.c
+// https://bitbucket.org/inferno-os/inferno-os/src/master/libkern/vlrt-arm.c
 //
 //         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
@@ -130,9 +130,6 @@
 	return r
 }
 
-//go:nosplit
-// nosplit because division is used in syscall context in nanotime on darwin/386
-// and darwin/arm where stack splits are not allowed.
 func int64div(n, d int64) int64 {
 	// Check for 32 bit operands
 	if int64(int32(n)) == n && int64(int32(d)) == d {