Update prebuilts to go1.9rc1 ab/4215840
Test: m -j blueprint_tools
Change-Id: I8ce0d15553d871a3e79dc5836260651c0e515b5c
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md
index ea7c5c1..0b390c3 100644
--- a/src/runtime/HACKING.md
+++ b/src/runtime/HACKING.md
@@ -105,7 +105,7 @@
periods. Blocking on a `mutex` directly blocks the M, without
interacting with the Go scheduler. This means it is safe to use from
the lowest levels of the runtime, but also prevents any associated G
-and P from being rescheduled.
+and P from being rescheduled. `rwmutex` is similar.
For one-shot notifications, use `note`, which provides `notesleep` and
`notewakeup`. Unlike traditional UNIX `sleep`/`wakeup`, `note`s are
@@ -130,7 +130,7 @@
<table>
<tr><th></th><th colspan="3">Blocks</th></tr>
<tr><th>Interface</th><th>G</th><th>M</th><th>P</th></tr>
-<tr><td>mutex</td><td>Y</td><td>Y</td><td>Y</td></tr>
+<tr><td>(rw)mutex</td><td>Y</td><td>Y</td><td>Y</td></tr>
<tr><td>note</td><td>Y</td><td>Y</td><td>Y/N</td></tr>
<tr><td>park</td><td>Y</td><td>N</td><td>N</td></tr>
</table>
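To make the `note` description above concrete, here is a minimal, hedged sketch of the one-shot notification pattern (runtime-internal API; the producer/consumer helpers are hypothetical and not part of this patch):

```go
// note, noteclear, notesleep and notewakeup are the runtime-internal
// primitives discussed above; the two functions are hypothetical helpers.
var done note

func consumer() {
	noteclear(&done) // arm the note before anyone can wake it
	notesleep(&done) // blocks this M; a user goroutine would use notetsleepg
}

func producer() {
	// ... finish the work the consumer is waiting on ...
	notewakeup(&done) // exactly one wakeup per noteclear
}
```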
@@ -238,11 +238,11 @@
------------
`go:notinheap` applies to type declarations. It indicates that a type
-must never be heap allocated. Specifically, pointers to this type must
-always fail the `runtime.inheap` check. The type may be used for
-global variables, for stack variables, or for objects in unmanaged
-memory (e.g., allocated with `sysAlloc`, `persistentalloc`, or
-`fixalloc`). Specifically:
+must never be allocated from the GC'd heap. Specifically, pointers to
+this type must always fail the `runtime.inheap` check. The type may be
+used for global variables, for stack variables, or for objects in
+unmanaged memory (e.g., allocated with `sysAlloc`, `persistentalloc`,
+`fixalloc`, or from a manually-managed span). Specifically:
1. `new(T)`, `make([]T)`, `append([]T, ...)` and implicit heap
allocation of T are disallowed. (Though implicit allocations are
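A hedged illustration of the `go:notinheap` rules summarized above (the type below is hypothetical, and the directive is only honored inside the runtime and similarly privileged packages):

```go
//go:notinheap
type offHeapNode struct {
	next *offHeapNode
	val  uintptr
}

// Allowed: globals, stack variables, and objects carved out of unmanaged
// memory (sysAlloc, persistentalloc, fixalloc, or a manually-managed span).
var nodeHead offHeapNode

// Disallowed (rejected by the compiler): new(offHeapNode),
// make([]offHeapNode, n), appending to a []offHeapNode, and any implicit
// heap allocation of the type.
```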
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index 5c378c6..8d388da 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -206,16 +206,16 @@
return *(*string)(p) == *(*string)(q)
}
func interequal(p, q unsafe.Pointer) bool {
- return ifaceeq(*(*iface)(p), *(*iface)(q))
+ x := *(*iface)(p)
+ y := *(*iface)(q)
+ return x.tab == y.tab && ifaceeq(x.tab, x.data, y.data)
}
func nilinterequal(p, q unsafe.Pointer) bool {
- return efaceeq(*(*eface)(p), *(*eface)(q))
+ x := *(*eface)(p)
+ y := *(*eface)(q)
+ return x._type == y._type && efaceeq(x._type, x.data, y.data)
}
-func efaceeq(x, y eface) bool {
- t := x._type
- if t != y._type {
- return false
- }
+func efaceeq(t *_type, x, y unsafe.Pointer) bool {
if t == nil {
return true
}
@@ -224,27 +224,23 @@
panic(errorString("comparing uncomparable type " + t.string()))
}
if isDirectIface(t) {
- return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
+ return eq(noescape(unsafe.Pointer(&x)), noescape(unsafe.Pointer(&y)))
}
- return eq(x.data, y.data)
+ return eq(x, y)
}
-func ifaceeq(x, y iface) bool {
- xtab := x.tab
- if xtab != y.tab {
- return false
- }
- if xtab == nil {
+func ifaceeq(tab *itab, x, y unsafe.Pointer) bool {
+ if tab == nil {
return true
}
- t := xtab._type
+ t := tab._type
eq := t.alg.equal
if eq == nil {
panic(errorString("comparing uncomparable type " + t.string()))
}
if isDirectIface(t) {
- return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
+ return eq(noescape(unsafe.Pointer(&x)), noescape(unsafe.Pointer(&y)))
}
- return eq(x.data, y.data)
+ return eq(x, y)
}
// Testing adapters for hash quality tests (see hash_test.go)
@@ -287,9 +283,9 @@
// Install aes hash algorithm if we have the instructions we need
if (GOARCH == "386" || GOARCH == "amd64") &&
GOOS != "nacl" &&
- cpuid_ecx&(1<<25) != 0 && // aes (aesenc)
- cpuid_ecx&(1<<9) != 0 && // sse3 (pshufb)
- cpuid_ecx&(1<<19) != 0 { // sse4.1 (pinsr{d,q})
+ support_aes && // AESENC
+ support_ssse3 && // PSHUFB
+ support_sse41 { // PINSR{D,Q}
useAeshash = true
algarray[alg_MEM32].hash = aeshash32
algarray[alg_MEM64].hash = aeshash64
diff --git a/src/runtime/asm.s b/src/runtime/asm.s
index 3ddea7c..2646172 100644
--- a/src/runtime/asm.s
+++ b/src/runtime/asm.s
@@ -14,3 +14,24 @@
GLOBL runtime·mheap_(SB), NOPTR, $0
GLOBL runtime·memstats(SB), NOPTR, $0
+
+// NaCl requires that these skips be verifiable machine code.
+#ifdef GOARCH_amd64
+#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
+#endif
+#ifdef GOARCH_386
+#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
+#endif
+#ifdef GOARCH_amd64p32
+#define SKIP4 BYTE $0x90; BYTE $0x90; BYTE $0x90; BYTE $0x90
+#endif
+#ifndef SKIP4
+#define SKIP4 WORD $0
+#endif
+
+#define SKIP16 SKIP4; SKIP4; SKIP4; SKIP4
+#define SKIP64 SKIP16; SKIP16; SKIP16; SKIP16
+
+// This function must be sizeofSkipFunction bytes.
+TEXT runtime·skipPleaseUseCallersFrames(SB),NOSPLIT,$0-0
+ SKIP64; SKIP64; SKIP64; SKIP64
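A quick size check on the stub above, written as the Go-side constant it presumably must match (the value 256 is inferred from the macros in this hunk, not taken from elsewhere in the patch):

```go
// SKIP4  = 4 bytes (four 1-byte NOPs on x86; one 4-byte WORD elsewhere)
// SKIP16 = 4 * SKIP4  = 16 bytes
// SKIP64 = 4 * SKIP16 = 64 bytes
// body   = 4 * SKIP64 = 256 bytes, so presumably:
const sizeofSkipFunction = 256 // assumed to match the runtime's Go-side constant
```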
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 3d0b74c..5bbf286 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -67,30 +67,86 @@
JNE notintel
CMPL CX, $0x6C65746E // "ntel"
JNE notintel
+ MOVB $1, runtime·isIntel(SB)
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
notintel:
// Load EAX=1 cpuid flags
MOVL $1, AX
CPUID
- MOVL CX, AX // Move to global variable clobbers CX when generating PIC
- MOVL AX, runtime·cpuid_ecx(SB)
- MOVL DX, runtime·cpuid_edx(SB)
+ MOVL CX, DI // Move to global variable clobbers CX when generating PIC
+ MOVL AX, runtime·processorVersionInfo(SB)
// Check for MMX support
- TESTL $(1<<23), DX // MMX
- JZ bad_proc
+ TESTL $(1<<23), DX // MMX
+ JZ bad_proc
+ TESTL $(1<<26), DX // SSE2
+ SETNE runtime·support_sse2(SB)
+
+ TESTL $(1<<9), DI // SSSE3
+ SETNE runtime·support_ssse3(SB)
+
+ TESTL $(1<<19), DI // SSE4.1
+ SETNE runtime·support_sse41(SB)
+
+ TESTL $(1<<20), DI // SSE4.2
+ SETNE runtime·support_sse42(SB)
+
+ TESTL $(1<<23), DI // POPCNT
+ SETNE runtime·support_popcnt(SB)
+
+ TESTL $(1<<25), DI // AES
+ SETNE runtime·support_aes(SB)
+
+ TESTL $(1<<27), DI // OSXSAVE
+ SETNE runtime·support_osxsave(SB)
+
+ // If OS support for XMM and YMM is not present
+ // support_avx will be set back to false later.
+ TESTL $(1<<28), DI // AVX
+ SETNE runtime·support_avx(SB)
+
+eax7:
// Load EAX=7/ECX=0 cpuid flags
CMPL SI, $7
- JLT nocpuinfo
+ JLT osavx
MOVL $7, AX
MOVL $0, CX
CPUID
- MOVL BX, runtime·cpuid_ebx7(SB)
-nocpuinfo:
+ TESTL $(1<<3), BX // BMI1
+ SETNE runtime·support_bmi1(SB)
+ // If OS support for XMM and YMM is not present
+ // support_avx2 will be set back to false later.
+ TESTL $(1<<5), BX
+ SETNE runtime·support_avx2(SB)
+
+ TESTL $(1<<8), BX // BMI2
+ SETNE runtime·support_bmi2(SB)
+
+ TESTL $(1<<9), BX // ERMS
+ SETNE runtime·support_erms(SB)
+
+osavx:
+ // nacl does not support XGETBV to test
+ // for XMM and YMM OS support.
+#ifndef GOOS_nacl
+ CMPB runtime·support_osxsave(SB), $1
+ JNE noavx
+ MOVL $0, CX
+ // For XGETBV, OSXSAVE bit is required and sufficient
+ XGETBV
+ ANDL $6, AX
+ CMPL AX, $6 // Check for OS support of XMM and YMM registers.
+ JE nocpuinfo
+#endif
+noavx:
+ MOVB $0, runtime·support_avx(SB)
+ MOVB $0, runtime·support_avx2(SB)
+
+nocpuinfo:
// if there is an _cgo_init, call it to let it
// initialize and to set up GS. if not,
// we set up GS ourselves.
@@ -415,22 +471,6 @@
MOVL $0, DX
JMP runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten return PC.
- // AX may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- get_tls(CX)
- MOVL g(CX), CX
- MOVL (g_stkbar+slice_array)(CX), DX
- MOVL g_stkbarPos(CX), BX
- IMULL $stkbar__size, BX // Too big for SIB.
- MOVL stkbar_savedLRVal(DX)(BX*1), BX
- // Record that this stack barrier was hit.
- ADDL $1, g_stkbarPos(CX)
- // Jump to the original return PC.
- JMP BX
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -812,33 +852,13 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
MOVL argp+0(FP),AX // addr of first arg
MOVL -4(AX),AX // get calling pc
- CMPL AX, runtime·stackBarrierPC(SB)
- JNE nobar
- // Get original return PC.
- CALL runtime·nextBarrierPC(SB)
- MOVL 0(SP), AX
-nobar:
MOVL AX, ret+4(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
- MOVL argp+0(FP),AX // addr of first arg
- MOVL pc+4(FP), BX
- MOVL -4(AX), DX
- CMPL DX, runtime·stackBarrierPC(SB)
- JEQ setbar
- MOVL BX, -4(AX) // set calling pc
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVL BX, 0(SP)
- CALL runtime·setNextBarrierPC(SB)
- RET
-
// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-8
- TESTL $0x4000000, runtime·cpuid_edx(SB) // no sse2, no mfence
- JEQ done
+ CMPB runtime·support_sse2(SB), $1
+ JNE done
CMPB runtime·lfenceBeforeRdtsc(SB), $1
JNE mfence
BYTE $0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
@@ -1345,8 +1365,8 @@
hugeloop:
CMPL BX, $64
JB bigloop
- TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
- JE bigloop
+ CMPB runtime·support_sse2(SB), $1
+ JNE bigloop
MOVOU (SI), X0
MOVOU (DI), X1
MOVOU 16(SI), X2
@@ -1489,8 +1509,8 @@
JEQ allsame
CMPL BP, $4
JB small
- TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
- JE mediumloop
+ CMPB runtime·support_sse2(SB), $1
+ JNE mediumloop
largeloop:
CMPL BP, $16
JB mediumloop
@@ -1595,20 +1615,6 @@
MOVL BX, (AX)
RET
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- get_tls(CX)
- MOVL g(CX), AX
- MOVL g_m(AX), AX
- MOVL m_fastrand(AX), DX
- ADDL DX, DX
- MOVL DX, BX
- XORL $0x88888eef, DX
- JPL 2(PC)
- MOVL BX, DX
- MOVL DX, m_fastrand(AX)
- MOVL DX, ret+0(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
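The detection code above stores each CPUID result into a package-level runtime variable via `MOVB`/`MOVL`/`SETNE`; a hedged sketch of the Go-side declarations those stores target (the names are taken from the assembly, but their declaration site and exact types are assumptions):

```go
var (
	processorVersionInfo uint32 // EAX from CPUID leaf 1
	isIntel              bool
	lfenceBeforeRdtsc    bool

	support_aes     bool
	support_avx     bool
	support_avx2    bool
	support_bmi1    bool
	support_bmi2    bool
	support_erms    bool
	support_osxsave bool
	support_popcnt  bool
	support_sse2    bool
	support_sse41   bool
	support_sse42   bool
	support_ssse3   bool
)
```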
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index cb428d6..6405be9 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -26,10 +26,10 @@
MOVQ SP, (g_stack+stack_hi)(DI)
// find out information about the processor we're on
- MOVQ $0, AX
+ MOVL $0, AX
CPUID
- MOVQ AX, SI
- CMPQ AX, $0
+ MOVL AX, SI
+ CMPL AX, $0
JE nocpuinfo
// Figure out how to serialize RDTSC.
@@ -41,60 +41,77 @@
JNE notintel
CMPL CX, $0x6C65746E // "ntel"
JNE notintel
+ MOVB $1, runtime·isIntel(SB)
MOVB $1, runtime·lfenceBeforeRdtsc(SB)
notintel:
// Load EAX=1 cpuid flags
- MOVQ $1, AX
+ MOVL $1, AX
CPUID
- MOVL CX, runtime·cpuid_ecx(SB)
- MOVL DX, runtime·cpuid_edx(SB)
+ MOVL AX, runtime·processorVersionInfo(SB)
+ TESTL $(1<<26), DX // SSE2
+ SETNE runtime·support_sse2(SB)
+
+ TESTL $(1<<9), CX // SSSE3
+ SETNE runtime·support_ssse3(SB)
+
+ TESTL $(1<<19), CX // SSE4.1
+ SETNE runtime·support_sse41(SB)
+
+ TESTL $(1<<20), CX // SSE4.2
+ SETNE runtime·support_sse42(SB)
+
+ TESTL $(1<<23), CX // POPCNT
+ SETNE runtime·support_popcnt(SB)
+
+ TESTL $(1<<25), CX // AES
+ SETNE runtime·support_aes(SB)
+
+ TESTL $(1<<27), CX // OSXSAVE
+ SETNE runtime·support_osxsave(SB)
+
+ // If OS support for XMM and YMM is not present
+ // support_avx will be set back to false later.
+ TESTL $(1<<28), CX // AVX
+ SETNE runtime·support_avx(SB)
+
+eax7:
// Load EAX=7/ECX=0 cpuid flags
- CMPQ SI, $7
- JLT no7
+ CMPL SI, $7
+ JLT osavx
MOVL $7, AX
MOVL $0, CX
CPUID
- MOVL BX, runtime·cpuid_ebx7(SB)
-no7:
- // Detect AVX and AVX2 as per 14.7.1 Detection of AVX2 chapter of [1]
- // [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
- // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
- MOVL runtime·cpuid_ecx(SB), CX
- ANDL $0x18000000, CX // check for OSXSAVE and AVX bits
- CMPL CX, $0x18000000
- JNE noavx
- MOVL $0, CX
+
+ TESTL $(1<<3), BX // BMI1
+ SETNE runtime·support_bmi1(SB)
+
+ // If OS support for XMM and YMM is not present
+ // support_avx2 will be set back to false later.
+ TESTL $(1<<5), BX
+ SETNE runtime·support_avx2(SB)
+
+ TESTL $(1<<8), BX // BMI2
+ SETNE runtime·support_bmi2(SB)
+
+ TESTL $(1<<9), BX // ERMS
+ SETNE runtime·support_erms(SB)
+
+osavx:
+ CMPB runtime·support_osxsave(SB), $1
+ JNE noavx
+ MOVL $0, CX
// For XGETBV, OSXSAVE bit is required and sufficient
XGETBV
- ANDL $6, AX
- CMPL AX, $6 // Check for OS support of YMM registers
- JNE noavx
- MOVB $1, runtime·support_avx(SB)
- TESTL $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
- JEQ noavx2
- MOVB $1, runtime·support_avx2(SB)
- JMP testbmi1
+ ANDL $6, AX
+ CMPL AX, $6 // Check for OS support of XMM and YMM registers.
+ JE nocpuinfo
noavx:
- MOVB $0, runtime·support_avx(SB)
-noavx2:
- MOVB $0, runtime·support_avx2(SB)
-testbmi1:
- // Detect BMI1 and BMI2 extensions as per
- // 5.1.16.1 Detection of VEX-encoded GPR Instructions,
- // LZCNT and TZCNT, PREFETCHW chapter of [1]
- MOVB $0, runtime·support_bmi1(SB)
- TESTL $(1<<3), runtime·cpuid_ebx7(SB) // check for BMI1 bit
- JEQ testbmi2
- MOVB $1, runtime·support_bmi1(SB)
-testbmi2:
- MOVB $0, runtime·support_bmi2(SB)
- TESTL $(1<<8), runtime·cpuid_ebx7(SB) // check for BMI2 bit
- JEQ nocpuinfo
- MOVB $1, runtime·support_bmi2(SB)
-nocpuinfo:
-
+ MOVB $0, runtime·support_avx(SB)
+ MOVB $0, runtime·support_avx2(SB)
+
+nocpuinfo:
// if there is an _cgo_init, call it.
MOVQ _cgo_init(SB), AX
TESTQ AX, AX
@@ -405,28 +422,6 @@
MOVL $0, DX
JMP runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten return PC.
- // AX may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- get_tls(CX)
- MOVQ g(CX), CX
- MOVQ (g_stkbar+slice_array)(CX), DX
- MOVQ g_stkbarPos(CX), BX
- IMULQ $stkbar__size, BX // Too big for SIB.
- MOVQ stkbar_savedLRPtr(DX)(BX*1), R8
- MOVQ stkbar_savedLRVal(DX)(BX*1), BX
- // Assert that we're popping the right saved LR.
- ADDQ $8, R8
- CMPQ R8, SP
- JEQ 2(PC)
- MOVL $0, 0
- // Record that this stack barrier was hit.
- ADDQ $1, g_stkbarPos(CX)
- // Jump to the original return PC.
- JMP BX
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -841,29 +836,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
MOVQ argp+0(FP),AX // addr of first arg
MOVQ -8(AX),AX // get calling pc
- CMPQ AX, runtime·stackBarrierPC(SB)
- JNE nobar
- // Get original return PC.
- CALL runtime·nextBarrierPC(SB)
- MOVQ 0(SP), AX
-nobar:
MOVQ AX, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
- MOVQ argp+0(FP),AX // addr of first arg
- MOVQ pc+8(FP), BX
- MOVQ -8(AX), CX
- CMPQ CX, runtime·stackBarrierPC(SB)
- JEQ setbar
- MOVQ BX, -8(AX) // set calling pc
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVQ BX, 0(SP)
- CALL runtime·setNextBarrierPC(SB)
- RET
-
// func cputicks() int64
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
CMPB runtime·lfenceBeforeRdtsc(SB), $1
@@ -1728,17 +1703,6 @@
VZEROUPPER
JMP loop
-
-TEXT strings·supportAVX2(SB),NOSPLIT,$0-1
- MOVBLZX runtime·support_avx2(SB), AX
- MOVB AX, ret+0(FP)
- RET
-
-TEXT bytes·supportAVX2(SB),NOSPLIT,$0-1
- MOVBLZX runtime·support_avx2(SB), AX
- MOVB AX, ret+0(FP)
- RET
-
TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
MOVQ s+0(FP), DI
// We want len in DX and AX, because PCMPESTRI implicitly consumes them
@@ -1967,9 +1931,8 @@
VZEROUPPER
JMP success
sse42:
- MOVL runtime·cpuid_ecx(SB), CX
- ANDL $0x100000, CX
- JZ no_sse42
+ CMPB runtime·support_sse42(SB), $1
+ JNE no_sse42
CMPQ AX, $12
// PCMPESTRI is slower than normal compare,
// so using it makes sense only if we advance 4+ bytes per compare
@@ -2163,17 +2126,194 @@
MOVB $0, ret+48(FP)
RET
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- get_tls(CX)
- MOVQ g(CX), AX
- MOVQ g_m(AX), AX
- MOVL m_fastrand(AX), DX
- ADDL DX, DX
- MOVL DX, BX
- XORL $0x88888eef, DX
- CMOVLMI BX, DX
- MOVL DX, m_fastrand(AX)
- MOVL DX, ret+0(FP)
+
+TEXT bytes·countByte(SB),NOSPLIT,$0-40
+ MOVQ s+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+24(FP), AL
+ LEAQ ret+32(FP), R8
+ JMP runtime·countByte(SB)
+
+TEXT strings·countByte(SB),NOSPLIT,$0-32
+ MOVQ s+0(FP), SI
+ MOVQ s_len+8(FP), BX
+ MOVB c+16(FP), AL
+ LEAQ ret+24(FP), R8
+ JMP runtime·countByte(SB)
+
+// input:
+// SI: data
+// BX: data len
+// AL: byte sought
+// R8: address to put result
+// This requires the POPCNT instruction
+TEXT runtime·countByte(SB),NOSPLIT,$0
+ // Shuffle X0 around so that each byte contains
+ // the character we're looking for.
+ MOVD AX, X0
+ PUNPCKLBW X0, X0
+ PUNPCKLBW X0, X0
+ PSHUFL $0, X0, X0
+
+ CMPQ BX, $16
+ JLT small
+
+ MOVQ $0, R12 // Accumulator
+
+ MOVQ SI, DI
+
+ CMPQ BX, $32
+ JA avx2
+sse:
+ LEAQ -16(SI)(BX*1), AX // AX = address of last 16 bytes
+ JMP sseloopentry
+
+sseloop:
+ // Move the next 16-byte chunk of the data into X1.
+ MOVOU (DI), X1
+ // Compare bytes in X0 to X1.
+ PCMPEQB X0, X1
+ // Take the top bit of each byte in X1 and put the result in DX.
+ PMOVMSKB X1, DX
+ // Count number of matching bytes
+ POPCNTL DX, DX
+ // Accumulate into R12
+ ADDQ DX, R12
+ // Advance to next block.
+ ADDQ $16, DI
+sseloopentry:
+ CMPQ DI, AX
+ JBE sseloop
+
+ // Get the number of bytes to consider in the last 16 bytes
+ ANDQ $15, BX
+ JZ end
+
+ // Create mask to ignore overlap between previous 16 byte block
+ // and the next.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Process the last 16-byte chunk. This chunk may overlap with the
+ // chunks we've already searched so we need to mask part of it.
+ MOVOU (AX), X1
+ PCMPEQB X0, X1
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+end:
+ MOVQ R12, (R8)
+ RET
+
+// handle for lengths < 16
+small:
+ TESTQ BX, BX
+ JEQ endzero
+
+ // Check if we'll load across a page boundary.
+ LEAQ 16(SI), AX
+ TESTW $0xff0, AX
+ JEQ endofpage
+
+ // We must ignore high bytes as they aren't part of our slice.
+ // Create mask.
+ MOVB BX, CX
+ MOVQ $1, R10
+ SALQ CL, R10
+ SUBQ $1, R10
+
+ // Load data
+ MOVOU (SI), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ MOVQ DX, (R8)
+ RET
+endzero:
+ MOVQ $0, (R8)
+ RET
+
+endofpage:
+ // We must ignore low bytes as they aren't part of our slice.
+ MOVQ $16,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+
+ // Load data into the high end of X1.
+ MOVOU -16(SI)(BX*1), X1
+ // Compare target byte with each byte in data.
+ PCMPEQB X0, X1
+ // Move result bits to integer register.
+ PMOVMSKB X1, DX
+ // Apply mask
+ ANDQ R10, DX
+ // Directly return DX, we don't need to accumulate
+ // since we have <16 bytes.
+ POPCNTL DX, DX
+ MOVQ DX, (R8)
+ RET
+
+avx2:
+ CMPB runtime·support_avx2(SB), $1
+ JNE sse
+ MOVD AX, X0
+ LEAQ -32(SI)(BX*1), R11
+ VPBROADCASTB X0, Y1
+avx2_loop:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ ADDQ $32, DI
+ CMPQ DI, R11
+ JLE avx2_loop
+
+ // If last block is already processed,
+ // skip to the end.
+ CMPQ DI, R11
+ JEQ endavx
+
+ // Load address of the last 32 bytes.
+ // There is an overlap with the previous block.
+ MOVQ R11, DI
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, DX
+ // Exit AVX mode.
+ VZEROUPPER
+
+ // Create mask to ignore overlap between previous 32 byte block
+ // and the next.
+ ANDQ $31, BX
+ MOVQ $32,CX
+ SUBQ BX, CX
+ MOVQ $0xFFFFFFFF, R10
+ SARQ CL, R10
+ SALQ CL, R10
+ // Apply mask
+ ANDQ R10, DX
+ POPCNTL DX, DX
+ ADDQ DX, R12
+ MOVQ R12, (R8)
+ RET
+endavx:
+ // Exit AVX mode.
+ VZEROUPPER
+ MOVQ R12, (R8)
RET
TEXT runtime·return0(SB), NOSPLIT, $0
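For readers skimming the new `runtime·countByte` body above: the vectorized code counts matching bytes 16 (SSE) or 32 (AVX2) at a time using PCMPEQB/VPCMPEQB, PMOVMSKB and POPCNT. A hedged scalar reference of the contract, included only as documentation and not part of the patch:

```go
// countByteRef returns the number of occurrences of c in s, i.e. the value
// the assembly stores through R8 for the bytes·countByte and
// strings·countByte wrappers above.
func countByteRef(s []byte, c byte) int {
	n := 0
	for _, b := range s {
		if b == c {
			n++
		}
	}
	return n
}
```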
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index c3c1c15..6367b3f 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -28,16 +28,92 @@
MOVL SP, (g_stack+stack_hi)(DI)
// find out information about the processor we're on
- MOVQ $0, AX
+ MOVL $0, AX
CPUID
- CMPQ AX, $0
+ CMPL AX, $0
JE nocpuinfo
- MOVQ $1, AX
+
+ CMPL BX, $0x756E6547 // "Genu"
+ JNE notintel
+ CMPL DX, $0x49656E69 // "ineI"
+ JNE notintel
+ CMPL CX, $0x6C65746E // "ntel"
+ JNE notintel
+ MOVB $1, runtime·isIntel(SB)
+notintel:
+
+ // Load EAX=1 cpuid flags
+ MOVL $1, AX
CPUID
- MOVL CX, runtime·cpuid_ecx(SB)
- MOVL DX, runtime·cpuid_edx(SB)
-nocpuinfo:
-
+ MOVL AX, runtime·processorVersionInfo(SB)
+
+ TESTL $(1<<26), DX // SSE2
+ SETNE runtime·support_sse2(SB)
+
+ TESTL $(1<<9), CX // SSSE3
+ SETNE runtime·support_ssse3(SB)
+
+ TESTL $(1<<19), CX // SSE4.1
+ SETNE runtime·support_sse41(SB)
+
+ TESTL $(1<<20), CX // SSE4.2
+ SETNE runtime·support_sse42(SB)
+
+ TESTL $(1<<23), CX // POPCNT
+ SETNE runtime·support_popcnt(SB)
+
+ TESTL $(1<<25), CX // AES
+ SETNE runtime·support_aes(SB)
+
+ TESTL $(1<<27), CX // OSXSAVE
+ SETNE runtime·support_osxsave(SB)
+
+ // If OS support for XMM and YMM is not present
+ // support_avx will be set back to false later.
+ TESTL $(1<<28), CX // AVX
+ SETNE runtime·support_avx(SB)
+
+eax7:
+ // Load EAX=7/ECX=0 cpuid flags
+ CMPL SI, $7
+ JLT osavx
+ MOVL $7, AX
+ MOVL $0, CX
+ CPUID
+
+ TESTL $(1<<3), BX // BMI1
+ SETNE runtime·support_bmi1(SB)
+
+ // If OS support for XMM and YMM is not present
+ // support_avx2 will be set back to false later.
+ TESTL $(1<<5), BX
+ SETNE runtime·support_avx2(SB)
+
+ TESTL $(1<<8), BX // BMI2
+ SETNE runtime·support_bmi2(SB)
+
+ TESTL $(1<<9), BX // ERMS
+ SETNE runtime·support_erms(SB)
+
+osavx:
+ // nacl does not support XGETBV to test
+ // for XMM and YMM OS support.
+#ifndef GOOS_nacl
+ CMPB runtime·support_osxsave(SB), $1
+ JNE noavx
+ MOVL $0, CX
+ // For XGETBV, OSXSAVE bit is required and sufficient
+ XGETBV
+ ANDL $6, AX
+ CMPL AX, $6 // Check for OS support of XMM and YMM registers.
+ JE nocpuinfo
+#endif
+noavx:
+ MOVB $0, runtime·support_avx(SB)
+ MOVB $0, runtime·support_avx2(SB)
+
+nocpuinfo:
+
needtls:
LEAL runtime·m0+m_tls(SB), DI
CALL runtime·settls(SB)
@@ -309,23 +385,6 @@
MOVL $0, DX
JMP runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten return PC.
- // AX may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- get_tls(CX)
- MOVL g(CX), CX
- MOVL (g_stkbar+slice_array)(CX), DX
- MOVL g_stkbarPos(CX), BX
- IMULL $stkbar__size, BX // Too big for SIB.
- ADDL DX, BX
- MOVL stkbar_savedLRVal(BX), BX
- // Record that this stack barrier was hit.
- ADDL $1, g_stkbarPos(CX)
- // Jump to the original return PC.
- JMP BX
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -521,29 +580,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12
MOVL argp+0(FP),AX // addr of first arg
MOVL -8(AX),AX // get calling pc
- CMPL AX, runtime·stackBarrierPC(SB)
- JNE nobar
- // Get original return PC.
- CALL runtime·nextBarrierPC(SB)
- MOVL 0(SP), AX
-nobar:
MOVL AX, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-8
- MOVL argp+0(FP),AX // addr of first arg
- MOVL pc+4(FP), BX // pc to set
- MOVL -8(AX), CX
- CMPL CX, runtime·stackBarrierPC(SB)
- JEQ setbar
- MOVQ BX, -8(AX) // set calling pc
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVL BX, 0(SP)
- CALL runtime·setNextBarrierPC(SB)
- RET
-
// int64 runtime·cputicks(void)
TEXT runtime·cputicks(SB),NOSPLIT,$0-0
RDTSC
@@ -991,19 +1030,6 @@
MOVB AX, ret+24(FP)
RET
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- get_tls(CX)
- MOVL g(CX), AX
- MOVL g_m(AX), AX
- MOVL m_fastrand(AX), DX
- ADDL DX, DX
- MOVL DX, BX
- XORL $0x88888eef, DX
- CMOVLMI BX, DX
- MOVL DX, m_fastrand(AX)
- MOVL DX, ret+0(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 79c28a8..09b6759 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -340,23 +340,6 @@
MOVW $0, R7
B runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R0 may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVW (g_stkbar+slice_array)(g), R4
- MOVW g_stkbarPos(g), R5
- MOVW $stkbar__size, R6
- MUL R5, R6
- ADD R4, R6
- MOVW stkbar_savedLRVal(R6), R6
- // Record that this stack barrier was hit.
- ADD $1, R5
- MOVW R5, g_stkbarPos(g)
- // Jump to the original return PC.
- B (R6)
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -696,30 +679,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
MOVW 8(R13), R0 // LR saved by caller
- MOVW runtime·stackBarrierPC(SB), R1
- CMP R0, R1
- BNE nobar
- // Get original return PC.
- BL runtime·nextBarrierPC(SB)
- MOVW 4(R13), R0
-nobar:
MOVW R0, ret+4(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
- MOVW pc+4(FP), R0
- MOVW 8(R13), R1
- MOVW runtime·stackBarrierPC(SB), R2
- CMP R1, R2
- BEQ setbar
- MOVW R0, 8(R13) // set LR in caller
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVW R0, 4(R13)
- BL runtime·setNextBarrierPC(SB)
- RET
-
TEXT runtime·emptyfunc(SB),0,$0-0
RET
@@ -971,15 +933,6 @@
MOVW R0, ret+12(FP)
RET
-TEXT runtime·fastrand(SB),NOSPLIT,$-4-4
- MOVW g_m(g), R1
- MOVW m_fastrand(R1), R0
- ADD.S R0, R0
- EOR.MI $0x88888eef, R0
- MOVW R0, m_fastrand(R1)
- MOVW R0, ret+0(FP)
- RET
-
TEXT runtime·return0(SB),NOSPLIT,$0
MOVW $0, R0
RET
@@ -988,6 +941,7 @@
MOVW cycles+0(FP), R1
MOVW $0, R0
yieldloop:
+ WORD $0xe320f001 // YIELD (NOP pre-ARMv6K)
CMP R0, R1
B.NE 2(PC)
RET
@@ -1057,11 +1011,13 @@
#ifndef GOOS_nacl
// This is called from .init_array and follows the platform, not Go, ABI.
-TEXT runtime·addmoduledata(SB),NOSPLIT,$0-4
+TEXT runtime·addmoduledata(SB),NOSPLIT,$0-8
MOVW R9, saver9-4(SP) // The access to global variables below implicitly uses R9, which is callee-save
+ MOVW R11, saver11-8(SP) // Likewise, R11 is the temp register, but callee-save in C ABI
MOVW runtime·lastmoduledatap(SB), R1
MOVW R0, moduledata_next(R1)
MOVW R0, runtime·lastmoduledatap(SB)
+ MOVW saver11-8(SP), R11
MOVW saver9-4(SP), R9
RET
#endif
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 0e286d4..30ecec7 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -315,23 +315,6 @@
MOVW $0, R26
B runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R0 may be live (see return0). Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVD (g_stkbar+slice_array)(g), R4
- MOVD g_stkbarPos(g), R5
- MOVD $stkbar__size, R6
- MUL R5, R6
- ADD R4, R6
- MOVD stkbar_savedLRVal(R6), R6
- // Record that this stack barrier was hit.
- ADD $1, R5
- MOVD R5, g_stkbarPos(g)
- // Jump to the original return PC.
- B (R6)
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -723,30 +706,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
MOVD 16(RSP), R0 // LR saved by caller
- MOVD runtime·stackBarrierPC(SB), R1
- CMP R0, R1
- BNE nobar
- // Get original return PC.
- BL runtime·nextBarrierPC(SB)
- MOVD 8(RSP), R0
-nobar:
MOVD R0, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
- MOVD pc+8(FP), R0
- MOVD 16(RSP), R1
- MOVD runtime·stackBarrierPC(SB), R2
- CMP R1, R2
- BEQ setbar
- MOVD R0, 16(RSP) // set LR in caller
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVD R0, 8(RSP)
- BL runtime·setNextBarrierPC(SB)
- RET
-
TEXT runtime·abort(SB),NOSPLIT,$-8-0
B (ZR)
UNDEF
@@ -959,18 +921,6 @@
MOVB R0, ret+48(FP)
RET
-TEXT runtime·fastrand(SB),NOSPLIT,$-8-4
- MOVD g_m(g), R1
- MOVWU m_fastrand(R1), R0
- ADD R0, R0
- CMPW $0, R0
- BGE notneg
- EOR $0x88888eef, R0
-notneg:
- MOVW R0, m_fastrand(R1)
- MOVW R0, ret+0(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R0
RET
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index c2d991d..57d4578 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -286,24 +286,6 @@
MOVV R0, REGCTXT
JMP runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R1 may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVV (g_stkbar+slice_array)(g), R2
- MOVV g_stkbarPos(g), R3
- MOVV $stkbar__size, R4
- MULVU R3, R4
- MOVV LO, R4
- ADDV R2, R4
- MOVV stkbar_savedLRVal(R4), R4
- // Record that this stack barrier was hit.
- ADDV $1, R3
- MOVV R3, g_stkbarPos(g)
- // Jump to the original return PC.
- JMP (R4)
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -636,28 +618,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
MOVV 16(R29), R1 // LR saved by caller
- MOVV runtime·stackBarrierPC(SB), R2
- BNE R1, R2, nobar
- // Get original return PC.
- JAL runtime·nextBarrierPC(SB)
- MOVV 8(R29), R1
-nobar:
MOVV R1, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
- MOVV pc+8(FP), R1
- MOVV 16(R29), R2
- MOVV runtime·stackBarrierPC(SB), R3
- BEQ R2, R3, setbar
- MOVV R1, 16(R29) // set LR in caller
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVV R1, 8(R29)
- JAL runtime·setNextBarrierPC(SB)
- RET
-
TEXT runtime·abort(SB),NOSPLIT,$-8-0
MOVW (R0), R0
UNDEF
@@ -831,16 +794,6 @@
MOVV R1, ret+24(FP)
RET
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- MOVV g_m(g), R2
- MOVWU m_fastrand(R2), R1
- ADDU R1, R1
- BGEZ R1, 2(PC)
- XOR $0x88888eef, R1
- MOVW R1, m_fastrand(R2)
- MOVW R1, ret+0(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R1
RET
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index 73da768..536c315 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -287,22 +287,6 @@
MOVW R0, REGCTXT
JMP runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R1 may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVW (g_stkbar+slice_array)(g), R2
- MOVW g_stkbarPos(g), R3
- MOVW $stkbar__size, R4
- MULU R3, R4
- MOVW LO, R4
- ADDU R2, R4
- MOVW stkbar_savedLRVal(R4), R4
- ADDU $1, R3
- MOVW R3, g_stkbarPos(g) // Record that this stack barrier was hit.
- JMP (R4) // Jump to the original return PC.
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -637,26 +621,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$4-8
MOVW 8(R29), R1 // LR saved by caller
- MOVW runtime·stackBarrierPC(SB), R2
- BNE R1, R2, nobar
- JAL runtime·nextBarrierPC(SB) // Get original return PC.
- MOVW 4(R29), R1
-nobar:
MOVW R1, ret+4(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
- MOVW pc+4(FP), R1
- MOVW 8(R29), R2
- MOVW runtime·stackBarrierPC(SB), R3
- BEQ R2, R3, setbar
- MOVW R1, 8(R29) // set LR in caller
- RET
-setbar:
- MOVW R1, 4(R29)
- JAL runtime·setNextBarrierPC(SB) // Set the stack barrier return PC.
- RET
-
TEXT runtime·abort(SB),NOSPLIT,$0-0
UNDEF
@@ -904,16 +871,6 @@
MOVW R8, ret+24(FP)
RET
-TEXT runtime·fastrand(SB),NOSPLIT,$0-4
- MOVW g_m(g), R2
- MOVW m_fastrand(R2), R1
- ADDU R1, R1
- BGEZ R1, 2(PC)
- XOR $0x88888eef, R1
- MOVW R1, m_fastrand(R2)
- MOVW R1, ret+0(FP)
- RET
-
TEXT runtime·return0(SB),NOSPLIT,$0
MOVW $0, R1
RET
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 1d6adcc..616861e 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -85,14 +85,14 @@
// start this M
BL runtime·mstart(SB)
- MOVD R0, 1(R0)
+ MOVD R0, 0(R0)
RET
DATA runtime·mainPC+0(SB)/8,$runtime·main(SB)
GLOBL runtime·mainPC(SB),RODATA,$8
TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0
- MOVD R0, 2(R0) // TODO: TD
+ MOVD R0, 0(R0) // TODO: TD
RET
TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0
@@ -341,24 +341,6 @@
MOVD R0, R11
BR runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R3 may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVD (g_stkbar+slice_array)(g), R4
- MOVD g_stkbarPos(g), R5
- MOVD $stkbar__size, R6
- MULLD R5, R6
- ADD R4, R6
- MOVD stkbar_savedLRVal(R6), R6
- // Record that this stack barrier was hit.
- ADD $1, R5
- MOVD R5, g_stkbarPos(g)
- // Jump to the original return PC.
- MOVD R6, CTR
- BR (CTR)
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -734,30 +716,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
MOVD FIXED_FRAME+8(R1), R3 // LR saved by caller
- MOVD runtime·stackBarrierPC(SB), R4
- CMP R3, R4
- BNE nobar
- // Get original return PC.
- BL runtime·nextBarrierPC(SB)
- MOVD FIXED_FRAME+0(R1), R3
-nobar:
MOVD R3, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
- MOVD pc+8(FP), R3
- MOVD FIXED_FRAME+8(R1), R4
- MOVD runtime·stackBarrierPC(SB), R5
- CMP R4, R5
- BEQ setbar
- MOVD R3, FIXED_FRAME+8(R1) // set LR in caller
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVD R3, FIXED_FRAME+0(R1)
- BL runtime·setNextBarrierPC(SB)
- RET
-
TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
MOVW (R0), R0
UNDEF
@@ -1152,53 +1113,183 @@
MOVBZ R3,ret+48(FP)
RET
-TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
- MOVD s+0(FP), R3
- MOVD s_len+8(FP), R4
- MOVBZ c+24(FP), R5 // byte to find
- MOVD R3, R6 // store base for later
- SUB $1, R3
- ADD R3, R4 // end-1
+TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
+ MOVD s+0(FP), R3 // R3 = byte array pointer
+ MOVD s_len+8(FP), R4 // R4 = length
+ MOVBZ c+24(FP), R5 // R5 = byte
+ MOVD $ret+32(FP), R14 // R14 = &ret
+ BR runtime·indexbytebody<>(SB)
+TEXT strings·IndexByte(SB),NOSPLIT|NOFRAME,$0-32
+ MOVD s+0(FP), R3 // R3 = string
+ MOVD s_len+8(FP), R4 // R4 = length
+ MOVBZ c+16(FP), R5 // R5 = byte
+ MOVD $ret+24(FP), R14 // R14 = &ret
+ BR runtime·indexbytebody<>(SB)
+
+TEXT runtime·indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
+ DCBT (R3) // Prepare cache line.
+ MOVD R3,R10 // Save base address for calculating the index later.
+ RLDICR $0,R3,$60,R8 // Align address to doubleword boundary in R8.
+ RLDIMI $8,R5,$48,R5 // Replicating the byte across the register.
+
+ // Calculate last acceptable address and check for possible overflow
+ // using a saturated add.
+ // Overflows set last acceptable address to 0xffffffffffffffff.
+ ADD R4,R3,R7
+ SUBC R3,R7,R6
+ SUBE R0,R0,R9
+ MOVW R9,R6
+ OR R6,R7,R7
+
+ RLDIMI $16,R5,$32,R5
+ CMPU R4,$32 // Check if it's a small string (<32 bytes). Those will be processed differently.
+ MOVD $-1,R9
+ WORD $0x54661EB8 // Calculate padding in R6 (rlwinm r6,r3,3,26,28).
+ RLDIMI $32,R5,$0,R5
+ ADD $-1,R7,R7
+#ifdef GOARCH_ppc64le
+ SLD R6,R9,R9 // Prepare mask for Little Endian
+#else
+ SRD R6,R9,R9 // Same for Big Endian
+#endif
+ BLE small_string // Jump to the small string case if it's <32 bytes.
+
+ // Case for length >32 bytes
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base
+ RLDICL $0,R7,$61,R4 // length-1
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation
+ BNE CR7,done
+
+ // Check for doubleword alignment and jump to the loop setup if aligned.
+ MOVFL R8,CR7
+ BC 12,28,loop_setup
+
+ // Not aligned, so handle the second doubleword
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR7
+ BNE CR7,done
+
+loop_setup:
+ // We are now aligned to a 16-byte boundary. We will load two doublewords
+ // per loop iteration. The last doubleword is in R7, so our loop counter
+ // starts at (R7-R8)/16.
+ SUB R8,R7,R6
+ SRD $4,R6,R6
+ MOVD R6,CTR
+
+ // Note: when we have an align directive, align this loop to 32 bytes so
+ // it fits in a single icache sector.
loop:
- CMP R3, R4
- BEQ notfound
- MOVBZU 1(R3), R7
- CMP R7, R5
- BNE loop
+ // Load two doublewords, then compare and merge in a single register. We
+ // will check two doublewords per iteration, then find out which of them
+ // contains the byte later. This speeds up the search.
+ MOVD 8(R8),R12
+ MOVDU 16(R8),R11
+ CMPB R12,R5,R3
+ CMPB R11,R5,R9
+ OR R3,R9,R6
+ CMPU R6,$0,CR7
+ BNE CR7,found
+ BC 16,0,loop
- SUB R6, R3 // remove base
- MOVD R3, ret+32(FP)
- RET
+ // Counter zeroed, but we may have another doubleword to read
+ CMPU R8,R7
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ BNE CR6,done
notfound:
- MOVD $-1, R3
- MOVD R3, ret+32(FP)
+ MOVD $-1,R3
+ MOVD R3,(R14)
RET
-TEXT strings·IndexByte(SB),NOSPLIT,$0-32
- MOVD p+0(FP), R3
- MOVD b_len+8(FP), R4
- MOVBZ c+16(FP), R5 // byte to find
- MOVD R3, R6 // store base for later
- SUB $1, R3
- ADD R3, R4 // end-1
+found:
+ // One of the doublewords from the loop contains the byte we are looking
+ // for. Check the first doubleword and adjust the address if found.
+ CMPU R3,$0,CR6
+ ADD $-8,R8,R8
+ BNE CR6,done
-loop:
- CMP R3, R4
+ // Not found, so it must be in the second doubleword of the merged pair.
+ MOVD R9,R3
+ ADD $8,R8,R8
+
+done:
+ // At this point, R3 has 0xFF in the same position as the byte we are
+ // looking for in the doubleword. Use that to calculate the exact index
+ // of the byte.
+#ifdef GOARCH_ppc64le
+ ADD $-1,R3,R11
+ ANDN R3,R11,R11
+ POPCNTD R11,R11 // Count trailing zeros (Little Endian).
+#else
+ CNTLZD R3,R11 // Count leading zeros (Big Endian).
+#endif
+ CMPU R8,R7 // Check if we are at the last doubleword.
+ SRD $3,R11 // Convert trailing zeros to bytes.
+ ADD R11,R8,R3
+ CMPU R11,R4,CR7 // If at the last doubleword, check the byte offset.
+ BNE return
+ BLE CR7,return
+ MOVD $-1,R3
+ MOVD R3,(R14)
+ RET
+
+return:
+ SUB R10,R3 // Calculate index.
+ MOVD R3,(R14)
+ RET
+
+small_string:
+ // We unroll this loop for better performance.
+ CMPU R4,$0 // Check for length=0
BEQ notfound
- MOVBZU 1(R3), R7
- CMP R7, R5
- BNE loop
- SUB R6, R3 // remove base
- MOVD R3, ret+24(FP)
- RET
+ MOVD 0(R8),R12 // Load one doubleword from the aligned address in R8.
+ CMPB R12,R5,R3 // Check for a match.
+ AND R9,R3,R3 // Mask bytes below s_base.
+ CMPU R3,$0,CR7 // If we have a match, jump to the final computation.
+ RLDICL $0,R7,$61,R4 // length-1
+ RLDICR $0,R7,$60,R7 // Last doubleword in R7.
+ CMPU R8,R7
+ BNE CR7,done
+ BEQ notfound // Hit length.
-notfound:
- MOVD $-1, R3
- MOVD R3, ret+24(FP)
- RET
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BEQ notfound
+
+ MOVDU 8(R8),R12
+ CMPB R12,R5,R3
+ CMPU R3,$0,CR6
+ CMPU R8,R7
+ BNE CR6,done
+ BR notfound
TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
MOVD s1_base+0(FP), R5
@@ -1224,17 +1315,6 @@
BR cmpbodyBE<>(SB)
#endif
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- MOVD g_m(g), R4
- MOVWZ m_fastrand(R4), R3
- ADD R3, R3
- CMPW R3, $0
- BGE 2(PC)
- XOR $0x88888eef, R3
- MOVW R3, m_fastrand(R4)
- MOVW R3, ret+0(FP)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVW $0, R3
RET
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index c2212a5..20e740b 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -298,23 +298,6 @@
MOVD $0, R12
BR runtime·morestack(SB)
-TEXT runtime·stackBarrier(SB),NOSPLIT,$0
- // We came here via a RET to an overwritten LR.
- // R3 may be live. Other registers are available.
-
- // Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
- MOVD (g_stkbar+slice_array)(g), R4
- MOVD g_stkbarPos(g), R5
- MOVD $stkbar__size, R6
- MULLD R5, R6
- ADD R4, R6
- MOVD stkbar_savedLRVal(R6), R6
- // Record that this stack barrier was hit.
- ADD $1, R5
- MOVD R5, g_stkbarPos(g)
- // Jump to the original return PC.
- BR (R6)
-
// reflectcall: call a function with the given argument list
// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
// we don't have variable-sized frames, so we use a small number
@@ -675,28 +658,9 @@
TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
MOVD 16(R15), R3 // LR saved by caller
- MOVD runtime·stackBarrierPC(SB), R4
- CMPBNE R3, R4, nobar
- // Get original return PC.
- BL runtime·nextBarrierPC(SB)
- MOVD 8(R15), R3
-nobar:
MOVD R3, ret+8(FP)
RET
-TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
- MOVD pc+8(FP), R3
- MOVD 16(R15), R4
- MOVD runtime·stackBarrierPC(SB), R5
- CMPBEQ R4, R5, setbar
- MOVD R3, 16(R15) // set LR in caller
- RET
-setbar:
- // Set the stack barrier return PC.
- MOVD R3, 8(R15)
- BL runtime·setNextBarrierPC(SB)
- RET
-
TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
MOVW (R0), R0
UNDEF
@@ -851,17 +815,6 @@
CLC $1, 0(R3), 0(R5)
RET
-TEXT runtime·fastrand(SB), NOSPLIT, $0-4
- MOVD g_m(g), R4
- MOVWZ m_fastrand(R4), R3
- ADD R3, R3
- CMPW R3, $0
- BGE 2(PC)
- XOR $0x88888eef, R3
- MOVW R3, m_fastrand(R4)
- MOVW R3, ret+0(FP)
- RET
-
TEXT bytes·IndexByte(SB),NOSPLIT|NOFRAME,$0-40
MOVD s+0(FP), R3 // s => R3
MOVD s_len+8(FP), R4 // s_len => R4
diff --git a/src/runtime/cgo.go b/src/runtime/cgo.go
index 9cf7b58..395d54a 100644
--- a/src/runtime/cgo.go
+++ b/src/runtime/cgo.go
@@ -16,6 +16,7 @@
//go:linkname _cgo_notify_runtime_init_done _cgo_notify_runtime_init_done
//go:linkname _cgo_callers _cgo_callers
//go:linkname _cgo_set_context_function _cgo_set_context_function
+//go:linkname _cgo_yield _cgo_yield
var (
_cgo_init unsafe.Pointer
@@ -24,6 +25,7 @@
_cgo_notify_runtime_init_done unsafe.Pointer
_cgo_callers unsafe.Pointer
_cgo_set_context_function unsafe.Pointer
+ _cgo_yield unsafe.Pointer
)
// iscgo is set to true by the runtime/cgo package
@@ -48,3 +50,5 @@
// so it emits the test and keeps the call, giving the desired
// escape analysis result. The test is cheaper than the call.
var cgoAlwaysFalse bool
+
+var cgo_yield = &_cgo_yield
diff --git a/src/runtime/cgo/asm_ppc64x.s b/src/runtime/cgo/asm_ppc64x.s
index dded1be..1cf27dd 100644
--- a/src/runtime/cgo/asm_ppc64x.s
+++ b/src/runtime/cgo/asm_ppc64x.s
@@ -16,6 +16,8 @@
// Start with standard C stack frame layout and linkage
MOVD LR, R0
MOVD R0, 16(R1) // Save LR in caller's frame
+ MOVW CR, R0 // Save CR in caller's frame
+ MOVD R0, 8(R1)
MOVD R2, 24(R1) // Save TOC in caller's frame
BL saveregs2<>(SB)
@@ -38,6 +40,8 @@
BL restoreregs2<>(SB)
MOVD 24(R1), R2
+ MOVD 8(R1), R0
+ MOVFL R0, $0xff
MOVD 16(R1), R0
MOVD R0, LR
RET
diff --git a/src/runtime/cgo/callbacks.go b/src/runtime/cgo/callbacks.go
index 9bde5a9..8590aa3 100644
--- a/src/runtime/cgo/callbacks.go
+++ b/src/runtime/cgo/callbacks.go
@@ -92,5 +92,15 @@
var x_cgo_set_context_function byte
var _cgo_set_context_function = &x_cgo_set_context_function
+// Calls a libc function to execute background work injected via libc
+// interceptors, such as processing pending signals under the thread
+// sanitizer.
+//
+// Left as a nil pointer if no libc interceptors are expected.
+
+//go:cgo_import_static _cgo_yield
+//go:linkname _cgo_yield _cgo_yield
+var _cgo_yield unsafe.Pointer
+
//go:cgo_export_static _cgo_topofstack
//go:cgo_export_dynamic _cgo_topofstack
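The comment above explains what `_cgo_yield` is for; the runtime-side call sites live in hunks not shown here. A heavily hedged sketch of the expected call pattern, going through the `cgo_yield` indirection added to cgo.go earlier in this patch (the wrapper name is hypothetical):

```go
// maybeYieldToCgo is a hypothetical helper: if cgo installed a yield hook
// (e.g. under the thread sanitizer), call through it so libc interceptors
// can run their pending work, such as delivering intercepted signals.
func maybeYieldToCgo() {
	if *cgo_yield != nil {
		asmcgocall(*cgo_yield, nil)
	}
}
```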
diff --git a/src/runtime/cgo/gcc_android_arm.c b/src/runtime/cgo/gcc_android_arm.c
index c7b13f9..d8936ea 100644
--- a/src/runtime/cgo/gcc_android_arm.c
+++ b/src/runtime/cgo/gcc_android_arm.c
@@ -10,13 +10,6 @@
#define magic1 (0x23581321U)
-// PTHREAD_KEYS_MAX has been added to sys/limits.h at head in bionic:
-// https://android.googlesource.com/platform/bionic/+/master/libc/include/sys/limits.h
-// TODO(crawshaw): remove this definition when NDK r10d is required.
-#ifndef PTHREAD_KEYS_MAX
-#define PTHREAD_KEYS_MAX 128
-#endif
-
// inittls allocates a thread-local storage slot for g.
//
// It finds the first available slot using pthread_key_create and uses
@@ -32,7 +25,11 @@
fatalf("pthread_key_create failed: %d", err);
}
pthread_setspecific(k, (void*)magic1);
- for (i=0; i<PTHREAD_KEYS_MAX; i++) {
+ // If thread local slots are laid out as we expect, our magic word will
+ // be located at some low offset from tlsbase. However, just in case something went
+ // wrong, the search is limited to sensible offsets. PTHREAD_KEYS_MAX was the
+ // original limit, but issue 19472 made a higher limit necessary.
+ for (i=0; i<384; i++) {
if (*(tlsbase+i) == (void*)magic1) {
*tlsg = (void*)(i*sizeof(void *));
pthread_setspecific(k, 0);
diff --git a/src/runtime/cgo/gcc_android_arm64.c b/src/runtime/cgo/gcc_android_arm64.c
index f8ad684..499a11f 100644
--- a/src/runtime/cgo/gcc_android_arm64.c
+++ b/src/runtime/cgo/gcc_android_arm64.c
@@ -25,7 +25,11 @@
fatalf("pthread_key_create failed: %d", err);
}
pthread_setspecific(k, (void*)magic1);
- for (i=0; i<PTHREAD_KEYS_MAX; i++) {
+ // If thread local slots are laid out as we expect, our magic word will
+ // be located at some low offset from tlsbase. However, just in case something went
+ // wrong, the search is limited to sensible offsets. PTHREAD_KEYS_MAX was the
+ // original limit, but issue 19472 made a higher limit necessary.
+ for (i=0; i<384; i++) {
if (*(tlsbase+i) == (void*)magic1) {
*tlsg = (void*)(i*sizeof(void *));
pthread_setspecific(k, 0);
diff --git a/src/runtime/cgo/gcc_darwin_386.c b/src/runtime/cgo/gcc_darwin_386.c
index 83092db..4ab3267 100644
--- a/src/runtime/cgo/gcc_darwin_386.c
+++ b/src/runtime/cgo/gcc_darwin_386.c
@@ -122,7 +122,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_darwin_amd64.c b/src/runtime/cgo/gcc_darwin_amd64.c
index 93a6b8e..181d0ab 100644
--- a/src/runtime/cgo/gcc_darwin_amd64.c
+++ b/src/runtime/cgo/gcc_darwin_amd64.c
@@ -93,7 +93,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_darwin_arm.c b/src/runtime/cgo/gcc_darwin_arm.c
index b3f8046..e2f96e9 100644
--- a/src/runtime/cgo/gcc_darwin_arm.c
+++ b/src/runtime/cgo/gcc_darwin_arm.c
@@ -64,7 +64,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
@@ -107,30 +107,42 @@
return;
}
CFStringRef url_str_ref = CFURLGetString(url_ref);
- char url[MAXPATHLEN];
- if (!CFStringGetCString(url_str_ref, url, sizeof(url), kCFStringEncodingUTF8)) {
+ char buf[MAXPATHLEN];
+ Boolean res = CFStringGetCString(url_str_ref, buf, sizeof(buf), kCFStringEncodingUTF8);
+ CFRelease(url_ref);
+ if (!res) {
fprintf(stderr, "runtime/cgo: cannot get URL string\n");
return;
}
// url is of the form "file:///path/to/Info.plist".
// strip it down to the working directory "/path/to".
- int url_len = strlen(url);
+ int url_len = strlen(buf);
if (url_len < sizeof("file://")+sizeof("/Info.plist")) {
- fprintf(stderr, "runtime/cgo: bad URL: %s\n", url);
+ fprintf(stderr, "runtime/cgo: bad URL: %s\n", buf);
return;
}
- url[url_len-sizeof("/Info.plist")+1] = 0;
- char *dir = &url[0] + sizeof("file://")-1;
+ buf[url_len-sizeof("/Info.plist")+1] = 0;
+ char *dir = &buf[0] + sizeof("file://")-1;
if (chdir(dir) != 0) {
fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir);
}
- // No-op to set a breakpoint on, immediately after the real chdir.
- // Gives the test harness in go_darwin_arm_exec (which uses lldb) a
- // chance to move the working directory.
- getwd(dir);
+ // The test harness in go_darwin_arm_exec passes the relative working directory
+ // in the GoExecWrapperWorkingDirectory property of the app bundle.
+ CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory"));
+ if (wd_ref != NULL) {
+ if (!CFStringGetCString(wd_ref, buf, sizeof(buf), kCFStringEncodingUTF8)) {
+ fprintf(stderr, "runtime/cgo: cannot get GoExecWrapperWorkingDirectory string\n");
+ return;
+ }
+ if (chdir(buf) != 0) {
+ fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", buf);
+ }
+ // Notify the test harness that we're correctly set up
+ raise(SIGINT);
+ }
}
void
diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c
index 039dcc0..25106b7 100644
--- a/src/runtime/cgo/gcc_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_darwin_arm64.c
@@ -66,7 +66,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
@@ -109,30 +109,42 @@
return;
}
CFStringRef url_str_ref = CFURLGetString(url_ref);
- char url[MAXPATHLEN];
- if (!CFStringGetCString(url_str_ref, url, sizeof(url), kCFStringEncodingUTF8)) {
+ char buf[MAXPATHLEN];
+ Boolean res = CFStringGetCString(url_str_ref, buf, sizeof(buf), kCFStringEncodingUTF8);
+ CFRelease(url_ref);
+ if (!res) {
fprintf(stderr, "runtime/cgo: cannot get URL string\n");
return;
}
// url is of the form "file:///path/to/Info.plist".
// strip it down to the working directory "/path/to".
- int url_len = strlen(url);
+ int url_len = strlen(buf);
if (url_len < sizeof("file://")+sizeof("/Info.plist")) {
- fprintf(stderr, "runtime/cgo: bad URL: %s\n", url);
+ fprintf(stderr, "runtime/cgo: bad URL: %s\n", buf);
return;
}
- url[url_len-sizeof("/Info.plist")+1] = 0;
- char *dir = &url[0] + sizeof("file://")-1;
+ buf[url_len-sizeof("/Info.plist")+1] = 0;
+ char *dir = &buf[0] + sizeof("file://")-1;
if (chdir(dir) != 0) {
fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir);
}
- // No-op to set a breakpoint on, immediately after the real chdir.
- // Gives the test harness in go_darwin_arm_exec (which uses lldb) a
- // chance to move the working directory.
- getwd(dir);
+ // The test harness in go_darwin_arm_exec passes the relative working directory
+ // in the GoExecWrapperWorkingDirectory property of the app bundle.
+ CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory"));
+ if (wd_ref != NULL) {
+ if (!CFStringGetCString(wd_ref, buf, sizeof(buf), kCFStringEncodingUTF8)) {
+ fprintf(stderr, "runtime/cgo: cannot get GoExecWrapperWorkingDirectory string\n");
+ return;
+ }
+ if (chdir(buf) != 0) {
+ fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", buf);
+ }
+ // Notify the test harness that we're correctly set up
+ raise(SIGINT);
+ }
}
void
diff --git a/src/runtime/cgo/gcc_dragonfly_amd64.c b/src/runtime/cgo/gcc_dragonfly_amd64.c
index bdfbf6b..d25db91 100644
--- a/src/runtime/cgo/gcc_dragonfly_amd64.c
+++ b/src/runtime/cgo/gcc_dragonfly_amd64.c
@@ -41,7 +41,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_freebsd_386.c b/src/runtime/cgo/gcc_freebsd_386.c
index c6d4f25..9097a2a 100644
--- a/src/runtime/cgo/gcc_freebsd_386.c
+++ b/src/runtime/cgo/gcc_freebsd_386.c
@@ -41,7 +41,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_freebsd_amd64.c b/src/runtime/cgo/gcc_freebsd_amd64.c
index bdfbf6b..d25db91 100644
--- a/src/runtime/cgo/gcc_freebsd_amd64.c
+++ b/src/runtime/cgo/gcc_freebsd_amd64.c
@@ -41,7 +41,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_freebsd_arm.c b/src/runtime/cgo/gcc_freebsd_arm.c
index 746ca89..74f2e0e 100644
--- a/src/runtime/cgo/gcc_freebsd_arm.c
+++ b/src/runtime/cgo/gcc_freebsd_arm.c
@@ -57,7 +57,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_libinit.c b/src/runtime/cgo/gcc_libinit.c
index f6fbaa3..31594ad 100644
--- a/src/runtime/cgo/gcc_libinit.c
+++ b/src/runtime/cgo/gcc_libinit.c
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
// +build cgo
-// +build darwin dragonfly freebsd linux netbsd solaris
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
#include <pthread.h>
#include <errno.h>
diff --git a/src/runtime/cgo/gcc_libinit_openbsd.c b/src/runtime/cgo/gcc_libinit_openbsd.c
deleted file mode 100644
index c8308e5..0000000
--- a/src/runtime/cgo/gcc_libinit_openbsd.c
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include <sys/types.h>
-#include <errno.h>
-#include <pthread.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include "libcgo.h"
-
-// The context function, used when tracing back C calls into Go.
-static void (*cgo_context_function)(struct context_arg*);
-
-void
-x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
- fprintf(stderr, "x_cgo_sys_thread_create not implemented");
- abort();
-}
-
-uintptr_t
-_cgo_wait_runtime_init_done() {
- void (*pfn)(struct context_arg*);
-
- // TODO(spetrovic): implement this method.
-
- pfn = _cgo_get_context_function();
- if (pfn != nil) {
- struct context_arg arg;
-
- arg.Context = 0;
- (*pfn)(&arg);
- return arg.Context;
- }
- return 0;
-}
-
-void
-x_cgo_notify_runtime_init_done(void* dummy) {
- // TODO(spetrovic): implement this method.
-}
-
-// Sets the context function to call to record the traceback context
-// when calling a Go function from C code. Called from runtime.SetCgoTraceback.
-void x_cgo_set_context_function(void (*context)(struct context_arg*)) {
- // TODO(iant): Needs synchronization.
- cgo_context_function = context;
-}
-
-// Gets the context function.
-void (*(_cgo_get_context_function(void)))(struct context_arg*) {
- return cgo_context_function;
-}
-
-// _cgo_try_pthread_create retries sys_pthread_create if it fails with
-// EAGAIN.
-int
-_cgo_openbsd_try_pthread_create(int (*sys_pthread_create)(pthread_t*, const pthread_attr_t*, void* (*)(void*), void*),
- pthread_t* thread, const pthread_attr_t* attr, void* (*pfn)(void*), void* arg) {
- int tries;
- int err;
- struct timespec ts;
-
- for (tries = 0; tries < 100; tries++) {
- err = sys_pthread_create(thread, attr, pfn, arg);
- if (err != EAGAIN) {
- return err;
- }
- ts.tv_sec = 0;
- ts.tv_nsec = (tries + 1) * 1000 * 1000; // Milliseconds.
- nanosleep(&ts, nil);
- }
- return EAGAIN;
-}
diff --git a/src/runtime/cgo/gcc_linux_386.c b/src/runtime/cgo/gcc_linux_386.c
index 457a2c7..6be4569 100644
--- a/src/runtime/cgo/gcc_linux_386.c
+++ b/src/runtime/cgo/gcc_linux_386.c
@@ -52,7 +52,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_amd64.c b/src/runtime/cgo/gcc_linux_amd64.c
index 5d8ff10..42008c3 100644
--- a/src/runtime/cgo/gcc_linux_amd64.c
+++ b/src/runtime/cgo/gcc_linux_amd64.c
@@ -69,7 +69,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_arm.c b/src/runtime/cgo/gcc_linux_arm.c
index 31ced5e..870a8a4 100644
--- a/src/runtime/cgo/gcc_linux_arm.c
+++ b/src/runtime/cgo/gcc_linux_arm.c
@@ -32,7 +32,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_arm64.c b/src/runtime/cgo/gcc_linux_arm64.c
index 35b8e27..b328407 100644
--- a/src/runtime/cgo/gcc_linux_arm64.c
+++ b/src/runtime/cgo/gcc_linux_arm64.c
@@ -32,7 +32,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_mips64x.c b/src/runtime/cgo/gcc_linux_mips64x.c
index e0ce08f..afcd323 100644
--- a/src/runtime/cgo/gcc_linux_mips64x.c
+++ b/src/runtime/cgo/gcc_linux_mips64x.c
@@ -36,7 +36,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_mipsx.c b/src/runtime/cgo/gcc_linux_mipsx.c
index 7ed9d87..2a5f64a 100644
--- a/src/runtime/cgo/gcc_linux_mipsx.c
+++ b/src/runtime/cgo/gcc_linux_mipsx.c
@@ -36,7 +36,7 @@
pthread_attr_init(&attr);
size = 0;
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_ppc64x.c b/src/runtime/cgo/gcc_linux_ppc64x.c
index fcf77cf..9cb6e0c 100644
--- a/src/runtime/cgo/gcc_linux_ppc64x.c
+++ b/src/runtime/cgo/gcc_linux_ppc64x.c
@@ -42,7 +42,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_linux_s390x.c b/src/runtime/cgo/gcc_linux_s390x.c
index cdc9c23..bb60048 100644
--- a/src/runtime/cgo/gcc_linux_s390x.c
+++ b/src/runtime/cgo/gcc_linux_s390x.c
@@ -40,7 +40,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_mmap.c b/src/runtime/cgo/gcc_mmap.c
index 088bcb2..29acd3c 100644
--- a/src/runtime/cgo/gcc_mmap.c
+++ b/src/runtime/cgo/gcc_mmap.c
@@ -6,6 +6,7 @@
#include <errno.h>
#include <stdint.h>
+#include <stdlib.h>
#include <sys/mman.h>
#include "libcgo.h"
@@ -23,3 +24,16 @@
}
return p;
}
+
+void
+x_cgo_munmap(void *addr, uintptr_t length) {
+ int r;
+
+ _cgo_tsan_acquire();
+ r = munmap(addr, length);
+ _cgo_tsan_release();
+ if (r < 0) {
+ /* The Go runtime is not prepared for munmap to fail. */
+ abort();
+ }
+}
diff --git a/src/runtime/cgo/gcc_netbsd_386.c b/src/runtime/cgo/gcc_netbsd_386.c
index fb317c1..5495f0f 100644
--- a/src/runtime/cgo/gcc_netbsd_386.c
+++ b/src/runtime/cgo/gcc_netbsd_386.c
@@ -40,7 +40,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_netbsd_amd64.c b/src/runtime/cgo/gcc_netbsd_amd64.c
index 77a553f..dc966fc 100644
--- a/src/runtime/cgo/gcc_netbsd_amd64.c
+++ b/src/runtime/cgo/gcc_netbsd_amd64.c
@@ -41,7 +41,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_netbsd_arm.c b/src/runtime/cgo/gcc_netbsd_arm.c
index 672f49c..b0c80ea 100644
--- a/src/runtime/cgo/gcc_netbsd_arm.c
+++ b/src/runtime/cgo/gcc_netbsd_arm.c
@@ -41,7 +41,7 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_openbsd_386.c b/src/runtime/cgo/gcc_openbsd_386.c
index 0cac047..127a1b6 100644
--- a/src/runtime/cgo/gcc_openbsd_386.c
+++ b/src/runtime/cgo/gcc_openbsd_386.c
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
#include <sys/types.h>
-#include <dlfcn.h>
-#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <string.h>
@@ -14,125 +12,6 @@
static void* threadentry(void*);
static void (*setg_gcc)(void*);
-// TCB_SIZE is sizeof(struct thread_control_block), as defined in
-// /usr/src/lib/librthread/tcb.h on OpenBSD 5.9 and earlier.
-#define TCB_SIZE (4 * sizeof(void *))
-
-// TIB_SIZE is sizeof(struct tib), as defined in
-// /usr/include/tib.h on OpenBSD 6.0 and later.
-#define TIB_SIZE (4 * sizeof(void *) + 6 * sizeof(int))
-
-// TLS_SIZE is the size of TLS needed for Go.
-#define TLS_SIZE (2 * sizeof(void *))
-
-void *__get_tcb(void);
-void __set_tcb(void *);
-
-static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,
- void *(*start_routine)(void *), void *arg);
-
-struct thread_args {
- void *(*func)(void *);
- void *arg;
-};
-
-static int has_tib = 0;
-
-static void
-tcb_fixup(int mainthread)
-{
- void *tls, *newtcb, *oldtcb;
- size_t tls_size, tcb_size;
-
- // TODO(jsing): Remove once OpenBSD 6.1 is released and OpenBSD 5.9 is
- // no longer supported.
-
- // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result,
- // we need to allocate our own TLS space while preserving the existing
- // TCB or TIB that has been setup via librthread.
-
- tcb_size = has_tib ? TIB_SIZE : TCB_SIZE;
- tls_size = TLS_SIZE + tcb_size;
- tls = malloc(tls_size);
- if(tls == NULL)
- abort();
-
- // The signal trampoline expects the TLS slots to be zeroed.
- bzero(tls, TLS_SIZE);
-
- oldtcb = __get_tcb();
- newtcb = tls + TLS_SIZE;
- bcopy(oldtcb, newtcb, tcb_size);
- if(has_tib) {
- // Fix up self pointer.
- *(uintptr_t *)(newtcb) = (uintptr_t)newtcb;
- }
- __set_tcb(newtcb);
-
- // NOTE(jsing, minux): we can't free oldtcb without causing double-free
- // problem. so newtcb will be memory leaks. Get rid of this when OpenBSD
- // has proper support for PT_TLS.
-}
-
-static void *
-thread_start_wrapper(void *arg)
-{
- struct thread_args args = *(struct thread_args *)arg;
-
- free(arg);
- tcb_fixup(0);
-
- return args.func(args.arg);
-}
-
-static void init_pthread_wrapper(void) {
- void *handle;
-
- // Locate symbol for the system pthread_create function.
- handle = dlopen("libpthread.so", RTLD_LAZY);
- if(handle == NULL) {
- fprintf(stderr, "runtime/cgo: dlopen failed to load libpthread: %s\n", dlerror());
- abort();
- }
- sys_pthread_create = dlsym(handle, "pthread_create");
- if(sys_pthread_create == NULL) {
- fprintf(stderr, "runtime/cgo: dlsym failed to find pthread_create: %s\n", dlerror());
- abort();
- }
- // _rthread_init is hidden in OpenBSD librthread that has TIB.
- if(dlsym(handle, "_rthread_init") == NULL) {
- has_tib = 1;
- }
- dlclose(handle);
-}
-
-static pthread_once_t init_pthread_wrapper_once = PTHREAD_ONCE_INIT;
-
-int
-pthread_create(pthread_t *thread, const pthread_attr_t *attr,
- void *(*start_routine)(void *), void *arg)
-{
- struct thread_args *p;
-
- // we must initialize our wrapper in pthread_create, because it is valid to call
- // pthread_create in a static constructor, and in fact, our test for issue 9456
- // does just that.
- if(pthread_once(&init_pthread_wrapper_once, init_pthread_wrapper) != 0) {
- fprintf(stderr, "runtime/cgo: failed to initialize pthread_create wrapper\n");
- abort();
- }
-
- p = malloc(sizeof(*p));
- if(p == NULL) {
- errno = ENOMEM;
- return -1;
- }
- p->func = start_routine;
- p->arg = arg;
-
- return sys_pthread_create(thread, attr, thread_start_wrapper, p);
-}
-
void
x_cgo_init(G *g, void (*setg)(void*))
{
@@ -144,16 +23,8 @@
pthread_attr_getstacksize(&attr, &size);
g->stacklo = (uintptr)&attr - size + 4096;
pthread_attr_destroy(&attr);
-
- if(pthread_once(&init_pthread_wrapper_once, init_pthread_wrapper) != 0) {
- fprintf(stderr, "runtime/cgo: failed to initialize pthread_create wrapper\n");
- abort();
- }
-
- tcb_fixup(1);
}
-
void
_cgo_sys_thread_start(ThreadStart *ts)
{
@@ -169,9 +40,9 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
- err = _cgo_openbsd_try_pthread_create(sys_pthread_create, &p, &attr, threadentry, ts);
+ err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
pthread_sigmask(SIG_SETMASK, &oset, nil);
@@ -186,8 +57,6 @@
{
ThreadStart ts;
- tcb_fixup(0);
-
ts = *(ThreadStart*)v;
free(v);
diff --git a/src/runtime/cgo/gcc_openbsd_amd64.c b/src/runtime/cgo/gcc_openbsd_amd64.c
index 86a9185..34319fb 100644
--- a/src/runtime/cgo/gcc_openbsd_amd64.c
+++ b/src/runtime/cgo/gcc_openbsd_amd64.c
@@ -3,8 +3,6 @@
// license that can be found in the LICENSE file.
#include <sys/types.h>
-#include <dlfcn.h>
-#include <errno.h>
#include <pthread.h>
#include <signal.h>
#include <string.h>
@@ -14,125 +12,6 @@
static void* threadentry(void*);
static void (*setg_gcc)(void*);
-// TCB_SIZE is sizeof(struct thread_control_block), as defined in
-// /usr/src/lib/librthread/tcb.h on OpenBSD 5.9 and earlier.
-#define TCB_SIZE (4 * sizeof(void *))
-
-// TIB_SIZE is sizeof(struct tib), as defined in
-// /usr/include/tib.h on OpenBSD 6.0 and later.
-#define TIB_SIZE (4 * sizeof(void *) + 6 * sizeof(int))
-
-// TLS_SIZE is the size of TLS needed for Go.
-#define TLS_SIZE (2 * sizeof(void *))
-
-void *__get_tcb(void);
-void __set_tcb(void *);
-
-static int (*sys_pthread_create)(pthread_t *thread, const pthread_attr_t *attr,
- void *(*start_routine)(void *), void *arg);
-
-struct thread_args {
- void *(*func)(void *);
- void *arg;
-};
-
-static int has_tib = 0;
-
-static void
-tcb_fixup(int mainthread)
-{
- void *tls, *newtcb, *oldtcb;
- size_t tls_size, tcb_size;
-
- // TODO(jsing): Remove once OpenBSD 6.1 is released and OpenBSD 5.9 is
- // no longer supported.
-
- // The OpenBSD ld.so(1) does not currently support PT_TLS. As a result,
- // we need to allocate our own TLS space while preserving the existing
- // TCB or TIB that has been setup via librthread.
-
- tcb_size = has_tib ? TIB_SIZE : TCB_SIZE;
- tls_size = TLS_SIZE + tcb_size;
- tls = malloc(tls_size);
- if(tls == NULL)
- abort();
-
- // The signal trampoline expects the TLS slots to be zeroed.
- bzero(tls, TLS_SIZE);
-
- oldtcb = __get_tcb();
- newtcb = tls + TLS_SIZE;
- bcopy(oldtcb, newtcb, tcb_size);
- if(has_tib) {
- // Fix up self pointer.
- *(uintptr_t *)(newtcb) = (uintptr_t)newtcb;
- }
- __set_tcb(newtcb);
-
- // NOTE(jsing, minux): we can't free oldtcb without causing double-free
- // problem. so newtcb will be memory leaks. Get rid of this when OpenBSD
- // has proper support for PT_TLS.
-}
-
-static void *
-thread_start_wrapper(void *arg)
-{
- struct thread_args args = *(struct thread_args *)arg;
-
- free(arg);
- tcb_fixup(0);
-
- return args.func(args.arg);
-}
-
-static void init_pthread_wrapper(void) {
- void *handle;
-
- // Locate symbol for the system pthread_create function.
- handle = dlopen("libpthread.so", RTLD_LAZY);
- if(handle == NULL) {
- fprintf(stderr, "runtime/cgo: dlopen failed to load libpthread: %s\n", dlerror());
- abort();
- }
- sys_pthread_create = dlsym(handle, "pthread_create");
- if(sys_pthread_create == NULL) {
- fprintf(stderr, "runtime/cgo: dlsym failed to find pthread_create: %s\n", dlerror());
- abort();
- }
- // _rthread_init is hidden in OpenBSD librthread that has TIB.
- if(dlsym(handle, "_rthread_init") == NULL) {
- has_tib = 1;
- }
- dlclose(handle);
-}
-
-static pthread_once_t init_pthread_wrapper_once = PTHREAD_ONCE_INIT;
-
-int
-pthread_create(pthread_t *thread, const pthread_attr_t *attr,
- void *(*start_routine)(void *), void *arg)
-{
- struct thread_args *p;
-
- // we must initialize our wrapper in pthread_create, because it is valid to call
- // pthread_create in a static constructor, and in fact, our test for issue 9456
- // does just that.
- if(pthread_once(&init_pthread_wrapper_once, init_pthread_wrapper) != 0) {
- fprintf(stderr, "runtime/cgo: failed to initialize pthread_create wrapper\n");
- abort();
- }
-
- p = malloc(sizeof(*p));
- if(p == NULL) {
- errno = ENOMEM;
- return -1;
- }
- p->func = start_routine;
- p->arg = arg;
-
- return sys_pthread_create(thread, attr, thread_start_wrapper, p);
-}
-
void
x_cgo_init(G *g, void (*setg)(void*))
{
@@ -144,16 +23,8 @@
pthread_attr_getstacksize(&attr, &size);
g->stacklo = (uintptr)&attr - size + 4096;
pthread_attr_destroy(&attr);
-
- if(pthread_once(&init_pthread_wrapper_once, init_pthread_wrapper) != 0) {
- fprintf(stderr, "runtime/cgo: failed to initialize pthread_create wrapper\n");
- abort();
- }
-
- tcb_fixup(1);
}
-
void
_cgo_sys_thread_start(ThreadStart *ts)
{
@@ -169,9 +40,9 @@
pthread_attr_init(&attr);
pthread_attr_getstacksize(&attr, &size);
- // Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+ // Leave stacklo=0 and set stackhi=size; mstart will do the rest.
ts->g->stackhi = size;
- err = _cgo_openbsd_try_pthread_create(sys_pthread_create, &p, &attr, threadentry, ts);
+ err = _cgo_try_pthread_create(&p, &attr, threadentry, ts);
pthread_sigmask(SIG_SETMASK, &oset, nil);
@@ -186,8 +57,6 @@
{
ThreadStart ts;
- tcb_fixup(0);
-
ts = *(ThreadStart*)v;
free(v);
diff --git a/src/runtime/cgo/gcc_sigaction.c b/src/runtime/cgo/gcc_sigaction.c
index 5aca271..72fb08d 100644
--- a/src/runtime/cgo/gcc_sigaction.c
+++ b/src/runtime/cgo/gcc_sigaction.c
@@ -10,6 +10,8 @@
#include <string.h>
#include <signal.h>
+#include "libcgo.h"
+
// go_sigaction_t is a C version of the sigactiont struct from
// defs_linux_amd64.go. This definition — and its conversion to and from struct
// sigaction — are specific to linux/amd64.
@@ -33,6 +35,8 @@
struct sigaction oldact;
int i;
+ _cgo_tsan_acquire();
+
memset(&act, 0, sizeof act);
memset(&oldact, 0, sizeof oldact);
@@ -53,7 +57,8 @@
ret = sigaction(signum, goact ? &act : NULL, oldgoact ? &oldact : NULL);
if (ret == -1) {
- /* This is what the Go code expects on failure. */
+ // runtime.rt_sigaction expects _cgo_sigaction to return errno on error.
+ _cgo_tsan_release();
return errno;
}
@@ -65,12 +70,13 @@
}
oldgoact->mask = 0;
for (i = 0; i < 8 * sizeof(oldgoact->mask); i++) {
- if (sigismember(&act.sa_mask, i+1) == 1) {
+ if (sigismember(&oldact.sa_mask, i+1) == 1) {
oldgoact->mask |= (uint64_t)(1)<<i;
}
}
- oldgoact->flags = act.sa_flags;
+ oldgoact->flags = oldact.sa_flags;
}
+ _cgo_tsan_release();
return ret;
}
diff --git a/src/runtime/cgo/gcc_util.c b/src/runtime/cgo/gcc_util.c
index 99af021..2d5382a 100644
--- a/src/runtime/cgo/gcc_util.c
+++ b/src/runtime/cgo/gcc_util.c
@@ -22,3 +22,39 @@
_cgo_sys_thread_start(ts); /* OS-dependent half */
}
+
+#ifndef CGO_TSAN
+void(* const _cgo_yield)() = NULL;
+#else
+
+#include <string.h>
+
+/*
+Stub for allowing libc interceptors to execute.
+
+_cgo_yield is set to NULL if we do not expect libc interceptors to exist.
+*/
+static void
+x_cgo_yield()
+{
+ /*
+ The libc function(s) we call here must form a no-op and include at least one
+ call that triggers TSAN to process pending asynchronous signals.
+
+ sleep(0) would be fine, but it's not portable C (so it would need more header
+ guards).
+ free(NULL) has a fast-path special case in TSAN, so it doesn't
+ trigger signal delivery.
+ free(malloc(0)) would work (triggering the interceptors in malloc), but
+ it also runs a bunch of user-supplied malloc hooks.
+
+ So we choose strncpy(_, _, 0): it requires an extra header,
+ but it's standard and should be very efficient.
+ */
+ char nothing = 0;
+ strncpy(&nothing, &nothing, 0);
+}
+
+void(* const _cgo_yield)() = &x_cgo_yield;
+
+#endif /* CGO_TSAN */
diff --git a/src/runtime/cgo/libcgo.h b/src/runtime/cgo/libcgo.h
index 01f9e72..2b8b4e2 100644
--- a/src/runtime/cgo/libcgo.h
+++ b/src/runtime/cgo/libcgo.h
@@ -111,6 +111,11 @@
#ifdef CGO_TSAN
// These must match the definitions in yesTsanProlog in cmd/cgo/out.go.
+// In general we should call _cgo_tsan_acquire when we enter C code,
+// and call _cgo_tsan_release when we return to Go code.
+// This is only necessary when calling code that might be instrumented
+// by TSAN, which mostly means system library calls that TSAN intercepts.
+// See the comment in cmd/cgo/out.go for more details.
long long _cgo_sync __attribute__ ((common));
diff --git a/src/runtime/cgo/mmap.go b/src/runtime/cgo/mmap.go
index ff98359..ad5f6df 100644
--- a/src/runtime/cgo/mmap.go
+++ b/src/runtime/cgo/mmap.go
@@ -15,8 +15,17 @@
// C/C++ code; this permits that code to see the Go code as normal
// program addresses that have been initialized.
+// To support interceptors that look for both mmap and munmap,
+// also call the C library for munmap.
+
//go:cgo_import_static x_cgo_mmap
//go:linkname x_cgo_mmap x_cgo_mmap
//go:linkname _cgo_mmap _cgo_mmap
var x_cgo_mmap byte
var _cgo_mmap = &x_cgo_mmap
+
+//go:cgo_import_static x_cgo_munmap
+//go:linkname x_cgo_munmap x_cgo_munmap
+//go:linkname _cgo_munmap _cgo_munmap
+var x_cgo_munmap byte
+var _cgo_munmap = &x_cgo_munmap
diff --git a/src/runtime/cgo/openbsd.go b/src/runtime/cgo/openbsd.go
index 5c70dbd..81c73bf 100644
--- a/src/runtime/cgo/openbsd.go
+++ b/src/runtime/cgo/openbsd.go
@@ -8,24 +8,13 @@
import _ "unsafe" // for go:linkname
-// Supply environ, __progname and __guard_local, because
-// we don't link against the standard OpenBSD crt0.o and
-// the libc dynamic library needs them.
+// Supply __guard_local because we don't link against the standard
+// OpenBSD crt0.o and the libc dynamic library needs it.
-//go:linkname _environ environ
-//go:linkname _progname __progname
//go:linkname _guard_local __guard_local
-var _environ uintptr
-var _progname uintptr
var _guard_local uintptr
-//go:cgo_export_dynamic environ environ
-//go:cgo_export_dynamic __progname __progname
-
// This is normally marked as hidden and placed in the
// .openbsd.randomdata section.
//go:cgo_export_dynamic __guard_local __guard_local
-
-// We override pthread_create to support PT_TLS.
-//go:cgo_export_dynamic pthread_create pthread_create
diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go
index 5a2a1a2..aa531b9 100644
--- a/src/runtime/cgo_mmap.go
+++ b/src/runtime/cgo_mmap.go
@@ -15,6 +15,11 @@
//go:linkname _cgo_mmap _cgo_mmap
var _cgo_mmap unsafe.Pointer
+// _cgo_munmap is filled in by runtime/cgo when it is linked into the
+// program, so it is only non-nil when using cgo.
+//go:linkname _cgo_munmap _cgo_munmap
+var _cgo_munmap unsafe.Pointer
+
func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
if _cgo_mmap != nil {
// Make ret a uintptr so that writing to it in the
@@ -32,9 +37,24 @@
return sysMmap(addr, n, prot, flags, fd, off)
}
+func munmap(addr unsafe.Pointer, n uintptr) {
+ if _cgo_munmap != nil {
+ systemstack(func() { callCgoMunmap(addr, n) })
+ return
+ }
+ sysMunmap(addr, n)
+}
+
// sysMmap calls the mmap system call. It is implemented in assembly.
func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
// callCgoMmap calls the mmap function in the runtime/cgo package
// using the GCC calling convention. It is implemented in assembly.
func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) uintptr
+
+// sysMunmap calls the munmap system call. It is implemented in assembly.
+func sysMunmap(addr unsafe.Pointer, n uintptr)
+
+// callCgoMunmap calls the munmap function in the runtime/cgo package
+// using the GCC calling convention. It is implemented in assembly.
+func callCgoMunmap(addr unsafe.Pointer, n uintptr)
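
The dispatch added in cgo_mmap.go above follows a general runtime pattern: a nil-initialized function pointer that runtime/cgo fills in at link time, with each call site falling back to the raw system call when no hook is present. A minimal standalone sketch of that shape, using ordinary Go function values in place of the linkname'd symbols (hypothetical names, not runtime code):

    package hooksketch

    import "unsafe"

    // cgoMunmapHook stands in for _cgo_munmap: it stays nil unless a
    // cgo-style implementation has been installed.
    var cgoMunmapHook func(addr unsafe.Pointer, n uintptr)

    // rawMunmap stands in for the assembly sysMunmap stub.
    var rawMunmap = func(addr unsafe.Pointer, n uintptr) {}

    // munmapDispatch mirrors the munmap wrapper above: prefer the cgo hook
    // so that library interceptors observe the call, otherwise fall back
    // to the raw system call.
    func munmapDispatch(addr unsafe.Pointer, n uintptr) {
        if cgoMunmapHook != nil {
            cgoMunmapHook(addr, n)
            return
        }
        rawMunmap(addr, n)
    }
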
diff --git a/src/runtime/cgo_sigaction.go b/src/runtime/cgo_sigaction.go
index 4da2f40..713490d 100644
--- a/src/runtime/cgo_sigaction.go
+++ b/src/runtime/cgo_sigaction.go
@@ -30,7 +30,7 @@
var ret int32
- if _cgo_sigaction == nil {
+ if _cgo_sigaction == nil || inForkedChild {
ret = sysSigaction(sig, new, old, size)
} else {
// We need to call _cgo_sigaction, which means we need a big enough stack
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 69e29ef..755269e 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -110,6 +110,7 @@
mp := getg().m
mp.ncgocall++
mp.ncgo++
+ mp.incgo = true
// Reset traceback.
mp.cgoCallers[0] = 0
@@ -151,6 +152,7 @@
//go:nosplit
func endcgo(mp *m) {
+ mp.incgo = false
mp.ncgo--
if raceenabled {
@@ -180,9 +182,11 @@
savedsp := unsafe.Pointer(gp.syscallsp)
savedpc := gp.syscallpc
exitsyscall(0) // coming out of cgo call
+ gp.m.incgo = false
cgocallbackg1(ctxt)
+ gp.m.incgo = true
// going back to cgo call
reentersyscall(savedpc, uintptr(savedsp))
@@ -531,7 +535,7 @@
return
}
for _, f := range st.fields {
- cgoCheckArg(f.typ, add(p, f.offset), true, top, msg)
+ cgoCheckArg(f.typ, add(p, f.offset()), true, top, msg)
}
case kindPtr, kindUnsafePointer:
if indir {
diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go
index 8cac5d9..61aaa0a 100644
--- a/src/runtime/cgocheck.go
+++ b/src/runtime/cgocheck.go
@@ -124,7 +124,7 @@
aoff := uintptr(src) - mheap_.arena_start
idx := aoff >> _PageShift
s := mheap_.spans[idx]
- if s.state == _MSpanStack {
+ if s.state == _MSpanManual {
// There are no heap bits for value stored on the stack.
// For a channel receive src might be on the stack of some
// other goroutine, so we can't unwind the stack even if
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index b54a46c..6294678 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -109,8 +109,8 @@
// entry point for c <- x from compiled code
//go:nosplit
-func chansend1(t *chantype, c *hchan, elem unsafe.Pointer) {
- chansend(t, c, elem, true, getcallerpc(unsafe.Pointer(&t)))
+func chansend1(c *hchan, elem unsafe.Pointer) {
+ chansend(c, elem, true, getcallerpc(unsafe.Pointer(&c)))
}
/*
@@ -125,14 +125,7 @@
* been closed. it is easiest to loop and re-run
* the operation; we'll see that it's now closed.
*/
-func chansend(t *chantype, c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
- if raceenabled {
- raceReadObjectPC(t.elem, ep, callerpc, funcPC(chansend))
- }
- if msanenabled {
- msanread(ep, t.elem.size)
- }
-
+func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
if c == nil {
if !block {
return false
@@ -183,7 +176,7 @@
if sg := c.recvq.dequeue(); sg != nil {
// Found a waiting receiver. We pass the value we want to send
// directly to the receiver, bypassing the channel buffer (if any).
- send(c, sg, ep, func() { unlock(&c.lock) })
+ send(c, sg, ep, func() { unlock(&c.lock) }, 3)
return true
}
@@ -254,7 +247,7 @@
// Channel c must be empty and locked. send unlocks c with unlockf.
// sg must already be dequeued from c.
// ep must be non-nil and point to the heap or the caller's stack.
-func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if raceenabled {
if c.dataqsiz == 0 {
racesync(c, sg)
@@ -284,7 +277,7 @@
if sg.releasetime != 0 {
sg.releasetime = cputicks()
}
- goready(gp, 4)
+ goready(gp, skip+1)
}
// Sends and receives on unbuffered or empty-buffered channels are the
@@ -391,13 +384,13 @@
// entry points for <- c from compiled code
//go:nosplit
-func chanrecv1(t *chantype, c *hchan, elem unsafe.Pointer) {
- chanrecv(t, c, elem, true)
+func chanrecv1(c *hchan, elem unsafe.Pointer) {
+ chanrecv(c, elem, true)
}
//go:nosplit
-func chanrecv2(t *chantype, c *hchan, elem unsafe.Pointer) (received bool) {
- _, received = chanrecv(t, c, elem, true)
+func chanrecv2(c *hchan, elem unsafe.Pointer) (received bool) {
+ _, received = chanrecv(c, elem, true)
return
}
@@ -407,7 +400,7 @@
// Otherwise, if c is closed, zeros *ep and returns (true, false).
// Otherwise, fills in *ep with an element and returns (true, true).
// A non-nil ep must point to the heap or the caller's stack.
-func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) {
+func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) {
// raceenabled: don't need to check ep, as it is always on the stack
// or is new memory allocated by reflect.
@@ -464,7 +457,7 @@
// directly from sender. Otherwise, receive from head of queue
// and add sender's value to the tail of the queue (both map to
// the same buffer slot because the queue is full).
- recv(c, sg, ep, func() { unlock(&c.lock) })
+ recv(c, sg, ep, func() { unlock(&c.lock) }, 3)
return true, true
}
@@ -540,7 +533,7 @@
// Channel c must be full and locked. recv unlocks c with unlockf.
// sg must already be dequeued from c.
// A non-nil ep must point to the heap or the caller's stack.
-func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func(), skip int) {
if c.dataqsiz == 0 {
if raceenabled {
racesync(c, sg)
@@ -580,7 +573,7 @@
if sg.releasetime != 0 {
sg.releasetime = cputicks()
}
- goready(gp, 4)
+ goready(gp, skip+1)
}
// compiler implements
@@ -600,8 +593,8 @@
// ... bar
// }
//
-func selectnbsend(t *chantype, c *hchan, elem unsafe.Pointer) (selected bool) {
- return chansend(t, c, elem, false, getcallerpc(unsafe.Pointer(&t)))
+func selectnbsend(c *hchan, elem unsafe.Pointer) (selected bool) {
+ return chansend(c, elem, false, getcallerpc(unsafe.Pointer(&c)))
}
// compiler implements
@@ -621,8 +614,8 @@
// ... bar
// }
//
-func selectnbrecv(t *chantype, elem unsafe.Pointer, c *hchan) (selected bool) {
- selected, _ = chanrecv(t, c, elem, false)
+func selectnbrecv(elem unsafe.Pointer, c *hchan) (selected bool) {
+ selected, _ = chanrecv(c, elem, false)
return
}
@@ -643,20 +636,20 @@
// ... bar
// }
//
-func selectnbrecv2(t *chantype, elem unsafe.Pointer, received *bool, c *hchan) (selected bool) {
+func selectnbrecv2(elem unsafe.Pointer, received *bool, c *hchan) (selected bool) {
// TODO(khr): just return 2 values from this function, now that it is in Go.
- selected, *received = chanrecv(t, c, elem, false)
+ selected, *received = chanrecv(c, elem, false)
return
}
//go:linkname reflect_chansend reflect.chansend
-func reflect_chansend(t *chantype, c *hchan, elem unsafe.Pointer, nb bool) (selected bool) {
- return chansend(t, c, elem, !nb, getcallerpc(unsafe.Pointer(&t)))
+func reflect_chansend(c *hchan, elem unsafe.Pointer, nb bool) (selected bool) {
+ return chansend(c, elem, !nb, getcallerpc(unsafe.Pointer(&c)))
}
//go:linkname reflect_chanrecv reflect.chanrecv
-func reflect_chanrecv(t *chantype, c *hchan, nb bool, elem unsafe.Pointer) (selected bool, received bool) {
- return chanrecv(t, c, elem, !nb)
+func reflect_chanrecv(c *hchan, nb bool, elem unsafe.Pointer) (selected bool, received bool) {
+ return chanrecv(c, elem, !nb)
}
//go:linkname reflect_chanlen reflect.chanlen
diff --git a/src/runtime/complex.go b/src/runtime/complex.go
index 73f1161..07c596f 100644
--- a/src/runtime/complex.go
+++ b/src/runtime/complex.go
@@ -4,68 +4,58 @@
package runtime
-func isposinf(f float64) bool { return f > maxFloat64 }
-func isneginf(f float64) bool { return f < -maxFloat64 }
-func isnan(f float64) bool { return f != f }
-
-func nan() float64 {
- var f float64 = 0
- return f / f
+// inf2one returns a signed 1 if f is an infinity and a signed 0 otherwise.
+// The sign of the result is the sign of f.
+func inf2one(f float64) float64 {
+ g := 0.0
+ if isInf(f) {
+ g = 1.0
+ }
+ return copysign(g, f)
}
-func posinf() float64 {
- var f float64 = maxFloat64
- return f * f
-}
+func complex128div(n complex128, m complex128) complex128 {
+ var e, f float64 // complex(e, f) = n/m
-func neginf() float64 {
- var f float64 = maxFloat64
- return -f * f
-}
+ // Algorithm for robust complex division as described in
+ // Robert L. Smith: Algorithm 116: Complex division. Commun. ACM 5(8): 435 (1962).
+ if abs(real(m)) >= abs(imag(m)) {
+ ratio := imag(m) / real(m)
+ denom := real(m) + ratio*imag(m)
+ e = (real(n) + imag(n)*ratio) / denom
+ f = (imag(n) - real(n)*ratio) / denom
+ } else {
+ ratio := real(m) / imag(m)
+ denom := imag(m) + ratio*real(m)
+ e = (real(n)*ratio + imag(n)) / denom
+ f = (imag(n)*ratio - real(n)) / denom
+ }
-func complex128div(n complex128, d complex128) complex128 {
- // Special cases as in C99.
- ninf := isposinf(real(n)) || isneginf(real(n)) ||
- isposinf(imag(n)) || isneginf(imag(n))
- dinf := isposinf(real(d)) || isneginf(real(d)) ||
- isposinf(imag(d)) || isneginf(imag(d))
+ if isNaN(e) && isNaN(f) {
+ // Correct final result to infinities and zeros if applicable.
+ // Matches C99: ISO/IEC 9899:1999 - G.5.1 Multiplicative operators.
- nnan := !ninf && (isnan(real(n)) || isnan(imag(n)))
- dnan := !dinf && (isnan(real(d)) || isnan(imag(d)))
+ a, b := real(n), imag(n)
+ c, d := real(m), imag(m)
- switch {
- case nnan || dnan:
- return complex(nan(), nan())
- case ninf && !dinf:
- return complex(posinf(), posinf())
- case !ninf && dinf:
- return complex(0, 0)
- case real(d) == 0 && imag(d) == 0:
- if real(n) == 0 && imag(n) == 0 {
- return complex(nan(), nan())
- } else {
- return complex(posinf(), posinf())
- }
- default:
- // Standard complex arithmetic, factored to avoid unnecessary overflow.
- a := real(d)
- if a < 0 {
- a = -a
- }
- b := imag(d)
- if b < 0 {
- b = -b
- }
- if a <= b {
- ratio := real(d) / imag(d)
- denom := real(d)*ratio + imag(d)
- return complex((real(n)*ratio+imag(n))/denom,
- (imag(n)*ratio-real(n))/denom)
- } else {
- ratio := imag(d) / real(d)
- denom := imag(d)*ratio + real(d)
- return complex((imag(n)*ratio+real(n))/denom,
- (imag(n)-real(n)*ratio)/denom)
+ switch {
+ case m == 0 && (!isNaN(a) || !isNaN(b)):
+ e = copysign(inf, c) * a
+ f = copysign(inf, c) * b
+
+ case (isInf(a) || isInf(b)) && isFinite(c) && isFinite(d):
+ a = inf2one(a)
+ b = inf2one(b)
+ e = inf * (a*c + b*d)
+ f = inf * (b*c - a*d)
+
+ case (isInf(c) || isInf(d)) && isFinite(a) && isFinite(b):
+ c = inf2one(c)
+ d = inf2one(d)
+ e = 0 * (a*c + b*d)
+ f = 0 * (b*c - a*d)
}
}
+
+ return complex(e, f)
}
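
For reference, the scaling used above can be exercised outside the runtime. The sketch below (hypothetical names; math package calls stand in for the runtime's internal abs/isNaN helpers) mirrors only the non-special-case path and shows why the ratio/denom form survives operands near the top of the float64 range, where the textbook denominator c*c+d*d overflows:

    package main

    import (
        "fmt"
        "math"
    )

    // robustDiv divides n by m with the scaling from Robert L. Smith,
    // "Algorithm 116: Complex division" (CACM 1962), mirroring the ordinary
    // path of complex128div above; the C99 special-case correction is omitted.
    func robustDiv(n, m complex128) complex128 {
        var e, f float64
        if math.Abs(real(m)) >= math.Abs(imag(m)) {
            ratio := imag(m) / real(m)
            denom := real(m) + ratio*imag(m)
            e = (real(n) + imag(n)*ratio) / denom
            f = (imag(n) - real(n)*ratio) / denom
        } else {
            ratio := real(m) / imag(m)
            denom := imag(m) + ratio*real(m)
            e = (real(n)*ratio + imag(n)) / denom
            f = (imag(n)*ratio - real(n)) / denom
        }
        return complex(e, f)
    }

    func main() {
        // With operands this large, c*c+d*d overflows to +Inf and the naive
        // formula yields NaN; the scaled form stays finite.
        n := complex(1e308, 1e308)
        m := complex(1e308, 1e307)
        fmt.Println(robustDiv(n, m)) // approximately (1.089+0.891i)
    }
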
diff --git a/src/runtime/cpuflags_amd64.go b/src/runtime/cpuflags_amd64.go
index 026f0cd..3e408da 100644
--- a/src/runtime/cpuflags_amd64.go
+++ b/src/runtime/cpuflags_amd64.go
@@ -4,72 +4,17 @@
package runtime
-var vendorStringBytes [12]byte
-var maxInputValue uint32
-var featureFlags uint32
-var processorVersionInfo uint32
-
-var useRepMovs = true
-
-func hasFeature(feature uint32) bool {
- return (featureFlags & feature) != 0
-}
-
-func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32) // implemented in cpuidlow_amd64.s
-func xgetbv_low(arg1 uint32) (eax, edx uint32) // implemented in cpuidlow_amd64.s
+var useAVXmemmove bool
func init() {
- const cfOSXSAVE uint32 = 1 << 27
- const cfAVX uint32 = 1 << 28
-
- leaf0()
- leaf1()
-
- enabledAVX := false
- // Let's check if OS has set CR4.OSXSAVE[bit 18]
- // to enable XGETBV instruction.
- if hasFeature(cfOSXSAVE) {
- eax, _ := xgetbv_low(0)
- // Let's check that XCR0[2:1] = ‘11b’
- // i.e. XMM state and YMM state are enabled by OS.
- enabledAVX = (eax & 0x6) == 0x6
- }
-
- isIntelBridgeFamily := (processorVersionInfo == 0x206A0 ||
- processorVersionInfo == 0x206D0 ||
- processorVersionInfo == 0x306A0 ||
- processorVersionInfo == 0x306E0) &&
- isIntel()
-
- useRepMovs = !(hasFeature(cfAVX) && enabledAVX) || isIntelBridgeFamily
-}
-
-func leaf0() {
- eax, ebx, ecx, edx := cpuid_low(0, 0)
- maxInputValue = eax
- int32ToBytes(ebx, vendorStringBytes[0:4])
- int32ToBytes(edx, vendorStringBytes[4:8])
- int32ToBytes(ecx, vendorStringBytes[8:12])
-}
-
-func leaf1() {
- if maxInputValue < 1 {
- return
- }
- eax, _, ecx, _ := cpuid_low(1, 0)
// Let's remove stepping and reserved fields
- processorVersionInfo = eax & 0x0FFF3FF0
- featureFlags = ecx
-}
+ processor := processorVersionInfo & 0x0FFF3FF0
-func int32ToBytes(arg uint32, buffer []byte) {
- buffer[3] = byte(arg >> 24)
- buffer[2] = byte(arg >> 16)
- buffer[1] = byte(arg >> 8)
- buffer[0] = byte(arg)
-}
+ isIntelBridgeFamily := isIntel &&
+ processor == 0x206A0 ||
+ processor == 0x206D0 ||
+ processor == 0x306A0 ||
+ processor == 0x306E0
-func isIntel() bool {
- intelSignature := [12]byte{'G', 'e', 'n', 'u', 'i', 'n', 'e', 'I', 'n', 't', 'e', 'l'}
- return vendorStringBytes == intelSignature
+ useAVXmemmove = support_avx && !isIntelBridgeFamily
}
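
The masked constants compared in the new init are raw CPUID leaf-1 signatures with stepping and reserved bits cleared. As a hypothetical decoding aid (not runtime code), the standard field layout shows they correspond to the Sandy Bridge and Ivy Bridge model numbers:

    package cpusketch

    // familyModel decodes family and model from a CPUID leaf-1 EAX value,
    // assuming the standard layout: model in bits 4-7, family in bits 8-11,
    // extended model in bits 16-19, extended family in bits 20-27.
    func familyModel(v uint32) (family, model uint32) {
        family = (v >> 8) & 0xf
        model = (v >> 4) & 0xf
        if family == 0xf {
            family += (v >> 20) & 0xff
        }
        if family == 0x6 || family == 0xf {
            model |= ((v >> 16) & 0xf) << 4
        }
        return
    }

    // familyModel(0x206A0) and familyModel(0x206D0) give family 6, models
    // 0x2A and 0x2D (Sandy Bridge); 0x306A0 and 0x306E0 give models 0x3A
    // and 0x3E (Ivy Bridge).
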
diff --git a/src/runtime/cpuidlow_amd64.s b/src/runtime/cpuidlow_amd64.s
deleted file mode 100644
index 64316c9..0000000
--- a/src/runtime/cpuidlow_amd64.s
+++ /dev/null
@@ -1,22 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// func cpuid_low(arg1, arg2 uint32) (eax, ebx, ecx, edx uint32)
-TEXT ·cpuid_low(SB), 4, $0-24
- MOVL arg1+0(FP), AX
- MOVL arg2+4(FP), CX
- CPUID
- MOVL AX, eax+8(FP)
- MOVL BX, ebx+12(FP)
- MOVL CX, ecx+16(FP)
- MOVL DX, edx+20(FP)
- RET
-// func xgetbv_low(arg1 uint32) (eax, edx uint32)
-TEXT ·xgetbv_low(SB), 4, $0-16
- MOVL arg1+0(FP), CX
- // XGETBV
- BYTE $0x0F; BYTE $0x01; BYTE $0xD0
- MOVL AX,eax+8(FP)
- MOVL DX,edx+12(FP)
- RET
diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go
index a4b14d3..c761e44 100644
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -3,118 +3,45 @@
// license that can be found in the LICENSE file.
// CPU profiling.
-// Based on algorithms and data structures used in
-// https://github.com/google/pprof.
-//
-// The main difference between this code and the google-perftools
-// code is that this code is written to allow copying the profile data
-// to an arbitrary io.Writer, while the google-perftools code always
-// writes to an operating system file.
//
// The signal handler for the profiling clock tick adds a new stack trace
-// to a hash table tracking counts for recent traces. Most clock ticks
-// hit in the cache. In the event of a cache miss, an entry must be
-// evicted from the hash table, copied to a log that will eventually be
-// written as profile data. The google-perftools code flushed the
-// log itself during the signal handler. This code cannot do that, because
-// the io.Writer might block or need system calls or locks that are not
-// safe to use from within the signal handler. Instead, we split the log
-// into two halves and let the signal handler fill one half while a goroutine
-// is writing out the other half. When the signal handler fills its half, it
-// offers to swap with the goroutine. If the writer is not done with its half,
-// we lose the stack trace for this clock tick (and record that loss).
-// The goroutine interacts with the signal handler by calling getprofile() to
-// get the next log piece to write, implicitly handing back the last log
-// piece it obtained.
-//
-// The state of this dance between the signal handler and the goroutine
-// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine
-// is not using either log half and is waiting (or will soon be waiting) for
-// a new piece by calling notesleep(&p.wait). If the signal handler
-// changes handoff from 0 to non-zero, it must call notewakeup(&p.wait)
-// to wake the goroutine. The value indicates the number of entries in the
-// log half being handed off. The goroutine leaves the non-zero value in
-// place until it has finished processing the log half and then flips the number
-// back to zero. Setting the high bit in handoff means that the profiling is over,
-// and the goroutine is now in charge of flushing the data left in the hash table
-// to the log and returning that data.
-//
-// The handoff field is manipulated using atomic operations.
-// For the most part, the manipulation of handoff is orderly: if handoff == 0
-// then the signal handler owns it and can change it to non-zero.
-// If handoff != 0 then the goroutine owns it and can change it to zero.
-// If that were the end of the story then we would not need to manipulate
-// handoff using atomic operations. The operations are needed, however,
-// in order to let the log closer set the high bit to indicate "EOF" safely
-// in the situation when normally the goroutine "owns" handoff.
+// to a log of recent traces. The log is read by a user goroutine that
+// turns it into formatted profile data. If the reader does not keep up
+// with the log, those writes will be recorded as a count of lost records.
+// The actual profile buffer is in profbuf.go.
package runtime
import (
"runtime/internal/atomic"
+ "runtime/internal/sys"
"unsafe"
)
-const (
- numBuckets = 1 << 10
- logSize = 1 << 17
- assoc = 4
- maxCPUProfStack = 64
-)
+const maxCPUProfStack = 64
-type cpuprofEntry struct {
- count uintptr
- depth int
- stack [maxCPUProfStack]uintptr
-}
-
-//go:notinheap
type cpuProfile struct {
- on bool // profiling is on
- wait note // goroutine waits here
- count uintptr // tick count
- evicts uintptr // eviction count
- lost uintptr // lost ticks that need to be logged
+ lock mutex
+ on bool // profiling is on
+ log *profBuf // profile events written here
- // Active recent stack traces.
- hash [numBuckets]struct {
- entry [assoc]cpuprofEntry
- }
-
- // Log of traces evicted from hash.
- // Signal handler has filled log[toggle][:nlog].
- // Goroutine is writing log[1-toggle][:handoff].
- log [2][logSize / 2]uintptr
- nlog int
- toggle int32
- handoff uint32
-
- // Writer state.
- // Writer maintains its own toggle to avoid races
- // looking at signal handler's toggle.
- wtoggle uint32
- wholding bool // holding & need to release a log half
- flushing bool // flushing hash table - profile is over
- eodSent bool // special end-of-data record sent; => flushing
+ // extra holds extra stacks accumulated in addNonGo
+ // corresponding to profiling signals arriving on
+ // non-Go-created threads. Those stacks are written
+ // to log the next time a normal Go thread gets the
+ // signal handler.
+ // Assuming the stacks are 2 words each (we don't get
+ // a full traceback from those threads), plus one word
+ // size for framing, 100 Hz profiling would generate
+ // 300 words per second.
+ // Hopefully a normal Go thread will get the profiling
+ // signal at least once every few seconds.
+ extra [1000]uintptr
+ numExtra int
+ lostExtra uint64 // count of frames lost because extra is full
}
-var (
- cpuprofLock mutex
- cpuprof *cpuProfile
-
- eod = [3]uintptr{0, 1, 0}
-)
-
-func setcpuprofilerate(hz int32) {
- systemstack(func() {
- setcpuprofilerate_m(hz)
- })
-}
-
-// lostProfileData is a no-op function used in profiles
-// to mark the number of profiling stack traces that were
-// discarded due to slow data writers.
-func lostProfileData() {}
+var cpuprof cpuProfile
// SetCPUProfileRate sets the CPU profiling rate to hz samples per second.
// If hz <= 0, SetCPUProfileRate turns off profiling.
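
The extra field introduced above defers non-Go stacks with a simple length-prefixed framing: one word holding 1+len(stk), followed by the PCs. That is what addNonGo writes and addExtra walks in the next hunk. A standalone sketch of the encoding (hypothetical names, not runtime code):

    package main

    import "fmt"

    // appendFramed appends one length-prefixed stack record, matching the
    // layout of cpuProfile.extra: a word holding 1+len(stk), then the PCs.
    func appendFramed(buf, stk []uintptr) []uintptr {
        buf = append(buf, uintptr(1+len(stk)))
        return append(buf, stk...)
    }

    // walkFramed replays every record, advancing by the stored record length.
    func walkFramed(buf []uintptr, emit func(stk []uintptr)) {
        for i := 0; i < len(buf); {
            n := int(buf[i]) // 1 + number of PCs in this record
            emit(buf[i+1 : i+n])
            i += n
        }
    }

    func main() {
        var buf []uintptr
        buf = appendFramed(buf, []uintptr{0x401000, 0x401234})
        buf = appendFramed(buf, []uintptr{0x402000})
        walkFramed(buf, func(stk []uintptr) { fmt.Printf("%#x\n", stk) })
    }
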
@@ -132,323 +59,144 @@
hz = 1000000
}
- lock(&cpuprofLock)
+ lock(&cpuprof.lock)
if hz > 0 {
- if cpuprof == nil {
- cpuprof = (*cpuProfile)(sysAlloc(unsafe.Sizeof(cpuProfile{}), &memstats.other_sys))
- if cpuprof == nil {
- print("runtime: cpu profiling cannot allocate memory\n")
- unlock(&cpuprofLock)
- return
- }
- }
- if cpuprof.on || cpuprof.handoff != 0 {
+ if cpuprof.on || cpuprof.log != nil {
print("runtime: cannot set cpu profile rate until previous profile has finished.\n")
- unlock(&cpuprofLock)
+ unlock(&cpuprof.lock)
return
}
cpuprof.on = true
- // pprof binary header format.
- // https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119
- p := &cpuprof.log[0]
- p[0] = 0 // count for header
- p[1] = 3 // depth for header
- p[2] = 0 // version number
- p[3] = uintptr(1e6 / hz) // period (microseconds)
- p[4] = 0
- cpuprof.nlog = 5
- cpuprof.toggle = 0
- cpuprof.wholding = false
- cpuprof.wtoggle = 0
- cpuprof.flushing = false
- cpuprof.eodSent = false
- noteclear(&cpuprof.wait)
-
+ cpuprof.log = newProfBuf(1, 1<<17, 1<<14)
+ hdr := [1]uint64{uint64(hz)}
+ cpuprof.log.write(nil, nanotime(), hdr[:], nil)
setcpuprofilerate(int32(hz))
- } else if cpuprof != nil && cpuprof.on {
+ } else if cpuprof.on {
setcpuprofilerate(0)
cpuprof.on = false
-
- // Now add is not running anymore, and getprofile owns the entire log.
- // Set the high bit in cpuprof.handoff to tell getprofile.
- for {
- n := cpuprof.handoff
- if n&0x80000000 != 0 {
- print("runtime: setcpuprofile(off) twice\n")
- }
- if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
- if n == 0 {
- // we did the transition from 0 -> nonzero so we wake getprofile
- notewakeup(&cpuprof.wait)
- }
- break
- }
- }
+ cpuprof.addExtra()
+ cpuprof.log.close()
}
- unlock(&cpuprofLock)
+ unlock(&cpuprof.lock)
}
// add adds the stack trace to the profile.
// It is called from signal handlers and other limited environments
// and cannot allocate memory or acquire locks that might be
// held at the time of the signal, nor can it use substantial amounts
-// of stack. It is allowed to call evict.
+// of stack.
//go:nowritebarrierrec
-func (p *cpuProfile) add(pc []uintptr) {
- p.addWithFlushlog(pc, p.flushlog)
+func (p *cpuProfile) add(gp *g, stk []uintptr) {
+ // Simple cas-lock to coordinate with setcpuprofilerate.
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
+ osyield()
+ }
+
+ if prof.hz != 0 { // implies cpuprof.log != nil
+ if p.numExtra > 0 || p.lostExtra > 0 {
+ p.addExtra()
+ }
+ hdr := [1]uint64{1}
+ // Note: write "knows" that the argument is &gp.labels,
+ // because otherwise its write barrier behavior may not
+ // be correct. See the long comment there before
+ // changing the argument here.
+ cpuprof.log.write(&gp.labels, nanotime(), hdr[:], stk)
+ }
+
+ atomic.Store(&prof.signalLock, 0)
}
-// addWithFlushlog implements add and addNonGo.
-// It is called from signal handlers and other limited environments
-// and cannot allocate memory or acquire locks that might be
-// held at the time of the signal, nor can it use substantial amounts
-// of stack. It may be called by a signal handler with no g or m.
-// It is allowed to call evict, passing the flushlog parameter.
+// addNonGo adds the non-Go stack trace to the profile.
+// It is called from a non-Go thread, so we cannot use much stack at all,
+// nor do anything that needs a g or an m.
+// In particular, we can't call cpuprof.log.write.
+// Instead, we copy the stack into cpuprof.extra,
+// which will be drained the next time a Go thread
+// gets the signal handling event.
//go:nosplit
//go:nowritebarrierrec
-func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) {
- if len(pc) > maxCPUProfStack {
- pc = pc[:maxCPUProfStack]
+func (p *cpuProfile) addNonGo(stk []uintptr) {
+ // Simple cas-lock to coordinate with SetCPUProfileRate.
+ // (Other calls to add or addNonGo should be blocked out
+ // by the fact that only one SIGPROF can be handled by the
+ // process at a time. If not, this lock will serialize those too.)
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
+ osyield()
}
- // Compute hash.
- h := uintptr(0)
- for _, x := range pc {
- h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
- h += x * 41
- }
- p.count++
-
- // Add to entry count if already present in table.
- b := &p.hash[h%numBuckets]
-Assoc:
- for i := range b.entry {
- e := &b.entry[i]
- if e.depth != len(pc) {
- continue
- }
- for j := range pc {
- if e.stack[j] != pc[j] {
- continue Assoc
- }
- }
- e.count++
- return
+ if cpuprof.numExtra+1+len(stk) < len(cpuprof.extra) {
+ i := cpuprof.numExtra
+ cpuprof.extra[i] = uintptr(1 + len(stk))
+ copy(cpuprof.extra[i+1:], stk)
+ cpuprof.numExtra += 1 + len(stk)
+ } else {
+ cpuprof.lostExtra++
}
- // Evict entry with smallest count.
- var e *cpuprofEntry
- for i := range b.entry {
- if e == nil || b.entry[i].count < e.count {
- e = &b.entry[i]
- }
- }
- if e.count > 0 {
- if !p.evict(e, flushlog) {
- // Could not evict entry. Record lost stack.
- p.lost++
- return
- }
- p.evicts++
- }
-
- // Reuse the newly evicted entry.
- e.depth = len(pc)
- e.count = 1
- copy(e.stack[:], pc)
+ atomic.Store(&prof.signalLock, 0)
}
-// evict copies the given entry's data into the log, so that
-// the entry can be reused. evict is called from add, which
-// is called from the profiling signal handler, so it must not
-// allocate memory or block, and it may be called with no g or m.
-// It is safe to call flushlog. evict returns true if the entry was
-// copied to the log, false if there was no room available.
-//go:nosplit
-//go:nowritebarrierrec
-func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool {
- d := e.depth
- nslot := d + 2
- log := &p.log[p.toggle]
- if p.nlog+nslot > len(log) {
- if !flushlog() {
- return false
+// addExtra adds the "extra" profiling events,
+// queued by addNonGo, to the profile log.
+// addExtra is called either from a signal handler on a Go thread
+// or from an ordinary goroutine; either way it can use stack
+// and has a g. The world may be stopped, though.
+func (p *cpuProfile) addExtra() {
+ // Copy accumulated non-Go profile events.
+ hdr := [1]uint64{1}
+ for i := 0; i < p.numExtra; {
+ p.log.write(nil, 0, hdr[:], p.extra[i+1:i+int(p.extra[i])])
+ i += int(p.extra[i])
+ }
+ p.numExtra = 0
+
+ // Report any lost events.
+ if p.lostExtra > 0 {
+ hdr := [1]uint64{p.lostExtra}
+ lostStk := [2]uintptr{
+ funcPC(_LostExternalCode) + sys.PCQuantum,
+ funcPC(_ExternalCode) + sys.PCQuantum,
}
- log = &p.log[p.toggle]
+ cpuprof.log.write(nil, 0, hdr[:], lostStk[:])
}
-
- q := p.nlog
- log[q] = e.count
- q++
- log[q] = uintptr(d)
- q++
- copy(log[q:], e.stack[:d])
- q += d
- p.nlog = q
- e.count = 0
- return true
}
-// flushlog tries to flush the current log and switch to the other one.
-// flushlog is called from evict, called from add, called from the signal handler,
-// so it cannot allocate memory or block. It can try to swap logs with
-// the writing goroutine, as explained in the comment at the top of this file.
-//go:nowritebarrierrec
-func (p *cpuProfile) flushlog() bool {
- if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
- return false
- }
- notewakeup(&p.wait)
-
- p.toggle = 1 - p.toggle
- log := &p.log[p.toggle]
- q := 0
- if p.lost > 0 {
- lostPC := funcPC(lostProfileData)
- log[0] = p.lost
- log[1] = 1
- log[2] = lostPC
- q = 3
- p.lost = 0
- }
- p.nlog = q
- return true
-}
-
-// addNonGo is like add, but runs on a non-Go thread.
-// It can't do anything that might need a g or an m.
-// With this entry point, we don't try to flush the log when evicting an
-// old entry. Instead, we just drop the stack trace if we're out of space.
-//go:nosplit
-//go:nowritebarrierrec
-func (p *cpuProfile) addNonGo(pc []uintptr) {
- p.addWithFlushlog(pc, func() bool { return false })
-}
-
-// getprofile blocks until the next block of profiling data is available
-// and returns it as a []byte. It is called from the writing goroutine.
-func (p *cpuProfile) getprofile() []byte {
- if p == nil {
- return nil
- }
-
- if p.wholding {
- // Release previous log to signal handling side.
- // Loop because we are racing against SetCPUProfileRate(0).
- for {
- n := p.handoff
- if n == 0 {
- print("runtime: phase error during cpu profile handoff\n")
- return nil
- }
- if n&0x80000000 != 0 {
- p.wtoggle = 1 - p.wtoggle
- p.wholding = false
- p.flushing = true
- goto Flush
- }
- if atomic.Cas(&p.handoff, n, 0) {
- break
- }
- }
- p.wtoggle = 1 - p.wtoggle
- p.wholding = false
- }
-
- if p.flushing {
- goto Flush
- }
-
- if !p.on && p.handoff == 0 {
- return nil
- }
-
- // Wait for new log.
- notetsleepg(&p.wait, -1)
- noteclear(&p.wait)
-
- switch n := p.handoff; {
- case n == 0:
- print("runtime: phase error during cpu profile wait\n")
- return nil
- case n == 0x80000000:
- p.flushing = true
- goto Flush
- default:
- n &^= 0x80000000
-
- // Return new log to caller.
- p.wholding = true
-
- return uintptrBytes(p.log[p.wtoggle][:n])
- }
-
- // In flush mode.
- // Add is no longer being called. We own the log.
- // Also, p.handoff is non-zero, so flushlog will return false.
- // Evict the hash table into the log and return it.
-Flush:
- for i := range p.hash {
- b := &p.hash[i]
- for j := range b.entry {
- e := &b.entry[j]
- if e.count > 0 && !p.evict(e, p.flushlog) {
- // Filled the log. Stop the loop and return what we've got.
- break Flush
- }
- }
- }
-
- // Return pending log data.
- if p.nlog > 0 {
- // Note that we're using toggle now, not wtoggle,
- // because we're working on the log directly.
- n := p.nlog
- p.nlog = 0
- return uintptrBytes(p.log[p.toggle][:n])
- }
-
- // Made it through the table without finding anything to log.
- if !p.eodSent {
- // We may not have space to append this to the partial log buf,
- // so we always return a new slice for the end-of-data marker.
- p.eodSent = true
- return uintptrBytes(eod[:])
- }
-
- // Finally done. Clean up and return nil.
- p.flushing = false
- if !atomic.Cas(&p.handoff, p.handoff, 0) {
- print("runtime: profile flush racing with something\n")
- }
- return nil
-}
-
-func uintptrBytes(p []uintptr) (ret []byte) {
- pp := (*slice)(unsafe.Pointer(&p))
- rp := (*slice)(unsafe.Pointer(&ret))
-
- rp.array = pp.array
- rp.len = pp.len * int(unsafe.Sizeof(p[0]))
- rp.cap = rp.len
-
- return
-}
-
-// CPUProfile returns the next chunk of binary CPU profiling stack trace data,
-// blocking until data is available. If profiling is turned off and all the profile
-// data accumulated while it was on has been returned, CPUProfile returns nil.
-// The caller must save the returned data before calling CPUProfile again.
+// CPUProfile panics.
+// It formerly provided raw access to chunks of
+// a pprof-format profile generated by the runtime.
+// The details of generating that format have changed,
+// so this functionality has been removed.
//
-// Most clients should use the runtime/pprof package or
-// the testing package's -test.cpuprofile flag instead of calling
-// CPUProfile directly.
+// Deprecated: use the runtime/pprof package,
+// or the handlers in the net/http/pprof package,
+// or the testing package's -test.cpuprofile flag instead.
func CPUProfile() []byte {
- return cpuprof.getprofile()
+ panic("CPUProfile no longer available")
}
//go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
func runtime_pprof_runtime_cyclesPerSecond() int64 {
return tickspersecond()
}
+
+// readProfile, provided to runtime/pprof, returns the next chunk of
+// binary CPU profiling stack trace data, blocking until data is available.
+// If profiling is turned off and all the profile data accumulated while it was
+// on has been returned, readProfile returns eof=true.
+// The caller must save the returned data and tags before calling readProfile again.
+//
+//go:linkname runtime_pprof_readProfile runtime/pprof.readProfile
+func runtime_pprof_readProfile() ([]uint64, []unsafe.Pointer, bool) {
+ lock(&cpuprof.lock)
+ log := cpuprof.log
+ unlock(&cpuprof.lock)
+ data, tags, eof := log.read(profBufBlocking)
+ if len(data) == 0 && eof {
+ lock(&cpuprof.lock)
+ cpuprof.log = nil
+ unlock(&cpuprof.lock)
+ }
+ return data, tags, eof
+}
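
Based only on the readProfile signature above, a consumer on the runtime/pprof side would loop until eof, handing each chunk of records and its parallel tag slice to a formatter. A hedged sketch of that loop (illustrative names; the actual runtime/pprof code may differ):

    package pprofsketch

    import "unsafe"

    // readProfileFunc matches the signature of runtime_pprof_readProfile above.
    type readProfileFunc func() (data []uint64, tags []unsafe.Pointer, eof bool)

    // drainProfile reads until the runtime reports eof, passing each
    // non-empty chunk and its matching label tags to handle.
    func drainProfile(read readProfileFunc, handle func(data []uint64, tags []unsafe.Pointer) error) error {
        for {
            data, tags, eof := read()
            if len(data) > 0 {
                if err := handle(data, tags); err != nil {
                    return err
                }
            }
            if eof {
                return nil
            }
        }
    }
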
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 347b820..a5cbbad 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -24,7 +24,10 @@
}
func TestCgoSignalDeadlock(t *testing.T) {
- t.Parallel()
+ // Don't call t.Parallel, since too much work going on at the
+ // same time can cause the testprogcgo code to overrun its
+ // timeouts (issue #18598).
+
if testing.Short() && runtime.GOOS == "windows" {
t.Skip("Skipping in short mode") // takes up to 64 seconds
}
@@ -282,33 +285,43 @@
got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput()
if err != nil {
+ if testenv.Builder() == "linux-amd64-alpine" {
+ // See Issue 18243 and Issue 19938.
+ t.Skipf("Skipping failing test on Alpine (golang.org/issue/18243). Ignoring error: %v", err)
+ }
t.Fatal(err)
}
fn := strings.TrimSpace(string(got))
defer os.Remove(fn)
- cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1", exe, fn))
-
- found := false
- for i, e := range cmd.Env {
- if strings.HasPrefix(e, "PPROF_TMPDIR=") {
- cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
- found = true
- break
+ for try := 0; try < 2; try++ {
+ cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-top", "-nodecount=1"))
+ // Check that pprof works both with and without explicit executable on command line.
+ if try == 0 {
+ cmd.Args = append(cmd.Args, exe, fn)
+ } else {
+ cmd.Args = append(cmd.Args, fn)
}
- }
- if !found {
- cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
- }
- top, err := cmd.CombinedOutput()
- t.Logf("%s", top)
- if err != nil {
- t.Fatal(err)
- }
+ found := false
+ for i, e := range cmd.Env {
+ if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+ cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+ found = true
+ break
+ }
+ }
+ if !found {
+ cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
+ }
- if !bytes.Contains(top, []byte("cpuHog")) {
- t.Error("missing cpuHog in pprof output")
+ top, err := cmd.CombinedOutput()
+ t.Logf("%s:\n%s", cmd.Args, top)
+ if err != nil {
+ t.Error(err)
+ } else if !bytes.Contains(top, []byte("cpuHog")) {
+ t.Error("missing cpuHog in pprof output")
+ }
}
}
@@ -385,3 +398,16 @@
t.Errorf("expected %q got %s", want, got)
}
}
+
+func TestCgoNumGoroutine(t *testing.T) {
+ switch runtime.GOOS {
+ case "windows", "plan9":
+ t.Skipf("skipping numgoroutine test on %s", runtime.GOOS)
+ }
+ t.Parallel()
+ got := runTestProg(t, "testprogcgo", "NumGoroutine")
+ want := "OK\n"
+ if got != want {
+ t.Errorf("expected %q got %v", want, got)
+ }
+}
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 9ec0ae4..7753809 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -164,6 +164,12 @@
return
}
if string(out) != "false\n" {
+ t.Logf("go list -f {{.Stale}} runtime:\n%s", out)
+ out, err := testEnv(exec.Command(testenv.GoToolPath(t), "list", "-f", "{{.StaleReason}}", "runtime")).CombinedOutput()
+ if err != nil {
+ t.Logf("go list -f {{.StaleReason}} failed: %v", err)
+ }
+ t.Logf("go list -f {{.StaleReason}} runtime:\n%s", out)
staleRuntimeErr = fmt.Errorf("Stale runtime.a. Run 'go install runtime'.")
}
})
@@ -302,7 +308,9 @@
func TestBreakpoint(t *testing.T) {
output := runTestProg(t, "testprog", "Breakpoint")
- want := "runtime.Breakpoint()"
+ // If runtime.Breakpoint() is inlined, then the stack trace prints
+ // "runtime.Breakpoint(...)" instead of "runtime.Breakpoint()".
+ want := "runtime.Breakpoint("
if !strings.Contains(output, want) {
t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
}
@@ -467,28 +475,33 @@
fn := strings.TrimSpace(string(got))
defer os.Remove(fn)
- cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top", exe, fn))
-
- found := false
- for i, e := range cmd.Env {
- if strings.HasPrefix(e, "PPROF_TMPDIR=") {
- cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
- found = true
- break
+ for try := 0; try < 2; try++ {
+ cmd := testEnv(exec.Command(testenv.GoToolPath(t), "tool", "pprof", "-alloc_space", "-top"))
+ // Check that pprof works both with and without explicit executable on command line.
+ if try == 0 {
+ cmd.Args = append(cmd.Args, exe, fn)
+ } else {
+ cmd.Args = append(cmd.Args, fn)
}
- }
- if !found {
- cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
- }
+ found := false
+ for i, e := range cmd.Env {
+ if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+ cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+ found = true
+ break
+ }
+ }
+ if !found {
+ cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
+ }
- top, err := cmd.CombinedOutput()
- t.Logf("%s", top)
- if err != nil {
- t.Fatal(err)
- }
-
- if !bytes.Contains(top, []byte("MemProf")) {
- t.Error("missing MemProf in pprof output")
+ top, err := cmd.CombinedOutput()
+ t.Logf("%s:\n%s", cmd.Args, top)
+ if err != nil {
+ t.Error(err)
+ } else if !bytes.Contains(top, []byte("MemProf")) {
+ t.Error("missing MemProf in pprof output")
+ }
}
}
@@ -527,3 +540,77 @@
t.Fatalf("output does not start with %q:\n%s", want, output)
}
}
+
+type point struct {
+ x, y *int
+}
+
+func (p *point) negate() {
+ *p.x = *p.x * -1
+ *p.y = *p.y * -1
+}
+
+// Test for issue #10152.
+func TestPanicInlined(t *testing.T) {
+ defer func() {
+ r := recover()
+ if r == nil {
+ t.Fatalf("recover failed")
+ }
+ buf := make([]byte, 2048)
+ n := runtime.Stack(buf, false)
+ buf = buf[:n]
+ if !bytes.Contains(buf, []byte("(*point).negate(")) {
+ t.Fatalf("expecting stack trace to contain call to (*point).negate()")
+ }
+ }()
+
+ pt := new(point)
+ pt.negate()
+}
+
+// Test for issues #3934 and #20018.
+// We want to delay exiting until a panic print is complete.
+func TestPanicRace(t *testing.T) {
+ testenv.MustHaveGoRun(t)
+
+ exe, err := buildTestProg(t, "testprog")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ // The test is intentionally racy, and in my testing does not
+ // produce the expected output about 0.05% of the time.
+ // So run the program in a loop and only fail the test if we
+ // get the wrong output ten times in a row.
+ const tries = 10
+retry:
+ for i := 0; i < tries; i++ {
+ got, err := testEnv(exec.Command(exe, "PanicRace")).CombinedOutput()
+ if err == nil {
+ t.Logf("try %d: program exited successfully, should have failed", i+1)
+ continue
+ }
+
+ if i > 0 {
+ t.Logf("try %d:\n", i+1)
+ }
+ t.Logf("%s\n", got)
+
+ wants := []string{
+ "panic: crash",
+ "PanicRace",
+ "created by ",
+ }
+ for _, want := range wants {
+ if !bytes.Contains(got, []byte(want)) {
+ t.Logf("did not find expected string %q", want)
+ continue retry
+ }
+ }
+
+ // Test generated expected output.
+ return
+ }
+ t.Errorf("test ran %d times without producing expected output", tries)
+}
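
TestPanicInlined relies on a recovered panic's stack trace still naming the (possibly inlined) pointer method. The same check can be made in ordinary code; a small sketch along those lines, using only the standard library:

package main

import (
	"bytes"
	"fmt"
	"runtime"
)

type point struct{ x, y *int }

// negate dereferences nil pointers, so calling it on a zero value panics.
func (p *point) negate() { *p.x, *p.y = -*p.x, -*p.y }

func main() {
	defer func() {
		if r := recover(); r != nil {
			buf := make([]byte, 2048)
			buf = buf[:runtime.Stack(buf, false)]
			fmt.Println("negate in trace:", bytes.Contains(buf, []byte("(*point).negate(")))
		}
	}()
	pt := new(point)
	pt.negate()
}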
diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go
index 97deed8..cbaa1f6 100644
--- a/src/runtime/crash_unix_test.go
+++ b/src/runtime/crash_unix_test.go
@@ -9,6 +9,7 @@
import (
"bytes"
"internal/testenv"
+ "io"
"io/ioutil"
"os"
"os/exec"
@@ -23,6 +24,15 @@
// Send SIGQUIT to get a stack trace.
var sigquit = syscall.SIGQUIT
+func init() {
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ // We can't use SIGQUIT to kill subprocesses because
+ // it's blocked. Use SIGKILL instead. See issue
+ // #19196 for an example of when this happens.
+ sigquit = syscall.SIGKILL
+ }
+}
+
func TestCrashDumpsAllThreads(t *testing.T) {
switch runtime.GOOS {
case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "solaris":
@@ -30,6 +40,10 @@
t.Skipf("skipping; not supported on %v", runtime.GOOS)
}
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
+ }
+
// We don't use executeTest because we need to kill the
// program while it is running.
@@ -153,6 +167,82 @@
}
`
+func TestPanicSystemstack(t *testing.T) {
+ // Test that GOTRACEBACK=crash prints both the system and user
+ // stack of other threads.
+
+ // The GOTRACEBACK=crash handler takes 0.1 seconds even if
+ // it's not writing a core file and potentially much longer if
+ // it is. Skip in short mode.
+ if testing.Short() {
+ t.Skip("Skipping in short mode (GOTRACEBACK=crash is slow)")
+ }
+
+ if runtime.Sigisblocked(int(syscall.SIGQUIT)) {
+ t.Skip("skipping; SIGQUIT is blocked, see golang.org/issue/19196")
+ }
+
+ t.Parallel()
+ cmd := exec.Command(os.Args[0], "testPanicSystemstackInternal")
+ cmd = testEnv(cmd)
+ cmd.Env = append(cmd.Env, "GOTRACEBACK=crash")
+ pr, pw, err := os.Pipe()
+ if err != nil {
+ t.Fatal("creating pipe: ", err)
+ }
+ cmd.Stderr = pw
+ if err := cmd.Start(); err != nil {
+ t.Fatal("starting command: ", err)
+ }
+ defer cmd.Process.Wait()
+ defer cmd.Process.Kill()
+ if err := pw.Close(); err != nil {
+ t.Log("closing write pipe: ", err)
+ }
+ defer pr.Close()
+
+ // Wait for "x\nx\n" to indicate readiness.
+ buf := make([]byte, 4)
+ _, err = io.ReadFull(pr, buf)
+ if err != nil || string(buf) != "x\nx\n" {
+ t.Fatal("subprocess failed; output:\n", string(buf))
+ }
+
+ // Send SIGQUIT.
+ if err := cmd.Process.Signal(syscall.SIGQUIT); err != nil {
+ t.Fatal("signaling subprocess: ", err)
+ }
+
+ // Get traceback.
+ tb, err := ioutil.ReadAll(pr)
+ if err != nil {
+ t.Fatal("reading traceback from pipe: ", err)
+ }
+
+ // Traceback should have two testPanicSystemstackInternal's
+ // and two blockOnSystemStackInternal's.
+ if bytes.Count(tb, []byte("testPanicSystemstackInternal")) != 2 {
+ t.Fatal("traceback missing user stack:\n", string(tb))
+ } else if bytes.Count(tb, []byte("blockOnSystemStackInternal")) != 2 {
+ t.Fatal("traceback missing system stack:\n", string(tb))
+ }
+}
+
+func init() {
+ if len(os.Args) >= 2 && os.Args[1] == "testPanicSystemstackInternal" {
+ // Get two threads running on the system stack with
+ // something recognizable in the stack trace.
+ runtime.GOMAXPROCS(2)
+ go testPanicSystemstackInternal()
+ testPanicSystemstackInternal()
+ }
+}
+
+func testPanicSystemstackInternal() {
+ runtime.BlockOnSystemStack()
+ os.Exit(1) // Should be unreachable.
+}
+
func TestSignalExitStatus(t *testing.T) {
testenv.MustHaveGoBuild(t)
exe, err := buildTestProg(t, "testprog")
@@ -178,3 +268,16 @@
t.Fatalf("want %s, got %s\n", want, output)
}
}
+
+func TestSignalDuringExec(t *testing.T) {
+ switch runtime.GOOS {
+ case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd":
+ default:
+ t.Skipf("skipping test on %s", runtime.GOOS)
+ }
+ output := runTestProg(t, "testprognet", "SignalDuringExec")
+ want := "OK\n"
+ if output != want {
+ t.Fatalf("want %s, got %s\n", want, output)
+ }
+}
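
TestPanicSystemstack coordinates with its child over a pipe: the child writes a readiness marker to stderr, then the parent sends SIGQUIT and reads the GOTRACEBACK=crash output. A condensed, self-contained sketch of that handshake (not part of the patch; the child here just sleeps instead of blocking on the system stack):

// +build darwin dragonfly freebsd linux netbsd openbsd solaris

package main

import (
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"os/exec"
	"syscall"
	"time"
)

func main() {
	if len(os.Args) > 1 && os.Args[1] == "child" {
		fmt.Fprint(os.Stderr, "x\n") // readiness marker, as in the test's "x\nx\n"
		time.Sleep(time.Hour)        // wait to be signaled
		return
	}

	cmd := exec.Command(os.Args[0], "child")
	cmd.Env = append(os.Environ(), "GOTRACEBACK=crash")
	pr, pw, err := os.Pipe()
	if err != nil {
		panic(err)
	}
	cmd.Stderr = pw
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	pw.Close() // keep only the child's copy of the write end
	defer pr.Close()

	// Wait for the readiness marker.
	buf := make([]byte, 2)
	if _, err := io.ReadFull(pr, buf); err != nil || string(buf) != "x\n" {
		panic("child did not become ready")
	}

	// Ask for a traceback, then read until the child exits and closes stderr.
	cmd.Process.Signal(syscall.SIGQUIT)
	tb, _ := ioutil.ReadAll(pr)
	cmd.Wait()
	fmt.Printf("traceback:\n%s", tb)
}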
diff --git a/src/runtime/debug/garbage.go b/src/runtime/debug/garbage.go
index c82c024..785e9d4 100644
--- a/src/runtime/debug/garbage.go
+++ b/src/runtime/debug/garbage.go
@@ -89,9 +89,7 @@
// at startup, or 100 if the variable is not set.
// A negative percentage disables garbage collection.
func SetGCPercent(percent int) int {
- old := setGCPercent(int32(percent))
- runtime.GC()
- return int(old)
+ return int(setGCPercent(int32(percent)))
}
// FreeOSMemory forces a garbage collection followed by an
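
With this change SetGCPercent only updates the pacing target; it no longer runs a collection itself. Callers that depended on the old side effect can trigger one explicitly, roughly like this (a usage sketch, not patch code):

package main

import (
	"fmt"
	"runtime"
	"runtime/debug"
)

func main() {
	old := debug.SetGCPercent(50) // update the GOGC target; no collection is forced any more
	defer debug.SetGCPercent(old) // restore the previous setting on exit
	runtime.GC()                  // run one explicitly if the old behavior is needed

	var ms runtime.MemStats
	runtime.ReadMemStats(&ms)
	fmt.Printf("previous GOGC %d, NextGC now %d bytes\n", old, ms.NextGC)
}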
diff --git a/src/runtime/debug/garbage_test.go b/src/runtime/debug/garbage_test.go
index 04e954b..69e769e 100644
--- a/src/runtime/debug/garbage_test.go
+++ b/src/runtime/debug/garbage_test.go
@@ -5,6 +5,7 @@
package debug_test
import (
+ "internal/testenv"
"runtime"
. "runtime/debug"
"testing"
@@ -104,15 +105,78 @@
}
}
+var (
+ setGCPercentBallast interface{}
+ setGCPercentSink interface{}
+)
+
func TestSetGCPercent(t *testing.T) {
+ testenv.SkipFlaky(t, 20076)
+
// Test that the variable is being set and returned correctly.
- // Assume the percentage itself is implemented fine during GC,
- // which is harder to test.
old := SetGCPercent(123)
new := SetGCPercent(old)
if new != 123 {
t.Errorf("SetGCPercent(123); SetGCPercent(x) = %d, want 123", new)
}
+
+ // Test that the percentage is implemented correctly.
+ defer func() {
+ SetGCPercent(old)
+ setGCPercentBallast, setGCPercentSink = nil, nil
+ }()
+ SetGCPercent(100)
+ runtime.GC()
+ // Create 100 MB of live heap as a baseline.
+ const baseline = 100 << 20
+ var ms runtime.MemStats
+ runtime.ReadMemStats(&ms)
+ setGCPercentBallast = make([]byte, baseline-ms.Alloc)
+ runtime.GC()
+ runtime.ReadMemStats(&ms)
+ if abs64(baseline-int64(ms.Alloc)) > 10<<20 {
+ t.Fatalf("failed to set up baseline live heap; got %d MB, want %d MB", ms.Alloc>>20, baseline>>20)
+ }
+ // NextGC should be ~200 MB.
+ const thresh = 20 << 20 // TODO: Figure out why this is so noisy on some builders
+ if want := int64(2 * baseline); abs64(want-int64(ms.NextGC)) > thresh {
+ t.Errorf("NextGC = %d MB, want %d±%d MB", ms.NextGC>>20, want>>20, thresh>>20)
+ }
+ // Create some garbage, but not enough to trigger another GC.
+ for i := 0; i < int(1.2*baseline); i += 1 << 10 {
+ setGCPercentSink = make([]byte, 1<<10)
+ }
+ setGCPercentSink = nil
+ // Adjust GOGC to 50. NextGC should be ~150 MB.
+ SetGCPercent(50)
+ runtime.ReadMemStats(&ms)
+ if want := int64(1.5 * baseline); abs64(want-int64(ms.NextGC)) > thresh {
+ t.Errorf("NextGC = %d MB, want %d±%d MB", ms.NextGC>>20, want>>20, thresh>>20)
+ }
+
+ // Trigger a GC and get back to 100 MB live with GOGC=100.
+ SetGCPercent(100)
+ runtime.GC()
+ // Raise live to 120 MB.
+ setGCPercentSink = make([]byte, int(0.2*baseline))
+ // Lower GOGC to 10. This must force a GC.
+ runtime.ReadMemStats(&ms)
+ ngc1 := ms.NumGC
+ SetGCPercent(10)
+ // It may require an allocation to actually force the GC.
+ setGCPercentSink = make([]byte, 1<<20)
+ runtime.ReadMemStats(&ms)
+ ngc2 := ms.NumGC
+ if ngc1 == ngc2 {
+ t.Errorf("expected GC to run but it did not")
+ }
+}
+
+func abs64(a int64) int64 {
+ if a < 0 {
+ return -a
+ }
+ return a
}
func TestSetMaxThreadsOvf(t *testing.T) {
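
The NextGC checks above assume the pacer relation NextGC ≈ live heap × (1 + GOGC/100). A tiny helper expressing that expectation (illustrative only; the test's 10–20 MB slop accounts for allocator and pacer noise):

package main

import "fmt"

// expectedNextGC returns the approximate heap goal implied by a live heap
// size and a GOGC percentage, the relation TestSetGCPercent checks above.
func expectedNextGC(live int64, gogc int64) int64 {
	return live + live*gogc/100
}

func main() {
	const live = 100 << 20 // the test's 100 MB baseline
	fmt.Println(expectedNextGC(live, 100)>>20, "MB") // ~200 MB
	fmt.Println(expectedNextGC(live, 50)>>20, "MB")  // ~150 MB
	fmt.Println(expectedNextGC(live, 10)>>20, "MB")  // ~110 MB
}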
diff --git a/src/runtime/defs_freebsd.go b/src/runtime/defs_freebsd.go
index 73422b7..0a11d09 100644
--- a/src/runtime/defs_freebsd.go
+++ b/src/runtime/defs_freebsd.go
@@ -28,9 +28,20 @@
#include <sys/thr.h>
#include <sys/_sigset.h>
#include <sys/unistd.h>
+#include <sys/sysctl.h>
+#include <sys/cpuset.h>
+#include <sys/param.h>
*/
import "C"
+// Local consts.
+const (
+ _NBBY = C.NBBY // Number of bits in a byte.
+ _CTL_MAXNAME = C.CTL_MAXNAME // Largest number of components supported.
+ _CPU_LEVEL_WHICH = C.CPU_LEVEL_WHICH // Actual mask/id for which.
+ _CPU_WHICH_PID = C.CPU_WHICH_PID // Specifies a process id.
+)
+
const (
EINTR = C.EINTR
EFAULT = C.EFAULT
diff --git a/src/runtime/defs_freebsd_386.go b/src/runtime/defs_freebsd_386.go
index 0c05d71..92b0550 100644
--- a/src/runtime/defs_freebsd_386.go
+++ b/src/runtime/defs_freebsd_386.go
@@ -6,6 +6,13 @@
import "unsafe"
const (
+ _NBBY = 0x8
+ _CTL_MAXNAME = 0x18
+ _CPU_LEVEL_WHICH = 0x3
+ _CPU_WHICH_PID = 0x2
+)
+
+const (
_EINTR = 0x4
_EFAULT = 0xe
diff --git a/src/runtime/defs_freebsd_amd64.go b/src/runtime/defs_freebsd_amd64.go
index b416044..645e205 100644
--- a/src/runtime/defs_freebsd_amd64.go
+++ b/src/runtime/defs_freebsd_amd64.go
@@ -6,6 +6,13 @@
import "unsafe"
const (
+ _NBBY = 0x8
+ _CTL_MAXNAME = 0x18
+ _CPU_LEVEL_WHICH = 0x3
+ _CPU_WHICH_PID = 0x2
+)
+
+const (
_EINTR = 0x4
_EFAULT = 0xe
diff --git a/src/runtime/defs_freebsd_arm.go b/src/runtime/defs_freebsd_arm.go
index 8f85f17..c8a198f 100644
--- a/src/runtime/defs_freebsd_arm.go
+++ b/src/runtime/defs_freebsd_arm.go
@@ -6,6 +6,13 @@
import "unsafe"
const (
+ _NBBY = 0x8
+ _CTL_MAXNAME = 0x18
+ _CPU_LEVEL_WHICH = 0x3
+ _CPU_WHICH_PID = 0x2
+)
+
+const (
_EINTR = 0x4
_EFAULT = 0xe
diff --git a/src/runtime/duff_386.s b/src/runtime/duff_386.s
index 5575455..ab01430 100644
--- a/src/runtime/duff_386.s
+++ b/src/runtime/duff_386.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/duff_amd64.s b/src/runtime/duff_amd64.s
index 6ed7f65..a1112a4 100644
--- a/src/runtime/duff_amd64.s
+++ b/src/runtime/duff_amd64.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/duff_arm.s b/src/runtime/duff_arm.s
index da9f0cb..ba8235b 100644
--- a/src/runtime/duff_arm.s
+++ b/src/runtime/duff_arm.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/duff_arm64.s b/src/runtime/duff_arm64.s
index 5a147fa..60a0e26 100644
--- a/src/runtime/duff_arm64.s
+++ b/src/runtime/duff_arm64.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/duff_mips64x.s b/src/runtime/duff_mips64x.s
index 062e04a..e21b81d 100644
--- a/src/runtime/duff_mips64x.s
+++ b/src/runtime/duff_mips64x.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/duff_ppc64x.s b/src/runtime/duff_ppc64x.s
index c8204c4..b4bb9e7 100644
--- a/src/runtime/duff_ppc64x.s
+++ b/src/runtime/duff_ppc64x.s
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mkduff.go
+// Code generated by mkduff.go; DO NOT EDIT.
// Run go generate from src/runtime to update.
// See mkduff.go for comments.
diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go
index da34425..6b45a43 100644
--- a/src/runtime/env_posix.go
+++ b/src/runtime/env_posix.go
@@ -13,7 +13,7 @@
if env == nil {
throw("getenv before env init")
}
- for _, s := range environ() {
+ for _, s := range env {
if len(s) > len(key) && s[len(key)] == '=' && s[:len(key)] == key {
return s[len(key)+1:]
}
diff --git a/src/runtime/error.go b/src/runtime/error.go
index 0238c5e..eafcc9b 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -4,6 +4,8 @@
package runtime
+import _ "unsafe" // for go:linkname
+
// The Error interface identifies a run time error.
type Error interface {
error
@@ -72,8 +74,6 @@
// For calling from C.
// Prints an argument passed to panic.
-// There's room for arbitrary complexity here, but we keep it
-// simple and handle just a few important cases: int, string, and Stringer.
func printany(i interface{}) {
switch v := i.(type) {
case nil:
@@ -82,8 +82,38 @@
print(v.String())
case error:
print(v.Error())
+ case bool:
+ print(v)
case int:
print(v)
+ case int8:
+ print(v)
+ case int16:
+ print(v)
+ case int32:
+ print(v)
+ case int64:
+ print(v)
+ case uint:
+ print(v)
+ case uint8:
+ print(v)
+ case uint16:
+ print(v)
+ case uint32:
+ print(v)
+ case uint64:
+ print(v)
+ case uintptr:
+ print(v)
+ case float32:
+ print(v)
+ case float64:
+ print(v)
+ case complex64:
+ print(v)
+ case complex128:
+ print(v)
case string:
print(v)
default:
@@ -91,7 +121,41 @@
}
}
+// strings.IndexByte is implemented in runtime/asm_$goarch.s
+// but amusingly we need go:linkname to get access to it here in the runtime.
+//go:linkname stringsIndexByte strings.IndexByte
+func stringsIndexByte(s string, c byte) int
+
// called from generated code
-func panicwrap(pkg, typ, meth string) {
+func panicwrap() {
+ pc := make([]uintptr, 1)
+ n := Callers(2, pc)
+ if n == 0 {
+ throw("panicwrap: Callers failed")
+ }
+ frames := CallersFrames(pc)
+ frame, _ := frames.Next()
+ name := frame.Function
+ // name is something like "main.(*T).F".
+ // We want to extract pkg ("main"), typ ("T"), and meth ("F").
+ // Do it by finding the parens.
+ i := stringsIndexByte(name, '(')
+ if i < 0 {
+ throw("panicwrap: no ( in " + frame.Function)
+ }
+ pkg := name[:i-1]
+ if i+2 >= len(name) || name[i-1:i+2] != ".(*" {
+ throw("panicwrap: unexpected string after package name: " + frame.Function)
+ }
+ name = name[i+2:]
+ i = stringsIndexByte(name, ')')
+ if i < 0 {
+ throw("panicwrap: no ) in " + frame.Function)
+ }
+ if i+2 >= len(name) || name[i:i+2] != ")." {
+ throw("panicwrap: unexpected string after type name: " + frame.Function)
+ }
+ typ := name[:i]
+ meth := name[i+2:]
panic(plainError("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer"))
}
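
panicwrap now derives pkg, typ, and meth from its caller's frame name ("pkg.(*T).F") instead of taking them as compiler-generated arguments. The same parsing, written with ordinary strings functions outside the runtime (a sketch, not the runtime code itself):

package main

import (
	"fmt"
	"strings"
)

// splitWrapperName splits a frame name of the form "pkg.(*T).F" into its
// package, type, and method, following the same steps as panicwrap.
func splitWrapperName(name string) (pkg, typ, meth string, ok bool) {
	i := strings.IndexByte(name, '(')
	if i < 1 || i+2 > len(name) || name[i-1:i+2] != ".(*" {
		return "", "", "", false
	}
	pkg = name[:i-1]
	rest := name[i+2:]
	j := strings.IndexByte(rest, ')')
	if j < 0 || j+2 > len(rest) || rest[j:j+2] != ")." {
		return "", "", "", false
	}
	return pkg, rest[:j], rest[j+2:], true
}

func main() {
	fmt.Println(splitWrapperName("main.(*T).F")) // main T F true
}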
diff --git a/src/runtime/example_test.go b/src/runtime/example_test.go
new file mode 100644
index 0000000..e4912a5
--- /dev/null
+++ b/src/runtime/example_test.go
@@ -0,0 +1,54 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+)
+
+func ExampleFrames() {
+ c := func() {
+ // Ask runtime.Callers for up to 10 pcs, including runtime.Callers itself.
+ pc := make([]uintptr, 10)
+ n := runtime.Callers(0, pc)
+ if n == 0 {
+ // No pcs available. Stop now.
+ // This can happen if the first argument to runtime.Callers is large.
+ return
+ }
+
+ pc = pc[:n] // pass only valid pcs to runtime.CallersFrames
+ frames := runtime.CallersFrames(pc)
+
+ // Loop to get frames.
+ // A fixed number of pcs can expand to an indefinite number of Frames.
+ for {
+ frame, more := frames.Next()
+ // To keep this example's output stable
+ // even if there are changes in the testing package,
+ // stop unwinding when we leave package runtime.
+ if !strings.Contains(frame.File, "runtime/") {
+ break
+ }
+ fmt.Printf("- more:%v | %s\n", more, frame.Function)
+ if !more {
+ break
+ }
+ }
+ }
+
+ b := func() { c() }
+ a := func() { b() }
+
+ a()
+ // Output:
+ // - more:true | runtime.Callers
+ // - more:true | runtime_test.ExampleFrames.func1
+ // - more:true | runtime_test.ExampleFrames.func2
+ // - more:true | runtime_test.ExampleFrames.func3
+ // - more:true | runtime_test.ExampleFrames
+}
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 9b76555..c929bd4 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -41,11 +41,11 @@
}
func LFStackPush(head *uint64, node *LFNode) {
- lfstackpush(head, (*lfnode)(unsafe.Pointer(node)))
+ (*lfstack)(head).push((*lfnode)(unsafe.Pointer(node)))
}
func LFStackPop(head *uint64) *LFNode {
- return (*LFNode)(unsafe.Pointer(lfstackpop(head)))
+ return (*LFNode)(unsafe.Pointer((*lfstack)(head).pop()))
}
func GCMask(x interface{}) (ret []byte) {
@@ -245,3 +245,127 @@
return
}
+
+func Fastrand() uint32 { return fastrand() }
+func Fastrandn(n uint32) uint32 { return fastrandn(n) }
+
+type ProfBuf profBuf
+
+func NewProfBuf(hdrsize, bufwords, tags int) *ProfBuf {
+ return (*ProfBuf)(newProfBuf(hdrsize, bufwords, tags))
+}
+
+func (p *ProfBuf) Write(tag *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ (*profBuf)(p).write(tag, now, hdr, stk)
+}
+
+const (
+ ProfBufBlocking = profBufBlocking
+ ProfBufNonBlocking = profBufNonBlocking
+)
+
+func (p *ProfBuf) Read(mode profBufReadMode) ([]uint64, []unsafe.Pointer, bool) {
+ return (*profBuf)(p).read(profBufReadMode(mode))
+}
+
+func (p *ProfBuf) Close() {
+ (*profBuf)(p).close()
+}
+
+// ReadMemStatsSlow returns both the runtime-computed MemStats and
+// MemStats accumulated by scanning the heap.
+func ReadMemStatsSlow() (base, slow MemStats) {
+ stopTheWorld("ReadMemStatsSlow")
+
+ // Run on the system stack to avoid stack growth allocation.
+ systemstack(func() {
+ // Make sure stats don't change.
+ getg().m.mallocing++
+
+ readmemstats_m(&base)
+
+ // Initialize slow from base and zero the fields we're
+ // recomputing.
+ slow = base
+ slow.Alloc = 0
+ slow.TotalAlloc = 0
+ slow.Mallocs = 0
+ slow.Frees = 0
+ var bySize [_NumSizeClasses]struct {
+ Mallocs, Frees uint64
+ }
+
+ // Add up current allocations in spans.
+ for _, s := range mheap_.allspans {
+ if s.state != mSpanInUse {
+ continue
+ }
+ if sizeclass := s.spanclass.sizeclass(); sizeclass == 0 {
+ slow.Mallocs++
+ slow.Alloc += uint64(s.elemsize)
+ } else {
+ slow.Mallocs += uint64(s.allocCount)
+ slow.Alloc += uint64(s.allocCount) * uint64(s.elemsize)
+ bySize[sizeclass].Mallocs += uint64(s.allocCount)
+ }
+ }
+
+ // Add in frees. readmemstats_m flushed the cached stats, so
+ // these are up-to-date.
+ var smallFree uint64
+ slow.Frees = mheap_.nlargefree
+ for i := range mheap_.nsmallfree {
+ slow.Frees += mheap_.nsmallfree[i]
+ bySize[i].Frees = mheap_.nsmallfree[i]
+ bySize[i].Mallocs += mheap_.nsmallfree[i]
+ smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ }
+ slow.Frees += memstats.tinyallocs
+ slow.Mallocs += slow.Frees
+
+ slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree
+
+ for i := range slow.BySize {
+ slow.BySize[i].Mallocs = bySize[i].Mallocs
+ slow.BySize[i].Frees = bySize[i].Frees
+ }
+
+ getg().m.mallocing--
+ })
+
+ startTheWorld()
+ return
+}
+
+// BlockOnSystemStack switches to the system stack, prints "x\n" to
+// stderr, and blocks in a stack containing
+// "runtime.blockOnSystemStackInternal".
+func BlockOnSystemStack() {
+ systemstack(blockOnSystemStackInternal)
+}
+
+func blockOnSystemStackInternal() {
+ print("x\n")
+ lock(&deadlock)
+ lock(&deadlock)
+}
+
+type RWMutex struct {
+ rw rwmutex
+}
+
+func (rw *RWMutex) RLock() {
+ rw.rw.rlock()
+}
+
+func (rw *RWMutex) RUnlock() {
+ rw.rw.runlock()
+}
+
+func (rw *RWMutex) Lock() {
+ rw.rw.lock()
+}
+
+func (rw *RWMutex) Unlock() {
+ rw.rw.unlock()
+}
diff --git a/src/runtime/export_unix_test.go b/src/runtime/export_unix_test.go
new file mode 100644
index 0000000..54d5770
--- /dev/null
+++ b/src/runtime/export_unix_test.go
@@ -0,0 +1,19 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package runtime
+
+func sigismember(mask *sigset, i int) bool {
+ clear := *mask
+ sigdelset(&clear, i)
+ return clear != *mask
+}
+
+func Sigisblocked(i int) bool {
+ var sigmask sigset
+ sigprocmask(_SIG_SETMASK, nil, &sigmask)
+ return sigismember(&sigmask, i)
+}
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 1b53367..6e6c674 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -50,13 +50,6 @@
gcshrinkstackoff: setting gcshrinkstackoff=1 disables moving goroutines
onto smaller stacks. In this mode, a goroutine's stack can only grow.
- gcstackbarrieroff: setting gcstackbarrieroff=1 disables the use of stack barriers
- that allow the garbage collector to avoid repeating a stack scan during the
- mark termination phase.
-
- gcstackbarrierall: setting gcstackbarrierall=1 installs stack barriers
- in every stack frame, rather than in exponentially-spaced frames.
-
gcrescanstacks: setting gcrescanstacks=1 enables stack
re-scanning during the STW mark termination phase. This is
helpful for debugging if objects are being prematurely
@@ -85,7 +78,7 @@
for mark/scan are broken down in to assist time (GC performed in
line with allocation), background GC time, and idle GC time.
If the line ends with "(forced)", this GC was forced by a
- runtime.GC() call and all phases are STW.
+ runtime.GC() call.
Setting gctrace to any value > 0 also causes the garbage collector
to emit a summary when memory is released back to the system.
@@ -173,33 +166,26 @@
// program counter, file name, and line number within the file of the corresponding
// call. The boolean ok is false if it was not possible to recover the information.
func Caller(skip int) (pc uintptr, file string, line int, ok bool) {
- // Ask for two PCs: the one we were asked for
- // and what it called, so that we can see if it
- // "called" sigpanic.
- var rpc [2]uintptr
+ // Make room for three PCs: the one we were asked for,
+ // what it called, so that CallersFrames can see if it "called"
+ // sigpanic, and possibly a PC for skipPleaseUseCallersFrames.
+ var rpc [3]uintptr
if callers(1+skip-1, rpc[:]) < 2 {
return
}
- f := findfunc(rpc[1])
- if f == nil {
- // TODO(rsc): Probably a bug?
- // The C version said "have retpc at least"
- // but actually returned pc=0.
- ok = true
+ var stackExpander stackExpander
+ callers := stackExpander.init(rpc[:])
+ // We asked for one extra, so skip that one. If this is sigpanic,
+ // stepping over this frame will set up state in Frames so the
+ // next frame is correct.
+ callers, _, ok = stackExpander.next(callers)
+ if !ok {
return
}
- pc = rpc[1]
- xpc := pc
- g := findfunc(rpc[0])
- // All architectures turn faults into apparent calls to sigpanic.
- // If we see a call to sigpanic, we do not back up the PC to find
- // the line number of the call instruction, because there is no call.
- if xpc > f.entry && (g == nil || g.entry != funcPC(sigpanic)) {
- xpc--
- }
- file, line32 := funcline(f, xpc)
- line = int(line32)
- ok = true
+ _, frame, _ := stackExpander.next(callers)
+ pc = frame.PC
+ file = frame.File
+ line = frame.Line
return
}
@@ -209,11 +195,13 @@
// 1 identifying the caller of Callers.
// It returns the number of entries written to pc.
//
-// Note that since each slice entry pc[i] is a return program counter,
-// looking up the file and line for pc[i] (for example, using (*Func).FileLine)
-// will normally return the file and line number of the instruction immediately
-// following the call.
-// To easily look up file/line information for the call sequence, use Frames.
+// To translate these PCs into symbolic information such as function
+// names and line numbers, use CallersFrames. CallersFrames accounts
+// for inlined functions and adjusts the return program counters into
+// call program counters. Iterating over the returned slice of PCs
+// directly is discouraged, as is using FuncForPC on any of the
+// returned PCs, since these cannot account for inlining or return
+// program counter adjustment.
func Callers(skip int, pc []uintptr) int {
// runtime.callers uses pc.array==nil as a signal
// to print a stack trace. Pick off 0-length pc here
@@ -247,5 +235,5 @@
const GOOS string = sys.GOOS
// GOARCH is the running program's architecture target:
-// 386, amd64, arm, or s390x.
+// one of 386, amd64, arm, s390x, and so on.
const GOARCH string = sys.GOARCH
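
Per the revised Callers documentation, raw PCs should be symbolized through CallersFrames (see ExampleFrames below); runtime.Caller itself keeps its simple interface. A short usage reminder, not tied to this patch:

package main

import (
	"fmt"
	"runtime"
)

func where() string {
	// skip=1 identifies the caller of where, i.e. the call site in main,
	// now adjusted for sigpanic and inlined frames by the runtime.
	_, file, line, ok := runtime.Caller(1)
	if !ok {
		return "unknown"
	}
	return fmt.Sprintf("%s:%d", file, line)
}

func main() {
	fmt.Println("called from", where())
}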
diff --git a/src/runtime/fastlog2.go b/src/runtime/fastlog2.go
index 5f3fb53..1f251bf 100644
--- a/src/runtime/fastlog2.go
+++ b/src/runtime/fastlog2.go
@@ -4,8 +4,6 @@
package runtime
-import "unsafe"
-
// fastlog2 implements a fast approximation to the base 2 log of a
// float64. This is used to compute a geometric distribution for heap
// sampling, without introducing dependencies into package math. This
@@ -27,7 +25,3 @@
low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
}
-
-// float64bits returns the IEEE 754 binary representation of f.
-// Taken from math.Float64bits to avoid dependencies into package math.
-func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
diff --git a/src/runtime/float.go b/src/runtime/float.go
new file mode 100644
index 0000000..459e58d
--- /dev/null
+++ b/src/runtime/float.go
@@ -0,0 +1,53 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+var inf = float64frombits(0x7FF0000000000000)
+
+// isNaN reports whether f is an IEEE 754 ``not-a-number'' value.
+func isNaN(f float64) (is bool) {
+ // IEEE 754 says that only NaNs satisfy f != f.
+ return f != f
+}
+
+// isFinite reports whether f is neither NaN nor an infinity.
+func isFinite(f float64) bool {
+ return !isNaN(f - f)
+}
+
+// isInf reports whether f is an infinity.
+func isInf(f float64) bool {
+ return !isNaN(f) && !isFinite(f)
+}
+
+// abs returns the absolute value of x.
+//
+// Special cases are:
+// abs(±Inf) = +Inf
+// abs(NaN) = NaN
+func abs(x float64) float64 {
+ const sign = 1 << 63
+ return float64frombits(float64bits(x) &^ sign)
+}
+
+// copysign returns a value with the magnitude
+// of x and the sign of y.
+func copysign(x, y float64) float64 {
+ const sign = 1 << 63
+ return float64frombits(float64bits(x)&^sign | float64bits(y)&sign)
+}
+
+// float64bits returns the IEEE 754 binary representation of f.
+func float64bits(f float64) uint64 {
+ return *(*uint64)(unsafe.Pointer(&f))
+}
+
+// float64frombits returns the floating point number corresponding to
+// the IEEE 754 binary representation b.
+func float64frombits(b uint64) float64 {
+ return *(*float64)(unsafe.Pointer(&b))
+}
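
The new float helpers mirror the math package's bit-level definitions, so their identities can be checked from ordinary code using math.Float64bits and friends (a sketch under that assumption, not runtime code):

package main

import (
	"fmt"
	"math"
)

// These mirror the runtime definitions above, expressed with package math.
func isNaN(f float64) bool    { return f != f }
func isFinite(f float64) bool { return !isNaN(f - f) }
func isInf(f float64) bool    { return !isNaN(f) && !isFinite(f) }

// abs clears the sign bit, exactly as the runtime helper does.
func abs(x float64) float64 {
	const sign = 1 << 63
	return math.Float64frombits(math.Float64bits(x) &^ sign)
}

func main() {
	fmt.Println(isNaN(math.NaN()), isFinite(1.5), isInf(math.Inf(-1))) // true true true
	fmt.Println(abs(-2.5))                                             // 2.5
}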
diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h
index 82992e2..27d5a2f 100644
--- a/src/runtime/funcdata.h
+++ b/src/runtime/funcdata.h
@@ -6,14 +6,14 @@
// in Go binaries. It is included by assembly sources, so it must
// be written using #defines.
//
-// The Go compiler also #includes this file, for now.
-//
-// symtab.go also contains a copy of these constants.
+// These must agree with symtab.go and ../cmd/internal/obj/funcdata.go.
#define PCDATA_StackMapIndex 0
+#define PCDATA_InlTreeIndex 1
#define FUNCDATA_ArgsPointerMaps 0 /* garbage collector blocks */
#define FUNCDATA_LocalsPointerMaps 1
+#define FUNCDATA_InlTree 2
// Pseudo-assembly statements.
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 4a32f15..03acc8a 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -5,6 +5,7 @@
package runtime_test
import (
+ "fmt"
"os"
"reflect"
"runtime"
@@ -448,3 +449,53 @@
t.Fatalf("mheap_.pagesInUse is %d, but direct count is %d", pagesInUse, counted)
}
}
+
+func TestReadMemStats(t *testing.T) {
+ base, slow := runtime.ReadMemStatsSlow()
+ if base != slow {
+ logDiff(t, "MemStats", reflect.ValueOf(base), reflect.ValueOf(slow))
+ t.Fatal("memstats mismatch")
+ }
+}
+
+func logDiff(t *testing.T, prefix string, got, want reflect.Value) {
+ typ := got.Type()
+ switch typ.Kind() {
+ case reflect.Array, reflect.Slice:
+ if got.Len() != want.Len() {
+ t.Logf("len(%s): got %v, want %v", prefix, got, want)
+ return
+ }
+ for i := 0; i < got.Len(); i++ {
+ logDiff(t, fmt.Sprintf("%s[%d]", prefix, i), got.Index(i), want.Index(i))
+ }
+ case reflect.Struct:
+ for i := 0; i < typ.NumField(); i++ {
+ gf, wf := got.Field(i), want.Field(i)
+ logDiff(t, prefix+"."+typ.Field(i).Name, gf, wf)
+ }
+ case reflect.Map:
+ t.Fatal("not implemented: logDiff for map")
+ default:
+ if got.Interface() != want.Interface() {
+ t.Logf("%s: got %v, want %v", prefix, got, want)
+ }
+ }
+}
+
+func BenchmarkReadMemStats(b *testing.B) {
+ var ms runtime.MemStats
+ const heapSize = 100 << 20
+ x := make([]*[1024]byte, heapSize/1024)
+ for i := range x {
+ x[i] = new([1024]byte)
+ }
+ hugeSink = x
+
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ runtime.ReadMemStats(&ms)
+ }
+
+ hugeSink = nil
+}
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index 086d374..11ce0cb 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -116,6 +116,11 @@
oldbuckets unsafe.Pointer // previous bucket array of half the size, non-nil only when growing
nevacuate uintptr // progress counter for evacuation (buckets less than this have been evacuated)
+ extra *mapextra // optional fields
+}
+
+// mapextra holds fields that are not present on all maps.
+type mapextra struct {
// If both key and value do not contain pointers and are inline, then we mark bucket
// type as containing no pointers. This avoids scanning such maps.
// However, bmap.overflow is a pointer. In order to keep overflow buckets
@@ -123,9 +128,11 @@
// Overflow is used only if key and value do not contain pointers.
// overflow[0] contains overflow buckets for hmap.buckets.
// overflow[1] contains overflow buckets for hmap.oldbuckets.
- // The first indirection allows us to reduce static size of hmap.
- // The second indirection allows to store a pointer to the slice in hiter.
- overflow *[2]*[]*bmap
+ // The indirection allows storing a pointer to the slice in hiter.
+ overflow [2]*[]*bmap
+
+ // nextOverflow holds a pointer to a free overflow bucket.
+ nextOverflow *bmap
}
// A bucket for a Go map.
@@ -170,6 +177,10 @@
return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize))
}
+func (b *bmap) setoverflow(t *maptype, ovf *bmap) {
+ *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf
+}
+
// incrnoverflow increments h.noverflow.
// noverflow counts the number of overflow buckets.
// This is used to trigger same-size map growth.
@@ -196,21 +207,40 @@
}
}
-func (h *hmap) setoverflow(t *maptype, b, ovf *bmap) {
+func (h *hmap) newoverflow(t *maptype, b *bmap) *bmap {
+ var ovf *bmap
+ if h.extra != nil && h.extra.nextOverflow != nil {
+ // We have preallocated overflow buckets available.
+ // See makeBucketArray for more details.
+ ovf = h.extra.nextOverflow
+ if ovf.overflow(t) == nil {
+ // We're not at the end of the preallocated overflow buckets. Bump the pointer.
+ h.extra.nextOverflow = (*bmap)(add(unsafe.Pointer(ovf), uintptr(t.bucketsize)))
+ } else {
+ // This is the last preallocated overflow bucket.
+ // Reset the overflow pointer on this bucket,
+ // which was set to a non-nil sentinel value.
+ ovf.setoverflow(t, nil)
+ h.extra.nextOverflow = nil
+ }
+ } else {
+ ovf = (*bmap)(newobject(t.bucket))
+ }
h.incrnoverflow()
if t.bucket.kind&kindNoPointers != 0 {
h.createOverflow()
- *h.overflow[0] = append(*h.overflow[0], ovf)
+ *h.extra.overflow[0] = append(*h.extra.overflow[0], ovf)
}
- *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf
+ b.setoverflow(t, ovf)
+ return ovf
}
func (h *hmap) createOverflow() {
- if h.overflow == nil {
- h.overflow = new([2]*[]*bmap)
+ if h.extra == nil {
+ h.extra = new(mapextra)
}
- if h.overflow[0] == nil {
- h.overflow[0] = new([]*bmap)
+ if h.extra.overflow[0] == nil {
+ h.extra.overflow[0] = new([]*bmap)
}
}
@@ -225,9 +255,8 @@
throw("bad hmap size")
}
- if hint < 0 || int64(int32(hint)) != hint {
- panic(plainError("makemap: size out of range"))
- // TODO: make hint an int, then none of this nonsense
+ if hint < 0 || hint > int64(maxSliceCap(t.bucket.size)) {
+ hint = 0
}
if !ismapkey(t.key) {
@@ -277,8 +306,14 @@
// if B == 0, the buckets field is allocated lazily later (in mapassign)
// If hint is large zeroing this memory could take a while.
buckets := bucket
+ var extra *mapextra
if B != 0 {
- buckets = newarray(t.bucket, 1<<B)
+ var nextOverflow *bmap
+ buckets, nextOverflow = makeBucketArray(t, B)
+ if nextOverflow != nil {
+ extra = new(mapextra)
+ extra.nextOverflow = nextOverflow
+ }
}
// initialize Hmap
@@ -287,6 +322,7 @@
}
h.count = 0
h.B = B
+ h.extra = extra
h.flags = 0
h.hash0 = fastrand()
h.buckets = buckets
@@ -498,11 +534,13 @@
if h.flags&hashWriting != 0 {
throw("concurrent map writes")
}
- h.flags |= hashWriting
-
alg := t.key.alg
hash := alg.hash(key, uintptr(h.hash0))
+ // Set hashWriting after calling alg.hash, since alg.hash may panic,
+ // in which case we have not actually done a write.
+ h.flags |= hashWriting
+
if h.buckets == nil {
h.buckets = newarray(t.bucket, 1)
}
@@ -563,8 +601,7 @@
if inserti == nil {
// all current buckets are full, allocate a new one.
- newb := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, b, newb)
+ newb := h.newoverflow(t, b)
inserti = &newb.tophash[0]
insertk = add(unsafe.Pointer(newb), dataOffset)
val = add(insertk, bucketCnt*uintptr(t.keysize))
@@ -611,10 +648,14 @@
if h.flags&hashWriting != 0 {
throw("concurrent map writes")
}
- h.flags |= hashWriting
alg := t.key.alg
hash := alg.hash(key, uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash, since alg.hash may panic,
+ // in which case we have not actually done a write (delete).
+ h.flags |= hashWriting
+
bucket := hash & (uintptr(1)<<h.B - 1)
if h.growing() {
growWork(t, h, bucket)
@@ -702,7 +743,7 @@
// the table grows and/or overflow buckets are added to the table
// while we are iterating.
h.createOverflow()
- it.overflow = *h.overflow
+ it.overflow = h.extra.overflow
}
// decide where to start
@@ -867,6 +908,36 @@
goto next
}
+func makeBucketArray(t *maptype, b uint8) (buckets unsafe.Pointer, nextOverflow *bmap) {
+ base := uintptr(1 << b)
+ nbuckets := base
+ // For small b, overflow buckets are unlikely.
+ // Avoid the overhead of the calculation.
+ if b >= 4 {
+ // Add on the estimated number of overflow buckets
+ // required to insert the median number of elements
+ // used with this value of b.
+ nbuckets += 1 << (b - 4)
+ sz := t.bucket.size * nbuckets
+ up := roundupsize(sz)
+ if up != sz {
+ nbuckets = up / t.bucket.size
+ }
+ }
+ buckets = newarray(t.bucket, int(nbuckets))
+ if base != nbuckets {
+ // We preallocated some overflow buckets.
+ // To keep the overhead of tracking these overflow buckets to a minimum,
+ // we use the convention that if a preallocated overflow bucket's overflow
+ // pointer is nil, then there are more available by bumping the pointer.
+ // We need a safe non-nil pointer for the last overflow bucket; just use buckets.
+ nextOverflow = (*bmap)(add(buckets, base*uintptr(t.bucketsize)))
+ last := (*bmap)(add(buckets, (nbuckets-1)*uintptr(t.bucketsize)))
+ last.setoverflow(t, (*bmap)(buckets))
+ }
+ return buckets, nextOverflow
+}
+
func hashGrow(t *maptype, h *hmap) {
// If we've hit the load factor, get bigger.
// Otherwise, there are too many overflow buckets,
@@ -877,7 +948,8 @@
h.flags |= sameSizeGrow
}
oldbuckets := h.buckets
- newbuckets := newarray(t.bucket, 1<<(h.B+bigger))
+ newbuckets, nextOverflow := makeBucketArray(t, h.B+bigger)
+
flags := h.flags &^ (iterator | oldIterator)
if h.flags&iterator != 0 {
flags |= oldIterator
@@ -890,13 +962,19 @@
h.nevacuate = 0
h.noverflow = 0
- if h.overflow != nil {
+ if h.extra != nil && h.extra.overflow[0] != nil {
// Promote current overflow buckets to the old generation.
- if h.overflow[1] != nil {
+ if h.extra.overflow[1] != nil {
throw("overflow is not nil")
}
- h.overflow[1] = h.overflow[0]
- h.overflow[0] = nil
+ h.extra.overflow[1] = h.extra.overflow[0]
+ h.extra.overflow[0] = nil
+ }
+ if nextOverflow != nil {
+ if h.extra == nil {
+ h.extra = new(mapextra)
+ }
+ h.extra.nextOverflow = nextOverflow
}
// the actual copying of the hash table data is done incrementally
@@ -906,7 +984,7 @@
// overLoadFactor reports whether count items placed in 1<<B buckets is over loadFactor.
func overLoadFactor(count int64, B uint8) bool {
// TODO: rewrite to use integer math and comparison?
- return count >= bucketCnt && float32(count) >= loadFactor*float32((uintptr(1)<<B))
+ return count >= bucketCnt && float32(count) >= loadFactor*float32((uint64(1)<<B))
}
// tooManyOverflowBuckets reports whether noverflow buckets is too many for a map with 1<<B buckets.
@@ -958,6 +1036,11 @@
}
}
+func bucketEvacuated(t *maptype, h *hmap, bucket uintptr) bool {
+ b := (*bmap)(add(h.oldbuckets, bucket*uintptr(t.bucketsize)))
+ return evacuated(b)
+}
+
func evacuate(t *maptype, h *hmap, oldbucket uintptr) {
b := (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)))
newbit := h.noldbuckets()
@@ -1034,8 +1117,7 @@
if useX {
b.tophash[i] = evacuatedX
if xi == bucketCnt {
- newx := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, x, newx)
+ newx := h.newoverflow(t, x)
x = newx
xi = 0
xk = add(unsafe.Pointer(x), dataOffset)
@@ -1058,8 +1140,7 @@
} else {
b.tophash[i] = evacuatedY
if yi == bucketCnt {
- newy := (*bmap)(newobject(t.bucket))
- h.setoverflow(t, y, newy)
+ newy := h.newoverflow(t, y)
y = newy
yi = 0
yk = add(unsafe.Pointer(y), dataOffset)
@@ -1098,14 +1179,23 @@
// Advance evacuation mark
if oldbucket == h.nevacuate {
h.nevacuate = oldbucket + 1
- if oldbucket+1 == newbit { // newbit == # of oldbuckets
+ // Experiments suggest that 1024 is overkill by at least an order of magnitude.
+ // Put it in there as a safeguard anyway, to ensure O(1) behavior.
+ stop := h.nevacuate + 1024
+ if stop > newbit {
+ stop = newbit
+ }
+ for h.nevacuate != stop && bucketEvacuated(t, h, h.nevacuate) {
+ h.nevacuate++
+ }
+ if h.nevacuate == newbit { // newbit == # of oldbuckets
// Growing is all done. Free old main bucket array.
h.oldbuckets = nil
// Can discard old overflow buckets as well.
// If they are still referenced by an iterator,
// then the iterator holds a pointers to the slice.
- if h.overflow != nil {
- h.overflow[1] = nil
+ if h.extra != nil {
+ h.extra.overflow[1] = nil
}
h.flags &^= sameSizeGrow
}
@@ -1119,8 +1209,8 @@
// Reflect stubs. Called from ../reflect/asm_*.s
//go:linkname reflect_makemap reflect.makemap
-func reflect_makemap(t *maptype) *hmap {
- return makemap(t, 0, nil, nil)
+func reflect_makemap(t *maptype, cap int) *hmap {
+ return makemap(t, int64(cap), nil, nil)
}
//go:linkname reflect_mapaccess reflect.mapaccess
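
makeBucketArray preallocates overflow buckets in the same allocation as the regular buckets, and newoverflow hands them out with a bump pointer: a nil overflow link means more preallocated buckets follow, and the last one carries a harmless non-nil sentinel. The convention in miniature, using a slice index instead of the runtime's unsafe pointer bump (a simplified sketch, not the map implementation):

package main

import "fmt"

// node stands in for bmap: a nil next means more preallocated nodes
// remain; the last preallocated node's next is a non-nil sentinel.
type node struct {
	next *node
	data int
}

type pool struct {
	nodes []node
	idx   int // bump index into the preallocated run; -1 once exhausted
}

func newPool(n int) *pool {
	p := &pool{nodes: make([]node, n)}
	p.nodes[n-1].next = &p.nodes[0] // any safe non-nil pointer marks the end
	return p
}

func (p *pool) get() *node {
	if p.idx < 0 {
		return new(node) // preallocated run used up: fall back to the allocator
	}
	n := &p.nodes[p.idx]
	if n.next == nil {
		p.idx++ // not the last one: bump to the adjacent node
	} else {
		n.next = nil // last one: clear the sentinel and stop handing out nodes
		p.idx = -1
	}
	return n
}

func main() {
	p := newPool(2)
	for i := 0; i < 4; i++ {
		fmt.Printf("%p\n", p.get()) // first two come from the slice, the rest from new
	}
}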
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go
index b5ecc2d..1f9b313 100644
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -45,7 +45,7 @@
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -94,7 +94,7 @@
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -143,7 +143,7 @@
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -192,7 +192,7 @@
if k != key {
continue
}
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -223,7 +223,7 @@
if key.len < 32 {
// short key, doing lots of comparisons is ok
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -240,7 +240,7 @@
// long key, try not to do more comparisons than necessary
keymaybe := uintptr(bucketCnt)
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -252,8 +252,6 @@
return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
}
// check first 4 bytes
- // TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
- // four 1-byte comparisons.
if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
continue
}
@@ -295,7 +293,7 @@
}
for {
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x != top {
continue
}
@@ -332,7 +330,7 @@
if key.len < 32 {
// short key, doing lots of comparisons is ok
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -349,7 +347,7 @@
// long key, try not to do more comparisons than necessary
keymaybe := uintptr(bucketCnt)
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x == empty {
continue
}
@@ -402,7 +400,7 @@
}
for {
for i := uintptr(0); i < bucketCnt; i++ {
- x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.topbits[i] without the bounds check
+ x := *((*uint8)(add(unsafe.Pointer(b), i))) // b.tophash[i] without the bounds check
if x != top {
continue
}
@@ -420,3 +418,441 @@
}
}
}
+
+func mapassign_fast32(t *maptype, h *hmap, key uint32) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast32))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*4)
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := *((*uint32)(add(unsafe.Pointer(b), dataOffset+i*4)))
+ if k != key {
+ continue
+ }
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*4+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*4)
+ }
+
+ // store new key/value at insert position
+ *((*uint32)(insertk)) = key
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
+
+func mapassign_fast64(t *maptype, h *hmap, key uint64) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_fast64))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*8)
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := *((*uint64)(add(unsafe.Pointer(b), dataOffset+i*8)))
+ if k != key {
+ continue
+ }
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*8+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*8)
+ }
+
+ // store new key/value at insert position
+ *((*uint64)(insertk)) = key
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
+
+func mapassign_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
+ if h == nil {
+ panic(plainError("assignment to entry in nil map"))
+ }
+ if raceenabled {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapassign_faststr))
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+ key := stringStructOf(&ky)
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapassign.
+ h.flags |= hashWriting
+
+ if h.buckets == nil {
+ h.buckets = newarray(t.bucket, 1)
+ }
+
+again:
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+
+ var inserti *uint8
+ var insertk unsafe.Pointer
+ var val unsafe.Pointer
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ if b.tophash[i] == empty && inserti == nil {
+ inserti = &b.tophash[i]
+ insertk = add(unsafe.Pointer(b), dataOffset+i*uintptr(t.keysize))
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
+ }
+ continue
+ }
+ k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
+ if k.len != key.len {
+ continue
+ }
+ if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) {
+ continue
+ }
+ // already have a mapping for key. Update it.
+ val = add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
+ goto done
+ }
+ ovf := b.overflow(t)
+ if ovf == nil {
+ break
+ }
+ b = ovf
+ }
+
+ // Did not find mapping for key. Allocate new cell & add entry.
+
+ // If we hit the max load factor or we have too many overflow buckets,
+ // and we're not already in the middle of growing, start growing.
+ if !h.growing() && (overLoadFactor(int64(h.count), h.B) || tooManyOverflowBuckets(h.noverflow, h.B)) {
+ hashGrow(t, h)
+ goto again // Growing the table invalidates everything, so try again
+ }
+
+ if inserti == nil {
+ // all current buckets are full, allocate a new one.
+ newb := h.newoverflow(t, b)
+ inserti = &newb.tophash[0]
+ insertk = add(unsafe.Pointer(newb), dataOffset)
+ val = add(insertk, bucketCnt*2*sys.PtrSize)
+ }
+
+ // store new key/value at insert position
+ *((*stringStruct)(insertk)) = *key
+ *inserti = top
+ h.count++
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+ return val
+}
+
+func mapdelete_fast32(t *maptype, h *hmap, key uint32) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast32))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*uint32)(add(unsafe.Pointer(b), dataOffset+i*4))
+ if key != *k {
+ continue
+ }
+ *k = 0
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*4 + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
+
+func mapdelete_fast64(t *maptype, h *hmap, key uint64) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_fast64))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*uint64)(add(unsafe.Pointer(b), dataOffset+i*8))
+ if key != *k {
+ continue
+ }
+ *k = 0
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*8 + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
+
+func mapdelete_faststr(t *maptype, h *hmap, ky string) {
+ if raceenabled && h != nil {
+ callerpc := getcallerpc(unsafe.Pointer(&t))
+ racewritepc(unsafe.Pointer(h), callerpc, funcPC(mapdelete_faststr))
+ }
+ if h == nil || h.count == 0 {
+ return
+ }
+ if h.flags&hashWriting != 0 {
+ throw("concurrent map writes")
+ }
+
+ key := stringStructOf(&ky)
+ hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
+
+ // Set hashWriting after calling alg.hash for consistency with mapdelete
+ h.flags |= hashWriting
+
+ bucket := hash & (uintptr(1)<<h.B - 1)
+ if h.growing() {
+ growWork(t, h, bucket)
+ }
+ b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
+ top := uint8(hash >> (sys.PtrSize*8 - 8))
+ if top < minTopHash {
+ top += minTopHash
+ }
+ for {
+ for i := uintptr(0); i < bucketCnt; i++ {
+ if b.tophash[i] != top {
+ continue
+ }
+ k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
+ if k.len != key.len {
+ continue
+ }
+ if k.str != key.str && !memequal(k.str, key.str, uintptr(key.len)) {
+ continue
+ }
+ typedmemclr(t.key, unsafe.Pointer(k))
+ v := unsafe.Pointer(uintptr(unsafe.Pointer(b)) + dataOffset + bucketCnt*2*sys.PtrSize + i*uintptr(t.valuesize))
+ typedmemclr(t.elem, v)
+ b.tophash[i] = empty
+ h.count--
+ goto done
+ }
+ b = b.overflow(t)
+ if b == nil {
+ goto done
+ }
+ }
+
+done:
+ if h.flags&hashWriting == 0 {
+ throw("concurrent map writes")
+ }
+ h.flags &^= hashWriting
+}
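
The mapassign_fast*/mapdelete_fast* specializations are selected by the compiler for maps keyed by uint32, uint64, or string; ordinary code exercises them simply by using such maps. A hypothetical micro-benchmark (placed in a _test.go file) that would run through these paths:

package fastmap_test

import "testing"

// Assignments and deletes on these key types go through the specialized
// fast paths added above; nothing in the source changes to opt in.
func BenchmarkAssignDeleteUint64(b *testing.B) {
	m := make(map[uint64]int)
	for i := 0; i < b.N; i++ {
		k := uint64(i & 1023)
		m[k] = i
		delete(m, k)
	}
}

func BenchmarkAssignDeleteString(b *testing.B) {
	keys := []string{"alpha", "beta", "gamma", "delta"}
	m := make(map[string]int)
	for i := 0; i < b.N; i++ {
		k := keys[i&3]
		m[k] = i
		delete(m, k)
	}
}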
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 6039417..35f6124 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -548,7 +548,7 @@
dumpint(memstats.gc_sys)
dumpint(memstats.other_sys)
dumpint(memstats.next_gc)
- dumpint(memstats.last_gc)
+ dumpint(memstats.last_gc_unix)
dumpint(memstats.pause_total_ns)
for i := 0; i < 256; i++ {
dumpint(memstats.pause_ns[i])
@@ -565,7 +565,7 @@
for i := uintptr(0); i < nstk; i++ {
pc := stk[i]
f := findfunc(pc)
- if f == nil {
+ if !f.valid() {
var buf [64]byte
n := len(buf)
n--
@@ -653,7 +653,7 @@
// Update stats so we can dump them.
// As a side effect, flushes all the MCaches so the MSpan.freelist
// lists contain all the free objects.
- updatememstats(nil)
+ updatememstats()
// Set dump file.
dumpfd = fd
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 46010d5..58ed61e 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -53,7 +53,7 @@
}
for m = (*itab)(atomic.Loadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link {
if m.inter == inter && m._type == typ {
- if m.bad != 0 {
+ if m.bad {
if !canfail {
// this can only happen if the conversion
// was already done once using the , ok form
@@ -78,7 +78,7 @@
m._type = typ
additab(m, true, canfail)
unlock(&ifaceLock)
- if m.bad != 0 {
+ if m.bad {
return nil
}
return m
@@ -130,7 +130,7 @@
}
panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), iname})
}
- m.bad = 1
+ m.bad = true
break
nextimethod:
}
@@ -139,7 +139,7 @@
}
h := itabhash(inter, typ)
m.link = hash[h]
- m.inhash = 1
+ m.inhash = true
atomicstorep(unsafe.Pointer(&hash[h]), unsafe.Pointer(m))
}
@@ -152,7 +152,7 @@
// and thanks to the way global symbol resolution works, the
// pointed-to itab may already have been inserted into the
// global 'hash'.
- if i.inhash == 0 {
+ if !i.inhash {
additab(i, true, false)
}
}
@@ -160,11 +160,11 @@
unlock(&ifaceLock)
}
-// panicdottype is called when doing an i.(T) conversion and the conversion fails.
+// panicdottypeE is called when doing an e.(T) conversion and the conversion fails.
// have = the dynamic type we have.
// want = the static type we're trying to convert to.
// iface = the static type we're converting from.
-func panicdottype(have, want, iface *_type) {
+func panicdottypeE(have, want, iface *_type) {
haveString := ""
if have != nil {
haveString = have.string()
@@ -172,6 +172,16 @@
panic(&TypeAssertionError{iface.string(), haveString, want.string(), ""})
}
+// panicdottypeI is called when doing an i.(T) conversion and the conversion fails.
+// Same args as panicdottypeE, but "have" is the dynamic itab we have.
+func panicdottypeI(have *itab, want, iface *_type) {
+ var t *_type
+ if have != nil {
+ t = have._type
+ }
+ panicdottypeE(t, want, iface)
+}
+
// panicnildottype is called when doing a i.(T) conversion and the interface i is nil.
// want = the static type we're trying to convert to.
func panicnildottype(want *_type) {
@@ -195,19 +205,124 @@
if msanenabled {
msanread(elem, t.size)
}
- if isDirectIface(t) {
- // This case is implemented directly by the compiler.
- throw("direct convT2E")
- }
- x := newobject(t)
- // TODO: We allocate a zeroed object only to overwrite it with
- // actual data. Figure out how to avoid zeroing. Also below in convT2I.
+ x := mallocgc(t.size, t, true)
+ // TODO: We allocate a zeroed object only to overwrite it with actual data.
+ // Figure out how to avoid zeroing. Also below in convT2Eslice, convT2I, convT2Islice.
typedmemmove(t, x, elem)
e._type = t
e.data = x
return
}
+func convT2E16(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E16))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint16)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(2, t, false)
+ *(*uint16)(x) = *(*uint16)(elem)
+ }
+ e._type = t
+ e.data = x
+ return
+}
+
+func convT2E32(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E32))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint32)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(4, t, false)
+ *(*uint32)(x) = *(*uint32)(elem)
+ }
+ e._type = t
+ e.data = x
+ return
+}
+
+func convT2E64(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E64))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint64)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(8, t, false)
+ *(*uint64)(x) = *(*uint64)(elem)
+ }
+ e._type = t
+ e.data = x
+ return
+}
+
+func convT2Estring(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Estring))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*string)(elem) == "" {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(t.size, t, true)
+ *(*string)(x) = *(*string)(elem)
+ }
+ e._type = t
+ e.data = x
+ return
+}
+
+func convT2Eslice(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Eslice))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if v := *(*slice)(elem); uintptr(v.array) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(t.size, t, true)
+ *(*slice)(x) = *(*slice)(elem)
+ }
+ e._type = t
+ e.data = x
+ return
+}
+
+func convT2Enoptr(t *_type, elem unsafe.Pointer) (e eface) {
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2Enoptr))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ x := mallocgc(t.size, t, false)
+ memmove(x, elem, t.size)
+ e._type = t
+ e.data = x
+ return
+}
+
func convT2I(tab *itab, elem unsafe.Pointer) (i iface) {
t := tab._type
if raceenabled {
@@ -216,17 +331,128 @@
if msanenabled {
msanread(elem, t.size)
}
- if isDirectIface(t) {
- // This case is implemented directly by the compiler.
- throw("direct convT2I")
- }
- x := newobject(t)
+ x := mallocgc(t.size, t, true)
typedmemmove(t, x, elem)
i.tab = tab
i.data = x
return
}
+func convT2I16(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I16))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint16)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(2, t, false)
+ *(*uint16)(x) = *(*uint16)(elem)
+ }
+ i.tab = tab
+ i.data = x
+ return
+}
+
+func convT2I32(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I32))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint32)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(4, t, false)
+ *(*uint32)(x) = *(*uint32)(elem)
+ }
+ i.tab = tab
+ i.data = x
+ return
+}
+
+func convT2I64(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I64))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*uint64)(elem) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(8, t, false)
+ *(*uint64)(x) = *(*uint64)(elem)
+ }
+ i.tab = tab
+ i.data = x
+ return
+}
+
+func convT2Istring(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Istring))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if *(*string)(elem) == "" {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(t.size, t, true)
+ *(*string)(x) = *(*string)(elem)
+ }
+ i.tab = tab
+ i.data = x
+ return
+}
+
+func convT2Islice(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Islice))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ var x unsafe.Pointer
+ if v := *(*slice)(elem); uintptr(v.array) == 0 {
+ x = unsafe.Pointer(&zeroVal[0])
+ } else {
+ x = mallocgc(t.size, t, true)
+ *(*slice)(x) = *(*slice)(elem)
+ }
+ i.tab = tab
+ i.data = x
+ return
+}
+
+func convT2Inoptr(tab *itab, elem unsafe.Pointer) (i iface) {
+ t := tab._type
+ if raceenabled {
+ raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2Inoptr))
+ }
+ if msanenabled {
+ msanread(elem, t.size)
+ }
+ x := mallocgc(t.size, t, false)
+ memmove(x, elem, t.size)
+ i.tab = tab
+ i.data = x
+ return
+}
+
func convI2I(inter *interfacetype, i iface) (r iface) {
tab := i.tab
if tab == nil {
@@ -313,3 +539,39 @@
}
}
}
+
+// staticbytes is used to avoid convT2E for byte-sized values.
+var staticbytes = [...]byte{
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
+ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
+ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
+ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
+ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
+ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
+ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+}
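
The convT2E*/convT2I* specializations above skip the allocation when the boxed value is the zero value (the interface data pointer refers to the shared zeroVal buffer), and the compiler uses staticbytes so any byte-sized value can be boxed without allocating. A quick way to observe the effect on a toolchain that emits these helpers (sink and the expected counts are illustrative):

package main

import (
	"fmt"
	"testing"
)

var sink interface{}

func main() {
	var zero, one uint64 = 0, 1
	var b uint8 = 200
	// Expected with the fast paths above: 0 allocs for the zero uint64 and
	// for any single byte, 1 alloc for the nonzero uint64.
	fmt.Println("zero uint64:", testing.AllocsPerRun(1000, func() { sink = zero }))
	fmt.Println("one uint64: ", testing.AllocsPerRun(1000, func() { sink = one }))
	fmt.Println("uint8 200:  ", testing.AllocsPerRun(1000, func() { sink = b }))
}
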
diff --git a/src/runtime/iface_test.go b/src/runtime/iface_test.go
index 7f27baa..6d8f861 100644
--- a/src/runtime/iface_test.go
+++ b/src/runtime/iface_test.go
@@ -29,6 +29,20 @@
func (TL) Method1() {}
func (TL) Method2() {}
+type T8 uint8
+type T16 uint16
+type T32 uint32
+type T64 uint64
+type Tstr string
+type Tslice []byte
+
+func (T8) Method1() {}
+func (T16) Method1() {}
+func (T32) Method1() {}
+func (T64) Method1() {}
+func (Tstr) Method1() {}
+func (Tslice) Method1() {}
+
var (
e interface{}
e_ interface{}
@@ -261,3 +275,129 @@
t.Fatalf("want 0 allocs, got %v", n)
}
}
+
+func TestZeroConvT2x(t *testing.T) {
+ tests := []struct {
+ name string
+ fn func()
+ }{
+ {name: "E8", fn: func() { e = eight8 }}, // any byte-sized value does not allocate
+ {name: "E16", fn: func() { e = zero16 }}, // zero values do not allocate
+ {name: "E32", fn: func() { e = zero32 }},
+ {name: "E64", fn: func() { e = zero64 }},
+ {name: "Estr", fn: func() { e = zerostr }},
+ {name: "Eslice", fn: func() { e = zeroslice }},
+ {name: "Econstflt", fn: func() { e = 99.0 }}, // constants do not allocate
+ {name: "Econststr", fn: func() { e = "change" }},
+ {name: "I8", fn: func() { i1 = eight8I }},
+ {name: "I16", fn: func() { i1 = zero16I }},
+ {name: "I32", fn: func() { i1 = zero32I }},
+ {name: "I64", fn: func() { i1 = zero64I }},
+ {name: "Istr", fn: func() { i1 = zerostrI }},
+ {name: "Islice", fn: func() { i1 = zerosliceI }},
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ n := testing.AllocsPerRun(1000, test.fn)
+ if n != 0 {
+ t.Errorf("want zero allocs, got %v", n)
+ }
+ })
+ }
+}
+
+var (
+ eight8 uint8 = 8
+ eight8I T8 = 8
+
+ zero16 uint16 = 0
+ zero16I T16 = 0
+ one16 uint16 = 1
+
+ zero32 uint32 = 0
+ zero32I T32 = 0
+ one32 uint32 = 1
+
+ zero64 uint64 = 0
+ zero64I T64 = 0
+ one64 uint64 = 1
+
+ zerostr string = ""
+ zerostrI Tstr = ""
+ nzstr string = "abc"
+
+ zeroslice []byte = nil
+ zerosliceI Tslice = nil
+ nzslice []byte = []byte("abc")
+
+ zerobig [512]byte
+ nzbig [512]byte = [512]byte{511: 1}
+)
+
+func BenchmarkConvT2Ezero(b *testing.B) {
+ b.Run("zero", func(b *testing.B) {
+ b.Run("16", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero16
+ }
+ })
+ b.Run("32", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero32
+ }
+ })
+ b.Run("64", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zero64
+ }
+ })
+ b.Run("str", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zerostr
+ }
+ })
+ b.Run("slice", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zeroslice
+ }
+ })
+ b.Run("big", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = zerobig
+ }
+ })
+ })
+ b.Run("nonzero", func(b *testing.B) {
+ b.Run("16", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one16
+ }
+ })
+ b.Run("32", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one32
+ }
+ })
+ b.Run("64", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = one64
+ }
+ })
+ b.Run("str", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzstr
+ }
+ })
+ b.Run("slice", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzslice
+ }
+ })
+ b.Run("big", func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ e = nzbig
+ }
+ })
+ })
+}
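
These cases can be exercised directly once the runtime changes are in place, for example with "go test -run TestZeroConvT2x -bench BenchmarkConvT2Ezero -benchmem runtime": the TestZeroConvT2x subtests assert zero allocations for the zero-value and byte-sized conversions, and the zero 16/32/64/str/slice variants of BenchmarkConvT2Ezero should report 0 B/op, while the nonzero and big variants still allocate.
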
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index 80b178d..19d131e 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -6,12 +6,6 @@
#include "textflag.h"
-#define LL(base, rt) WORD $((060<<26)|((base)<<21)|((rt)<<16))
-#define LLV(base, rt) WORD $((064<<26)|((base)<<21)|((rt)<<16))
-#define SC(base, rt) WORD $((070<<26)|((base)<<21)|((rt)<<16))
-#define SCV(base, rt) WORD $((074<<26)|((base)<<21)|((rt)<<16))
-#define SYNC WORD $0xf
-
// bool cas(uint32 *ptr, uint32 old, uint32 new)
// Atomically:
// if(*val == old){
@@ -26,9 +20,9 @@
SYNC
cas_again:
MOVV R5, R3
- LL(1, 4) // R4 = *R1
+ LL (R1), R4
BNE R2, R4, cas_fail
- SC(1, 3) // *R1 = R3
+ SC R3, (R1)
BEQ R3, cas_again
MOVV $1, R1
MOVB R1, ret+16(FP)
@@ -53,9 +47,9 @@
SYNC
cas64_again:
MOVV R5, R3
- LLV(1, 4) // R4 = *R1
+ LLV (R1), R4
BNE R2, R4, cas64_fail
- SCV(1, 3) // *R1 = R3
+ SCV R3, (R1)
BEQ R3, cas64_again
MOVV $1, R1
MOVB R1, ret+24(FP)
@@ -104,10 +98,10 @@
MOVV ptr+0(FP), R2
MOVW delta+8(FP), R3
SYNC
- LL(2, 1) // R1 = *R2
+ LL (R2), R1
ADDU R1, R3, R4
MOVV R4, R1
- SC(2, 4) // *R2 = R4
+ SC R4, (R2)
BEQ R4, -4(PC)
MOVW R1, ret+16(FP)
SYNC
@@ -117,10 +111,10 @@
MOVV ptr+0(FP), R2
MOVV delta+8(FP), R3
SYNC
- LLV(2, 1) // R1 = *R2
+ LLV (R2), R1
ADDVU R1, R3, R4
MOVV R4, R1
- SCV(2, 4) // *R2 = R4
+ SCV R4, (R2)
BEQ R4, -4(PC)
MOVV R1, ret+16(FP)
SYNC
@@ -132,8 +126,8 @@
SYNC
MOVV R5, R3
- LL(2, 1) // R1 = *R2
- SC(2, 3) // *R2 = R3
+ LL (R2), R1
+ SC R3, (R2)
BEQ R3, -3(PC)
MOVW R1, ret+16(FP)
SYNC
@@ -145,8 +139,8 @@
SYNC
MOVV R5, R3
- LLV(2, 1) // R1 = *R2
- SCV(2, 3) // *R2 = R3
+ LLV (R2), R1
+ SCV R3, (R2)
BEQ R3, -3(PC)
MOVV R1, ret+16(FP)
SYNC
@@ -193,9 +187,9 @@
SLLV R4, R2
SYNC
- LL(3, 4) // R4 = *R3
+ LL (R3), R4
OR R2, R4
- SC(3, 4) // *R3 = R4
+ SC R4, (R3)
BEQ R4, -4(PC)
SYNC
RET
@@ -223,9 +217,9 @@
OR R5, R2
SYNC
- LL(3, 4) // R4 = *R3
+ LL (R3), R4
AND R2, R4
- SC(3, 4) // *R3 = R4
+ SC R4, (R3)
BEQ R4, -4(PC)
SYNC
RET
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index aa6067e..7117aef 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -165,32 +165,12 @@
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
-#ifdef GOARCH_ppc64
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- // R5 = (R3 << 0) & ~3
- RLDCR $0, R3, $~3, R5
- // Compute val shift.
- // Big endian. ptr = ptr ^ 3
- XOR $3, R3
- // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
- RLDC $3, R3, $(3*8), R6
- // Shift val for aligned ptr. R4 = val << R6
- SLD R6, R4, R4
- SYNC
-
-again:
- LWAR (R5), R6
- OR R4, R6
- STWCCC R6, (R5)
- BNE again
-#else
SYNC
again:
LBAR (R3), R6
OR R4, R6
STBCCC R6, (R3)
BNE again
-#endif
ISYNC
RET
@@ -198,34 +178,11 @@
TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
MOVBZ val+8(FP), R4
-#ifdef GOARCH_ppc64
- // Align ptr down to 4 bytes so we can use 32-bit load/store.
- // R5 = (R3 << 0) & ~3
- RLDCR $0, R3, $~3, R5
- // Compute val shift.
- // Big endian. ptr = ptr ^ 3
- XOR $3, R3
- // R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
- RLDC $3, R3, $(3*8), R6
- // Shift val for aligned ptr. R4 = val << R6 | ^(0xFF << R6)
- MOVD $0xFF, R7
- SLD R6, R4
- SLD R6, R7
- XOR $-1, R7
- OR R7, R4
- SYNC
-again:
- LWAR (R5), R6
- AND R4, R6
- STWCCC R6, (R5)
- BNE again
-#else
SYNC
again:
LBAR (R3),R6
AND R4,R6
STBCCC R6,(R3)
BNE again
-#endif
ISYNC
RET
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
index db2cbec..4e119b0 100644
--- a/src/runtime/internal/sys/intrinsics.go
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -32,22 +32,22 @@
// Ctz64 counts trailing (low-order) zeroes,
// and if all are zero, then 64.
-func Ctz64(x uint64) uint64 {
+func Ctz64(x uint64) int {
x &= -x // isolate low-order bit
y := x * deBruijn64 >> 58 // extract part of deBruijn sequence
- y = uint64(deBruijnIdx64[y]) // convert to bit index
- z := (x - 1) >> 57 & 64 // adjustment if zero
- return y + z
+ i := int(deBruijnIdx64[y]) // convert to bit index
+ z := int((x - 1) >> 57 & 64) // adjustment if zero
+ return i + z
}
// Ctz32 counts trailing (low-order) zeroes,
// and if all are zero, then 32.
-func Ctz32(x uint32) uint32 {
+func Ctz32(x uint32) int {
x &= -x // isolate low-order bit
y := x * deBruijn32 >> 27 // extract part of deBruijn sequence
- y = uint32(deBruijnIdx32[y]) // convert to bit index
- z := (x - 1) >> 26 & 32 // adjustment if zero
- return y + z
+ i := int(deBruijnIdx32[y]) // convert to bit index
+ z := int((x - 1) >> 26 & 32) // adjustment if zero
+ return i + z
}
// Bswap64 returns its input with byte order reversed
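
Ctz64 and Ctz32 now return int, which is what callers index and loop with. For reference, the de Bruijn multiplication trick they use, restated as a standalone sketch with the index table rebuilt at init and checked against math/bits (new in Go 1.9); ctz64 here is illustrative, not the runtime function:

package main

import (
	"fmt"
	"math/bits"
)

const deBruijn64 uint64 = 0x0218a392cd3d5dbf

var deBruijnIdx64 [64]byte

func init() {
	// For this de Bruijn constant, the top 6 bits of deBruijn64<<i are
	// distinct for each i, so they index back to the bit position.
	for i := 0; i < 64; i++ {
		deBruijnIdx64[deBruijn64<<uint(i)>>58] = byte(i)
	}
}

func ctz64(x uint64) int {
	x &= -x                                   // isolate the low-order set bit
	i := int(deBruijnIdx64[x*deBruijn64>>58]) // map the isolated bit to its index
	z := int((x - 1) >> 57 & 64)              // add 64 if x was zero
	return i + z
}

func main() {
	for _, x := range []uint64{0, 1, 8, 5 << 40} {
		fmt.Println(x, ctz64(x), bits.TrailingZeros64(x)) // the two counts agree
	}
}
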
diff --git a/src/runtime/internal/sys/intrinsics_386.s b/src/runtime/internal/sys/intrinsics_386.s
index bc63e5e..4bb4cd6 100644
--- a/src/runtime/internal/sys/intrinsics_386.s
+++ b/src/runtime/internal/sys/intrinsics_386.s
@@ -4,14 +4,12 @@
#include "textflag.h"
-TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-16
- MOVL $0, ret_hi+12(FP)
-
+TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-12
// Try low 32 bits.
MOVL x_lo+0(FP), AX
BSFL AX, AX
JZ tryhigh
- MOVL AX, ret_lo+8(FP)
+ MOVL AX, ret+8(FP)
RET
tryhigh:
@@ -20,12 +18,12 @@
BSFL AX, AX
JZ none
ADDL $32, AX
- MOVL AX, ret_lo+8(FP)
+ MOVL AX, ret+8(FP)
RET
none:
// No bits are set.
- MOVL $64, ret_lo+8(FP)
+ MOVL $64, ret+8(FP)
RET
TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
diff --git a/src/runtime/internal/sys/intrinsics_stubs.go b/src/runtime/internal/sys/intrinsics_stubs.go
index d351048..4d991f4 100644
--- a/src/runtime/internal/sys/intrinsics_stubs.go
+++ b/src/runtime/internal/sys/intrinsics_stubs.go
@@ -6,7 +6,7 @@
package sys
-func Ctz64(x uint64) uint64
-func Ctz32(x uint32) uint32
+func Ctz64(x uint64) int
+func Ctz32(x uint32) int
func Bswap64(x uint64) uint64
func Bswap32(x uint32) uint32
diff --git a/src/runtime/internal/sys/intrinsics_test.go b/src/runtime/internal/sys/intrinsics_test.go
index 1f2c8da..0444183 100644
--- a/src/runtime/internal/sys/intrinsics_test.go
+++ b/src/runtime/internal/sys/intrinsics_test.go
@@ -6,17 +6,17 @@
)
func TestCtz64(t *testing.T) {
- for i := uint(0); i <= 64; i++ {
- x := uint64(5) << i
- if got := sys.Ctz64(x); got != uint64(i) {
+ for i := 0; i <= 64; i++ {
+ x := uint64(5) << uint(i)
+ if got := sys.Ctz64(x); got != i {
t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
}
}
}
func TestCtz32(t *testing.T) {
- for i := uint(0); i <= 32; i++ {
- x := uint32(5) << i
- if got := sys.Ctz32(x); got != uint32(i) {
+ for i := 0; i <= 32; i++ {
+ x := uint32(5) << uint(i)
+ if got := sys.Ctz32(x); got != i {
t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
}
}
diff --git a/src/runtime/internal/sys/zversion.go b/src/runtime/internal/sys/zversion.go
index 449729c..995a192 100644
--- a/src/runtime/internal/sys/zversion.go
+++ b/src/runtime/internal/sys/zversion.go
@@ -3,7 +3,7 @@
package sys
const DefaultGoroot = `./prebuilts/go/linux-x86`
-const TheVersion = `go1.8`
+const TheVersion = `go1.9rc1`
const Goexperiment = ``
const StackGuardMultiplier = 1
diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go
index 8e33ce1..4787c5b 100644
--- a/src/runtime/lfstack.go
+++ b/src/runtime/lfstack.go
@@ -3,10 +3,6 @@
// license that can be found in the LICENSE file.
// Lock-free stack.
-// Initialize head to 0, compare with 0 to test for emptiness.
-// The stack does not keep pointers to nodes,
-// so they can be garbage collected if there are no other pointers to nodes.
-// The following code runs only in non-preemptible contexts.
package runtime
@@ -15,32 +11,47 @@
"unsafe"
)
-func lfstackpush(head *uint64, node *lfnode) {
+// lfstack is the head of a lock-free stack.
+//
+// The zero value of lfstack is an empty list.
+//
+// This stack is intrusive. Nodes must embed lfnode as the first field.
+//
+// The stack does not keep GC-visible pointers to nodes, so the caller
+// is responsible for ensuring the nodes are not garbage collected
+// (typically by allocating them from manually-managed memory).
+type lfstack uint64
+
+func (head *lfstack) push(node *lfnode) {
node.pushcnt++
new := lfstackPack(node, node.pushcnt)
if node1 := lfstackUnpack(new); node1 != node {
- print("runtime: lfstackpush invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
- throw("lfstackpush")
+ print("runtime: lfstack.push invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
+ throw("lfstack.push")
}
for {
- old := atomic.Load64(head)
+ old := atomic.Load64((*uint64)(head))
node.next = old
- if atomic.Cas64(head, old, new) {
+ if atomic.Cas64((*uint64)(head), old, new) {
break
}
}
}
-func lfstackpop(head *uint64) unsafe.Pointer {
+func (head *lfstack) pop() unsafe.Pointer {
for {
- old := atomic.Load64(head)
+ old := atomic.Load64((*uint64)(head))
if old == 0 {
return nil
}
node := lfstackUnpack(old)
next := atomic.Load64(&node.next)
- if atomic.Cas64(head, old, next) {
+ if atomic.Cas64((*uint64)(head), old, next) {
return unsafe.Pointer(node)
}
}
}
+
+func (head *lfstack) empty() bool {
+ return atomic.Load64((*uint64)(head)) == 0
+}
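
lfstack is an intrusive Treiber stack: the head is a single uint64 that packs a node pointer together with a push counter to defeat ABA, and the nodes must live in manually-managed memory so the stack's non-GC-visible references stay valid. The push/pop/empty shape, reduced to ordinary Go without the packing (node and stack are illustrative types, not the runtime's):

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type node struct {
	next *node
	val  int
}

type stack struct {
	head unsafe.Pointer // *node
}

func (s *stack) push(n *node) {
	for {
		old := atomic.LoadPointer(&s.head)
		n.next = (*node)(old)
		if atomic.CompareAndSwapPointer(&s.head, old, unsafe.Pointer(n)) {
			return
		}
	}
}

func (s *stack) pop() *node {
	for {
		old := atomic.LoadPointer(&s.head)
		if old == nil {
			return nil
		}
		n := (*node)(old)
		if atomic.CompareAndSwapPointer(&s.head, old, unsafe.Pointer(n.next)) {
			return n
		}
	}
}

func (s *stack) empty() bool { return atomic.LoadPointer(&s.head) == nil }

func main() {
	var s stack
	s.push(&node{val: 1})
	s.push(&node{val: 2})
	fmt.Println(s.pop().val, s.pop().val, s.empty()) // 2 1 true
}
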
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 073136a..9d55bd1 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -38,6 +38,7 @@
// affect mutex's state.
// We use the uintptr mutex.key and note.key as a uint32.
+//go:nosplit
func key32(p *uintptr) *uint32 {
return (*uint32)(unsafe.Pointer(p))
}
@@ -140,9 +141,17 @@
if gp != gp.m.g0 {
throw("notesleep not on g0")
}
+ ns := int64(-1)
+ if *cgo_yield != nil {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ ns = 10e6
+ }
for atomic.Load(key32(&n.key)) == 0 {
gp.m.blocked = true
- futexsleep(key32(&n.key), 0, -1)
+ futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
}
}
@@ -156,9 +165,16 @@
gp := getg()
if ns < 0 {
+ if *cgo_yield != nil {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ ns = 10e6
+ }
for atomic.Load(key32(&n.key)) == 0 {
gp.m.blocked = true
- futexsleep(key32(&n.key), 0, -1)
+ futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
}
return true
@@ -170,8 +186,14 @@
deadline := nanotime() + ns
for {
+ if *cgo_yield != nil && ns > 10e6 {
+ ns = 10e6
+ }
gp.m.blocked = true
futexsleep(key32(&n.key), 0, ns)
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
if atomic.Load(key32(&n.key)) != 0 {
break
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index 0fa0481..5b0169d 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -163,7 +163,16 @@
}
// Queued. Sleep.
gp.m.blocked = true
- semasleep(-1)
+ if *cgo_yield == nil {
+ semasleep(-1)
+ } else {
+ // Sleep for an arbitrary-but-moderate interval to poll libc interceptors.
+ const ns = 10e6
+ for atomic.Loaduintptr(&n.key) == 0 {
+ semasleep(ns)
+ asmcgocall(*cgo_yield, nil)
+ }
+ }
gp.m.blocked = false
}
@@ -186,7 +195,15 @@
if ns < 0 {
// Queued. Sleep.
gp.m.blocked = true
- semasleep(-1)
+ if *cgo_yield == nil {
+ semasleep(-1)
+ } else {
+ // Sleep in arbitrary-but-moderate intervals to poll libc interceptors.
+ const ns = 10e6
+ for semasleep(ns) < 0 {
+ asmcgocall(*cgo_yield, nil)
+ }
+ }
gp.m.blocked = false
return true
}
@@ -195,12 +212,18 @@
for {
// Registered. Sleep.
gp.m.blocked = true
+ if *cgo_yield != nil && ns > 10e6 {
+ ns = 10e6
+ }
if semasleep(ns) >= 0 {
gp.m.blocked = false
// Acquired semaphore, semawakeup unregistered us.
// Done.
return true
}
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
gp.m.blocked = false
// Interrupted or timed out. Still registered. Semaphore not acquired.
ns = deadline - nanotime()
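
Both lock_futex.go and lock_sema.go now apply the same pattern: when the _cgo_yield hook is installed (non-nil, e.g. when libc interceptors such as TSAN's need servicing), each note sleep is bounded to roughly 10ms and the hook is called between sleeps instead of blocking indefinitely. The control flow, sketched with ordinary Go primitives (sleepWithYield, done and the literal interval are illustrative, not runtime APIs):

package main

import (
	"fmt"
	"time"
)

// sleepWithYield sleeps in moderate slices and invokes the yield hook between
// slices, so a periodic task gets a chance to run even while we are "blocked".
func sleepWithYield(done <-chan struct{}, yield func()) {
	const slice = 10 * time.Millisecond
	for {
		select {
		case <-done:
			return
		case <-time.After(slice):
			if yield != nil {
				yield()
			}
		}
	}
}

func main() {
	done := make(chan struct{})
	go func() {
		time.Sleep(35 * time.Millisecond)
		close(done)
	}()
	sleepWithYield(done, func() { fmt.Println("yield poll") })
}
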
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index da39dac..8850659 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -111,7 +111,7 @@
// Tiny allocator parameters, see "Tiny allocator" comment in malloc.go.
_TinySize = 16
- _TinySizeClass = 2
+ _TinySizeClass = int8(2)
_FixAllocChunk = 16 << 10 // Chunk size for FixAlloc
_MaxMHeapList = 1 << (20 - _PageShift) // Maximum page length for fixed-size list in MHeap.
@@ -148,7 +148,11 @@
_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*(32-(sys.GoarchMips+sys.GoarchMipsle))
_MHeapMap_Bits = _MHeapMap_TotalBits - _PageShift
- _MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
+ // _MaxMem is the maximum heap arena size minus 1.
+ //
+ // On 32-bit, this is also the maximum heap pointer value,
+ // since the arena starts at address 0.
+ _MaxMem = 1<<_MHeapMap_TotalBits - 1
// Max number of threads to run garbage collection.
// 2, 3, and 4 are all plausible maximums depending
@@ -156,8 +160,6 @@
// collector scales well to 32 cpus.
_MaxGcproc = 32
- _MaxArena32 = 1<<32 - 1
-
// minLegalPointer is the smallest possible legal pointer.
// This is the smallest possible architectural page size,
// since we assume that the first page is never mapped.
@@ -238,18 +240,21 @@
throw("bad system page size")
}
- var p, bitmapSize, spansSize, pSize, limit uintptr
+ // The auxiliary regions start at p and are laid out in the
+ // following order: spans, bitmap, arena.
+ var p, pSize uintptr
var reserved bool
- // limit = runtime.memlimit();
- // See https://golang.org/issue/5049
- // TODO(rsc): Fix after 1.1.
- limit = 0
+ // The spans array holds one *mspan per _PageSize of arena.
+ var spansSize uintptr = (_MaxMem + 1) / _PageSize * sys.PtrSize
+ spansSize = round(spansSize, _PageSize)
+ // The bitmap holds 2 bits per word of arena.
+ var bitmapSize uintptr = (_MaxMem + 1) / (sys.PtrSize * 8 / 2)
+ bitmapSize = round(bitmapSize, _PageSize)
// Set up the allocation arena, a contiguous area of memory where
- // allocated data will be found. The arena begins with a bitmap large
- // enough to hold 2 bits per allocated word.
- if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
+ // allocated data will be found.
+ if sys.PtrSize == 8 {
// On a 64-bit machine, allocate from a single contiguous reservation.
// 512 GB (MaxMem) should be big enough for now.
//
@@ -280,9 +285,7 @@
// translation buffers, the user address space is limited to 39 bits
// On darwin/arm64, the address space is even smaller.
arenaSize := round(_MaxMem, _PageSize)
- bitmapSize = arenaSize / (sys.PtrSize * 8 / 2)
- spansSize = arenaSize / _PageSize * sys.PtrSize
- spansSize = round(spansSize, _PageSize)
+ pSize = bitmapSize + spansSize + arenaSize + _PageSize
for i := 0; i <= 0x7f; i++ {
switch {
case GOARCH == "arm64" && GOOS == "darwin":
@@ -292,7 +295,6 @@
default:
p = uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
}
- pSize = bitmapSize + spansSize + arenaSize + _PageSize
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
@@ -310,6 +312,15 @@
// When that gets used up, we'll start asking the kernel
// for any memory anywhere.
+ // We want to start the arena low, but if we're linked
+ // against C code, it's possible global constructors
+ // have called malloc and adjusted the process' brk.
+ // Query the brk so we can avoid trying to map the
+ // arena over it (which will cause the kernel to put
+ // the arena somewhere else, likely at a high
+ // address).
+ procBrk := sbrk0()
+
// If we fail to allocate, try again with a smaller arena.
// This is necessary on Android L where we share a process
// with ART, which reserves virtual memory aggressively.
@@ -323,15 +334,6 @@
}
for _, arenaSize := range arenaSizes {
- bitmapSize = (_MaxArena32 + 1) / (sys.PtrSize * 8 / 2)
- spansSize = (_MaxArena32 + 1) / _PageSize * sys.PtrSize
- if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
- bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
- arenaSize = bitmapSize * 8
- spansSize = arenaSize / _PageSize * sys.PtrSize
- }
- spansSize = round(spansSize, _PageSize)
-
// SysReserve treats the address we ask for, end, as a hint,
// not as an absolute requirement. If we ask for the end
// of the data segment but the operating system requires
@@ -343,6 +345,12 @@
// to a MB boundary.
p = round(firstmoduledata.end+(1<<18), 1<<20)
pSize = bitmapSize + spansSize + arenaSize + _PageSize
+ if p <= procBrk && procBrk < p+pSize {
+ // Move the start above the brk,
+ // leaving some room for future brk
+ // expansion.
+ p = round(procBrk+(1<<20), 1<<20)
+ }
p = uintptr(sysReserve(unsafe.Pointer(p), pSize, &reserved))
if p != 0 {
break
@@ -357,18 +365,22 @@
// so SysReserve can give us a PageSize-unaligned pointer.
// To overcome this we ask for PageSize more and round up the pointer.
p1 := round(p, _PageSize)
+ pSize -= p1 - p
spansStart := p1
- mheap_.bitmap = p1 + spansSize + bitmapSize
+ p1 += spansSize
+ mheap_.bitmap = p1 + bitmapSize
+ p1 += bitmapSize
if sys.PtrSize == 4 {
// Set arena_start such that we can accept memory
// reservations located anywhere in the 4GB virtual space.
mheap_.arena_start = 0
} else {
- mheap_.arena_start = p1 + (spansSize + bitmapSize)
+ mheap_.arena_start = p1
}
mheap_.arena_end = p + pSize
- mheap_.arena_used = p1 + (spansSize + bitmapSize)
+ mheap_.arena_used = p1
+ mheap_.arena_alloc = p1
mheap_.arena_reserved = reserved
if mheap_.arena_start&(_PageSize-1) != 0 {
@@ -387,12 +399,18 @@
// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
// There is no corresponding free function.
func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
- if n > h.arena_end-h.arena_used {
- // We are in 32-bit mode, maybe we didn't use all possible address space yet.
- // Reserve some more space.
+ // strandLimit is the maximum number of bytes to strand from
+ // the current arena block. If we would need to strand more
+ // than this, we fall back to sysAlloc'ing just enough for
+ // this allocation.
+ const strandLimit = 16 << 20
+
+ if n > h.arena_end-h.arena_alloc {
+ // If we haven't grown the arena to _MaxMem yet, try
+ // to reserve some more address space.
p_size := round(n+_PageSize, 256<<20)
new_end := h.arena_end + p_size // Careful: can overflow
- if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxArena32 {
+ if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxMem {
// TODO: It would be bad if part of the arena
// is reserved and part is not.
var reserved bool
@@ -400,37 +418,56 @@
if p == 0 {
return nil
}
+ // p can be just about anywhere in the address
+ // space, including before arena_end.
if p == h.arena_end {
+ // The new block is contiguous with
+ // the current block. Extend the
+ // current arena block.
h.arena_end = new_end
h.arena_reserved = reserved
- } else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxArena32 {
+ } else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxMem && h.arena_end-h.arena_alloc < strandLimit {
+ // We were able to reserve more memory
+ // within the arena space, but it's
+ // not contiguous with our previous
+ // reservation. It could be before or
+ // after our current arena_used.
+ //
// Keep everything page-aligned.
// Our pages are bigger than hardware pages.
h.arena_end = p + p_size
- used := p + (-p & (_PageSize - 1))
- h.mapBits(used)
- h.mapSpans(used)
- h.arena_used = used
+ p = round(p, _PageSize)
+ h.arena_alloc = p
h.arena_reserved = reserved
} else {
+ // We got a mapping, but either
+ //
+ // 1) It's not in the arena, so we
+ // can't use it. (This should never
+ // happen on 32-bit.)
+ //
+ // 2) We would need to discard too
+ // much of our current arena block to
+ // use it.
+ //
// We haven't added this allocation to
// the stats, so subtract it from a
// fake stat (but avoid underflow).
+ //
+ // We'll fall back to a small sysAlloc.
stat := uint64(p_size)
sysFree(unsafe.Pointer(p), p_size, &stat)
}
}
}
- if n <= h.arena_end-h.arena_used {
+ if n <= h.arena_end-h.arena_alloc {
// Keep taking from our reservation.
- p := h.arena_used
+ p := h.arena_alloc
sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
- h.mapBits(p + n)
- h.mapSpans(p + n)
- h.arena_used = p + n
- if raceenabled {
- racemapshadow(unsafe.Pointer(p), n)
+ h.arena_alloc += n
+ if h.arena_alloc > h.arena_used {
+ h.setArenaUsed(h.arena_alloc, true)
}
if p&(_PageSize-1) != 0 {
@@ -440,7 +477,7 @@
}
// If using 64-bit, our reservation is all we have.
- if h.arena_end-h.arena_start > _MaxArena32 {
+ if sys.PtrSize != 4 {
return nil
}
@@ -452,28 +489,18 @@
return nil
}
- if p < h.arena_start || p+p_size-h.arena_start > _MaxArena32 {
- top := ^uintptr(0)
- if top-h.arena_start-1 > _MaxArena32 {
- top = h.arena_start + _MaxArena32 + 1
- }
+ if p < h.arena_start || p+p_size-h.arena_start > _MaxMem {
+ // This shouldn't be possible because _MaxMem is the
+ // whole address space on 32-bit.
+ top := uint64(h.arena_start) + _MaxMem
print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
return nil
}
- p_end := p + p_size
p += -p & (_PageSize - 1)
if p+n > h.arena_used {
- h.mapBits(p + n)
- h.mapSpans(p + n)
- h.arena_used = p + n
- if p_end > h.arena_end {
- h.arena_end = p_end
- }
- if raceenabled {
- racemapshadow(unsafe.Pointer(p), n)
- }
+ h.setArenaUsed(p+n, true)
}
if p&(_PageSize-1) != 0 {
@@ -496,7 +523,7 @@
if freeidx%64 == 0 && freeidx != s.nelems {
return 0
}
- s.allocCache >>= (theBit + 1)
+ s.allocCache >>= uint(theBit + 1)
s.freeindex = freeidx
v := gclinkptr(result*s.elemsize + s.base())
s.allocCount++
@@ -512,8 +539,8 @@
// weight allocation. If it is a heavy weight allocation the caller must
// determine whether a new GC cycle needs to be started or if the GC is active
// whether this goroutine needs to assist the GC.
-func (c *mcache) nextFree(sizeclass uint8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
- s = c.alloc[sizeclass]
+func (c *mcache) nextFree(spc spanClass) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+ s = c.alloc[spc]
shouldhelpgc = false
freeIndex := s.nextFreeIndex()
if freeIndex == s.nelems {
@@ -523,10 +550,10 @@
throw("s.allocCount != s.nelems && freeIndex == s.nelems")
}
systemstack(func() {
- c.refill(int32(sizeclass))
+ c.refill(spc)
})
shouldhelpgc = true
- s = c.alloc[sizeclass]
+ s = c.alloc[spc]
freeIndex = s.nextFreeIndex()
}
@@ -650,10 +677,10 @@
return x
}
// Allocate a new maxTinySize block.
- span := c.alloc[tinySizeClass]
+ span := c.alloc[tinySpanClass]
v := nextFreeFast(span)
if v == 0 {
- v, _, shouldhelpgc = c.nextFree(tinySizeClass)
+ v, _, shouldhelpgc = c.nextFree(tinySpanClass)
}
x = unsafe.Pointer(v)
(*[2]uint64)(x)[0] = 0
@@ -673,10 +700,11 @@
sizeclass = size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv]
}
size = uintptr(class_to_size[sizeclass])
- span := c.alloc[sizeclass]
+ spc := makeSpanClass(sizeclass, noscan)
+ span := c.alloc[spc]
v := nextFreeFast(span)
if v == 0 {
- v, span, shouldhelpgc = c.nextFree(sizeclass)
+ v, span, shouldhelpgc = c.nextFree(spc)
}
x = unsafe.Pointer(v)
if needzero && span.needzero != 0 {
@@ -687,7 +715,7 @@
var s *mspan
shouldhelpgc = true
systemstack(func() {
- s = largeAlloc(size, needzero)
+ s = largeAlloc(size, needzero, noscan)
})
s.freeindex = 1
s.allocCount = 1
@@ -696,9 +724,7 @@
}
var scanSize uintptr
- if noscan {
- heapBitsSetTypeNoScan(uintptr(x))
- } else {
+ if !noscan {
// If allocating a defer+arg block, now that we've picked a malloc size
// large enough to hold everything, cut the "asked for" size down to
// just the defer header, so that the GC bitmap will record the arg block
@@ -769,14 +795,16 @@
assistG.gcAssistBytes -= int64(size - dataSize)
}
- if shouldhelpgc && gcShouldStart(false) {
- gcStart(gcBackgroundMode, false)
+ if shouldhelpgc {
+ if t := (gcTrigger{kind: gcTriggerHeap}); t.test() {
+ gcStart(gcBackgroundMode, t)
+ }
}
return x
}
-func largeAlloc(size uintptr, needzero bool) *mspan {
+func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
// print("largeAlloc size=", size, "\n")
if size+_PageSize < size {
@@ -792,7 +820,7 @@
// pays the debt down to npage pages.
deductSweepCredit(npages*_PageSize, npages)
- s := mheap_.alloc(npages, 0, true, needzero)
+ s := mheap_.alloc(npages, makeSpanClass(0, noscan), true, needzero)
if s == nil {
throw("out of memory")
}
@@ -882,7 +910,7 @@
rate = 0x3fffffff
}
if rate != 0 {
- return int32(int(fastrand()) % (2 * rate))
+ return int32(fastrand() % uint32(2*rate))
}
return 0
}
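
mallocgc and largeAlloc now key the per-P caches and the central lists by a spanClass instead of a bare size class, folding the noscan bit into the index so noscan objects land in spans whose bitmap is never maintained. The packing is small enough to restate as a sketch; makeSpanClass/sizeclass/noscan below follow the usage above and are assumed to match the definitions that live with the mheap changes:

package main

import "fmt"

// spanClass packs the size class into the upper bits and a noscan flag into
// bit 0, so c.alloc[spc] and mheap.central[spc] index on one small integer.
type spanClass uint8

func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
	sc := spanClass(sizeclass) << 1
	if noscan {
		sc |= 1
	}
	return sc
}

func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
func (sc spanClass) noscan() bool    { return sc&1 != 0 }

func main() {
	spc := makeSpanClass(5, true)
	fmt.Println(uint8(spc), spc.sizeclass(), spc.noscan()) // 11 5 true
}
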
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 0cf9cfb..d9487ee 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -6,6 +6,8 @@
import (
"flag"
+ "fmt"
+ "reflect"
. "runtime"
"testing"
"time"
@@ -20,24 +22,62 @@
st := new(MemStats)
ReadMemStats(st)
- // Everything except HeapReleased, HeapIdle, and NumGC,
- // because they indeed can be 0.
- if st.Alloc == 0 || st.TotalAlloc == 0 || st.Sys == 0 || st.Lookups == 0 ||
- st.Mallocs == 0 || st.Frees == 0 || st.HeapAlloc == 0 || st.HeapSys == 0 ||
- st.HeapInuse == 0 || st.HeapObjects == 0 || st.StackInuse == 0 ||
- st.StackSys == 0 || st.MSpanInuse == 0 || st.MSpanSys == 0 || st.MCacheInuse == 0 ||
- st.MCacheSys == 0 || st.BuckHashSys == 0 || st.GCSys == 0 || st.OtherSys == 0 ||
- st.NextGC == 0 || st.NumForcedGC == 0 {
- t.Fatalf("Zero value: %+v", *st)
+ nz := func(x interface{}) error {
+ if x != reflect.Zero(reflect.TypeOf(x)).Interface() {
+ return nil
+ }
+ return fmt.Errorf("zero value")
+ }
+ le := func(thresh float64) func(interface{}) error {
+ return func(x interface{}) error {
+ if reflect.ValueOf(x).Convert(reflect.TypeOf(thresh)).Float() < thresh {
+ return nil
+ }
+ return fmt.Errorf("insanely high value (overflow?); want <= %v", thresh)
+ }
+ }
+ eq := func(x interface{}) func(interface{}) error {
+ return func(y interface{}) error {
+ if x == y {
+ return nil
+ }
+ return fmt.Errorf("want %v", x)
+ }
+ }
+ // Of the uint fields, HeapReleased, HeapIdle can be 0.
+ // PauseTotalNs can be 0 if timer resolution is poor.
+ //
+ // TODO: Test that GCCPUFraction is <= 0.99. This currently
+ // fails on windows/386. (Issue #19319)
+ fields := map[string][]func(interface{}) error{
+ "Alloc": {nz, le(1e10)}, "TotalAlloc": {nz, le(1e11)}, "Sys": {nz, le(1e10)},
+ "Lookups": {nz, le(1e10)}, "Mallocs": {nz, le(1e10)}, "Frees": {nz, le(1e10)},
+ "HeapAlloc": {nz, le(1e10)}, "HeapSys": {nz, le(1e10)}, "HeapIdle": {le(1e10)},
+ "HeapInuse": {nz, le(1e10)}, "HeapReleased": {le(1e10)}, "HeapObjects": {nz, le(1e10)},
+ "StackInuse": {nz, le(1e10)}, "StackSys": {nz, le(1e10)},
+ "MSpanInuse": {nz, le(1e10)}, "MSpanSys": {nz, le(1e10)},
+ "MCacheInuse": {nz, le(1e10)}, "MCacheSys": {nz, le(1e10)},
+ "BuckHashSys": {nz, le(1e10)}, "GCSys": {nz, le(1e10)}, "OtherSys": {nz, le(1e10)},
+ "NextGC": {nz, le(1e10)}, "LastGC": {nz},
+ "PauseTotalNs": {le(1e11)}, "PauseNs": nil, "PauseEnd": nil,
+ "NumGC": {nz, le(1e9)}, "NumForcedGC": {nz, le(1e9)},
+ "GCCPUFraction": nil, "EnableGC": {eq(true)}, "DebugGC": {eq(false)},
+ "BySize": nil,
}
- if st.Alloc > 1e10 || st.TotalAlloc > 1e11 || st.Sys > 1e10 || st.Lookups > 1e10 ||
- st.Mallocs > 1e10 || st.Frees > 1e10 || st.HeapAlloc > 1e10 || st.HeapSys > 1e10 ||
- st.HeapIdle > 1e10 || st.HeapInuse > 1e10 || st.HeapObjects > 1e10 || st.StackInuse > 1e10 ||
- st.StackSys > 1e10 || st.MSpanInuse > 1e10 || st.MSpanSys > 1e10 || st.MCacheInuse > 1e10 ||
- st.MCacheSys > 1e10 || st.BuckHashSys > 1e10 || st.GCSys > 1e10 || st.OtherSys > 1e10 ||
- st.NextGC > 1e10 || st.NumGC > 1e9 || st.NumForcedGC > 1e9 || st.PauseTotalNs > 1e11 {
- t.Fatalf("Insanely high value (overflow?): %+v", *st)
+ rst := reflect.ValueOf(st).Elem()
+ for i := 0; i < rst.Type().NumField(); i++ {
+ name, val := rst.Type().Field(i).Name, rst.Field(i).Interface()
+ checks, ok := fields[name]
+ if !ok {
+ t.Errorf("unknown MemStats field %s", name)
+ continue
+ }
+ for _, check := range checks {
+ if err := check(val); err != nil {
+ t.Errorf("%s = %v: %s", name, val, err)
+ }
+ }
}
if st.Sys != st.HeapSys+st.StackSys+st.MSpanSys+st.MCacheSys+
diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go
index aacd091..81f05a0 100644
--- a/src/runtime/map_test.go
+++ b/src/runtime/map_test.go
@@ -10,6 +10,7 @@
"reflect"
"runtime"
"sort"
+ "strconv"
"strings"
"sync"
"testing"
@@ -587,6 +588,14 @@
}
}
+// Test that making a map with a large or invalid hint
+// doesn't panic. (Issue 19926).
+func TestIgnoreBogusMapHint(t *testing.T) {
+ for _, hint := range []int64{-1, 1 << 62} {
+ _ = make(map[int]int, hint)
+ }
+}
+
func benchmarkMapPop(b *testing.B, n int) {
m := map[int]int{}
for i := 0; i < b.N; i++ {
@@ -617,3 +626,86 @@
t.Fatalf("want 0 allocs, got %v", n)
}
}
+
+func benchmarkMapAssignInt32(b *testing.B, n int) {
+ a := make(map[int32]int)
+ for i := 0; i < b.N; i++ {
+ a[int32(i&(n-1))] = i
+ }
+}
+
+func benchmarkMapDeleteInt32(b *testing.B, n int) {
+ a := make(map[int32]int)
+ for i := 0; i < n*b.N; i++ {
+ a[int32(i)] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, int32(i))
+ }
+}
+
+func benchmarkMapAssignInt64(b *testing.B, n int) {
+ a := make(map[int64]int)
+ for i := 0; i < b.N; i++ {
+ a[int64(i&(n-1))] = i
+ }
+}
+
+func benchmarkMapDeleteInt64(b *testing.B, n int) {
+ a := make(map[int64]int)
+ for i := 0; i < n*b.N; i++ {
+ a[int64(i)] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, int64(i))
+ }
+}
+
+func benchmarkMapAssignStr(b *testing.B, n int) {
+ k := make([]string, n)
+ for i := 0; i < len(k); i++ {
+ k[i] = strconv.Itoa(i)
+ }
+ b.ResetTimer()
+ a := make(map[string]int)
+ for i := 0; i < b.N; i++ {
+ a[k[i&(n-1)]] = i
+ }
+}
+
+func benchmarkMapDeleteStr(b *testing.B, n int) {
+ k := make([]string, n*b.N)
+ for i := 0; i < n*b.N; i++ {
+ k[i] = strconv.Itoa(i)
+ }
+ a := make(map[string]int)
+ for i := 0; i < n*b.N; i++ {
+ a[k[i]] = i
+ }
+ b.ResetTimer()
+ for i := 0; i < n*b.N; i = i + n {
+ delete(a, k[i])
+ }
+}
+
+func runWith(f func(*testing.B, int), v ...int) func(*testing.B) {
+ return func(b *testing.B) {
+ for _, n := range v {
+ b.Run(strconv.Itoa(n), func(b *testing.B) { f(b, n) })
+ }
+ }
+}
+
+func BenchmarkMapAssign(b *testing.B) {
+ b.Run("Int32", runWith(benchmarkMapAssignInt32, 1<<8, 1<<16))
+ b.Run("Int64", runWith(benchmarkMapAssignInt64, 1<<8, 1<<16))
+ b.Run("Str", runWith(benchmarkMapAssignStr, 1<<8, 1<<16))
+}
+
+func BenchmarkMapDelete(b *testing.B) {
+ b.Run("Int32", runWith(benchmarkMapDeleteInt32, 1, 2, 4))
+ b.Run("Int64", runWith(benchmarkMapDeleteInt64, 1, 2, 4))
+ b.Run("Str", runWith(benchmarkMapDeleteStr, 1, 2, 4))
+}
diff --git a/src/runtime/mapspeed_test.go b/src/runtime/mapspeed_test.go
index ac93119..aec0c51 100644
--- a/src/runtime/mapspeed_test.go
+++ b/src/runtime/mapspeed_test.go
@@ -5,6 +5,7 @@
import (
"fmt"
+ "strconv"
"strings"
"testing"
)
@@ -308,6 +309,20 @@
}
}
+func BenchmarkMapPopulate(b *testing.B) {
+ for size := 1; size < 1000000; size *= 10 {
+ b.Run(strconv.Itoa(size), func(b *testing.B) {
+ b.ReportAllocs()
+ for i := 0; i < b.N; i++ {
+ m := make(map[int]bool)
+ for j := 0; j < size; j++ {
+ m[j] = true
+ }
+ }
+ })
+ }
+}
+
type ComplexAlgKey struct {
a, b, c int64
_ int
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index 5848b43..3713c50 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -149,6 +149,11 @@
// combine the read and the write. Checking inheap is
// insufficient since we need to track changes to
// roots outside the heap.
+ //
+ // Note: profbuf.go omits a barrier during signal handler
+ // profile logging; that's safe only because this deletion barrier exists.
+ // If we remove the deletion barrier, we'll have to work out
+ // a new way to handle the profile logging.
if slot1 := uintptr(unsafe.Pointer(slot)); slot1 >= minPhysPageSize {
if optr := *slot; optr != 0 {
shade(optr)
@@ -231,6 +236,7 @@
}
// typedmemmove copies a value of type t to dst from src.
+// Must be nosplit, see #16026.
//go:nosplit
func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
if typ.kind&kindNoPointers == 0 {
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 89d8a4c..2a9f1b8 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -45,6 +45,11 @@
// not checkmarked, and is the dead encoding.
// These properties must be preserved when modifying the encoding.
//
+// The bitmap for noscan spans is not maintained. Code must ensure
+// that an object is scannable before consulting its bitmap by
+// checking either the noscan bit in the span or by consulting its
+// type's information.
+//
// Checkmarks
//
// In a concurrent garbage collector, one worries about failing to mark
@@ -134,13 +139,9 @@
return (*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(p)) - 1))
}
-// mHeap_MapBits is called each time arena_used is extended.
-// It maps any additional bitmap memory needed for the new arena memory.
-// It must be called with the expected new value of arena_used,
-// *before* h.arena_used has been updated.
-// Waiting to update arena_used until after the memory has been mapped
-// avoids faults when other threads try access the bitmap immediately
-// after observing the change to arena_used.
+// mapBits maps any additional bitmap memory needed for the new arena memory.
+//
+// Don't call this directly. Call mheap.setArenaUsed.
//
//go:nowritebarrier
func (h *mheap) mapBits(arena_used uintptr) {
@@ -186,10 +187,8 @@
//go:nosplit
func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
- whichByte := allocBitIndex / 8
- whichBit := allocBitIndex % 8
- bytePtr := addb(s.allocBits, whichByte)
- return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
+ bytep, mask := s.allocBits.bitp(allocBitIndex)
+ return markBits{bytep, mask, allocBitIndex}
}
// refillaCache takes 8 bytes s.allocBits starting at whichByte
@@ -197,7 +196,7 @@
// can be used. It then places these 8 bytes into the cached 64 bit
// s.allocCache.
func (s *mspan) refillAllocCache(whichByte uintptr) {
- bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
+ bytes := (*[8]uint8)(unsafe.Pointer(s.allocBits.bytep(whichByte)))
aCache := uint64(0)
aCache |= uint64(bytes[0])
aCache |= uint64(bytes[1]) << (1 * 8)
@@ -248,7 +247,7 @@
return snelems
}
- s.allocCache >>= (bitIndex + 1)
+ s.allocCache >>= uint(bitIndex + 1)
sfreeindex = result + 1
if sfreeindex%64 == 0 && sfreeindex != snelems {
@@ -269,10 +268,8 @@
if index < s.freeindex {
return false
}
- whichByte := index / 8
- whichBit := index % 8
- byteVal := *addb(s.allocBits, whichByte)
- return byteVal&uint8(1<<whichBit) == 0
+ bytep, mask := s.allocBits.bitp(index)
+ return *bytep&mask == 0
}
func (s *mspan) objIndex(p uintptr) uintptr {
@@ -294,14 +291,12 @@
}
func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
- whichByte := objIndex / 8
- bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
- bytePtr := addb(s.gcmarkBits, whichByte)
- return markBits{bytePtr, bitMask, objIndex}
+ bytep, mask := s.gcmarkBits.bitp(objIndex)
+ return markBits{bytep, mask, objIndex}
}
func (s *mspan) markBitsForBase() markBits {
- return markBits{s.gcmarkBits, uint8(1), 0}
+ return markBits{(*uint8)(s.gcmarkBits), uint8(1), 0}
}
// isMarked reports whether mark bit m is set.
@@ -332,11 +327,6 @@
atomic.And8(m.bytep, ^m.mask)
}
-// clearMarkedNonAtomic clears the marked bit non-atomically.
-func (m markBits) clearMarkedNonAtomic() {
- *m.bytep ^= m.mask
-}
-
// markBitsForSpan returns the markBits for the span base address base.
func markBitsForSpan(base uintptr) (mbits markBits) {
if base < mheap_.arena_start || base >= mheap_.arena_used {
@@ -374,6 +364,7 @@
// heapBitsForSpan returns the heapBits for the span base address base.
func heapBitsForSpan(base uintptr) (hbits heapBits) {
if base < mheap_.arena_start || base >= mheap_.arena_used {
+ print("runtime: base ", hex(base), " not in range [", hex(mheap_.arena_start), ",", hex(mheap_.arena_used), ")\n")
throw("heapBitsForSpan: base out of range")
}
return heapBitsForAddr(base)
@@ -400,7 +391,7 @@
// Consult the span table to find the block beginning.
s = mheap_.spans[idx]
if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
- if s == nil || s.state == _MSpanStack {
+ if s == nil || s.state == _MSpanManual {
// If s is nil, the virtual address has never been part of the heap.
// This pointer may be to some mmap'd region, so we allow it.
// Pointers into stacks are also ok, the runtime manages these explicitly.
@@ -430,6 +421,7 @@
print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
gcDumpObject("object", refBase, refOff)
}
+ getg().m.traceback = 2
throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
}
return
@@ -509,16 +501,6 @@
return h.bits()&bitPointer != 0
}
-// hasPointers reports whether the given object has any pointers.
-// It must be told how large the object at h is for efficiency.
-// h must describe the initial word of the object.
-func (h heapBits) hasPointers(size uintptr) bool {
- if size == sys.PtrSize { // 1-word objects are always pointers
- return true
- }
- return (*h.bitp>>h.shift)&bitScan != 0
-}
-
// isCheckmarked reports whether the heap bits have the checkmarked bit set.
// It must be told how large the object at h is, because the encoding of the
// checkmark bit varies by size.
@@ -578,29 +560,9 @@
return
}
if !inheap(dst) {
- // If dst is on the stack and in a higher frame than the
- // caller, we either need to execute write barriers on
- // it (which is what happens for normal stack writes
- // through pointers to higher frames), or we need to
- // force the mark termination stack scan to scan the
- // frame containing dst.
- //
- // Executing write barriers on dst is complicated in the
- // general case because we either need to unwind the
- // stack to get the stack map, or we need the type's
- // bitmap, which may be a GC program.
- //
- // Hence, we opt for forcing the re-scan to scan the
- // frame containing dst, which we can do by simply
- // unwinding the stack barriers between the current SP
- // and dst's frame.
gp := getg().m.curg
if gp != nil && gp.stack.lo <= dst && dst < gp.stack.hi {
- // Run on the system stack to give it more
- // stack space.
- systemstack(func() {
- gcUnwindBarriers(gp, dst)
- })
+ // Destination is our own stack. No need for barriers.
return
}
@@ -848,23 +810,23 @@
4, 5, 5, 6, 5, 6, 6, 7,
5, 6, 6, 7, 6, 7, 7, 8}
-// countFree runs through the mark bits in a span and counts the number of free objects
-// in the span.
+// countAlloc returns the number of objects allocated in span s by
+// scanning the allocation bitmap.
// TODO:(rlh) Use popcount intrinsic.
-func (s *mspan) countFree() int {
+func (s *mspan) countAlloc() int {
count := 0
maxIndex := s.nelems / 8
for i := uintptr(0); i < maxIndex; i++ {
- mrkBits := *addb(s.gcmarkBits, i)
+ mrkBits := *s.gcmarkBits.bytep(i)
count += int(oneBitCount[mrkBits])
}
if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
- mrkBits := *addb(s.gcmarkBits, maxIndex)
+ mrkBits := *s.gcmarkBits.bytep(maxIndex)
mask := uint8((1 << bitsInLastByte) - 1)
bits := mrkBits & mask
count += int(oneBitCount[bits])
}
- return int(s.nelems) - count
+ return count
}
// heapBitsSetType records that the new allocation [x, x+size)
@@ -1085,7 +1047,9 @@
endnb += endnb
}
// Truncate to a multiple of original ptrmask.
- endnb = maxBits / nb * nb
+ // Because nb+nb <= maxBits, nb fits in a byte.
+ // Byte division is cheaper than uintptr division.
+ endnb = uintptr(maxBits/byte(nb)) * nb
pbits &= 1<<endnb - 1
b = pbits
nb = endnb
@@ -1363,13 +1327,6 @@
}
}
-// heapBitsSetTypeNoScan marks x as noscan by setting the first word
-// of x in the heap bitmap to scalar/dead.
-func heapBitsSetTypeNoScan(x uintptr) {
- h := heapBitsForAddr(uintptr(x))
- *h.bitp &^= (bitPointer | bitScan) << h.shift
-}
-
var debugPtrmask struct {
lock mutex
data *byte
@@ -1902,7 +1859,7 @@
frame.sp = uintptr(p)
_g_ := getg()
gentraceback(_g_.m.curg.sched.pc, _g_.m.curg.sched.sp, 0, _g_.m.curg, 0, nil, 1000, getgcmaskcb, noescape(unsafe.Pointer(&frame)), 0)
- if frame.fn != nil {
+ if frame.fn.valid() {
f := frame.fn
targetpc := frame.continpc
if targetpc == 0 {
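
countAlloc above walks the gcmarkBits bitmap a byte at a time and sums a 256-entry popcount table (oneBitCount), masking the final partial byte. The same computation over a plain byte slice, using math/bits.OnesCount8 (available since Go 1.9) in place of the table; countAlloc here is a free-standing sketch, not the method itself:

package main

import (
	"fmt"
	"math/bits"
)

// countAlloc counts set bits among the first nelems bits of markBits:
// whole bytes first, then the masked tail byte.
func countAlloc(markBits []byte, nelems uintptr) int {
	count := 0
	for i := uintptr(0); i < nelems/8; i++ {
		count += bits.OnesCount8(markBits[i])
	}
	if rem := nelems % 8; rem != 0 {
		mask := byte(1<<rem - 1)
		count += bits.OnesCount8(markBits[nelems/8] & mask)
	}
	return count
}

func main() {
	bitmap := []byte{0xb1, 0x07} // 4 bits set, then 3 in the low bits
	fmt.Println(countAlloc(bitmap, 11)) // 7: all of byte 0 plus the low 3 bits of byte 1
}
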
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index c483310..96fb273 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -33,7 +33,8 @@
local_tinyallocs uintptr // number of tiny allocs not counted in other stats
// The rest is not accessed on every malloc.
- alloc [_NumSizeClasses]*mspan // spans to allocate from
+
+ alloc [numSpanClasses]*mspan // spans to allocate from, indexed by spanClass
stackcache [_NumStackOrders]stackfreelist
@@ -77,7 +78,7 @@
lock(&mheap_.lock)
c := (*mcache)(mheap_.cachealloc.alloc())
unlock(&mheap_.lock)
- for i := 0; i < _NumSizeClasses; i++ {
+ for i := range c.alloc {
c.alloc[i] = &emptymspan
}
c.next_sample = nextSample()
@@ -103,12 +104,12 @@
// Gets a span that has a free object in it and assigns it
// to be the cached span for the given sizeclass. Returns this span.
-func (c *mcache) refill(sizeclass int32) *mspan {
+func (c *mcache) refill(spc spanClass) *mspan {
_g_ := getg()
_g_.m.locks++
// Return the current cached span to the central lists.
- s := c.alloc[sizeclass]
+ s := c.alloc[spc]
if uintptr(s.allocCount) != s.nelems {
throw("refill of span with free space remaining")
@@ -119,7 +120,7 @@
}
// Get a new cached span from the central lists.
- s = mheap_.central[sizeclass].mcentral.cacheSpan()
+ s = mheap_.central[spc].mcentral.cacheSpan()
if s == nil {
throw("out of memory")
}
@@ -128,13 +129,13 @@
throw("span has no free space")
}
- c.alloc[sizeclass] = s
+ c.alloc[spc] = s
_g_.m.locks--
return s
}
func (c *mcache) releaseAll() {
- for i := 0; i < _NumSizeClasses; i++ {
+ for i := range c.alloc {
s := c.alloc[i]
if s != &emptymspan {
mheap_.central[i].mcentral.uncacheSpan(s)
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index ddcf81e..eaabcb9 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -19,14 +19,19 @@
//go:notinheap
type mcentral struct {
lock mutex
- sizeclass int32
+ spanclass spanClass
nonempty mSpanList // list of spans with a free object, ie a nonempty free list
empty mSpanList // list of spans with no free objects (or cached in an mcache)
+
+ // nmalloc is the cumulative count of objects allocated from
+ // this mcentral, assuming all spans in mcaches are
+ // fully-allocated. Written atomically, read under STW.
+ nmalloc uint64
}
// Initialize a single central free list.
-func (c *mcentral) init(sizeclass int32) {
- c.sizeclass = sizeclass
+func (c *mcentral) init(spc spanClass) {
+ c.spanclass = spc
c.nonempty.init()
c.empty.init()
}
@@ -34,10 +39,14 @@
// Allocate a span to use in an MCache.
func (c *mcentral) cacheSpan() *mspan {
// Deduct credit for this span allocation and sweep if necessary.
- spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+ spanBytes := uintptr(class_to_allocnpages[c.spanclass.sizeclass()]) * _PageSize
deductSweepCredit(spanBytes, 0)
lock(&c.lock)
+ traceDone := false
+ if trace.enabled {
+ traceGCSweepStart()
+ }
sg := mheap_.sweepgen
retry:
var s *mspan
@@ -87,6 +96,10 @@
// all subsequent ones must also be either swept or in process of sweeping
break
}
+ if trace.enabled {
+ traceGCSweepDone()
+ traceDone = true
+ }
unlock(&c.lock)
// Replenish central list if empty.
@@ -101,15 +114,18 @@
// At this point s is a non-empty span, queued at the end of the empty list,
// c is unlocked.
havespan:
+ if trace.enabled && !traceDone {
+ traceGCSweepDone()
+ }
cap := int32((s.npages << _PageShift) / s.elemsize)
n := cap - int32(s.allocCount)
if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
throw("span has no free objects")
}
+ // Assume all objects from this span will be allocated in the
+ // mcache. If it gets uncached, we'll adjust this.
+ atomic.Xadd64(&c.nmalloc, int64(n))
usedBytes := uintptr(s.allocCount) * s.elemsize
- if usedBytes > 0 {
- reimburseSweepCredit(usedBytes)
- }
atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
if trace.enabled {
// heap_live changed.
@@ -150,6 +166,10 @@
// mCentral_CacheSpan conservatively counted
// unallocated slots in heap_live. Undo this.
atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ // cacheSpan updated alloc assuming all objects on s
+ // were going to be allocated. Adjust for any that
+ // weren't.
+ atomic.Xadd64(&c.nmalloc, -int64(n))
}
unlock(&c.lock)
}
@@ -205,11 +225,11 @@
// grow allocates a new empty span from the heap and initializes it for c's size class.
func (c *mcentral) grow() *mspan {
- npages := uintptr(class_to_allocnpages[c.sizeclass])
- size := uintptr(class_to_size[c.sizeclass])
+ npages := uintptr(class_to_allocnpages[c.spanclass.sizeclass()])
+ size := uintptr(class_to_size[c.spanclass.sizeclass()])
n := (npages << _PageShift) / size
- s := mheap_.alloc(npages, c.sizeclass, false, true)
+ s := mheap_.alloc(npages, c.spanclass, false, true)
if s == nil {
return nil
}
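
Illustrative only (not runtime code): the new nmalloc field is maintained with an assume-then-adjust pattern, as the cacheSpan and uncacheSpan hunks above show. A minimal standalone sketch of that counting:

package main

import (
	"fmt"
	"sync/atomic"
)

// The "assume fully allocated, adjust on return" counting used for
// mcentral.nmalloc, reduced to a toy.
type central struct {
	nmalloc int64 // objects allocated, assuming cached spans are fully used
}

func (c *central) cacheSpan(freeObjects int64) {
	// Optimistically count every free slot handed to the mcache as allocated.
	atomic.AddInt64(&c.nmalloc, freeObjects)
}

func (c *central) uncacheSpan(unusedObjects int64) {
	// The mcache returned the span with some slots never used; undo those.
	atomic.AddInt64(&c.nmalloc, -unusedObjects)
}

func main() {
	var c central
	c.cacheSpan(64)   // a span with 64 free objects moves into an mcache
	c.uncacheSpan(10) // 10 of them were never allocated
	fmt.Println(atomic.LoadInt64(&c.nmalloc)) // 54 objects actually allocated
}
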
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index a65933f..e0d2347 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -59,6 +59,7 @@
return p
}
+const _sunosEAGAIN = 11
const _ENOMEM = 12
func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
@@ -76,7 +77,7 @@
flags |= _MAP_FIXED
}
p := mmap(v, n, _PROT_READ|_PROT_WRITE, flags, -1, 0)
- if uintptr(p) == _ENOMEM {
+ if uintptr(p) == _ENOMEM || (GOOS == "solaris" && uintptr(p) == _sunosEAGAIN) {
throw("runtime: out of memory")
}
if p != v {
@@ -87,7 +88,7 @@
}
p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
- if uintptr(p) == _ENOMEM {
+ if uintptr(p) == _ENOMEM || (GOOS == "solaris" && uintptr(p) == _sunosEAGAIN) {
throw("runtime: out of memory")
}
if p != v {
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go
index 2c338c8..c37c82a 100644
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -16,6 +16,9 @@
_PAGE_READWRITE = 0x0004
_PAGE_NOACCESS = 0x0001
+
+ _ERROR_NOT_ENOUGH_MEMORY = 8
+ _ERROR_COMMITMENT_LIMIT = 1455
)
// Don't split the stack as this function may be invoked without a valid G,
@@ -112,7 +115,13 @@
mSysStatInc(sysStat, n)
p := stdcall4(_VirtualAlloc, uintptr(v), n, _MEM_COMMIT, _PAGE_READWRITE)
if p != uintptr(v) {
- print("runtime: VirtualAlloc of ", n, " bytes failed with errno=", getlasterror(), "\n")
- throw("runtime: cannot map pages in arena address space")
+ errno := getlasterror()
+ print("runtime: VirtualAlloc of ", n, " bytes failed with errno=", errno, "\n")
+ switch errno {
+ case _ERROR_NOT_ENOUGH_MEMORY, _ERROR_COMMITMENT_LIMIT:
+ throw("out of memory")
+ default:
+ throw("runtime: cannot map pages in arena address space")
+ }
}
}
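
A hedged sketch of the classification the new VirtualAlloc error path performs: the two Windows error codes added above are reported as plain out-of-memory, and anything else is still treated as a mapping failure. The program below is a toy, not runtime code; the constant names mirror the ones added in the hunk.

package main

import "fmt"

const (
	errNotEnoughMemory = 8    // _ERROR_NOT_ENOUGH_MEMORY
	errCommitmentLimit = 1455 // _ERROR_COMMITMENT_LIMIT
)

func classifyVirtualAllocError(errno uint32) string {
	switch errno {
	case errNotEnoughMemory, errCommitmentLimit:
		return "out of memory" // reported as OOM rather than a runtime bug
	default:
		return "cannot map pages in arena address space"
	}
}

func main() {
	fmt.Println(classifyVirtualAllocError(1455)) // out of memory
	fmt.Println(classifyVirtualAllocError(5))    // cannot map pages in arena address space
}
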
diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s
index ef6e602..1adb26b 100644
--- a/src/runtime/memclr_386.s
+++ b/src/runtime/memclr_386.s
@@ -27,8 +27,8 @@
JBE _5through8
CMPL BX, $16
JBE _9through16
- TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
- JEQ nosse2
+ CMPB runtime·support_sse2(SB), $1
+ JNE nosse2
PXOR X0, X0
CMPL BX, $32
JBE _17through32
diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s
index b712ea1..e76201b 100644
--- a/src/runtime/memmove_386.s
+++ b/src/runtime/memmove_386.s
@@ -49,8 +49,8 @@
JBE move_5through8
CMPL BX, $16
JBE move_9through16
- TESTL $0x4000000, runtime·cpuid_edx(SB) // check for sse2
- JEQ nosse2
+ CMPB runtime·support_sse2(SB), $1
+ JNE nosse2
CMPL BX, $32
JBE move_17through32
CMPL BX, $64
@@ -71,8 +71,8 @@
*/
forward:
// If REP MOVSB isn't fast, don't use it
- TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
- JEQ fwdBy4
+ CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
+ JNE fwdBy4
// Check alignment
MOVL SI, AX
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index c2286d3..21bf8e4 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -64,8 +64,8 @@
JBE move_129through256
// TODO: use branch table and BSR to make this just a single dispatch
- TESTB $1, runtime·useRepMovs(SB)
- JZ avxUnaligned
+ TESTB $1, runtime·useAVXmemmove(SB)
+ JNZ avxUnaligned
/*
* check and set for backwards
@@ -81,8 +81,8 @@
JLS move_256through2048
// If REP MOVSB isn't fast, don't use it
- TESTL $(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
- JEQ fwdBy8
+ CMPB runtime·support_erms(SB), $1 // enhanced REP MOVSB/STOSB
+ JNE fwdBy8
// Check alignment
MOVL SI, AX
@@ -407,7 +407,7 @@
gobble_mem_fwd_loop:
PREFETCHNTA 0x1C0(SI)
PREFETCHNTA 0x280(SI)
- // Prefetch values were choosen empirically.
+ // Prefetch values were chosen empirically.
// Approach for prefetch usage as in 7.6.6 of [1]
// [1] 64-ia-32-architectures-optimization-manual.pdf
// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
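
The 386/amd64 memclr and memmove changes above switch from re-testing raw CPUID words on every call to checking byte flags that the runtime computes once at startup (support_sse2, support_erms, useAVXmemmove). A rough standalone sketch of that decode step; the bit positions follow the diff's own comments, and the program is illustrative rather than the runtime's actual feature-detection code.

package main

import "fmt"

var (
	supportSSE2 bool
	supportERMS bool
)

// initCPUFlags decodes CPUID words into simple booleans once, so hot
// paths can test a single byte instead of masking bits each time.
func initCPUFlags(edx1, ebx7 uint32) {
	supportSSE2 = edx1&(1<<26) != 0 // CPUID.1:EDX bit 26 (0x4000000), SSE2
	supportERMS = ebx7&(1<<9) != 0  // CPUID.(EAX=7):EBX bit 9, enhanced REP MOVSB/STOSB
}

func main() {
	initCPUFlags(1<<26, 1<<9) // pretend both features are present
	fmt.Println(supportSSE2, supportERMS) // true true
}
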
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 7e191d4..a8729b1 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -12,8 +12,12 @@
"unsafe"
)
+// finblock is an array of finalizers to be executed. finblocks are
+// arranged in a linked list for the finalizer queue.
+//
// finblock is allocated from non-GC'd memory, so any heap pointers
-// must be specially handled.
+// must be specially handled. GC currently assumes that the finalizer
+// queue does not grow during marking (but it can shrink).
//
//go:notinheap
type finblock struct {
@@ -71,6 +75,16 @@
}
func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot *ptrtype) {
+ if gcphase != _GCoff {
+ // Currently we assume that the finalizer queue won't
+ // grow during marking so we don't have to rescan it
+ // during mark termination. If we ever need to lift
+ // this assumption, we can do it by adding the
+ // necessary barriers to queuefinalizer (which it may
+ // have automatically).
+ throw("queuefinalizer during GC")
+ }
+
lock(&finlock)
if finq == nil || finq.cnt == uint32(len(finq.fin)) {
if finc == nil {
@@ -441,7 +455,7 @@
}
n = s.elemsize
- if s.sizeclass != 0 {
+ if s.spanclass.sizeclass() != 0 {
x = add(x, (uintptr(v)-uintptr(x))/n*n)
}
return
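
For context, a small user-level program exercising the normal finalizer path that the new check in queuefinalizer above relies on: finalizers are queued by the sweeper while GC is off, never during marking, so ordinary SetFinalizer use never reaches the new throw. Finalizer timing is best-effort, hence the timeout in this sketch.

package main

import (
	"fmt"
	"runtime"
	"time"
)

func main() {
	ran := make(chan struct{})
	obj := new([64]byte)
	runtime.SetFinalizer(obj, func(*[64]byte) { close(ran) })
	obj = nil

	runtime.GC() // obj is unreachable; sweeping queues its finalizer
	select {
	case <-ran:
		fmt.Println("finalizer ran")
	case <-time.After(time.Second):
		// Finalizer scheduling is best-effort; it may not have run yet.
		fmt.Println("finalizer not run yet")
	}
}
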
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index fe4b0fc..7496671 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -29,7 +29,7 @@
first func(arg, p unsafe.Pointer) // called first time p is returned
arg unsafe.Pointer
list *mlink
- chunk unsafe.Pointer
+ chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers
nchunk uint32
inuse uintptr // in-use bytes now
stat *uint64
@@ -54,7 +54,7 @@
f.first = first
f.arg = arg
f.list = nil
- f.chunk = nil
+ f.chunk = 0
f.nchunk = 0
f.inuse = 0
f.stat = stat
@@ -77,15 +77,15 @@
return v
}
if uintptr(f.nchunk) < f.size {
- f.chunk = persistentalloc(_FixAllocChunk, 0, f.stat)
+ f.chunk = uintptr(persistentalloc(_FixAllocChunk, 0, f.stat))
f.nchunk = _FixAllocChunk
}
- v := f.chunk
+ v := unsafe.Pointer(f.chunk)
if f.first != nil {
f.first(f.arg, v)
}
- f.chunk = add(f.chunk, f.size)
+ f.chunk = f.chunk + f.size
f.nchunk -= uint32(f.size)
f.inuse += f.size
return v
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index cd57720..111fa78 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -178,17 +178,21 @@
throw("size of Workbuf is suboptimal")
}
+ // No sweep on the first cycle.
+ mheap_.sweepdone = 1
+
+ // Set a reasonable initial GC trigger.
+ memstats.triggerRatio = 7 / 8.0
+
+ // Fake a heap_marked value so it looks like a trigger at
+ // heapminimum is the appropriate growth from heap_marked.
+ // This will go into computing the initial GC goal.
+ memstats.heap_marked = uint64(float64(heapminimum) / (1 + memstats.triggerRatio))
+
+ // Set gcpercent from the environment. This will also compute
+ // and set the GC trigger and goal.
_ = setGCPercent(readgogc())
- memstats.gc_trigger = heapminimum
- // Compute the goal heap size based on the trigger:
- // trigger = marked * (1 + triggerRatio)
- // marked = trigger / (1 + triggerRatio)
- // goal = marked * (1 + GOGC/100)
- // = trigger / (1 + triggerRatio) * (1 + GOGC/100)
- memstats.next_gc = uint64(float64(memstats.gc_trigger) / (1 + gcController.triggerRatio) * (1 + float64(gcpercent)/100))
- if gcpercent < 0 {
- memstats.next_gc = ^uint64(0)
- }
+
work.startSema = 1
work.markDoneSema = 1
}
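
A worked example of the initial pacing gcinit now sets up, assuming the default 4 MB heapminimum and GOGC=100 (both are assumptions of this sketch, not values taken from the diff):

package main

import "fmt"

func main() {
	const heapminimum = 4 << 20
	triggerRatio := 7 / 8.0
	gcpercent := uint64(100)

	// Fake heap_marked so a trigger at heapminimum looks like the right
	// growth from the "previous" cycle, as the hunk above does.
	heapMarked := uint64(float64(heapminimum) / (1 + triggerRatio))

	trigger := uint64(float64(heapMarked) * (1 + triggerRatio))
	goal := heapMarked + heapMarked*gcpercent/100

	// gcSetTriggerRatio then clamps the trigger up to heapminimum.
	fmt.Println(heapMarked, trigger, goal) // 2236962 4194303 4473924
}
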
@@ -223,12 +227,8 @@
}
gcpercent = in
heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
- if gcController.triggerRatio > float64(gcpercent)/100 {
- gcController.triggerRatio = float64(gcpercent) / 100
- }
- // This is either in gcinit or followed by a STW GC, both of
- // which will reset other stats like memstats.gc_trigger and
- // memstats.next_gc to appropriate values.
+ // Update pacing in response to gcpercent change.
+ gcSetTriggerRatio(memstats.triggerRatio)
unlock(&mheap_.lock)
return out
}
@@ -238,7 +238,9 @@
var gcphase uint32
// The compiler knows about this variable.
-// If you change it, you must change the compiler too.
+// If you change it, you must change builtin/runtime.go, too.
+// If you change the first four bytes, you must also change the write
+// barrier insertion code.
var writeBarrier struct {
enabled bool // compiler emits a check of this before calling write barrier
pad [3]byte // compiler uses 32-bit load for "enabled" field
@@ -328,10 +330,10 @@
// utilization between assist and background marking to be 25% of
// GOMAXPROCS. The high-level design of this algorithm is documented
// at https://golang.org/s/go15gcpacing.
-var gcController = gcControllerState{
- // Initial trigger ratio guess.
- triggerRatio: 7 / 8.0,
-}
+//
+// All fields of gcController are used only during a single mark
+// cycle.
+var gcController gcControllerState
type gcControllerState struct {
// scanWork is the total scan work performed this cycle. This
@@ -402,14 +404,6 @@
// beginning of each cycle.
fractionalUtilizationGoal float64
- // triggerRatio is the heap growth ratio at which the garbage
- // collection cycle should start. E.g., if this is 0.6, then
- // GC should start when the live heap has reached 1.6 times
- // the heap size marked by the previous cycle. This should be
- // ≤ GOGC/100 so the trigger heap size is less than the goal
- // heap size. This is updated at the end of of each cycle.
- triggerRatio float64
-
_ [sys.CacheLineSize]byte
// fractionalMarkWorkersNeeded is the number of fractional
@@ -438,7 +432,7 @@
// first cycle) or may be much smaller (resulting in a large
// error response).
if memstats.gc_trigger <= heapminimum {
- memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + c.triggerRatio))
+ memstats.heap_marked = uint64(float64(memstats.gc_trigger) / (1 + memstats.triggerRatio))
}
// Re-compute the heap goal for this cycle in case something
@@ -494,17 +488,12 @@
// revise updates the assist ratio during the GC cycle to account for
// improved estimates. This should be called either under STW or
-// whenever memstats.heap_scan or memstats.heap_live is updated (with
-// mheap_.lock held).
+// whenever memstats.heap_scan, memstats.heap_live, or
+// memstats.next_gc is updated (with mheap_.lock held).
//
// It should only be called when gcBlackenEnabled != 0 (because this
// is when assists are enabled and the necessary statistics are
// available).
-//
-// TODO: Consider removing the periodic controller update altogether.
-// Since we switched to allocating black, in theory we shouldn't have
-// to change the assist ratio. However, this is still a useful hook
-// that we've found many uses for when experimenting.
func (c *gcControllerState) revise() {
// Compute the expected scan work remaining.
//
@@ -535,7 +524,7 @@
}
// Compute the heap distance remaining.
- heapDistance := int64(memstats.next_gc) - int64(memstats.heap_live)
+ heapDistance := int64(memstats.next_gc) - int64(atomic.Load64(&memstats.heap_live))
if heapDistance <= 0 {
// This shouldn't happen, but if it does, avoid
// dividing by zero or setting the assist negative.
@@ -549,10 +538,15 @@
c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
}
-// endCycle updates the GC controller state at the end of the
-// concurrent part of the GC cycle.
-func (c *gcControllerState) endCycle() {
- h_t := c.triggerRatio // For debugging
+// endCycle computes the trigger ratio for the next cycle.
+func (c *gcControllerState) endCycle() float64 {
+ if work.userForced {
+ // Forced GC means this cycle didn't start at the
+ // trigger, so where it finished isn't good
+ // information about how to adjust the trigger.
+ // Just leave it where it is.
+ return memstats.triggerRatio
+ }
// Proportional response gain for the trigger controller. Must
// be in [0, 1]. Lower values smooth out transient effects but
@@ -581,25 +575,17 @@
utilization += float64(c.assistTime) / float64(assistDuration*int64(gomaxprocs))
}
- triggerError := goalGrowthRatio - c.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-c.triggerRatio)
+ triggerError := goalGrowthRatio - memstats.triggerRatio - utilization/gcGoalUtilization*(actualGrowthRatio-memstats.triggerRatio)
// Finally, we adjust the trigger for next time by this error,
// damped by the proportional gain.
- c.triggerRatio += triggerGain * triggerError
- if c.triggerRatio < 0 {
- // This can happen if the mutator is allocating very
- // quickly or the GC is scanning very slowly.
- c.triggerRatio = 0
- } else if c.triggerRatio > goalGrowthRatio*0.95 {
- // Ensure there's always a little margin so that the
- // mutator assist ratio isn't infinity.
- c.triggerRatio = goalGrowthRatio * 0.95
- }
+ triggerRatio := memstats.triggerRatio + triggerGain*triggerError
if debug.gcpacertrace > 0 {
// Print controller state in terms of the design
// document.
H_m_prev := memstats.heap_marked
+ h_t := memstats.triggerRatio
H_T := memstats.gc_trigger
h_a := actualGrowthRatio
H_a := memstats.heap_live
@@ -619,6 +605,8 @@
" u_a/u_g=", u_a/u_g,
"\n")
}
+
+ return triggerRatio
}
// enlistWorker encourages another dedicated mark worker to start on
@@ -650,7 +638,7 @@
}
myID := gp.m.p.ptr().id
for tries := 0; tries < 5; tries++ {
- id := int32(fastrand() % uint32(gomaxprocs-1))
+ id := int32(fastrandn(uint32(gomaxprocs - 1)))
if id >= myID {
id++
}
@@ -700,9 +688,6 @@
// This P is now dedicated to marking until the end of
// the concurrent mark phase.
_p_.gcMarkWorkerMode = gcMarkWorkerDedicatedMode
- // TODO(austin): This P isn't going to run anything
- // else for a while, so kick everything out of its run
- // queue.
} else {
if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
// No more workers are needed right now.
@@ -760,6 +745,120 @@
return gp
}
+// gcSetTriggerRatio sets the trigger ratio and updates everything
+// derived from it: the absolute trigger, the heap goal, mark pacing,
+// and sweep pacing.
+//
+// This can be called any time. If GC is in the middle of a
+// concurrent phase, it will adjust the pacing of that phase.
+//
+// This depends on gcpercent, memstats.heap_marked, and
+// memstats.heap_live. These must be up to date.
+//
+// mheap_.lock must be held or the world must be stopped.
+func gcSetTriggerRatio(triggerRatio float64) {
+ // Set the trigger ratio, capped to reasonable bounds.
+ if triggerRatio < 0 {
+ // This can happen if the mutator is allocating very
+ // quickly or the GC is scanning very slowly.
+ triggerRatio = 0
+ } else if gcpercent >= 0 {
+ // Ensure there's always a little margin so that the
+ // mutator assist ratio isn't infinity.
+ maxTriggerRatio := 0.95 * float64(gcpercent) / 100
+ if triggerRatio > maxTriggerRatio {
+ triggerRatio = maxTriggerRatio
+ }
+ }
+ memstats.triggerRatio = triggerRatio
+
+ // Compute the absolute GC trigger from the trigger ratio.
+ //
+ // We trigger the next GC cycle when the allocated heap has
+ // grown by the trigger ratio over the marked heap size.
+ trigger := ^uint64(0)
+ if gcpercent >= 0 {
+ trigger = uint64(float64(memstats.heap_marked) * (1 + triggerRatio))
+ // Don't trigger below the minimum heap size.
+ minTrigger := heapminimum
+ if !gosweepdone() {
+ // Concurrent sweep happens in the heap growth
+ // from heap_live to gc_trigger, so ensure
+ // that concurrent sweep has some heap growth
+ // in which to perform sweeping before we
+ // start the next GC cycle.
+ sweepMin := atomic.Load64(&memstats.heap_live) + sweepMinHeapDistance*uint64(gcpercent)/100
+ if sweepMin > minTrigger {
+ minTrigger = sweepMin
+ }
+ }
+ if trigger < minTrigger {
+ trigger = minTrigger
+ }
+ if int64(trigger) < 0 {
+ print("runtime: next_gc=", memstats.next_gc, " heap_marked=", memstats.heap_marked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "triggerRatio=", triggerRatio, " minTrigger=", minTrigger, "\n")
+ throw("gc_trigger underflow")
+ }
+ }
+ memstats.gc_trigger = trigger
+
+ // Compute the next GC goal, which is when the allocated heap
+ // has grown by GOGC/100 over the heap marked by the last
+ // cycle.
+ goal := ^uint64(0)
+ if gcpercent >= 0 {
+ goal = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
+ if goal < trigger {
+ // The trigger ratio is always less than GOGC/100, but
+ // other bounds on the trigger may have raised it.
+ // Push up the goal, too.
+ goal = trigger
+ }
+ }
+ memstats.next_gc = goal
+ if trace.enabled {
+ traceNextGC()
+ }
+
+ // Update mark pacing.
+ if gcphase != _GCoff {
+ gcController.revise()
+ }
+
+ // Update sweep pacing.
+ if gosweepdone() {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ // Concurrent sweep needs to sweep all of the in-use
+ // pages by the time the allocated heap reaches the GC
+ // trigger. Compute the ratio of in-use pages to sweep
+ // per byte allocated, accounting for the fact that
+ // some might already be swept.
+ heapLiveBasis := atomic.Load64(&memstats.heap_live)
+ heapDistance := int64(trigger) - int64(heapLiveBasis)
+ // Add a little margin so rounding errors and
+ // concurrent sweep are less likely to leave pages
+ // unswept when GC starts.
+ heapDistance -= 1024 * 1024
+ if heapDistance < _PageSize {
+ // Avoid setting the sweep ratio extremely high
+ heapDistance = _PageSize
+ }
+ pagesSwept := atomic.Load64(&mheap_.pagesSwept)
+ sweepDistancePages := int64(mheap_.pagesInUse) - int64(pagesSwept)
+ if sweepDistancePages <= 0 {
+ mheap_.sweepPagesPerByte = 0
+ } else {
+ mheap_.sweepPagesPerByte = float64(sweepDistancePages) / float64(heapDistance)
+ mheap_.sweepHeapLiveBasis = heapLiveBasis
+ // Write pagesSweptBasis last, since this
+ // signals concurrent sweeps to recompute
+ // their debt.
+ atomic.Store64(&mheap_.pagesSweptBasis, pagesSwept)
+ }
+ }
+}
+
// gcGoalUtilization is the goal CPU utilization for background
// marking as a fraction of GOMAXPROCS.
const gcGoalUtilization = 0.25
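
Illustrative numbers for the sweep-pacing branch of gcSetTriggerRatio above, using made-up heap figures (an assumption of this sketch) to show how sweepPagesPerByte comes out:

package main

import "fmt"

func main() {
	// A heap with 1000 in-use pages, 200 already swept, heap_live = 8MB
	// and a 12MB trigger.
	const pageSize = 8 << 10
	heapLive := uint64(8 << 20)
	trigger := uint64(12 << 20)
	pagesInUse, pagesSwept := int64(1000), int64(200)

	heapDistance := int64(trigger) - int64(heapLive)
	heapDistance -= 1024 * 1024 // margin, as in gcSetTriggerRatio
	if heapDistance < pageSize {
		heapDistance = pageSize // avoid an extreme sweep ratio
	}
	sweepDistancePages := pagesInUse - pagesSwept
	sweepPagesPerByte := float64(sweepDistancePages) / float64(heapDistance)
	fmt.Printf("%.6f pages/byte\n", sweepPagesPerByte) // 0.000254 pages/byte
}
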
@@ -782,10 +881,23 @@
const gcOverAssistWork = 64 << 10
var work struct {
- full uint64 // lock-free list of full blocks workbuf
- empty uint64 // lock-free list of empty blocks workbuf
+ full lfstack // lock-free list of full blocks workbuf
+ empty lfstack // lock-free list of empty blocks workbuf
pad0 [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+ wbufSpans struct {
+ lock mutex
+ // free is a list of spans dedicated to workbufs, but
+ // that don't currently contain any workbufs.
+ free mSpanList
+ // busy is a list of all spans containing workbufs on
+ // one of the workbuf lists.
+ busy mSpanList
+ }
+
+ // Restore 64-bit alignment on 32-bit.
+ _ uint32
+
// bytesMarked is the number of bytes marked this cycle. This
// includes bytes blackened in scanned objects, noscan objects
// that go straight to black, and permagrey objects scanned by
@@ -815,15 +927,13 @@
// should pass gcDrainBlock to gcDrain to block in the
// getfull() barrier. Otherwise, they should pass gcDrainNoBlock.
//
- // TODO: This is a temporary fallback to support
- // debug.gcrescanstacks > 0 and to work around some known
- // races. Remove this when we remove the debug option and fix
- // the races.
+ // TODO: This is a temporary fallback to work around races
+ // that cause early mark termination.
helperDrainBlock bool
// Number of roots of various root types. Set by gcMarkRootPrepare.
- nFlushCacheRoots int
- nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int
+ nFlushCacheRoots int
+ nDataRoots, nBSSRoots, nSpanRoots, nStackRoots int
// markrootDone indicates that roots have been marked at least
// once during the current GC cycle. This is checked by root
@@ -859,6 +969,10 @@
// mode is the concurrency mode of the current GC cycle.
mode gcMode
+ // userForced indicates the current GC cycle was forced by an
+ // explicit user call.
+ userForced bool
+
// totaltime is the CPU nanoseconds spent in GC since the
// program started if debug.gctrace > 0.
totaltime int64
@@ -875,14 +989,19 @@
head, tail guintptr
}
- // rescan is a list of G's that need to be rescanned during
- // mark termination. A G adds itself to this list when it
- // first invalidates its stack scan.
- rescan struct {
+ // sweepWaiters is a list of blocked goroutines to wake when
+ // we transition from mark termination to sweep.
+ sweepWaiters struct {
lock mutex
- list []guintptr
+ head guintptr
}
+ // cycles is the number of completed GC cycles, where a GC
+ // cycle is sweep termination, mark, mark termination, and
+ // sweep. This differs from memstats.numgc, which is
+ // incremented at mark termination.
+ cycles uint32
+
// Timing/utilization stats for this cycle.
stwprocs, maxprocs int32
tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
@@ -898,7 +1017,94 @@
// garbage collection is complete. It may also block the entire
// program.
func GC() {
- gcStart(gcForceBlockMode, false)
+ // We consider a cycle to be: sweep termination, mark, mark
+ // termination, and sweep. This function shouldn't return
+ // until a full cycle has been completed, from beginning to
+ // end. Hence, we always want to finish up the current cycle
+ // and start a new one. That means:
+ //
+ // 1. In sweep termination, mark, or mark termination of cycle
+ // N, wait until mark termination N completes and transitions
+ // to sweep N.
+ //
+ // 2. In sweep N, help with sweep N.
+ //
+ // At this point we can begin a full cycle N+1.
+ //
+ // 3. Trigger cycle N+1 by starting sweep termination N+1.
+ //
+ // 4. Wait for mark termination N+1 to complete.
+ //
+ // 5. Help with sweep N+1 until it's done.
+ //
+ // This all has to be written to deal with the fact that the
+ // GC may move ahead on its own. For example, when we block
+ // until mark termination N, we may wake up in cycle N+2.
+
+ gp := getg()
+
+ // Prevent the GC phase or cycle count from changing.
+ lock(&work.sweepWaiters.lock)
+ n := atomic.Load(&work.cycles)
+ if gcphase == _GCmark {
+ // Wait until sweep termination, mark, and mark
+ // termination of cycle N complete.
+ gp.schedlink = work.sweepWaiters.head
+ work.sweepWaiters.head.set(gp)
+ goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
+ } else {
+ // We're in sweep N already.
+ unlock(&work.sweepWaiters.lock)
+ }
+
+ // We're now in sweep N or later. Trigger GC cycle N+1, which
+ // will first finish sweep N if necessary and then enter sweep
+ // termination N+1.
+ gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerCycle, n: n + 1})
+
+ // Wait for mark termination N+1 to complete.
+ lock(&work.sweepWaiters.lock)
+ if gcphase == _GCmark && atomic.Load(&work.cycles) == n+1 {
+ gp.schedlink = work.sweepWaiters.head
+ work.sweepWaiters.head.set(gp)
+ goparkunlock(&work.sweepWaiters.lock, "wait for GC cycle", traceEvGoBlock, 1)
+ } else {
+ unlock(&work.sweepWaiters.lock)
+ }
+
+ // Finish sweep N+1 before returning. We do this both to
+ // complete the cycle and because runtime.GC() is often used
+ // as part of tests and benchmarks to get the system into a
+ // relatively stable and isolated state.
+ for atomic.Load(&work.cycles) == n+1 && gosweepone() != ^uintptr(0) {
+ sweep.nbgsweep++
+ Gosched()
+ }
+
+ // Callers may assume that the heap profile reflects the
+ // just-completed cycle when this returns (historically this
+ // happened because this was a STW GC), but right now the
+ // profile still reflects mark termination N, not N+1.
+ //
+ // As soon as all of the sweep frees from cycle N+1 are done,
+ // we can go ahead and publish the heap profile.
+ //
+ // First, wait for sweeping to finish. (We know there are no
+ // more spans on the sweep queue, but we may be concurrently
+ // sweeping spans, so we have to wait.)
+ for atomic.Load(&work.cycles) == n+1 && atomic.Load(&mheap_.sweepers) != 0 {
+ Gosched()
+ }
+
+ // Now we're really done with sweeping, so we can publish the
+ // stable heap profile. Only do this if we haven't already hit
+ // another mark termination.
+ mp := acquirem()
+ cycle := atomic.Load(&work.cycles)
+ if cycle == n+1 || (gcphase == _GCmark && cycle == n+2) {
+ mProf_PostSweep()
+ }
+ releasem(mp)
}
// gcMode indicates how concurrent a GC cycle should be.
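
A user-level illustration of the contract the rewritten runtime.GC above provides: when it returns, a full cycle (sweep termination, mark, mark termination, sweep) has completed, so statistics reflect that finished cycle.

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var before, after runtime.MemStats
	runtime.ReadMemStats(&before)
	runtime.GC() // blocks until a complete cycle has finished
	runtime.ReadMemStats(&after)
	// The completed cycle is visible in the stats (usually a delta of 1,
	// more if background GC also ran in between).
	fmt.Println("GC cycles completed:", after.NumGC-before.NumGC)
}
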
@@ -910,24 +1116,75 @@
gcForceBlockMode // stop-the-world GC now and STW sweep (forced by user)
)
-// gcShouldStart returns true if the exit condition for the _GCoff
-// phase has been met. The exit condition should be tested when
-// allocating.
-//
-// If forceTrigger is true, it ignores the current heap size, but
-// checks all other conditions. In general this should be false.
-func gcShouldStart(forceTrigger bool) bool {
- return gcphase == _GCoff && (forceTrigger || memstats.heap_live >= memstats.gc_trigger) && memstats.enablegc && panicking == 0 && gcpercent >= 0
+// A gcTrigger is a predicate for starting a GC cycle. Specifically,
+// it is an exit condition for the _GCoff phase.
+type gcTrigger struct {
+ kind gcTriggerKind
+ now int64 // gcTriggerTime: current time
+ n uint32 // gcTriggerCycle: cycle number to start
}
-// gcStart transitions the GC from _GCoff to _GCmark (if mode ==
-// gcBackgroundMode) or _GCmarktermination (if mode !=
-// gcBackgroundMode) by performing sweep termination and GC
-// initialization.
+type gcTriggerKind int
+
+const (
+ // gcTriggerAlways indicates that a cycle should be started
+ // unconditionally, even if GOGC is off or we're in a cycle
+ // right now. This cannot be consolidated with other cycles.
+ gcTriggerAlways gcTriggerKind = iota
+
+ // gcTriggerHeap indicates that a cycle should be started when
+ // the heap size reaches the trigger heap size computed by the
+ // controller.
+ gcTriggerHeap
+
+ // gcTriggerTime indicates that a cycle should be started when
+ // it's been more than forcegcperiod nanoseconds since the
+ // previous GC cycle.
+ gcTriggerTime
+
+ // gcTriggerCycle indicates that a cycle should be started if
+ // we have not yet started cycle number gcTrigger.n (relative
+ // to work.cycles).
+ gcTriggerCycle
+)
+
+// test returns true if the trigger condition is satisfied, meaning
+// that the exit condition for the _GCoff phase has been met. The exit
+// condition should be tested when allocating.
+func (t gcTrigger) test() bool {
+ if !memstats.enablegc || panicking != 0 {
+ return false
+ }
+ if t.kind == gcTriggerAlways {
+ return true
+ }
+ if gcphase != _GCoff || gcpercent < 0 {
+ return false
+ }
+ switch t.kind {
+ case gcTriggerHeap:
+ // Non-atomic access to heap_live for performance. If
+ // we are going to trigger on this, this thread just
+ // atomically wrote heap_live anyway and we'll see our
+ // own write.
+ return memstats.heap_live >= memstats.gc_trigger
+ case gcTriggerTime:
+ lastgc := int64(atomic.Load64(&memstats.last_gc_nanotime))
+ return lastgc != 0 && t.now-lastgc > forcegcperiod
+ case gcTriggerCycle:
+ // t.n > work.cycles, but accounting for wraparound.
+ return int32(t.n-work.cycles) > 0
+ }
+ return true
+}
+
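
The gcTriggerCycle case above compares cycle numbers as int32(t.n - work.cycles) > 0 so the test keeps working when the uint32 counter wraps. A standalone check of that arithmetic:

package main

import "fmt"

func cycleTrigger(n, cycles uint32) bool {
	// Same comparison as gcTrigger.test for gcTriggerCycle.
	return int32(n-cycles) > 0
}

func main() {
	fmt.Println(cycleTrigger(5, 4))          // true: cycle 5 not started yet
	fmt.Println(cycleTrigger(5, 5))          // false: already started
	fmt.Println(cycleTrigger(2, 0xFFFFFFFF)) // true: counter wrapped, 2 is still "later"
}
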
+// gcStart transitions the GC from _GCoff to _GCmark (for a concurrent
+// cycle) or _GCmarktermination (for a STW cycle) by performing sweep
+// termination and GC initialization.
//
// This may return without performing this transition in some cases,
// such as when called on a system stack or with locks held.
-func gcStart(mode gcMode, forceTrigger bool) {
+func gcStart(mode gcMode, trigger gcTrigger) {
// Since this is called from malloc and malloc is called in
// the guts of a number of libraries that might be holding
// locks, don't attempt to start GC in non-preemptible or
@@ -950,29 +1207,21 @@
//
// We check the transition condition continuously here in case
// this G gets delayed in to the next GC cycle.
- for (mode != gcBackgroundMode || gcShouldStart(forceTrigger)) && gosweepone() != ^uintptr(0) {
+ for trigger.test() && gosweepone() != ^uintptr(0) {
sweep.nbgsweep++
}
// Perform GC initialization and the sweep termination
// transition.
- //
- // If this is a forced GC, don't acquire the transition lock
- // or re-check the transition condition because we
- // specifically *don't* want to share the transition with
- // another thread.
- useStartSema := mode == gcBackgroundMode
- if useStartSema {
- semacquire(&work.startSema, 0)
- // Re-check transition condition under transition lock.
- if !gcShouldStart(forceTrigger) {
- semrelease(&work.startSema)
- return
- }
+ semacquire(&work.startSema)
+ // Re-check transition condition under transition lock.
+ if !trigger.test() {
+ semrelease(&work.startSema)
+ return
}
// For stats, check if this GC was forced by the user.
- forced := mode != gcBackgroundMode
+ work.userForced = trigger.kind == gcTriggerAlways || trigger.kind == gcTriggerCycle
// In gcstoptheworld debug mode, upgrade the mode accordingly.
// We do this after re-checking the transition condition so
@@ -987,7 +1236,7 @@
}
// Ok, we're doing it! Stop everybody else
- semacquire(&worldsema, 0)
+ semacquire(&worldsema)
if trace.enabled {
traceGCStart()
@@ -999,13 +1248,13 @@
gcResetMarkState()
- now := nanotime()
work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
- work.tSweepTerm = now
- work.heap0 = memstats.heap_live
+ work.heap0 = atomic.Load64(&memstats.heap_live)
work.pauseNS = 0
work.mode = mode
+ now := nanotime()
+ work.tSweepTerm = now
work.pauseStart = now
systemstack(stopTheWorldWithSema)
// Finish sweep before we start concurrent scan.
@@ -1016,6 +1265,7 @@
// reclaimed until the next GC cycle.
clearpools()
+ work.cycles++
if mode == gcBackgroundMode { // Do as much work concurrently as possible
gcController.startCycle()
work.heapGoal = memstats.next_gc
@@ -1028,18 +1278,7 @@
// the time we start the world and begin
// scanning.
//
- // It's necessary to enable write barriers
- // during the scan phase for several reasons:
- //
- // They must be enabled for writes to higher
- // stack frames before we scan stacks and
- // install stack barriers because this is how
- // we track writes to inactive stack frames.
- // (Alternatively, we could not install stack
- // barriers over frame boundaries with
- // up-pointers).
- //
- // They must be enabled before assists are
+ // Write barriers must be enabled before assists are
// enabled because they must be enabled before
// any non-leaf heap objects are marked. Since
// allocations are blocked until assists can
@@ -1078,17 +1317,11 @@
work.tMark, work.tMarkTerm = t, t
work.heapGoal = work.heap0
- if forced {
- memstats.numforcedgc++
- }
-
// Perform mark termination. This will restart the world.
- gcMarkTermination()
+ gcMarkTermination(memstats.triggerRatio)
}
- if useStartSema {
- semrelease(&work.startSema)
- }
+ semrelease(&work.startSema)
}
// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
@@ -1108,7 +1341,7 @@
// by mark termination.
func gcMarkDone() {
top:
- semacquire(&work.markDoneSema, 0)
+ semacquire(&work.markDoneSema)
// Re-check transition condition under transition lock.
if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
@@ -1203,14 +1436,14 @@
// endCycle depends on all gcWork cache stats being
// flushed. This is ensured by mark 2.
- gcController.endCycle()
+ nextTriggerRatio := gcController.endCycle()
// Perform mark termination. This will restart the world.
- gcMarkTermination()
+ gcMarkTermination(nextTriggerRatio)
}
}
-func gcMarkTermination() {
+func gcMarkTermination(nextTriggerRatio float64) {
// World is stopped.
// Start marktermination which includes enabling the write barrier.
atomic.Store(&gcBlackenEnabled, 0)
@@ -1292,11 +1525,17 @@
throw("gc done but gcphase != _GCoff")
}
+ // Update GC trigger and pacing for the next cycle.
+ gcSetTriggerRatio(nextTriggerRatio)
+
// Update timing memstats
- now, unixNow := nanotime(), unixnanotime()
+ now := nanotime()
+ sec, nsec, _ := time_now()
+ unixNow := sec*1e9 + int64(nsec)
work.pauseNS += now - work.pauseStart
work.tEnd = now
- atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+ atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
+ atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
memstats.pause_total_ns += uint64(work.pauseNS)
@@ -1314,36 +1553,40 @@
totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
- memstats.numgc++
-
// Reset sweep state.
sweep.nbgsweep = 0
sweep.npausesweep = 0
+ if work.userForced {
+ memstats.numforcedgc++
+ }
+
+ // Bump GC cycle count and wake goroutines waiting on sweep.
+ lock(&work.sweepWaiters.lock)
+ memstats.numgc++
+ injectglist(work.sweepWaiters.head.ptr())
+ work.sweepWaiters.head = 0
+ unlock(&work.sweepWaiters.lock)
+
+ // Finish the current heap profiling cycle and start a new
+ // heap profiling cycle. We do this before starting the world
+ // so events don't leak into the wrong cycle.
+ mProf_NextCycle()
+
systemstack(startTheWorldWithSema)
- // Update heap profile stats if gcSweep didn't do it. This is
- // relatively expensive, so we don't want to do it while the
- // world is stopped, but it needs to happen ASAP after
- // starting the world to prevent too many allocations from the
- // next cycle leaking in. It must happen before releasing
- // worldsema since there are applications that do a
- // runtime.GC() to update the heap profile and then
- // immediately collect the profile.
- if _ConcurrentSweep && work.mode != gcForceBlockMode {
- mProf_GC()
- }
+ // Flush the heap profile so we can start a new cycle next GC.
+ // This is relatively expensive, so we don't do it with the
+ // world stopped.
+ mProf_Flush()
+
+ // Prepare workbufs for freeing by the sweeper. We do this
+ // asynchronously because it can take non-trivial time.
+ prepareFreeWorkbufs()
// Free stack spans. This must be done between GC cycles.
systemstack(freeStackSpans)
- // Best-effort remove stack barriers so they don't get in the
- // way of things like GDB and perf.
- lock(&allglock)
- myallgs := allgs
- unlock(&allglock)
- gcTryRemoveAllStackBarriers(myallgs)
-
// Print gctrace before dropping worldsema. As soon as we drop
// worldsema another cycle could start and smash the stats
// we're trying to print.
@@ -1377,7 +1620,7 @@
work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
work.heapGoal>>20, " MB goal, ",
work.maxprocs, " P")
- if work.mode != gcBackgroundMode {
+ if work.userForced {
print(" (forced)")
}
print("\n")
@@ -1527,6 +1770,25 @@
default:
throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
case gcMarkWorkerDedicatedMode:
+ gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+ if gp.preempt {
+ // We were preempted. This is
+ // a useful signal to kick
+ // everything out of the run
+ // queue so it can run
+ // somewhere else.
+ lock(&sched.lock)
+ for {
+ gp, _ := runqget(_p_)
+ if gp == nil {
+ break
+ }
+ globrunqput(gp)
+ }
+ unlock(&sched.lock)
+ }
+ // Go back to draining, this time
+ // without preemption.
gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
case gcMarkWorkerFractionalMode:
gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
@@ -1599,7 +1861,7 @@
if p != nil && !p.gcw.empty() {
return true
}
- if atomic.Load64(&work.full) != 0 {
+ if !work.full.empty() {
return true // global work available
}
if work.markrootNext < work.markrootJobs {
@@ -1629,24 +1891,22 @@
work.ndone = 0
work.nproc = uint32(gcprocs())
- if debug.gcrescanstacks == 0 && work.full == 0 && work.nDataRoots+work.nBSSRoots+work.nSpanRoots+work.nStackRoots+work.nRescanRoots == 0 {
+ if work.full == 0 && work.nDataRoots+work.nBSSRoots+work.nSpanRoots+work.nStackRoots == 0 {
// There's no work on the work queue and no root jobs
// that can produce work, so don't bother entering the
// getfull() barrier.
//
- // With the hybrid barrier enabled, this will be the
- // situation the vast majority of the time after
- // concurrent mark. However, we still need a fallback
- // for STW GC and because there are some known races
- // that occasionally leave work around for mark
- // termination.
+ // This will be the situation the vast majority of the
+ // time after concurrent mark. However, we still need
+ // a fallback for STW GC and because there are some
+ // known races that occasionally leave work around for
+ // mark termination.
//
// We're still hedging our bets here: if we do
// accidentally produce some work, we'll still process
// it, just not necessarily in parallel.
//
- // TODO(austin): When we eliminate
- // debug.gcrescanstacks: fix the races, and remove
+ // TODO(austin): Fix the races and remove
// work draining from mark termination so we don't
// need the fallback path.
work.helperDrainBlock = false
@@ -1710,52 +1970,14 @@
// Update the marked heap stat.
memstats.heap_marked = work.bytesMarked
- // Trigger the next GC cycle when the allocated heap has grown
- // by triggerRatio over the marked heap size. Assume that
- // we're in steady state, so the marked heap size is the
- // same now as it was at the beginning of the GC cycle.
- memstats.gc_trigger = uint64(float64(memstats.heap_marked) * (1 + gcController.triggerRatio))
- if memstats.gc_trigger < heapminimum {
- memstats.gc_trigger = heapminimum
- }
- if int64(memstats.gc_trigger) < 0 {
- print("next_gc=", memstats.next_gc, " bytesMarked=", work.bytesMarked, " heap_live=", memstats.heap_live, " initialHeapLive=", work.initialHeapLive, "\n")
- throw("gc_trigger underflow")
- }
-
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
// flushallmcaches (which modifies heap_live).
memstats.heap_live = work.bytesMarked
memstats.heap_scan = uint64(gcController.scanWork)
- minTrigger := memstats.heap_live + sweepMinHeapDistance*uint64(gcpercent)/100
- if memstats.gc_trigger < minTrigger {
- // The allocated heap is already past the trigger.
- // This can happen if the triggerRatio is very low and
- // the marked heap is less than the live heap size.
- //
- // Concurrent sweep happens in the heap growth from
- // heap_live to gc_trigger, so bump gc_trigger up to ensure
- // that concurrent sweep has some heap growth in which
- // to perform sweeping before we start the next GC
- // cycle.
- memstats.gc_trigger = minTrigger
- }
-
- // The next GC cycle should finish before the allocated heap
- // has grown by GOGC/100.
- memstats.next_gc = memstats.heap_marked + memstats.heap_marked*uint64(gcpercent)/100
- if gcpercent < 0 {
- memstats.next_gc = ^uint64(0)
- }
- if memstats.next_gc < memstats.gc_trigger {
- memstats.next_gc = memstats.gc_trigger
- }
-
if trace.enabled {
traceHeapAlloc()
- traceNextGC()
}
}
@@ -1773,6 +1995,7 @@
// with an empty swept list.
throw("non-empty swept list")
}
+ mheap_.pagesSwept = 0
unlock(&mheap_.lock)
if !_ConcurrentSweep || mode == gcForceBlockMode {
@@ -1780,35 +2003,23 @@
// Record that no proportional sweeping has to happen.
lock(&mheap_.lock)
mheap_.sweepPagesPerByte = 0
- mheap_.pagesSwept = 0
unlock(&mheap_.lock)
// Sweep all spans eagerly.
for sweepone() != ^uintptr(0) {
sweep.npausesweep++
}
- // Do an additional mProf_GC, because all 'free' events are now real as well.
- mProf_GC()
- mProf_GC()
+ // Free workbufs eagerly.
+ prepareFreeWorkbufs()
+ for freeSomeWbufs(false) {
+ }
+ // All "free" events for this mark/sweep cycle have
+ // now happened, so we can make this profile cycle
+ // available immediately.
+ mProf_NextCycle()
+ mProf_Flush()
return
}
- // Concurrent sweep needs to sweep all of the in-use pages by
- // the time the allocated heap reaches the GC trigger. Compute
- // the ratio of in-use pages to sweep per byte allocated.
- heapDistance := int64(memstats.gc_trigger) - int64(memstats.heap_live)
- // Add a little margin so rounding errors and concurrent
- // sweep are less likely to leave pages unswept when GC starts.
- heapDistance -= 1024 * 1024
- if heapDistance < _PageSize {
- // Avoid setting the sweep ratio extremely high
- heapDistance = _PageSize
- }
- lock(&mheap_.lock)
- mheap_.sweepPagesPerByte = float64(mheap_.pagesInUse) / float64(heapDistance)
- mheap_.pagesSwept = 0
- mheap_.spanBytesAlloc = 0
- unlock(&mheap_.lock)
-
// Background sweep.
lock(&sweep.lock)
if sweep.parked {
@@ -1826,24 +2037,16 @@
func gcResetMarkState() {
// This may be called during a concurrent phase, so make sure
// allgs doesn't change.
- if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
- // Accessing gcRescan is unsafe.
- throw("bad GC phase")
- }
lock(&allglock)
for _, gp := range allgs {
gp.gcscandone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned
- gp.gcRescan = -1
gp.gcAssistBytes = 0
}
unlock(&allglock)
- // Clear rescan list.
- work.rescan.list = work.rescan.list[:0]
-
work.bytesMarked = 0
- work.initialHeapLive = memstats.heap_live
+ work.initialHeapLive = atomic.Load64(&memstats.heap_live)
work.markrootDone = false
}
@@ -1918,7 +2121,7 @@
traceGCScanDone()
}
- nproc := work.nproc // work.nproc can change right after we increment work.ndone
+ nproc := atomic.Load(&work.nproc) // work.nproc can change right after we increment work.ndone
if atomic.Xadd(&work.ndone, +1) == nproc-1 {
notewakeup(&work.alldone)
}
diff --git a/src/runtime/mgclarge.go b/src/runtime/mgclarge.go
new file mode 100644
index 0000000..757e88d
--- /dev/null
+++ b/src/runtime/mgclarge.go
@@ -0,0 +1,326 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Page heap.
+//
+// See malloc.go for the general overview.
+//
+// Large spans are the subject of this file. Spans consisting of fewer than
+// _MaxMHeapLists pages are held in lists of like-sized spans. Larger spans
+// are held in a treap. See https://en.wikipedia.org/wiki/Treap or
+// http://faculty.washington.edu/aragon/pubs/rst89.pdf for an overview.
+// sema.go also holds an implementation of a treap.
+//
+// Each treapNode holds a single span. The treap is sorted by page size
+// and for spans of the same size a secondary sort based on start address
+// is done.
+// Spans are returned based on a best fit algorithm and for spans of the same
+// size the one at the lowest address is selected.
+//
+// The primary routines are:
+// insert: adds a span to the treap
+// remove: removes the span from the treap that best fits the required size
+// removeSpan: removes a specific span from the treap
+//
+// _mheap.lock must be held when manipulating this data structure.
+
+package runtime
+
+import (
+ "unsafe"
+)
+
+//go:notinheap
+type mTreap struct {
+ treap *treapNode
+}
+
+//go:notinheap
+type treapNode struct {
+ right *treapNode // all treapNodes > this treap node
+ left *treapNode // all treapNodes < this treap node
+ parent *treapNode // direct parent of this node, nil if root
+ npagesKey uintptr // number of pages in spanKey, used as primary sort key
+ spanKey *mspan // span of size npagesKey, used as secondary sort key
+ priority uint32 // random number used by treap algorithm to keep tree probabilistically balanced
+}
+
+func (t *treapNode) init() {
+ t.right = nil
+ t.left = nil
+ t.parent = nil
+ t.spanKey = nil
+ t.npagesKey = 0
+ t.priority = 0
+}
+
+// isSpanInTreap is handy for debugging. One should hold the heap lock, usually
+// mheap_.lock.
+func (t *treapNode) isSpanInTreap(s *mspan) bool {
+ if t == nil {
+ return false
+ }
+ return t.spanKey == s || t.left.isSpanInTreap(s) || t.right.isSpanInTreap(s)
+}
+
+// walkTreap is handy for debugging.
+// Starting at some treapnode t, for example the root, do a depth first preorder walk of
+// the tree executing fn at each treap node. One should hold the heap lock, usually
+// mheap_.lock.
+func (t *treapNode) walkTreap(fn func(tn *treapNode)) {
+ if t == nil {
+ return
+ }
+ fn(t)
+ t.left.walkTreap(fn)
+ t.right.walkTreap(fn)
+}
+
+// checkTreapNode when used in conjunction with walkTreap can usually detect a
+// poorly formed treap.
+func checkTreapNode(t *treapNode) {
+ // lessThan is used to order the treap.
+ // npagesKey and npages are the primary keys.
+ // spanKey and span are the secondary keys.
+ // span == nil (0) will always be lessThan all
+ // spans of the same size.
+ lessThan := func(npages uintptr, s *mspan) bool {
+ if t.npagesKey != npages {
+ return t.npagesKey < npages
+ }
+ // t.npagesKey == npages
+ return uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(s))
+ }
+
+ if t == nil {
+ return
+ }
+ if t.spanKey.npages != t.npagesKey || t.spanKey.next != nil {
+ println("runtime: checkTreapNode treapNode t=", t, " t.npagesKey=", t.npagesKey,
+ "t.spanKey.npages=", t.spanKey.npages)
+ throw("why does span.npages and treap.ngagesKey do not match?")
+ }
+ if t.left != nil && lessThan(t.left.npagesKey, t.left.spanKey) {
+ throw("t.lessThan(t.left.npagesKey, t.left.spanKey) is not false")
+ }
+ if t.right != nil && !lessThan(t.right.npagesKey, t.right.spanKey) {
+ throw("!t.lessThan(t.left.npagesKey, t.left.spanKey) is not false")
+ }
+}
+
+// insert adds span to the large span treap.
+func (root *mTreap) insert(span *mspan) {
+ npages := span.npages
+ var last *treapNode
+ pt := &root.treap
+ for t := *pt; t != nil; t = *pt {
+ last = t
+ if t.npagesKey < npages {
+ pt = &t.right
+ } else if t.npagesKey > npages {
+ pt = &t.left
+ } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) {
+ // t.npagesKey == npages, so sort on span addresses.
+ pt = &t.right
+ } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) {
+ pt = &t.left
+ } else {
+ throw("inserting span already in treap")
+ }
+ }
+
+ // Add t as new leaf in tree of span size and unique addrs.
+ // The balanced tree is a treap using priority as the random heap priority.
+ // That is, it is a binary tree ordered according to the npagesKey,
+ // but then among the space of possible binary trees respecting those
+ // npagesKeys, it is kept balanced on average by maintaining a heap ordering
+// on the priority: s.priority <= both s.left.priority and s.right.priority.
+ // https://en.wikipedia.org/wiki/Treap
+ // http://faculty.washington.edu/aragon/pubs/rst89.pdf
+
+ t := (*treapNode)(mheap_.treapalloc.alloc())
+ t.init()
+ t.npagesKey = span.npages
+ t.priority = fastrand()
+ t.spanKey = span
+ t.parent = last
+ *pt = t // t now at a leaf.
+ // Rotate up into tree according to priority.
+ for t.parent != nil && t.parent.priority > t.priority {
+ if t != nil && t.spanKey.npages != t.npagesKey {
+ println("runtime: insert t=", t, "t.npagesKey=", t.npagesKey)
+ println("runtime: t.spanKey=", t.spanKey, "t.spanKey.npages=", t.spanKey.npages)
+ throw("span and treap sizes do not match?")
+ }
+ if t.parent.left == t {
+ root.rotateRight(t.parent)
+ } else {
+ if t.parent.right != t {
+ throw("treap insert finds a broken treap")
+ }
+ root.rotateLeft(t.parent)
+ }
+ }
+}
+
+func (root *mTreap) removeNode(t *treapNode) *mspan {
+ if t.spanKey.npages != t.npagesKey {
+ throw("span and treap node npages do not match")
+ }
+ result := t.spanKey
+
+ // Rotate t down to be leaf of tree for removal, respecting priorities.
+ for t.right != nil || t.left != nil {
+ if t.right == nil || t.left != nil && t.left.priority < t.right.priority {
+ root.rotateRight(t)
+ } else {
+ root.rotateLeft(t)
+ }
+ }
+ // Remove t, now a leaf.
+ if t.parent != nil {
+ if t.parent.left == t {
+ t.parent.left = nil
+ } else {
+ t.parent.right = nil
+ }
+ } else {
+ root.treap = nil
+ }
+ // Return the found treapNode's span after freeing the treapNode.
+ t.spanKey = nil
+ t.npagesKey = 0
+ mheap_.treapalloc.free(unsafe.Pointer(t))
+ return result
+}
+
+// remove searches for, finds, removes from the treap, and returns the smallest
+// span that can hold npages. If no span has at least npages return nil.
+// This is slightly more complicated than a simple binary tree search
+// since if an exact match is not found the next larger node is
+// returned.
+// The search stops at the last node whose npagesKey >= npages and whose
+// left child does not also satisfy that; this node is taken as the "best fit".
+func (root *mTreap) remove(npages uintptr) *mspan {
+ t := root.treap
+ for t != nil {
+ if t.spanKey == nil {
+ throw("treap node with nil spanKey found")
+ }
+ if t.npagesKey < npages {
+ t = t.right
+ } else if t.left != nil && t.left.npagesKey >= npages {
+ t = t.left
+ } else {
+ result := t.spanKey
+ root.removeNode(t)
+ return result
+ }
+ }
+ return nil
+}
+
+// removeSpan searches for, finds, and deletes span along with
+// the associated treap node. If the span is not in the treap,
+// t eventually becomes nil and the t.spanKey dereference in the
+// loop condition will crash.
+func (root *mTreap) removeSpan(span *mspan) {
+ npages := span.npages
+ t := root.treap
+ for t.spanKey != span {
+ if t.npagesKey < npages {
+ t = t.right
+ } else if t.npagesKey > npages {
+ t = t.left
+ } else if uintptr(unsafe.Pointer(t.spanKey)) < uintptr(unsafe.Pointer(span)) {
+ t = t.right
+ } else if uintptr(unsafe.Pointer(t.spanKey)) > uintptr(unsafe.Pointer(span)) {
+ t = t.left
+ }
+ }
+ root.removeNode(t)
+}
+
+// scavengetreap visits each node in the treap and scavenges the
+// treapNode's span.
+func scavengetreap(treap *treapNode, now, limit uint64) uintptr {
+ if treap == nil {
+ return 0
+ }
+ return scavengeTreapNode(treap, now, limit) +
+ scavengetreap(treap.left, now, limit) +
+ scavengetreap(treap.right, now, limit)
+}
+
+// rotateLeft rotates the tree rooted at node x.
+// turning (x a (y b c)) into (y (x a b) c).
+func (root *mTreap) rotateLeft(x *treapNode) {
+ // p -> (x a (y b c))
+ p := x.parent
+ a, y := x.left, x.right
+ b, c := y.left, y.right
+
+ y.left = x
+ x.parent = y
+ y.right = c
+ if c != nil {
+ c.parent = y
+ }
+ x.left = a
+ if a != nil {
+ a.parent = x
+ }
+ x.right = b
+ if b != nil {
+ b.parent = x
+ }
+
+ y.parent = p
+ if p == nil {
+ root.treap = y
+ } else if p.left == x {
+ p.left = y
+ } else {
+ if p.right != x {
+ throw("large span treap rotateLeft")
+ }
+ p.right = y
+ }
+}
+
+// rotateRight rotates the tree rooted at node y.
+// turning (y (x a b) c) into (x a (y b c)).
+func (root *mTreap) rotateRight(y *treapNode) {
+ // p -> (y (x a b) c)
+ p := y.parent
+ x, c := y.left, y.right
+ a, b := x.left, x.right
+
+ x.left = a
+ if a != nil {
+ a.parent = x
+ }
+ x.right = y
+ y.parent = x
+ y.left = b
+ if b != nil {
+ b.parent = y
+ }
+ y.right = c
+ if c != nil {
+ c.parent = y
+ }
+
+ x.parent = p
+ if p == nil {
+ root.treap = x
+ } else if p.left == y {
+ p.left = x
+ } else {
+ if p.right != y {
+ throw("large span treap rotateRight")
+ }
+ p.right = x
+ }
+}
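
A reference model for the best-fit semantics the comments in this file describe (smallest span with at least npages, lowest address among equal sizes), written against a plain sorted slice. The treap's descent in remove approximates this in O(log n); the program below is illustrative only and is not part of the change.

package main

import (
	"fmt"
	"sort"
)

type span struct{ npages, addr uintptr }

// bestFit returns the smallest span holding at least npages, preferring
// the lowest address among spans of equal size.
func bestFit(spans []span, npages uintptr) (span, bool) {
	sort.Slice(spans, func(i, j int) bool {
		if spans[i].npages != spans[j].npages {
			return spans[i].npages < spans[j].npages
		}
		return spans[i].addr < spans[j].addr
	})
	for _, s := range spans {
		if s.npages >= npages {
			return s, true
		}
	}
	return span{}, false
}

func main() {
	free := []span{{128, 0x300}, {130, 0x100}, {130, 0x200}, {200, 0x400}}
	s, ok := bestFit(free, 129)
	fmt.Println(ok, s.npages, s.addr) // true 130 256
}
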
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 85130bf..9029d19 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -107,21 +107,24 @@
// termination, allglen isn't changing, so we'll scan
// all Gs.
work.nStackRoots = int(atomic.Loaduintptr(&allglen))
- work.nRescanRoots = 0
} else {
// We've already scanned span roots and kept the scan
// up-to-date during concurrent mark.
work.nSpanRoots = 0
- // On the second pass of markroot, we're just scanning
- // dirty stacks. It's safe to access rescan since the
- // world is stopped.
+ // The hybrid barrier ensures that stacks can't
+ // contain pointers to unmarked objects, so on the
+ // second markroot, there's no need to scan stacks.
work.nStackRoots = 0
- work.nRescanRoots = len(work.rescan.list)
+
+ if debug.gcrescanstacks > 0 {
+ // Scan stacks anyway for debugging.
+ work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+ }
}
work.markrootNext = 0
- work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+ work.markrootJobs = uint32(fixedRootCount + work.nFlushCacheRoots + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
}
// gcMarkRootCheck checks that all roots have been scanned. It is
@@ -180,8 +183,7 @@
baseBSS := baseData + uint32(work.nDataRoots)
baseSpans := baseBSS + uint32(work.nBSSRoots)
baseStacks := baseSpans + uint32(work.nSpanRoots)
- baseRescan := baseStacks + uint32(work.nStackRoots)
- end := baseRescan + uint32(work.nRescanRoots)
+ end := baseStacks + uint32(work.nStackRoots)
// Note: if you add a case here, please also update heapdump.go:dumproots.
switch {
@@ -199,6 +201,11 @@
}
case i == fixedRootFinalizers:
+ // Only do this once per GC cycle since we don't call
+ // queuefinalizer during marking.
+ if work.markrootDone {
+ break
+ }
for fb := allfin; fb != nil; fb = fb.alllink {
cnt := uintptr(atomic.Load(&fb.cnt))
scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), cnt*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
@@ -220,15 +227,8 @@
default:
// the rest is scanning goroutine stacks
var gp *g
- if baseStacks <= i && i < baseRescan {
+ if baseStacks <= i && i < end {
gp = allgs[i-baseStacks]
- } else if baseRescan <= i && i < end {
- gp = work.rescan.list[i-baseRescan].ptr()
- if gp.gcRescan != int32(i-baseRescan) {
- // Looking for issue #17099.
- println("runtime: gp", gp, "found at rescan index", i-baseRescan, "but should be at", gp.gcRescan)
- throw("bad g rescan index")
- }
} else {
throw("markroot: bad index")
}
@@ -415,6 +415,7 @@
return
}
+ traced := false
retry:
// Compute the amount of scan work we need to do to make the
// balance positive. When the required amount of work is low,
@@ -450,10 +451,18 @@
if scanWork == 0 {
// We were able to steal all of the credit we
// needed.
+ if traced {
+ traceGCMarkAssistDone()
+ }
return
}
}
+ if trace.enabled && !traced {
+ traced = true
+ traceGCMarkAssistStart()
+ }
+
// Perform assist work
systemstack(func() {
gcAssistAlloc1(gp, scanWork)
@@ -496,6 +505,9 @@
// At this point either background GC has satisfied
// this G's assist debt, or the GC cycle is over.
}
+ if traced {
+ traceGCMarkAssistDone()
+ }
}
// gcAssistAlloc1 is the part of gcAssistAlloc that runs on the system
@@ -716,10 +728,6 @@
// scanstack scans gp's stack, greying all pointers found on the stack.
//
-// During mark phase, it also installs stack barriers while traversing
-// gp's stack. During mark termination, it stops scanning when it
-// reaches an unhit stack barrier.
-//
// scanstack is marked go:systemstack because it must not be preempted
// while using a workbuf.
//
@@ -762,94 +770,14 @@
shrinkstack(gp)
}
- // Prepare for stack barrier insertion/removal.
- var sp, barrierOffset, nextBarrier uintptr
- if gp.syscallsp != 0 {
- sp = gp.syscallsp
- } else {
- sp = gp.sched.sp
- }
- gcLockStackBarriers(gp) // Not necessary during mark term, but harmless.
- switch gcphase {
- case _GCmark:
- // Install stack barriers during stack scan.
- barrierOffset = uintptr(firstStackBarrierOffset)
- nextBarrier = sp + barrierOffset
-
- if debug.gcstackbarrieroff > 0 {
- nextBarrier = ^uintptr(0)
- }
-
- // Remove any existing stack barriers before we
- // install new ones.
- gcRemoveStackBarriers(gp)
-
- case _GCmarktermination:
- if !work.markrootDone {
- // This is a STW GC. There may be stale stack
- // barriers from an earlier cycle since we
- // never passed through mark phase.
- gcRemoveStackBarriers(gp)
- }
-
- if int(gp.stkbarPos) == len(gp.stkbar) {
- // gp hit all of the stack barriers (or there
- // were none). Re-scan the whole stack.
- nextBarrier = ^uintptr(0)
- } else {
- // Only re-scan up to the lowest un-hit
- // barrier. Any frames above this have not
- // executed since the concurrent scan of gp and
- // any writes through up-pointers to above
- // this barrier had write barriers.
- nextBarrier = gp.stkbar[gp.stkbarPos].savedLRPtr
- if debugStackBarrier {
- print("rescan below ", hex(nextBarrier), " in [", hex(sp), ",", hex(gp.stack.hi), ") goid=", gp.goid, "\n")
- }
- }
-
- default:
- throw("scanstack in wrong phase")
- }
-
// Scan the stack.
var cache pcvalueCache
- n := 0
scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
scanframeworker(frame, &cache, gcw)
-
- if frame.fp > nextBarrier {
- // We skip installing a barrier on bottom-most
- // frame because on LR machines this LR is not
- // on the stack.
- if gcphase == _GCmark && n != 0 {
- if gcInstallStackBarrier(gp, frame) {
- barrierOffset *= 2
- nextBarrier = sp + barrierOffset
- }
- } else if gcphase == _GCmarktermination {
- // We just scanned a frame containing
- // a return to a stack barrier. Since
- // this frame never returned, we can
- // stop scanning.
- return false
- }
- }
- n++
-
return true
}
gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
tracebackdefers(gp, scanframe, nil)
- gcUnlockStackBarriers(gp)
- if gcphase == _GCmark {
- // gp may have added itself to the rescan list between
- // when GC started and now. It's clean now, so remove
- // it. This isn't safe during mark termination because
- // mark termination is consuming this list, but it's
- // also not necessary.
- dequeueRescan(gp)
- }
gp.gcscanvalid = true
}
@@ -926,73 +854,6 @@
}
}
-// queueRescan adds gp to the stack rescan list and clears
-// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
-// already on the rescan list.
-func queueRescan(gp *g) {
- if debug.gcrescanstacks == 0 {
- // Clear gcscanvalid to keep assertions happy.
- //
- // TODO: Remove gcscanvalid entirely when we remove
- // stack rescanning.
- gp.gcscanvalid = false
- return
- }
-
- if gcphase == _GCoff {
- gp.gcscanvalid = false
- return
- }
- if gp.gcRescan != -1 {
- throw("g already on rescan list")
- }
-
- lock(&work.rescan.lock)
- gp.gcscanvalid = false
-
- // Recheck gcphase under the lock in case there was a phase change.
- if gcphase == _GCoff {
- unlock(&work.rescan.lock)
- return
- }
- if len(work.rescan.list) == cap(work.rescan.list) {
- throw("rescan list overflow")
- }
- n := len(work.rescan.list)
- gp.gcRescan = int32(n)
- work.rescan.list = work.rescan.list[:n+1]
- work.rescan.list[n].set(gp)
- unlock(&work.rescan.lock)
-}
-
-// dequeueRescan removes gp from the stack rescan list, if gp is on
-// the rescan list. The caller must own gp.
-func dequeueRescan(gp *g) {
- if debug.gcrescanstacks == 0 {
- return
- }
-
- if gp.gcRescan == -1 {
- return
- }
- if gcphase == _GCoff {
- gp.gcRescan = -1
- return
- }
-
- lock(&work.rescan.lock)
- if work.rescan.list[gp.gcRescan].ptr() != gp {
- throw("bad dequeueRescan")
- }
- // Careful: gp may itself be the last G on the list.
- last := work.rescan.list[len(work.rescan.list)-1]
- work.rescan.list[gp.gcRescan] = last
- last.ptr().gcRescan = gp.gcRescan
- gp.gcRescan = -1
- work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
- unlock(&work.rescan.lock)
-}
-
type gcDrainFlags int
const (
@@ -1268,7 +1129,7 @@
// paths), in which case we must *not* enqueue
// oblets since their bitmaps will be
// uninitialized.
- if !hbits.hasPointers(n) {
+ if s.spanclass.noscan() {
// Bypass the whole scan.
gcw.bytesMarked += uint64(n)
return
@@ -1371,6 +1232,7 @@
// Dump the object
gcDumpObject("obj", obj, ^uintptr(0))
+ getg().m.traceback = 2
throw("checkmark found unmarked object")
}
if hbits.isCheckmarked(span.elemsize) {
@@ -1385,6 +1247,7 @@
print("runtime: marking free object ", hex(obj), " found at *(", hex(base), "+", hex(off), ")\n")
gcDumpObject("base", base, off)
gcDumpObject("obj", obj, ^uintptr(0))
+ getg().m.traceback = 2
throw("marking free object")
}
@@ -1396,7 +1259,7 @@
atomic.Or8(mbits.bytep, mbits.mask)
// If this is a noscan object, fast-track it to black
// instead of greying it.
- if !hbits.hasPointers(span.elemsize) {
+ if span.spanclass.noscan() {
gcw.bytesMarked += uint64(span.elemsize)
return
}
@@ -1429,7 +1292,7 @@
print(" s=nil\n")
return
}
- print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, " s.state=")
+ print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.spanclass=", s.spanclass, " s.elemsize=", s.elemsize, " s.state=")
if 0 <= s.state && int(s.state) < len(mSpanStateNames) {
print(mSpanStateNames[s.state], "\n")
} else {
@@ -1438,7 +1301,7 @@
skipped := false
size := s.elemsize
- if s.state == _MSpanStack && size == 0 {
+ if s.state == _MSpanManual && size == 0 {
// We're printing something from a stack frame. We
// don't know how big it is, so just show up to and
// including off.
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index fb5c488..1bb19ec 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -22,10 +22,6 @@
nbgsweep uint32
npausesweep uint32
-
- // pacertracegen is the sweepgen at which the last pacer trace
- // "sweep finished" message was printed.
- pacertracegen uint32
}
// finishsweep_m ensures that all spans are swept.
@@ -60,6 +56,9 @@
sweep.nbgsweep++
Gosched()
}
+ for freeSomeWbufs(true) {
+ Gosched()
+ }
lock(&sweep.lock)
if !gosweepdone() {
// This can happen if a GC runs between
@@ -78,20 +77,24 @@
//go:nowritebarrier
func sweepone() uintptr {
_g_ := getg()
+ sweepRatio := mheap_.sweepPagesPerByte // For debugging
// increment locks to ensure that the goroutine is not preempted
// in the middle of sweep thus leaving the span in an inconsistent state for next GC
_g_.m.locks++
+ if atomic.Load(&mheap_.sweepdone) != 0 {
+ _g_.m.locks--
+ return ^uintptr(0)
+ }
+ atomic.Xadd(&mheap_.sweepers, +1)
+
+ npages := ^uintptr(0)
sg := mheap_.sweepgen
for {
s := mheap_.sweepSpans[1-sg/2%2].pop()
if s == nil {
- mheap_.sweepdone = 1
- _g_.m.locks--
- if debug.gcpacertrace > 0 && atomic.Cas(&sweep.pacertracegen, sg-2, sg) {
- print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
- }
- return ^uintptr(0)
+ atomic.Store(&mheap_.sweepdone, 1)
+ break
}
if s.state != mSpanInUse {
// This can happen if direct sweeping already
@@ -106,16 +109,25 @@
if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
continue
}
- npages := s.npages
+ npages = s.npages
if !s.sweep(false) {
// Span is still in-use, so this returned no
// pages to the heap and the span needs to
// move to the swept in-use list.
npages = 0
}
- _g_.m.locks--
- return npages
+ break
}
+
+ // Decrement the number of active sweepers and if this is the
+ // last one print trace information.
+ if atomic.Xadd(&mheap_.sweepers, -1) == 0 && atomic.Load(&mheap_.sweepdone) != 0 {
+ if debug.gcpacertrace > 0 {
+ print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", (memstats.heap_live-mheap_.sweepHeapLiveBasis)>>20, "MB during sweep; swept ", mheap_.pagesSwept, " pages at ", sweepRatio, " pages/byte\n")
+ }
+ }
+ _g_.m.locks--
+ return npages
}
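The sweeper accounting above follows a last-one-out pattern: every caller registers in mheap_.sweepers before touching a span, and whichever caller's decrement drops the count to zero after sweepdone is set prints the pacer trace exactly once. A standalone sketch of that pattern (the names and worker body here are illustrative, not runtime code):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

func main() {
	var sweepers int32   // number of active sweepers, like mheap_.sweepers
	var sweepdone uint32 // set once there is nothing left to sweep
	var wg sync.WaitGroup

	for id := 0; id < 4; id++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			if atomic.LoadUint32(&sweepdone) != 0 {
				return // nothing to do; never registered
			}
			atomic.AddInt32(&sweepers, 1)
			// ... sweep one span; the worker that finds the list empty
			// sets sweepdone, as sweepone does ...
			atomic.StoreUint32(&sweepdone, 1)
			// The last registered worker to leave reports the summary.
			if atomic.AddInt32(&sweepers, -1) == 0 && atomic.LoadUint32(&sweepdone) != 0 {
				fmt.Println("sweep done; reported once, by worker", id)
			}
		}(id)
	}
	wg.Wait()
}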
//go:nowritebarrier
@@ -178,15 +190,14 @@
}
if trace.enabled {
- traceGCSweepStart()
+ traceGCSweepSpan(s.npages * _PageSize)
}
atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
- cl := s.sizeclass
+ spc := s.spanclass
size := s.elemsize
res := false
- nfree := 0
c := _g_.m.mcache
freeToHeap := false
@@ -276,21 +287,23 @@
}
// Count the number of free objects in this span.
- nfree = s.countFree()
- if cl == 0 && nfree != 0 {
+ nalloc := uint16(s.countAlloc())
+ if spc.sizeclass() == 0 && nalloc == 0 {
s.needzero = 1
freeToHeap = true
}
- nalloc := uint16(s.nelems) - uint16(nfree)
nfreed := s.allocCount - nalloc
if nalloc > s.allocCount {
- print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+ print("runtime: nelems=", s.nelems, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
throw("sweep increased allocation count")
}
s.allocCount = nalloc
wasempty := s.nextFreeIndex() == s.nelems
s.freeindex = 0 // reset allocation index to start of span.
+ if trace.enabled {
+ getg().m.p.ptr().traceReclaimed += uintptr(nfreed) * s.elemsize
+ }
// gcmarkBits becomes the allocBits.
// get a fresh cleared gcmarkBits in preparation for next GC
@@ -318,9 +331,9 @@
atomic.Store(&s.sweepgen, sweepgen)
}
- if nfreed > 0 && cl != 0 {
- c.local_nsmallfree[cl] += uintptr(nfreed)
- res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
+ if nfreed > 0 && spc.sizeclass() != 0 {
+ c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+ res = mheap_.central[spc].mcentral.freeSpan(s, preserve, wasempty)
// MCentral_FreeSpan updates sweepgen
} else if freeToHeap {
// Free large span to heap
@@ -354,9 +367,6 @@
// it on the swept in-use list.
mheap_.sweepSpans[sweepgen/2%2].push(s)
}
- if trace.enabled {
- traceGCSweepDone()
- }
return res
}
@@ -369,8 +379,7 @@
//
// deductSweepCredit makes a worst-case assumption that all spanBytes
// bytes of the ultimately allocated span will be available for object
-// allocation. The caller should call reimburseSweepCredit if that
-// turns out not to be the case once the span is allocated.
+// allocation.
//
// deductSweepCredit is the core of the "proportional sweep" system.
// It uses statistics gathered by the garbage collector to perform
@@ -384,31 +393,28 @@
return
}
- // Account for this span allocation.
- spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+ if trace.enabled {
+ traceGCSweepStart()
+ }
+
+retry:
+ sweptBasis := atomic.Load64(&mheap_.pagesSweptBasis)
// Fix debt if necessary.
- pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
- for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+ newHeapLive := uintptr(atomic.Load64(&memstats.heap_live)-mheap_.sweepHeapLiveBasis) + spanBytes
+ pagesTarget := int64(mheap_.sweepPagesPerByte*float64(newHeapLive)) - int64(callerSweepPages)
+ for pagesTarget > int64(atomic.Load64(&mheap_.pagesSwept)-sweptBasis) {
if gosweepone() == ^uintptr(0) {
mheap_.sweepPagesPerByte = 0
break
}
+ if atomic.Load64(&mheap_.pagesSweptBasis) != sweptBasis {
+ // Sweep pacing changed. Recompute debt.
+ goto retry
+ }
}
-}
-// reimburseSweepCredit records that unusableBytes bytes of a
-// just-allocated span are not available for object allocation. This
-// offsets the worst-case charge performed by deductSweepCredit.
-func reimburseSweepCredit(unusableBytes uintptr) {
- if mheap_.sweepPagesPerByte == 0 {
- // Nobody cares about the credit. Avoid the atomic.
- return
- }
- nval := atomic.Xadd64(&mheap_.spanBytesAlloc, -int64(unusableBytes))
- if int64(nval) < 0 {
- // Debugging for #18043.
- print("runtime: bad spanBytesAlloc=", nval, " (was ", nval+uint64(unusableBytes), ") unusableBytes=", unusableBytes, " sweepPagesPerByte=", mheap_.sweepPagesPerByte, "\n")
- throw("spanBytesAlloc underflow")
+ if trace.enabled {
+ traceGCSweepDone()
}
}
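The retry loop above keeps the mutator "in the black" against the proportional sweep line: pages swept since the basis point must stay at or above sweepPagesPerByte times the heap_live gained since the basis, less the pages the caller will sweep itself. A small sketch of that debt calculation (all inputs are made up for illustration):

package main

import "fmt"

// pagesTarget mirrors the debt computation in deductSweepCredit: how many
// pages must already be swept (beyond pagesSweptBasis) before an allocation
// of spanBytes is paid for.
func pagesTarget(sweepPagesPerByte float64, heapLive, heapLiveBasis, spanBytes uint64, callerSweepPages int64) int64 {
	newHeapLive := heapLive - heapLiveBasis + spanBytes
	return int64(sweepPagesPerByte*float64(newHeapLive)) - callerSweepPages
}

func main() {
	// A 0.002 pages/byte ratio, 48 MB of heap growth since the basis point,
	// and one 8-page (64 KB) span that the caller sweeps itself.
	t := pagesTarget(0.002, 112<<20, 64<<20, 64<<10, 8)
	fmt.Println("pages that must be swept before allocating:", t)
}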
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 5eb05a7..461679b 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -12,8 +12,22 @@
const (
_WorkbufSize = 2048 // in bytes; larger values result in less contention
+
+ // workbufAlloc is the number of bytes to allocate at a time
+ // for new workbufs. This must be a multiple of pageSize and
+ // should be a multiple of _WorkbufSize.
+ //
+ // Larger values reduce workbuf allocation overhead. Smaller
+ // values reduce heap fragmentation.
+ workbufAlloc = 32 << 10
)
+func init() {
+ if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
+ throw("bad workbufAlloc")
+ }
+}
+
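Given the init check above, a span obtained for workbufs always slices evenly. A quick standalone check of the arithmetic (pageSize is assumed to be the runtime's 8 KB page for this sketch):

package main

import "fmt"

const (
	_WorkbufSize = 2048     // bytes per workbuf, as above
	workbufAlloc = 32 << 10 // bytes allocated per manually-managed span
	pageSize     = 8 << 10  // assumed runtime page size
)

func main() {
	fmt.Println("pages per workbuf span:", workbufAlloc/pageSize)
	fmt.Println("workbufs per span:     ", workbufAlloc/_WorkbufSize)
	fmt.Println("divides evenly:        ", workbufAlloc%pageSize == 0 && workbufAlloc%_WorkbufSize == 0)
}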
// Garbage collector work pool abstraction.
//
// This implements a producer/consumer model for pointers to grey
@@ -25,21 +39,6 @@
// grey objects, thus blackening them, and then scans them,
// potentially producing new pointers to grey objects.
-// A wbufptr holds a workbuf*, but protects it from write barriers.
-// workbufs never live on the heap, so write barriers are unnecessary.
-// Write barriers on workbuf pointers may also be dangerous in the GC.
-//
-// TODO: Since workbuf is now go:notinheap, this isn't necessary.
-type wbufptr uintptr
-
-func wbufptrOf(w *workbuf) wbufptr {
- return wbufptr(unsafe.Pointer(w))
-}
-
-func (wp wbufptr) ptr() *workbuf {
- return (*workbuf)(unsafe.Pointer(wp))
-}
-
// A gcWork provides the interface to produce and consume work for the
// garbage collector.
//
@@ -75,7 +74,7 @@
// next.
//
// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
- wbuf1, wbuf2 wbufptr
+ wbuf1, wbuf2 *workbuf
// Bytes marked (blackened) on this gcWork. This is aggregated
// into work.bytesMarked by dispose.
@@ -87,12 +86,12 @@
}
func (w *gcWork) init() {
- w.wbuf1 = wbufptrOf(getempty())
+ w.wbuf1 = getempty()
wbuf2 := trygetfull()
if wbuf2 == nil {
wbuf2 = getempty()
}
- w.wbuf2 = wbufptrOf(wbuf2)
+ w.wbuf2 = wbuf2
}
// put enqueues a pointer for the garbage collector to trace.
@@ -100,18 +99,18 @@
//go:nowritebarrier
func (w *gcWork) put(obj uintptr) {
flushed := false
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
} else if wbuf.nobj == len(wbuf.obj) {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == len(wbuf.obj) {
putfull(wbuf)
wbuf = getempty()
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
flushed = true
}
}
@@ -132,7 +131,7 @@
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrier
func (w *gcWork) putFast(obj uintptr) bool {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
return false
} else if wbuf.nobj == len(wbuf.obj) {
@@ -151,15 +150,15 @@
// other gcWork instances or other caches.
//go:nowritebarrier
func (w *gcWork) tryGet() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
}
if wbuf.nobj == 0 {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == 0 {
owbuf := wbuf
wbuf = trygetfull()
@@ -167,7 +166,7 @@
return 0
}
putempty(owbuf)
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
}
}
@@ -180,7 +179,7 @@
// the caller is expected to call tryGet().
//go:nowritebarrier
func (w *gcWork) tryGetFast() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
return 0
}
@@ -197,15 +196,15 @@
// been retrieved. get returns 0 if there are no pointers remaining.
//go:nowritebarrier
func (w *gcWork) get() uintptr {
- wbuf := w.wbuf1.ptr()
+ wbuf := w.wbuf1
if wbuf == nil {
w.init()
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
// wbuf is empty at this point.
}
if wbuf.nobj == 0 {
w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
- wbuf = w.wbuf1.ptr()
+ wbuf = w.wbuf1
if wbuf.nobj == 0 {
owbuf := wbuf
wbuf = getfull()
@@ -213,7 +212,7 @@
return 0
}
putempty(owbuf)
- w.wbuf1 = wbufptrOf(wbuf)
+ w.wbuf1 = wbuf
}
}
@@ -231,21 +230,21 @@
//
//go:nowritebarrier
func (w *gcWork) dispose() {
- if wbuf := w.wbuf1.ptr(); wbuf != nil {
+ if wbuf := w.wbuf1; wbuf != nil {
if wbuf.nobj == 0 {
putempty(wbuf)
} else {
putfull(wbuf)
}
- w.wbuf1 = 0
+ w.wbuf1 = nil
- wbuf = w.wbuf2.ptr()
+ wbuf = w.wbuf2
if wbuf.nobj == 0 {
putempty(wbuf)
} else {
putfull(wbuf)
}
- w.wbuf2 = 0
+ w.wbuf2 = nil
}
if w.bytesMarked != 0 {
// dispose happens relatively infrequently. If this
@@ -265,14 +264,14 @@
// global queue.
//go:nowritebarrier
func (w *gcWork) balance() {
- if w.wbuf1 == 0 {
+ if w.wbuf1 == nil {
return
}
- if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+ if wbuf := w.wbuf2; wbuf.nobj != 0 {
putfull(wbuf)
- w.wbuf2 = wbufptrOf(getempty())
- } else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
- w.wbuf1 = wbufptrOf(handoff(wbuf))
+ w.wbuf2 = getempty()
+ } else if wbuf := w.wbuf1; wbuf.nobj > 4 {
+ w.wbuf1 = handoff(wbuf)
} else {
return
}
@@ -285,7 +284,7 @@
// empty returns true if w has no mark work available.
//go:nowritebarrier
func (w *gcWork) empty() bool {
- return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
+ return w.wbuf1 == nil || (w.wbuf1.nobj == 0 && w.wbuf2.nobj == 0)
}
// Internally, the GC work pool is kept in arrays in work buffers.
@@ -327,23 +326,56 @@
func getempty() *workbuf {
var b *workbuf
if work.empty != 0 {
- b = (*workbuf)(lfstackpop(&work.empty))
+ b = (*workbuf)(work.empty.pop())
if b != nil {
b.checkempty()
}
}
if b == nil {
- b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), sys.CacheLineSize, &memstats.gc_sys))
+ // Allocate more workbufs.
+ var s *mspan
+ if work.wbufSpans.free.first != nil {
+ lock(&work.wbufSpans.lock)
+ s = work.wbufSpans.free.first
+ if s != nil {
+ work.wbufSpans.free.remove(s)
+ work.wbufSpans.busy.insert(s)
+ }
+ unlock(&work.wbufSpans.lock)
+ }
+ if s == nil {
+ systemstack(func() {
+ s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
+ })
+ if s == nil {
+ throw("out of memory")
+ }
+ // Record the new span in the busy list.
+ lock(&work.wbufSpans.lock)
+ work.wbufSpans.busy.insert(s)
+ unlock(&work.wbufSpans.lock)
+ }
+ // Slice up the span into new workbufs. Return one and
+ // put the rest on the empty list.
+ for i := uintptr(0); i+_WorkbufSize <= workbufAlloc; i += _WorkbufSize {
+ newb := (*workbuf)(unsafe.Pointer(s.base() + i))
+ newb.nobj = 0
+ if i == 0 {
+ b = newb
+ } else {
+ putempty(newb)
+ }
+ }
}
return b
}
// putempty puts a workbuf onto the work.empty list.
-// Upon entry this go routine owns b. The lfstackpush relinquishes ownership.
+// Upon entry this goroutine owns b. The lfstack.push relinquishes ownership.
//go:nowritebarrier
func putempty(b *workbuf) {
b.checkempty()
- lfstackpush(&work.empty, &b.node)
+ work.empty.push(&b.node)
}
// putfull puts the workbuf on the work.full list for the GC.
@@ -352,14 +384,14 @@
//go:nowritebarrier
func putfull(b *workbuf) {
b.checknonempty()
- lfstackpush(&work.full, &b.node)
+ work.full.push(&b.node)
}
// trygetfull tries to get a full or partially empty workbuffer.
// If one is not immediately available return nil
//go:nowritebarrier
func trygetfull() *workbuf {
- b := (*workbuf)(lfstackpop(&work.full))
+ b := (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -380,7 +412,7 @@
// phase.
//go:nowritebarrier
func getfull() *workbuf {
- b := (*workbuf)(lfstackpop(&work.full))
+ b := (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -398,7 +430,7 @@
println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
throw("work.nwait > work.nproc")
}
- b = (*workbuf)(lfstackpop(&work.full))
+ b = (*workbuf)(work.full.pop())
if b != nil {
b.checknonempty()
return b
@@ -412,15 +444,11 @@
if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
return nil
}
- _g_ := getg()
if i < 10 {
- _g_.m.gcstats.nprocyield++
procyield(20)
} else if i < 20 {
- _g_.m.gcstats.nosyield++
osyield()
} else {
- _g_.m.gcstats.nsleep++
usleep(100)
}
}
@@ -434,11 +462,49 @@
b.nobj -= n
b1.nobj = n
memmove(unsafe.Pointer(&b1.obj[0]), unsafe.Pointer(&b.obj[b.nobj]), uintptr(n)*unsafe.Sizeof(b1.obj[0]))
- _g_ := getg()
- _g_.m.gcstats.nhandoff++
- _g_.m.gcstats.nhandoffcnt += uint64(n)
// Put b on full list - let first half of b get stolen.
putfull(b)
return b1
}
+
+// prepareFreeWorkbufs moves busy workbuf spans to the free list so they
+// can be freed to the heap. This must only be called when all
+// workbufs are on the empty list.
+func prepareFreeWorkbufs() {
+ lock(&work.wbufSpans.lock)
+ if work.full != 0 {
+ throw("cannot free workbufs when work.full != 0")
+ }
+ // Since all workbufs are on the empty list, we don't care
+ // which ones are in which spans. We can wipe the entire empty
+ // list and move all workbuf spans to the free list.
+ work.empty = 0
+ work.wbufSpans.free.takeAll(&work.wbufSpans.busy)
+ unlock(&work.wbufSpans.lock)
+}
+
+// freeSomeWbufs frees some workbufs back to the heap and returns
+// true if it should be called again to free more.
+func freeSomeWbufs(preemptible bool) bool {
+ const batchSize = 64 // ~1–2 µs per span.
+ lock(&work.wbufSpans.lock)
+ if gcphase != _GCoff || work.wbufSpans.free.isEmpty() {
+ unlock(&work.wbufSpans.lock)
+ return false
+ }
+ systemstack(func() {
+ gp := getg().m.curg
+ for i := 0; i < batchSize && !(preemptible && gp.preempt); i++ {
+ span := work.wbufSpans.free.first
+ if span == nil {
+ break
+ }
+ work.wbufSpans.free.remove(span)
+ mheap_.freeManual(span, &memstats.gc_sys)
+ }
+ })
+ more := !work.wbufSpans.free.isEmpty()
+ unlock(&work.wbufSpans.lock)
+ return more
+}
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index ef62eff..bf682ec 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -29,12 +29,13 @@
//go:notinheap
type mheap struct {
lock mutex
- free [_MaxMHeapList]mSpanList // free lists of given length
- freelarge mSpanList // free lists length >= _MaxMHeapList
- busy [_MaxMHeapList]mSpanList // busy lists of large objects of given length
- busylarge mSpanList // busy lists of large objects length >= _MaxMHeapList
+ free [_MaxMHeapList]mSpanList // free lists of given length up to _MaxMHeapList
+ freelarge mTreap // free treap of length >= _MaxMHeapList
+ busy [_MaxMHeapList]mSpanList // busy lists of large spans of given length
+ busylarge mSpanList // busy lists of large spans length >= _MaxMHeapList
sweepgen uint32 // sweep generation, see comment in mspan
sweepdone uint32 // all spans are swept
+ sweepers uint32 // number of active sweepone calls
// allspans is a slice of all mspans ever created. Each mspan
// appears exactly once.
@@ -74,37 +75,82 @@
_ uint32 // align uint64 fields on 32-bit for atomics
// Proportional sweep
- pagesInUse uint64 // pages of spans in stats _MSpanInUse; R/W with mheap.lock
- spanBytesAlloc uint64 // bytes of spans allocated this cycle; updated atomically
- pagesSwept uint64 // pages swept this cycle; updated atomically
- sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+ //
+ // These parameters represent a linear function from heap_live
+ // to page sweep count. The proportional sweep system works to
+ // stay in the black by keeping the current page sweep count
+ // above this line at the current heap_live.
+ //
+ // The line has slope sweepPagesPerByte and passes through a
+ // basis point at (sweepHeapLiveBasis, pagesSweptBasis). At
+ // any given time, the system is at (memstats.heap_live,
+ // pagesSwept) in this space.
+ //
+ // It's important that the line pass through a point we
+ // control rather than simply starting at a (0,0) origin
+ // because that lets us adjust sweep pacing at any time while
+ // accounting for current progress. If we could only adjust
+ // the slope, it would create a discontinuity in debt if any
+ // progress has already been made.
+ pagesInUse uint64 // pages of spans in stats _MSpanInUse; R/W with mheap.lock
+ pagesSwept uint64 // pages swept this cycle; updated atomically
+ pagesSweptBasis uint64 // pagesSwept to use as the origin of the sweep ratio; updated atomically
+ sweepHeapLiveBasis uint64 // value of heap_live to use as the origin of sweep ratio; written with lock, read without
+ sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
// TODO(austin): pagesInUse should be a uintptr, but the 386
// compiler can't 8-byte align fields.
// Malloc stats.
- largefree uint64 // bytes freed for large objects (>maxsmallsize)
- nlargefree uint64 // number of frees for large objects (>maxsmallsize)
- nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
+ largealloc uint64 // bytes allocated for large objects
+ nlargealloc uint64 // number of large object allocations
+ largefree uint64 // bytes freed for large objects (>maxsmallsize)
+ nlargefree uint64 // number of frees for large objects (>maxsmallsize)
+ nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
// range of addresses we might see in the heap
- bitmap uintptr // Points to one byte past the end of the bitmap
- bitmap_mapped uintptr
- arena_start uintptr
- arena_used uintptr // always mHeap_Map{Bits,Spans} before updating
- arena_end uintptr
+ bitmap uintptr // Points to one byte past the end of the bitmap
+ bitmap_mapped uintptr
+
+ // The arena_* fields indicate the addresses of the Go heap.
+ //
+ // The maximum range of the Go heap is
+ // [arena_start, arena_start+_MaxMem+1).
+ //
+ // The range of the current Go heap is
+ // [arena_start, arena_used). Parts of this range may not be
+ // mapped, but the metadata structures are always mapped for
+ // the full range.
+ arena_start uintptr
+ arena_used uintptr // Set with setArenaUsed.
+
+ // The heap is grown using a linear allocator that allocates
+ // from the block [arena_alloc, arena_end). arena_alloc is
+ // often, but *not always* equal to arena_used.
+ arena_alloc uintptr
+ arena_end uintptr
+
+ // arena_reserved indicates that the memory [arena_alloc,
+ // arena_end) is reserved (e.g., mapped PROT_NONE). If this is
+ // false, we have to be careful not to clobber existing
+ // mappings here. If this is true, then we own the mapping
+ // here and *must* clobber it to use it.
arena_reserved bool
+ _ uint32 // ensure 64-bit alignment
+
// central free lists for small size classes.
// the padding makes sure that the MCentrals are
// spaced CacheLineSize bytes apart, so that each MCentral.lock
// gets its own cache line.
- central [_NumSizeClasses]struct {
+ // central is indexed by spanClass.
+ central [numSpanClasses]struct {
mcentral mcentral
- pad [sys.CacheLineSize]byte
+ pad [sys.CacheLineSize - unsafe.Sizeof(mcentral{})%sys.CacheLineSize]byte
}
spanalloc fixalloc // allocator for span*
cachealloc fixalloc // allocator for mcache*
+ treapalloc fixalloc // allocator for treapNodes* used by large objects
specialfinalizeralloc fixalloc // allocator for specialfinalizer*
specialprofilealloc fixalloc // allocator for specialprofile*
speciallock mutex // lock for special record allocators.
@@ -117,7 +163,7 @@
// When a MSpan is in the heap free list, state == MSpanFree
// and heapmap(s->start) == span, heapmap(s->start+s->npages-1) == span.
//
-// When a MSpan is allocated, state == MSpanInUse or MSpanStack
+// When a MSpan is allocated, state == MSpanInUse or MSpanManual
// and heapmap(i) == span for all s->start <= i < s->start+s->npages.
// Every MSpan is in one doubly-linked list,
@@ -125,25 +171,25 @@
// MCentral's span lists.
// An MSpan representing actual memory has state _MSpanInUse,
-// _MSpanStack, or _MSpanFree. Transitions between these states are
+// _MSpanManual, or _MSpanFree. Transitions between these states are
// constrained as follows:
//
-// * A span may transition from free to in-use or stack during any GC
+// * A span may transition from free to in-use or manual during any GC
// phase.
//
// * During sweeping (gcphase == _GCoff), a span may transition from
-// in-use to free (as a result of sweeping) or stack to free (as a
+// in-use to free (as a result of sweeping) or manual to free (as a
// result of stacks being freed).
//
// * During GC (gcphase != _GCoff), a span *must not* transition from
-// stack or in-use to free. Because concurrent GC may read a pointer
+// manual or in-use to free. Because concurrent GC may read a pointer
// and then look up its span, the span state must be monotonic.
type mSpanState uint8
const (
- _MSpanDead mSpanState = iota
- _MSpanInUse // allocated for garbage collected heap
- _MSpanStack // allocated for use by stack allocator
+ _MSpanDead mSpanState = iota
+ _MSpanInUse // allocated for garbage collected heap
+ _MSpanManual // allocated for manual management (e.g., stack allocator)
_MSpanFree
)
@@ -152,7 +198,7 @@
var mSpanStateNames = []string{
"_MSpanDead",
"_MSpanInUse",
- "_MSpanStack",
+ "_MSpanManual",
"_MSpanFree",
}
@@ -170,15 +216,16 @@
prev *mspan // previous span in list, or nil if none
list *mSpanList // For debugging. TODO: Remove.
- startAddr uintptr // address of first byte of span aka s.base()
- npages uintptr // number of pages in span
- stackfreelist gclinkptr // list of free stacks, avoids overloading freelist
+ startAddr uintptr // address of first byte of span aka s.base()
+ npages uintptr // number of pages in span
+
+ manualFreeList gclinkptr // list of free objects in _MSpanManual spans
// freeindex is the slot index between 0 and nelems at which to begin scanning
// for the next free object in this span.
// Each allocation scans allocBits starting at freeindex until it encounters a 0
// indicating a free object. freeindex is then adjusted so that subsequent scans begin
- // just past the the newly discovered free object.
+ // just past the newly discovered free object.
//
// If freeindex == nelem, this span has no free objects.
//
@@ -224,8 +271,8 @@
// The sweep will free the old allocBits and set allocBits to the
// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
// out memory.
- allocBits *uint8
- gcmarkBits *uint8
+ allocBits *gcBits
+ gcmarkBits *gcBits
// sweep generation:
// if sweepgen == h->sweepgen - 2, the span needs sweeping
@@ -236,8 +283,8 @@
sweepgen uint32
divMul uint16 // for divide by elemsize - divMagic.mul
baseMask uint16 // if non-0, elemsize is a power of 2, & this will get object allocation base
- allocCount uint16 // capacity - number of objects in freelist
- sizeclass uint8 // size class
+ allocCount uint16 // number of allocated objects
+ spanclass spanClass // size class and noscan (uint8)
incache bool // being used by an mcache
state mSpanState // mspaninuse etc
needzero uint8 // needs to be zeroed before allocation
@@ -292,8 +339,33 @@
h.allspans = append(h.allspans, s)
}
+// A spanClass represents the size class and noscan-ness of a span.
+//
+// Each size class has a noscan spanClass and a scan spanClass. The
+// noscan spanClass contains only noscan objects, which do not contain
+// pointers and thus do not need to be scanned by the garbage
+// collector.
+type spanClass uint8
+
+const (
+ numSpanClasses = _NumSizeClasses << 1
+ tinySpanClass = spanClass(tinySizeClass<<1 | 1)
+)
+
+func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
+ return spanClass(sizeclass<<1) | spanClass(bool2int(noscan))
+}
+
+func (sc spanClass) sizeclass() int8 {
+ return int8(sc >> 1)
+}
+
+func (sc spanClass) noscan() bool {
+ return sc&1 != 0
+}
+
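A minimal standalone illustration of the packing above: the size class lives in the upper seven bits and the noscan flag in the low bit, so each size class gets a scan and a noscan variant (the re-declarations here are only for the sketch; the real type lives in package runtime):

package main

import "fmt"

type spanClass uint8

func makeSpanClass(sizeclass uint8, noscan bool) spanClass {
	sc := spanClass(sizeclass << 1)
	if noscan {
		sc |= 1
	}
	return sc
}

func (sc spanClass) sizeclass() int8 { return int8(sc >> 1) }
func (sc spanClass) noscan() bool    { return sc&1 != 0 }

func main() {
	for _, noscan := range []bool{false, true} {
		sc := makeSpanClass(5, noscan)
		fmt.Printf("raw=%2d sizeclass=%d noscan=%v\n", sc, sc.sizeclass(), sc.noscan())
	}
}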
// inheap reports whether b is a pointer into a (potentially dead) heap object.
-// It returns false for pointers into stack spans.
+// It returns false for pointers into _MSpanManual spans.
// Non-preemptible because it is used by write barriers.
//go:nowritebarrier
//go:nosplit
@@ -309,7 +381,9 @@
return true
}
-// inHeapOrStack is a variant of inheap that returns true for pointers into stack spans.
+// inHeapOrStack is a variant of inheap that returns true for pointers
+// into any allocated heap span.
+//
//go:nowritebarrier
//go:nosplit
func inHeapOrStack(b uintptr) bool {
@@ -322,10 +396,8 @@
return false
}
switch s.state {
- case mSpanInUse:
+ case mSpanInUse, _MSpanManual:
return b < s.limit
- case _MSpanStack:
- return b < s.base()+s.npages<<_PageShift
default:
return false
}
@@ -376,7 +448,7 @@
}
p := s.base()
- if s.sizeclass == 0 {
+ if s.spanclass.sizeclass() == 0 {
// Large object.
if base != nil {
*base = p
@@ -401,6 +473,7 @@
// Initialize the heap.
func (h *mheap) init(spansStart, spansBytes uintptr) {
+ h.treapalloc.init(unsafe.Sizeof(treapNode{}), nil, nil, &memstats.other_sys)
h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
@@ -421,10 +494,9 @@
h.busy[i].init()
}
- h.freelarge.init()
h.busylarge.init()
for i := range h.central {
- h.central[i].mcentral.init(int32(i))
+ h.central[i].mcentral.init(spanClass(i))
}
sp := (*slice)(unsafe.Pointer(&h.spans))
@@ -433,14 +505,35 @@
sp.cap = int(spansBytes / sys.PtrSize)
}
-// mHeap_MapSpans makes sure that the spans are mapped
+// setArenaUsed extends the usable arena to address arena_used and
+// maps auxiliary VM regions for any newly usable arena space.
+//
+// racemap indicates that this memory should be managed by the race
+// detector. racemap should be true unless this is covering a VM hole.
+func (h *mheap) setArenaUsed(arena_used uintptr, racemap bool) {
+ // Map auxiliary structures *before* h.arena_used is updated.
+ // Waiting to update arena_used until after the memory has been mapped
+ // avoids faults when other threads try to access these regions immediately
+ // after observing the change to arena_used.
+
+ // Map the bitmap.
+ h.mapBits(arena_used)
+
+ // Map spans array.
+ h.mapSpans(arena_used)
+
+ // Tell the race detector about the new heap memory.
+ if racemap && raceenabled {
+ racemapshadow(unsafe.Pointer(h.arena_used), arena_used-h.arena_used)
+ }
+
+ h.arena_used = arena_used
+}
+
+// mapSpans makes sure that the spans are mapped
// up to the new value of arena_used.
//
-// It must be called with the expected new value of arena_used,
-// *before* h.arena_used has been updated.
-// Waiting to update arena_used until after the memory has been mapped
-// avoids faults when other threads try access the bitmap immediately
-// after observing the change to arena_used.
+// Don't call this directly. Call mheap.setArenaUsed.
func (h *mheap) mapSpans(arena_used uintptr) {
// Map spans array, PageSize at a time.
n := arena_used
@@ -466,7 +559,7 @@
if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
list.remove(s)
// swept spans are at the end of the list
- list.insertBack(s)
+ list.insertBack(s) // Puts it back on a busy list. s is not in the treap at this point.
unlock(&h.lock)
snpages := s.npages
if s.sweep(false) {
@@ -533,7 +626,7 @@
// Allocate a new span of npage pages from the heap for GC'd memory
// and record its size class in the HeapMap and HeapMapCache.
-func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
+func (h *mheap) alloc_m(npage uintptr, spanclass spanClass, large bool) *mspan {
_g_ := getg()
if _g_ != _g_.m.g0 {
throw("_mheap_alloc not on g0 stack")
@@ -550,7 +643,13 @@
// If GC kept a bit for whether there were any marks
// in a span, we could release these free spans
// at the end of GC and eliminate this entirely.
+ if trace.enabled {
+ traceGCSweepStart()
+ }
h.reclaim(npage)
+ if trace.enabled {
+ traceGCSweepDone()
+ }
}
// transfer stats from cache to global
@@ -559,7 +658,7 @@
memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
_g_.m.mcache.local_tinyallocs = 0
- s := h.allocSpanLocked(npage)
+ s := h.allocSpanLocked(npage, &memstats.heap_inuse)
if s != nil {
// Record span info, because gc needs to be
// able to map interior pointer to containing span.
@@ -567,8 +666,8 @@
h.sweepSpans[h.sweepgen/2%2].push(s) // Add to swept in-use list.
s.state = _MSpanInUse
s.allocCount = 0
- s.sizeclass = uint8(sizeclass)
- if sizeclass == 0 {
+ s.spanclass = spanclass
+ if sizeclass := spanclass.sizeclass(); sizeclass == 0 {
s.elemsize = s.npages << _PageShift
s.divShift = 0
s.divMul = 0
@@ -587,9 +686,11 @@
h.pagesInUse += uint64(npage)
if large {
memstats.heap_objects++
+ mheap_.largealloc += uint64(s.elemsize)
+ mheap_.nlargealloc++
atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
// Swept spans are at the end of lists.
- if s.npages < uintptr(len(h.free)) {
+ if s.npages < uintptr(len(h.busy)) {
h.busy[s.npages].insertBack(s)
} else {
h.busylarge.insertBack(s)
@@ -618,13 +719,13 @@
return s
}
-func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+func (h *mheap) alloc(npage uintptr, spanclass spanClass, large bool, needzero bool) *mspan {
// Don't do any operations that lock the heap on the G stack.
// It might trigger stack growth, and the stack growth code needs
// to be able to allocate heap.
var s *mspan
systemstack(func() {
- s = h.alloc_m(npage, sizeclass, large)
+ s = h.alloc_m(npage, spanclass, large)
})
if s != nil {
@@ -636,29 +737,46 @@
return s
}
-func (h *mheap) allocStack(npage uintptr) *mspan {
- _g_ := getg()
- if _g_ != _g_.m.g0 {
- throw("mheap_allocstack not on g0 stack")
- }
+// allocManual allocates a manually-managed span of npage pages.
+// allocManual returns nil if allocation fails.
+//
+// allocManual adds the bytes used to *stat, which should be a
+// memstats in-use field. Unlike allocations in the GC'd heap, the
+// allocation does *not* count toward heap_inuse or heap_sys.
+//
+// The memory backing the returned span may not be zeroed if
+// span.needzero is set.
+//
+// allocManual must be called on the system stack to prevent stack
+// growth. Since this is used by the stack allocator, stack growth
+// during allocManual would self-deadlock.
+//
+//go:systemstack
+func (h *mheap) allocManual(npage uintptr, stat *uint64) *mspan {
lock(&h.lock)
- s := h.allocSpanLocked(npage)
+ s := h.allocSpanLocked(npage, stat)
if s != nil {
- s.state = _MSpanStack
- s.stackfreelist = 0
+ s.state = _MSpanManual
+ s.manualFreeList = 0
s.allocCount = 0
- memstats.stacks_inuse += uint64(s.npages << _PageShift)
+ s.spanclass = 0
+ s.nelems = 0
+ s.elemsize = 0
+ s.limit = s.base() + s.npages<<_PageShift
+ // Manually managed memory doesn't count toward heap_sys.
+ memstats.heap_sys -= uint64(s.npages << _PageShift)
}
- // This unlock acts as a release barrier. See mHeap_Alloc_m.
+ // This unlock acts as a release barrier. See mheap.alloc_m.
unlock(&h.lock)
+
return s
}
// Allocates a span of the given size. h must be locked.
// The returned span has been removed from the
// free list, but its state is still MSpanFree.
-func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
+func (h *mheap) allocSpanLocked(npage uintptr, stat *uint64) *mspan {
var list *mSpanList
var s *mspan
@@ -667,13 +785,12 @@
list = &h.free[i]
if !list.isEmpty() {
s = list.first
+ list.remove(s)
goto HaveSpan
}
}
-
// Best fit in list of large spans.
- list = &h.freelarge
- s = h.allocLarge(npage)
+ s = h.allocLarge(npage) // allocLarge removed s from h.freelarge for us
if s == nil {
if !h.grow(npage) {
return nil
@@ -692,10 +809,6 @@
if s.npages < npage {
throw("MHeap_AllocLocked - bad npages")
}
- list.remove(s)
- if s.inList() {
- throw("still in list")
- }
if s.npreleased > 0 {
sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
memstats.heap_released -= uint64(s.npreleased << _PageShift)
@@ -714,8 +827,8 @@
h.spans[p] = t
h.spans[p+t.npages-1] = t
t.needzero = s.needzero
- s.state = _MSpanStack // prevent coalescing with s
- t.state = _MSpanStack
+ s.state = _MSpanManual // prevent coalescing with s
+ t.state = _MSpanManual
h.freeSpanLocked(t, false, false, s.unusedsince)
s.state = _MSpanFree
}
@@ -726,7 +839,7 @@
h.spans[p+n] = s
}
- memstats.heap_inuse += uint64(npage << _PageShift)
+ *stat += uint64(npage << _PageShift)
memstats.heap_idle -= uint64(npage << _PageShift)
//println("spanalloc", hex(s.start<<_PageShift))
@@ -736,24 +849,19 @@
return s
}
-// Allocate a span of exactly npage pages from the list of large spans.
-func (h *mheap) allocLarge(npage uintptr) *mspan {
- return bestFit(&h.freelarge, npage, nil)
+// Large spans have a minimum size of 1MByte. The maximum number of large spans needed to support
+// 1TByte is 1 million; experimentation using random sizes indicates that the depth of
+// the tree is less than 2x that of a perfectly balanced tree. 1TByte can be referenced
+// by a perfectly balanced tree with a depth of 20. Twice that is an acceptable 40.
+func (h *mheap) isLargeSpan(npages uintptr) bool {
+ return npages >= uintptr(len(h.free))
}
-// Search list for smallest span with >= npage pages.
-// If there are multiple smallest spans, take the one
-// with the earliest starting address.
-func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan {
- for s := list.first; s != nil; s = s.next {
- if s.npages < npage {
- continue
- }
- if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) {
- best = s
- }
- }
- return best
+// allocLarge allocates a span of at least npage pages from the treap of large spans.
+// Returns nil if no such span currently exists.
+func (h *mheap) allocLarge(npage uintptr) *mspan {
+ // Search treap for smallest span with >= npage pages.
+ return h.freelarge.remove(npage)
}
// Try to add at least npage pages of memory to the heap,
@@ -852,22 +960,30 @@
})
}
-func (h *mheap) freeStack(s *mspan) {
- _g_ := getg()
- if _g_ != _g_.m.g0 {
- throw("mheap_freestack not on g0 stack")
- }
+// freeManual frees a manually-managed span returned by allocManual.
+// stat must be the same as the stat passed to the allocManual that
+// allocated s.
+//
+// This must only be called when gcphase == _GCoff. See mSpanState for
+// an explanation.
+//
+// freeManual must be called on the system stack to prevent stack
+// growth, just like allocManual.
+//
+//go:systemstack
+func (h *mheap) freeManual(s *mspan, stat *uint64) {
s.needzero = 1
lock(&h.lock)
- memstats.stacks_inuse -= uint64(s.npages << _PageShift)
- h.freeSpanLocked(s, true, true, 0)
+ *stat -= uint64(s.npages << _PageShift)
+ memstats.heap_sys += uint64(s.npages << _PageShift)
+ h.freeSpanLocked(s, false, true, 0)
unlock(&h.lock)
}
// s must be on a busy list (h.busy or h.busylarge) or unlinked.
func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) {
switch s.state {
- case _MSpanStack:
+ case _MSpanManual:
if s.allocCount != 0 {
throw("MHeap_FreeSpanLocked - invalid stack free")
}
@@ -903,50 +1019,98 @@
// Coalesce with earlier, later spans.
p := (s.base() - h.arena_start) >> _PageShift
if p > 0 {
- t := h.spans[p-1]
- if t != nil && t.state == _MSpanFree {
- s.startAddr = t.startAddr
- s.npages += t.npages
- s.npreleased = t.npreleased // absorb released pages
- s.needzero |= t.needzero
- p -= t.npages
+ before := h.spans[p-1]
+ if before != nil && before.state == _MSpanFree {
+ // Now adjust s.
+ s.startAddr = before.startAddr
+ s.npages += before.npages
+ s.npreleased = before.npreleased // absorb released pages
+ s.needzero |= before.needzero
+ p -= before.npages
h.spans[p] = s
- h.freeList(t.npages).remove(t)
- t.state = _MSpanDead
- h.spanalloc.free(unsafe.Pointer(t))
- }
- }
- if (p + s.npages) < uintptr(len(h.spans)) {
- t := h.spans[p+s.npages]
- if t != nil && t.state == _MSpanFree {
- s.npages += t.npages
- s.npreleased += t.npreleased
- s.needzero |= t.needzero
- h.spans[p+s.npages-1] = s
- h.freeList(t.npages).remove(t)
- t.state = _MSpanDead
- h.spanalloc.free(unsafe.Pointer(t))
+ // The size is potentially changing so the treap needs to delete adjacent nodes and
+ // insert back as a combined node.
+ if h.isLargeSpan(before.npages) {
+ // before is a large span, so it must be in the treap; remove it from there.
+ h.freelarge.removeSpan(before)
+ } else {
+ h.freeList(before.npages).remove(before)
+ }
+ before.state = _MSpanDead
+ h.spanalloc.free(unsafe.Pointer(before))
}
}
- // Insert s into appropriate list.
- h.freeList(s.npages).insert(s)
+ // Now check to see if next (greater addresses) span is free and can be coalesced.
+ if (p + s.npages) < uintptr(len(h.spans)) {
+ after := h.spans[p+s.npages]
+ if after != nil && after.state == _MSpanFree {
+ s.npages += after.npages
+ s.npreleased += after.npreleased
+ s.needzero |= after.needzero
+ h.spans[p+s.npages-1] = s
+ if h.isLargeSpan(after.npages) {
+ h.freelarge.removeSpan(after)
+ } else {
+ h.freeList(after.npages).remove(after)
+ }
+ after.state = _MSpanDead
+ h.spanalloc.free(unsafe.Pointer(after))
+ }
+ }
+
+ // Insert s into appropriate list or treap.
+ if h.isLargeSpan(s.npages) {
+ h.freelarge.insert(s)
+ } else {
+ h.freeList(s.npages).insert(s)
+ }
}
func (h *mheap) freeList(npages uintptr) *mSpanList {
- if npages < uintptr(len(h.free)) {
- return &h.free[npages]
- }
- return &h.freelarge
+ return &h.free[npages]
}
func (h *mheap) busyList(npages uintptr) *mSpanList {
- if npages < uintptr(len(h.free)) {
+ if npages < uintptr(len(h.busy)) {
return &h.busy[npages]
}
return &h.busylarge
}
+func scavengeTreapNode(t *treapNode, now, limit uint64) uintptr {
+ s := t.spanKey
+ var sumreleased uintptr
+ if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
+ start := s.base()
+ end := start + s.npages<<_PageShift
+ if physPageSize > _PageSize {
+ // We can only release pages in
+ // physPageSize blocks, so round start
+ // and end in. (Otherwise, madvise
+ // will round them *out* and release
+ // more memory than we want.)
+ start = (start + physPageSize - 1) &^ (physPageSize - 1)
+ end &^= physPageSize - 1
+ if end <= start {
+ // start and end don't span a
+ // whole physical page.
+ return sumreleased
+ }
+ }
+ len := end - start
+ released := len - (s.npreleased << _PageShift)
+ if physPageSize > _PageSize && released == 0 {
+ return sumreleased
+ }
+ memstats.heap_released += uint64(released)
+ sumreleased += released
+ s.npreleased = len >> _PageShift
+ sysUnused(unsafe.Pointer(start), len)
+ }
+ return sumreleased
+}
+
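When the physical page size exceeds the runtime page size, the release range above is rounded inward so madvise never extends past the span. A standalone sketch of that rounding, with made-up addresses:

package main

import "fmt"

// roundIn rounds start up and end down to physPageSize boundaries, as
// scavengeTreapNode does, and reports whether anything is left to release.
func roundIn(start, end, physPageSize uintptr) (uintptr, uintptr, bool) {
	start = (start + physPageSize - 1) &^ (physPageSize - 1)
	end &^= physPageSize - 1
	return start, end, end > start
}

func main() {
	start, end, ok := roundIn(0x1000, 0x9000, 0x4000) // 16 KB physical pages
	fmt.Printf("release [%#x, %#x) possible=%v\n", start, end, ok)
}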
func scavengelist(list *mSpanList, now, limit uint64) uintptr {
if list.isEmpty() {
return 0
@@ -987,27 +1151,31 @@
}
func (h *mheap) scavenge(k int32, now, limit uint64) {
+ // Disallow malloc or panic while holding the heap lock. We do
+ // this here because this is an non-mallocgc entry-point to
+ // the mheap API.
+ gp := getg()
+ gp.m.mallocing++
lock(&h.lock)
var sumreleased uintptr
for i := 0; i < len(h.free); i++ {
sumreleased += scavengelist(&h.free[i], now, limit)
}
- sumreleased += scavengelist(&h.freelarge, now, limit)
+ sumreleased += scavengetreap(h.freelarge.treap, now, limit)
unlock(&h.lock)
+ gp.m.mallocing--
if debug.gctrace > 0 {
if sumreleased > 0 {
print("scvg", k, ": ", sumreleased>>20, " MB released\n")
}
- // TODO(dvyukov): these stats are incorrect as we don't subtract stack usage from heap.
- // But we can't call ReadMemStats on g0 holding locks.
print("scvg", k, ": inuse: ", memstats.heap_inuse>>20, ", idle: ", memstats.heap_idle>>20, ", sys: ", memstats.heap_sys>>20, ", released: ", memstats.heap_released>>20, ", consumed: ", (memstats.heap_sys-memstats.heap_released)>>20, " (MB)\n")
}
}
//go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory
func runtime_debug_freeOSMemory() {
- gcStart(gcForceBlockMode, false)
+ GC()
systemstack(func() { mheap_.scavenge(-1, ^uint64(0), 0) })
}
@@ -1020,7 +1188,7 @@
span.startAddr = base
span.npages = npages
span.allocCount = 0
- span.sizeclass = 0
+ span.spanclass = 0
span.incache = false
span.elemsize = 0
span.state = _MSpanDead
@@ -1046,7 +1214,8 @@
func (list *mSpanList) remove(span *mspan) {
if span.list != list {
- println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
+ print("runtime: failed MSpanList_Remove span.npages=", span.npages,
+ " span=", span, " prev=", span.prev, " span.list=", span.list, " list=", list, "\n")
throw("MSpanList_Remove")
}
if list.first == span {
@@ -1088,7 +1257,7 @@
func (list *mSpanList) insertBack(span *mspan) {
if span.next != nil || span.prev != nil || span.list != nil {
- println("failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
+ println("runtime: failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
throw("MSpanList_InsertBack")
}
span.prev = list.last
@@ -1103,6 +1272,31 @@
span.list = list
}
+// takeAll removes all spans from other and inserts them at the front
+// of list.
+func (list *mSpanList) takeAll(other *mSpanList) {
+ if other.isEmpty() {
+ return
+ }
+
+ // Reparent everything in other to list.
+ for s := other.first; s != nil; s = s.next {
+ s.list = list
+ }
+
+ // Concatenate the lists.
+ if list.isEmpty() {
+ *list = *other
+ } else {
+ // Neither list is empty. Put other before list.
+ other.last.next = list.first
+ list.first.prev = other.last
+ list.first = other.first
+ }
+
+ other.first, other.last = nil, nil
+}
+
const (
_KindSpecialFinalizer = 1
_KindSpecialProfile = 2
@@ -1316,6 +1510,22 @@
}
}
+// gcBits is an alloc/mark bitmap. This is always used as *gcBits.
+//
+//go:notinheap
+type gcBits uint8
+
+// bytep returns a pointer to the n'th byte of b.
+func (b *gcBits) bytep(n uintptr) *uint8 {
+ return addb((*uint8)(b), n)
+}
+
+// bitp returns a pointer to the byte containing bit n and a mask for
+// selecting that bit from *bytep.
+func (b *gcBits) bitp(n uintptr) (bytep *uint8, mask uint8) {
+ return b.bytep(n / 8), 1 << (n % 8)
+}
+
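The addressing used by bitp is the usual byte/mask split: bit n lives in byte n/8 and is selected by mask 1<<(n%8). A tiny standalone check:

package main

import "fmt"

func bitp(n uintptr) (byteIndex uintptr, mask uint8) {
	return n / 8, 1 << (n % 8)
}

func main() {
	for _, n := range []uintptr{0, 7, 8, 13} {
		i, m := bitp(n)
		fmt.Printf("bit %2d -> byte %d, mask %#08b\n", n, i, m)
	}
}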
const gcBitsChunkBytes = uintptr(64 << 10)
const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})
@@ -1325,42 +1535,87 @@
}
//go:notinheap
-type gcBits struct {
+type gcBitsArena struct {
// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
- free uintptr // free is the index into bits of the next free byte.
- next *gcBits
- bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8
+ free uintptr // free is the index into bits of the next free byte; read/write atomically
+ next *gcBitsArena
+ bits [gcBitsChunkBytes - gcBitsHeaderBytes]gcBits
}
var gcBitsArenas struct {
lock mutex
- free *gcBits
- next *gcBits
- current *gcBits
- previous *gcBits
+ free *gcBitsArena
+ next *gcBitsArena // Read atomically. Write atomically under lock.
+ current *gcBitsArena
+ previous *gcBitsArena
+}
+
+// tryAlloc allocates from b or returns nil if b does not have enough room.
+// This is safe to call concurrently.
+func (b *gcBitsArena) tryAlloc(bytes uintptr) *gcBits {
+ if b == nil || atomic.Loaduintptr(&b.free)+bytes > uintptr(len(b.bits)) {
+ return nil
+ }
+ // Try to allocate from this block.
+ end := atomic.Xadduintptr(&b.free, bytes)
+ if end > uintptr(len(b.bits)) {
+ return nil
+ }
+ // There was enough room.
+ start := end - bytes
+ return &b.bits[start]
}
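tryAlloc above is a lock-free bump allocator: the optimistic atomic add may overshoot the arena, in which case the caller simply gets nil and falls back to the locked slow path. A generic sketch of the same pattern (the sizes and types here are arbitrary):

package main

import (
	"fmt"
	"sync/atomic"
)

type arena struct {
	free uint64        // next free offset; advanced atomically
	bits [1 << 16]byte // backing storage
}

// tryAlloc reserves n bytes or returns nil if the arena is (or becomes) full.
func (a *arena) tryAlloc(n uint64) []byte {
	if a == nil || atomic.LoadUint64(&a.free)+n > uint64(len(a.bits)) {
		return nil
	}
	end := atomic.AddUint64(&a.free, n)
	if end > uint64(len(a.bits)) {
		return nil // overshot: lost a race with another allocator
	}
	return a.bits[end-n : end]
}

func main() {
	var a arena
	fmt.Println("64-byte alloc:", len(a.tryAlloc(64)))    // 64
	fmt.Println("1 MB alloc:   ", len(a.tryAlloc(1<<20))) // 0 (nil)
}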
// newMarkBits returns a pointer to 8 byte aligned bytes
// to be used for a span's mark bits.
-func newMarkBits(nelems uintptr) *uint8 {
- lock(&gcBitsArenas.lock)
+func newMarkBits(nelems uintptr) *gcBits {
blocksNeeded := uintptr((nelems + 63) / 64)
bytesNeeded := blocksNeeded * 8
- if gcBitsArenas.next == nil ||
- gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) {
- // Allocate a new arena.
- fresh := newArena()
- fresh.next = gcBitsArenas.next
- gcBitsArenas.next = fresh
+
+ // Try directly allocating from the current head arena.
+ head := (*gcBitsArena)(atomic.Loadp(unsafe.Pointer(&gcBitsArenas.next)))
+ if p := head.tryAlloc(bytesNeeded); p != nil {
+ return p
}
- if gcBitsArenas.next.free >= gcBitsChunkBytes {
- println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes)
+
+ // There's not enough room in the head arena. We may need to
+ // allocate a new arena.
+ lock(&gcBitsArenas.lock)
+ // Try the head arena again, since it may have changed. Now
+ // that we hold the lock, the list head can't change, but its
+ // free position still can.
+ if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
+ unlock(&gcBitsArenas.lock)
+ return p
+ }
+
+ // Allocate a new arena. This may temporarily drop the lock.
+ fresh := newArenaMayUnlock()
+ // If newArenaMayUnlock dropped the lock, another thread may
+ // have put a fresh arena on the "next" list. Try allocating
+ // from next again.
+ if p := gcBitsArenas.next.tryAlloc(bytesNeeded); p != nil {
+ // Put fresh back on the free list.
+ // TODO: Mark it "already zeroed"
+ fresh.next = gcBitsArenas.free
+ gcBitsArenas.free = fresh
+ unlock(&gcBitsArenas.lock)
+ return p
+ }
+
+ // Allocate from the fresh arena. We haven't linked it in yet, so
+ // this cannot race and is guaranteed to succeed.
+ p := fresh.tryAlloc(bytesNeeded)
+ if p == nil {
throw("markBits overflow")
}
- result := &gcBitsArenas.next.bits[gcBitsArenas.next.free]
- gcBitsArenas.next.free += bytesNeeded
+
+ // Add the fresh arena to the "next" list.
+ fresh.next = gcBitsArenas.next
+ atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), unsafe.Pointer(fresh))
+
unlock(&gcBitsArenas.lock)
- return result
+ return p
}
// newAllocBits returns a pointer to 8 byte aligned bytes
@@ -1369,7 +1624,7 @@
// allocation bits. For spans not being initialized the
// the mark bits are repurposed as allocation bits when
// the span is swept.
-func newAllocBits(nelems uintptr) *uint8 {
+func newAllocBits(nelems uintptr) *gcBits {
return newMarkBits(nelems)
}
@@ -1403,18 +1658,21 @@
}
gcBitsArenas.previous = gcBitsArenas.current
gcBitsArenas.current = gcBitsArenas.next
- gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+ atomic.StorepNoWB(unsafe.Pointer(&gcBitsArenas.next), nil) // newMarkBits calls newArena when needed
unlock(&gcBitsArenas.lock)
}
-// newArena allocates and zeroes a gcBits arena.
-func newArena() *gcBits {
- var result *gcBits
+// newArenaMayUnlock allocates and zeroes a gcBits arena.
+// The caller must hold gcBitsArenas.lock. This may temporarily release it.
+func newArenaMayUnlock() *gcBitsArena {
+ var result *gcBitsArena
if gcBitsArenas.free == nil {
- result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+ unlock(&gcBitsArenas.lock)
+ result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
if result == nil {
throw("runtime: cannot allocate memory")
}
+ lock(&gcBitsArenas.lock)
} else {
result = gcBitsArenas.free
gcBitsArenas.free = gcBitsArenas.free.next
@@ -1423,7 +1681,7 @@
result.next = nil
// If result.bits is not 8 byte aligned adjust index so
// that &result.bits[result.free] is 8 byte aligned.
- if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+ if uintptr(unsafe.Offsetof(gcBitsArena{}.bits))&7 == 0 {
result.free = 0
} else {
result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index cf6b37f..d15f1f7 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -43,7 +43,7 @@
func gen(arch string, tags, zero, copy func(io.Writer)) {
var buf bytes.Buffer
- fmt.Fprintln(&buf, "// AUTO-GENERATED by mkduff.go")
+ fmt.Fprintln(&buf, "// Code generated by mkduff.go; DO NOT EDIT.")
fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
fmt.Fprintln(&buf, "// See mkduff.go for comments.")
tags(&buf)
diff --git a/src/runtime/mknacl.sh b/src/runtime/mknacl.sh
index 0a74db1..3454b62 100644
--- a/src/runtime/mknacl.sh
+++ b/src/runtime/mknacl.sh
@@ -6,7 +6,7 @@
cat /Users/rsc/pub/native_client/src/trusted/service_runtime/include/bits/nacl_syscalls.h |
awk '
BEGIN {
- printf("// generated by mknacl.sh - do not edit\n")
+ printf("// Code generated by mknacl.sh; DO NOT EDIT.\n")
}
NF==3 && $1=="#define" && $2~/^NACL_sys_/ {
name=$2
diff --git a/src/runtime/mksizeclasses.go b/src/runtime/mksizeclasses.go
index 0f897ba..0cb2b33 100644
--- a/src/runtime/mksizeclasses.go
+++ b/src/runtime/mksizeclasses.go
@@ -48,7 +48,7 @@
flag.Parse()
var b bytes.Buffer
- fmt.Fprintln(&b, "// AUTO-GENERATED by mksizeclasses.go; DO NOT EDIT")
+ fmt.Fprintln(&b, "// Code generated by mksizeclasses.go; DO NOT EDIT.")
fmt.Fprintln(&b, "//go:generate go run mksizeclasses.go")
fmt.Fprintln(&b)
fmt.Fprintln(&b, "package runtime")
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
index 53617e4..62f3780 100644
--- a/src/runtime/mmap.go
+++ b/src/runtime/mmap.go
@@ -17,3 +17,6 @@
// assembly routine; the higher bits (if required), should be provided
// by the assembly routine as 0.
func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
+
+// munmap calls the munmap system call. It is implemented in assembly.
+func munmap(addr unsafe.Pointer, n uintptr)
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index fc06d8d..2bd09b6 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -64,27 +64,70 @@
// come only after a GC during concurrent sweeping. So if we would
// naively count them, we would get a skew toward mallocs.
//
- // Mallocs are accounted in recent stats.
- // Explicit frees are accounted in recent stats.
- // GC frees are accounted in prev stats.
- // After GC prev stats are added to final stats and
- // recent stats are moved into prev stats.
- allocs uintptr
- frees uintptr
- alloc_bytes uintptr
- free_bytes uintptr
+ // Hence, we delay information to get consistent snapshots as
+ // of mark termination. Allocations count toward the next mark
+ // termination's snapshot, while sweep frees count toward the
+ // previous mark termination's snapshot:
+ //
+ //              MT          MT          MT          MT
+ //            .·|         .·|         .·|         .·|
+ //         .·˙  |      .·˙  |      .·˙  |      .·˙  |
+ //      .·˙     |   .·˙     |   .·˙     |   .·˙     |
+ //   .·˙        |.·˙        |.·˙        |.·˙        |
+ //
+ //       alloc → ▲ ← free
+ //               ┠┅┅┅┅┅┅┅┅┅┅┅P
+ //       C+2      →    C+1    →  C
+ //
+ //                   alloc → ▲ ← free
+ //                           ┠┅┅┅┅┅┅┅┅┅┅┅P
+ //                   C+2      →    C+1    →  C
+ //
+ // Since we can't publish a consistent snapshot until all of
+ // the sweep frees are accounted for, we wait until the next
+ // mark termination ("MT" above) to publish the previous mark
+ // termination's snapshot ("P" above). To do this, allocation
+ // and free events are accounted to *future* heap profile
+ // cycles ("C+n" above) and we only publish a cycle once all
+ // of the events from that cycle are known to be complete. Specifically:
+ //
+ // Mallocs are accounted to cycle C+2.
+ // Explicit frees are accounted to cycle C+2.
+ // GC frees (done during sweeping) are accounted to cycle C+1.
+ //
+ // After mark termination, we increment the global heap
+ // profile cycle counter and accumulate the stats from cycle C
+ // into the active profile.
- // changes between next-to-last GC and last GC
- prev_allocs uintptr
- prev_frees uintptr
- prev_alloc_bytes uintptr
- prev_free_bytes uintptr
+ // active is the currently published profile. A profiling
+ // cycle can be accumulated into active once it is complete.
+ active memRecordCycle
- // changes since last GC
- recent_allocs uintptr
- recent_frees uintptr
- recent_alloc_bytes uintptr
- recent_free_bytes uintptr
+ // future records the profile events we're counting for cycles
+ // that have not yet been published. This is a ring buffer
+ // indexed by the global heap profile cycle C and stores
+ // cycles C, C+1, and C+2. Unlike active, these counts are
+ // only for a single cycle; they are not cumulative across
+ // cycles.
+ //
+ // We store cycle C here because there's a window between when
+ // C becomes the active cycle and when we've flushed it to
+ // active.
+ future [3]memRecordCycle
+}
+
+// memRecordCycle records heap profile events for one heap profile cycle.
+type memRecordCycle struct {
+ allocs, frees uintptr
+ alloc_bytes, free_bytes uintptr
+}
+
+// add accumulates b into a. It does not zero b.
+func (a *memRecordCycle) add(b *memRecordCycle) {
+ a.allocs += b.allocs
+ a.frees += b.frees
+ a.alloc_bytes += b.alloc_bytes
+ a.free_bytes += b.free_bytes
}
// A blockRecord is the bucket data for a bucket of type blockProfile,
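
The accounting scheme above can be exercised outside the runtime with a small, self-contained sketch. Everything here is illustrative: the type and method names (profile, counts, nextCycle, flush) are invented stand-ins for memRecord, memRecordCycle and the mProf_* functions in this file.

    package main

    import "fmt"

    type counts struct{ allocs, frees uintptr }

    type profile struct {
        cycle  uint32    // global heap profile cycle C
        active counts    // published totals
        future [3]counts // pending events for cycles C, C+1, C+2
    }

    func (p *profile) malloc()    { p.future[(p.cycle+2)%3].allocs++ } // like mProf_Malloc
    func (p *profile) sweepFree() { p.future[(p.cycle+1)%3].frees++ }  // like mProf_Free

    // nextCycle plays the role of mProf_NextCycle (run at mark termination).
    func (p *profile) nextCycle() { p.cycle++ }

    // flush plays the role of mProf_Flush: fold the now-complete cycle C
    // into the published profile and clear its slot for reuse.
    func (p *profile) flush() {
        c := &p.future[p.cycle%3]
        p.active.allocs += c.allocs
        p.active.frees += c.frees
        *c = counts{}
    }

    func main() {
        var p profile
        p.malloc()    // accounted to cycle C+2
        p.nextCycle() // mark termination: what was C+2 is now C+1
        p.flush()     // publishes the just-completed (empty) cycle
        p.sweepFree() // sweep frees the object; accounted to the new C+1
        p.nextCycle() // next mark termination
        p.flush()     // the alloc and its free are published together
        fmt.Println(p.active) // {1 1}
    }

Running it prints {1 1}: the allocation and its later free only become visible together, after the mark termination that follows the sweep, which is the consistency property the comment above is describing.
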
@@ -100,8 +143,21 @@
xbuckets *bucket // mutex profile buckets
buckhash *[179999]*bucket
bucketmem uintptr
+
+ mProf struct {
+ // All fields in mProf are protected by proflock.
+
+ // cycle is the global heap profile cycle. This wraps
+ // at mProfCycleWrap.
+ cycle uint32
+ // flushed indicates that future[cycle] in all buckets
+ // has been flushed to the active profile.
+ flushed bool
+ }
)
+const mProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)
+
// newBucket allocates a bucket with the given type and number of stack entries.
func newBucket(typ bucketType, nstk int) *bucket {
size := unsafe.Sizeof(bucket{}) + uintptr(nstk)*unsafe.Sizeof(uintptr(0))
@@ -212,30 +268,71 @@
return true
}
-func mprof_GC() {
+// mProf_NextCycle publishes the next heap profile cycle and creates a
+// fresh heap profile cycle. This operation is fast and can be done
+// during STW. The caller must call mProf_Flush before calling
+// mProf_NextCycle again.
+//
+// This is called by mark termination during STW so allocations and
+// frees after the world is started again count towards a new heap
+// profiling cycle.
+func mProf_NextCycle() {
+ lock(&proflock)
+ // We explicitly wrap mProf.cycle rather than depending on
+ // uint wraparound because the memRecord.future ring does not
+ // itself wrap at a power of two.
+ mProf.cycle = (mProf.cycle + 1) % mProfCycleWrap
+ mProf.flushed = false
+ unlock(&proflock)
+}
+
+// mProf_Flush flushes the events from the current heap profiling
+// cycle into the active profile. After this it is safe to start a new
+// heap profiling cycle with mProf_NextCycle.
+//
+// This is called by GC after mark termination starts the world. In
+// contrast with mProf_NextCycle, this is somewhat expensive, but safe
+// to do concurrently.
+func mProf_Flush() {
+ lock(&proflock)
+ if !mProf.flushed {
+ mProf_FlushLocked()
+ mProf.flushed = true
+ }
+ unlock(&proflock)
+}
+
+func mProf_FlushLocked() {
+ c := mProf.cycle
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- mp.allocs += mp.prev_allocs
- mp.frees += mp.prev_frees
- mp.alloc_bytes += mp.prev_alloc_bytes
- mp.free_bytes += mp.prev_free_bytes
- mp.prev_allocs = mp.recent_allocs
- mp.prev_frees = mp.recent_frees
- mp.prev_alloc_bytes = mp.recent_alloc_bytes
- mp.prev_free_bytes = mp.recent_free_bytes
-
- mp.recent_allocs = 0
- mp.recent_frees = 0
- mp.recent_alloc_bytes = 0
- mp.recent_free_bytes = 0
+ // Flush cycle C into the published profile and clear
+ // it for reuse.
+ mpc := &mp.future[c%uint32(len(mp.future))]
+ mp.active.add(mpc)
+ *mpc = memRecordCycle{}
}
}
-// Record that a gc just happened: all the 'recent' statistics are now real.
-func mProf_GC() {
+// mProf_PostSweep records that all sweep frees for this GC cycle have
+// completed. This has the effect of publishing the heap profile
+// snapshot as of the last mark termination without advancing the heap
+// profile cycle.
+func mProf_PostSweep() {
lock(&proflock)
- mprof_GC()
+ // Flush cycle C+1 to the active profile so everything as of
+ // the last mark termination becomes visible. *Don't* advance
+ // the cycle, since we're still accumulating allocs in cycle
+ // C+2, which have to become C+1 in the next mark termination
+ // and so on.
+ c := mProf.cycle
+ for b := mbuckets; b != nil; b = b.allnext {
+ mp := b.mp()
+ mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+ mp.active.add(mpc)
+ *mpc = memRecordCycle{}
+ }
unlock(&proflock)
}
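
A quick arithmetic check of the wrap-around comment in mProf_NextCycle above, written as a runnable sketch: the future ring has three slots, and the explicit wrap point is chosen to be a multiple of three, unlike the natural uint32 overflow point.

    package main

    import "fmt"

    func main() {
        const ringLen = 3 // len(memRecord{}.future)
        const mProfCycleWrap = ringLen * (2 << 24)

        fmt.Println((1 << 32) % ringLen)      // 1: natural uint32 overflow is not a multiple of 3,
                                              //    so the ring index would stall or skip a slot
        fmt.Println(mProfCycleWrap % ringLen) // 0: the explicit wrap point is a multiple of 3,
                                              //    so wrapping still advances the index by one
    }
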
@@ -245,9 +342,11 @@
nstk := callers(4, stk[:])
lock(&proflock)
b := stkbucket(memProfile, size, stk[:nstk], true)
+ c := mProf.cycle
mp := b.mp()
- mp.recent_allocs++
- mp.recent_alloc_bytes += size
+ mpc := &mp.future[(c+2)%uint32(len(mp.future))]
+ mpc.allocs++
+ mpc.alloc_bytes += size
unlock(&proflock)
// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock.
@@ -262,9 +361,11 @@
// Called when freeing a profiled block.
func mProf_Free(b *bucket, size uintptr) {
lock(&proflock)
+ c := mProf.cycle
mp := b.mp()
- mp.prev_frees++
- mp.prev_free_bytes += size
+ mpc := &mp.future[(c+1)%uint32(len(mp.future))]
+ mpc.frees++
+ mpc.free_bytes += size
unlock(&proflock)
}
@@ -298,7 +399,7 @@
cycles = 1
}
if blocksampled(cycles) {
- saveblockevent(cycles, skip+1, blockProfile, &blockprofilerate)
+ saveblockevent(cycles, skip+1, blockProfile)
}
}
@@ -310,7 +411,7 @@
return true
}
-func saveblockevent(cycles int64, skip int, which bucketType, ratep *uint64) {
+func saveblockevent(cycles int64, skip int, which bucketType) {
gp := getg()
var nstk int
var stk [maxStack]uintptr
@@ -353,7 +454,7 @@
// TODO(pjw): measure impact of always calling fastrand vs using something
// like malloc.go:nextSample()
if rate > 0 && int64(fastrand())%rate == 0 {
- saveblockevent(cycles, skip+1, mutexProfile, &mutexprofilerate)
+ saveblockevent(cycles, skip+1, mutexProfile)
}
}
@@ -441,13 +542,17 @@
// of calling MemProfile directly.
func MemProfile(p []MemProfileRecord, inuseZero bool) (n int, ok bool) {
lock(&proflock)
+ // If we're between mProf_NextCycle and mProf_Flush, take care
+ // of flushing to the active profile so we only have to look
+ // at the active profile below.
+ mProf_FlushLocked()
clear := true
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
n++
}
- if mp.allocs != 0 || mp.frees != 0 {
+ if mp.active.allocs != 0 || mp.active.frees != 0 {
clear = false
}
}
@@ -455,13 +560,15 @@
// Absolutely no data, suggesting that a garbage collection
// has not yet happened. In order to allow profiling when
// garbage collection is disabled from the beginning of execution,
- // accumulate stats as if a GC just happened, and recount buckets.
- mprof_GC()
- mprof_GC()
+ // accumulate all of the cycles, and recount buckets.
n = 0
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ for c := range mp.future {
+ mp.active.add(&mp.future[c])
+ mp.future[c] = memRecordCycle{}
+ }
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
n++
}
}
@@ -471,7 +578,7 @@
idx := 0
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- if inuseZero || mp.alloc_bytes != mp.free_bytes {
+ if inuseZero || mp.active.alloc_bytes != mp.active.free_bytes {
record(&p[idx], b)
idx++
}
@@ -484,10 +591,10 @@
// Write b's data to r.
func record(r *MemProfileRecord, b *bucket) {
mp := b.mp()
- r.AllocBytes = int64(mp.alloc_bytes)
- r.FreeBytes = int64(mp.free_bytes)
- r.AllocObjects = int64(mp.allocs)
- r.FreeObjects = int64(mp.frees)
+ r.AllocBytes = int64(mp.active.alloc_bytes)
+ r.FreeBytes = int64(mp.active.free_bytes)
+ r.AllocObjects = int64(mp.active.allocs)
+ r.FreeObjects = int64(mp.active.frees)
if raceenabled {
racewriterangepc(unsafe.Pointer(&r.Stack0[0]), unsafe.Sizeof(r.Stack0), getcallerpc(unsafe.Pointer(&r)), funcPC(MemProfile))
}
@@ -504,7 +611,7 @@
lock(&proflock)
for b := mbuckets; b != nil; b = b.allnext {
mp := b.mp()
- fn(b, b.nstk, &b.stk()[0], b.size, mp.allocs, mp.frees)
+ fn(b, b.nstk, &b.stk()[0], b.size, mp.active.allocs, mp.active.frees)
}
unlock(&proflock)
}
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 438c987..0accb83 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -9,28 +9,6 @@
package runtime
-// sizeToClass(0 <= n <= MaxSmallSize) returns the size class,
-// 1 <= sizeclass < NumSizeClasses, for n.
-// Size class 0 is reserved to mean "not small".
-//
-// The sizeToClass lookup is implemented using two arrays,
-// one mapping sizes <= 1024 to their class and one mapping
-// sizes >= 1024 and <= MaxSmallSize to their class.
-// All objects are 8-aligned, so the first array is indexed by
-// the size divided by 8 (rounded up). Objects >= 1024 bytes
-// are 128-aligned, so the second array is indexed by the
-// size divided by 128 (rounded up). The arrays are constants
-// in sizeclass.go generated by mksizeclass.go.
-func sizeToClass(size uint32) uint32 {
- if size > _MaxSmallSize {
- throw("invalid size")
- }
- if size > smallSizeMax-8 {
- return uint32(size_to_class128[(size-smallSizeMax+largeSizeDiv-1)/largeSizeDiv])
- }
- return uint32(size_to_class8[(size+smallSizeDiv-1)/smallSizeDiv])
-}
-
// Returns size of the memory block that mallocgc will allocate if you ask for the size.
func roundupsize(size uintptr) uintptr {
if size < _MaxSmallSize {
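
The two-array lookup described in the deleted comment above reduces to simple index arithmetic. A minimal sketch, with the divisors taken from that comment (8-byte spacing up to 1024 bytes, 128-byte spacing above); the class tables themselves are the generated size_to_class8 / size_to_class128 arrays and are not reproduced here.

    package main

    import "fmt"

    // Divisors from the deleted comment; treat these as illustrative constants.
    const (
        smallSizeDiv = 8
        smallSizeMax = 1024
        largeSizeDiv = 128
    )

    // index8 and index128 show only the array-index arithmetic used by the
    // two lookup tables generated by mksizeclasses.go.
    func index8(size uint32) uint32   { return (size + smallSizeDiv - 1) / smallSizeDiv }
    func index128(size uint32) uint32 { return (size - smallSizeMax + largeSizeDiv - 1) / largeSizeDiv }

    func main() {
        fmt.Println(index8(100))    // 13: (100+7)/8, 100 bytes rounds up to the 13th 8-byte step
        fmt.Println(index128(1500)) // 4:  (1500-1024+127)/128, the 4th 128-byte step past 1024
    }
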
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 41b9005..1cb44a1 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -33,13 +33,12 @@
// Statistics about malloc heap.
// Protected by mheap.lock
//
- // In mstats, heap_sys and heap_inuse includes stack memory,
- // while in MemStats stack memory is separated out from the
- // heap stats.
+ // Like MemStats, heap_sys and heap_inuse do not count memory
+ // in manually-managed spans.
heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above)
- heap_sys uint64 // virtual address space obtained from system
+ heap_sys uint64 // virtual address space obtained from system for GC'd heap
heap_idle uint64 // bytes in idle spans
- heap_inuse uint64 // bytes in non-idle spans
+ heap_inuse uint64 // bytes in _MSpanInUse spans
heap_released uint64 // bytes released to the os
heap_objects uint64 // total number of allocated objects
@@ -59,7 +58,7 @@
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
- stacks_inuse uint64 // this number is included in heap_inuse above; differs from MemStats.StackInuse
+ stacks_inuse uint64 // bytes in manually-managed stack spans
stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
mspan_inuse uint64 // mspan structures
mspan_sys uint64
@@ -72,7 +71,7 @@
// Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
- last_gc uint64 // last gc (in absolute time)
+ last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
pause_end [256]uint64 // circular buffer of recent gc end times (nanoseconds since 1970)
@@ -92,13 +91,26 @@
// Statistics below here are not exported to MemStats directly.
- tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+ last_gc_nanotime uint64 // last gc (monotonic time)
+ tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
+
+ // triggerRatio is the heap growth ratio that triggers marking.
+ //
+ // E.g., if this is 0.6, then GC should start when the live
+ // heap has reached 1.6 times the heap size marked by the
+ // previous cycle. This should be ≤ GOGC/100 so the trigger
+ // heap size is less than the goal heap size. This is set
+ // during mark termination for the next cycle's trigger.
+ triggerRatio float64
// gc_trigger is the heap size that triggers marking.
//
// When heap_live ≥ gc_trigger, the mark phase will start.
// This is also the heap size by which proportional sweeping
// must be complete.
+ //
+ // This is computed from triggerRatio during mark termination
+ // for the next cycle's trigger.
gc_trigger uint64
// heap_live is the number of bytes considered live by the GC.
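
A worked example of the triggerRatio / gc_trigger relationship documented above, using the 0.6 figure from the comment and made-up heap sizes; the runtime applies further bounds when it computes the real trigger, so this is only the basic shape.

    package main

    import "fmt"

    func main() {
        // Made-up numbers: the previous cycle marked 100 MB live, GOGC=100,
        // and triggerRatio is the 0.6 used as the example in the comment above.
        marked := uint64(100 << 20)

        trigger := marked * 16 / 10 // marked * (1 + triggerRatio) with triggerRatio = 0.6
        goal := marked * 2          // marked * (1 + GOGC/100) with GOGC = 100

        fmt.Println(trigger>>20, goal>>20) // 160 200 (MB): marking starts well before the goal
    }
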
@@ -121,6 +133,8 @@
// leads to a conservative GC rate rather than a GC rate that
// is potentially too low.
//
+ // Reads should likewise be atomic (or during STW).
+ //
// Whenever this is updated, call traceHeapAlloc() and
// gcController.revise().
heap_live uint64
@@ -451,17 +465,16 @@
}
func readmemstats_m(stats *MemStats) {
- updatememstats(nil)
+ updatememstats()
// The size of the trailing by_size array differs between
// mstats and MemStats. NumSizeClasses was changed, but we
// cannot change MemStats because of backward compatibility.
memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
- // Stack numbers are part of the heap numbers, separate those out for user consumption
+ // memstats.stacks_sys is only memory mapped directly for OS stacks.
+ // Add in heap-allocated stack memory for user consumption.
stats.StackSys += stats.StackInuse
- stats.HeapInuse -= stats.StackInuse
- stats.HeapSys -= stats.StackInuse
}
//go:linkname readGCStats runtime/debug.readGCStats
@@ -497,7 +510,7 @@
p[n+i] = memstats.pause_end[j]
}
- p[n+n] = memstats.last_gc
+ p[n+n] = memstats.last_gc_unix
p[n+n+1] = uint64(memstats.numgc)
p[n+n+2] = memstats.pause_total_ns
unlock(&mheap_.lock)
@@ -505,26 +518,15 @@
}
//go:nowritebarrier
-func updatememstats(stats *gcstats) {
- if stats != nil {
- *stats = gcstats{}
- }
- for mp := allm; mp != nil; mp = mp.alllink {
- if stats != nil {
- src := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(&mp.gcstats))
- dst := (*[unsafe.Sizeof(gcstats{}) / 8]uint64)(unsafe.Pointer(stats))
- for i, v := range src {
- dst[i] += v
- }
- mp.gcstats = gcstats{}
- }
- }
-
+func updatememstats() {
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
+ // We also count stacks_inuse as sys memory.
+ memstats.sys += memstats.stacks_inuse
+
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
// Current number of alive objects in the heap and amount of alive heap memory
@@ -547,45 +549,49 @@
// Aggregate local stats.
cachestats()
- // Scan all spans and count number of alive objects.
- lock(&mheap_.lock)
- for _, s := range mheap_.allspans {
- if s.state != mSpanInUse {
+ // Collect allocation stats. This is safe and consistent
+ // because the world is stopped.
+ var smallFree, totalAlloc, totalFree uint64
+ // Collect per-spanclass stats.
+ for spc := range mheap_.central {
+ // The mcaches are now empty, so mcentral stats are
+ // up-to-date.
+ c := &mheap_.central[spc].mcentral
+ memstats.nmalloc += c.nmalloc
+ i := spanClass(spc).sizeclass()
+ memstats.by_size[i].nmalloc += c.nmalloc
+ totalAlloc += c.nmalloc * uint64(class_to_size[i])
+ }
+ // Collect per-sizeclass stats.
+ for i := 0; i < _NumSizeClasses; i++ {
+ if i == 0 {
+ memstats.nmalloc += mheap_.nlargealloc
+ totalAlloc += mheap_.largealloc
+ totalFree += mheap_.largefree
+ memstats.nfree += mheap_.nlargefree
continue
}
- if s.sizeclass == 0 {
- memstats.nmalloc++
- memstats.alloc += uint64(s.elemsize)
- } else {
- memstats.nmalloc += uint64(s.allocCount)
- memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount)
- memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize)
- }
- }
- unlock(&mheap_.lock)
- // Aggregate by size class.
- smallfree := uint64(0)
- memstats.nfree = mheap_.nlargefree
- for i := 0; i < len(memstats.by_size); i++ {
+ // The mcache stats have been flushed to mheap_.
memstats.nfree += mheap_.nsmallfree[i]
memstats.by_size[i].nfree = mheap_.nsmallfree[i]
- memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
- smallfree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
}
+ totalFree += smallFree
+
memstats.nfree += memstats.tinyallocs
- memstats.nmalloc += memstats.nfree
+ memstats.nmalloc += memstats.tinyallocs
// Calculate derived stats.
- memstats.total_alloc = memstats.alloc + mheap_.largefree + smallfree
+ memstats.total_alloc = totalAlloc
+ memstats.alloc = totalAlloc - totalFree
memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
}
//go:nowritebarrier
func cachestats() {
- for i := 0; ; i++ {
- p := allp[i]
+ for _, p := range &allp {
if p == nil {
break
}
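
The derived stats at the bottom of the new updatememstats follow directly from the collected totals; a tiny sketch with made-up numbers shows the relationships (variable names here are local stand-ins for the memstats fields).

    package main

    import "fmt"

    func main() {
        // Made-up totals in the shape updatememstats collects above.
        var (
            nmalloc, nfree        uint64 = 1000, 400
            totalAlloc, totalFree uint64 = 64 << 20, 24 << 20
        )

        cumulative := totalAlloc       // memstats.total_alloc: bytes ever allocated
        live := totalAlloc - totalFree // memstats.alloc / heap_alloc: bytes still live
        objects := nmalloc - nfree     // memstats.heap_objects: objects still live

        fmt.Println(cumulative>>20, live>>20, objects) // 64 40 600
    }
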
diff --git a/src/runtime/mstkbar.go b/src/runtime/mstkbar.go
deleted file mode 100644
index 4415559..0000000
--- a/src/runtime/mstkbar.go
+++ /dev/null
@@ -1,393 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Garbage collector: stack barriers
-//
-// Stack barriers enable the garbage collector to determine how much
-// of a goroutine stack has changed between when a stack is scanned
-// during the concurrent scan phase and when it is re-scanned during
-// the stop-the-world mark termination phase. Mark termination only
-// needs to re-scan the changed part, so for deep stacks this can
-// significantly reduce GC pause time compared to the alternative of
-// re-scanning whole stacks. The deeper the stacks, the more stack
-// barriers help.
-//
-// When stacks are scanned during the concurrent scan phase, the stack
-// scan installs stack barriers by selecting stack frames and
-// overwriting the saved return PCs (or link registers) of these
-// frames with the PC of a "stack barrier trampoline". Later, when a
-// selected frame returns, it "returns" to this trampoline instead of
-// returning to its actual caller. The trampoline records that the
-// stack has unwound past this frame and jumps to the original return
-// PC recorded when the stack barrier was installed. Mark termination
-// re-scans only as far as the first frame that hasn't hit a stack
-// barrier and then removes any un-hit stack barriers.
-//
-// This scheme is very lightweight. No special code is required in the
-// mutator to record stack unwinding and the trampoline is only a few
-// assembly instructions.
-//
-// Book-keeping
-// ------------
-//
-// The primary cost of stack barriers is book-keeping: the runtime has
-// to record the locations of all stack barriers and the original
-// return PCs in order to return to the correct caller when a stack
-// barrier is hit and so it can remove un-hit stack barriers. In order
-// to minimize this cost, the Go runtime places stack barriers in
-// exponentially-spaced frames, starting 1K past the current frame.
-// The book-keeping structure hence grows logarithmically with the
-// size of the stack and mark termination re-scans at most twice as
-// much stack as necessary.
-//
-// The runtime reserves space for this book-keeping structure at the
-// top of the stack allocation itself (just above the outermost
-// frame). This is necessary because the regular memory allocator can
-// itself grow the stack, and hence can't be used when allocating
-// stack-related structures.
-//
-// For debugging, the runtime also supports installing stack barriers
-// at every frame. However, this requires significantly more
-// book-keeping space.
-//
-// Correctness
-// -----------
-//
-// The runtime and the compiler cooperate to ensure that all objects
-// reachable from the stack as of mark termination are marked.
-// Anything unchanged since the concurrent scan phase will be marked
-// because it is marked by the concurrent scan. After the concurrent
-// scan, there are three possible classes of stack modifications that
-// must be tracked:
-//
-// 1) Mutator writes below the lowest un-hit stack barrier. This
-// includes all writes performed by an executing function to its own
-// stack frame. This part of the stack will be re-scanned by mark
-// termination, which will mark any objects made reachable from
-// modifications to this part of the stack.
-//
-// 2) Mutator writes above the lowest un-hit stack barrier. It's
-// possible for a mutator to modify the stack above the lowest un-hit
-// stack barrier if a higher frame has passed down a pointer to a
-// stack variable in its frame. This is called an "up-pointer". The
-// compiler ensures that writes through up-pointers have an
-// accompanying write barrier (it simply doesn't distinguish between
-// writes through up-pointers and writes through heap pointers). This
-// write barrier marks any object made reachable from modifications to
-// this part of the stack.
-//
-// 3) Runtime writes to the stack. Various runtime operations such as
-// sends to unbuffered channels can write to arbitrary parts of the
-// stack, including above the lowest un-hit stack barrier. We solve
-// this in two ways. In many cases, the runtime can perform an
-// explicit write barrier operation like in case 2. However, in the
-// case of bulk memory move (typedmemmove), the runtime doesn't
-// necessarily have ready access to a pointer bitmap for the memory
-// being copied, so it simply unwinds any stack barriers below the
-// destination.
-//
-// Gotchas
-// -------
-//
-// Anything that inspects or manipulates the stack potentially needs
-// to understand stack barriers. The most obvious case is that
-// gentraceback needs to use the original return PC when it encounters
-// the stack barrier trampoline. Anything that unwinds the stack such
-// as panic/recover must unwind stack barriers in tandem with
-// unwinding the stack.
-//
-// Stack barriers require that any goroutine whose stack has been
-// scanned must execute write barriers. Go solves this by simply
-// enabling write barriers globally during the concurrent scan phase.
-// However, traditionally, write barriers are not enabled during this
-// phase.
-//
-// Synchronization
-// ---------------
-//
-// For the most part, accessing and modifying stack barriers is
-// synchronized around GC safe points. Installing stack barriers
-// forces the G to a safe point, while all other operations that
-// modify stack barriers run on the G and prevent it from reaching a
-// safe point.
-//
-// Subtlety arises when a G may be tracebacked when *not* at a safe
-// point. This happens during sigprof. For this, each G has a "stack
-// barrier lock" (see gcLockStackBarriers, gcUnlockStackBarriers).
-// Operations that manipulate stack barriers acquire this lock, while
-// sigprof tries to acquire it and simply skips the traceback if it
-// can't acquire it. There is one exception for performance and
-// complexity reasons: hitting a stack barrier manipulates the stack
-// barrier list without acquiring the stack barrier lock. For this,
-// gentraceback performs a special fix up if the traceback starts in
-// the stack barrier function.
-
-package runtime
-
-import (
- "runtime/internal/atomic"
- "runtime/internal/sys"
- "unsafe"
-)
-
-const debugStackBarrier = false
-
-// firstStackBarrierOffset is the approximate byte offset at
-// which to place the first stack barrier from the current SP.
-// This is a lower bound on how much stack will have to be
-// re-scanned during mark termination. Subsequent barriers are
-// placed at firstStackBarrierOffset * 2^n offsets.
-//
-// For debugging, this can be set to 0, which will install a
-// stack barrier at every frame. If you do this, you may also
-// have to raise _StackMin, since the stack barrier
-// bookkeeping will use a large amount of each stack.
-var firstStackBarrierOffset = 1024
-
-// gcMaxStackBarriers returns the maximum number of stack barriers
-// that can be installed in a stack of stackSize bytes.
-func gcMaxStackBarriers(stackSize int) (n int) {
- if debug.gcstackbarrieroff > 0 {
- return 0
- }
-
- if firstStackBarrierOffset == 0 {
- // Special debugging case for inserting stack barriers
- // at every frame. Steal half of the stack for the
- // []stkbar. Technically, if the stack were to consist
- // solely of return PCs we would need two thirds of
- // the stack, but stealing that much breaks things and
- // this doesn't happen in practice.
- return stackSize / 2 / int(unsafe.Sizeof(stkbar{}))
- }
-
- offset := firstStackBarrierOffset
- for offset < stackSize {
- n++
- offset *= 2
- }
- return n + 1
-}
-
-// gcInstallStackBarrier installs a stack barrier over the return PC of frame.
-//go:nowritebarrier
-func gcInstallStackBarrier(gp *g, frame *stkframe) bool {
- if frame.lr == 0 {
- if debugStackBarrier {
- print("not installing stack barrier with no LR, goid=", gp.goid, "\n")
- }
- return false
- }
-
- if frame.fn.entry == cgocallback_gofuncPC {
- // cgocallback_gofunc doesn't return to its LR;
- // instead, its return path puts LR in g.sched.pc and
- // switches back to the system stack on which
- // cgocallback_gofunc was originally called. We can't
- // have a stack barrier in g.sched.pc, so don't
- // install one in this frame.
- if debugStackBarrier {
- print("not installing stack barrier over LR of cgocallback_gofunc, goid=", gp.goid, "\n")
- }
- return false
- }
-
- // Save the return PC and overwrite it with stackBarrier.
- var lrUintptr uintptr
- if usesLR {
- lrUintptr = frame.sp
- } else {
- lrUintptr = frame.fp - sys.RegSize
- }
- lrPtr := (*sys.Uintreg)(unsafe.Pointer(lrUintptr))
- if debugStackBarrier {
- print("install stack barrier at ", hex(lrUintptr), " over ", hex(*lrPtr), ", goid=", gp.goid, "\n")
- if uintptr(*lrPtr) != frame.lr {
- print("frame.lr=", hex(frame.lr))
- throw("frame.lr differs from stack LR")
- }
- }
-
- gp.stkbar = gp.stkbar[:len(gp.stkbar)+1]
- stkbar := &gp.stkbar[len(gp.stkbar)-1]
- stkbar.savedLRPtr = lrUintptr
- stkbar.savedLRVal = uintptr(*lrPtr)
- *lrPtr = sys.Uintreg(stackBarrierPC)
- return true
-}
-
-// gcRemoveStackBarriers removes all stack barriers installed in gp's stack.
-//
-// gp's stack barriers must be locked.
-//
-//go:nowritebarrier
-func gcRemoveStackBarriers(gp *g) {
- if debugStackBarrier && gp.stkbarPos != 0 {
- print("hit ", gp.stkbarPos, " stack barriers, goid=", gp.goid, "\n")
- }
-
- // Remove stack barriers that we didn't hit.
- for _, stkbar := range gp.stkbar[gp.stkbarPos:] {
- gcRemoveStackBarrier(gp, stkbar)
- }
-
- // Clear recorded stack barriers so copystack doesn't try to
- // adjust them.
- gp.stkbarPos = 0
- gp.stkbar = gp.stkbar[:0]
-}
-
-// gcRemoveStackBarrier removes a single stack barrier. It is the
-// inverse operation of gcInstallStackBarrier.
-//
-// This is nosplit to ensure gp's stack does not move.
-//
-//go:nowritebarrier
-//go:nosplit
-func gcRemoveStackBarrier(gp *g, stkbar stkbar) {
- if debugStackBarrier {
- print("remove stack barrier at ", hex(stkbar.savedLRPtr), " with ", hex(stkbar.savedLRVal), ", goid=", gp.goid, "\n")
- }
- lrPtr := (*sys.Uintreg)(unsafe.Pointer(stkbar.savedLRPtr))
- if val := *lrPtr; val != sys.Uintreg(stackBarrierPC) {
- printlock()
- print("at *", hex(stkbar.savedLRPtr), " expected stack barrier PC ", hex(stackBarrierPC), ", found ", hex(val), ", goid=", gp.goid, "\n")
- print("gp.stkbar=")
- gcPrintStkbars(gp, -1)
- print(", gp.stack=[", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n")
- throw("stack barrier lost")
- }
- *lrPtr = sys.Uintreg(stkbar.savedLRVal)
-}
-
-// gcTryRemoveAllStackBarriers tries to remove stack barriers from all
-// Gs in gps. It is best-effort and efficient. If it can't remove
-// barriers from a G immediately, it will simply skip it.
-func gcTryRemoveAllStackBarriers(gps []*g) {
- for _, gp := range gps {
- retry:
- for {
- switch s := readgstatus(gp); s {
- default:
- break retry
-
- case _Grunnable, _Gsyscall, _Gwaiting:
- if !castogscanstatus(gp, s, s|_Gscan) {
- continue
- }
- gcLockStackBarriers(gp)
- gcRemoveStackBarriers(gp)
- gcUnlockStackBarriers(gp)
- restartg(gp)
- break retry
- }
- }
- }
-}
-
-// gcPrintStkbars prints the stack barriers of gp for debugging. It
-// places a "@@@" marker at gp.stkbarPos. If marker >= 0, it will also
-// place a "==>" marker before the marker'th entry.
-func gcPrintStkbars(gp *g, marker int) {
- print("[")
- for i, s := range gp.stkbar {
- if i > 0 {
- print(" ")
- }
- if i == int(gp.stkbarPos) {
- print("@@@ ")
- }
- if i == marker {
- print("==> ")
- }
- print("*", hex(s.savedLRPtr), "=", hex(s.savedLRVal))
- }
- if int(gp.stkbarPos) == len(gp.stkbar) {
- print(" @@@")
- }
- if marker == len(gp.stkbar) {
- print(" ==>")
- }
- print("]")
-}
-
-// gcUnwindBarriers marks all stack barriers up the frame containing
-// sp as hit and removes them. This is used during stack unwinding for
-// panic/recover and by heapBitsBulkBarrier to force stack re-scanning
-// when its destination is on the stack.
-//
-// This is nosplit to ensure gp's stack does not move.
-//
-//go:nosplit
-func gcUnwindBarriers(gp *g, sp uintptr) {
- gcLockStackBarriers(gp)
- // On LR machines, if there is a stack barrier on the return
- // from the frame containing sp, this will mark it as hit even
- // though it isn't, but it's okay to be conservative.
- before := gp.stkbarPos
- for int(gp.stkbarPos) < len(gp.stkbar) && gp.stkbar[gp.stkbarPos].savedLRPtr < sp {
- gcRemoveStackBarrier(gp, gp.stkbar[gp.stkbarPos])
- gp.stkbarPos++
- }
- gcUnlockStackBarriers(gp)
- if debugStackBarrier && gp.stkbarPos != before {
- print("skip barriers below ", hex(sp), " in goid=", gp.goid, ": ")
- // We skipped barriers between the "==>" marker
- // (before) and the "@@@" marker (gp.stkbarPos).
- gcPrintStkbars(gp, int(before))
- print("\n")
- }
-}
-
-// nextBarrierPC returns the original return PC of the next stack barrier.
-// Used by getcallerpc, so it must be nosplit.
-//go:nosplit
-func nextBarrierPC() uintptr {
- gp := getg()
- return gp.stkbar[gp.stkbarPos].savedLRVal
-}
-
-// setNextBarrierPC sets the return PC of the next stack barrier.
-// Used by setcallerpc, so it must be nosplit.
-//go:nosplit
-func setNextBarrierPC(pc uintptr) {
- gp := getg()
- gcLockStackBarriers(gp)
- gp.stkbar[gp.stkbarPos].savedLRVal = pc
- gcUnlockStackBarriers(gp)
-}
-
-// gcLockStackBarriers synchronizes with tracebacks of gp's stack
-// during sigprof for installation or removal of stack barriers. It
-// blocks until any current sigprof is done tracebacking gp's stack
-// and then disallows profiling tracebacks of gp's stack.
-//
-// This is necessary because a sigprof during barrier installation or
-// removal could observe inconsistencies between the stkbar array and
-// the stack itself and crash.
-//
-//go:nosplit
-func gcLockStackBarriers(gp *g) {
- // Disable preemption so scanstack cannot run while the caller
- // is manipulating the stack barriers.
- acquirem()
- for !atomic.Cas(&gp.stackLock, 0, 1) {
- osyield()
- }
-}
-
-//go:nosplit
-func gcTryLockStackBarriers(gp *g) bool {
- mp := acquirem()
- result := atomic.Cas(&gp.stackLock, 0, 1)
- if !result {
- releasem(mp)
- }
- return result
-}
-
-func gcUnlockStackBarriers(gp *g) {
- atomic.Store(&gp.stackLock, 0)
- releasem(getg().m)
-}
diff --git a/src/runtime/net_plan9.go b/src/runtime/net_plan9.go
index 10fd089..b1ac7c7 100644
--- a/src/runtime/net_plan9.go
+++ b/src/runtime/net_plan9.go
@@ -8,12 +8,12 @@
_ "unsafe"
)
-//go:linkname runtime_ignoreHangup net.runtime_ignoreHangup
+//go:linkname runtime_ignoreHangup internal/poll.runtime_ignoreHangup
func runtime_ignoreHangup() {
getg().m.ignoreHangup = true
}
-//go:linkname runtime_unignoreHangup net.runtime_unignoreHangup
+//go:linkname runtime_unignoreHangup internal/poll.runtime_unignoreHangup
func runtime_unignoreHangup(sig string) {
getg().m.ignoreHangup = false
}
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 10a3c88..8dd4fb6 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -77,12 +77,13 @@
}
var (
- netpollInited uint32
- pollcache pollCache
+ netpollInited uint32
+ pollcache pollCache
+ netpollWaiters uint32
)
-//go:linkname net_runtime_pollServerInit net.runtime_pollServerInit
-func net_runtime_pollServerInit() {
+//go:linkname poll_runtime_pollServerInit internal/poll.runtime_pollServerInit
+func poll_runtime_pollServerInit() {
netpollinit()
atomic.Store(&netpollInited, 1)
}
@@ -91,15 +92,23 @@
return atomic.Load(&netpollInited) != 0
}
-//go:linkname net_runtime_pollOpen net.runtime_pollOpen
-func net_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
+//go:linkname poll_runtime_pollServerDescriptor internal/poll.runtime_pollServerDescriptor
+
+// poll_runtime_pollServerDescriptor returns the descriptor being used,
+// or ^uintptr(0) if the system does not use a poll descriptor.
+func poll_runtime_pollServerDescriptor() uintptr {
+ return netpolldescriptor()
+}
+
+//go:linkname poll_runtime_pollOpen internal/poll.runtime_pollOpen
+func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
pd := pollcache.alloc()
lock(&pd.lock)
if pd.wg != 0 && pd.wg != pdReady {
- throw("netpollOpen: blocked write on free descriptor")
+ throw("runtime: blocked write on free polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
- throw("netpollOpen: blocked read on free descriptor")
+ throw("runtime: blocked read on free polldesc")
}
pd.fd = fd
pd.closing = false
@@ -115,16 +124,16 @@
return pd, int(errno)
}
-//go:linkname net_runtime_pollClose net.runtime_pollClose
-func net_runtime_pollClose(pd *pollDesc) {
+//go:linkname poll_runtime_pollClose internal/poll.runtime_pollClose
+func poll_runtime_pollClose(pd *pollDesc) {
if !pd.closing {
- throw("netpollClose: close w/o unblock")
+ throw("runtime: close polldesc w/o unblock")
}
if pd.wg != 0 && pd.wg != pdReady {
- throw("netpollClose: blocked write on closing descriptor")
+ throw("runtime: blocked write on closing polldesc")
}
if pd.rg != 0 && pd.rg != pdReady {
- throw("netpollClose: blocked read on closing descriptor")
+ throw("runtime: blocked read on closing polldesc")
}
netpollclose(pd.fd)
pollcache.free(pd)
@@ -137,8 +146,8 @@
unlock(&c.lock)
}
-//go:linkname net_runtime_pollReset net.runtime_pollReset
-func net_runtime_pollReset(pd *pollDesc, mode int) int {
+//go:linkname poll_runtime_pollReset internal/poll.runtime_pollReset
+func poll_runtime_pollReset(pd *pollDesc, mode int) int {
err := netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
@@ -151,8 +160,8 @@
return 0
}
-//go:linkname net_runtime_pollWait net.runtime_pollWait
-func net_runtime_pollWait(pd *pollDesc, mode int) int {
+//go:linkname poll_runtime_pollWait internal/poll.runtime_pollWait
+func poll_runtime_pollWait(pd *pollDesc, mode int) int {
err := netpollcheckerr(pd, int32(mode))
if err != 0 {
return err
@@ -173,16 +182,16 @@
return 0
}
-//go:linkname net_runtime_pollWaitCanceled net.runtime_pollWaitCanceled
-func net_runtime_pollWaitCanceled(pd *pollDesc, mode int) {
+//go:linkname poll_runtime_pollWaitCanceled internal/poll.runtime_pollWaitCanceled
+func poll_runtime_pollWaitCanceled(pd *pollDesc, mode int) {
// This function is used only on windows after a failed attempt to cancel
// a pending async IO operation. Wait for ioready, ignore closing or timeouts.
for !netpollblock(pd, int32(mode), true) {
}
}
-//go:linkname net_runtime_pollSetDeadline net.runtime_pollSetDeadline
-func net_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
+//go:linkname poll_runtime_pollSetDeadline internal/poll.runtime_pollSetDeadline
+func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
lock(&pd.lock)
if pd.closing {
unlock(&pd.lock)
@@ -244,18 +253,18 @@
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 3)
+ netpollgoready(rg, 3)
}
if wg != nil {
- goready(wg, 3)
+ netpollgoready(wg, 3)
}
}
-//go:linkname net_runtime_pollUnblock net.runtime_pollUnblock
-func net_runtime_pollUnblock(pd *pollDesc) {
+//go:linkname poll_runtime_pollUnblock internal/poll.runtime_pollUnblock
+func poll_runtime_pollUnblock(pd *pollDesc) {
lock(&pd.lock)
if pd.closing {
- throw("netpollUnblock: already closing")
+ throw("runtime: unblock on closing polldesc")
}
pd.closing = true
pd.seq++
@@ -273,10 +282,10 @@
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 3)
+ netpollgoready(rg, 3)
}
if wg != nil {
- goready(wg, 3)
+ netpollgoready(wg, 3)
}
}
@@ -312,7 +321,19 @@
}
func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
- return atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+ r := atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+ if r {
+ // Bump the count of goroutines waiting for the poller.
+ // The scheduler uses this to decide whether to block
+ // waiting for the poller if there is nothing else to do.
+ atomic.Xadd(&netpollWaiters, 1)
+ }
+ return r
+}
+
+func netpollgoready(gp *g, traceskip int) {
+ atomic.Xadd(&netpollWaiters, -1)
+ goready(gp, traceskip+1)
}
// returns true if IO is ready, or false if timed out or closed
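
netpollWaiters above is a simple park/ready counter: it is bumped only once a goroutine has successfully committed to blocking on the poller and dropped just before such a goroutine is made runnable again, so a nonzero value tells the scheduler that polling may produce runnable work. A standalone sketch of the same counting pattern (not the runtime's scheduler logic):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    var waiters uint32 // analogous to netpollWaiters

    func park()  { atomic.AddUint32(&waiters, 1) }          // like netpollblockcommit succeeding
    func ready() { atomic.AddUint32(&waiters, ^uint32(0)) } // like netpollgoready (adds -1)

    // pollerHasWaiters is the question the scheduler asks: is anyone
    // actually parked on the poller right now?
    func pollerHasWaiters() bool { return atomic.LoadUint32(&waiters) > 0 }

    func main() {
        park()
        fmt.Println(pollerHasWaiters()) // true
        ready()
        fmt.Println(pollerHasWaiters()) // false
    }
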
@@ -331,7 +352,7 @@
return true
}
if old != 0 {
- throw("netpollblock: double wait")
+ throw("runtime: double wait")
}
if atomic.Casuintptr(gpp, 0, pdWait) {
break
@@ -347,7 +368,7 @@
// be careful to not lose concurrent READY notification
old := atomic.Xchguintptr(gpp, 0)
if old > pdWait {
- throw("netpollblock: corrupted state")
+ throw("runtime: corrupted polldesc")
}
return old == pdReady
}
@@ -393,7 +414,7 @@
var rg *g
if read {
if pd.rd <= 0 || pd.rt.f == nil {
- throw("netpolldeadlineimpl: inconsistent read deadline")
+ throw("runtime: inconsistent read deadline")
}
pd.rd = -1
atomicstorep(unsafe.Pointer(&pd.rt.f), nil) // full memory barrier between store to rd and load of rg in netpollunblock
@@ -402,7 +423,7 @@
var wg *g
if write {
if pd.wd <= 0 || pd.wt.f == nil && !read {
- throw("netpolldeadlineimpl: inconsistent write deadline")
+ throw("runtime: inconsistent write deadline")
}
pd.wd = -1
atomicstorep(unsafe.Pointer(&pd.wt.f), nil) // full memory barrier between store to wd and load of wg in netpollunblock
@@ -410,10 +431,10 @@
}
unlock(&pd.lock)
if rg != nil {
- goready(rg, 0)
+ netpollgoready(rg, 0)
}
if wg != nil {
- goready(wg, 0)
+ netpollgoready(wg, 0)
}
}
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index e06eff8..1908220 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -32,8 +32,12 @@
closeonexec(epfd)
return
}
- println("netpollinit: failed to create epoll descriptor", -epfd)
- throw("netpollinit: failed to create descriptor")
+ println("runtime: epollcreate failed with", -epfd)
+ throw("runtime: netpollinit failed")
+}
+
+func netpolldescriptor() uintptr {
+ return uintptr(epfd)
}
func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -49,7 +53,7 @@
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// polls for ready network connections
@@ -68,7 +72,7 @@
if n < 0 {
if n != -_EINTR {
println("runtime: epollwait on fd", epfd, "failed with", -n)
- throw("epollwait failed")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index 337377a..71de98b 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -23,12 +23,16 @@
func netpollinit() {
kq = kqueue()
if kq < 0 {
- println("netpollinit: kqueue failed with", -kq)
- throw("netpollinit: kqueue failed")
+ println("runtime: kqueue failed with", -kq)
+ throw("runtime: netpollinit failed")
}
closeonexec(kq)
}
+func netpolldescriptor() uintptr {
+ return uintptr(kq)
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
// Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR)
// for the whole fd lifetime. The notifications are automatically unregistered
@@ -56,7 +60,7 @@
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// Polls for ready network connections.
@@ -76,7 +80,7 @@
if n < 0 {
if n != -_EINTR {
println("runtime: kevent on fd", kq, "failed with", -n)
- throw("kevent failed")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/src/runtime/netpoll_nacl.go b/src/runtime/netpoll_nacl.go
index 5cbc300..dc5a55e 100644
--- a/src/runtime/netpoll_nacl.go
+++ b/src/runtime/netpoll_nacl.go
@@ -10,6 +10,10 @@
func netpollinit() {
}
+func netpolldescriptor() uintptr {
+ return ^uintptr(0)
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
return 0
}
diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go
index 53b2aac..853e5f6 100644
--- a/src/runtime/netpoll_solaris.go
+++ b/src/runtime/netpoll_solaris.go
@@ -117,8 +117,12 @@
return
}
- print("netpollinit: failed to create port (", errno(), ")\n")
- throw("netpollinit: failed to create port")
+ print("runtime: port_create failed (errno=", errno(), ")\n")
+ throw("runtime: netpollinit failed")
+}
+
+func netpolldescriptor() uintptr {
+ return uintptr(portfd)
}
func netpollopen(fd uintptr, pd *pollDesc) int32 {
@@ -154,8 +158,8 @@
}
if events != 0 && port_associate(portfd, _PORT_SOURCE_FD, pd.fd, events, uintptr(unsafe.Pointer(pd))) != 0 {
- print("netpollupdate: failed to associate (", errno(), ")\n")
- throw("netpollupdate: failed to associate")
+ print("runtime: port_associate failed (errno=", errno(), ")\n")
+ throw("runtime: netpollupdate failed")
}
pd.user = events
}
@@ -169,7 +173,7 @@
case 'w':
netpollupdate(pd, _POLLOUT, 0)
default:
- throw("netpollarm: bad mode")
+ throw("runtime: bad mode")
}
unlock(&pd.lock)
}
@@ -192,8 +196,8 @@
var n uint32 = 1
if port_getn(portfd, &events[0], uint32(len(events)), &n, wait) < 0 {
if e := errno(); e != _EINTR {
- print("runtime: port_getn on fd ", portfd, " failed with ", e, "\n")
- throw("port_getn failed")
+ print("runtime: port_getn on fd ", portfd, " failed (errno=", e, ")\n")
+ throw("runtime: netpoll failed")
}
goto retry
}
diff --git a/src/runtime/netpoll_stub.go b/src/runtime/netpoll_stub.go
index 09f64ad..a4d6b46 100644
--- a/src/runtime/netpoll_stub.go
+++ b/src/runtime/netpoll_stub.go
@@ -6,6 +6,8 @@
package runtime
+var netpollWaiters uint32
+
// Polls for ready network connections.
// Returns list of goroutines that become runnable.
func netpoll(block bool) (gp *g) {
diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go
index 7ad1158..79dafb0 100644
--- a/src/runtime/netpoll_windows.go
+++ b/src/runtime/netpoll_windows.go
@@ -12,7 +12,8 @@
const _INVALID_HANDLE_VALUE = ^uintptr(0)
-// net_op must be the same as beginning of net.operation. Keep these in sync.
+// net_op must be the same as beginning of internal/poll.operation.
+// Keep these in sync.
type net_op struct {
// used by windows
o overlapped
@@ -35,11 +36,15 @@
func netpollinit() {
iocphandle = stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX)
if iocphandle == 0 {
- println("netpoll: failed to create iocp handle (errno=", getlasterror(), ")")
- throw("netpoll: failed to create iocp handle")
+ println("runtime: CreateIoCompletionPort failed (errno=", getlasterror(), ")")
+ throw("runtime: netpollinit failed")
}
}
+func netpolldescriptor() uintptr {
+ return iocphandle
+}
+
func netpollopen(fd uintptr, pd *pollDesc) int32 {
if stdcall4(_CreateIoCompletionPort, fd, iocphandle, 0, 0) == 0 {
return -int32(getlasterror())
@@ -53,7 +58,7 @@
}
func netpollarm(pd *pollDesc, mode int) {
- throw("unused")
+ throw("runtime: unused")
}
// Polls for completed network IO.
@@ -89,8 +94,8 @@
if !block && errno == _WAIT_TIMEOUT {
return nil
}
- println("netpoll: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
- throw("netpoll: GetQueuedCompletionStatusEx failed")
+ println("runtime: GetQueuedCompletionStatusEx failed (errno=", errno, ")")
+ throw("runtime: netpoll failed")
}
mp.blocked = false
for i = 0; i < n; i++ {
@@ -116,8 +121,8 @@
return nil
}
if op == nil {
- println("netpoll: GetQueuedCompletionStatus failed (errno=", errno, ")")
- throw("netpoll: GetQueuedCompletionStatus failed")
+ println("runtime: GetQueuedCompletionStatus failed (errno=", errno, ")")
+ throw("runtime: netpoll failed")
}
// dequeued failed IO packet, so report that
}
@@ -132,12 +137,13 @@
func handlecompletion(gpp *guintptr, op *net_op, errno int32, qty uint32) {
if op == nil {
- throw("netpoll: GetQueuedCompletionStatus returned op == nil")
+ println("runtime: GetQueuedCompletionStatus returned op == nil")
+ throw("runtime: netpoll failed")
}
mode := op.mode
if mode != 'r' && mode != 'w' {
- println("netpoll: GetQueuedCompletionStatus returned invalid mode=", mode)
- throw("netpoll: GetQueuedCompletionStatus returned invalid mode")
+ println("runtime: GetQueuedCompletionStatus returned invalid mode=", mode)
+ throw("runtime: netpoll failed")
}
op.errno = errno
op.qty = qty
diff --git a/src/runtime/numcpu_freebsd_test.go b/src/runtime/numcpu_freebsd_test.go
new file mode 100644
index 0000000..e78890a
--- /dev/null
+++ b/src/runtime/numcpu_freebsd_test.go
@@ -0,0 +1,15 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import "testing"
+
+func TestFreeBSDNumCPU(t *testing.T) {
+ got := runTestProg(t, "testprog", "FreeBSDNumCPU")
+ want := "OK\n"
+ if got != want {
+ t.Fatalf("expected %q, but got:\n%s", want, got)
+ }
+}
diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go
index 26b4acd..5d4b5a6 100644
--- a/src/runtime/os3_plan9.go
+++ b/src/runtime/os3_plan9.go
@@ -62,7 +62,7 @@
// but we do recognize the top pointer on the stack as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(*(*uintptr)(unsafe.Pointer(sp))) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(*(*uintptr)(unsafe.Pointer(sp))).valid() {
pc = 0
}
@@ -146,7 +146,10 @@
func sigignore(sig uint32) {
}
-func resetcpuprofiler(hz int32) {
+func setProcessCPUProfiler(hz int32) {
+}
+
+func setThreadCPUProfiler(hz int32) {
// TODO: Enable profiling interrupts.
getg().m.profilehz = hz
}
diff --git a/src/runtime/os_darwin_arm.go b/src/runtime/os_darwin_arm.go
index ee1bd17..8eb5655 100644
--- a/src/runtime/os_darwin_arm.go
+++ b/src/runtime/os_darwin_arm.go
@@ -4,6 +4,8 @@
package runtime
+var hardDiv bool // TODO: set if a hardware divider is available
+
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.
diff --git a/src/runtime/os_darwin_arm64.go b/src/runtime/os_darwin_arm64.go
index 8de132d..01285af 100644
--- a/src/runtime/os_darwin_arm64.go
+++ b/src/runtime/os_darwin_arm64.go
@@ -4,6 +4,8 @@
package runtime
+var supportCRC32 = false
+
//go:nosplit
func cputicks() int64 {
// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go
index 35ed026..7c989de 100644
--- a/src/runtime/os_freebsd.go
+++ b/src/runtime/os_freebsd.go
@@ -42,21 +42,87 @@
// From FreeBSD's <sys/sysctl.h>
const (
_CTL_HW = 6
- _HW_NCPU = 3
_HW_PAGESIZE = 7
)
var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-func getncpu() int32 {
- mib := [2]uint32{_CTL_HW, _HW_NCPU}
- out := uint32(0)
- nout := unsafe.Sizeof(out)
- ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
- if ret >= 0 {
- return int32(out)
+// Undocumented numbers from FreeBSD's lib/libc/gen/sysctlnametomib.c.
+const (
+ _CTL_QUERY = 0
+ _CTL_QUERY_MIB = 3
+)
+
+// sysctlnametomib fills mib with the dynamically assigned sysctl entries for name.
+// It returns the count of mib slots filled, or 0 on error.
+func sysctlnametomib(name []byte, mib *[_CTL_MAXNAME]uint32) uint32 {
+ oid := [2]uint32{_CTL_QUERY, _CTL_QUERY_MIB}
+ miblen := uintptr(_CTL_MAXNAME)
+ if sysctl(&oid[0], 2, (*byte)(unsafe.Pointer(mib)), &miblen, (*byte)(unsafe.Pointer(&name[0])), (uintptr)(len(name))) < 0 {
+ return 0
}
- return 1
+ miblen /= unsafe.Sizeof(uint32(0))
+ if miblen <= 0 {
+ return 0
+ }
+ return uint32(miblen)
+}
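
sysctlnametomib exists because the runtime issues raw syscalls and cannot go through libc's sysctlbyname. In ordinary user code the same kern.smp.maxcpus query is a single call via golang.org/x/sys/unix; a FreeBSD-only sketch for comparison (error handling kept minimal):

    package main

    import (
        "fmt"

        "golang.org/x/sys/unix"
    )

    func main() {
        // User-space equivalent of the kern.smp.maxcpus lookup done in getncpu below.
        maxcpus, err := unix.SysctlUint32("kern.smp.maxcpus")
        if err != nil {
            fmt.Println("sysctl failed:", err)
            return
        }
        fmt.Println("kern.smp.maxcpus =", maxcpus)
    }
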
+
+const (
+ _CPU_SETSIZE_MAX = 32 // Limited by _MaxGomaxprocs(256) in runtime2.go.
+ _CPU_CURRENT_PID = -1 // Current process ID.
+)
+
+//go:noescape
+func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+
+func getncpu() int32 {
+ var mask [_CPU_SETSIZE_MAX]byte
+ var mib [_CTL_MAXNAME]uint32
+
+ // According to FreeBSD's /usr/src/sys/kern/kern_cpuset.c,
+ // cpuset_getaffinity returns ERANGE when the provided buffer size exceeds the kernel's limit.
+ // Query kern.smp.maxcpus to calculate the maximum buffer size.
+ // See https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200802
+
+ // The kern.smp.maxcpus variable was introduced on Dec 23 2003, in revision 123766,
+ // as a dynamically assigned sysctl entry.
+ miblen := sysctlnametomib([]byte("kern.smp.maxcpus"), &mib)
+ if miblen == 0 {
+ return 1
+ }
+
+ // Query kern.smp.maxcpus.
+ dstsize := uintptr(4)
+ maxcpus := uint32(0)
+ if sysctl(&mib[0], miblen, (*byte)(unsafe.Pointer(&maxcpus)), &dstsize, nil, 0) != 0 {
+ return 1
+ }
+
+ size := maxcpus / _NBBY
+ ptrsize := uint32(unsafe.Sizeof(uintptr(0)))
+ if size < ptrsize {
+ size = ptrsize
+ }
+ if size > _CPU_SETSIZE_MAX {
+ return 1
+ }
+
+ if cpuset_getaffinity(_CPU_LEVEL_WHICH, _CPU_WHICH_PID, _CPU_CURRENT_PID,
+ int(size), (*byte)(unsafe.Pointer(&mask[0]))) != 0 {
+ return 1
+ }
+ n := int32(0)
+ for _, v := range mask[:size] {
+ for v != 0 {
+ n += int32(v & 1)
+ v >>= 1
+ }
+ }
+ if n == 0 {
+ return 1
+ }
+ return n
}
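
The bit-counting loop at the end of getncpu is a population count over the affinity mask. Outside the runtime the same count would normally come from math/bits; a sketch with a made-up mask:

    package main

    import (
        "fmt"
        "math/bits"
    )

    func main() {
        // Example affinity mask: CPUs 0, 1, 2 and 4 available (0b00010111).
        mask := []byte{0x17, 0x00, 0x00, 0x00}
        n := 0
        for _, b := range mask {
            n += bits.OnesCount8(b)
        }
        fmt.Println(n) // 4
    }
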
func getPageSize() uintptr {
@@ -174,6 +240,20 @@
_g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid)))
}
+ // On FreeBSD before about April 2017 there was a bug such
+ // that calling execve from a thread other than the main
+ // thread did not reset the signal stack. That would confuse
+ // minitSignals, which calls minitSignalStack, which checks
+ // whether there is currently a signal stack and uses it if
+ // present. To avoid this confusion, explicitly disable the
+ // signal stack on the main thread when not running in a
+ // library. This can be removed when we are confident that all
+ // FreeBSD users are running a patched kernel. See issue #15658.
+ if gp := getg(); !isarchive && !islibrary && gp.m == &m0 && gp == gp.m.g0 {
+ st := stackt{ss_flags: _SS_DISABLE}
+ sigaltstack(&st, nil)
+ }
+
minitSignals()
}
diff --git a/src/runtime/os_freebsd_arm.go b/src/runtime/os_freebsd_arm.go
index 0399499..6e2bc97 100644
--- a/src/runtime/os_freebsd_arm.go
+++ b/src/runtime/os_freebsd_arm.go
@@ -4,6 +4,8 @@
package runtime
+var hardDiv bool // TODO: set if a hardware divider is available
+
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index a6efc0e..7889973 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -132,6 +132,7 @@
_CLONE_FS | /* share cwd, etc */
_CLONE_FILES | /* share fd table */
_CLONE_SIGHAND | /* share sig handler table */
+ _CLONE_SYSVSEM | /* share SysV semaphore undo lists (see issue #20763) */
_CLONE_THREAD /* revisit - okay for now */
)
diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go
index 896ec15..7c925d7 100644
--- a/src/runtime/os_linux_arm.go
+++ b/src/runtime/os_linux_arm.go
@@ -11,11 +11,13 @@
_HWCAP_VFP = 1 << 6 // introduced in at least 2.6.11
_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
+ _HWCAP_IDIVA = 1 << 17
)
var randomNumber uint32
var armArch uint8 = 6 // we default to ARMv6
var hwcap uint32 // set by setup_auxv
+var hardDiv bool // set if a hardware divider is available
func checkgoarm() {
// On Android, /proc/self/auxv might be unreadable and hwcap won't
@@ -53,6 +55,7 @@
case _AT_HWCAP: // CPU capability bit flags
hwcap = uint32(val)
+ hardDiv = (hwcap & _HWCAP_IDIVA) != 0
}
}
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index bdc341d..986a341 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -4,7 +4,12 @@
package runtime
+const (
+ _ARM64_FEATURE_HAS_CRC32 = 0x80
+)
+
var randomNumber uint32
+var supportCRC32 bool
func archauxv(tag, val uintptr) {
switch tag {
@@ -14,6 +19,8 @@
// it as a byte array.
randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+ case _AT_HWCAP:
+ supportCRC32 = val&_ARM64_FEATURE_HAS_CRC32 != 0
}
}
diff --git a/src/runtime/os_nacl.go b/src/runtime/os_nacl.go
index 7015316..18e6ce6 100644
--- a/src/runtime/os_nacl.go
+++ b/src/runtime/os_nacl.go
@@ -88,6 +88,11 @@
}
//go:nosplit
+//go:nowritebarrierrec
+func clearSignalHandlers() {
+}
+
+//go:nosplit
func sigblock() {
}
@@ -273,7 +278,8 @@
func madvise(addr unsafe.Pointer, n uintptr, flags int32) {}
func munmap(addr unsafe.Pointer, n uintptr) {}
-func resetcpuprofiler(hz int32) {}
+func setProcessCPUProfiler(hz int32) {}
+func setThreadCPUProfiler(hz int32) {}
func sigdisable(uint32) {}
func sigenable(uint32) {}
func sigignore(uint32) {}
diff --git a/src/runtime/os_nacl_arm.go b/src/runtime/os_nacl_arm.go
index 8669ee7..c64ebf3 100644
--- a/src/runtime/os_nacl_arm.go
+++ b/src/runtime/os_nacl_arm.go
@@ -4,6 +4,8 @@
package runtime
+var hardDiv bool // TODO: set if a hardware divider is available
+
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index c79b50b..c26c3c9 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -167,13 +167,23 @@
var uc ucontextt
getcontext(unsafe.Pointer(&uc))
+ // _UC_SIGMASK does not seem to work here.
+ // It would be nice if _UC_SIGMASK and _UC_STACK
+ // worked so that we could do all the work setting
+ // the sigmask and the stack here, instead of setting
+ // the mask here and the stack in netbsdMstart.
+ // For now do the blocking manually.
uc.uc_flags = _UC_SIGMASK | _UC_CPU
uc.uc_link = nil
uc.uc_sigmask = sigset_all
+ var oset sigset
+ sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+
lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, funcPC(netbsdMstart))
ret := lwp_create(unsafe.Pointer(&uc), 0, unsafe.Pointer(&mp.procid))
+ sigprocmask(_SIG_SETMASK, &oset, nil)
if ret < 0 {
print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
if ret == -_EAGAIN {
diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go
index 95603da..b02e36a 100644
--- a/src/runtime/os_netbsd_arm.go
+++ b/src/runtime/os_netbsd_arm.go
@@ -6,6 +6,8 @@
import "unsafe"
+var hardDiv bool // TODO: set if a hardware divider is available
+
func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) {
// Machine dependent mcontext initialisation for LWP.
mc.__gregs[_REG_R15] = uint32(funcPC(lwp_tramp))
diff --git a/src/runtime/os_openbsd_arm.go b/src/runtime/os_openbsd_arm.go
index be2e1e9..c318578 100644
--- a/src/runtime/os_openbsd_arm.go
+++ b/src/runtime/os_openbsd_arm.go
@@ -4,6 +4,8 @@
package runtime
+var hardDiv bool // TODO: set if a hardware divider is available
+
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index ba2d5c5..45e881a 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -173,6 +173,11 @@
func msigrestore(sigmask sigset) {
}
+//go:nosplit
+//go:nowritebarrierrec
+func clearSignalHandlers() {
+}
+
func sigblock() {
}
diff --git a/src/runtime/os_plan9_arm.go b/src/runtime/os_plan9_arm.go
index fdce1e7..1ce0141 100644
--- a/src/runtime/os_plan9_arm.go
+++ b/src/runtime/os_plan9_arm.go
@@ -4,6 +4,8 @@
package runtime
+var hardDiv bool // TODO: set if a hardware divider is available
+
func checkgoarm() {
return // TODO(minux)
}
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 75b8acd..233cc16 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -50,6 +50,7 @@
//go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll"
//go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
//go:cgo_import_dynamic runtime._timeBeginPeriod timeBeginPeriod%1 "winmm.dll"
+//go:cgo_import_dynamic runtime._timeEndPeriod timeEndPeriod%1 "winmm.dll"
type stdFunction unsafe.Pointer
@@ -73,9 +74,12 @@
_GetQueuedCompletionStatus,
_GetStdHandle,
_GetSystemInfo,
+ _GetSystemTimeAsFileTime,
_GetThreadContext,
_LoadLibraryW,
_LoadLibraryA,
+ _QueryPerformanceCounter,
+ _QueryPerformanceFrequency,
_ResumeThread,
_SetConsoleCtrlHandler,
_SetErrorMode,
@@ -93,6 +97,7 @@
_WriteConsoleW,
_WriteFile,
_timeBeginPeriod,
+ _timeEndPeriod,
_ stdFunction
// Following syscalls are only available on some Windows PCs.
@@ -188,6 +193,11 @@
throw("ntdll.dll not found")
}
_NtWaitForSingleObject = windowsFindfunc(n32, []byte("NtWaitForSingleObject\000"))
+
+ if windowsFindfunc(n32, []byte("wine_get_version\000")) != nil {
+ // running on Wine
+ initWine(k32)
+ }
}
//go:nosplit
@@ -260,6 +270,27 @@
var timeBeginPeriodRetValue uint32
+// osRelaxMinNS indicates that sysmon shouldn't osRelax if the next
+// timer is less than 60 ms from now. Since osRelaxing may reduce
+// timer resolution to 15.6 ms, this keeps timer error under roughly 1
+// part in 4.
+const osRelaxMinNS = 60 * 1e6
+
+// osRelax is called by the scheduler when transitioning to and from
+// all Ps being idle.
+//
+// On Windows, it adjusts the system-wide timer resolution. Go needs a
+// high resolution timer while running and there's little extra cost
+// if we're already using the CPU, but if all Ps are idle there's no
+// need to consume extra power to drive the high-res timer.
+func osRelax(relax bool) uint32 {
+ if relax {
+ return uint32(stdcall1(_timeEndPeriod, 1))
+ } else {
+ return uint32(stdcall1(_timeBeginPeriod, 1))
+ }
+}
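The policy described in the comment above amounts to a simple threshold check. A minimal sketch, not the actual sysmon code; nowNS and nextTimerNS are assumed inputs:

func maybeRelax(nowNS, nextTimerNS int64) {
	if nextTimerNS-nowNS < osRelaxMinNS {
		// A timer fires within 60 ms: keep (or restore) the 1 ms resolution.
		osRelax(false)
	} else {
		// No imminent timer and all Ps are idle: allow the default ~15.6 ms tick.
		osRelax(true)
	}
}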
+
func osinit() {
asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
usleep2Addr = unsafe.Pointer(funcPC(usleep2))
@@ -279,7 +310,7 @@
stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
- timeBeginPeriodRetValue = uint32(stdcall1(_timeBeginPeriod, 1))
+ timeBeginPeriodRetValue = osRelax(false)
ncpu = getproccount()
@@ -292,6 +323,79 @@
stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
}
+func nanotime() int64
+
+// useQPCTime controls whether time.now and nanotime use QueryPerformanceCounter.
+// This is only set to 1 when running under Wine.
+var useQPCTime uint8
+
+var qpcStartCounter int64
+var qpcMultiplier int64
+
+//go:nosplit
+func nanotimeQPC() int64 {
+ var counter int64 = 0
+ stdcall1(_QueryPerformanceCounter, uintptr(unsafe.Pointer(&counter)))
+
+ // returns number of nanoseconds
+ return (counter - qpcStartCounter) * qpcMultiplier
+}
+
+//go:nosplit
+func nowQPC() (sec int64, nsec int32, mono int64) {
+ var ft int64
+ stdcall1(_GetSystemTimeAsFileTime, uintptr(unsafe.Pointer(&ft)))
+
+ t := (ft - 116444736000000000) * 100
+
+ sec = t / 1000000000
+ nsec = int32(t - sec*1000000000)
+
+ mono = nanotimeQPC()
+ return
+}
+
+func initWine(k32 uintptr) {
+ _GetSystemTimeAsFileTime = windowsFindfunc(k32, []byte("GetSystemTimeAsFileTime\000"))
+ if _GetSystemTimeAsFileTime == nil {
+ throw("could not find GetSystemTimeAsFileTime() syscall")
+ }
+
+ _QueryPerformanceCounter = windowsFindfunc(k32, []byte("QueryPerformanceCounter\000"))
+ _QueryPerformanceFrequency = windowsFindfunc(k32, []byte("QueryPerformanceFrequency\000"))
+ if _QueryPerformanceCounter == nil || _QueryPerformanceFrequency == nil {
+ throw("could not find QPC syscalls")
+ }
+
+ // We cannot simply fall back to the GetSystemTimeAsFileTime() syscall, since its time is not monotonic;
+ // instead we use the QueryPerformanceCounter family of syscalls to implement a monotonic timer.
+ // https://msdn.microsoft.com/en-us/library/windows/desktop/dn553408(v=vs.85).aspx
+
+ var tmp int64
+ stdcall1(_QueryPerformanceFrequency, uintptr(unsafe.Pointer(&tmp)))
+ if tmp == 0 {
+ throw("QueryPerformanceFrequency syscall returned zero, running on unsupported hardware")
+ }
+
+ // This should not overflow: it is the number of performance-counter ticks per second,
+ // and its resolution is at most 10 ticks per microsecond on Wine (even lower on real hardware),
+ // so it will be at most 10 million here. Throw if it overflows anyway.
+ if tmp > (1<<31 - 1) {
+ throw("QueryPerformanceFrequency overflow 32 bit divider, check nosplit discussion to proceed")
+ }
+ qpcFrequency := int32(tmp)
+ stdcall1(_QueryPerformanceCounter, uintptr(unsafe.Pointer(&qpcStartCounter)))
+
+ // Since these time calls are supposed to run only on Wine, this does not lose precision:
+ // Wine's timer is emulated at 10 MHz, so the multiplier is a nice round 100.
+ // On a general-purpose system (e.g. a 3.3 MHz timer on an i7) it would not be very precise.
+ // We have to do it this way (or something similar), since multiplying the QPC counter
+ // by 100 million overflows int64 and the resulting time would always be invalid.
+ qpcMultiplier = int64(timediv(1000000000, qpcFrequency, nil))
+
+ useQPCTime = 1
+}
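To illustrate why the multiplier is precomputed with timediv instead of converting ticks as ticks*1e9/freq, here is a standalone sketch with assumed values (10 MHz is the frequency Wine reports; the tick count is made up):

package main

import "fmt"

func main() {
	const freq = int64(10000000)  // 10 MHz counter
	ticks := int64(3000000000000) // ~3.5 days of uptime in ticks
	// Multiplying first overflows int64 (3e12 * 1e9 > 9.2e18) and wraps.
	naive := ticks * 1000000000 / freq
	// Precomputing nanoseconds-per-tick (100 here) stays well within range.
	mult := int64(1000000000) / freq
	fmt.Println(naive, ticks*mult) // the first value is garbage, the second is correct
}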
+
//go:nosplit
func getRandomData(r []byte) {
n := 0
@@ -518,7 +622,9 @@
//go:nosplit
func newosproc(mp *m, stk unsafe.Pointer) {
const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000
- thandle := stdcall6(_CreateThread, 0, 0x20000,
+ // stackSize must match SizeOfStackReserve in cmd/link/internal/ld/pe.go.
+ const stackSize = 0x00200000*_64bit + 0x00020000*(1-_64bit)
+ thandle := stdcall6(_CreateThread, 0, stackSize,
funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)),
_STACK_SIZE_PARAM_IS_A_RESERVATION, 0)
@@ -559,6 +665,11 @@
}
//go:nosplit
+//go:nowritebarrierrec
+func clearSignalHandlers() {
+}
+
+//go:nosplit
func sigblock() {
}
@@ -578,51 +689,6 @@
*tp = 0
}
-// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-type _KSYSTEM_TIME struct {
- LowPart uint32
- High1Time int32
- High2Time int32
-}
-
-const (
- _INTERRUPT_TIME = 0x7ffe0008
- _SYSTEM_TIME = 0x7ffe0014
-)
-
-//go:nosplit
-func systime(addr uintptr) int64 {
- timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
-
- var t _KSYSTEM_TIME
- for i := 1; i < 10000; i++ {
- // these fields must be read in that order (see URL above)
- t.High1Time = timeaddr.High1Time
- t.LowPart = timeaddr.LowPart
- t.High2Time = timeaddr.High2Time
- if t.High1Time == t.High2Time {
- return int64(t.High1Time)<<32 | int64(t.LowPart)
- }
- if (i % 100) == 0 {
- osyield()
- }
- }
- systemstack(func() {
- throw("interrupt/system time is changing too fast")
- })
- return 0
-}
-
-//go:nosplit
-func unixnano() int64 {
- return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
-}
-
-//go:nosplit
-func nanotime() int64 {
- return systime(_INTERRUPT_TIME) * 100
-}
-
// Calling stdcall on os stack.
// May run during STW, so write barriers are not allowed.
//go:nowritebarrier
@@ -787,10 +853,7 @@
}
}
-var cpuprofilerlock mutex
-
-func resetcpuprofiler(hz int32) {
- lock(&cpuprofilerlock)
+func setProcessCPUProfiler(hz int32) {
if profiletimer == 0 {
timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
atomic.Storeuintptr(&profiletimer, timer)
@@ -798,8 +861,9 @@
stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
stdcall1(_CloseHandle, thread)
}
- unlock(&cpuprofilerlock)
+}
+func setThreadCPUProfiler(hz int32) {
ms := int32(0)
due := ^int64(^uint64(1 << 63))
if hz > 0 {
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 876bca7..43bfdd7 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -456,6 +456,8 @@
p.link = gp._panic
gp._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
+ atomic.Xadd(&runningPanicDefers, 1)
+
for {
d := gp._defer
if d == nil {
@@ -504,6 +506,8 @@
sp := unsafe.Pointer(d.sp) // must be pointer so it gets adjusted during stack copy
freedefer(d)
if p.recovered {
+ atomic.Xadd(&runningPanicDefers, -1)
+
gp._panic = p.link
// Aborted panics are marked but remain on the g.panic list.
// Remove them from the list.
@@ -527,6 +531,11 @@
// and String methods to prepare the panic strings before startpanic.
preprintpanics(gp._panic)
startpanic()
+
+ // startpanic set panicking, which will block main from exiting,
+ // so now OK to decrement runningPanicDefers.
+ atomic.Xadd(&runningPanicDefers, -1)
+
printpanics(gp._panic)
dopanic(0) // should not return
*(*int)(nil) = 0 // not reached
@@ -597,7 +606,17 @@
*(*int)(nil) = 0 // not reached
}
-//uint32 runtime·panicking;
+// runningPanicDefers is non-zero while running deferred functions for panic.
+// runningPanicDefers is incremented and decremented atomically.
+// This is used to try hard to get a panic stack trace out when exiting.
+var runningPanicDefers uint32
+
+// panicking is non-zero when crashing the program for an unrecovered panic.
+// panicking is incremented and decremented atomically.
+var panicking uint32
+
+// paniclk is held while printing the panic information and stack trace,
+// so that two concurrent panics don't overlap their output.
var paniclk mutex
// Unwind the stack after a deferred function calls recover
@@ -617,7 +636,6 @@
// Make the deferproc for this d return again,
// this time returning 1. The calling function will
// jump to the standard return epilogue.
- gcUnwindBarriers(gp, sp)
gp.sched.sp = sp
gp.sched.pc = pc
gp.sched.lr = 0
diff --git a/src/runtime/plugin.go b/src/runtime/plugin.go
index 8edb29c..682caac 100644
--- a/src/runtime/plugin.go
+++ b/src/runtime/plugin.go
@@ -56,7 +56,7 @@
lock(&ifaceLock)
for _, i := range md.itablinks {
- if i.inhash == 0 {
+ if !i.inhash {
additab(i, true, false)
}
}
@@ -95,7 +95,7 @@
continue
}
- f := (*_func)(unsafe.Pointer(&md.pclntable[md.ftab[i].funcoff]))
+ f := funcInfo{(*_func)(unsafe.Pointer(&md.pclntable[md.ftab[i].funcoff])), md}
name := funcname(f)
// A common bug is f.entry has a relocation to a duplicate
@@ -104,7 +104,7 @@
name2 := "none"
entry2 := uintptr(0)
f2 := findfunc(entry)
- if f2 != nil {
+ if f2.valid() {
name2 = funcname(f2)
entry2 = f2.entry
}
diff --git a/src/runtime/pprof/elf.go b/src/runtime/pprof/elf.go
new file mode 100644
index 0000000..a8b5ea6
--- /dev/null
+++ b/src/runtime/pprof/elf.go
@@ -0,0 +1,109 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "os"
+)
+
+var (
+ errBadELF = errors.New("malformed ELF binary")
+ errNoBuildID = errors.New("no NT_GNU_BUILD_ID found in ELF binary")
+)
+
+// elfBuildID returns the GNU build ID of the named ELF binary,
+// without introducing a dependency on debug/elf and its dependencies.
+func elfBuildID(file string) (string, error) {
+ buf := make([]byte, 256)
+ f, err := os.Open(file)
+ if err != nil {
+ return "", err
+ }
+ defer f.Close()
+
+ if _, err := f.ReadAt(buf[:64], 0); err != nil {
+ return "", err
+ }
+
+ // ELF file begins with \x7F E L F.
+ if buf[0] != 0x7F || buf[1] != 'E' || buf[2] != 'L' || buf[3] != 'F' {
+ return "", errBadELF
+ }
+
+ var byteOrder binary.ByteOrder
+ switch buf[5] {
+ default:
+ return "", errBadELF
+ case 1: // little-endian
+ byteOrder = binary.LittleEndian
+ case 2: // big-endian
+ byteOrder = binary.BigEndian
+ }
+
+ var shnum int
+ var shoff, shentsize int64
+ switch buf[4] {
+ default:
+ return "", errBadELF
+ case 1: // 32-bit file header
+ shoff = int64(byteOrder.Uint32(buf[32:]))
+ shentsize = int64(byteOrder.Uint16(buf[46:]))
+ if shentsize != 40 {
+ return "", errBadELF
+ }
+ shnum = int(byteOrder.Uint16(buf[48:]))
+ case 2: // 64-bit file header
+ shoff = int64(byteOrder.Uint64(buf[40:]))
+ shentsize = int64(byteOrder.Uint16(buf[58:]))
+ if shentsize != 64 {
+ return "", errBadELF
+ }
+ shnum = int(byteOrder.Uint16(buf[60:]))
+ }
+
+ for i := 0; i < shnum; i++ {
+ if _, err := f.ReadAt(buf[:shentsize], shoff+int64(i)*shentsize); err != nil {
+ return "", err
+ }
+ if typ := byteOrder.Uint32(buf[4:]); typ != 7 { // SHT_NOTE
+ continue
+ }
+ var off, size int64
+ if shentsize == 40 {
+ // 32-bit section header
+ off = int64(byteOrder.Uint32(buf[16:]))
+ size = int64(byteOrder.Uint32(buf[20:]))
+ } else {
+ // 64-bit section header
+ off = int64(byteOrder.Uint64(buf[24:]))
+ size = int64(byteOrder.Uint64(buf[32:]))
+ }
+ size += off
+ for off < size {
+ if _, err := f.ReadAt(buf[:16], off); err != nil { // room for header + name GNU\x00
+ return "", err
+ }
+ nameSize := int(byteOrder.Uint32(buf[0:]))
+ descSize := int(byteOrder.Uint32(buf[4:]))
+ noteType := int(byteOrder.Uint32(buf[8:]))
+ descOff := off + int64(12+(nameSize+3)&^3)
+ off = descOff + int64((descSize+3)&^3)
+ if nameSize != 4 || noteType != 3 || buf[12] != 'G' || buf[13] != 'N' || buf[14] != 'U' || buf[15] != '\x00' { // want name GNU\x00 type 3 (NT_GNU_BUILD_ID)
+ continue
+ }
+ if descSize > len(buf) {
+ return "", errBadELF
+ }
+ if _, err := f.ReadAt(buf[:descSize], descOff); err != nil {
+ return "", err
+ }
+ return fmt.Sprintf("%x", buf[:descSize]), nil
+ }
+ }
+ return "", errNoBuildID
+}
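A hypothetical usage sketch from inside package pprof (elfBuildID is unexported); os.Executable, available since Go 1.8, is used here only for illustration:

func printOwnBuildID() {
	exe, err := os.Executable()
	if err != nil {
		return
	}
	if id, err := elfBuildID(exe); err == nil {
		fmt.Println("GNU build ID:", id)
	}
}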
diff --git a/src/runtime/pprof/internal/profile/encode.go b/src/runtime/pprof/internal/profile/encode.go
new file mode 100644
index 0000000..6b879a8
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/encode.go
@@ -0,0 +1,470 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package profile
+
+import (
+ "errors"
+ "fmt"
+ "sort"
+)
+
+func (p *Profile) decoder() []decoder {
+ return profileDecoder
+}
+
+// preEncode populates the unexported fields to be used by encode
+// (with suffix X) from the corresponding exported fields. The
+// exported fields are cleared up to facilitate testing.
+func (p *Profile) preEncode() {
+ strings := make(map[string]int)
+ addString(strings, "")
+
+ for _, st := range p.SampleType {
+ st.typeX = addString(strings, st.Type)
+ st.unitX = addString(strings, st.Unit)
+ }
+
+ for _, s := range p.Sample {
+ s.labelX = nil
+ var keys []string
+ for k := range s.Label {
+ keys = append(keys, k)
+ }
+ sort.Strings(keys)
+ for _, k := range keys {
+ vs := s.Label[k]
+ for _, v := range vs {
+ s.labelX = append(s.labelX,
+ Label{
+ keyX: addString(strings, k),
+ strX: addString(strings, v),
+ },
+ )
+ }
+ }
+ var numKeys []string
+ for k := range s.NumLabel {
+ numKeys = append(numKeys, k)
+ }
+ sort.Strings(numKeys)
+ for _, k := range numKeys {
+ vs := s.NumLabel[k]
+ for _, v := range vs {
+ s.labelX = append(s.labelX,
+ Label{
+ keyX: addString(strings, k),
+ numX: v,
+ },
+ )
+ }
+ }
+ s.locationIDX = nil
+ for _, l := range s.Location {
+ s.locationIDX = append(s.locationIDX, l.ID)
+ }
+ }
+
+ for _, m := range p.Mapping {
+ m.fileX = addString(strings, m.File)
+ m.buildIDX = addString(strings, m.BuildID)
+ }
+
+ for _, l := range p.Location {
+ for i, ln := range l.Line {
+ if ln.Function != nil {
+ l.Line[i].functionIDX = ln.Function.ID
+ } else {
+ l.Line[i].functionIDX = 0
+ }
+ }
+ if l.Mapping != nil {
+ l.mappingIDX = l.Mapping.ID
+ } else {
+ l.mappingIDX = 0
+ }
+ }
+ for _, f := range p.Function {
+ f.nameX = addString(strings, f.Name)
+ f.systemNameX = addString(strings, f.SystemName)
+ f.filenameX = addString(strings, f.Filename)
+ }
+
+ p.dropFramesX = addString(strings, p.DropFrames)
+ p.keepFramesX = addString(strings, p.KeepFrames)
+
+ if pt := p.PeriodType; pt != nil {
+ pt.typeX = addString(strings, pt.Type)
+ pt.unitX = addString(strings, pt.Unit)
+ }
+
+ p.stringTable = make([]string, len(strings))
+ for s, i := range strings {
+ p.stringTable[i] = s
+ }
+}
+
+func (p *Profile) encode(b *buffer) {
+ for _, x := range p.SampleType {
+ encodeMessage(b, 1, x)
+ }
+ for _, x := range p.Sample {
+ encodeMessage(b, 2, x)
+ }
+ for _, x := range p.Mapping {
+ encodeMessage(b, 3, x)
+ }
+ for _, x := range p.Location {
+ encodeMessage(b, 4, x)
+ }
+ for _, x := range p.Function {
+ encodeMessage(b, 5, x)
+ }
+ encodeStrings(b, 6, p.stringTable)
+ encodeInt64Opt(b, 7, p.dropFramesX)
+ encodeInt64Opt(b, 8, p.keepFramesX)
+ encodeInt64Opt(b, 9, p.TimeNanos)
+ encodeInt64Opt(b, 10, p.DurationNanos)
+ if pt := p.PeriodType; pt != nil && (pt.typeX != 0 || pt.unitX != 0) {
+ encodeMessage(b, 11, p.PeriodType)
+ }
+ encodeInt64Opt(b, 12, p.Period)
+}
+
+var profileDecoder = []decoder{
+ nil, // 0
+ // repeated ValueType sample_type = 1
+ func(b *buffer, m message) error {
+ x := new(ValueType)
+ pp := m.(*Profile)
+ pp.SampleType = append(pp.SampleType, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Sample sample = 2
+ func(b *buffer, m message) error {
+ x := new(Sample)
+ pp := m.(*Profile)
+ pp.Sample = append(pp.Sample, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Mapping mapping = 3
+ func(b *buffer, m message) error {
+ x := new(Mapping)
+ pp := m.(*Profile)
+ pp.Mapping = append(pp.Mapping, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Location location = 4
+ func(b *buffer, m message) error {
+ x := new(Location)
+ pp := m.(*Profile)
+ pp.Location = append(pp.Location, x)
+ return decodeMessage(b, x)
+ },
+ // repeated Function function = 5
+ func(b *buffer, m message) error {
+ x := new(Function)
+ pp := m.(*Profile)
+ pp.Function = append(pp.Function, x)
+ return decodeMessage(b, x)
+ },
+ // repeated string string_table = 6
+ func(b *buffer, m message) error {
+ err := decodeStrings(b, &m.(*Profile).stringTable)
+ if err != nil {
+ return err
+ }
+ if m.(*Profile).stringTable[0] != "" {
+ return errors.New("string_table[0] must be ''")
+ }
+ return nil
+ },
+ // repeated int64 drop_frames = 7
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).dropFramesX) },
+ // repeated int64 keep_frames = 8
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).keepFramesX) },
+ // repeated int64 time_nanos = 9
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).TimeNanos) },
+ // repeated int64 duration_nanos = 10
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).DurationNanos) },
+ // optional string period_type = 11
+ func(b *buffer, m message) error {
+ x := new(ValueType)
+ pp := m.(*Profile)
+ pp.PeriodType = x
+ return decodeMessage(b, x)
+ },
+ // repeated int64 period = 12
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Profile).Period) },
+}
+
+// postDecode takes the unexported fields populated by decode (with
+// suffix X) and populates the corresponding exported fields.
+// The unexported fields are cleared up to facilitate testing.
+func (p *Profile) postDecode() error {
+ var err error
+
+ mappings := make(map[uint64]*Mapping)
+ for _, m := range p.Mapping {
+ m.File, err = getString(p.stringTable, &m.fileX, err)
+ m.BuildID, err = getString(p.stringTable, &m.buildIDX, err)
+ mappings[m.ID] = m
+ }
+
+ functions := make(map[uint64]*Function)
+ for _, f := range p.Function {
+ f.Name, err = getString(p.stringTable, &f.nameX, err)
+ f.SystemName, err = getString(p.stringTable, &f.systemNameX, err)
+ f.Filename, err = getString(p.stringTable, &f.filenameX, err)
+ functions[f.ID] = f
+ }
+
+ locations := make(map[uint64]*Location)
+ for _, l := range p.Location {
+ l.Mapping = mappings[l.mappingIDX]
+ l.mappingIDX = 0
+ for i, ln := range l.Line {
+ if id := ln.functionIDX; id != 0 {
+ l.Line[i].Function = functions[id]
+ if l.Line[i].Function == nil {
+ return fmt.Errorf("Function ID %d not found", id)
+ }
+ l.Line[i].functionIDX = 0
+ }
+ }
+ locations[l.ID] = l
+ }
+
+ for _, st := range p.SampleType {
+ st.Type, err = getString(p.stringTable, &st.typeX, err)
+ st.Unit, err = getString(p.stringTable, &st.unitX, err)
+ }
+
+ for _, s := range p.Sample {
+ labels := make(map[string][]string)
+ numLabels := make(map[string][]int64)
+ for _, l := range s.labelX {
+ var key, value string
+ key, err = getString(p.stringTable, &l.keyX, err)
+ if l.strX != 0 {
+ value, err = getString(p.stringTable, &l.strX, err)
+ labels[key] = append(labels[key], value)
+ } else {
+ numLabels[key] = append(numLabels[key], l.numX)
+ }
+ }
+ if len(labels) > 0 {
+ s.Label = labels
+ }
+ if len(numLabels) > 0 {
+ s.NumLabel = numLabels
+ }
+ s.Location = nil
+ for _, lid := range s.locationIDX {
+ s.Location = append(s.Location, locations[lid])
+ }
+ s.locationIDX = nil
+ }
+
+ p.DropFrames, err = getString(p.stringTable, &p.dropFramesX, err)
+ p.KeepFrames, err = getString(p.stringTable, &p.keepFramesX, err)
+
+ if pt := p.PeriodType; pt == nil {
+ p.PeriodType = &ValueType{}
+ }
+
+ if pt := p.PeriodType; pt != nil {
+ pt.Type, err = getString(p.stringTable, &pt.typeX, err)
+ pt.Unit, err = getString(p.stringTable, &pt.unitX, err)
+ }
+ p.stringTable = nil
+ return nil
+}
+
+func (p *ValueType) decoder() []decoder {
+ return valueTypeDecoder
+}
+
+func (p *ValueType) encode(b *buffer) {
+ encodeInt64Opt(b, 1, p.typeX)
+ encodeInt64Opt(b, 2, p.unitX)
+}
+
+var valueTypeDecoder = []decoder{
+ nil, // 0
+ // optional int64 type = 1
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).typeX) },
+ // optional int64 unit = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*ValueType).unitX) },
+}
+
+func (p *Sample) decoder() []decoder {
+ return sampleDecoder
+}
+
+func (p *Sample) encode(b *buffer) {
+ encodeUint64s(b, 1, p.locationIDX)
+ for _, x := range p.Value {
+ encodeInt64(b, 2, x)
+ }
+ for _, x := range p.labelX {
+ encodeMessage(b, 3, x)
+ }
+}
+
+var sampleDecoder = []decoder{
+ nil, // 0
+ // repeated uint64 location = 1
+ func(b *buffer, m message) error { return decodeUint64s(b, &m.(*Sample).locationIDX) },
+ // repeated int64 value = 2
+ func(b *buffer, m message) error { return decodeInt64s(b, &m.(*Sample).Value) },
+ // repeated Label label = 3
+ func(b *buffer, m message) error {
+ s := m.(*Sample)
+ n := len(s.labelX)
+ s.labelX = append(s.labelX, Label{})
+ return decodeMessage(b, &s.labelX[n])
+ },
+}
+
+func (p Label) decoder() []decoder {
+ return labelDecoder
+}
+
+func (p Label) encode(b *buffer) {
+ encodeInt64Opt(b, 1, p.keyX)
+ encodeInt64Opt(b, 2, p.strX)
+ encodeInt64Opt(b, 3, p.numX)
+}
+
+var labelDecoder = []decoder{
+ nil, // 0
+ // optional int64 key = 1
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).keyX) },
+ // optional int64 str = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).strX) },
+ // optional int64 num = 3
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Label).numX) },
+}
+
+func (p *Mapping) decoder() []decoder {
+ return mappingDecoder
+}
+
+func (p *Mapping) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeUint64Opt(b, 2, p.Start)
+ encodeUint64Opt(b, 3, p.Limit)
+ encodeUint64Opt(b, 4, p.Offset)
+ encodeInt64Opt(b, 5, p.fileX)
+ encodeInt64Opt(b, 6, p.buildIDX)
+ encodeBoolOpt(b, 7, p.HasFunctions)
+ encodeBoolOpt(b, 8, p.HasFilenames)
+ encodeBoolOpt(b, 9, p.HasLineNumbers)
+ encodeBoolOpt(b, 10, p.HasInlineFrames)
+}
+
+var mappingDecoder = []decoder{
+ nil, // 0
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).ID) }, // optional uint64 id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Start) }, // optional uint64 memory_offset = 2
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Limit) }, // optional uint64 memory_limit = 3
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Mapping).Offset) }, // optional uint64 file_offset = 4
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).fileX) }, // optional int64 filename = 5
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Mapping).buildIDX) }, // optional int64 build_id = 6
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFunctions) }, // optional bool has_functions = 7
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasFilenames) }, // optional bool has_filenames = 8
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasLineNumbers) }, // optional bool has_line_numbers = 9
+ func(b *buffer, m message) error { return decodeBool(b, &m.(*Mapping).HasInlineFrames) }, // optional bool has_inline_frames = 10
+}
+
+func (p *Location) decoder() []decoder {
+ return locationDecoder
+}
+
+func (p *Location) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeUint64Opt(b, 2, p.mappingIDX)
+ encodeUint64Opt(b, 3, p.Address)
+ for i := range p.Line {
+ encodeMessage(b, 4, &p.Line[i])
+ }
+}
+
+var locationDecoder = []decoder{
+ nil, // 0
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).ID) }, // optional uint64 id = 1;
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).mappingIDX) }, // optional uint64 mapping_id = 2;
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Location).Address) }, // optional uint64 address = 3;
+ func(b *buffer, m message) error { // repeated Line line = 4
+ pp := m.(*Location)
+ n := len(pp.Line)
+ pp.Line = append(pp.Line, Line{})
+ return decodeMessage(b, &pp.Line[n])
+ },
+}
+
+func (p *Line) decoder() []decoder {
+ return lineDecoder
+}
+
+func (p *Line) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.functionIDX)
+ encodeInt64Opt(b, 2, p.Line)
+}
+
+var lineDecoder = []decoder{
+ nil, // 0
+ // optional uint64 function_id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Line).functionIDX) },
+ // optional int64 line = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Line).Line) },
+}
+
+func (p *Function) decoder() []decoder {
+ return functionDecoder
+}
+
+func (p *Function) encode(b *buffer) {
+ encodeUint64Opt(b, 1, p.ID)
+ encodeInt64Opt(b, 2, p.nameX)
+ encodeInt64Opt(b, 3, p.systemNameX)
+ encodeInt64Opt(b, 4, p.filenameX)
+ encodeInt64Opt(b, 5, p.StartLine)
+}
+
+var functionDecoder = []decoder{
+ nil, // 0
+ // optional uint64 id = 1
+ func(b *buffer, m message) error { return decodeUint64(b, &m.(*Function).ID) },
+ // optional int64 function_name = 2
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).nameX) },
+ // optional int64 function_system_name = 3
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).systemNameX) },
+ // repeated int64 filename = 4
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).filenameX) },
+ // optional int64 start_line = 5
+ func(b *buffer, m message) error { return decodeInt64(b, &m.(*Function).StartLine) },
+}
+
+func addString(strings map[string]int, s string) int64 {
+ i, ok := strings[s]
+ if !ok {
+ i = len(strings)
+ strings[s] = i
+ }
+ return int64(i)
+}
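A small sketch of how preEncode uses addString to build the string table: index 0 is always the empty string and repeated strings are deduplicated.

func exampleStringTable() []string {
	strs := make(map[string]int)
	addString(strs, "")            // index 0, required to be ""
	addString(strs, "cpu")         // index 1
	addString(strs, "nanoseconds") // index 2
	addString(strs, "cpu")         // still index 1
	table := make([]string, len(strs))
	for s, i := range strs {
		table[i] = s
	}
	return table // ["", "cpu", "nanoseconds"]
}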
+
+func getString(strings []string, strng *int64, err error) (string, error) {
+ if err != nil {
+ return "", err
+ }
+ s := int(*strng)
+ if s < 0 || s >= len(strings) {
+ return "", errMalformed
+ }
+ *strng = 0
+ return strings[s], nil
+}
diff --git a/src/runtime/pprof/internal/profile/filter.go b/src/runtime/pprof/internal/profile/filter.go
new file mode 100644
index 0000000..1baa096
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/filter.go
@@ -0,0 +1,158 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implements methods to filter samples from profiles.
+
+package profile
+
+import "regexp"
+
+// FilterSamplesByName filters the samples in a profile and only keeps
+// samples where at least one frame matches focus but none match ignore.
+// Returns true if the corresponding regexp matched at least one sample.
+func (p *Profile) FilterSamplesByName(focus, ignore, hide *regexp.Regexp) (fm, im, hm bool) {
+ focusOrIgnore := make(map[uint64]bool)
+ hidden := make(map[uint64]bool)
+ for _, l := range p.Location {
+ if ignore != nil && l.matchesName(ignore) {
+ im = true
+ focusOrIgnore[l.ID] = false
+ } else if focus == nil || l.matchesName(focus) {
+ fm = true
+ focusOrIgnore[l.ID] = true
+ }
+ if hide != nil && l.matchesName(hide) {
+ hm = true
+ l.Line = l.unmatchedLines(hide)
+ if len(l.Line) == 0 {
+ hidden[l.ID] = true
+ }
+ }
+ }
+
+ s := make([]*Sample, 0, len(p.Sample))
+ for _, sample := range p.Sample {
+ if focusedAndNotIgnored(sample.Location, focusOrIgnore) {
+ if len(hidden) > 0 {
+ var locs []*Location
+ for _, loc := range sample.Location {
+ if !hidden[loc.ID] {
+ locs = append(locs, loc)
+ }
+ }
+ if len(locs) == 0 {
+ // Remove sample with no locations (by not adding it to s).
+ continue
+ }
+ sample.Location = locs
+ }
+ s = append(s, sample)
+ }
+ }
+ p.Sample = s
+
+ return
+}
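A hypothetical call showing the three return values; the regexps are examples, not ones used by the package:

func filterByNameExample(p *Profile) {
	focus := regexp.MustCompile(`runtime\.mallocgc`)
	ignore := regexp.MustCompile(`runtime\.gcBgMarkWorker`)
	fm, im, hm := p.FilterSamplesByName(focus, ignore, nil)
	// fm/im report whether focus/ignore matched anything; hm is always
	// false here because no hide regexp was supplied.
	_, _, _ = fm, im, hm
}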
+
+// matchesName returns whether the function name or file in the
+// location matches the regular expression.
+func (loc *Location) matchesName(re *regexp.Regexp) bool {
+ for _, ln := range loc.Line {
+ if fn := ln.Function; fn != nil {
+ if re.MatchString(fn.Name) {
+ return true
+ }
+ if re.MatchString(fn.Filename) {
+ return true
+ }
+ }
+ }
+ return false
+}
+
+// unmatchedLines returns the lines in the location that do not match
+// the regular expression.
+func (loc *Location) unmatchedLines(re *regexp.Regexp) []Line {
+ var lines []Line
+ for _, ln := range loc.Line {
+ if fn := ln.Function; fn != nil {
+ if re.MatchString(fn.Name) {
+ continue
+ }
+ if re.MatchString(fn.Filename) {
+ continue
+ }
+ }
+ lines = append(lines, ln)
+ }
+ return lines
+}
+
+// focusedAndNotIgnored looks up a slice of ids against a map of
+// focused/ignored locations. The map only contains locations that are
+// explicitly focused or ignored. Returns whether there is at least
+// one focused location but no ignored locations.
+func focusedAndNotIgnored(locs []*Location, m map[uint64]bool) bool {
+ var f bool
+ for _, loc := range locs {
+ if focus, focusOrIgnore := m[loc.ID]; focusOrIgnore {
+ if focus {
+ // Found focused location. Must keep searching in case there
+ // is an ignored one as well.
+ f = true
+ } else {
+ // Found ignored location. Can return false right away.
+ return false
+ }
+ }
+ }
+ return f
+}
+
+// TagMatch selects tags for filtering
+type TagMatch func(key, val string, nval int64) bool
+
+// FilterSamplesByTag removes all samples from the profile, except
+// those that match focus and do not match ignore.
+func (p *Profile) FilterSamplesByTag(focus, ignore TagMatch) (fm, im bool) {
+ samples := make([]*Sample, 0, len(p.Sample))
+ for _, s := range p.Sample {
+ focused, ignored := focusedSample(s, focus, ignore)
+ fm = fm || focused
+ im = im || ignored
+ if focused && !ignored {
+ samples = append(samples, s)
+ }
+ }
+ p.Sample = samples
+ return
+}
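A hypothetical TagMatch showing the intended shape of the callbacks: keep samples whose "bytes" numeric label is at least 1 MiB, and pass nil to skip the ignore side.

func keepLargeAllocs(p *Profile) {
	focus := func(key, val string, nval int64) bool {
		return key == "bytes" && nval >= 1<<20
	}
	p.FilterSamplesByTag(focus, nil)
}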
+
+// focusedSample checks a sample against the focus and ignore tag matchers.
+// Returns whether the focus/ignore matchers match any of the sample's tags.
+func focusedSample(s *Sample, focus, ignore TagMatch) (fm, im bool) {
+ fm = focus == nil
+ for key, vals := range s.Label {
+ for _, val := range vals {
+ if ignore != nil && ignore(key, val, 0) {
+ im = true
+ }
+ if !fm && focus(key, val, 0) {
+ fm = true
+ }
+ }
+ }
+ for key, vals := range s.NumLabel {
+ for _, val := range vals {
+ if ignore != nil && ignore(key, "", val) {
+ im = true
+ }
+ if !fm && focus(key, "", val) {
+ fm = true
+ }
+ }
+ }
+ return fm, im
+}
diff --git a/src/runtime/pprof/internal/profile/legacy_profile.go b/src/runtime/pprof/internal/profile/legacy_profile.go
new file mode 100644
index 0000000..d69f8de
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/legacy_profile.go
@@ -0,0 +1,1266 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file implements parsers to convert legacy profiles into the
+// profile.proto format.
+
+package profile
+
+import (
+ "bufio"
+ "bytes"
+ "fmt"
+ "io"
+ "math"
+ "regexp"
+ "strconv"
+ "strings"
+)
+
+var (
+ countStartRE = regexp.MustCompile(`\A(\w+) profile: total \d+\n\z`)
+ countRE = regexp.MustCompile(`\A(\d+) @(( 0x[0-9a-f]+)+)\n\z`)
+
+ heapHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] *@ *(heap[_a-z0-9]*)/?(\d*)`)
+ heapSampleRE = regexp.MustCompile(`(-?\d+): *(-?\d+) *\[ *(\d+): *(\d+) *] @([ x0-9a-f]*)`)
+
+ contentionSampleRE = regexp.MustCompile(`(\d+) *(\d+) @([ x0-9a-f]*)`)
+
+ hexNumberRE = regexp.MustCompile(`0x[0-9a-f]+`)
+
+ growthHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ growthz`)
+
+ fragmentationHeaderRE = regexp.MustCompile(`heap profile: *(\d+): *(\d+) *\[ *(\d+): *(\d+) *\] @ fragmentationz`)
+
+ threadzStartRE = regexp.MustCompile(`--- threadz \d+ ---`)
+ threadStartRE = regexp.MustCompile(`--- Thread ([[:xdigit:]]+) \(name: (.*)/(\d+)\) stack: ---`)
+
+ procMapsRE = regexp.MustCompile(`([[:xdigit:]]+)-([[:xdigit:]]+)\s+([-rwxp]+)\s+([[:xdigit:]]+)\s+([[:xdigit:]]+):([[:xdigit:]]+)\s+([[:digit:]]+)\s*(\S+)?`)
+
+ briefMapsRE = regexp.MustCompile(`\s*([[:xdigit:]]+)-([[:xdigit:]]+):\s*(\S+)(\s.*@)?([[:xdigit:]]+)?`)
+
+ // LegacyHeapAllocated instructs the heapz parsers to use the
+ // allocated memory stats instead of the default in-use memory. Note
+ // that tcmalloc doesn't provide all allocated memory, only in-use
+ // stats.
+ LegacyHeapAllocated bool
+)
+
+func isSpaceOrComment(line string) bool {
+ trimmed := strings.TrimSpace(line)
+ return len(trimmed) == 0 || trimmed[0] == '#'
+}
+
+// parseGoCount parses a Go count profile (e.g., threadcreate or
+// goroutine) and returns a new Profile.
+func parseGoCount(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ var line string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ line, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(line) {
+ break
+ }
+ }
+
+ m := countStartRE.FindStringSubmatch(line)
+ if m == nil {
+ return nil, errUnrecognized
+ }
+ profileType := m[1]
+ p := &Profile{
+ PeriodType: &ValueType{Type: profileType, Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{{Type: profileType, Unit: "count"}},
+ }
+ locations := make(map[uint64]*Location)
+ for {
+ line, err = r.ReadString('\n')
+ if err != nil {
+ if err == io.EOF {
+ break
+ }
+ return nil, err
+ }
+ if isSpaceOrComment(line) {
+ continue
+ }
+ if strings.HasPrefix(line, "---") {
+ break
+ }
+ m := countRE.FindStringSubmatch(line)
+ if m == nil {
+ return nil, errMalformed
+ }
+ n, err := strconv.ParseInt(m[1], 0, 64)
+ if err != nil {
+ return nil, errMalformed
+ }
+ fields := strings.Fields(m[2])
+ locs := make([]*Location, 0, len(fields))
+ for _, stk := range fields {
+ addr, err := strconv.ParseUint(stk, 0, 64)
+ if err != nil {
+ return nil, errMalformed
+ }
+ // Adjust all frames by -1 to land on the call instruction.
+ addr--
+ loc := locations[addr]
+ if loc == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ locations[addr] = loc
+ p.Location = append(p.Location, loc)
+ }
+ locs = append(locs, loc)
+ }
+ p.Sample = append(p.Sample, &Sample{
+ Location: locs,
+ Value: []int64{n},
+ })
+ }
+
+ if err = parseAdditionalSections(strings.TrimSpace(line), r, p); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
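For reference, a minimal example of the count-profile text that parseGoCount accepts (the addresses are made up); used from inside this package it would produce two samples with counts 2 and 1:

var goCountExample = []byte(
	"goroutine profile: total 3\n" +
		"2 @ 0x42f2ec 0x42f1a4 0x401234\n" +
		"1 @ 0x42f2ec 0x405678\n")

// p, err := parseGoCount(goCountExample)
// Each distinct address becomes one Location (adjusted by -1 to point at
// the call), and each line becomes one Sample with the leading count.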
+
+// remapLocationIDs ensures there is a location for each address
+// referenced by a sample, and remaps the samples to point to the new
+// location ids.
+func (p *Profile) remapLocationIDs() {
+ seen := make(map[*Location]bool, len(p.Location))
+ var locs []*Location
+
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ if seen[l] {
+ continue
+ }
+ l.ID = uint64(len(locs) + 1)
+ locs = append(locs, l)
+ seen[l] = true
+ }
+ }
+ p.Location = locs
+}
+
+func (p *Profile) remapFunctionIDs() {
+ seen := make(map[*Function]bool, len(p.Function))
+ var fns []*Function
+
+ for _, l := range p.Location {
+ for _, ln := range l.Line {
+ fn := ln.Function
+ if fn == nil || seen[fn] {
+ continue
+ }
+ fn.ID = uint64(len(fns) + 1)
+ fns = append(fns, fn)
+ seen[fn] = true
+ }
+ }
+ p.Function = fns
+}
+
+// remapMappingIDs matches location addresses with existing mappings
+// and updates them appropriately. This is O(N*M); if this ever shows
+// up as a bottleneck, consider sorting the mappings and doing a
+// binary search, which would make it O(N*log(M)).
+func (p *Profile) remapMappingIDs() {
+ if len(p.Mapping) == 0 {
+ return
+ }
+
+ // Some profile handlers will incorrectly set regions for the main
+ // executable if its section is remapped. Fix them through heuristics.
+
+ // Remove the initial mapping if named '/anon_hugepage' and has a
+ // consecutive adjacent mapping.
+ if m := p.Mapping[0]; strings.HasPrefix(m.File, "/anon_hugepage") {
+ if len(p.Mapping) > 1 && m.Limit == p.Mapping[1].Start {
+ p.Mapping = p.Mapping[1:]
+ }
+ }
+
+ // Subtract the offset from the start of the main mapping if it
+ // ends up at a recognizable start address.
+ const expectedStart = 0x400000
+ if m := p.Mapping[0]; m.Start-m.Offset == expectedStart {
+ m.Start = expectedStart
+ m.Offset = 0
+ }
+
+ for _, l := range p.Location {
+ if a := l.Address; a != 0 {
+ for _, m := range p.Mapping {
+ if m.Start <= a && a < m.Limit {
+ l.Mapping = m
+ break
+ }
+ }
+ }
+ }
+
+ // Reset all mapping IDs.
+ for i, m := range p.Mapping {
+ m.ID = uint64(i + 1)
+ }
+}
+
+var cpuInts = []func([]byte) (uint64, []byte){
+ get32l,
+ get32b,
+ get64l,
+ get64b,
+}
+
+func get32l(b []byte) (uint64, []byte) {
+ if len(b) < 4 {
+ return 0, nil
+ }
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24, b[4:]
+}
+
+func get32b(b []byte) (uint64, []byte) {
+ if len(b) < 4 {
+ return 0, nil
+ }
+ return uint64(b[3]) | uint64(b[2])<<8 | uint64(b[1])<<16 | uint64(b[0])<<24, b[4:]
+}
+
+func get64l(b []byte) (uint64, []byte) {
+ if len(b) < 8 {
+ return 0, nil
+ }
+ return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56, b[8:]
+}
+
+func get64b(b []byte) (uint64, []byte) {
+ if len(b) < 8 {
+ return 0, nil
+ }
+ return uint64(b[7]) | uint64(b[6])<<8 | uint64(b[5])<<16 | uint64(b[4])<<24 | uint64(b[3])<<32 | uint64(b[2])<<40 | uint64(b[1])<<48 | uint64(b[0])<<56, b[8:]
+}
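These readers are equivalent to encoding/binary's fixed-width accessors; a quick standalone check with assumed bytes:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	b := []byte{0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}
	fmt.Println(binary.LittleEndian.Uint32(b)) // same value get32l(b) yields
	fmt.Println(binary.BigEndian.Uint64(b))    // same value get64b(b) yields
}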
+
+// ParseTracebacks parses a set of tracebacks and returns a newly
+// populated profile. It will accept any text file and generate a
+// Profile out of it with any hex addresses it can identify, including
+// a process map if it can recognize one. Each sample will include a
+// tag "source" with the addresses recognized in string format.
+func ParseTracebacks(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ p := &Profile{
+ PeriodType: &ValueType{Type: "trace", Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{
+ {Type: "trace", Unit: "count"},
+ },
+ }
+
+ var sources []string
+ var sloc []*Location
+
+ locs := make(map[uint64]*Location)
+ for {
+ l, err := r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if sectionTrigger(l) == memoryMapSection {
+ break
+ }
+ if s, addrs := extractHexAddresses(l); len(s) > 0 {
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ sources = append(sources, s...)
+ } else {
+ if len(sources) > 0 || len(sloc) > 0 {
+ addTracebackSample(sloc, sources, p)
+ sloc, sources = nil, nil
+ }
+ }
+ }
+
+ // Add final sample to save any leftover data.
+ if len(sources) > 0 || len(sloc) > 0 {
+ addTracebackSample(sloc, sources, p)
+ }
+
+ if err := p.ParseMemoryMap(r); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+func addTracebackSample(l []*Location, s []string, p *Profile) {
+ p.Sample = append(p.Sample,
+ &Sample{
+ Value: []int64{1},
+ Location: l,
+ Label: map[string][]string{"source": s},
+ })
+}
+
+// parseCPU parses a profilez legacy profile and returns a newly
+// populated Profile.
+//
+// The general format for profilez samples is a sequence of words in
+// binary format. The first words are a header with the following data:
+// 1st word -- 0
+// 2nd word -- 3
+// 3rd word -- 0 if a c++ application, 1 if a java application.
+// 4th word -- Sampling period (in microseconds).
+// 5th word -- Padding.
+func parseCPU(b []byte) (*Profile, error) {
+ var parse func([]byte) (uint64, []byte)
+ var n1, n2, n3, n4, n5 uint64
+ for _, parse = range cpuInts {
+ var tmp []byte
+ n1, tmp = parse(b)
+ n2, tmp = parse(tmp)
+ n3, tmp = parse(tmp)
+ n4, tmp = parse(tmp)
+ n5, tmp = parse(tmp)
+
+ if tmp != nil && n1 == 0 && n2 == 3 && n3 == 0 && n4 > 0 && n5 == 0 {
+ b = tmp
+ return cpuProfile(b, int64(n4), parse)
+ }
+ }
+ return nil, errUnrecognized
+}
+
+// cpuProfile returns a new Profile from C++ profilez data.
+// b is the profile bytes after the header, period is the profiling
+// period, and parse is a function to parse 8-byte chunks from the
+// profile in its native endianness.
+func cpuProfile(b []byte, period int64, parse func(b []byte) (uint64, []byte)) (*Profile, error) {
+ p := &Profile{
+ Period: period * 1000,
+ PeriodType: &ValueType{Type: "cpu", Unit: "nanoseconds"},
+ SampleType: []*ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ },
+ }
+ var err error
+ if b, _, err = parseCPUSamples(b, parse, true, p); err != nil {
+ return nil, err
+ }
+
+ // If all samples have the same second-to-the-bottom frame, it
+ // strongly suggests that it is an uninteresting artifact of
+ // measurement -- a stack frame pushed by the signal handler. The
+ // bottom frame is always correct as it is picked up from the signal
+ // structure, not the stack. Check if this is the case and if so,
+ // remove.
+ if len(p.Sample) > 1 && len(p.Sample[0].Location) > 1 {
+ allSame := true
+ id1 := p.Sample[0].Location[1].Address
+ for _, s := range p.Sample {
+ if len(s.Location) < 2 || id1 != s.Location[1].Address {
+ allSame = false
+ break
+ }
+ }
+ if allSame {
+ for _, s := range p.Sample {
+ s.Location = append(s.Location[:1], s.Location[2:]...)
+ }
+ }
+ }
+
+ if err := p.ParseMemoryMap(bytes.NewBuffer(b)); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+// parseCPUSamples parses a collection of profilez samples from a
+// profile.
+//
+// profilez samples are a repeated sequence of stack frames of the
+// form:
+// 1st word -- The number of times this stack was encountered.
+// 2nd word -- The size of the stack (StackSize).
+// 3rd word -- The first address on the stack.
+// ...
+// StackSize + 2 -- The last address on the stack
+// The last stack trace is of the form:
+// 1st word -- 0
+// 2nd word -- 1
+// 3rd word -- 0
+//
+// Addresses from stack traces may point to the next instruction after
+// each call. Optionally adjust by -1 to land somewhere on the actual
+// call (except for the leaf, which is not a call).
+func parseCPUSamples(b []byte, parse func(b []byte) (uint64, []byte), adjust bool, p *Profile) ([]byte, map[uint64]*Location, error) {
+ locs := make(map[uint64]*Location)
+ for len(b) > 0 {
+ var count, nstk uint64
+ count, b = parse(b)
+ nstk, b = parse(b)
+ if b == nil || nstk > uint64(len(b)/4) {
+ return nil, nil, errUnrecognized
+ }
+ var sloc []*Location
+ addrs := make([]uint64, nstk)
+ for i := 0; i < int(nstk); i++ {
+ addrs[i], b = parse(b)
+ }
+
+ if count == 0 && nstk == 1 && addrs[0] == 0 {
+ // End of data marker
+ break
+ }
+ for i, addr := range addrs {
+ if adjust && i > 0 {
+ addr--
+ }
+ loc := locs[addr]
+ if loc == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ locs[addr] = loc
+ p.Location = append(p.Location, loc)
+ }
+ sloc = append(sloc, loc)
+ }
+ p.Sample = append(p.Sample,
+ &Sample{
+ Value: []int64{int64(count), int64(count) * p.Period},
+ Location: sloc,
+ })
+ }
+ // Reached the end without finding the EOD marker.
+ return b, locs, nil
+}
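Putting the header and sample layout from the comments above together, a minimal 64-bit little-endian profilez stream (addresses and period are assumed) that parseCPU would recognize:

func exampleProfilezStream() []byte {
	words := []uint64{
		0, 3, 0, 100, 0, // header: 100us sampling period
		1, 2, 0x401000, 0x402000, // one sample hit, stack of two frames
		0, 1, 0, // end-of-data marker
	}
	var b []byte
	for _, w := range words {
		for i := uint(0); i < 64; i += 8 {
			b = append(b, byte(w>>i)) // little-endian encoding
		}
	}
	return b // parseCPU(b) yields one sample with Period = 100000 ns
}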
+
+// parseHeap parses a heapz legacy or a growthz profile and
+// returns a newly populated Profile.
+func parseHeap(b []byte) (p *Profile, err error) {
+ r := bytes.NewBuffer(b)
+ l, err := r.ReadString('\n')
+ if err != nil {
+ return nil, errUnrecognized
+ }
+
+ sampling := ""
+
+ if header := heapHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "objects", Unit: "bytes"},
+ }
+
+ var period int64
+ if len(header[6]) > 0 {
+ if period, err = strconv.ParseInt(header[6], 10, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+
+ switch header[5] {
+ case "heapz_v2", "heap_v2":
+ sampling, p.Period = "v2", period
+ case "heapprofile":
+ sampling, p.Period = "", 1
+ case "heap":
+ sampling, p.Period = "v2", period/2
+ default:
+ return nil, errUnrecognized
+ }
+ } else if header = growthHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "heapgrowth", Unit: "count"},
+ Period: 1,
+ }
+ } else if header = fragmentationHeaderRE.FindStringSubmatch(l); header != nil {
+ p = &Profile{
+ SampleType: []*ValueType{
+ {Type: "objects", Unit: "count"},
+ {Type: "space", Unit: "bytes"},
+ },
+ PeriodType: &ValueType{Type: "allocations", Unit: "count"},
+ Period: 1,
+ }
+ } else {
+ return nil, errUnrecognized
+ }
+
+ if LegacyHeapAllocated {
+ for _, st := range p.SampleType {
+ st.Type = "alloc_" + st.Type
+ }
+ } else {
+ for _, st := range p.SampleType {
+ st.Type = "inuse_" + st.Type
+ }
+ }
+
+ locs := make(map[uint64]*Location)
+ for {
+ l, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if l == "" {
+ break
+ }
+ }
+
+ if isSpaceOrComment(l) {
+ continue
+ }
+ l = strings.TrimSpace(l)
+
+ if sectionTrigger(l) != unrecognizedSection {
+ break
+ }
+
+ value, blocksize, addrs, err := parseHeapSample(l, p.Period, sampling)
+ if err != nil {
+ return nil, err
+ }
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ p.Sample = append(p.Sample, &Sample{
+ Value: value,
+ Location: sloc,
+ NumLabel: map[string][]int64{"bytes": {blocksize}},
+ })
+ }
+
+ if err = parseAdditionalSections(l, r, p); err != nil {
+ return nil, err
+ }
+ return p, nil
+}
+
+// parseHeapSample parses a single row from a heap profile into a new Sample.
+func parseHeapSample(line string, rate int64, sampling string) (value []int64, blocksize int64, addrs []uint64, err error) {
+ sampleData := heapSampleRE.FindStringSubmatch(line)
+ if len(sampleData) != 6 {
+ return value, blocksize, addrs, fmt.Errorf("unexpected number of sample values: got %d, want 6", len(sampleData))
+ }
+
+ // Use the first two values by default; tcmalloc sampling generates the
+ // same value for both. Only the older heap-profile format collects
+ // separate stats for in-use and allocated objects.
+ valueIndex := 1
+ if LegacyHeapAllocated {
+ valueIndex = 3
+ }
+
+ var v1, v2 int64
+ if v1, err = strconv.ParseInt(sampleData[valueIndex], 10, 64); err != nil {
+ return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+ if v2, err = strconv.ParseInt(sampleData[valueIndex+1], 10, 64); err != nil {
+ return value, blocksize, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+
+ if v1 == 0 {
+ if v2 != 0 {
+ return value, blocksize, addrs, fmt.Errorf("allocation count was 0 but allocation bytes was %d", v2)
+ }
+ } else {
+ blocksize = v2 / v1
+ if sampling == "v2" {
+ v1, v2 = scaleHeapSample(v1, v2, rate)
+ }
+ }
+
+ value = []int64{v1, v2}
+ addrs = parseHexAddresses(sampleData[5])
+
+ return value, blocksize, addrs, nil
+}
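A hypothetical call on a heapz v2 line in the format the regexp above matches (the 512 KiB rate and the addresses are assumed):

func heapSampleExample() {
	value, blocksize, addrs, err := parseHeapSample(
		"1: 262144 [4: 1048576] @ 0x4a2b3c 0x4a0c1d", 524288, "v2")
	if err != nil {
		return
	}
	// With LegacyHeapAllocated unset this reads the in-use pair (1, 262144),
	// sets blocksize = 262144, applies the v2 sampling correction to value,
	// and returns the two stack addresses unadjusted (the caller does the -1).
	_, _, _ = value, blocksize, addrs
}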
+
+// extractHexAddresses extracts hex numbers from a string and returns
+// them, together with their numeric value, in a slice.
+func extractHexAddresses(s string) ([]string, []uint64) {
+ hexStrings := hexNumberRE.FindAllString(s, -1)
+ var ids []uint64
+ for _, s := range hexStrings {
+ if id, err := strconv.ParseUint(s, 0, 64); err == nil {
+ ids = append(ids, id)
+ } else {
+ // Do not expect any parsing failures due to the regexp matching.
+ panic("failed to parse hex value:" + s)
+ }
+ }
+ return hexStrings, ids
+}
+
+// parseHexAddresses parses hex numbers from a string and returns them
+// in a slice.
+func parseHexAddresses(s string) []uint64 {
+ _, ids := extractHexAddresses(s)
+ return ids
+}
+
+// scaleHeapSample adjusts the data from a heapz Sample to
+// account for its probability of appearing in the collected
+// data. heapz profiles are a sampling of the memory allocation
+// requests in a program. We estimate the unsampled value by dividing
+// each collected sample by its probability of appearing in the
+// profile. heapz v2 profiles rely on a Poisson process to determine
+// which samples to collect, based on the desired average collection
+// rate R. The probability that a sample of size S appears in the
+// profile is 1-exp(-S/R).
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+ if count == 0 || size == 0 {
+ return 0, 0
+ }
+
+ if rate <= 1 {
+ // if rate==1 all samples were collected so no adjustment is needed.
+ // if rate<1 treat as unknown and skip scaling.
+ return count, size
+ }
+
+ avgSize := float64(size) / float64(count)
+ scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+ return int64(float64(count) * scale), int64(float64(size) * scale)
+}
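A worked instance of that correction with assumed numbers: at rate R = 524288, a single 262144-byte sample had probability 1-exp(-0.5) ≈ 0.39 of being collected, so count and bytes are both scaled up by ≈ 2.54.

func scaleExample() (int64, int64) {
	// 1 object of 262144 bytes sampled at an average rate of 524288 bytes.
	return scaleHeapSample(1, 262144, 524288) // ≈ (2, 666237)
}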
+
+// parseContention parses a mutex or contention profile. There are 2 cases:
+// "--- contentionz " for legacy C++ profiles (and backwards compatibility)
+// "--- mutex:" or "--- contention:" for profiles generated by the Go runtime.
+// This code converts the text output from runtime into a *Profile. (In the future
+// the runtime might write a serialized Profile directly making this unnecessary.)
+func parseContention(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+ var l string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ l, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(l) {
+ break
+ }
+ }
+
+ if strings.HasPrefix(l, "--- contentionz ") {
+ return parseCppContention(r)
+ } else if strings.HasPrefix(l, "--- mutex:") {
+ return parseCppContention(r)
+ } else if strings.HasPrefix(l, "--- contention:") {
+ return parseCppContention(r)
+ }
+ return nil, errUnrecognized
+}
+
+// parseCppContention parses the output from synchronization_profiling.cc
+// for backward compatibility, and the compatible (non-debug) block profile
+// output from the Go runtime.
+func parseCppContention(r *bytes.Buffer) (*Profile, error) {
+ p := &Profile{
+ PeriodType: &ValueType{Type: "contentions", Unit: "count"},
+ Period: 1,
+ SampleType: []*ValueType{
+ {Type: "contentions", Unit: "count"},
+ {Type: "delay", Unit: "nanoseconds"},
+ },
+ }
+
+ var cpuHz int64
+ var l string
+ var err error
+ // Parse text of the form "attribute = value" before the samples.
+ const delimiter = "="
+ for {
+ l, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if l == "" {
+ break
+ }
+ }
+ if isSpaceOrComment(l) {
+ continue
+ }
+
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if strings.HasPrefix(l, "---") {
+ break
+ }
+
+ attr := strings.SplitN(l, delimiter, 2)
+ if len(attr) != 2 {
+ break
+ }
+ key, val := strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1])
+ var err error
+ switch key {
+ case "cycles/second":
+ if cpuHz, err = strconv.ParseInt(val, 0, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ case "sampling period":
+ if p.Period, err = strconv.ParseInt(val, 0, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ case "ms since reset":
+ ms, err := strconv.ParseInt(val, 0, 64)
+ if err != nil {
+ return nil, errUnrecognized
+ }
+ p.DurationNanos = ms * 1000 * 1000
+ case "format":
+ // CPP contentionz profiles don't have format.
+ return nil, errUnrecognized
+ case "resolution":
+ // CPP contentionz profiles don't have resolution.
+ return nil, errUnrecognized
+ case "discarded samples":
+ default:
+ return nil, errUnrecognized
+ }
+ }
+
+ locs := make(map[uint64]*Location)
+ for {
+ if !isSpaceOrComment(l) {
+ if l = strings.TrimSpace(l); strings.HasPrefix(l, "---") {
+ break
+ }
+ value, addrs, err := parseContentionSample(l, p.Period, cpuHz)
+ if err != nil {
+ return nil, err
+ }
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+ p.Sample = append(p.Sample, &Sample{
+ Value: value,
+ Location: sloc,
+ })
+ }
+
+ if l, err = r.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ }
+
+ if err = parseAdditionalSections(l, r, p); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+// parseContentionSample parses a single row from a contention profile
+// into a new Sample.
+func parseContentionSample(line string, period, cpuHz int64) (value []int64, addrs []uint64, err error) {
+ sampleData := contentionSampleRE.FindStringSubmatch(line)
+ if sampleData == nil {
+ return value, addrs, errUnrecognized
+ }
+
+ v1, err := strconv.ParseInt(sampleData[1], 10, 64)
+ if err != nil {
+ return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+ v2, err := strconv.ParseInt(sampleData[2], 10, 64)
+ if err != nil {
+ return value, addrs, fmt.Errorf("malformed sample: %s: %v", line, err)
+ }
+
+ // Unsample values if period and cpuHz are available.
+ // - Delays are scaled to cycles and then to nanoseconds.
+ // - Contentions are scaled to cycles.
+ if period > 0 {
+ if cpuHz > 0 {
+ cpuGHz := float64(cpuHz) / 1e9
+ v1 = int64(float64(v1) * float64(period) / cpuGHz)
+ }
+ v2 = v2 * period
+ }
+
+ value = []int64{v2, v1}
+ addrs = parseHexAddresses(sampleData[3])
+
+ return value, addrs, nil
+}
+
+// parseThread parses a Threadz profile and returns a new Profile.
+func parseThread(b []byte) (*Profile, error) {
+ r := bytes.NewBuffer(b)
+
+ var line string
+ var err error
+ for {
+ // Skip past comments and empty lines seeking a real header.
+ line, err = r.ReadString('\n')
+ if err != nil {
+ return nil, err
+ }
+ if !isSpaceOrComment(line) {
+ break
+ }
+ }
+
+ if m := threadzStartRE.FindStringSubmatch(line); m != nil {
+ // Advance over initial comments until first stack trace.
+ for {
+ line, err = r.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return nil, err
+ }
+
+ if line == "" {
+ break
+ }
+ }
+ if sectionTrigger(line) != unrecognizedSection || line[0] == '-' {
+ break
+ }
+ }
+ } else if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
+ return nil, errUnrecognized
+ }
+
+ p := &Profile{
+ SampleType: []*ValueType{{Type: "thread", Unit: "count"}},
+ PeriodType: &ValueType{Type: "thread", Unit: "count"},
+ Period: 1,
+ }
+
+ locs := make(map[uint64]*Location)
+ // Recognize each thread and populate profile samples.
+ for sectionTrigger(line) == unrecognizedSection {
+ if strings.HasPrefix(line, "---- no stack trace for") {
+ line = ""
+ break
+ }
+ if t := threadStartRE.FindStringSubmatch(line); len(t) != 4 {
+ return nil, errUnrecognized
+ }
+
+ var addrs []uint64
+ line, addrs, err = parseThreadSample(r)
+ if err != nil {
+ return nil, errUnrecognized
+ }
+ if len(addrs) == 0 {
+ // We got a --same as previous threads--. Bump counters.
+ if len(p.Sample) > 0 {
+ s := p.Sample[len(p.Sample)-1]
+ s.Value[0]++
+ }
+ continue
+ }
+
+ var sloc []*Location
+ for _, addr := range addrs {
+ // Addresses from stack traces point to the next instruction after
+ // each call. Adjust by -1 to land somewhere on the actual call.
+ addr--
+ loc := locs[addr]
+ if locs[addr] == nil {
+ loc = &Location{
+ Address: addr,
+ }
+ p.Location = append(p.Location, loc)
+ locs[addr] = loc
+ }
+ sloc = append(sloc, loc)
+ }
+
+ p.Sample = append(p.Sample, &Sample{
+ Value: []int64{1},
+ Location: sloc,
+ })
+ }
+
+ if err = parseAdditionalSections(line, r, p); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+// parseThreadSample parses a symbolized or unsymbolized stack trace.
+// Returns the first line after the traceback, the sample (or nil if
+// it hits a 'same-as-previous' marker) and an error.
+func parseThreadSample(b *bytes.Buffer) (nextl string, addrs []uint64, err error) {
+ var l string
+ sameAsPrevious := false
+ for {
+ if l, err = b.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return "", nil, err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if strings.HasPrefix(l, "---") {
+ break
+ }
+ if strings.Contains(l, "same as previous thread") {
+ sameAsPrevious = true
+ continue
+ }
+
+ addrs = append(addrs, parseHexAddresses(l)...)
+ }
+
+ if sameAsPrevious {
+ return l, nil, nil
+ }
+ return l, addrs, nil
+}
+
+// parseAdditionalSections parses any additional sections in the
+// profile, ignoring any unrecognized sections.
+func parseAdditionalSections(l string, b *bytes.Buffer, p *Profile) (err error) {
+ for {
+ if sectionTrigger(l) == memoryMapSection {
+ break
+ }
+ // Ignore any unrecognized sections.
+ if l, err := b.ReadString('\n'); err != nil {
+ if err != io.EOF {
+ return err
+ }
+ if l == "" {
+ break
+ }
+ }
+ }
+ return p.ParseMemoryMap(b)
+}
+
+// ParseMemoryMap parses a memory map in the format of
+// /proc/self/maps, and overrides the mappings in the current profile.
+// It renumbers the samples and locations in the profile correspondingly.
+func (p *Profile) ParseMemoryMap(rd io.Reader) error {
+ b := bufio.NewReader(rd)
+
+ var attrs []string
+ var r *strings.Replacer
+ const delimiter = "="
+ for {
+ l, err := b.ReadString('\n')
+ if err != nil {
+ if err != io.EOF {
+ return err
+ }
+ if l == "" {
+ break
+ }
+ }
+ if l = strings.TrimSpace(l); l == "" {
+ continue
+ }
+
+ if r != nil {
+ l = r.Replace(l)
+ }
+ m, err := parseMappingEntry(l)
+ if err != nil {
+ if err == errUnrecognized {
+ // Recognize assignments of the form: attr=value, and replace
+ // $attr with value on subsequent mappings.
+ if attr := strings.SplitN(l, delimiter, 2); len(attr) == 2 {
+ attrs = append(attrs, "$"+strings.TrimSpace(attr[0]), strings.TrimSpace(attr[1]))
+ r = strings.NewReplacer(attrs...)
+ }
+ // Ignore any unrecognized entries
+ continue
+ }
+ return err
+ }
+ if m == nil || (m.File == "" && len(p.Mapping) != 0) {
+ // In some cases the first entry may include the address range
+ // but not the name of the file. It should be followed by
+ // another entry with the name.
+ continue
+ }
+ if len(p.Mapping) == 1 && p.Mapping[0].File == "" {
+ // Update the name if this is the entry following that empty one.
+ p.Mapping[0].File = m.File
+ continue
+ }
+ p.Mapping = append(p.Mapping, m)
+ }
+ p.remapLocationIDs()
+ p.remapFunctionIDs()
+ p.remapMappingIDs()
+ return nil
+}
+
+func parseMappingEntry(l string) (*Mapping, error) {
+ mapping := &Mapping{}
+ var err error
+ if me := procMapsRE.FindStringSubmatch(l); len(me) == 9 {
+ if !strings.Contains(me[3], "x") {
+ // Skip non-executable entries.
+ return nil, nil
+ }
+ if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if me[4] != "" {
+ if mapping.Offset, err = strconv.ParseUint(me[4], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+ mapping.File = me[8]
+ return mapping, nil
+ }
+
+ if me := briefMapsRE.FindStringSubmatch(l); len(me) == 6 {
+ if mapping.Start, err = strconv.ParseUint(me[1], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ if mapping.Limit, err = strconv.ParseUint(me[2], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ mapping.File = me[3]
+ if me[5] != "" {
+ if mapping.Offset, err = strconv.ParseUint(me[5], 16, 64); err != nil {
+ return nil, errUnrecognized
+ }
+ }
+ return mapping, nil
+ }
+
+ return nil, errUnrecognized
+}
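+
+// As a rough illustration (addresses, device, inode, and path are made
+// up, not taken from a real process), a typical executable entry in
+// /proc/self/maps that parseMappingEntry is expected to accept looks like:
+//
+//	7f4d1c000000-7f4d1c1c5000 r-xp 00000000 fd:01 1048602    /lib/x86_64-linux-gnu/libc-2.24.so
+//
+// The address range becomes Start/Limit, the hex offset becomes Offset,
+// and the trailing path becomes File; entries whose permissions do not
+// include "x" are skipped.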
+
+type sectionType int
+
+const (
+ unrecognizedSection sectionType = iota
+ memoryMapSection
+)
+
+var memoryMapTriggers = []string{
+ "--- Memory map: ---",
+ "MAPPED_LIBRARIES:",
+}
+
+func sectionTrigger(line string) sectionType {
+ for _, trigger := range memoryMapTriggers {
+ if strings.Contains(line, trigger) {
+ return memoryMapSection
+ }
+ }
+ return unrecognizedSection
+}
+
+func (p *Profile) addLegacyFrameInfo() {
+ switch {
+ case isProfileType(p, heapzSampleTypes) ||
+ isProfileType(p, heapzInUseSampleTypes) ||
+ isProfileType(p, heapzAllocSampleTypes):
+ p.DropFrames, p.KeepFrames = allocRxStr, allocSkipRxStr
+ case isProfileType(p, contentionzSampleTypes):
+ p.DropFrames, p.KeepFrames = lockRxStr, ""
+ default:
+ p.DropFrames, p.KeepFrames = cpuProfilerRxStr, ""
+ }
+}
+
+var heapzSampleTypes = []string{"allocations", "size"} // early Go pprof profiles
+var heapzInUseSampleTypes = []string{"inuse_objects", "inuse_space"}
+var heapzAllocSampleTypes = []string{"alloc_objects", "alloc_space"}
+var contentionzSampleTypes = []string{"contentions", "delay"}
+
+func isProfileType(p *Profile, t []string) bool {
+ st := p.SampleType
+ if len(st) != len(t) {
+ return false
+ }
+
+ for i := range st {
+ if st[i].Type != t[i] {
+ return false
+ }
+ }
+ return true
+}
+
+var allocRxStr = strings.Join([]string{
+ // POSIX entry points.
+ `calloc`,
+ `cfree`,
+ `malloc`,
+ `free`,
+ `memalign`,
+ `do_memalign`,
+ `(__)?posix_memalign`,
+ `pvalloc`,
+ `valloc`,
+ `realloc`,
+
+ // TC malloc.
+ `tcmalloc::.*`,
+ `tc_calloc`,
+ `tc_cfree`,
+ `tc_malloc`,
+ `tc_free`,
+ `tc_memalign`,
+ `tc_posix_memalign`,
+ `tc_pvalloc`,
+ `tc_valloc`,
+ `tc_realloc`,
+ `tc_new`,
+ `tc_delete`,
+ `tc_newarray`,
+ `tc_deletearray`,
+ `tc_new_nothrow`,
+ `tc_newarray_nothrow`,
+
+ // Memory-allocation routines on OS X.
+ `malloc_zone_malloc`,
+ `malloc_zone_calloc`,
+ `malloc_zone_valloc`,
+ `malloc_zone_realloc`,
+ `malloc_zone_memalign`,
+ `malloc_zone_free`,
+
+ // Go runtime
+ `runtime\..*`,
+
+ // Other misc. memory allocation routines
+ `BaseArena::.*`,
+ `(::)?do_malloc_no_errno`,
+ `(::)?do_malloc_pages`,
+ `(::)?do_malloc`,
+ `DoSampledAllocation`,
+ `MallocedMemBlock::MallocedMemBlock`,
+ `_M_allocate`,
+ `__builtin_(vec_)?delete`,
+ `__builtin_(vec_)?new`,
+ `__gnu_cxx::new_allocator::allocate`,
+ `__libc_malloc`,
+ `__malloc_alloc_template::allocate`,
+ `allocate`,
+ `cpp_alloc`,
+ `operator new(\[\])?`,
+ `simple_alloc::allocate`,
+}, `|`)
+
+var allocSkipRxStr = strings.Join([]string{
+ // Preserve Go runtime frames that appear in the middle/bottom of
+ // the stack.
+ `runtime\.panic`,
+ `runtime\.reflectcall`,
+ `runtime\.call[0-9]*`,
+}, `|`)
+
+var cpuProfilerRxStr = strings.Join([]string{
+ `ProfileData::Add`,
+ `ProfileData::prof_handler`,
+ `CpuProfiler::prof_handler`,
+ `__pthread_sighandler`,
+ `__restore`,
+}, `|`)
+
+var lockRxStr = strings.Join([]string{
+ `RecordLockProfileData`,
+ `(base::)?RecordLockProfileData.*`,
+ `(base::)?SubmitMutexProfileData.*`,
+ `(base::)?SubmitSpinLockProfileData.*`,
+ `(Mutex::)?AwaitCommon.*`,
+ `(Mutex::)?Unlock.*`,
+ `(Mutex::)?UnlockSlow.*`,
+ `(Mutex::)?ReaderUnlock.*`,
+ `(MutexLock::)?~MutexLock.*`,
+ `(SpinLock::)?Unlock.*`,
+ `(SpinLock::)?SlowUnlock.*`,
+ `(SpinLockHolder::)?~SpinLockHolder.*`,
+}, `|`)
diff --git a/src/runtime/pprof/internal/profile/profile.go b/src/runtime/pprof/internal/profile/profile.go
new file mode 100644
index 0000000..9b6a6f9
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/profile.go
@@ -0,0 +1,575 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package profile provides a representation of profile.proto and
+// methods to encode/decode profiles in this format.
+//
+// This package is only for testing runtime/pprof.
+// It is not used by production Go programs.
+package profile
+
+import (
+ "bytes"
+ "compress/gzip"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "regexp"
+ "strings"
+ "time"
+)
+
+// Profile is an in-memory representation of profile.proto.
+type Profile struct {
+ SampleType []*ValueType
+ Sample []*Sample
+ Mapping []*Mapping
+ Location []*Location
+ Function []*Function
+
+ DropFrames string
+ KeepFrames string
+
+ TimeNanos int64
+ DurationNanos int64
+ PeriodType *ValueType
+ Period int64
+
+ dropFramesX int64
+ keepFramesX int64
+ stringTable []string
+}
+
+// ValueType corresponds to Profile.ValueType
+type ValueType struct {
+ Type string // cpu, wall, inuse_space, etc
+ Unit string // seconds, nanoseconds, bytes, etc
+
+ typeX int64
+ unitX int64
+}
+
+// Sample corresponds to Profile.Sample
+type Sample struct {
+ Location []*Location
+ Value []int64
+ Label map[string][]string
+ NumLabel map[string][]int64
+
+ locationIDX []uint64
+ labelX []Label
+}
+
+// Label corresponds to Profile.Label
+type Label struct {
+ keyX int64
+ // Exactly one of the two following values must be set
+ strX int64
+ numX int64 // Integer value for this label
+}
+
+// Mapping corresponds to Profile.Mapping
+type Mapping struct {
+ ID uint64
+ Start uint64
+ Limit uint64
+ Offset uint64
+ File string
+ BuildID string
+ HasFunctions bool
+ HasFilenames bool
+ HasLineNumbers bool
+ HasInlineFrames bool
+
+ fileX int64
+ buildIDX int64
+}
+
+// Location corresponds to Profile.Location
+type Location struct {
+ ID uint64
+ Mapping *Mapping
+ Address uint64
+ Line []Line
+
+ mappingIDX uint64
+}
+
+// Line corresponds to Profile.Line
+type Line struct {
+ Function *Function
+ Line int64
+
+ functionIDX uint64
+}
+
+// Function corresponds to Profile.Function
+type Function struct {
+ ID uint64
+ Name string
+ SystemName string
+ Filename string
+ StartLine int64
+
+ nameX int64
+ systemNameX int64
+ filenameX int64
+}
+
+// Parse parses a profile and checks for its validity. The input
+// may be a gzip-compressed encoded protobuf or one of many legacy
+// profile formats which may be unsupported in the future.
+func Parse(r io.Reader) (*Profile, error) {
+ orig, err := ioutil.ReadAll(r)
+ if err != nil {
+ return nil, err
+ }
+
+ var p *Profile
+ if len(orig) >= 2 && orig[0] == 0x1f && orig[1] == 0x8b {
+ gz, err := gzip.NewReader(bytes.NewBuffer(orig))
+ if err != nil {
+ return nil, fmt.Errorf("decompressing profile: %v", err)
+ }
+ data, err := ioutil.ReadAll(gz)
+ if err != nil {
+ return nil, fmt.Errorf("decompressing profile: %v", err)
+ }
+ orig = data
+ }
+ if p, err = parseUncompressed(orig); err != nil {
+ if p, err = parseLegacy(orig); err != nil {
+ return nil, fmt.Errorf("parsing profile: %v", err)
+ }
+ }
+
+ if err := p.CheckValid(); err != nil {
+ return nil, fmt.Errorf("malformed profile: %v", err)
+ }
+ return p, nil
+}
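+
+// A minimal usage sketch (the file name and error handling are
+// illustrative only; this package is internal to runtime/pprof and is
+// only exercised from its tests):
+//
+//	f, err := os.Open("cpu.pb.gz") // hypothetical input file
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	defer f.Close()
+//	p, err := Parse(f)
+//	if err != nil {
+//		log.Fatal(err)
+//	}
+//	fmt.Print(p) // Profile implements fmt.Stringer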
+
+var errUnrecognized = fmt.Errorf("unrecognized profile format")
+var errMalformed = fmt.Errorf("malformed profile format")
+
+func parseLegacy(data []byte) (*Profile, error) {
+ parsers := []func([]byte) (*Profile, error){
+ parseCPU,
+ parseHeap,
+ parseGoCount, // goroutine, threadcreate
+ parseThread,
+ parseContention,
+ }
+
+ for _, parser := range parsers {
+ p, err := parser(data)
+ if err == nil {
+ p.setMain()
+ p.addLegacyFrameInfo()
+ return p, nil
+ }
+ if err != errUnrecognized {
+ return nil, err
+ }
+ }
+ return nil, errUnrecognized
+}
+
+func parseUncompressed(data []byte) (*Profile, error) {
+ p := &Profile{}
+ if err := unmarshal(data, p); err != nil {
+ return nil, err
+ }
+
+ if err := p.postDecode(); err != nil {
+ return nil, err
+ }
+
+ return p, nil
+}
+
+var libRx = regexp.MustCompile(`([.]so$|[.]so[._][0-9]+)`)
+
+// setMain scans Mapping entries and guesses which entry is main
+// because legacy profiles don't obey the convention of putting main
+// first.
+func (p *Profile) setMain() {
+ for i := 0; i < len(p.Mapping); i++ {
+ file := strings.TrimSpace(strings.Replace(p.Mapping[i].File, "(deleted)", "", -1))
+ if len(file) == 0 {
+ continue
+ }
+ if len(libRx.FindStringSubmatch(file)) > 0 {
+ continue
+ }
+ if strings.HasPrefix(file, "[") {
+ continue
+ }
+ // Swap what we guess is main to position 0.
+ tmp := p.Mapping[i]
+ p.Mapping[i] = p.Mapping[0]
+ p.Mapping[0] = tmp
+ break
+ }
+}
+
+// Write writes the profile as a gzip-compressed marshaled protobuf.
+func (p *Profile) Write(w io.Writer) error {
+ p.preEncode()
+ b := marshal(p)
+ zw := gzip.NewWriter(w)
+ defer zw.Close()
+ _, err := zw.Write(b)
+ return err
+}
+
+// CheckValid tests whether the profile is valid. Checks include, but are
+// not limited to:
+// - len(Profile.Sample[n].value) == len(Profile.value_unit)
+// - Sample.id has a corresponding Profile.Location
+func (p *Profile) CheckValid() error {
+ // Check that sample values are consistent
+ sampleLen := len(p.SampleType)
+ if sampleLen == 0 && len(p.Sample) != 0 {
+ return fmt.Errorf("missing sample type information")
+ }
+ for _, s := range p.Sample {
+ if len(s.Value) != sampleLen {
+ return fmt.Errorf("mismatch: sample has: %d values vs. %d types", len(s.Value), len(p.SampleType))
+ }
+ }
+
+ // Check that all mappings/locations/functions are in the tables
+ // Check that there are no duplicate ids
+ mappings := make(map[uint64]*Mapping, len(p.Mapping))
+ for _, m := range p.Mapping {
+ if m.ID == 0 {
+ return fmt.Errorf("found mapping with reserved ID=0")
+ }
+ if mappings[m.ID] != nil {
+ return fmt.Errorf("multiple mappings with same id: %d", m.ID)
+ }
+ mappings[m.ID] = m
+ }
+ functions := make(map[uint64]*Function, len(p.Function))
+ for _, f := range p.Function {
+ if f.ID == 0 {
+ return fmt.Errorf("found function with reserved ID=0")
+ }
+ if functions[f.ID] != nil {
+ return fmt.Errorf("multiple functions with same id: %d", f.ID)
+ }
+ functions[f.ID] = f
+ }
+ locations := make(map[uint64]*Location, len(p.Location))
+ for _, l := range p.Location {
+ if l.ID == 0 {
+ return fmt.Errorf("found location with reserved id=0")
+ }
+ if locations[l.ID] != nil {
+ return fmt.Errorf("multiple locations with same id: %d", l.ID)
+ }
+ locations[l.ID] = l
+ if m := l.Mapping; m != nil {
+ if m.ID == 0 || mappings[m.ID] != m {
+ return fmt.Errorf("inconsistent mapping %p: %d", m, m.ID)
+ }
+ }
+ for _, ln := range l.Line {
+ if f := ln.Function; f != nil {
+ if f.ID == 0 || functions[f.ID] != f {
+ return fmt.Errorf("inconsistent function %p: %d", f, f.ID)
+ }
+ }
+ }
+ }
+ return nil
+}
+
+// Aggregate merges the locations in the profile into equivalence
+// classes preserving the request attributes. It also updates the
+// samples to point to the merged locations.
+func (p *Profile) Aggregate(inlineFrame, function, filename, linenumber, address bool) error {
+ for _, m := range p.Mapping {
+ m.HasInlineFrames = m.HasInlineFrames && inlineFrame
+ m.HasFunctions = m.HasFunctions && function
+ m.HasFilenames = m.HasFilenames && filename
+ m.HasLineNumbers = m.HasLineNumbers && linenumber
+ }
+
+ // Aggregate functions
+ if !function || !filename {
+ for _, f := range p.Function {
+ if !function {
+ f.Name = ""
+ f.SystemName = ""
+ }
+ if !filename {
+ f.Filename = ""
+ }
+ }
+ }
+
+ // Aggregate locations
+ if !inlineFrame || !address || !linenumber {
+ for _, l := range p.Location {
+ if !inlineFrame && len(l.Line) > 1 {
+ l.Line = l.Line[len(l.Line)-1:]
+ }
+ if !linenumber {
+ for i := range l.Line {
+ l.Line[i].Line = 0
+ }
+ }
+ if !address {
+ l.Address = 0
+ }
+ }
+ }
+
+ return p.CheckValid()
+}
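+
+// For example (a sketch, with flags chosen only for illustration),
+// collapsing a profile down to function names while dropping filenames,
+// line numbers, addresses, and inlined frames could look like:
+//
+//	if err := p.Aggregate(false, true, false, false, false); err != nil {
+//		// the aggregated profile failed validation
+//	}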
+
+// String dumps a text representation of a profile. Intended mainly
+// for debugging purposes.
+func (p *Profile) String() string {
+
+ ss := make([]string, 0, len(p.Sample)+len(p.Mapping)+len(p.Location))
+ if pt := p.PeriodType; pt != nil {
+ ss = append(ss, fmt.Sprintf("PeriodType: %s %s", pt.Type, pt.Unit))
+ }
+ ss = append(ss, fmt.Sprintf("Period: %d", p.Period))
+ if p.TimeNanos != 0 {
+ ss = append(ss, fmt.Sprintf("Time: %v", time.Unix(0, p.TimeNanos)))
+ }
+ if p.DurationNanos != 0 {
+ ss = append(ss, fmt.Sprintf("Duration: %v", time.Duration(p.DurationNanos)))
+ }
+
+ ss = append(ss, "Samples:")
+ var sh1 string
+ for _, s := range p.SampleType {
+ sh1 = sh1 + fmt.Sprintf("%s/%s ", s.Type, s.Unit)
+ }
+ ss = append(ss, strings.TrimSpace(sh1))
+ for _, s := range p.Sample {
+ var sv string
+ for _, v := range s.Value {
+ sv = fmt.Sprintf("%s %10d", sv, v)
+ }
+ sv = sv + ": "
+ for _, l := range s.Location {
+ sv = sv + fmt.Sprintf("%d ", l.ID)
+ }
+ ss = append(ss, sv)
+ const labelHeader = " "
+ if len(s.Label) > 0 {
+ ls := labelHeader
+ for k, v := range s.Label {
+ ls = ls + fmt.Sprintf("%s:%v ", k, v)
+ }
+ ss = append(ss, ls)
+ }
+ if len(s.NumLabel) > 0 {
+ ls := labelHeader
+ for k, v := range s.NumLabel {
+ ls = ls + fmt.Sprintf("%s:%v ", k, v)
+ }
+ ss = append(ss, ls)
+ }
+ }
+
+ ss = append(ss, "Locations")
+ for _, l := range p.Location {
+ locStr := fmt.Sprintf("%6d: %#x ", l.ID, l.Address)
+ if m := l.Mapping; m != nil {
+ locStr = locStr + fmt.Sprintf("M=%d ", m.ID)
+ }
+ if len(l.Line) == 0 {
+ ss = append(ss, locStr)
+ }
+ for li := range l.Line {
+ lnStr := "??"
+ if fn := l.Line[li].Function; fn != nil {
+ lnStr = fmt.Sprintf("%s %s:%d s=%d",
+ fn.Name,
+ fn.Filename,
+ l.Line[li].Line,
+ fn.StartLine)
+ if fn.Name != fn.SystemName {
+ lnStr = lnStr + "(" + fn.SystemName + ")"
+ }
+ }
+ ss = append(ss, locStr+lnStr)
+ // Do not print location details past the first line
+ locStr = " "
+ }
+ }
+
+ ss = append(ss, "Mappings")
+ for _, m := range p.Mapping {
+ bits := ""
+ if m.HasFunctions {
+ bits = bits + "[FN]"
+ }
+ if m.HasFilenames {
+ bits = bits + "[FL]"
+ }
+ if m.HasLineNumbers {
+ bits = bits + "[LN]"
+ }
+ if m.HasInlineFrames {
+ bits = bits + "[IN]"
+ }
+ ss = append(ss, fmt.Sprintf("%d: %#x/%#x/%#x %s %s %s",
+ m.ID,
+ m.Start, m.Limit, m.Offset,
+ m.File,
+ m.BuildID,
+ bits))
+ }
+
+ return strings.Join(ss, "\n") + "\n"
+}
+
+// Merge adds profile pb adjusted by ratio r into profile p. Profiles
+// must be compatible (same Type and SampleType).
+// TODO(rsilvera): consider normalizing the profiles based on the
+// total samples collected.
+func (p *Profile) Merge(pb *Profile, r float64) error {
+ if err := p.Compatible(pb); err != nil {
+ return err
+ }
+
+ pb = pb.Copy()
+
+ // Keep the largest of the two periods.
+ if pb.Period > p.Period {
+ p.Period = pb.Period
+ }
+
+ p.DurationNanos += pb.DurationNanos
+
+ p.Mapping = append(p.Mapping, pb.Mapping...)
+ for i, m := range p.Mapping {
+ m.ID = uint64(i + 1)
+ }
+ p.Location = append(p.Location, pb.Location...)
+ for i, l := range p.Location {
+ l.ID = uint64(i + 1)
+ }
+ p.Function = append(p.Function, pb.Function...)
+ for i, f := range p.Function {
+ f.ID = uint64(i + 1)
+ }
+
+ if r != 1.0 {
+ for _, s := range pb.Sample {
+ for i, v := range s.Value {
+ s.Value[i] = int64((float64(v) * r))
+ }
+ }
+ }
+ p.Sample = append(p.Sample, pb.Sample...)
+ return p.CheckValid()
+}
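+
+// A short sketch (the ratio is arbitrary): fold profile pb into p,
+// halving pb's sample values first:
+//
+//	if err := p.Merge(pb, 0.5); err != nil {
+//		// profiles were incompatible or the merged result failed validation
+//	}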
+
+// Compatible determines if two profiles can be compared/merged.
+// It returns nil if the profiles are compatible; otherwise it returns
+// an error with details on the incompatibility.
+func (p *Profile) Compatible(pb *Profile) error {
+ if !compatibleValueTypes(p.PeriodType, pb.PeriodType) {
+ return fmt.Errorf("incompatible period types %v and %v", p.PeriodType, pb.PeriodType)
+ }
+
+ if len(p.SampleType) != len(pb.SampleType) {
+ return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
+ }
+
+ for i := range p.SampleType {
+ if !compatibleValueTypes(p.SampleType[i], pb.SampleType[i]) {
+ return fmt.Errorf("incompatible sample types %v and %v", p.SampleType, pb.SampleType)
+ }
+ }
+
+ return nil
+}
+
+// HasFunctions determines if all locations in this profile have
+// symbolized function information.
+func (p *Profile) HasFunctions() bool {
+ for _, l := range p.Location {
+ if l.Mapping == nil || !l.Mapping.HasFunctions {
+ return false
+ }
+ }
+ return true
+}
+
+// HasFileLines determines if all locations in this profile have
+// symbolized file and line number information.
+func (p *Profile) HasFileLines() bool {
+ for _, l := range p.Location {
+ if l.Mapping == nil || (!l.Mapping.HasFilenames || !l.Mapping.HasLineNumbers) {
+ return false
+ }
+ }
+ return true
+}
+
+func compatibleValueTypes(v1, v2 *ValueType) bool {
+ if v1 == nil || v2 == nil {
+ return true // No grounds to disqualify.
+ }
+ return v1.Type == v2.Type && v1.Unit == v2.Unit
+}
+
+// Copy makes a fully independent copy of a profile.
+func (p *Profile) Copy() *Profile {
+ p.preEncode()
+ b := marshal(p)
+
+ pp := &Profile{}
+ if err := unmarshal(b, pp); err != nil {
+ panic(err)
+ }
+ if err := pp.postDecode(); err != nil {
+ panic(err)
+ }
+
+ return pp
+}
+
+// Demangler maps symbol names to a human-readable form. This may
+// include C++ demangling and additional simplification. Names that
+// are not demangled may be missing from the resulting map.
+type Demangler func(name []string) (map[string]string, error)
+
+// Demangle attempts to demangle and optionally simplify any function
+// names referenced in the profile. It works on a best-effort basis:
+// it will silently preserve the original names in case of any errors.
+func (p *Profile) Demangle(d Demangler) error {
+ // Collect names to demangle.
+ var names []string
+ for _, fn := range p.Function {
+ names = append(names, fn.SystemName)
+ }
+
+ // Update profile with demangled names.
+ demangled, err := d(names)
+ if err != nil {
+ return err
+ }
+ for _, fn := range p.Function {
+ if dd, ok := demangled[fn.SystemName]; ok {
+ fn.Name = dd
+ }
+ }
+ return nil
+}
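+
+// A minimal Demangler sketch that leaves every name untouched (purely
+// illustrative; a real implementation would call out to a C++ demangler):
+//
+//	identity := func(names []string) (map[string]string, error) {
+//		m := make(map[string]string, len(names))
+//		for _, name := range names {
+//			m[name] = name
+//		}
+//		return m, nil
+//	}
+//	_ = p.Demangle(identity)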
+
+// Empty returns true if the profile contains no samples.
+func (p *Profile) Empty() bool {
+ return len(p.Sample) == 0
+}
diff --git a/src/runtime/pprof/internal/profile/profile_test.go b/src/runtime/pprof/internal/profile/profile_test.go
new file mode 100644
index 0000000..e1963f3
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/profile_test.go
@@ -0,0 +1,79 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package profile
+
+import (
+ "bytes"
+ "testing"
+)
+
+func TestEmptyProfile(t *testing.T) {
+ var buf bytes.Buffer
+ p, err := Parse(&buf)
+ if err != nil {
+ t.Error("Want no error, got", err)
+ }
+ if p == nil {
+ t.Fatal("Want a valid profile, got <nil>")
+ }
+ if !p.Empty() {
+ t.Errorf("Profile should be empty, got %#v", p)
+ }
+}
+
+func TestParseContention(t *testing.T) {
+ tests := []struct {
+ name string
+ in string
+ wantErr bool
+ }{
+ {
+ name: "valid",
+ in: `--- mutex:
+cycles/second=3491920901
+sampling period=1
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
+34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
+`,
+ },
+ {
+ name: "valid with comment",
+ in: `--- mutex:
+cycles/second=3491920901
+sampling period=1
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31
+# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126
+# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125
+# 0x4a2be0 main.main.func3+0x70 /go/src/internal/pprof/profile/a_binary.go:58
+
+34035731690 15760 @ 0x45e851 0x45f764 0x4a2b17 0x44ea31
+# 0x45e850 sync.(*Mutex).Unlock+0x80 /go/src/sync/mutex.go:126
+# 0x45f763 sync.(*RWMutex).Unlock+0x83 /go/src/sync/rwmutex.go:125
+# 0x4a2b16 main.main.func2+0xd6 /go/src/internal/pprof/profile/a_binary.go:48
+`,
+ },
+ {
+ name: "empty",
+ in: `--- mutex:`,
+ wantErr: true,
+ },
+ {
+ name: "invalid header",
+ in: `--- channel:
+43227965305 1659640 @ 0x45e851 0x45f764 0x4a2be1 0x44ea31`,
+ wantErr: true,
+ },
+ }
+ for _, tc := range tests {
+ _, err := parseContention([]byte(tc.in))
+ if tc.wantErr && err == nil {
+ t.Errorf("parseContention(%q) succeeded unexpectedly", tc.name)
+ }
+ if !tc.wantErr && err != nil {
+ t.Errorf("parseContention(%q) failed unexpectedly: %v", tc.name, err)
+ }
+ }
+
+}
diff --git a/src/runtime/pprof/internal/profile/proto.go b/src/runtime/pprof/internal/profile/proto.go
new file mode 100644
index 0000000..11d7f9f
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/proto.go
@@ -0,0 +1,360 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This file is a simple protocol buffer encoder and decoder.
+//
+// A protocol message must implement the message interface:
+// decoder() []decoder
+// encode(*buffer)
+//
+// The decode method returns a slice indexed by field number that gives the
+// function to decode that field.
+// The encode method encodes its receiver into the given buffer.
+//
+// The two methods are simple enough to be implemented by hand rather than
+// by using a protocol compiler.
+//
+// See profile.go for examples of messages implementing this interface.
+//
+// There is no support for groups, message sets, or "has" bits.
+
+package profile
+
+import (
+	"errors"
+	"fmt"
+)
+
+type buffer struct {
+ field int
+ typ int
+ u64 uint64
+ data []byte
+ tmp [16]byte
+}
+
+type decoder func(*buffer, message) error
+
+type message interface {
+ decoder() []decoder
+ encode(*buffer)
+}
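+
+// A minimal message implementation, sketched here for illustration (the
+// packedInts helper in proto_test.go is the concrete example exercised by
+// the tests):
+//
+//	type point struct{ x, y int64 }
+//
+//	func (p *point) decoder() []decoder {
+//		return []decoder{
+//			nil, // field numbers start at 1
+//			func(b *buffer, m message) error { return decodeInt64(b, &m.(*point).x) },
+//			func(b *buffer, m message) error { return decodeInt64(b, &m.(*point).y) },
+//		}
+//	}
+//
+//	func (p *point) encode(b *buffer) {
+//		encodeInt64Opt(b, 1, p.x)
+//		encodeInt64Opt(b, 2, p.y)
+//	}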
+
+func marshal(m message) []byte {
+ var b buffer
+ m.encode(&b)
+ return b.data
+}
+
+func encodeVarint(b *buffer, x uint64) {
+ for x >= 128 {
+ b.data = append(b.data, byte(x)|0x80)
+ x >>= 7
+ }
+ b.data = append(b.data, byte(x))
+}
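+
+// As a worked example (standard protocol buffer varint encoding): the
+// value 300 (binary 1_0010_1100) is written as the two bytes 0xAC 0x02,
+// low seven bits first, with the continuation bit set on every byte
+// except the last.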
+
+func encodeLength(b *buffer, tag int, len int) {
+ encodeVarint(b, uint64(tag)<<3|2)
+ encodeVarint(b, uint64(len))
+}
+
+func encodeUint64(b *buffer, tag int, x uint64) {
+	// append the field key (tag<<3 | wire type 0) and the value, both as varints
+ encodeVarint(b, uint64(tag)<<3|0)
+ encodeVarint(b, x)
+}
+
+func encodeUint64s(b *buffer, tag int, x []uint64) {
+ if len(x) > 2 {
+ // Use packed encoding
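+		// The tag/length header cannot be written until the encoded size of
+		// the values is known, so the values are encoded first, the header
+		// is appended after them, and the copies below rotate the header to
+		// the front of the values.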
+ n1 := len(b.data)
+ for _, u := range x {
+ encodeVarint(b, u)
+ }
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ encodeUint64(b, tag, u)
+ }
+}
+
+func encodeUint64Opt(b *buffer, tag int, x uint64) {
+ if x == 0 {
+ return
+ }
+ encodeUint64(b, tag, x)
+}
+
+func encodeInt64(b *buffer, tag int, x int64) {
+ u := uint64(x)
+ encodeUint64(b, tag, u)
+}
+
+func encodeInt64Opt(b *buffer, tag int, x int64) {
+ if x == 0 {
+ return
+ }
+ encodeInt64(b, tag, x)
+}
+
+func encodeInt64s(b *buffer, tag int, x []int64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ encodeVarint(b, uint64(u))
+ }
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ encodeInt64(b, tag, u)
+ }
+}
+
+func encodeString(b *buffer, tag int, x string) {
+ encodeLength(b, tag, len(x))
+ b.data = append(b.data, x...)
+}
+
+func encodeStrings(b *buffer, tag int, x []string) {
+ for _, s := range x {
+ encodeString(b, tag, s)
+ }
+}
+
+func encodeStringOpt(b *buffer, tag int, x string) {
+ if x == "" {
+ return
+ }
+ encodeString(b, tag, x)
+}
+
+func encodeBool(b *buffer, tag int, x bool) {
+ if x {
+ encodeUint64(b, tag, 1)
+ } else {
+ encodeUint64(b, tag, 0)
+ }
+}
+
+func encodeBoolOpt(b *buffer, tag int, x bool) {
+	if !x {
+ return
+ }
+ encodeBool(b, tag, x)
+}
+
+func encodeMessage(b *buffer, tag int, m message) {
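+	// Same layout trick as encodeUint64s: encode the body first, then
+	// rotate the tag/length header in front of it.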
+ n1 := len(b.data)
+ m.encode(b)
+ n2 := len(b.data)
+ encodeLength(b, tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+}
+
+func unmarshal(data []byte, m message) (err error) {
+ b := buffer{data: data, typ: 2}
+ return decodeMessage(&b, m)
+}
+
+func le64(p []byte) uint64 {
+ return uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 | uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56
+}
+
+func le32(p []byte) uint32 {
+ return uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
+}
+
+func decodeVarint(data []byte) (uint64, []byte, error) {
+ var i int
+ var u uint64
+ for i = 0; ; i++ {
+ if i >= 10 || i >= len(data) {
+ return 0, nil, errors.New("bad varint")
+ }
+ u |= uint64(data[i]&0x7F) << uint(7*i)
+ if data[i]&0x80 == 0 {
+ return u, data[i+1:], nil
+ }
+ }
+}
+
+func decodeField(b *buffer, data []byte) ([]byte, error) {
+ x, data, err := decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+ b.field = int(x >> 3)
+ b.typ = int(x & 7)
+ b.data = nil
+ b.u64 = 0
+ switch b.typ {
+	case 0: // varint
+ b.u64, data, err = decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+	case 1: // 64-bit (fixed64)
+ if len(data) < 8 {
+ return nil, errors.New("not enough data")
+ }
+ b.u64 = le64(data[:8])
+ data = data[8:]
+	case 2: // length-delimited (strings, bytes, embedded messages, packed fields)
+ var n uint64
+ n, data, err = decodeVarint(data)
+ if err != nil {
+ return nil, err
+ }
+ if n > uint64(len(data)) {
+ return nil, errors.New("too much data")
+ }
+ b.data = data[:n]
+ data = data[n:]
+	case 5: // 32-bit (fixed32)
+ if len(data) < 4 {
+ return nil, errors.New("not enough data")
+ }
+ b.u64 = uint64(le32(data[:4]))
+ data = data[4:]
+	default:
+		return nil, fmt.Errorf("unknown wire type: %d", b.typ)
+ }
+
+ return data, nil
+}
+
+func checkType(b *buffer, typ int) error {
+ if b.typ != typ {
+ return errors.New("type mismatch")
+ }
+ return nil
+}
+
+func decodeMessage(b *buffer, m message) error {
+ if err := checkType(b, 2); err != nil {
+ return err
+ }
+ dec := m.decoder()
+ data := b.data
+ for len(data) > 0 {
+ // pull varint field# + type
+ var err error
+ data, err = decodeField(b, data)
+ if err != nil {
+ return err
+ }
+ if b.field >= len(dec) || dec[b.field] == nil {
+ continue
+ }
+ if err := dec[b.field](b, m); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func decodeInt64(b *buffer, x *int64) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ *x = int64(b.u64)
+ return nil
+}
+
+func decodeInt64s(b *buffer, x *[]int64) error {
+ if b.typ == 2 {
+ // Packed encoding
+ data := b.data
+ for len(data) > 0 {
+ var u uint64
+ var err error
+
+ if u, data, err = decodeVarint(data); err != nil {
+ return err
+ }
+ *x = append(*x, int64(u))
+ }
+ return nil
+ }
+ var i int64
+ if err := decodeInt64(b, &i); err != nil {
+ return err
+ }
+ *x = append(*x, i)
+ return nil
+}
+
+func decodeUint64(b *buffer, x *uint64) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ *x = b.u64
+ return nil
+}
+
+func decodeUint64s(b *buffer, x *[]uint64) error {
+ if b.typ == 2 {
+ data := b.data
+ // Packed encoding
+ for len(data) > 0 {
+ var u uint64
+ var err error
+
+ if u, data, err = decodeVarint(data); err != nil {
+ return err
+ }
+ *x = append(*x, u)
+ }
+ return nil
+ }
+ var u uint64
+ if err := decodeUint64(b, &u); err != nil {
+ return err
+ }
+ *x = append(*x, u)
+ return nil
+}
+
+func decodeString(b *buffer, x *string) error {
+ if err := checkType(b, 2); err != nil {
+ return err
+ }
+ *x = string(b.data)
+ return nil
+}
+
+func decodeStrings(b *buffer, x *[]string) error {
+ var s string
+ if err := decodeString(b, &s); err != nil {
+ return err
+ }
+ *x = append(*x, s)
+ return nil
+}
+
+func decodeBool(b *buffer, x *bool) error {
+ if err := checkType(b, 0); err != nil {
+ return err
+ }
+ if int64(b.u64) == 0 {
+ *x = false
+ } else {
+ *x = true
+ }
+ return nil
+}
diff --git a/src/runtime/pprof/internal/profile/proto_test.go b/src/runtime/pprof/internal/profile/proto_test.go
new file mode 100644
index 0000000..c2613fc
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/proto_test.go
@@ -0,0 +1,67 @@
+package profile
+
+import (
+ "reflect"
+ "testing"
+)
+
+func TestPackedEncoding(t *testing.T) {
+
+ type testcase struct {
+ uint64s []uint64
+ int64s []int64
+ encoded []byte
+ }
+ for i, tc := range []testcase{
+ {
+ []uint64{0, 1, 10, 100, 1000, 10000},
+ []int64{1000, 0, 1000},
+ []byte{10, 8, 0, 1, 10, 100, 232, 7, 144, 78, 18, 5, 232, 7, 0, 232, 7},
+ },
+ {
+ []uint64{10000},
+ nil,
+ []byte{8, 144, 78},
+ },
+ {
+ nil,
+ []int64{-10000},
+ []byte{16, 240, 177, 255, 255, 255, 255, 255, 255, 255, 1},
+ },
+ } {
+ source := &packedInts{tc.uint64s, tc.int64s}
+ if got, want := marshal(source), tc.encoded; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed encode %d, got %v, want %v", i, got, want)
+ }
+
+ dest := new(packedInts)
+ if err := unmarshal(tc.encoded, dest); err != nil {
+ t.Errorf("failed decode %d: %v", i, err)
+ continue
+ }
+ if got, want := dest.uint64s, tc.uint64s; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed decode uint64s %d, got %v, want %v", i, got, want)
+ }
+ if got, want := dest.int64s, tc.int64s; !reflect.DeepEqual(got, want) {
+ t.Errorf("failed decode int64s %d, got %v, want %v", i, got, want)
+ }
+ }
+}
+
+type packedInts struct {
+ uint64s []uint64
+ int64s []int64
+}
+
+func (u *packedInts) decoder() []decoder {
+ return []decoder{
+ nil,
+ func(b *buffer, m message) error { return decodeUint64s(b, &m.(*packedInts).uint64s) },
+ func(b *buffer, m message) error { return decodeInt64s(b, &m.(*packedInts).int64s) },
+ }
+}
+
+func (u *packedInts) encode(b *buffer) {
+ encodeUint64s(b, 1, u.uint64s)
+ encodeInt64s(b, 2, u.int64s)
+}
diff --git a/src/runtime/pprof/internal/profile/prune.go b/src/runtime/pprof/internal/profile/prune.go
new file mode 100644
index 0000000..1924fad
--- /dev/null
+++ b/src/runtime/pprof/internal/profile/prune.go
@@ -0,0 +1,97 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Implements methods to remove frames from profiles.
+
+package profile
+
+import (
+ "fmt"
+ "regexp"
+)
+
+// Prune removes all nodes beneath a node matching dropRx, and not
+// matching keepRx. If the root node of a Sample matches, the sample
+// will have an empty stack.
+func (p *Profile) Prune(dropRx, keepRx *regexp.Regexp) {
+ prune := make(map[uint64]bool)
+ pruneBeneath := make(map[uint64]bool)
+
+ for _, loc := range p.Location {
+ var i int
+ for i = len(loc.Line) - 1; i >= 0; i-- {
+ if fn := loc.Line[i].Function; fn != nil && fn.Name != "" {
+ funcName := fn.Name
+ // Account for leading '.' on the PPC ELF v1 ABI.
+ if funcName[0] == '.' {
+ funcName = funcName[1:]
+ }
+ if dropRx.MatchString(funcName) {
+ if keepRx == nil || !keepRx.MatchString(funcName) {
+ break
+ }
+ }
+ }
+ }
+
+ if i >= 0 {
+ // Found matching entry to prune.
+ pruneBeneath[loc.ID] = true
+
+ // Remove the matching location.
+ if i == len(loc.Line)-1 {
+ // Matched the top entry: prune the whole location.
+ prune[loc.ID] = true
+ } else {
+ loc.Line = loc.Line[i+1:]
+ }
+ }
+ }
+
+ // Prune locs from each Sample
+ for _, sample := range p.Sample {
+ // Scan from the root to the leaves to find the prune location.
+ // Do not prune frames before the first user frame, to avoid
+ // pruning everything.
+ foundUser := false
+ for i := len(sample.Location) - 1; i >= 0; i-- {
+ id := sample.Location[i].ID
+ if !prune[id] && !pruneBeneath[id] {
+ foundUser = true
+ continue
+ }
+ if !foundUser {
+ continue
+ }
+ if prune[id] {
+ sample.Location = sample.Location[i+1:]
+ break
+ }
+ if pruneBeneath[id] {
+ sample.Location = sample.Location[i:]
+ break
+ }
+ }
+ }
+}
+
+// RemoveUninteresting prunes and elides profiles using built-in
+// tables of uninteresting function names.
+func (p *Profile) RemoveUninteresting() error {
+ var keep, drop *regexp.Regexp
+ var err error
+
+ if p.DropFrames != "" {
+ if drop, err = regexp.Compile("^(" + p.DropFrames + ")$"); err != nil {
+ return fmt.Errorf("failed to compile regexp %s: %v", p.DropFrames, err)
+ }
+ if p.KeepFrames != "" {
+ if keep, err = regexp.Compile("^(" + p.KeepFrames + ")$"); err != nil {
+ return fmt.Errorf("failed to compile regexp %s: %v", p.KeepFrames, err)
+ }
+ }
+ p.Prune(drop, keep)
+ }
+ return nil
+}
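+
+// A small sketch of how the drop/keep pair is typically used (the regexps
+// here are chosen only for illustration; the real tables, such as
+// allocRxStr, cpuProfilerRxStr, and lockRxStr, are installed by
+// addLegacyFrameInfo):
+//
+//	p.DropFrames = `runtime\..*`
+//	p.KeepFrames = `runtime\.panic`
+//	if err := p.RemoveUninteresting(); err != nil {
+//		// one of the regexps failed to compile
+//	}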
diff --git a/src/runtime/pprof/internal/protopprof/protomemprofile.go b/src/runtime/pprof/internal/protopprof/protomemprofile.go
deleted file mode 100644
index c2ab5b5..0000000
--- a/src/runtime/pprof/internal/protopprof/protomemprofile.go
+++ /dev/null
@@ -1,83 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "internal/pprof/profile"
- "math"
- "runtime"
- "time"
-)
-
-// EncodeMemProfile converts MemProfileRecords to a Profile.
-func EncodeMemProfile(mr []runtime.MemProfileRecord, rate int64, t time.Time) *profile.Profile {
- p := &profile.Profile{
- Period: rate,
- PeriodType: &profile.ValueType{Type: "space", Unit: "bytes"},
- SampleType: []*profile.ValueType{
- {Type: "alloc_objects", Unit: "count"},
- {Type: "alloc_space", Unit: "bytes"},
- {Type: "inuse_objects", Unit: "count"},
- {Type: "inuse_space", Unit: "bytes"},
- },
- TimeNanos: int64(t.UnixNano()),
- }
-
- locs := make(map[uintptr]*profile.Location)
- for _, r := range mr {
- stack := r.Stack()
- sloc := make([]*profile.Location, len(stack))
- for i, addr := range stack {
- loc := locs[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(p.Location) + 1),
- Address: uint64(addr),
- }
- locs[addr] = loc
- p.Location = append(p.Location, loc)
- }
- sloc[i] = loc
- }
-
- ao, ab := scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
- uo, ub := scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
-
- p.Sample = append(p.Sample, &profile.Sample{
- Value: []int64{ao, ab, uo, ub},
- Location: sloc,
- })
- }
- if runtime.GOOS == "linux" {
- addMappings(p)
- }
- return p
-}
-
-// scaleHeapSample adjusts the data from a heap Sample to
-// account for its probability of appearing in the collected
-// data. heap profiles are a sampling of the memory allocations
-// requests in a program. We estimate the unsampled value by dividing
-// each collected sample by its probability of appearing in the
-// profile. heap profiles rely on a poisson process to determine
-// which samples to collect, based on the desired average collection
-// rate R. The probability of a sample of size S to appear in that
-// profile is 1-exp(-S/R).
-func scaleHeapSample(count, size, rate int64) (int64, int64) {
- if count == 0 || size == 0 {
- return 0, 0
- }
-
- if rate <= 1 {
- // if rate==1 all samples were collected so no adjustment is needed.
- // if rate<1 treat as unknown and skip scaling.
- return count, size
- }
-
- avgSize := float64(size) / float64(count)
- scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
-
- return int64(float64(count) * scale), int64(float64(size) * scale)
-}
diff --git a/src/runtime/pprof/internal/protopprof/protomemprofile_test.go b/src/runtime/pprof/internal/protopprof/protomemprofile_test.go
deleted file mode 100644
index a10fe77..0000000
--- a/src/runtime/pprof/internal/protopprof/protomemprofile_test.go
+++ /dev/null
@@ -1,104 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "bytes"
- "internal/pprof/profile"
- "io/ioutil"
- "reflect"
- "runtime"
- "testing"
- "time"
-)
-
-// TestSampledHeapAllocProfile tests encoding of a memory profile from
-// runtime.MemProfileRecord data.
-func TestSampledHeapAllocProfile(t *testing.T) {
- if runtime.GOOS != "linux" {
- t.Skip("Test requires a system with /proc/self/maps")
- }
-
- // Figure out two addresses from /proc/self/maps.
- mmap, err := ioutil.ReadFile("/proc/self/maps")
- if err != nil {
- t.Fatal("Cannot read /proc/self/maps")
- }
- rd := bytes.NewReader(mmap)
- mprof := &profile.Profile{}
- if err = mprof.ParseMemoryMap(rd); err != nil {
- t.Fatalf("Cannot parse /proc/self/maps")
- }
- if len(mprof.Mapping) < 2 {
- // It is possible for a binary to only have 1 executable
- // region of memory.
- t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
- }
- address1 := mprof.Mapping[0].Start
- address2 := mprof.Mapping[1].Start
-
- var buf bytes.Buffer
-
- rec, rate := testMemRecords(address1, address2)
- p := EncodeMemProfile(rec, rate, time.Now())
- if err := p.Write(&buf); err != nil {
- t.Fatalf("Failed to write profile: %v", err)
- }
-
- p, err = profile.Parse(&buf)
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
-
- // Expected PeriodType, SampleType and Sample.
- expectedPeriodType := &profile.ValueType{Type: "space", Unit: "bytes"}
- expectedSampleType := []*profile.ValueType{
- {Type: "alloc_objects", Unit: "count"},
- {Type: "alloc_space", Unit: "bytes"},
- {Type: "inuse_objects", Unit: "count"},
- {Type: "inuse_space", Unit: "bytes"},
- }
- // Expected samples, with values unsampled according to the profiling rate.
- expectedSample := []*profile.Sample{
- {Value: []int64{2050, 2099200, 1537, 1574400}, Location: []*profile.Location{
- {ID: 1, Mapping: mprof.Mapping[0], Address: address1},
- {ID: 2, Mapping: mprof.Mapping[1], Address: address2},
- }},
- {Value: []int64{1, 829411, 1, 829411}, Location: []*profile.Location{
- {ID: 3, Mapping: mprof.Mapping[1], Address: address2 + 1},
- {ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 2},
- }},
- {Value: []int64{1, 829411, 0, 0}, Location: []*profile.Location{
- {ID: 5, Mapping: mprof.Mapping[0], Address: address1 + 1},
- {ID: 6, Mapping: mprof.Mapping[0], Address: address1 + 2},
- {ID: 7, Mapping: mprof.Mapping[1], Address: address2 + 3},
- }},
- }
-
- if p.Period != 512*1024 {
- t.Fatalf("Sampling periods do not match")
- }
- if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
- t.Fatalf("Period types do not match")
- }
- if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
- t.Fatalf("Sample types do not match")
- }
- if !reflect.DeepEqual(p.Sample, expectedSample) {
- t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
- getSampleAsString(p.Sample))
- }
-}
-
-func testMemRecords(a1, a2 uint64) ([]runtime.MemProfileRecord, int64) {
- addr1, addr2 := uintptr(a1), uintptr(a2)
- rate := int64(512 * 1024)
- rec := []runtime.MemProfileRecord{
- {4096, 1024, 4, 1, [32]uintptr{addr1, addr2}},
- {512 * 1024, 0, 1, 0, [32]uintptr{addr2 + 1, addr2 + 2}},
- {512 * 1024, 512 * 1024, 1, 1, [32]uintptr{addr1 + 1, addr1 + 2, addr2 + 3}},
- }
- return rec, rate
-}
diff --git a/src/runtime/pprof/internal/protopprof/protopprof.go b/src/runtime/pprof/internal/protopprof/protopprof.go
deleted file mode 100644
index 5d269c4..0000000
--- a/src/runtime/pprof/internal/protopprof/protopprof.go
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package protopprof converts the runtime's raw profile logs
-// to Profile structs containing a representation of the pprof
-// protocol buffer profile format.
-package protopprof
-
-import (
- "fmt"
- "os"
- "runtime"
- "time"
- "unsafe"
-
- "internal/pprof/profile"
-)
-
-// TranslateCPUProfile parses binary CPU profiling stack trace data
-// generated by runtime.CPUProfile() into a profile struct.
-func TranslateCPUProfile(b []byte, startTime time.Time) (*profile.Profile, error) {
- const wordSize = unsafe.Sizeof(uintptr(0))
- const minRawProfile = 5 * wordSize // Need a minimum of 5 words.
- if uintptr(len(b)) < minRawProfile {
- return nil, fmt.Errorf("truncated profile")
- }
- n := int(uintptr(len(b)) / wordSize)
- data := ((*[1 << 28]uintptr)(unsafe.Pointer(&b[0])))[:n:n]
- period := data[3]
- data = data[5:] // skip header
-
- // profile initialization taken from pprof tool
- p := &profile.Profile{
- Period: int64(period) * 1000,
- PeriodType: &profile.ValueType{Type: "cpu", Unit: "nanoseconds"},
- SampleType: []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- },
- TimeNanos: int64(startTime.UnixNano()),
- DurationNanos: time.Since(startTime).Nanoseconds(),
- }
- // Parse CPU samples from the profile.
- locs := make(map[uint64]*profile.Location)
- for len(b) > 0 {
- if len(data) < 2 || uintptr(len(data)) < 2+data[1] {
- return nil, fmt.Errorf("truncated profile")
- }
- count := data[0]
- nstk := data[1]
- if uintptr(len(data)) < 2+nstk {
- return nil, fmt.Errorf("truncated profile")
- }
- stk := data[2 : 2+nstk]
- data = data[2+nstk:]
-
- if count == 0 && nstk == 1 && stk[0] == 0 {
- // end of data marker
- break
- }
-
- sloc := make([]*profile.Location, len(stk))
- for i, addr := range stk {
- addr := uint64(addr)
- // Addresses from stack traces point to the next instruction after
- // each call. Adjust by -1 to land somewhere on the actual call
- // (except for the leaf, which is not a call).
- if i > 0 {
- addr--
- }
- loc := locs[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(p.Location) + 1),
- Address: addr,
- }
- locs[addr] = loc
- p.Location = append(p.Location, loc)
- }
- sloc[i] = loc
- }
- p.Sample = append(p.Sample, &profile.Sample{
- Value: []int64{int64(count), int64(count) * int64(p.Period)},
- Location: sloc,
- })
- }
-
- if runtime.GOOS == "linux" {
- if err := addMappings(p); err != nil {
- return nil, err
- }
- }
- return p, nil
-}
-
-func addMappings(p *profile.Profile) error {
- // Parse memory map from /proc/self/maps
- f, err := os.Open("/proc/self/maps")
- if err != nil {
- return err
- }
- defer f.Close()
- return p.ParseMemoryMap(f)
-}
diff --git a/src/runtime/pprof/internal/protopprof/protopprof_test.go b/src/runtime/pprof/internal/protopprof/protopprof_test.go
deleted file mode 100644
index f1937b5..0000000
--- a/src/runtime/pprof/internal/protopprof/protopprof_test.go
+++ /dev/null
@@ -1,171 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package protopprof
-
-import (
- "bytes"
- "fmt"
- "internal/pprof/profile"
- "io/ioutil"
- "reflect"
- "runtime"
- "testing"
- "time"
- "unsafe"
-)
-
-// Helper function to initialize empty cpu profile with sampling period provided.
-func createEmptyProfileWithPeriod(t *testing.T, periodMs uint64) bytes.Buffer {
- // Mock the sample header produced by cpu profiler. Write a sample
- // period of 2000 microseconds, followed by no samples.
- buf := new(bytes.Buffer)
- // Profile header is as follows:
- // The first, third and fifth words are 0. The second word is 3.
- // The fourth word is the period.
- // EOD marker:
- // The sixth word -- count is initialized to 0 above.
- // The code below sets the seventh word -- nstk to 1
- // The eighth word -- addr is initialized to 0 above.
- words := []int{0, 3, 0, int(periodMs), 0, 0, 1, 0}
- n := int(unsafe.Sizeof(0)) * len(words)
- data := ((*[1 << 29]byte)(unsafe.Pointer(&words[0])))[:n:n]
- if _, err := buf.Write(data); err != nil {
- t.Fatalf("createEmptyProfileWithPeriod failed: %v", err)
- }
- return *buf
-}
-
-// Helper function to initialize cpu profile with two sample values.
-func createProfileWithTwoSamples(t *testing.T, periodMs uintptr, count1 uintptr, count2 uintptr,
- address1 uintptr, address2 uintptr) bytes.Buffer {
- // Mock the sample header produced by cpu profiler. Write a sample
- // period of 2000 microseconds, followed by no samples.
- buf := new(bytes.Buffer)
- words := []uintptr{0, 3, 0, uintptr(periodMs), 0, uintptr(count1), 2,
- uintptr(address1), uintptr(address1 + 2),
- uintptr(count2), 2, uintptr(address2), uintptr(address2 + 2),
- 0, 1, 0}
- for _, n := range words {
- var err error
- switch unsafe.Sizeof(int(0)) {
- case 8:
- _, err = buf.Write((*[8]byte)(unsafe.Pointer(&n))[:8:8])
- case 4:
- _, err = buf.Write((*[4]byte)(unsafe.Pointer(&n))[:4:4])
- }
- if err != nil {
- t.Fatalf("createProfileWithTwoSamples failed: %v", err)
- }
- }
- return *buf
-}
-
-// Tests TranslateCPUProfile parses correct sampling period in an otherwise empty cpu profile.
-func TestTranlateCPUProfileSamplingPeriod(t *testing.T) {
- // A test server with mock cpu profile data.
- var buf bytes.Buffer
-
- startTime := time.Now()
- b := createEmptyProfileWithPeriod(t, 2000)
- p, err := TranslateCPUProfile(b.Bytes(), startTime)
- if err != nil {
- t.Fatalf("translate failed: %v", err)
- }
- if err := p.Write(&buf); err != nil {
- t.Fatalf("write failed: %v", err)
- }
-
- p, err = profile.Parse(&buf)
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
-
- // Expected PeriodType and SampleType.
- expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
- expectedSampleType := []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- }
- if p.Period != 2000*1000 || !reflect.DeepEqual(p.PeriodType, expectedPeriodType) ||
- !reflect.DeepEqual(p.SampleType, expectedSampleType) || p.Sample != nil {
- t.Fatalf("Unexpected Profile fields")
- }
-}
-
-func getSampleAsString(sample []*profile.Sample) string {
- var str string
- for _, x := range sample {
- for _, y := range x.Location {
- if y.Mapping != nil {
- str += fmt.Sprintf("Mapping:%v\n", *y.Mapping)
- }
- str += fmt.Sprintf("Location:%v\n", y)
- }
- str += fmt.Sprintf("Sample:%v\n", *x)
- }
- return str
-}
-
-// Tests TranslateCPUProfile parses a cpu profile with sample values present.
-func TestTranslateCPUProfileWithSamples(t *testing.T) {
- if runtime.GOOS != "linux" {
- t.Skip("test requires a system with /proc/self/maps")
- }
- // Figure out two addresses from /proc/self/maps.
- mmap, err := ioutil.ReadFile("/proc/self/maps")
- if err != nil {
- t.Fatal("Cannot read /proc/self/maps")
- }
- rd := bytes.NewReader(mmap)
- mprof := &profile.Profile{}
- if err = mprof.ParseMemoryMap(rd); err != nil {
- t.Fatalf("Cannot parse /proc/self/maps")
- }
- if len(mprof.Mapping) < 2 {
- // It is possible for a binary to only have 1 executable
- // region of memory.
- t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
- }
- address1 := mprof.Mapping[0].Start
- address2 := mprof.Mapping[1].Start
- // A test server with mock cpu profile data.
-
- startTime := time.Now()
- b := createProfileWithTwoSamples(t, 2000, 20, 40, uintptr(address1), uintptr(address2))
- p, err := TranslateCPUProfile(b.Bytes(), startTime)
-
- if err != nil {
- t.Fatalf("Could not parse Profile profile: %v", err)
- }
- // Expected PeriodType, SampleType and Sample.
- expectedPeriodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
- expectedSampleType := []*profile.ValueType{
- {Type: "samples", Unit: "count"},
- {Type: "cpu", Unit: "nanoseconds"},
- }
- expectedSample := []*profile.Sample{
- {Value: []int64{20, 20 * 2000 * 1000}, Location: []*profile.Location{
- {ID: 1, Mapping: mprof.Mapping[0], Address: address1},
- {ID: 2, Mapping: mprof.Mapping[0], Address: address1 + 1},
- }},
- {Value: []int64{40, 40 * 2000 * 1000}, Location: []*profile.Location{
- {ID: 3, Mapping: mprof.Mapping[1], Address: address2},
- {ID: 4, Mapping: mprof.Mapping[1], Address: address2 + 1},
- }},
- }
- if p.Period != 2000*1000 {
- t.Fatalf("Sampling periods do not match")
- }
- if !reflect.DeepEqual(p.PeriodType, expectedPeriodType) {
- t.Fatalf("Period types do not match")
- }
- if !reflect.DeepEqual(p.SampleType, expectedSampleType) {
- t.Fatalf("Sample types do not match")
- }
- if !reflect.DeepEqual(p.Sample, expectedSample) {
- t.Fatalf("Samples do not match: Expected: %v, Got:%v", getSampleAsString(expectedSample),
- getSampleAsString(p.Sample))
- }
-}
diff --git a/src/runtime/pprof/label.go b/src/runtime/pprof/label.go
new file mode 100644
index 0000000..35647ee
--- /dev/null
+++ b/src/runtime/pprof/label.go
@@ -0,0 +1,85 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+)
+
+type label struct {
+ key string
+ value string
+}
+
+// LabelSet is a set of labels.
+type LabelSet struct {
+ list []label
+}
+
+// labelContextKey is the type of contextKeys used for profiler labels.
+type labelContextKey struct{}
+
+func labelValue(ctx context.Context) labelMap {
+ labels, _ := ctx.Value(labelContextKey{}).(*labelMap)
+ if labels == nil {
+ return labelMap(nil)
+ }
+ return *labels
+}
+
+// labelMap is the representation of the label set held in the context type.
+// This is an initial implementation, but it will be replaced with something
+// that admits incremental immutable modification more efficiently.
+type labelMap map[string]string
+
+// WithLabels returns a new context.Context with the given labels added.
+// A label overwrites a prior label with the same key.
+func WithLabels(ctx context.Context, labels LabelSet) context.Context {
+ childLabels := make(labelMap)
+ parentLabels := labelValue(ctx)
+ // TODO(matloob): replace the map implementation with something
+ // more efficient so creating a child context WithLabels doesn't need
+ // to clone the map.
+ for k, v := range parentLabels {
+ childLabels[k] = v
+ }
+ for _, label := range labels.list {
+ childLabels[label.key] = label.value
+ }
+ return context.WithValue(ctx, labelContextKey{}, &childLabels)
+}
+
+// Labels takes an even number of strings representing key-value pairs
+// and makes a LabelSet containing them.
+// A label overwrites a prior label with the same key.
+func Labels(args ...string) LabelSet {
+ if len(args)%2 != 0 {
+ panic("uneven number of arguments to pprof.Labels")
+ }
+ labels := LabelSet{}
+ for i := 0; i+1 < len(args); i += 2 {
+ labels.list = append(labels.list, label{key: args[i], value: args[i+1]})
+ }
+ return labels
+}
+
+// Label returns the value of the label with the given key on ctx, and a boolean indicating
+// whether that label exists.
+func Label(ctx context.Context, key string) (string, bool) {
+ ctxLabels := labelValue(ctx)
+ v, ok := ctxLabels[key]
+ return v, ok
+}
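+
+// A rough usage sketch (keys and values are illustrative only; callers
+// outside this package qualify the names with pprof.):
+//
+//	ctx := WithLabels(context.Background(), Labels("worker", "17", "request", "index"))
+//	fmt.Println(Label(ctx, "worker")) // prints: 17 true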
+
+// ForLabels invokes f with each label set on the context.
+// The function f should return true to continue iteration or false to stop iteration early.
+func ForLabels(ctx context.Context, f func(key, value string) bool) {
+ ctxLabels := labelValue(ctx)
+ for k, v := range ctxLabels {
+ if !f(k, v) {
+ break
+ }
+ }
+}
diff --git a/src/runtime/pprof/label_test.go b/src/runtime/pprof/label_test.go
new file mode 100644
index 0000000..240445f
--- /dev/null
+++ b/src/runtime/pprof/label_test.go
@@ -0,0 +1,82 @@
+package pprof
+
+import (
+ "context"
+ "reflect"
+ "sort"
+ "testing"
+)
+
+func labelsSorted(ctx context.Context) []label {
+ ls := []label{}
+ ForLabels(ctx, func(key, value string) bool {
+ ls = append(ls, label{key, value})
+ return true
+ })
+ sort.Sort(labelSorter(ls))
+ return ls
+}
+
+type labelSorter []label
+
+func (s labelSorter) Len() int { return len(s) }
+func (s labelSorter) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
+func (s labelSorter) Less(i, j int) bool { return s[i].key < s[j].key }
+
+func TestContextLabels(t *testing.T) {
+	// Background context starts with no labels.
+ ctx := context.Background()
+ labels := labelsSorted(ctx)
+ if len(labels) != 0 {
+ t.Errorf("labels on background context: want [], got %v ", labels)
+ }
+
+ // Add a single label.
+ ctx = WithLabels(ctx, Labels("key", "value"))
+ // Retrieve it with Label.
+ v, ok := Label(ctx, "key")
+ if !ok || v != "value" {
+ t.Errorf(`Label(ctx, "key"): got %v, %v; want "value", ok`, v, ok)
+ }
+ gotLabels := labelsSorted(ctx)
+ wantLabels := []label{{"key", "value"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Add a label with a different key.
+ ctx = WithLabels(ctx, Labels("key2", "value2"))
+ v, ok = Label(ctx, "key2")
+ if !ok || v != "value2" {
+ t.Errorf(`Label(ctx, "key2"): got %v, %v; want "value2", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value"}, {"key2", "value2"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Add label with first key to test label replacement.
+ ctx = WithLabels(ctx, Labels("key", "value3"))
+ v, ok = Label(ctx, "key")
+ if !ok || v != "value3" {
+		t.Errorf(`Label(ctx, "key"): got %v, %v; want "value3", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value3"}, {"key2", "value2"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ // Labels called with two labels with the same key should pick the second.
+ ctx = WithLabels(ctx, Labels("key4", "value4a", "key4", "value4b"))
+ v, ok = Label(ctx, "key4")
+ if !ok || v != "value4b" {
+ t.Errorf(`Label(ctx, "key4"): got %v, %v; want "value4b", ok`, v, ok)
+ }
+ gotLabels = labelsSorted(ctx)
+ wantLabels = []label{{"key", "value3"}, {"key2", "value2"}, {"key4", "value4b"}}
+ if !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("(sorted) labels on context: got %v, want %v", gotLabels, wantLabels)
+ }
+}
diff --git a/src/runtime/pprof/map.go b/src/runtime/pprof/map.go
new file mode 100644
index 0000000..a271ad0
--- /dev/null
+++ b/src/runtime/pprof/map.go
@@ -0,0 +1,89 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import "unsafe"
+
+// A profMap is a map from (stack, tag) to mapEntry.
+// It grows without bound, but that's assumed to be OK.
+type profMap struct {
+ hash map[uintptr]*profMapEntry
+ all *profMapEntry
+ last *profMapEntry
+ free []profMapEntry
+ freeStk []uintptr
+}
+
+// A profMapEntry is a single entry in the profMap.
+type profMapEntry struct {
+ nextHash *profMapEntry // next in hash list
+ nextAll *profMapEntry // next in list of all entries
+ stk []uintptr
+ tag unsafe.Pointer
+ count int64
+}
+
+func (m *profMap) lookup(stk []uint64, tag unsafe.Pointer) *profMapEntry {
+ // Compute hash of (stk, tag).
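+	// (A cheap rotate-left-by-8, multiply-by-41 mix; it only needs to
+	// spread entries across buckets, not be cryptographically strong.)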
+ h := uintptr(0)
+ for _, x := range stk {
+ h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
+ h += uintptr(x) * 41
+ }
+ h = h<<8 | (h >> (8 * (unsafe.Sizeof(h) - 1)))
+ h += uintptr(tag) * 41
+
+ // Find entry if present.
+ var last *profMapEntry
+Search:
+ for e := m.hash[h]; e != nil; last, e = e, e.nextHash {
+ if len(e.stk) != len(stk) || e.tag != tag {
+ continue
+ }
+ for j := range stk {
+ if e.stk[j] != uintptr(stk[j]) {
+ continue Search
+ }
+ }
+ // Move to front.
+ if last != nil {
+ last.nextHash = e.nextHash
+ e.nextHash = m.hash[h]
+ m.hash[h] = e
+ }
+ return e
+ }
+
+ // Add new entry.
+ if len(m.free) < 1 {
+ m.free = make([]profMapEntry, 128)
+ }
+ e := &m.free[0]
+ m.free = m.free[1:]
+ e.nextHash = m.hash[h]
+ e.tag = tag
+
+ if len(m.freeStk) < len(stk) {
+ m.freeStk = make([]uintptr, 1024)
+ }
+ e.stk = m.freeStk[:len(stk)]
+ m.freeStk = m.freeStk[len(stk):]
+
+ for j := range stk {
+ e.stk[j] = uintptr(stk[j])
+ }
+ if m.hash == nil {
+ m.hash = make(map[uintptr]*profMapEntry)
+ }
+ m.hash[h] = e
+ if m.all == nil {
+ m.all = e
+ m.last = e
+ } else {
+ m.last.nextAll = e
+ m.last = e
+ }
+ return e
+}
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index df4f6f8..4c14527 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-package pprof_test
+package pprof
import (
"bytes"
@@ -10,7 +10,6 @@
"reflect"
"regexp"
"runtime"
- . "runtime/pprof"
"testing"
"unsafe"
)
@@ -86,22 +85,22 @@
tests := []string{
fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:41
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:75
+# 0x[0-9,a-f]+ runtime/pprof\.allocatePersistent1K\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:40
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test\.go:74
`, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.allocateTransient1M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:22
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:73
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient1M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:21
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:72
`, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:28
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:74
+# 0x[0-9,a-f]+ runtime/pprof\.allocateTransient2M\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:27
+# 0x[0-9,a-f]+ runtime/pprof\.TestMemoryProfiler\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:73
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
fmt.Sprintf(`0: 0 \[%v: %v\] @( 0x[0-9,a-f]+)+
-# 0x[0-9,a-f]+ runtime/pprof_test\.allocateReflectTransient\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:49
+# 0x[0-9,a-f]+ runtime/pprof\.allocateReflectTransient\+0x[0-9,a-f]+ .*/runtime/pprof/mprof_test.go:48
`, memoryProfilerRun, (2<<20)*memoryProfilerRun),
}
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index 871fba0..21ea25c 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -33,7 +33,9 @@
// }
// defer pprof.StopCPUProfile()
// }
-// ...
+//
+// // ... rest of the program ...
+//
// if *memprofile != "" {
// f, err := os.Create(*memprofile)
// if err != nil {
@@ -73,15 +75,14 @@
"bufio"
"bytes"
"fmt"
- "internal/pprof/profile"
"io"
"runtime"
- "runtime/pprof/internal/protopprof"
"sort"
"strings"
"sync"
"text/tabwriter"
"time"
+ "unsafe"
)
// BUG(rsc): Profiles are only as good as the kernel support used to generate them.
@@ -183,6 +184,8 @@
// If a profile with that name already exists, NewProfile panics.
// The convention is to use an 'import/path.' prefix to create
// separate name spaces for each package.
+// For compatibility with various tools that read pprof data,
+// profile names should not contain spaces.
func NewProfile(name string) *Profile {
lockProfiles()
defer unlockProfiles()
@@ -264,13 +267,18 @@
stk := make([]uintptr, 32)
n := runtime.Callers(skip+1, stk[:])
+ stk = stk[:n]
+ if len(stk) == 0 {
+ // The value for skip is too large, and there's no stack trace to record.
+ stk = []uintptr{funcPC(lostProfileEvent)}
+ }
p.mu.Lock()
defer p.mu.Unlock()
if p.m[value] != nil {
panic("pprof: Profile.Add of duplicate value")
}
- p.m[value] = stk[:n]
+ p.m[value] = stk
}
// Remove removes the execution stack associated with value from the profile.
@@ -303,8 +311,8 @@
}
// Obtain consistent snapshot under lock; then process without lock.
- all := make([][]uintptr, 0, len(p.m))
p.mu.Lock()
+ all := make([][]uintptr, 0, len(p.m))
for _, stk := range p.m {
all = append(all, stk)
}
@@ -380,35 +388,29 @@
}
// Output profile in protobuf form.
- prof := &profile.Profile{
- PeriodType: &profile.ValueType{Type: name, Unit: "count"},
- Period: 1,
- Sample: make([]*profile.Sample, 0, len(keys)),
- SampleType: []*profile.ValueType{{Type: name, Unit: "count"}},
- }
- locMap := make(map[uintptr]*profile.Location)
+ b := newProfileBuilder(w)
+ b.pbValueType(tagProfile_PeriodType, name, "count")
+ b.pb.int64Opt(tagProfile_Period, 1)
+ b.pbValueType(tagProfile_SampleType, name, "count")
+
+ values := []int64{0}
+ var locs []uint64
for _, k := range keys {
- stk := p.Stack(index[k])
- c := count[k]
- locs := make([]*profile.Location, len(stk))
- for i, addr := range stk {
- loc := locMap[addr]
- if loc == nil {
- loc = &profile.Location{
- ID: uint64(len(locMap) + 1),
- Address: uint64(addr - 1),
- }
- prof.Location = append(prof.Location, loc)
- locMap[addr] = loc
+ values[0] = int64(count[k])
+ locs = locs[:0]
+ for _, addr := range p.Stack(index[k]) {
+ // For count profiles, all stack addresses are
+ // return PCs, which is what locForPC expects.
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
}
- locs[i] = loc
+ locs = append(locs, l)
}
- prof.Sample = append(prof.Sample, &profile.Sample{
- Location: locs,
- Value: []int64{int64(c)},
- })
+ b.pbSample(values, locs, nil)
}
- return prof.Write(w)
+ b.build()
+ return nil
}
// keysByCount sorts keys with higher counts first, breaking ties by key string order.
@@ -496,8 +498,7 @@
}
if debug == 0 {
- pp := protopprof.EncodeMemProfile(p, int64(runtime.MemProfileRate), time.Now())
- return pp.Write(w)
+ return writeHeapProto(w, p, int64(runtime.MemProfileRate))
}
sort.Slice(p, func(i, j int) bool { return p[i].InUseBytes() > p[j].InUseBytes() })
@@ -562,8 +563,12 @@
fmt.Fprintf(w, "# OtherSys = %d\n", s.OtherSys)
fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
+ fmt.Fprintf(w, "# LastGC = %d\n", s.LastGC)
fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
+ fmt.Fprintf(w, "# PauseEnd = %d\n", s.PauseEnd)
fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
+ fmt.Fprintf(w, "# NumForcedGC = %d\n", s.NumForcedGC)
+ fmt.Fprintf(w, "# GCCPUFraction = %v\n", s.GCCPUFraction)
fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
tw.Flush()
@@ -689,30 +694,32 @@
return nil
}
+// readProfile, provided by the runtime, returns the next chunk of
+// binary CPU profiling stack trace data, blocking until data is available.
+// If profiling is turned off and all the profile data accumulated while it was
+// on has been returned, readProfile returns eof=true.
+// The caller must save the returned data and tags before calling readProfile again.
+func readProfile() (data []uint64, tags []unsafe.Pointer, eof bool)
+
func profileWriter(w io.Writer) {
- startTime := time.Now()
- // This will buffer the entire profile into buf and then
- // translate it into a profile.Profile structure. This will
- // create two copies of all the data in the profile in memory.
- // TODO(matloob): Convert each chunk of the proto output and
- // stream it out instead of converting the entire profile.
- var buf bytes.Buffer
+ b := newProfileBuilder(w)
+ var err error
for {
- data := runtime.CPUProfile()
- if data == nil {
+ time.Sleep(100 * time.Millisecond)
+ data, tags, eof := readProfile()
+ if e := b.addCPUData(data, tags); e != nil && err == nil {
+ err = e
+ }
+ if eof {
break
}
- buf.Write(data)
}
-
- profile, err := protopprof.TranslateCPUProfile(buf.Bytes(), startTime)
if err != nil {
// The runtime should never produce an invalid or truncated profile.
// It drops records that can't fit into its log buffers.
- panic(fmt.Errorf("could not translate binary profile to proto format: %v", err))
+ panic("runtime/pprof: converting profile: " + err.Error())
}
-
- profile.Write(w)
+ b.build()
cpu.done <- true
}
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 8372283..22fea0a 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -4,22 +4,20 @@
// +build !nacl
-package pprof_test
+package pprof
import (
"bytes"
- "compress/gzip"
+ "context"
"fmt"
- "internal/pprof/profile"
"internal/testenv"
"io"
- "io/ioutil"
"math/big"
"os"
"os/exec"
"regexp"
"runtime"
- . "runtime/pprof"
+ "runtime/pprof/internal/profile"
"strings"
"sync"
"testing"
@@ -71,14 +69,14 @@
}
func TestCPUProfile(t *testing.T) {
- testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
+ testCPUProfile(t, []string{"runtime/pprof.cpuHog1"}, func(dur time.Duration) {
cpuHogger(cpuHog1, dur)
})
}
func TestCPUProfileMultithreaded(t *testing.T) {
defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
- testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
+ testCPUProfile(t, []string{"runtime/pprof.cpuHog1", "runtime/pprof.cpuHog2"}, func(dur time.Duration) {
c := make(chan int)
go func() {
cpuHogger(cpuHog1, dur)
@@ -89,18 +87,41 @@
})
}
-func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []uintptr)) {
+func TestCPUProfileInlining(t *testing.T) {
+ testCPUProfile(t, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, func(dur time.Duration) {
+ cpuHogger(inlinedCaller, dur)
+ })
+}
+
+func inlinedCaller() {
+ inlinedCallee()
+}
+
+func inlinedCallee() {
+ // We could just use cpuHog1, but for loops prevent inlining
+ // right now. :(
+ foo := salt1
+ i := 0
+loop:
+ if foo > 0 {
+ foo *= foo
+ } else {
+ foo *= foo + 1
+ }
+ if i++; i < 1e5 {
+ goto loop
+ }
+ salt1 = foo
+}
+
+func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) {
p, err := profile.Parse(bytes.NewReader(valBytes))
if err != nil {
t.Fatal(err)
}
for _, sample := range p.Sample {
count := uintptr(sample.Value[0])
- stk := make([]uintptr, len(sample.Location))
- for i := range sample.Location {
- stk[i] = uintptr(sample.Location[i].Address)
- }
- f(count, stk)
+ f(count, sample.Location, sample.Label)
}
}
@@ -124,8 +145,7 @@
const maxDuration = 5 * time.Second
// If we're running a long test, start with a long duration
- // because some of the tests (e.g., TestStackBarrierProfiling)
- // are trying to make sure something *doesn't* happen.
+ // for tests that try to make sure something *doesn't* happen.
duration := 5 * time.Second
if testing.Short() {
duration = 200 * time.Millisecond
@@ -169,31 +189,45 @@
t.FailNow()
}
+func contains(slice []string, s string) bool {
+ for i := range slice {
+ if slice[i] == s {
+ return true
+ }
+ }
+ return false
+}
+
func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
ok = true
// Check that profile is well formed and contains need.
have := make([]uintptr, len(need))
var samples uintptr
- parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) {
+ var buf bytes.Buffer
+ parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
+ fmt.Fprintf(&buf, "%d:", count)
+ fprintStack(&buf, stk)
samples += count
- for _, pc := range stk {
- f := runtime.FuncForPC(pc)
- if f == nil {
- continue
+ for i, name := range need {
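+ // Entries in need may have the form "function;key=value", in
+ // which case the sample must also carry that pprof label to
+ // be counted toward the match.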
+ if semi := strings.Index(name, ";"); semi > -1 {
+ kv := strings.SplitN(name[semi+1:], "=", 2)
+ if len(kv) != 2 || !contains(labels[kv[0]], kv[1]) {
+ continue
+ }
+ name = name[:semi]
}
- for i, name := range need {
- if strings.Contains(f.Name(), name) {
- have[i] += count
+ for _, loc := range stk {
+ for _, line := range loc.Line {
+ if strings.Contains(line.Function.Name, name) {
+ have[i] += count
+ }
}
}
- if strings.Contains(f.Name(), "stackBarrier") {
- // The runtime should have unwound this.
- t.Fatalf("profile includes stackBarrier")
- }
}
+ fmt.Fprintf(&buf, "\n")
})
- t.Logf("total %d CPU profile samples collected", samples)
+ t.Logf("total %d CPU profile samples collected:\n%s", samples, buf.String())
if samples < 10 && runtime.GOOS == "windows" {
// On some windows machines we end up with
@@ -300,36 +334,43 @@
// Read profile to look for entries for runtime.gogo with an attempt at a traceback.
// The special entry
- parseProfile(t, prof.Bytes(), func(count uintptr, stk []uintptr) {
+ parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, _ map[string][]string) {
// An entry with two frames with 'System' in its top frame
// exists to record a PC without a traceback. Those are okay.
if len(stk) == 2 {
- f := runtime.FuncForPC(stk[1])
- if f != nil && (f.Name() == "runtime._System" || f.Name() == "runtime._ExternalCode" || f.Name() == "runtime._GC") {
+ name := stk[1].Line[0].Function.Name
+ if name == "runtime._System" || name == "runtime._ExternalCode" || name == "runtime._GC" {
return
}
}
// Otherwise, should not see runtime.gogo.
// The place we'd see it would be the inner most frame.
- f := runtime.FuncForPC(stk[0])
- if f != nil && f.Name() == "runtime.gogo" {
+ name := stk[0].Line[0].Function.Name
+ if name == "runtime.gogo" {
var buf bytes.Buffer
- for _, pc := range stk {
- f := runtime.FuncForPC(pc)
- if f == nil {
- fmt.Fprintf(&buf, "%#x ?:0\n", pc)
- } else {
- file, line := f.FileLine(pc)
- fmt.Fprintf(&buf, "%#x %s:%d\n", pc, file, line)
- }
- }
+ fprintStack(&buf, stk)
t.Fatalf("found profile entry for runtime.gogo:\n%s", buf.String())
}
})
}
}
+func fprintStack(w io.Writer, stk []*profile.Location) {
+ for _, loc := range stk {
+ fmt.Fprintf(w, " %#x", loc.Address)
+ fmt.Fprintf(w, " (")
+ for i, line := range loc.Line {
+ if i > 0 {
+ fmt.Fprintf(w, " ")
+ }
+ fmt.Fprintf(w, "%s:%d", line.Function.Name, line.Line)
+ }
+ fmt.Fprintf(w, ")")
+ }
+ fmt.Fprintf(w, "\n")
+}
+
// Test that profiling of division operations is okay, especially on ARM. See issue 6681.
func TestMathBigDivide(t *testing.T) {
testCPUProfile(t, nil, func(duration time.Duration) {
@@ -350,111 +391,6 @@
})
}
-func slurpString(r io.Reader) string {
- slurp, _ := ioutil.ReadAll(r)
- return string(slurp)
-}
-
-func getLinuxKernelConfig() string {
- if f, err := os.Open("/proc/config"); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- if f, err := os.Open("/proc/config.gz"); err == nil {
- defer f.Close()
- r, err := gzip.NewReader(f)
- if err != nil {
- return ""
- }
- return slurpString(r)
- }
- if f, err := os.Open("/boot/config"); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- uname, _ := exec.Command("uname", "-r").Output()
- if len(uname) > 0 {
- if f, err := os.Open("/boot/config-" + strings.TrimSpace(string(uname))); err == nil {
- defer f.Close()
- return slurpString(f)
- }
- }
- return ""
-}
-
-func haveLinuxHiresTimers() bool {
- config := getLinuxKernelConfig()
- return strings.Contains(config, "CONFIG_HIGH_RES_TIMERS=y")
-}
-
-func TestStackBarrierProfiling(t *testing.T) {
- if (runtime.GOOS == "linux" && runtime.GOARCH == "arm") ||
- runtime.GOOS == "openbsd" ||
- runtime.GOOS == "solaris" ||
- runtime.GOOS == "dragonfly" ||
- runtime.GOOS == "freebsd" {
- // This test currently triggers a large number of
- // usleep(100)s. These kernels/arches have poor
- // resolution timers, so this gives up a whole
- // scheduling quantum. On Linux and the BSDs (and
- // probably Solaris), profiling signals are only
- // generated when a process completes a whole
- // scheduling quantum, so this test often gets zero
- // profiling signals and fails.
- t.Skipf("low resolution timers inhibit profiling signals (golang.org/issue/13405)")
- return
- }
-
- if runtime.GOOS == "linux" && strings.HasPrefix(runtime.GOARCH, "mips") {
- if !haveLinuxHiresTimers() {
- t.Skipf("low resolution timers inhibit profiling signals (golang.org/issue/13405, golang.org/issue/17936)")
- }
- }
-
- if !strings.Contains(os.Getenv("GODEBUG"), "gcstackbarrierall=1") {
- // Re-execute this test with constant GC and stack
- // barriers at every frame.
- testenv.MustHaveExec(t)
- if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" {
- t.Skip("gcstackbarrierall doesn't work on ppc64")
- }
- args := []string{"-test.run=TestStackBarrierProfiling"}
- if testing.Short() {
- args = append(args, "-test.short")
- }
- cmd := exec.Command(os.Args[0], args...)
- cmd.Env = append([]string{"GODEBUG=gcstackbarrierall=1", "GOGC=1", "GOTRACEBACK=system"}, os.Environ()...)
- if out, err := cmd.CombinedOutput(); err != nil {
- t.Fatalf("subprocess failed with %v:\n%s", err, out)
- }
- return
- }
-
- testCPUProfile(t, nil, func(duration time.Duration) {
- // In long mode, we're likely to get one or two
- // samples in stackBarrier.
- t := time.After(duration)
- for {
- deepStack(1000)
- select {
- case <-t:
- return
- default:
- }
- }
- })
-}
-
-var x []byte
-
-func deepStack(depth int) int {
- if depth == 0 {
- return 0
- }
- x = make([]byte, 1024)
- return deepStack(depth-1) + 1
-}
-
// Operating systems that are expected to fail the tests. See issue 13841.
var badOS = map[string]bool{
"darwin": true,
@@ -472,46 +408,46 @@
}
tests := [...]TestCase{
{"chan recv", blockChanRecv, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanRecv\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanRecv\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"chan send", blockChanSend, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chansend1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanSend\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chansend1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanSend\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"chan close", blockChanClose, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.chanrecv1\+0x[0-9,a-f]+ .*/src/runtime/chan.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockChanClose\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.chanrecv1\+0x[0-9a-f]+ .*/src/runtime/chan.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockChanClose\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"select recv async", blockSelectRecvAsync, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectRecvAsync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockSelectRecvAsync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"select send sync", blockSelectSendSync, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ runtime\.selectgo\+0x[0-9,a-f]+ .*/src/runtime/select.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockSelectSendSync\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ runtime\.selectgo\+0x[0-9a-f]+ .*/src/runtime/select.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockSelectSendSync\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"mutex", blockMutex, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9,a-f]+ .*/src/sync/mutex\.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockMutex\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ sync\.\(\*Mutex\)\.Lock\+0x[0-9a-f]+ .*/src/sync/mutex\.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockMutex\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
{"cond", blockCond, `
-[0-9]+ [0-9]+ @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-# 0x[0-9,a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9,a-f]+ .*/src/sync/cond\.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.blockCond\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
-# 0x[0-9,a-f]+ runtime/pprof_test\.TestBlockProfile\+0x[0-9,a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+[0-9]+ [0-9]+ @( 0x[[:xdigit:]]+)+
+# 0x[0-9a-f]+ sync\.\(\*Cond\)\.Wait\+0x[0-9a-f]+ .*/src/sync/cond\.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.blockCond\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
+# 0x[0-9a-f]+ runtime/pprof\.TestBlockProfile\+0x[0-9a-f]+ .*/src/runtime/pprof/pprof_test.go:[0-9]+
`},
}
@@ -606,6 +542,10 @@
time.Sleep(blockDelay)
mu.Unlock()
}()
+ // Note: Unlock releases mu before recording the mutex event,
+ // so it's theoretically possible for this to proceed and
+ // capture the profile before the event is recorded. As long
+ // as this is blocked before the unlock happens, it's okay.
mu.Lock()
}
@@ -653,7 +593,7 @@
if ok, err := regexp.MatchString(r2, lines[3]); err != nil || !ok {
t.Errorf("%q didn't match %q", lines[3], r2)
}
- r3 := "^#.*runtime/pprof_test.blockMutex.*$"
+ r3 := "^#.*runtime/pprof.blockMutex.*$"
if ok, err := regexp.MatchString(r3, lines[5]); err != nil || !ok {
t.Errorf("%q didn't match %q", lines[5], r3)
}
@@ -665,22 +605,25 @@
func func4(c chan int) { <-c }
func TestGoroutineCounts(t *testing.T) {
- if runtime.GOOS == "openbsd" {
- testenv.SkipFlaky(t, 15156)
- }
+ // Setting GOMAXPROCS to 1 ensures we can force all goroutines to the
+ // desired blocking point.
+ defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
+
c := make(chan int)
for i := 0; i < 100; i++ {
- if i%10 == 0 {
+ switch {
+ case i%10 == 0:
go func1(c)
- continue
- }
- if i%2 == 0 {
+ case i%2 == 0:
go func2(c)
- continue
+ default:
+ go func3(c)
}
- go func3(c)
+ // Let goroutines block on channel
+ for j := 0; j < 5; j++ {
+ runtime.Gosched()
+ }
}
- time.Sleep(10 * time.Millisecond) // let goroutines block on channel
var w bytes.Buffer
goroutineProf := Lookup("goroutine")
@@ -743,3 +686,30 @@
}
return true
}
+
+// Issue 18836.
+func TestEmptyCallStack(t *testing.T) {
+ t.Parallel()
+ var buf bytes.Buffer
+ p := NewProfile("test18836")
+ p.Add("foo", 47674)
+ p.WriteTo(&buf, 1)
+ p.Remove("foo")
+ got := buf.String()
+ prefix := "test18836 profile: total 1\n"
+ if !strings.HasPrefix(got, prefix) {
+ t.Fatalf("got:\n\t%q\nwant prefix:\n\t%q\n", got, prefix)
+ }
+ lostevent := "lostProfileEvent"
+ if !strings.Contains(got, lostevent) {
+ t.Fatalf("got:\n\t%q\ndoes not contain:\n\t%q\n", got, lostevent)
+ }
+}
+
+func TestCPUProfileLabel(t *testing.T) {
+ testCPUProfile(t, []string{"runtime/pprof.cpuHogger;key=value"}, func(dur time.Duration) {
+ Do(context.Background(), Labels("key", "value"), func(context.Context) {
+ cpuHogger(cpuHog1, dur)
+ })
+ })
+}
diff --git a/src/runtime/pprof/proto.go b/src/runtime/pprof/proto.go
new file mode 100644
index 0000000..9e16e58
--- /dev/null
+++ b/src/runtime/pprof/proto.go
@@ -0,0 +1,510 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "compress/gzip"
+ "fmt"
+ "io"
+ "io/ioutil"
+ "runtime"
+ "sort"
+ "strconv"
+ "time"
+ "unsafe"
+)
+
+// lostProfileEvent is the function to which lost profiling
+// events are attributed.
+// (The name shows up in the pprof graphs.)
+func lostProfileEvent() { lostProfileEvent() }
+
+// funcPC returns the PC for the func value f.
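+// The interface value f is a (type, data) word pair; the data word
+// points at the func value, whose first word is the function's entry
+// PC, so dereferencing it yields the PC.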
+func funcPC(f interface{}) uintptr {
+ return *(*[2]*uintptr)(unsafe.Pointer(&f))[1]
+}
+
+// A profileBuilder writes a profile incrementally from a
+// stream of profile samples delivered by the runtime.
+type profileBuilder struct {
+ start time.Time
+ end time.Time
+ havePeriod bool
+ period int64
+ m profMap
+
+ // encoding state
+ w io.Writer
+ zw *gzip.Writer
+ pb protobuf
+ strings []string
+ stringMap map[string]int
+ locs map[uintptr]int
+ funcs map[string]int // Package path-qualified function name to Function.ID
+ mem []memMap
+}
+
+type memMap struct {
+ start uintptr
+ end uintptr
+}
+
+const (
+ // message Profile
+ tagProfile_SampleType = 1 // repeated ValueType
+ tagProfile_Sample = 2 // repeated Sample
+ tagProfile_Mapping = 3 // repeated Mapping
+ tagProfile_Location = 4 // repeated Location
+ tagProfile_Function = 5 // repeated Function
+ tagProfile_StringTable = 6 // repeated string
+ tagProfile_DropFrames = 7 // int64 (string table index)
+ tagProfile_KeepFrames = 8 // int64 (string table index)
+ tagProfile_TimeNanos = 9 // int64
+ tagProfile_DurationNanos = 10 // int64
+ tagProfile_PeriodType = 11 // ValueType (really optional string???)
+ tagProfile_Period = 12 // int64
+
+ // message ValueType
+ tagValueType_Type = 1 // int64 (string table index)
+ tagValueType_Unit = 2 // int64 (string table index)
+
+ // message Sample
+ tagSample_Location = 1 // repeated uint64
+ tagSample_Value = 2 // repeated int64
+ tagSample_Label = 3 // repeated Label
+
+ // message Label
+ tagLabel_Key = 1 // int64 (string table index)
+ tagLabel_Str = 2 // int64 (string table index)
+ tagLabel_Num = 3 // int64
+
+ // message Mapping
+ tagMapping_ID = 1 // uint64
+ tagMapping_Start = 2 // uint64
+ tagMapping_Limit = 3 // uint64
+ tagMapping_Offset = 4 // uint64
+ tagMapping_Filename = 5 // int64 (string table index)
+ tagMapping_BuildID = 6 // int64 (string table index)
+ tagMapping_HasFunctions = 7 // bool
+ tagMapping_HasFilenames = 8 // bool
+ tagMapping_HasLineNumbers = 9 // bool
+ tagMapping_HasInlineFrames = 10 // bool
+
+ // message Location
+ tagLocation_ID = 1 // uint64
+ tagLocation_MappingID = 2 // uint64
+ tagLocation_Address = 3 // uint64
+ tagLocation_Line = 4 // repeated Line
+
+ // message Line
+ tagLine_FunctionID = 1 // uint64
+ tagLine_Line = 2 // int64
+
+ // message Function
+ tagFunction_ID = 1 // uint64
+ tagFunction_Name = 2 // int64 (string table index)
+ tagFunction_SystemName = 3 // int64 (string table index)
+ tagFunction_Filename = 4 // int64 (string table index)
+ tagFunction_StartLine = 5 // int64
+)
+
+// stringIndex adds s to the string table if not already present
+// and returns the index of s in the string table.
+func (b *profileBuilder) stringIndex(s string) int64 {
+ id, ok := b.stringMap[s]
+ if !ok {
+ id = len(b.strings)
+ b.strings = append(b.strings, s)
+ b.stringMap[s] = id
+ }
+ return int64(id)
+}
+
+func (b *profileBuilder) flush() {
+ const dataFlush = 4096
+ if b.pb.nest == 0 && len(b.pb.data) > dataFlush {
+ b.zw.Write(b.pb.data)
+ b.pb.data = b.pb.data[:0]
+ }
+}
+
+// pbValueType encodes a ValueType message to b.pb.
+func (b *profileBuilder) pbValueType(tag int, typ, unit string) {
+ start := b.pb.startMessage()
+ b.pb.int64(tagValueType_Type, b.stringIndex(typ))
+ b.pb.int64(tagValueType_Unit, b.stringIndex(unit))
+ b.pb.endMessage(tag, start)
+}
+
+// pbSample encodes a Sample message to b.pb.
+func (b *profileBuilder) pbSample(values []int64, locs []uint64, labels func()) {
+ start := b.pb.startMessage()
+ b.pb.int64s(tagSample_Value, values)
+ b.pb.uint64s(tagSample_Location, locs)
+ if labels != nil {
+ labels()
+ }
+ b.pb.endMessage(tagProfile_Sample, start)
+ b.flush()
+}
+
+// pbLabel encodes a Label message to b.pb.
+func (b *profileBuilder) pbLabel(tag int, key, str string, num int64) {
+ start := b.pb.startMessage()
+ b.pb.int64Opt(tagLabel_Key, b.stringIndex(key))
+ b.pb.int64Opt(tagLabel_Str, b.stringIndex(str))
+ b.pb.int64Opt(tagLabel_Num, num)
+ b.pb.endMessage(tag, start)
+}
+
+// pbLine encodes a Line message to b.pb.
+func (b *profileBuilder) pbLine(tag int, funcID uint64, line int64) {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagLine_FunctionID, funcID)
+ b.pb.int64Opt(tagLine_Line, line)
+ b.pb.endMessage(tag, start)
+}
+
+// pbMapping encodes a Mapping message to b.pb.
+func (b *profileBuilder) pbMapping(tag int, id, base, limit, offset uint64, file, buildID string) {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagMapping_ID, id)
+ b.pb.uint64Opt(tagMapping_Start, base)
+ b.pb.uint64Opt(tagMapping_Limit, limit)
+ b.pb.uint64Opt(tagMapping_Offset, offset)
+ b.pb.int64Opt(tagMapping_Filename, b.stringIndex(file))
+ b.pb.int64Opt(tagMapping_BuildID, b.stringIndex(buildID))
+ // TODO: Set any of HasInlineFrames, HasFunctions, HasFilenames, HasLineNumbers?
+ // It seems like they should all be true, but they've never been set.
+ b.pb.endMessage(tag, start)
+}
+
+// locForPC returns the location ID for addr.
+// addr must be a return PC. This returns the location of the call.
+// It may emit to b.pb, so there must be no message encoding in progress.
+func (b *profileBuilder) locForPC(addr uintptr) uint64 {
+ id := uint64(b.locs[addr])
+ if id != 0 {
+ return id
+ }
+
+ // Expand this one address using CallersFrames so we can cache
+ // each expansion. In general, CallersFrames takes a whole
+ // stack, but in this case we know there will be no skips in
+ // the stack and we have return PCs anyway.
+ frames := runtime.CallersFrames([]uintptr{addr})
+ frame, more := frames.Next()
+ if frame.Function == "runtime.goexit" {
+ // Short-circuit if we see runtime.goexit so the loop
+ // below doesn't allocate a useless empty location.
+ return 0
+ }
+
+ if frame.PC == 0 {
+ // If we failed to resolve the frame, at least make up
+ // a reasonable call PC. This mostly happens in tests.
+ frame.PC = addr - 1
+ }
+
+ // We can't write out functions while in the middle of the
+ // Location message, so record new functions we encounter and
+ // write them out after the Location.
+ type newFunc struct {
+ id uint64
+ name, file string
+ }
+ newFuncs := make([]newFunc, 0, 8)
+
+ id = uint64(len(b.locs)) + 1
+ b.locs[addr] = int(id)
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagLocation_ID, id)
+ b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC))
+ for frame.Function != "runtime.goexit" {
+ // Write out each line in frame expansion.
+ funcID := uint64(b.funcs[frame.Function])
+ if funcID == 0 {
+ funcID = uint64(len(b.funcs)) + 1
+ b.funcs[frame.Function] = int(funcID)
+ newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
+ }
+ b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
+ if !more {
+ break
+ }
+ frame, more = frames.Next()
+ }
+ if len(b.mem) > 0 {
+ i := sort.Search(len(b.mem), func(i int) bool {
+ return b.mem[i].end > addr
+ })
+ if i < len(b.mem) && b.mem[i].start <= addr && addr < b.mem[i].end {
+ b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
+ }
+ }
+ b.pb.endMessage(tagProfile_Location, start)
+
+ // Write out functions we found during frame expansion.
+ for _, fn := range newFuncs {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagFunction_ID, fn.id)
+ b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
+ b.pb.endMessage(tagProfile_Function, start)
+ }
+
+ b.flush()
+ return id
+}
+
+// newProfileBuilder returns a new profileBuilder.
+// CPU profiling data obtained from the runtime can be added
+// by calling b.addCPUData, and then the eventual profile
+// can be written out by calling b.build.
+func newProfileBuilder(w io.Writer) *profileBuilder {
+ zw, _ := gzip.NewWriterLevel(w, gzip.BestSpeed)
+ b := &profileBuilder{
+ w: w,
+ zw: zw,
+ start: time.Now(),
+ strings: []string{""},
+ stringMap: map[string]int{"": 0},
+ locs: map[uintptr]int{},
+ funcs: map[string]int{},
+ }
+ b.readMapping()
+ return b
+}
+
+// addCPUData adds the CPU profiling data to the profile.
+// The data must be a whole number of records,
+// as delivered by the runtime.
+func (b *profileBuilder) addCPUData(data []uint64, tags []unsafe.Pointer) error {
+ if !b.havePeriod {
+ // first record is period
+ if len(data) < 3 {
+ return fmt.Errorf("truncated profile")
+ }
+ if data[0] != 3 || data[2] == 0 {
+ return fmt.Errorf("malformed profile")
+ }
+ // data[2] is sampling rate in Hz. Convert to sampling
+ // period in nanoseconds.
+ b.period = 1e9 / int64(data[2])
+ b.havePeriod = true
+ data = data[3:]
+ }
+
+ // Parse CPU samples from the profile.
+ // Each sample is 3+n uint64s:
+ // data[0] = 3+n
+ // data[1] = time stamp (ignored)
+ // data[2] = count
+ // data[3:3+n] = stack
+ // If the count is 0 and the stack has length 1,
+ // that's an overflow record inserted by the runtime
+ // to indicate that stack[0] samples were lost.
+ // Otherwise the count is usually 1,
+ // but in a few special cases like lost non-Go samples
+ // there can be larger counts.
+ // Because many samples with the same stack arrive,
+ // we want to deduplicate immediately, which we do
+ // using the b.m profMap.
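+ // For example (illustrative values), the record
+ //	5, 0, 1, 0x4a2b, 0x4a10
+ // is a single sample with count 1 and a two-frame stack, while
+ //	4, 0, 0, 17
+ // is an overflow record reporting 17 lost samples.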
+ for len(data) > 0 {
+ if len(data) < 3 || data[0] > uint64(len(data)) {
+ return fmt.Errorf("truncated profile")
+ }
+ if data[0] < 3 || tags != nil && len(tags) < 1 {
+ return fmt.Errorf("malformed profile")
+ }
+ count := data[2]
+ stk := data[3:data[0]]
+ data = data[data[0]:]
+ var tag unsafe.Pointer
+ if tags != nil {
+ tag = tags[0]
+ tags = tags[1:]
+ }
+
+ if count == 0 && len(stk) == 1 {
+ // overflow record
+ count = uint64(stk[0])
+ stk = []uint64{
+ uint64(funcPC(lostProfileEvent)),
+ }
+ }
+ b.m.lookup(stk, tag).count += int64(count)
+ }
+ return nil
+}
+
+// build completes and returns the constructed profile.
+func (b *profileBuilder) build() error {
+ b.end = time.Now()
+
+ b.pb.int64Opt(tagProfile_TimeNanos, b.start.UnixNano())
+ if b.havePeriod { // must be CPU profile
+ b.pbValueType(tagProfile_SampleType, "samples", "count")
+ b.pbValueType(tagProfile_SampleType, "cpu", "nanoseconds")
+ b.pb.int64Opt(tagProfile_DurationNanos, b.end.Sub(b.start).Nanoseconds())
+ b.pbValueType(tagProfile_PeriodType, "cpu", "nanoseconds")
+ b.pb.int64Opt(tagProfile_Period, b.period)
+ }
+
+ values := []int64{0, 0}
+ var locs []uint64
+ for e := b.m.all; e != nil; e = e.nextAll {
+ values[0] = e.count
+ values[1] = e.count * b.period
+
+ var labels func()
+ if e.tag != nil {
+ labels = func() {
+ for k, v := range *(*labelMap)(e.tag) {
+ b.pbLabel(tagSample_Label, k, v, 0)
+ }
+ }
+ }
+
+ locs = locs[:0]
+ for i, addr := range e.stk {
+ // Addresses from stack traces point to the
+ // next instruction after each call, except
+ // for the leaf, which points to where the
+ // signal occurred. locForPC expects return
+ // PCs, so increment the leaf address to look
+ // like a return PC.
+ if i == 0 {
+ addr++
+ }
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
+ }
+ locs = append(locs, l)
+ }
+ b.pbSample(values, locs, labels)
+ }
+
+ // TODO: Anything for tagProfile_DropFrames?
+ // TODO: Anything for tagProfile_KeepFrames?
+
+ b.pb.strings(tagProfile_StringTable, b.strings)
+ b.zw.Write(b.pb.data)
+ b.zw.Close()
+ return nil
+}
+
+// readMapping reads /proc/self/maps and writes mappings to b.pb.
+// It saves the address ranges of the mappings in b.mem for use
+// when emitting locations.
+func (b *profileBuilder) readMapping() {
+ data, _ := ioutil.ReadFile("/proc/self/maps")
+ parseProcSelfMaps(data, b.addMapping)
+}
+
+func parseProcSelfMaps(data []byte, addMapping func(lo, hi, offset uint64, file, buildID string)) {
+ // $ cat /proc/self/maps
+ // 00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat
+ // 0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat
+ // 0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat
+ // 014ab000-014cc000 rw-p 00000000 00:00 0 [heap]
+ // 7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive
+ // 7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+ // 7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0
+ // 7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0
+ // 7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0
+ // 7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+ // 7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0
+ // 7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack]
+ // 7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso]
+ // ffffffffff600000-ffffffffff601000 r-xp 00000000 00:00 0 [vsyscall]
+
+ var line []byte
+ // next removes and returns the next field in the line.
+ // It also removes from line any spaces following the field.
+ next := func() []byte {
+ j := bytes.IndexByte(line, ' ')
+ if j < 0 {
+ f := line
+ line = nil
+ return f
+ }
+ f := line[:j]
+ line = line[j+1:]
+ for len(line) > 0 && line[0] == ' ' {
+ line = line[1:]
+ }
+ return f
+ }
+
+ for len(data) > 0 {
+ i := bytes.IndexByte(data, '\n')
+ if i < 0 {
+ line, data = data, nil
+ } else {
+ line, data = data[:i], data[i+1:]
+ }
+ addr := next()
+ i = bytes.IndexByte(addr, '-')
+ if i < 0 {
+ continue
+ }
+ lo, err := strconv.ParseUint(string(addr[:i]), 16, 64)
+ if err != nil {
+ continue
+ }
+ hi, err := strconv.ParseUint(string(addr[i+1:]), 16, 64)
+ if err != nil {
+ continue
+ }
+ perm := next()
+ if len(perm) < 4 || perm[2] != 'x' {
+ // Only interested in executable mappings.
+ continue
+ }
+ offset, err := strconv.ParseUint(string(next()), 16, 64)
+ if err != nil {
+ continue
+ }
+ next() // dev
+ inode := next() // inode
+ if line == nil {
+ continue
+ }
+ file := string(line)
+ if len(inode) == 1 && inode[0] == '0' && file == "" {
+ // Huge-page text mappings list the initial fragment of
+ // mapped but unpopulated memory as being inode 0.
+ // Don't report that part.
+ // But [vdso] and [vsyscall] are inode 0, so let non-empty file names through.
+ continue
+ }
+
+ // TODO: pprof's remapMappingIDs makes two adjustments:
+ // 1. If there is an /anon_hugepage mapping first and it is
+ // consecutive to a next mapping, drop the /anon_hugepage.
+ // 2. If start-offset = 0x400000, change start to 0x400000 and offset to 0.
+ // There's no indication why either of these is needed.
+ // Let's try not doing these and see what breaks.
+ // If we do need them, they would go here, before we
+ // enter the mappings into b.mem in the first place.
+
+ buildID, _ := elfBuildID(file)
+ addMapping(lo, hi, offset, file, buildID)
+ }
+}
+
+func (b *profileBuilder) addMapping(lo, hi, offset uint64, file, buildID string) {
+ b.mem = append(b.mem, memMap{uintptr(lo), uintptr(hi)})
+ b.pbMapping(tagProfile_Mapping, uint64(len(b.mem)), lo, hi, offset, file, buildID)
+}
diff --git a/src/runtime/pprof/proto_test.go b/src/runtime/pprof/proto_test.go
new file mode 100644
index 0000000..dab929c
--- /dev/null
+++ b/src/runtime/pprof/proto_test.go
@@ -0,0 +1,222 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "encoding/json"
+ "fmt"
+ "io/ioutil"
+ "reflect"
+ "runtime"
+ "runtime/pprof/internal/profile"
+ "strings"
+ "testing"
+)
+
+// translateCPUProfile parses binary CPU profiling stack trace data
+// generated by runtime.CPUProfile() into a profile struct.
+// This is only used for testing. Real conversions stream the
+// data into the profileBuilder as it becomes available.
+func translateCPUProfile(data []uint64) (*profile.Profile, error) {
+ var buf bytes.Buffer
+ b := newProfileBuilder(&buf)
+ if err := b.addCPUData(data, nil); err != nil {
+ return nil, err
+ }
+ b.build()
+ return profile.Parse(&buf)
+}
+
+// fmtJSON returns a pretty-printed JSON form for x.
+// It works reasonably well for printing protocol-buffer
+// data structures like profile.Profile.
+func fmtJSON(x interface{}) string {
+ js, _ := json.MarshalIndent(x, "", "\t")
+ return string(js)
+}
+
+func TestConvertCPUProfileEmpty(t *testing.T) {
+ // A test server with mock cpu profile data.
+ var buf bytes.Buffer
+
+ b := []uint64{3, 0, 500} // empty profile at 500 Hz (2ms sample period)
+ p, err := translateCPUProfile(b)
+ if err != nil {
+ t.Fatalf("translateCPUProfile: %v", err)
+ }
+ if err := p.Write(&buf); err != nil {
+ t.Fatalf("writing profile: %v", err)
+ }
+
+ p, err = profile.Parse(&buf)
+ if err != nil {
+ t.Fatalf("profile.Parse: %v", err)
+ }
+
+ // Expected PeriodType and SampleType.
+ periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+ sampleType := []*profile.ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ }
+
+ checkProfile(t, p, 2000*1000, periodType, sampleType, nil)
+}
+
+func f1() { f1() }
+func f2() { f2() }
+
+// testPCs returns two PCs and two corresponding memory mappings
+// to use in test profiles.
+func testPCs(t *testing.T) (addr1, addr2 uint64, map1, map2 *profile.Mapping) {
+ switch runtime.GOOS {
+ case "linux", "android", "netbsd":
+ // Figure out two addresses from /proc/self/maps.
+ mmap, err := ioutil.ReadFile("/proc/self/maps")
+ if err != nil {
+ t.Fatal(err)
+ }
+ mprof := &profile.Profile{}
+ if err = mprof.ParseMemoryMap(bytes.NewReader(mmap)); err != nil {
+ t.Fatalf("parsing /proc/self/maps: %v", err)
+ }
+ if len(mprof.Mapping) < 2 {
+ // It is possible for a binary to only have 1 executable
+ // region of memory.
+ t.Skipf("need 2 or more mappings, got %v", len(mprof.Mapping))
+ }
+ addr1 = mprof.Mapping[0].Start
+ map1 = mprof.Mapping[0]
+ map1.BuildID, _ = elfBuildID(map1.File)
+ addr2 = mprof.Mapping[1].Start
+ map2 = mprof.Mapping[1]
+ map2.BuildID, _ = elfBuildID(map2.File)
+ default:
+ addr1 = uint64(funcPC(f1))
+ addr2 = uint64(funcPC(f2))
+ }
+ return
+}
+
+func TestConvertCPUProfile(t *testing.T) {
+ addr1, addr2, map1, map2 := testPCs(t)
+
+ b := []uint64{
+ 3, 0, 500, // hz = 500
+ 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+ 5, 0, 40, uint64(addr2), uint64(addr2 + 2), // 40 samples in addr2
+ 5, 0, 10, uint64(addr1), uint64(addr1 + 2), // 10 samples in addr1
+ }
+ p, err := translateCPUProfile(b)
+ if err != nil {
+ t.Fatalf("translating profile: %v", err)
+ }
+ period := int64(2000 * 1000)
+ periodType := &profile.ValueType{Type: "cpu", Unit: "nanoseconds"}
+ sampleType := []*profile.ValueType{
+ {Type: "samples", Unit: "count"},
+ {Type: "cpu", Unit: "nanoseconds"},
+ }
+ samples := []*profile.Sample{
+ {Value: []int64{20, 20 * 2000 * 1000}, Location: []*profile.Location{
+ {ID: 1, Mapping: map1, Address: addr1},
+ {ID: 2, Mapping: map1, Address: addr1 + 1},
+ }},
+ {Value: []int64{40, 40 * 2000 * 1000}, Location: []*profile.Location{
+ {ID: 3, Mapping: map2, Address: addr2},
+ {ID: 4, Mapping: map2, Address: addr2 + 1},
+ }},
+ }
+ checkProfile(t, p, period, periodType, sampleType, samples)
+}
+
+func checkProfile(t *testing.T, p *profile.Profile, period int64, periodType *profile.ValueType, sampleType []*profile.ValueType, samples []*profile.Sample) {
+ if p.Period != period {
+ t.Fatalf("p.Period = %d, want %d", p.Period, period)
+ }
+ if !reflect.DeepEqual(p.PeriodType, periodType) {
+ t.Fatalf("p.PeriodType = %v\nwant = %v", fmtJSON(p.PeriodType), fmtJSON(periodType))
+ }
+ if !reflect.DeepEqual(p.SampleType, sampleType) {
+ t.Fatalf("p.SampleType = %v\nwant = %v", fmtJSON(p.SampleType), fmtJSON(sampleType))
+ }
+ // Clear line info since it is not in the expected samples.
+ // If we used f1 and f2 above, then the samples will have line info.
+ for _, s := range p.Sample {
+ for _, l := range s.Location {
+ l.Line = nil
+ }
+ }
+ if fmtJSON(p.Sample) != fmtJSON(samples) { // ignore unexported fields
+ if len(p.Sample) == len(samples) {
+ for i := range p.Sample {
+ if !reflect.DeepEqual(p.Sample[i], samples[i]) {
+ t.Errorf("sample %d = %v\nwant = %v\n", i, fmtJSON(p.Sample[i]), fmtJSON(samples[i]))
+ }
+ }
+ if t.Failed() {
+ t.FailNow()
+ }
+ }
+ t.Fatalf("p.Sample = %v\nwant = %v", fmtJSON(p.Sample), fmtJSON(samples))
+ }
+}
+
+var profSelfMapsTests = `
+00400000-0040b000 r-xp 00000000 fc:01 787766 /bin/cat
+0060a000-0060b000 r--p 0000a000 fc:01 787766 /bin/cat
+0060b000-0060c000 rw-p 0000b000 fc:01 787766 /bin/cat
+014ab000-014cc000 rw-p 00000000 00:00 0 [heap]
+7f7d76af8000-7f7d7797c000 r--p 00000000 fc:01 1318064 /usr/lib/locale/locale-archive
+7f7d7797c000-7f7d77b36000 r-xp 00000000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77b36000-7f7d77d36000 ---p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d36000-7f7d77d3a000 r--p 001ba000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d3a000-7f7d77d3c000 rw-p 001be000 fc:01 1180226 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d3c000-7f7d77d41000 rw-p 00000000 00:00 0
+7f7d77d41000-7f7d77d64000 r-xp 00000000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f3f000-7f7d77f42000 rw-p 00000000 00:00 0
+7f7d77f61000-7f7d77f63000 rw-p 00000000 00:00 0
+7f7d77f63000-7f7d77f64000 r--p 00022000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f64000-7f7d77f65000 rw-p 00023000 fc:01 1180217 /lib/x86_64-linux-gnu/ld-2.19.so
+7f7d77f65000-7f7d77f66000 rw-p 00000000 00:00 0
+7ffc342a2000-7ffc342c3000 rw-p 00000000 00:00 0 [stack]
+7ffc34343000-7ffc34345000 r-xp 00000000 00:00 0 [vdso]
+ffffffffff600000-ffffffffff601000 r-xp 00000090 00:00 0 [vsyscall]
+->
+00400000 0040b000 00000000 /bin/cat
+7f7d7797c000 7f7d77b36000 00000000 /lib/x86_64-linux-gnu/libc-2.19.so
+7f7d77d41000 7f7d77d64000 00000000 /lib/x86_64-linux-gnu/ld-2.19.so
+7ffc34343000 7ffc34345000 00000000 [vdso]
+ffffffffff600000 ffffffffff601000 00000090 [vsyscall]
+
+00400000-07000000 r-xp 00000000 00:00 0
+07000000-07093000 r-xp 06c00000 00:2e 536754 /path/to/gobench_server_main
+07093000-0722d000 rw-p 06c92000 00:2e 536754 /path/to/gobench_server_main
+0722d000-07b21000 rw-p 00000000 00:00 0
+c000000000-c000036000 rw-p 00000000 00:00 0
+->
+07000000 07093000 06c00000 /path/to/gobench_server_main
+`
+
+func TestProcSelfMaps(t *testing.T) {
+ for tx, tt := range strings.Split(profSelfMapsTests, "\n\n") {
+ i := strings.Index(tt, "->\n")
+ if i < 0 {
+ t.Fatal("malformed test case")
+ }
+ in, out := tt[:i], tt[i+len("->\n"):]
+ if len(out) > 0 && out[len(out)-1] != '\n' {
+ out += "\n"
+ }
+ var buf bytes.Buffer
+ parseProcSelfMaps([]byte(in), func(lo, hi, offset uint64, file, buildID string) {
+ fmt.Fprintf(&buf, "%08x %08x %08x %s\n", lo, hi, offset, file)
+ })
+ if buf.String() != out {
+ t.Errorf("#%d: have:\n%s\nwant:\n%s\n%q\n%q", tx, buf.String(), out, buf.String(), out)
+ }
+ }
+}
diff --git a/src/runtime/pprof/protobuf.go b/src/runtime/pprof/protobuf.go
new file mode 100644
index 0000000..7b99095
--- /dev/null
+++ b/src/runtime/pprof/protobuf.go
@@ -0,0 +1,141 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+// A protobuf is a simple protocol buffer encoder.
+type protobuf struct {
+ data []byte
+ tmp [16]byte
+ nest int
+}
+
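+// varint appends x to b.data in base-128 varint encoding: seven bits
+// per byte, least-significant group first, with the high bit set on
+// every byte except the last. For example, 300 encodes as 0xac 0x02.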
+func (b *protobuf) varint(x uint64) {
+ for x >= 128 {
+ b.data = append(b.data, byte(x)|0x80)
+ x >>= 7
+ }
+ b.data = append(b.data, byte(x))
+}
+
+func (b *protobuf) length(tag int, len int) {
+ b.varint(uint64(tag)<<3 | 2)
+ b.varint(uint64(len))
+}
+
+func (b *protobuf) uint64(tag int, x uint64) {
+ // append varint to b.data
+ b.varint(uint64(tag)<<3 | 0)
+ b.varint(x)
+}
+
+func (b *protobuf) uint64s(tag int, x []uint64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ b.varint(u)
+ }
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ b.uint64(tag, u)
+ }
+}
+
+func (b *protobuf) uint64Opt(tag int, x uint64) {
+ if x == 0 {
+ return
+ }
+ b.uint64(tag, x)
+}
+
+func (b *protobuf) int64(tag int, x int64) {
+ u := uint64(x)
+ b.uint64(tag, u)
+}
+
+func (b *protobuf) int64Opt(tag int, x int64) {
+ if x == 0 {
+ return
+ }
+ b.int64(tag, x)
+}
+
+func (b *protobuf) int64s(tag int, x []int64) {
+ if len(x) > 2 {
+ // Use packed encoding
+ n1 := len(b.data)
+ for _, u := range x {
+ b.varint(uint64(u))
+ }
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ return
+ }
+ for _, u := range x {
+ b.int64(tag, u)
+ }
+}
+
+func (b *protobuf) string(tag int, x string) {
+ b.length(tag, len(x))
+ b.data = append(b.data, x...)
+}
+
+func (b *protobuf) strings(tag int, x []string) {
+ for _, s := range x {
+ b.string(tag, s)
+ }
+}
+
+func (b *protobuf) stringOpt(tag int, x string) {
+ if x == "" {
+ return
+ }
+ b.string(tag, x)
+}
+
+func (b *protobuf) bool(tag int, x bool) {
+ if x {
+ b.uint64(tag, 1)
+ } else {
+ b.uint64(tag, 0)
+ }
+}
+
+func (b *protobuf) boolOpt(tag int, x bool) {
+ if !x {
+ return
+ }
+ b.bool(tag, x)
+}
+
+type msgOffset int
+
+func (b *protobuf) startMessage() msgOffset {
+ b.nest++
+ return msgOffset(len(b.data))
+}
+
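+// endMessage closes a length-delimited message begun by startMessage:
+// the message body is already in b.data, so the tag/length header is
+// appended after it and then rotated into place in front of the body,
+// using b.tmp as scratch space.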
+func (b *protobuf) endMessage(tag int, start msgOffset) {
+ n1 := int(start)
+ n2 := len(b.data)
+ b.length(tag, n2-n1)
+ n3 := len(b.data)
+ copy(b.tmp[:], b.data[n2:n3])
+ copy(b.data[n1+(n3-n2):], b.data[n1:n2])
+ copy(b.data[n1:], b.tmp[:n3-n2])
+ b.nest--
+}
diff --git a/src/runtime/pprof/protomem.go b/src/runtime/pprof/protomem.go
new file mode 100644
index 0000000..2756cfd
--- /dev/null
+++ b/src/runtime/pprof/protomem.go
@@ -0,0 +1,93 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "io"
+ "math"
+ "runtime"
+ "strings"
+)
+
+// writeHeapProto writes the current heap profile in protobuf format to w.
+func writeHeapProto(w io.Writer, p []runtime.MemProfileRecord, rate int64) error {
+ b := newProfileBuilder(w)
+ b.pbValueType(tagProfile_PeriodType, "space", "bytes")
+ b.pb.int64Opt(tagProfile_Period, rate)
+ b.pbValueType(tagProfile_SampleType, "alloc_objects", "count")
+ b.pbValueType(tagProfile_SampleType, "alloc_space", "bytes")
+ b.pbValueType(tagProfile_SampleType, "inuse_objects", "count")
+ b.pbValueType(tagProfile_SampleType, "inuse_space", "bytes")
+
+ values := []int64{0, 0, 0, 0}
+ var locs []uint64
+ for _, r := range p {
+ locs = locs[:0]
+ hideRuntime := true
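+ // The first try hides leading runtime.* frames; if that leaves
+ // an empty stack, the second try keeps every frame.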
+ for tries := 0; tries < 2; tries++ {
+ for _, addr := range r.Stack() {
+ // For heap profiles, all stack
+ // addresses are return PCs, which is
+ // what locForPC expects.
+ if hideRuntime {
+ if f := runtime.FuncForPC(addr); f != nil && strings.HasPrefix(f.Name(), "runtime.") {
+ continue
+ }
+ // Found non-runtime. Show any runtime uses above it.
+ hideRuntime = false
+ }
+ l := b.locForPC(addr)
+ if l == 0 { // runtime.goexit
+ continue
+ }
+ locs = append(locs, l)
+ }
+ if len(locs) > 0 {
+ break
+ }
+ hideRuntime = false // try again, and show all frames
+ }
+
+ values[0], values[1] = scaleHeapSample(r.AllocObjects, r.AllocBytes, rate)
+ values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
+ var blockSize int64
+ if values[0] > 0 {
+ blockSize = values[1] / values[0]
+ }
+ b.pbSample(values, locs, func() {
+ if blockSize != 0 {
+ b.pbLabel(tagSample_Label, "bytes", "", blockSize)
+ }
+ })
+ }
+ b.build()
+ return nil
+}
+
+// scaleHeapSample adjusts the data from a heap Sample to
+// account for its probability of appearing in the collected
+// data. Heap profiles are a sampling of the memory allocation
+// requests in a program. We estimate the unsampled value by dividing
+// each collected sample by its probability of appearing in the
+// profile. Heap profiles rely on a Poisson process to determine
+// which samples to collect, based on the desired average collection
+// rate R. The probability of a sample of size S to appear in that
+// profile is 1-exp(-S/R).
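+//
+// For example (illustrative numbers): with R = 512 KiB and a sampled
+// allocation of average size S = 256 KiB, 1-exp(-0.5) is about 0.39,
+// so the sample is scaled up by roughly 2.5x.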
+func scaleHeapSample(count, size, rate int64) (int64, int64) {
+ if count == 0 || size == 0 {
+ return 0, 0
+ }
+
+ if rate <= 1 {
+ // if rate==1 all samples were collected so no adjustment is needed.
+ // if rate<1 treat as unknown and skip scaling.
+ return count, size
+ }
+
+ avgSize := float64(size) / float64(count)
+ scale := 1 / (1 - math.Exp(-avgSize/float64(rate)))
+
+ return int64(float64(count) * scale), int64(float64(size) * scale)
+}
diff --git a/src/runtime/pprof/protomem_test.go b/src/runtime/pprof/protomem_test.go
new file mode 100644
index 0000000..1e30ed9
--- /dev/null
+++ b/src/runtime/pprof/protomem_test.go
@@ -0,0 +1,74 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "bytes"
+ "runtime"
+ "runtime/pprof/internal/profile"
+ "testing"
+)
+
+func TestConvertMemProfile(t *testing.T) {
+ addr1, addr2, map1, map2 := testPCs(t)
+
+ var buf bytes.Buffer
+ // MemProfileRecord stacks are return PCs, so add one to the
+ // addresses recorded in the "profile". The proto profile
+ // locations are call PCs, so conversion will subtract one
+ // from these and get back to addr1 and addr2.
+ a1, a2 := uintptr(addr1)+1, uintptr(addr2)+1
+ rate := int64(512 * 1024)
+ rec := []runtime.MemProfileRecord{
+ {AllocBytes: 4096, FreeBytes: 1024, AllocObjects: 4, FreeObjects: 1, Stack0: [32]uintptr{a1, a2}},
+ {AllocBytes: 512 * 1024, FreeBytes: 0, AllocObjects: 1, FreeObjects: 0, Stack0: [32]uintptr{a2 + 1, a2 + 2}},
+ {AllocBytes: 512 * 1024, FreeBytes: 512 * 1024, AllocObjects: 1, FreeObjects: 1, Stack0: [32]uintptr{a1 + 1, a1 + 2, a2 + 3}},
+ }
+
+ if err := writeHeapProto(&buf, rec, rate); err != nil {
+ t.Fatalf("writing profile: %v", err)
+ }
+
+ p, err := profile.Parse(&buf)
+ if err != nil {
+ t.Fatalf("profile.Parse: %v", err)
+ }
+
+ periodType := &profile.ValueType{Type: "space", Unit: "bytes"}
+ sampleType := []*profile.ValueType{
+ {Type: "alloc_objects", Unit: "count"},
+ {Type: "alloc_space", Unit: "bytes"},
+ {Type: "inuse_objects", Unit: "count"},
+ {Type: "inuse_space", Unit: "bytes"},
+ }
+ samples := []*profile.Sample{
+ {
+ Value: []int64{2050, 2099200, 1537, 1574400},
+ Location: []*profile.Location{
+ {ID: 1, Mapping: map1, Address: addr1},
+ {ID: 2, Mapping: map2, Address: addr2},
+ },
+ NumLabel: map[string][]int64{"bytes": {1024}},
+ },
+ {
+ Value: []int64{1, 829411, 1, 829411},
+ Location: []*profile.Location{
+ {ID: 3, Mapping: map2, Address: addr2 + 1},
+ {ID: 4, Mapping: map2, Address: addr2 + 2},
+ },
+ NumLabel: map[string][]int64{"bytes": {829411}},
+ },
+ {
+ Value: []int64{1, 829411, 0, 0},
+ Location: []*profile.Location{
+ {ID: 5, Mapping: map1, Address: addr1 + 1},
+ {ID: 6, Mapping: map1, Address: addr1 + 2},
+ {ID: 7, Mapping: map2, Address: addr2 + 3},
+ },
+ NumLabel: map[string][]int64{"bytes": {829411}},
+ },
+ }
+ checkProfile(t, p, rate, periodType, sampleType, samples)
+}
diff --git a/src/runtime/pprof/runtime.go b/src/runtime/pprof/runtime.go
new file mode 100644
index 0000000..e6aace8
--- /dev/null
+++ b/src/runtime/pprof/runtime.go
@@ -0,0 +1,36 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+ "unsafe"
+)
+
+// runtime_setProfLabel is defined in runtime/proflabel.go.
+func runtime_setProfLabel(labels unsafe.Pointer)
+
+// runtime_getProfLabel is defined in runtime/proflabel.go.
+func runtime_getProfLabel() unsafe.Pointer
+
+// SetGoroutineLabels sets the current goroutine's labels to match ctx.
+// This is a lower-level API than Do; Do should be used instead when possible.
+func SetGoroutineLabels(ctx context.Context) {
+ ctxLabels, _ := ctx.Value(labelContextKey{}).(*labelMap)
+ runtime_setProfLabel(unsafe.Pointer(ctxLabels))
+}
+
+// Do calls f with a copy of the parent context with the
+// given labels added to the parent's label map.
+// Each key/value pair in labels is inserted into the label map in the
+// order provided, overriding any previous value for the same key.
+// The augmented label map will be set for the duration of the call to f
+// and restored once f returns.
+func Do(ctx context.Context, labels LabelSet, f func(context.Context)) {
+ defer SetGoroutineLabels(ctx)
+ ctx = WithLabels(ctx, labels)
+ SetGoroutineLabels(ctx)
+ f(ctx)
+}
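
A minimal usage sketch of the label API added above, under the assumption that the work function is a placeholder: Do applies the labels for the duration of f, and goroutines started inside f inherit them (see the newg.labels assignment in proc.go later in this change).

package main

import (
	"context"
	"runtime/pprof"
)

func work(ctx context.Context) {
	// CPU profile samples taken while this runs carry worker=indexer.
	_ = ctx
}

func main() {
	pprof.Do(context.Background(), pprof.Labels("worker", "indexer"), func(ctx context.Context) {
		work(ctx)
		// Goroutines started here inherit the same labels.
		done := make(chan struct{})
		go func() { work(ctx); close(done) }()
		<-done
	})
}
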
diff --git a/src/runtime/pprof/runtime_test.go b/src/runtime/pprof/runtime_test.go
new file mode 100644
index 0000000..0dd5324
--- /dev/null
+++ b/src/runtime/pprof/runtime_test.go
@@ -0,0 +1,96 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package pprof
+
+import (
+ "context"
+ "fmt"
+ "reflect"
+ "testing"
+)
+
+func TestSetGoroutineLabels(t *testing.T) {
+ sync := make(chan struct{})
+
+ wantLabels := map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty before test, got %v", gotLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected child goroutine's profile labels to be empty before test, got %v", gotLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ wantLabels = map[string]string{"key": "value"}
+ ctx := WithLabels(context.Background(), Labels("key", "value"))
+ SetGoroutineLabels(ctx)
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("parent goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("child goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ wantLabels = map[string]string{}
+ ctx = context.Background()
+ SetGoroutineLabels(ctx)
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty, got %v", gotLabels)
+ }
+ go func() {
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected child goroutine's profile labels to be empty, got %v", gotLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+}
+
+func TestDo(t *testing.T) {
+ wantLabels := map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("Expected parent goroutine's profile labels to be empty before Do, got %v", gotLabels)
+ }
+
+ Do(context.Background(), Labels("key1", "value1", "key2", "value2"), func(ctx context.Context) {
+ wantLabels := map[string]string{"key1": "value1", "key2": "value2"}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("parent goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+
+ sync := make(chan struct{})
+ go func() {
+ wantLabels := map[string]string{"key1": "value1", "key2": "value2"}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ t.Errorf("child goroutine's profile labels: got %v, want %v", gotLabels, wantLabels)
+ }
+ sync <- struct{}{}
+ }()
+ <-sync
+
+ })
+
+ wantLabels = map[string]string{}
+ if gotLabels := getProfLabel(); !reflect.DeepEqual(gotLabels, wantLabels) {
+ fmt.Printf("%#v", gotLabels)
+ fmt.Printf("%#v", wantLabels)
+ t.Errorf("Expected parent goroutine's profile labels to be empty after Do, got %v", gotLabels)
+ }
+}
+
+func getProfLabel() map[string]string {
+ l := (*labelMap)(runtime_getProfLabel())
+ if l == nil {
+ return map[string]string{}
+ }
+ return *l
+}
diff --git a/src/runtime/pprof/testdata/README b/src/runtime/pprof/testdata/README
new file mode 100644
index 0000000..876538e
--- /dev/null
+++ b/src/runtime/pprof/testdata/README
@@ -0,0 +1,9 @@
+These binaries were generated by:
+
+$ cat empty.s
+.global _start
+_start:
+$ as --32 -o empty.o empty.s && ld --build-id -m elf_i386 -o test32 empty.o
+$ as --64 -o empty.o empty.s && ld --build-id -o test64 empty.o
+$ powerpc-linux-gnu-as -o empty.o empty.s && powerpc-linux-gnu-ld --build-id -o test32be empty.o
+$ powerpc64-linux-gnu-as -o empty.o empty.s && powerpc64-linux-gnu-ld --build-id -o test64be empty.o
diff --git a/src/runtime/pprof/testdata/test32 b/src/runtime/pprof/testdata/test32
new file mode 100755
index 0000000..ce59472
--- /dev/null
+++ b/src/runtime/pprof/testdata/test32
Binary files differ
diff --git a/src/runtime/pprof/testdata/test32be b/src/runtime/pprof/testdata/test32be
new file mode 100755
index 0000000..f13a732
--- /dev/null
+++ b/src/runtime/pprof/testdata/test32be
Binary files differ
diff --git a/src/runtime/pprof/testdata/test64 b/src/runtime/pprof/testdata/test64
new file mode 100755
index 0000000..3fb42fb
--- /dev/null
+++ b/src/runtime/pprof/testdata/test64
Binary files differ
diff --git a/src/runtime/pprof/testdata/test64be b/src/runtime/pprof/testdata/test64be
new file mode 100755
index 0000000..09b4b01
--- /dev/null
+++ b/src/runtime/pprof/testdata/test64be
Binary files differ
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index f41672d..a5ada4f 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -190,8 +190,17 @@
// Make racy client program work: if panicking on
// another goroutine at the same time as main returns,
// let the other goroutine finish printing the panic trace.
- // Once it does, it will exit. See issue 3934.
- if panicking != 0 {
+ // Once it does, it will exit. See issues 3934 and 20018.
+ if atomic.Load(&runningPanicDefers) != 0 {
+ // Running deferred functions should not take long.
+ for c := 0; c < 1000; c++ {
+ if atomic.Load(&runningPanicDefers) == 0 {
+ break
+ }
+ Gosched()
+ }
+ }
+ if atomic.Load(&panicking) != 0 {
gopark(nil, nil, "panicwait", traceEvGoStop, 1)
}
@@ -228,26 +237,23 @@
if debug.gctrace > 0 {
println("GC forced")
}
- gcStart(gcBackgroundMode, true)
+ // Time-triggered, fully concurrent.
+ gcStart(gcBackgroundMode, gcTrigger{kind: gcTriggerTime, now: nanotime()})
}
}
-//go:nosplit
-
// Gosched yields the processor, allowing other goroutines to run. It does not
// suspend the current goroutine, so execution resumes automatically.
+//go:nosplit
func Gosched() {
mcall(gosched_m)
}
-var alwaysFalse bool
-
-// goschedguarded does nothing, but is written in a way that guarantees a preemption check in its prologue.
-// Calls to this function are inserted by the compiler in otherwise uninterruptible loops (see insertLoopReschedChecks).
+// goschedguarded yields the processor like Gosched, but also checks
+// for forbidden states and opts out of the yield in those cases.
+//go:nosplit
func goschedguarded() {
- if alwaysFalse {
- goschedguarded()
- }
+ mcall(goschedguarded_m)
}
// Puts the current goroutine into a waiting state and calls unlockf.
@@ -432,16 +438,6 @@
lock(&allglock)
allgs = append(allgs, gp)
allglen = uintptr(len(allgs))
-
- // Grow GC rescan list if necessary.
- if len(allgs) > cap(work.rescan.list) {
- lock(&work.rescan.lock)
- l := work.rescan.list
- // Let append do the heavy lifting, but keep the
- // length the same.
- work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
- unlock(&work.rescan.lock)
- }
unlock(&allglock)
}
@@ -795,9 +791,8 @@
nextYield = nanotime() + yieldDelay/2
}
}
- if newval == _Grunning && gp.gcscanvalid {
- // Run queueRescan on the system stack so it has more space.
- systemstack(func() { queueRescan(gp) })
+ if newval == _Grunning {
+ gp.gcscanvalid = false
}
}
@@ -827,8 +822,6 @@
// Nothing is racing with us now, but gcscandone might be set to true left over
// from an earlier round of stack scanning (we scan twice per GC).
// We use gcscandone to record whether the scan has been done during this round.
- // It is important that the scan happens exactly once: if called twice,
- // the installation of stack barriers will detect the double scan and die.
gp.gcscandone = false
@@ -941,7 +934,7 @@
// in panic or being exited, this may not reliably stop all
// goroutines.
func stopTheWorld(reason string) {
- semacquire(&worldsema, 0)
+ semacquire(&worldsema)
getg().m.preemptoff = reason
systemstack(stopTheWorldWithSema)
}
@@ -1416,6 +1409,7 @@
// running at all (that is, there's no garbage collection
// running right now).
mp.needextram = mp.schedlink == 0
+ extraMCount--
unlockextra(mp.schedlink.ptr())
// Save and block signals before installing g.
@@ -1441,6 +1435,10 @@
// Initialize this thread to use the m.
asminit()
minit()
+
+ // mp.curg is now a real goroutine.
+ casgstatus(mp.curg, _Gdead, _Gsyscall)
+ atomic.Xadd(&sched.ngsys, -1)
}
var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
@@ -1481,12 +1479,13 @@
gp.syscallpc = gp.sched.pc
gp.syscallsp = gp.sched.sp
gp.stktopsp = gp.sched.sp
- gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
+ gp.gcscanvalid = true
gp.gcscandone = true
- gp.gcRescan = -1
- // malg returns status as Gidle, change to Gsyscall before adding to allg
- // where GC will see it.
- casgstatus(gp, _Gidle, _Gsyscall)
+ // malg returns status as _Gidle. Change to _Gdead before
+ // adding to allg where GC can see it. We use _Gdead to hide
+ // this from tracebacks and stack scans since it isn't a
+ // "real" goroutine until needm grabs it.
+ casgstatus(gp, _Gidle, _Gdead)
gp.m = mp
mp.curg = gp
mp.locked = _LockInternal
@@ -1499,9 +1498,16 @@
// put on allg for garbage collector
allgadd(gp)
+ // gp is now on the allg list, but we don't want it to be
+ // counted by gcount. It would be more "proper" to increment
+ // sched.ngfree, but that requires locking. Incrementing ngsys
+ // has the same effect.
+ atomic.Xadd(&sched.ngsys, +1)
+
// Add m to the extra list.
mnext := lockextra(true)
mp.schedlink.set(mnext)
+ extraMCount++
unlockextra(mp)
}
@@ -1534,6 +1540,10 @@
// with no pointer manipulation.
mp := getg().m
+ // Return mp.curg to dead state.
+ casgstatus(mp.curg, _Gsyscall, _Gdead)
+ atomic.Xadd(&sched.ngsys, +1)
+
// Block signals before unminit.
// Unminit unregisters the signal handling stack (but needs g on some systems).
// Setg(nil) clears g, which is the signal handler's cue not to run Go handlers.
@@ -1543,6 +1553,7 @@
unminit()
mnext := lockextra(true)
+ extraMCount++
mp.schedlink.set(mnext)
setg(nil)
@@ -1559,6 +1570,7 @@
}
var extram uintptr
+var extraMCount uint32 // Protected by lockextra
var extraMWaiters uint32
// lockextra locks the extra list and returns the list head.
@@ -1603,6 +1615,10 @@
atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
}
+// execLock serializes exec and clone to avoid bugs or unspecified behaviour
+// around exec'ing while creating/destroying threads. See issue #19546.
+var execLock rwmutex
+
// Create a new m. It will start off with a call to fn, or else the scheduler.
// fn needs to be static and not a heap allocated closure.
// May run with m.p==nil, so write barriers are not allowed.
@@ -1622,10 +1638,14 @@
if msanenabled {
msanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts))
}
+ execLock.rlock() // Prevent process clone.
asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
+ execLock.runlock()
return
}
+ execLock.rlock() // Prevent process clone.
newosproc(mp, unsafe.Pointer(mp.g0.stack.hi))
+ execLock.runlock()
}
// Stops execution of the current m until new work is available.
@@ -1879,7 +1899,7 @@
// Check whether the profiler needs to be turned on or off.
hz := sched.profilehz
if _g_.m.profilehz != hz {
- resetcpuprofiler(hz)
+ setThreadCPUProfiler(hz)
}
if trace.enabled {
@@ -1917,6 +1937,9 @@
ready(gp, 0, true)
}
}
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
// local runq
if gp, inheritTime := runqget(_p_); gp != nil {
@@ -2074,7 +2097,7 @@
}
// poll network
- if netpollinited() && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+ if netpollinited() && atomic.Load(&netpollWaiters) > 0 && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
if _g_.m.p != 0 {
throw("findrunnable: netpoll with p")
}
@@ -2115,7 +2138,7 @@
if !runqempty(p) {
return true
}
- if netpollinited() && sched.lastpoll != 0 {
+ if netpollinited() && atomic.Load(&netpollWaiters) > 0 && sched.lastpoll != 0 {
if gp := netpoll(false); gp != nil {
injectglist(gp)
return true
@@ -2263,7 +2286,7 @@
_g_ := getg()
if trace.enabled {
- traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
+ traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip)
}
casgstatus(gp, _Grunning, _Gwaiting)
@@ -2308,6 +2331,19 @@
goschedImpl(gp)
}
+// goschedguarded_m is like gosched_m, but it skips the yield in forbidden states (locks held, mallocing, preemption off, or the P not running).
+func goschedguarded_m(gp *g) {
+
+ if gp.m.locks != 0 || gp.m.mallocing != 0 || gp.m.preemptoff != "" || gp.m.p.ptr().status != _Prunning {
+ gogo(&gp.sched) // never return
+ }
+
+ if trace.enabled {
+ traceGoSched()
+ }
+ goschedImpl(gp)
+}
+
func gopreempt_m(gp *g) {
if trace.enabled {
traceGoPreempt()
@@ -2343,10 +2379,11 @@
gp.writebuf = nil
gp.waitreason = ""
gp.param = nil
+ gp.labels = nil
+ gp.timer = nil
// Note that gp's stack scan is now "valid" because it has no
- // stack. We could dequeueRescan, but that takes a lock and
- // isn't really necessary.
+ // stack.
gp.gcscanvalid = true
dropg()
@@ -2775,12 +2812,12 @@
func beforefork() {
gp := getg().m.curg
- // Fork can hang if preempted with signals frequently enough (see issue 5517).
- // Ensure that we stay on the same M where we disable profiling.
+ // Block signals during a fork, so that the child does not run
+ // a signal handler before exec if a signal is sent to the process
+ // group. See issue #18600.
gp.m.locks++
- if gp.m.profilehz != 0 {
- resetcpuprofiler(0)
- }
+ msigsave(gp.m)
+ sigblock()
// This function is called before fork in syscall package.
// Code between fork and exec must not allocate memory nor even try to grow stack.
@@ -2799,13 +2836,11 @@
func afterfork() {
gp := getg().m.curg
- // See the comment in beforefork.
+ // See the comments in beforefork.
gp.stackguard0 = gp.stack.lo + _StackGuard
- hz := sched.profilehz
- if hz != 0 {
- resetcpuprofiler(hz)
- }
+ msigrestore(gp.m.sigmask)
+
gp.m.locks--
}
@@ -2816,17 +2851,60 @@
systemstack(afterfork)
}
+// inForkedChild is true while manipulating signals in the child process.
+// This is used to avoid calling libc functions in case we are using vfork.
+var inForkedChild bool
+
+// Called from syscall package after fork in child.
+// It resets non-sigignored signals to the default handler, and
+// restores the signal mask in preparation for the exec.
+//
+// Because this might be called during a vfork, and therefore may be
+// temporarily sharing address space with the parent process, this must
+// not change any global variables or call into C code that may do so.
+//
+//go:linkname syscall_runtime_AfterForkInChild syscall.runtime_AfterForkInChild
+//go:nosplit
+//go:nowritebarrierrec
+func syscall_runtime_AfterForkInChild() {
+ // It's OK to change the global variable inForkedChild here
+ // because we are going to change it back. There is no race here,
+ // because if we are sharing address space with the parent process,
+	// then the parent process cannot be running concurrently.
+ inForkedChild = true
+
+ clearSignalHandlers()
+
+ // When we are the child we are the only thread running,
+ // so we know that nothing else has changed gp.m.sigmask.
+ msigrestore(getg().m.sigmask)
+
+ inForkedChild = false
+}
+
+// Called from syscall package before Exec.
+//go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec
+func syscall_runtime_BeforeExec() {
+ // Prevent thread creation during exec.
+ execLock.lock()
+}
+
+// Called from syscall package after Exec.
+//go:linkname syscall_runtime_AfterExec syscall.runtime_AfterExec
+func syscall_runtime_AfterExec() {
+ execLock.unlock()
+}
+
// Allocate a new g, with a stack big enough for stacksize bytes.
func malg(stacksize int32) *g {
newg := new(g)
if stacksize >= 0 {
stacksize = round2(_StackSystem + stacksize)
systemstack(func() {
- newg.stack, newg.stkbar = stackalloc(uint32(stacksize))
+ newg.stack = stackalloc(uint32(stacksize))
})
newg.stackguard0 = newg.stack.lo + _StackGuard
newg.stackguard1 = ^uintptr(0)
- newg.stackAlloc = uintptr(stacksize)
}
return newg
}
@@ -2874,7 +2952,6 @@
if newg == nil {
newg = malg(_StackMin)
casgstatus(newg, _Gidle, _Gdead)
- newg.gcRescan = -1
allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
}
if newg.stack.hi == 0 {
@@ -2920,20 +2997,13 @@
gostartcallfn(&newg.sched, fn)
newg.gopc = callerpc
newg.startpc = fn.fn
+ if _g_.m.curg != nil {
+ newg.labels = _g_.m.curg.labels
+ }
if isSystemGoroutine(newg) {
atomic.Xadd(&sched.ngsys, +1)
}
- // The stack is dirty from the argument frame, so queue it for
- // scanning. Do this before setting it to runnable so we still
- // own the G. If we're recycling a G, it may already be on the
- // rescan list.
- if newg.gcRescan == -1 {
- queueRescan(newg)
- } else {
- // The recycled G is already on the rescan list. Just
- // mark the stack dirty.
- newg.gcscanvalid = false
- }
+ newg.gcscanvalid = false
casgstatus(newg, _Gdead, _Grunnable)
if _p_.goidcache == _p_.goidcacheend {
@@ -2971,20 +3041,14 @@
throw("gfput: bad status (not Gdead)")
}
- stksize := gp.stackAlloc
+ stksize := gp.stack.hi - gp.stack.lo
if stksize != _FixedStack {
// non-standard stack size - free it.
- stackfree(gp.stack, gp.stackAlloc)
+ stackfree(gp.stack)
gp.stack.lo = 0
gp.stack.hi = 0
gp.stackguard0 = 0
- gp.stkbar = nil
- gp.stkbarPos = 0
- } else {
- // Reset stack barriers.
- gp.stkbar = gp.stkbar[:0]
- gp.stkbarPos = 0
}
gp.schedlink.set(_p_.gfree)
@@ -3041,16 +3105,15 @@
if gp.stack.lo == 0 {
// Stack was deallocated in gfput. Allocate a new one.
systemstack(func() {
- gp.stack, gp.stkbar = stackalloc(_FixedStack)
+ gp.stack = stackalloc(_FixedStack)
})
gp.stackguard0 = gp.stack.lo + _StackGuard
- gp.stackAlloc = _FixedStack
} else {
if raceenabled {
- racemalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
+ racemalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
}
if msanenabled {
- msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
+ msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stack.hi-gp.stack.lo)
}
}
}
@@ -3145,8 +3208,7 @@
func gcount() int32 {
n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys))
- for i := 0; ; i++ {
- _p_ := allp[i]
+ for _, _p_ := range &allp {
if _p_ == nil {
break
}
@@ -3166,13 +3228,14 @@
}
var prof struct {
- lock uint32
- hz int32
+ signalLock uint32
+ hz int32
}
-func _System() { _System() }
-func _ExternalCode() { _ExternalCode() }
-func _GC() { _GC() }
+func _System() { _System() }
+func _ExternalCode() { _ExternalCode() }
+func _LostExternalCode() { _LostExternalCode() }
+func _GC() { _GC() }
// Called if we receive a SIGPROF signal.
// Called by the signal handler, may run during STW.
@@ -3260,7 +3323,6 @@
traceback = false
}
var stk [maxCPUProfStack]uintptr
- var haveStackLock *g
n := 0
if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
cgoOff := 0
@@ -3278,26 +3340,9 @@
}
// Collect Go stack that leads to the cgo call.
- if gcTryLockStackBarriers(mp.curg) {
- haveStackLock = mp.curg
- n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0)
- }
+ n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0)
} else if traceback {
- var flags uint = _TraceTrap
- if gp.m.curg != nil && gcTryLockStackBarriers(gp.m.curg) {
- // It's safe to traceback the user stack.
- haveStackLock = gp.m.curg
- flags |= _TraceJumpStack
- }
- // Traceback is safe if we're on the system stack (if
- // necessary, flags will stop it before switching to
- // the user stack), or if we locked the user stack.
- if gp != gp.m.curg || haveStackLock != nil {
- n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, flags)
- }
- }
- if haveStackLock != nil {
- gcUnlockStackBarriers(haveStackLock)
+ n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
}
if n <= 0 {
@@ -3307,10 +3352,7 @@
if GOOS == "windows" && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
// Libcall, i.e. runtime syscall on windows.
// Collect Go stack that leads to the call.
- if gcTryLockStackBarriers(mp.libcallg.ptr()) {
- n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
- gcUnlockStackBarriers(mp.libcallg.ptr())
- }
+ n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
}
if n == 0 {
// If all of the above has failed, account it against abstract "System" or "GC".
@@ -3329,14 +3371,7 @@
}
if prof.hz != 0 {
- // Simple cas-lock to coordinate with setcpuprofilerate.
- for !atomic.Cas(&prof.lock, 0, 1) {
- osyield()
- }
- if prof.hz != 0 {
- cpuprof.add(stk[:n])
- }
- atomic.Store(&prof.lock, 0)
+ cpuprof.add(gp, stk[:n])
}
getg().m.mallocing--
}
@@ -3359,15 +3394,7 @@
for n < len(sigprofCallers) && sigprofCallers[n] != 0 {
n++
}
-
- // Simple cas-lock to coordinate with setcpuprofilerate.
- for !atomic.Cas(&prof.lock, 0, 1) {
- osyield()
- }
- if prof.hz != 0 {
- cpuprof.addNonGo(sigprofCallers[:n])
- }
- atomic.Store(&prof.lock, 0)
+ cpuprof.addNonGo(sigprofCallers[:n])
}
atomic.Store(&sigprofCallersUse, 0)
@@ -3380,19 +3407,11 @@
//go:nowritebarrierrec
func sigprofNonGoPC(pc uintptr) {
if prof.hz != 0 {
- pc := []uintptr{
+ stk := []uintptr{
pc,
funcPC(_ExternalCode) + sys.PCQuantum,
}
-
- // Simple cas-lock to coordinate with setcpuprofilerate.
- for !atomic.Cas(&prof.lock, 0, 1) {
- osyield()
- }
- if prof.hz != 0 {
- cpuprof.addNonGo(pc)
- }
- atomic.Store(&prof.lock, 0)
+ cpuprof.addNonGo(stk)
}
}
@@ -3408,7 +3427,7 @@
// or putting one on the stack at the right offset.
func setsSP(pc uintptr) bool {
f := findfunc(pc)
- if f == nil {
+ if !f.valid() {
// couldn't find the function for this PC,
// so assume the worst and stop traceback
return true
@@ -3420,8 +3439,9 @@
return false
}
-// Arrange to call fn with a traceback hz times a second.
-func setcpuprofilerate_m(hz int32) {
+// setcpuprofilerate sets the CPU profiling rate to hz times per second.
+// If hz <= 0, setcpuprofilerate turns off CPU profiling.
+func setcpuprofilerate(hz int32) {
// Force sane arguments.
if hz < 0 {
hz = 0
@@ -3435,20 +3455,23 @@
// Stop profiler on this thread so that it is safe to lock prof.
// if a profiling signal came in while we had prof locked,
// it would deadlock.
- resetcpuprofiler(0)
+ setThreadCPUProfiler(0)
- for !atomic.Cas(&prof.lock, 0, 1) {
+ for !atomic.Cas(&prof.signalLock, 0, 1) {
osyield()
}
- prof.hz = hz
- atomic.Store(&prof.lock, 0)
+ if prof.hz != hz {
+ setProcessCPUProfiler(hz)
+ prof.hz = hz
+ }
+ atomic.Store(&prof.signalLock, 0)
lock(&sched.lock)
sched.profilehz = hz
unlock(&sched.lock)
if hz != 0 {
- resetcpuprofiler(hz)
+ setThreadCPUProfiler(hz)
}
_g_.m.locks--
@@ -3802,7 +3825,25 @@
if scavengelimit < forcegcperiod {
maxsleep = scavengelimit / 2
}
+ shouldRelax := true
+ if osRelaxMinNS > 0 {
+ lock(&timers.lock)
+ if timers.sleeping {
+ now := nanotime()
+ next := timers.sleepUntil
+ if next-now < osRelaxMinNS {
+ shouldRelax = false
+ }
+ }
+ unlock(&timers.lock)
+ }
+ if shouldRelax {
+ osRelax(true)
+ }
notetsleep(&sched.sysmonnote, maxsleep)
+ if shouldRelax {
+ osRelax(false)
+ }
lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote)
@@ -3811,10 +3852,13 @@
}
unlock(&sched.lock)
}
+ // trigger libc interceptors if needed
+ if *cgo_yield != nil {
+ asmcgocall(*cgo_yield, nil)
+ }
// poll network if not polled for more than 10ms
lastpoll := int64(atomic.Load64(&sched.lastpoll))
now := nanotime()
- unixnow := unixnanotime()
if lastpoll != 0 && lastpoll+10*1000*1000 < now {
atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
gp := netpoll(false) // non-blocking - returns list of goroutines
@@ -3839,8 +3883,7 @@
idle++
}
// check if we need to force a GC
- lastgc := int64(atomic.Load64(&memstats.last_gc))
- if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
+ if t := (gcTrigger{kind: gcTriggerTime, now: now}); t.test() && atomic.Load(&forcegc.idle) != 0 {
lock(&forcegc.lock)
forcegc.idle = 0
forcegc.g.schedlink = 0
@@ -3860,7 +3903,7 @@
}
}
-var pdesc [_MaxGomaxprocs]struct {
+type sysmontick struct {
schedtick uint32
schedwhen int64
syscalltick uint32
@@ -3878,7 +3921,7 @@
if _p_ == nil {
continue
}
- pd := &pdesc[i]
+ pd := &_p_.sysmontick
s := _p_.status
if s == _Psyscall {
// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
@@ -4274,7 +4317,7 @@
if randomizeScheduler {
for i := uint32(1); i <= n; i++ {
- j := fastrand() % (i + 1)
+ j := fastrandn(i + 1)
batch[i], batch[j] = batch[j], batch[i]
}
}
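
The execLock introduced above is an rwmutex: thread creation (newm) takes it for reading, while the syscall package's exec hooks take it for writing, so an exec never overlaps thread creation. A standalone sketch of that locking discipline, using sync.RWMutex in place of the runtime-internal rwmutex (names here are illustrative):

package main

import "sync"

var execLock sync.RWMutex // stands in for runtime's execLock rwmutex

// startThread models newm: many threads may start concurrently,
// but none while an exec is in flight.
func startThread(fn func()) {
	execLock.RLock()
	go fn()
	execLock.RUnlock()
}

// execProcess models syscall.Exec via runtime_BeforeExec/AfterExec:
// it excludes all concurrent thread creation.
func execProcess(run func()) {
	execLock.Lock()
	defer execLock.Unlock()
	run()
}

func main() {
	startThread(func() {})
	execProcess(func() {})
}
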
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 22e4dca..90a6cab 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -53,14 +53,14 @@
}
func TestYieldProgress(t *testing.T) {
- testYieldProgress(t, false)
+ testYieldProgress(false)
}
func TestYieldLockedProgress(t *testing.T) {
- testYieldProgress(t, true)
+ testYieldProgress(true)
}
-func testYieldProgress(t *testing.T, locked bool) {
+func testYieldProgress(locked bool) {
c := make(chan bool)
cack := make(chan bool)
go func() {
@@ -428,10 +428,13 @@
<-lightChan
// Check that hogCount and lightCount are within a factor of
- // 2, which indicates that both pairs of goroutines handed off
- // the P within a time-slice to their buddy.
- if hogCount > lightCount*2 || lightCount > hogCount*2 {
- t.Fatalf("want hogCount/lightCount in [0.5, 2]; got %d/%d = %g", hogCount, lightCount, float64(hogCount)/float64(lightCount))
+ // 5, which indicates that both pairs of goroutines handed off
+ // the P within a time-slice to their buddy. We can use a
+ // fairly large factor here to make this robust: if the
+ // scheduler isn't working right, the gap should be ~1000X.
+ const factor = 5
+ if hogCount > lightCount*factor || lightCount > hogCount*factor {
+ t.Fatalf("want hogCount/lightCount in [%v, %v]; got %d/%d = %g", 1.0/factor, factor, hogCount, lightCount, float64(hogCount)/float64(lightCount))
}
}
diff --git a/src/runtime/profbuf.go b/src/runtime/profbuf.go
new file mode 100644
index 0000000..2719238
--- /dev/null
+++ b/src/runtime/profbuf.go
@@ -0,0 +1,561 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+// A profBuf is a lock-free buffer for profiling events,
+// safe for concurrent use by one reader and one writer.
+// The writer may be a signal handler running without a user g.
+// The reader is assumed to be a user g.
+//
+// Each logged event corresponds to a fixed size header, a list of
+// uintptrs (typically a stack), and exactly one unsafe.Pointer tag.
+// The header and uintptrs are stored in the circular buffer data and the
+// tag is stored in a circular buffer tags, running in parallel.
+// In the circular buffer data, each event takes 2+hdrsize+len(stk)
+// words: the value 2+hdrsize+len(stk), then the time of the event, then
+// hdrsize words giving the fixed-size header, and then len(stk) words
+// for the stack.
+//
+// The current effective offsets into the tags and data circular buffers
+// for reading and writing are stored in the high 30 and low 32 bits of r and w.
+// The bottom bits of the high 32 are additional flag bits in w, unused in r.
+// "Effective" offsets means the total number of reads or writes, mod 2^length.
+// The offset in the buffer is the effective offset mod the length of the buffer.
+// To make wraparound mod 2^length match wraparound mod length of the buffer,
+// the length of the buffer must be a power of two.
+//
+// If the reader catches up to the writer, a flag passed to read controls
+// whether the read blocks until more data is available. A read returns a
+// pointer to the buffer data itself; the caller is assumed to be done with
+// that data at the next read. The read offset rNext tracks the next offset to
+// be returned by read. By definition, r ≤ rNext ≤ w (before wraparound),
+// and rNext is only used by the reader, so it can be accessed without atomics.
+//
+// If the writer gets ahead of the reader, so that the buffer fills,
+// future writes are discarded and replaced in the output stream by an
+// overflow entry, which has size 2+hdrsize+1, time set to the time of
+// the first discarded write, a header of all zeroed words, and a "stack"
+// containing one word, the number of discarded writes.
+//
+// Between the time the buffer fills and the buffer becomes empty enough
+// to hold more data, the overflow entry is stored as a pending overflow
+// entry in the fields overflow and overflowTime. The pending overflow
+// entry can be turned into a real record by either the writer or the
+// reader. If the writer is called to write a new record and finds that
+// the output buffer has room for both the pending overflow entry and the
+// new record, the writer emits the pending overflow entry and the new
+// record into the buffer. If the reader is called to read data and finds
+// that the output buffer is empty but that there is a pending overflow
+// entry, the reader will return a synthesized record for the pending
+// overflow entry.
+//
+// Only the writer can create or add to a pending overflow entry, but
+// either the reader or the writer can clear the pending overflow entry.
+// A pending overflow entry is indicated by the low 32 bits of 'overflow'
+// holding the number of discarded writes, and overflowTime holding the
+// time of the first discarded write. The high 32 bits of 'overflow'
+// increment each time the low 32 bits transition from zero to non-zero
+// or vice versa. This sequence number avoids ABA problems in the use of
+// compare-and-swap to coordinate between reader and writer.
+// The overflowTime is only written when the low 32 bits of overflow are
+// zero, that is, only when there is no pending overflow entry, in
+// preparation for creating a new one. The reader can therefore fetch and
+// clear the entry atomically using
+//
+// for {
+// overflow = load(&b.overflow)
+// if uint32(overflow) == 0 {
+// // no pending entry
+// break
+// }
+// time = load(&b.overflowTime)
+// if cas(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
+// // pending entry cleared
+// break
+// }
+// }
+// if uint32(overflow) > 0 {
+// emit entry for uint32(overflow), time
+// }
+//
+type profBuf struct {
+ // accessed atomically
+ r, w profAtomic
+ overflow uint64
+ overflowTime uint64
+ eof uint32
+
+ // immutable (excluding slice content)
+ hdrsize uintptr
+ data []uint64
+ tags []unsafe.Pointer
+
+ // owned by reader
+ rNext profIndex
+ overflowBuf []uint64 // for use by reader to return overflow record
+ wait note
+}
+
+// A profAtomic is the atomically-accessed word holding a profIndex.
+type profAtomic uint64
+
+// A profIndex is the packed tag and data counts and flag bits, described above.
+type profIndex uint64
+
+const (
+ profReaderSleeping profIndex = 1 << 32 // reader is sleeping and must be woken up
+ profWriteExtra profIndex = 1 << 33 // overflow or eof waiting
+)
+
+func (x *profAtomic) load() profIndex {
+ return profIndex(atomic.Load64((*uint64)(x)))
+}
+
+func (x *profAtomic) store(new profIndex) {
+ atomic.Store64((*uint64)(x), uint64(new))
+}
+
+func (x *profAtomic) cas(old, new profIndex) bool {
+ return atomic.Cas64((*uint64)(x), uint64(old), uint64(new))
+}
+
+func (x profIndex) dataCount() uint32 {
+ return uint32(x)
+}
+
+func (x profIndex) tagCount() uint32 {
+ return uint32(x >> 34)
+}
+
+// countSub subtracts two counts obtained from profIndex.dataCount or profIndex.tagCount,
+// assuming that they are no more than 2^29 apart (guaranteed since they are never more than
+// len(data) or len(tags) apart, respectively).
+// tagCount wraps at 2^30, while dataCount wraps at 2^32.
+// This function works for both.
+func countSub(x, y uint32) int {
+ // x-y is 32-bit signed or 30-bit signed; sign-extend to 32 bits and convert to int.
+ return int(int32(x-y) << 2 >> 2)
+}
+
+// addCountsAndClearFlags returns the packed form of "x + (data, tag) - all flags".
+func (x profIndex) addCountsAndClearFlags(data, tag int) profIndex {
+ return profIndex((uint64(x)>>34+uint64(uint32(tag)<<2>>2))<<34 | uint64(uint32(x)+uint32(data)))
+}
+
+// hasOverflow reports whether b has any overflow records pending.
+func (b *profBuf) hasOverflow() bool {
+ return uint32(atomic.Load64(&b.overflow)) > 0
+}
+
+// takeOverflow consumes the pending overflow records, returning the overflow count
+// and the time of the first overflow.
+// When called by the reader, it is racing against incrementOverflow.
+func (b *profBuf) takeOverflow() (count uint32, time uint64) {
+ overflow := atomic.Load64(&b.overflow)
+ time = atomic.Load64(&b.overflowTime)
+ for {
+ count = uint32(overflow)
+ if count == 0 {
+ time = 0
+ break
+ }
+ // Increment generation, clear overflow count in low bits.
+ if atomic.Cas64(&b.overflow, overflow, ((overflow>>32)+1)<<32) {
+ break
+ }
+ overflow = atomic.Load64(&b.overflow)
+ time = atomic.Load64(&b.overflowTime)
+ }
+ return uint32(overflow), time
+}
+
+// incrementOverflow records a single overflow at time now.
+// It is racing against a possible takeOverflow in the reader.
+func (b *profBuf) incrementOverflow(now int64) {
+ for {
+ overflow := atomic.Load64(&b.overflow)
+
+ // Once we see b.overflow reach 0, it's stable: no one else is changing it underfoot.
+ // We need to set overflowTime if we're incrementing b.overflow from 0.
+ if uint32(overflow) == 0 {
+ // Store overflowTime first so it's always available when overflow != 0.
+ atomic.Store64(&b.overflowTime, uint64(now))
+ atomic.Store64(&b.overflow, (((overflow>>32)+1)<<32)+1)
+ break
+ }
+ // Otherwise we're racing to increment against reader
+ // who wants to set b.overflow to 0.
+		// Out of paranoia, leave 2³²-1 as a sticky overflow value
+		// to avoid wrapping around. This is extremely unlikely.
+ if int32(overflow) == -1 {
+ break
+ }
+ if atomic.Cas64(&b.overflow, overflow, overflow+1) {
+ break
+ }
+ }
+}
+
+// newProfBuf returns a new profiling buffer with room for
+// a header of hdrsize words per event, a data buffer of at least
+// bufwords words, and a tag buffer of at least tags entries.
+func newProfBuf(hdrsize, bufwords, tags int) *profBuf {
+ if min := 2 + hdrsize + 1; bufwords < min {
+ bufwords = min
+ }
+
+ // Buffer sizes must be power of two, so that we don't have to
+ // worry about uint32 wraparound changing the effective position
+ // within the buffers. We store 30 bits of count; limiting to 28
+ // gives us some room for intermediate calculations.
+ if bufwords >= 1<<28 || tags >= 1<<28 {
+ throw("newProfBuf: buffer too large")
+ }
+ var i int
+ for i = 1; i < bufwords; i <<= 1 {
+ }
+ bufwords = i
+ for i = 1; i < tags; i <<= 1 {
+ }
+ tags = i
+
+ b := new(profBuf)
+ b.hdrsize = uintptr(hdrsize)
+ b.data = make([]uint64, bufwords)
+ b.tags = make([]unsafe.Pointer, tags)
+ b.overflowBuf = make([]uint64, 2+b.hdrsize+1)
+ return b
+}
+
+// canWriteRecord reports whether the buffer has room
+// for a single contiguous record with a stack of length nstk.
+func (b *profBuf) canWriteRecord(nstk int) bool {
+ br := b.r.load()
+ bw := b.w.load()
+
+ // room for tag?
+ if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 1 {
+ return false
+ }
+
+ // room for data?
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+ want := 2 + int(b.hdrsize) + nstk
+ i := int(bw.dataCount() % uint32(len(b.data)))
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ }
+ return nd >= want
+}
+
+// canWriteTwoRecords reports whether the buffer has room
+// for two records with stack lengths nstk1, nstk2, in that order.
+// Each record must be contiguous on its own, but the two
+// records need not be contiguous (one can be at the end of the buffer
+// and the other can wrap around and start at the beginning of the buffer).
+func (b *profBuf) canWriteTwoRecords(nstk1, nstk2 int) bool {
+ br := b.r.load()
+ bw := b.w.load()
+
+ // room for tag?
+ if countSub(br.tagCount(), bw.tagCount())+len(b.tags) < 2 {
+ return false
+ }
+
+ // room for data?
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+
+ // first record
+ want := 2 + int(b.hdrsize) + nstk1
+ i := int(bw.dataCount() % uint32(len(b.data)))
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ i = 0
+ }
+ i += want
+ nd -= want
+
+ // second record
+ want = 2 + int(b.hdrsize) + nstk2
+ if i+want > len(b.data) {
+ // Can't fit in trailing fragment of slice.
+ // Skip over that and start over at beginning of slice.
+ nd -= len(b.data) - i
+ i = 0
+ }
+ return nd >= want
+}
+
+// write writes an entry to the profiling buffer b.
+// The entry begins with a fixed hdr, which must have
+// length b.hdrsize, followed by a variable-sized stack
+// and a single tag pointer *tagPtr (or nil if tagPtr is nil).
+// No write barriers allowed because this might be called from a signal handler.
+func (b *profBuf) write(tagPtr *unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ if b == nil {
+ return
+ }
+ if len(hdr) > int(b.hdrsize) {
+ throw("misuse of profBuf.write")
+ }
+
+ if hasOverflow := b.hasOverflow(); hasOverflow && b.canWriteTwoRecords(1, len(stk)) {
+ // Room for both an overflow record and the one being written.
+ // Write the overflow record if the reader hasn't gotten to it yet.
+ // Only racing against reader, not other writers.
+ count, time := b.takeOverflow()
+ if count > 0 {
+ var stk [1]uintptr
+ stk[0] = uintptr(count)
+ b.write(nil, int64(time), nil, stk[:])
+ }
+ } else if hasOverflow || !b.canWriteRecord(len(stk)) {
+ // Pending overflow without room to write overflow and new records
+ // or no overflow but also no room for new record.
+ b.incrementOverflow(now)
+ b.wakeupExtra()
+ return
+ }
+
+ // There's room: write the record.
+ br := b.r.load()
+ bw := b.w.load()
+
+ // Profiling tag
+ //
+ // The tag is a pointer, but we can't run a write barrier here.
+ // We have interrupted the OS-level execution of gp, but the
+ // runtime still sees gp as executing. In effect, we are running
+ // in place of the real gp. Since gp is the only goroutine that
+ // can overwrite gp.labels, the value of gp.labels is stable during
+ // this signal handler: it will still be reachable from gp when
+ // we finish executing. If a GC is in progress right now, it must
+ // keep gp.labels alive, because gp.labels is reachable from gp.
+ // If gp were to overwrite gp.labels, the deletion barrier would
+ // still shade that pointer, which would preserve it for the
+ // in-progress GC, so all is well. Any future GC will see the
+ // value we copied when scanning b.tags (heap-allocated).
+ // We arrange that the store here is always overwriting a nil,
+ // so there is no need for a deletion barrier on b.tags[wt].
+ wt := int(bw.tagCount() % uint32(len(b.tags)))
+ if tagPtr != nil {
+ *(*uintptr)(unsafe.Pointer(&b.tags[wt])) = uintptr(unsafe.Pointer(*tagPtr))
+ }
+
+ // Main record.
+ // It has to fit in a contiguous section of the slice, so if it doesn't fit at the end,
+ // leave a rewind marker (0) and start over at the beginning of the slice.
+ wd := int(bw.dataCount() % uint32(len(b.data)))
+ nd := countSub(br.dataCount(), bw.dataCount()) + len(b.data)
+ skip := 0
+ if wd+2+int(b.hdrsize)+len(stk) > len(b.data) {
+ b.data[wd] = 0
+ skip = len(b.data) - wd
+ nd -= skip
+ wd = 0
+ }
+ data := b.data[wd:]
+ data[0] = uint64(2 + b.hdrsize + uintptr(len(stk))) // length
+ data[1] = uint64(now) // time stamp
+ // header, zero-padded
+ i := uintptr(copy(data[2:2+b.hdrsize], hdr))
+ for ; i < b.hdrsize; i++ {
+ data[2+i] = 0
+ }
+ for i, pc := range stk {
+ data[2+b.hdrsize+uintptr(i)] = uint64(pc)
+ }
+
+ for {
+ // Commit write.
+ // Racing with reader setting flag bits in b.w, to avoid lost wakeups.
+ old := b.w.load()
+ new := old.addCountsAndClearFlags(skip+2+len(stk)+int(b.hdrsize), 1)
+ if !b.w.cas(old, new) {
+ continue
+ }
+ // If there was a reader, wake it up.
+ if old&profReaderSleeping != 0 {
+ notewakeup(&b.wait)
+ }
+ break
+ }
+}
+
+// close signals that there will be no more writes on the buffer.
+// Once all the data has been read from the buffer, reads will return eof=true.
+func (b *profBuf) close() {
+ if atomic.Load(&b.eof) > 0 {
+ throw("runtime: profBuf already closed")
+ }
+ atomic.Store(&b.eof, 1)
+ b.wakeupExtra()
+}
+
+// wakeupExtra must be called after setting one of the "extra"
+// atomic fields b.overflow or b.eof.
+// It records the change in b.w and wakes up the reader if needed.
+func (b *profBuf) wakeupExtra() {
+ for {
+ old := b.w.load()
+ new := old | profWriteExtra
+ if !b.w.cas(old, new) {
+ continue
+ }
+ if old&profReaderSleeping != 0 {
+ notewakeup(&b.wait)
+ }
+ break
+ }
+}
+
+// profBufReadMode specifies whether to block when no data is available to read.
+type profBufReadMode int
+
+const (
+ profBufBlocking profBufReadMode = iota
+ profBufNonBlocking
+)
+
+var overflowTag [1]unsafe.Pointer // always nil
+
+func (b *profBuf) read(mode profBufReadMode) (data []uint64, tags []unsafe.Pointer, eof bool) {
+ if b == nil {
+ return nil, nil, true
+ }
+
+ br := b.rNext
+
+ // Commit previous read, returning that part of the ring to the writer.
+ // First clear tags that have now been read, both to avoid holding
+ // up the memory they point at for longer than necessary
+ // and so that b.write can assume it is always overwriting
+ // nil tag entries (see comment in b.write).
+ rPrev := b.r.load()
+ if rPrev != br {
+ ntag := countSub(br.tagCount(), rPrev.tagCount())
+ ti := int(rPrev.tagCount() % uint32(len(b.tags)))
+ for i := 0; i < ntag; i++ {
+ b.tags[ti] = nil
+ if ti++; ti == len(b.tags) {
+ ti = 0
+ }
+ }
+ b.r.store(br)
+ }
+
+Read:
+ bw := b.w.load()
+ numData := countSub(bw.dataCount(), br.dataCount())
+ if numData == 0 {
+ if b.hasOverflow() {
+ // No data to read, but there is overflow to report.
+ // Racing with writer flushing b.overflow into a real record.
+ count, time := b.takeOverflow()
+ if count == 0 {
+ // Lost the race, go around again.
+ goto Read
+ }
+ // Won the race, report overflow.
+ dst := b.overflowBuf
+ dst[0] = uint64(2 + b.hdrsize + 1)
+ dst[1] = uint64(time)
+ for i := uintptr(0); i < b.hdrsize; i++ {
+ dst[2+i] = 0
+ }
+ dst[2+b.hdrsize] = uint64(count)
+ return dst[:2+b.hdrsize+1], overflowTag[:1], false
+ }
+ if atomic.Load(&b.eof) > 0 {
+ // No data, no overflow, EOF set: done.
+ return nil, nil, true
+ }
+ if bw&profWriteExtra != 0 {
+ // Writer claims to have published extra information (overflow or eof).
+ // Attempt to clear notification and then check again.
+ // If we fail to clear the notification it means b.w changed,
+ // so we still need to check again.
+ b.w.cas(bw, bw&^profWriteExtra)
+ goto Read
+ }
+
+ // Nothing to read right now.
+ // Return or sleep according to mode.
+ if mode == profBufNonBlocking {
+ return nil, nil, false
+ }
+ if !b.w.cas(bw, bw|profReaderSleeping) {
+ goto Read
+ }
+ // Committed to sleeping.
+ notetsleepg(&b.wait, -1)
+ noteclear(&b.wait)
+ goto Read
+ }
+ data = b.data[br.dataCount()%uint32(len(b.data)):]
+ if len(data) > numData {
+ data = data[:numData]
+ } else {
+ numData -= len(data) // available in case of wraparound
+ }
+ skip := 0
+ if data[0] == 0 {
+ // Wraparound record. Go back to the beginning of the ring.
+ skip = len(data)
+ data = b.data
+ if len(data) > numData {
+ data = data[:numData]
+ }
+ }
+
+ ntag := countSub(bw.tagCount(), br.tagCount())
+ if ntag == 0 {
+ throw("runtime: malformed profBuf buffer - tag and data out of sync")
+ }
+ tags = b.tags[br.tagCount()%uint32(len(b.tags)):]
+ if len(tags) > ntag {
+ tags = tags[:ntag]
+ }
+
+ // Count out whole data records until either data or tags is done.
+ // They are always in sync in the buffer, but due to an end-of-slice
+ // wraparound we might need to stop early and return the rest
+ // in the next call.
+ di := 0
+ ti := 0
+ for di < len(data) && data[di] != 0 && ti < len(tags) {
+ if uintptr(di)+uintptr(data[di]) > uintptr(len(data)) {
+ throw("runtime: malformed profBuf buffer - invalid size")
+ }
+ di += int(data[di])
+ ti++
+ }
+
+ // Remember how much we returned, to commit read on next call.
+ b.rNext = br.addCountsAndClearFlags(skip+di, ti)
+
+ if raceenabled {
+ // Match racewritepc in runtime_setProfLabel,
+ // so that the setting of the labels in runtime_setProfLabel
+ // is treated as happening before any use of the labels
+ // by our caller. The synchronization on labelSync itself is a fiction
+ // for the race detector. The actual synchronization is handled
+ // by the fact that the signal handler only reads from the current
+ // goroutine and uses atomics to write the updated queue indices,
+ // and then the read-out from the signal handler buffer uses
+ // atomics to read those queue indices.
+ raceacquire(unsafe.Pointer(&labelSync))
+ }
+
+ return data[:di], tags[:ti], false
+}
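
A standalone check of the wraparound arithmetic in countSub above (the helper is copied here for illustration; tag counts wrap at 2^30, so the shift-by-two sign extension recovers the true distance even after the writer's count wraps):

package main

import "fmt"

// countSub is a copy of the helper above: the signed distance between
// two wrapping counters, assuming they are at most 2^29 apart.
func countSub(x, y uint32) int {
	return int(int32(x-y) << 2 >> 2)
}

func main() {
	// The writer's tag count wrapped past 2^30; the reader's has not.
	w := uint32(5)
	r := uint32(1<<30 - 3)
	fmt.Println(countSub(w, r)) // 8
}
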
diff --git a/src/runtime/profbuf_test.go b/src/runtime/profbuf_test.go
new file mode 100644
index 0000000..d9c5264
--- /dev/null
+++ b/src/runtime/profbuf_test.go
@@ -0,0 +1,182 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "reflect"
+ . "runtime"
+ "testing"
+ "time"
+ "unsafe"
+)
+
+func TestProfBuf(t *testing.T) {
+ const hdrSize = 2
+
+ write := func(t *testing.T, b *ProfBuf, tag unsafe.Pointer, now int64, hdr []uint64, stk []uintptr) {
+ b.Write(&tag, now, hdr, stk)
+ }
+ read := func(t *testing.T, b *ProfBuf, data []uint64, tags []unsafe.Pointer) {
+ rdata, rtags, eof := b.Read(ProfBufNonBlocking)
+ if !reflect.DeepEqual(rdata, data) || !reflect.DeepEqual(rtags, tags) {
+ t.Fatalf("unexpected profile read:\nhave data %#x\nwant data %#x\nhave tags %#x\nwant tags %#x", rdata, data, rtags, tags)
+ }
+ if eof {
+ t.Fatalf("unexpected eof")
+ }
+ }
+ readBlock := func(t *testing.T, b *ProfBuf, data []uint64, tags []unsafe.Pointer) func() {
+ c := make(chan int)
+ go func() {
+ eof := data == nil
+ rdata, rtags, reof := b.Read(ProfBufBlocking)
+ if !reflect.DeepEqual(rdata, data) || !reflect.DeepEqual(rtags, tags) || reof != eof {
+ // Errorf, not Fatalf, because called in goroutine.
+ t.Errorf("unexpected profile read:\nhave data %#x\nwant data %#x\nhave tags %#x\nwant tags %#x\nhave eof=%v, want %v", rdata, data, rtags, tags, reof, eof)
+ }
+ c <- 1
+ }()
+ time.Sleep(10 * time.Millisecond) // let goroutine run and block
+ return func() {
+ select {
+ case <-c:
+ case <-time.After(1 * time.Second):
+ t.Fatalf("timeout waiting for blocked read")
+ }
+ }
+ }
+ readEOF := func(t *testing.T, b *ProfBuf) {
+ rdata, rtags, eof := b.Read(ProfBufBlocking)
+ if rdata != nil || rtags != nil || !eof {
+ t.Errorf("unexpected profile read: %#x, %#x, eof=%v; want nil, nil, eof=true", rdata, rtags, eof)
+ }
+ rdata, rtags, eof = b.Read(ProfBufNonBlocking)
+ if rdata != nil || rtags != nil || !eof {
+ t.Errorf("unexpected profile read (non-blocking): %#x, %#x, eof=%v; want nil, nil, eof=true", rdata, rtags, eof)
+ }
+ }
+
+ myTags := make([]byte, 100)
+ t.Logf("myTags is %p", &myTags[0])
+
+ t.Run("BasicWriteRead", func(t *testing.T) {
+ b := NewProfBuf(2, 11, 1)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadMany", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204, 5, 500, 502, 504, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2]), unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadManyShortData", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadManyShortTags", func(t *testing.T) {
+ b := NewProfBuf(2, 50, 50)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[0]), unsafe.Pointer(&myTags[2])})
+ })
+
+ t.Run("ReadAfterOverflow1", func(t *testing.T) {
+ // overflow record synthesized by write
+ b := NewProfBuf(2, 16, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9}) // uses 10
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 10 but still in use until next read
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5}) // uses 6
+ read(t, b, []uint64{6, 1, 2, 3, 4, 5}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 6 but still in use until next read
+ // now 10 available
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204, 205, 206, 207, 208, 209}) // no room
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), int64(100+i), []uint64{101, 102}, []uintptr{201, 202, 203, 204}) // no room for overflow+this record
+ }
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506}) // room for overflow+this record
+ read(t, b, []uint64{5, 99, 0, 0, 300, 5, 500, 502, 504, 506}, []unsafe.Pointer{nil, unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadAfterOverflow2", func(t *testing.T) {
+ // overflow record synthesized by read
+ b := NewProfBuf(2, 16, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213})
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), 100, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ }
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])}) // reads 10 but still in use until next read
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{}) // still overflow
+ read(t, b, []uint64{5, 99, 0, 0, 301}, []unsafe.Pointer{nil}) // overflow synthesized by read
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 505}, []uintptr{506}) // written
+ read(t, b, []uint64{5, 500, 502, 505, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("ReadAtEndAfterOverflow", func(t *testing.T) {
+ b := NewProfBuf(2, 12, 5)
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ for i := 0; i < 299; i++ {
+ write(t, b, unsafe.Pointer(&myTags[3]), 100, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ }
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, []uint64{5, 99, 0, 0, 300}, []unsafe.Pointer{nil})
+ write(t, b, unsafe.Pointer(&myTags[1]), 500, []uint64{502, 504}, []uintptr{506})
+ read(t, b, []uint64{5, 500, 502, 504, 506}, []unsafe.Pointer{unsafe.Pointer(&myTags[1])})
+ })
+
+ t.Run("BlockingWriteRead", func(t *testing.T) {
+ b := NewProfBuf(2, 11, 1)
+ wait := readBlock(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ wait()
+ wait = readBlock(t, b, []uint64{8, 99, 101, 102, 201, 202, 203, 204}, []unsafe.Pointer{unsafe.Pointer(&myTags[2])})
+ time.Sleep(10 * time.Millisecond)
+ write(t, b, unsafe.Pointer(&myTags[2]), 99, []uint64{101, 102}, []uintptr{201, 202, 203, 204})
+ wait()
+ wait = readBlock(t, b, nil, nil)
+ b.Close()
+ wait()
+ wait = readBlock(t, b, nil, nil)
+ wait()
+ readEOF(t, b)
+ })
+
+ t.Run("DataWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 16, 1024)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+
+ t.Run("TagWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 1024, 2)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+
+ t.Run("BothWraparound", func(t *testing.T) {
+ b := NewProfBuf(2, 16, 2)
+ for i := 0; i < 10; i++ {
+ write(t, b, unsafe.Pointer(&myTags[0]), 1, []uint64{2, 3}, []uintptr{4, 5, 6, 7, 8, 9})
+ read(t, b, []uint64{10, 1, 2, 3, 4, 5, 6, 7, 8, 9}, []unsafe.Pointer{unsafe.Pointer(&myTags[0])})
+ read(t, b, nil, nil) // release data returned by previous read
+ }
+ })
+}
diff --git a/src/runtime/proflabel.go b/src/runtime/proflabel.go
new file mode 100644
index 0000000..1b41a8a
--- /dev/null
+++ b/src/runtime/proflabel.go
@@ -0,0 +1,25 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+var labelSync uintptr
+
+//go:linkname runtime_setProfLabel runtime/pprof.runtime_setProfLabel
+func runtime_setProfLabel(labels unsafe.Pointer) {
+ // Introduce race edge for read-back via profile.
+ // This would more properly use &getg().labels as the sync address,
+ // but we do the read in a signal handler and can't call the race runtime then.
+ if raceenabled {
+ racerelease(unsafe.Pointer(&labelSync))
+ }
+ getg().labels = labels
+}
+
+//go:linkname runtime_getProfLabel runtime/pprof.runtime_getProfLabel
+func runtime_getProfLabel() unsafe.Pointer {
+ return getg().labels
+}
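
Context note (illustrative, not part of the patch): runtime_setProfLabel and runtime_getProfLabel above are the go:linkname hooks that runtime/pprof uses to attach labels to the current goroutine. A minimal user-level sketch of how those hooks end up being exercised through the public runtime/pprof API; the label key/value here is made up for illustration:

package main

import (
	"context"
	"runtime/pprof"
)

func work(ctx context.Context) {
	for i := 0; i < 1e6; i++ {
		_ = i * i
	}
}

func main() {
	ctx := context.Background()
	// pprof.Do attaches the labels for the duration of the callback
	// (via SetGoroutineLabels, which is what reaches runtime_setProfLabel);
	// CPU samples taken while work runs carry the worker=indexer label.
	pprof.Do(ctx, pprof.Labels("worker", "indexer"), func(ctx context.Context) {
		work(ctx)
	})
}
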
diff --git a/src/runtime/race.go b/src/runtime/race.go
index d8483c0..49495cc 100644
--- a/src/runtime/race.go
+++ b/src/runtime/race.go
@@ -17,9 +17,6 @@
func RaceReadRange(addr unsafe.Pointer, len int)
func RaceWriteRange(addr unsafe.Pointer, len int)
-func RaceSemacquire(s *uint32)
-func RaceSemrelease(s *uint32)
-
func RaceErrors() int {
var n uint64
racecall(&__tsan_report_count, uintptr(unsafe.Pointer(&n)), 0, 0, 0)
@@ -101,7 +98,7 @@
if f != nil {
file, line := f.FileLine(ctx.pc)
if line != 0 {
- ctx.fn = cfuncname(f.raw())
+ ctx.fn = cfuncname(f.funcInfo())
ctx.line = uintptr(line)
ctx.file = &bytes(file)[0] // assume NUL-terminated
ctx.off = ctx.pc - f.Entry()
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
index 587540f..e73e6b3 100644
--- a/src/runtime/race/output_test.go
+++ b/src/runtime/race/output_test.go
@@ -181,10 +181,12 @@
}()
x = 43
<-done
+ t.Log(t.Failed())
}
`, `
==================
--- FAIL: TestFail \(0...s\)
+.*main_test.go:13: true
.*testing.go:.*: race detected during execution of test
FAIL`},
diff --git a/src/runtime/race/race_test.go b/src/runtime/race/race_test.go
index 8cdf52d..a0b8531 100644
--- a/src/runtime/race/race_test.go
+++ b/src/runtime/race/race_test.go
@@ -68,7 +68,7 @@
}
if totalTests == 0 {
- t.Fatalf("failed to parse test output")
+ t.Fatalf("failed to parse test output:\n%s", testOutput)
}
fmt.Printf("\nPassed %d of %d tests (%.02f%%, %d+, %d-)\n",
passedTests, totalTests, 100*float64(passedTests)/float64(totalTests), falsePos, falseNeg)
diff --git a/src/runtime/rand_test.go b/src/runtime/rand_test.go
new file mode 100644
index 0000000..f8831b0
--- /dev/null
+++ b/src/runtime/rand_test.go
@@ -0,0 +1,45 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "strconv"
+ "testing"
+)
+
+func BenchmarkFastrand(b *testing.B) {
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ Fastrand()
+ }
+ })
+}
+
+func BenchmarkFastrandHashiter(b *testing.B) {
+ var m = make(map[int]int, 10)
+ for i := 0; i < 10; i++ {
+ m[i] = i
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ for pb.Next() {
+ for _ = range m {
+ break
+ }
+ }
+ })
+}
+
+var sink32 uint32
+
+func BenchmarkFastrandn(b *testing.B) {
+ for n := uint32(2); n <= 5; n++ {
+ b.Run(strconv.Itoa(int(n)), func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ sink32 = Fastrandn(n)
+ }
+ })
+ }
+}
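
Aside (illustrative, not part of the patch): the benchmarks above exercise fastrand and the bounded variant fastrandn, which select.go below now uses to shuffle the poll order. A self-contained sketch of the usual multiply-and-shift reduction such a bounded helper relies on; whether the runtime's version matches this exactly is not shown in this patch:

package main

import (
	"fmt"
	"math/rand"
)

// boundedRand returns a pseudo-random number in [0, n) without a modulo,
// using the multiply-and-shift reduction on a 32-bit random value.
func boundedRand(n uint32) uint32 {
	x := rand.Uint32()
	return uint32(uint64(x) * uint64(n) >> 32)
}

func main() {
	counts := make([]int, 5)
	for i := 0; i < 100000; i++ {
		counts[boundedRand(5)]++
	}
	fmt.Println(counts) // roughly uniform across the 5 buckets
}
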
diff --git a/src/runtime/relax_stub.go b/src/runtime/relax_stub.go
new file mode 100644
index 0000000..81ed129
--- /dev/null
+++ b/src/runtime/relax_stub.go
@@ -0,0 +1,17 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !windows
+
+package runtime
+
+// osRelaxMinNS is the number of nanoseconds of idleness to tolerate
+// without performing an osRelax. Since osRelax may reduce the
+// precision of timers, this should be enough larger than the relaxed
+// timer precision to keep the timer error acceptable.
+const osRelaxMinNS = 0
+
+// osRelax is called by the scheduler when transitioning to and from
+// all Ps being idle.
+func osRelax(relax bool) {}
diff --git a/src/runtime/rt0_linux_mips64x.s b/src/runtime/rt0_linux_mips64x.s
index beb4ef2..0891c68 100644
--- a/src/runtime/rt0_linux_mips64x.s
+++ b/src/runtime/rt0_linux_mips64x.s
@@ -30,7 +30,7 @@
// in external linking, glibc jumps to main with argc in R4
// and argv in R5
- // initalize REGSB = PC&0xffffffff00000000
+ // initialize REGSB = PC&0xffffffff00000000
BGEZAL R0, 1(PC)
SRLV $32, R31, RSB
SLLV $32, RSB
diff --git a/src/runtime/rt0_linux_ppc64le.s b/src/runtime/rt0_linux_ppc64le.s
index 2c55413..81b9913 100644
--- a/src/runtime/rt0_linux_ppc64le.s
+++ b/src/runtime/rt0_linux_ppc64le.s
@@ -8,6 +8,8 @@
// Start with standard C stack frame layout and linkage.
MOVD LR, R0
MOVD R0, 16(R1) // Save LR in caller's frame.
+ MOVW CR, R0 // Save CR in caller's frame
+ MOVD R0, 8(R1)
MOVD R2, 24(R1) // Save TOC in caller's frame.
MOVDU R1, -320(R1) // Allocate frame.
@@ -53,6 +55,9 @@
MOVD R4, _rt0_ppc64le_linux_lib_argv<>(SB)
// Synchronous initialization.
+ MOVD $runtime·reginit(SB), R12
+ MOVD R12, CTR
+ BL (CTR)
MOVD $runtime·libpreinit(SB), R12
MOVD R12, CTR
BL (CTR)
@@ -117,6 +122,8 @@
ADD $320, R1
MOVD 24(R1), R2
+ MOVD 8(R1), R0
+ MOVFL R0, $0xff
MOVD 16(R1), R0
MOVD R0, LR
RET
diff --git a/src/runtime/runtime-gdb.py b/src/runtime/runtime-gdb.py
index 5c9b2a0..dd1f79b 100644
--- a/src/runtime/runtime-gdb.py
+++ b/src/runtime/runtime-gdb.py
@@ -416,8 +416,37 @@
if ptr['atomicstatus'] == 6: # 'gdead'
continue
if ptr['goid'] == goid:
- return (ptr['sched'][x].cast(vp) for x in ('pc', 'sp'))
- return None, None
+ break
+ else:
+ return None, None
+ # Get the goroutine's saved state.
+ pc, sp = ptr['sched']['pc'], ptr['sched']['sp']
+ # If the goroutine is stopped, sched.sp will be non-0.
+ if sp != 0:
+ return pc.cast(vp), sp.cast(vp)
+ # If the goroutine is in a syscall, use syscallpc/sp.
+ pc, sp = ptr['syscallpc'], ptr['syscallsp']
+ if sp != 0:
+ return pc.cast(vp), sp.cast(vp)
+ # Otherwise, the goroutine is running, so it doesn't have
+ # saved scheduler state. Find G's OS thread.
+ m = ptr['m']
+ if m == 0:
+ return None, None
+ for thr in gdb.selected_inferior().threads():
+ if thr.ptid[1] == m['procid']:
+ break
+ else:
+ return None, None
+ # Get scheduler state from the G's OS thread state.
+ curthr = gdb.selected_thread()
+ try:
+ thr.switch()
+ pc = gdb.parse_and_eval('$pc')
+ sp = gdb.parse_and_eval('$sp')
+ finally:
+ curthr.switch()
+ return pc.cast(vp), sp.cast(vp)
class GoroutineCmd(gdb.Command):
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index f886961..1318bab 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -56,6 +56,10 @@
}
func checkGdbPython(t *testing.T) {
+ if runtime.GOOS == "solaris" && testenv.Builder() != "solaris-amd64-smartosbuildlet" {
+ t.Skip("skipping gdb python tests on solaris; see golang.org/issue/20821")
+ }
+
cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
out, err := cmd.CombinedOutput()
@@ -69,6 +73,7 @@
const helloSource = `
import "fmt"
+import "runtime"
var gslice []string
func main() {
mapvar := make(map[string]string,5)
@@ -78,9 +83,10 @@
ptrvar := &strvar
slicevar := make([]string, 0, 16)
slicevar = append(slicevar, mapvar["abc"])
- fmt.Println("hi") // line 12
+ fmt.Println("hi") // line 13
_ = ptrvar
gslice = slicevar
+ runtime.KeepAlive(mapvar)
}
`
@@ -89,13 +95,13 @@
}
func TestGdbPythonCgo(t *testing.T) {
+ if runtime.GOARCH == "mips" || runtime.GOARCH == "mipsle" || runtime.GOARCH == "mips64" {
+ testenv.SkipFlaky(t, 18784)
+ }
testGdbPython(t, true)
}
func testGdbPython(t *testing.T, cgo bool) {
- if runtime.GOARCH == "mips64" {
- testenv.SkipFlaky(t, 18173)
- }
if cgo && !build.Default.CgoEnabled {
t.Skip("skipping because cgo is not enabled")
}
@@ -152,6 +158,9 @@
"-ex", "info locals",
"-ex", "echo END\n",
"-ex", "down", // back to fmt.Println (goroutine 2 below only works at bottom of stack. TODO: fix that)
+ "-ex", "echo BEGIN goroutine 1 bt\n",
+ "-ex", "goroutine 1 bt",
+ "-ex", "echo END\n",
"-ex", "echo BEGIN goroutine 2 bt\n",
"-ex", "goroutine 2 bt",
"-ex", "echo END\n",
@@ -208,8 +217,13 @@
t.Fatalf("info locals failed: %s", bl)
}
- btGoroutineRe := regexp.MustCompile(`^#0\s+runtime.+at`)
- if bl := blocks["goroutine 2 bt"]; !btGoroutineRe.MatchString(bl) {
+ btGoroutine1Re := regexp.MustCompile(`(?m)^#0\s+(0x[0-9a-f]+\s+in\s+)?fmt\.Println.+at`)
+ if bl := blocks["goroutine 1 bt"]; !btGoroutine1Re.MatchString(bl) {
+ t.Fatalf("goroutine 1 bt failed: %s", bl)
+ }
+
+ btGoroutine2Re := regexp.MustCompile(`(?m)^#0\s+(0x[0-9a-f]+\s+in\s+)?runtime.+at`)
+ if bl := blocks["goroutine 2 bt"]; !btGoroutine2Re.MatchString(bl) {
t.Fatalf("goroutine 2 bt failed: %s", bl)
}
}
@@ -245,9 +259,6 @@
if runtime.GOOS == "netbsd" {
testenv.SkipFlaky(t, 15603)
}
- if runtime.GOARCH == "mips64" {
- testenv.SkipFlaky(t, 18173)
- }
t.Parallel()
checkGdbEnvironment(t)
@@ -319,10 +330,6 @@
// TestGdbAutotmpTypes ensures that types of autotmp variables appear in .debug_info
// See bug #17830.
func TestGdbAutotmpTypes(t *testing.T) {
- if runtime.GOARCH == "mips64" {
- testenv.SkipFlaky(t, 18173)
- }
-
t.Parallel()
checkGdbEnvironment(t)
checkGdbVersion(t)
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 40c0e85..c073348 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -35,15 +35,14 @@
//go:nosplit
func gotraceback() (level int32, all, crash bool) {
_g_ := getg()
- all = _g_.m.throwing > 0
- if _g_.m.traceback != 0 {
- level = int32(_g_.m.traceback)
- return
- }
t := atomic.Load(&traceback_cache)
crash = t&tracebackCrash != 0
- all = all || t&tracebackAll != 0
- level = int32(t >> tracebackShift)
+ all = _g_.m.throwing > 0 || t&tracebackAll != 0
+ if _g_.m.traceback != 0 {
+ level = int32(_g_.m.traceback)
+ } else {
+ level = int32(t >> tracebackShift)
+ }
return
}
@@ -260,6 +259,12 @@
throw("atomicor8")
}
+ m = [4]byte{0xff, 0xff, 0xff, 0xff}
+ atomic.And8(&m[1], 0x1)
+ if m[0] != 0xff || m[1] != 0x1 || m[2] != 0xff || m[3] != 0xff {
+ throw("atomicand8")
+ }
+
*(*uint64)(unsafe.Pointer(&j)) = ^uint64(0)
if j == j {
throw("float64nan")
@@ -313,23 +318,20 @@
// existing int var for that value, which may
// already have an initial value.
var debug struct {
- allocfreetrace int32
- cgocheck int32
- efence int32
- gccheckmark int32
- gcpacertrace int32
- gcshrinkstackoff int32
- gcstackbarrieroff int32
- gcstackbarrierall int32
- gcrescanstacks int32
- gcstoptheworld int32
- gctrace int32
- invalidptr int32
- sbrk int32
- scavenge int32
- scheddetail int32
- schedtrace int32
- wbshadow int32
+ allocfreetrace int32
+ cgocheck int32
+ efence int32
+ gccheckmark int32
+ gcpacertrace int32
+ gcshrinkstackoff int32
+ gcrescanstacks int32
+ gcstoptheworld int32
+ gctrace int32
+ invalidptr int32
+ sbrk int32
+ scavenge int32
+ scheddetail int32
+ schedtrace int32
}
var dbgvars = []dbgVar{
@@ -339,8 +341,6 @@
{"gccheckmark", &debug.gccheckmark},
{"gcpacertrace", &debug.gcpacertrace},
{"gcshrinkstackoff", &debug.gcshrinkstackoff},
- {"gcstackbarrieroff", &debug.gcstackbarrieroff},
- {"gcstackbarrierall", &debug.gcstackbarrierall},
{"gcrescanstacks", &debug.gcrescanstacks},
{"gcstoptheworld", &debug.gcstoptheworld},
{"gctrace", &debug.gctrace},
@@ -349,7 +349,6 @@
{"scavenge", &debug.scavenge},
{"scheddetail", &debug.scheddetail},
{"schedtrace", &debug.schedtrace},
- {"wbshadow", &debug.wbshadow},
}
func parsedebugvars() {
@@ -392,17 +391,6 @@
setTraceback(gogetenv("GOTRACEBACK"))
traceback_env = traceback_cache
- if debug.gcrescanstacks == 0 {
- // Without rescanning, there's no need for stack
- // barriers.
- debug.gcstackbarrieroff = 1
- debug.gcstackbarrierall = 0
- }
-
- if debug.gcstackbarrierall > 0 {
- firstStackBarrierOffset = 0
- }
-
// For cgocheck > 1, we turn on the write barrier at all times
// and check all pointer writes.
if debug.cgocheck > 1 {
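
Aside (illustrative, not part of the patch): the debug struct and dbgvars table above are what GODEBUG options parse into at startup, with the stack-barrier and wbshadow knobs now removed. A hedged sketch of how the remaining knobs surface to users; the specific GODEBUG values are only examples:

// Run with, e.g.:
//   GODEBUG=gctrace=1,schedtrace=1000,scheddetail=1 ./prog
// parsedebugvars reads GODEBUG at startup and fills in the debug struct,
// so gctrace and schedtrace output appears on stderr while this runs.
package main

import (
	"fmt"
	"os"
	"runtime"
)

func main() {
	data := make([][]byte, 0, 1024)
	for i := 0; i < 1024; i++ {
		data = append(data, make([]byte, 1<<16)) // allocate to force a few GC cycles
	}
	runtime.GC()
	fmt.Println("GODEBUG =", os.Getenv("GODEBUG"), "blocks =", len(data))
}
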
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 1ceab0a..6871d9c 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -270,7 +270,7 @@
type sudog struct {
// The following fields are protected by the hchan.lock of the
// channel this sudog is blocking on. shrinkstack depends on
- // this.
+ // this for sudogs involved in channel ops.
g *g
selectdone *uint32 // CAS to 1 to win select race (may point to stack)
@@ -279,25 +279,19 @@
elem unsafe.Pointer // data element (may point to stack)
// The following fields are never accessed concurrently.
- // waitlink is only accessed by g.
+ // For channels, waitlink is only accessed by g.
+ // For semaphores, all fields (including the ones above)
+ // are only accessed when holding a semaRoot lock.
acquiretime int64
releasetime int64
ticket uint32
- waitlink *sudog // g.waiting list
+ parent *sudog // semaRoot binary tree
+ waitlink *sudog // g.waiting list or semaRoot
+ waittail *sudog // semaRoot
c *hchan // channel
}
-type gcstats struct {
- // the struct must consist of only uint64's,
- // because it is casted to uint64[].
- nhandoff uint64
- nhandoffcnt uint64
- nprocyield uint64
- nosyield uint64
- nsleep uint64
-}
-
type libcall struct {
fn uintptr
n uintptr // number of parameters
@@ -323,12 +317,6 @@
hi uintptr
}
-// stkbar records the state of a G's stack barrier.
-type stkbar struct {
- savedLRPtr uintptr // location overwritten by stack barrier PC
- savedLRVal uintptr // value overwritten at savedLRPtr
-}
-
type g struct {
// Stack parameters.
// stack describes the actual stack memory: [stack.lo, stack.hi).
@@ -344,12 +332,9 @@
_panic *_panic // innermost panic - offset known to liblink
_defer *_defer // innermost defer
m *m // current m; offset known to arm liblink
- stackAlloc uintptr // stack allocation is [stack.lo,stack.lo+stackAlloc)
sched gobuf
syscallsp uintptr // if status==Gsyscall, syscallsp = sched.sp to use during gc
syscallpc uintptr // if status==Gsyscall, syscallpc = sched.pc to use during gc
- stkbar []stkbar // stack barriers, from low to high (see top of mstkbar.go)
- stkbarPos uintptr // index of lowest stack barrier not hit
stktopsp uintptr // expected sp at top of stack, to check in traceback
param unsafe.Pointer // passed parameter on wakeup
atomicstatus uint32
@@ -362,7 +347,7 @@
paniconfault bool // panic (instead of crash) on unexpected fault address
preemptscan bool // preempted g does scan for gc
gcscandone bool // g has scanned stack; protected by _Gscan bit in status
- gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan
+ gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan; TODO: remove?
throwsplit bool // must not split stack
raceignore int8 // ignore race detection events
sysblocktraced bool // StartTrace has emitted EvGoInSyscall about this goroutine
@@ -378,18 +363,13 @@
gopc uintptr // pc of go statement that created this goroutine
startpc uintptr // pc of goroutine function
racectx uintptr
- waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
- cgoCtxt []uintptr // cgo traceback context
+ waiting *sudog // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
+ cgoCtxt []uintptr // cgo traceback context
+ labels unsafe.Pointer // profiler labels
+ timer *timer // cached timer for time.Sleep
// Per-G GC state
- // gcRescan is this G's index in work.rescan.list. If this is
- // -1, this G is not on the rescan list.
- //
- // If gcphase != _GCoff and this G is visible to the garbage
- // collector, writes to this are protected by work.rescan.lock.
- gcRescan int32
-
// gcAssistBytes is this G's GC assist credit in terms of
// bytes allocated. If this is positive, then the G has credit
// to allocate gcAssistBytes bytes without assisting. If this
@@ -429,6 +409,7 @@
inwb bool // m is executing a write barrier
newSigstack bool // minit on C thread called sigaltstack
printlock int8
+ incgo bool // m is executing a cgo call
fastrand uint32
ncgocall uint64 // number of cgo calls in total
ncgo int32 // number of cgo calls currently in progress
@@ -445,7 +426,6 @@
fflag uint32 // floating point compare flags
locked uint32 // tracking for lockosthread
nextwaitm uintptr // next m waiting for lock
- gcstats gcstats
needextram bool
traceback uint8
waitunlockf unsafe.Pointer // todo go func(*g, unsafe.pointer) bool
@@ -473,9 +453,10 @@
id int32
status uint32 // one of pidle/prunning/...
link puintptr
- schedtick uint32 // incremented on every scheduler call
- syscalltick uint32 // incremented on every system call
- m muintptr // back-link to associated m (nil if idle)
+ schedtick uint32 // incremented on every scheduler call
+ syscalltick uint32 // incremented on every system call
+ sysmontick sysmontick // last tick observed by sysmon
+ m muintptr // back-link to associated m (nil if idle)
mcache *mcache
racectx uintptr
@@ -510,6 +491,14 @@
tracebuf traceBufPtr
+ // traceSweep indicates the sweep events should be traced.
+ // This is used to defer the sweep start event until a span
+ // has actually been swept.
+ traceSweep bool
+ // traceSwept and traceReclaimed track the number of bytes
+ // swept and reclaimed by sweeping in the current sweep loop.
+ traceSwept, traceReclaimed uintptr
+
palloc persistentAlloc // per-P to avoid mutex
// Per-P GC state
@@ -530,7 +519,7 @@
const (
// The max value of GOMAXPROCS.
// There are no fundamental restrictions on the value.
- _MaxGomaxprocs = 1 << 8
+ _MaxGomaxprocs = 1 << 10
)
type schedt struct {
@@ -607,7 +596,6 @@
_SigThrow // if signal.Notify doesn't take it, exit loudly
_SigPanic // if the signal is from the kernel, panic
_SigDefault // if the signal isn't explicitly requested, don't monitor it
- _SigHandling // our signal handler is registered
_SigGoExit // cause all runtime procs to exit (only used on Plan 9).
_SigSetStack // add SA_ONSTACK to libc handler
_SigUnblock // unblocked in minit
@@ -639,8 +627,10 @@
inter *interfacetype
_type *_type
link *itab
- bad int32
- inhash int32 // has this itab been added to hash?
+ hash uint32 // copy of _type.hash. Used for type switches.
+ bad bool // type does not implement interface
+ inhash bool // has this itab been added to hash?
+ unused [2]byte
fun [1]uintptr // variable sized
}
@@ -704,7 +694,7 @@
// stack traces
type stkframe struct {
- fn *_func // function being run
+ fn funcInfo // function being run
pc uintptr // program counter within fn
continpc uintptr // program counter where execution can continue, or 0 if not
lr uintptr // program counter at caller aka link register
@@ -731,22 +721,30 @@
allm *m
allp [_MaxGomaxprocs + 1]*p
gomaxprocs int32
- panicking uint32
ncpu int32
forcegc forcegcstate
sched schedt
newprocs int32
// Information about what cpu features are available.
- // Set on startup in asm_{x86,amd64}.s.
- cpuid_ecx uint32
- cpuid_edx uint32
- cpuid_ebx7 uint32
- lfenceBeforeRdtsc bool
- support_avx bool
- support_avx2 bool
- support_bmi1 bool
- support_bmi2 bool
+ // Set on startup in asm_{386,amd64,amd64p32}.s.
+ // Packages outside the runtime should not use these
+ // as they are not an external api.
+ processorVersionInfo uint32
+ isIntel bool
+ lfenceBeforeRdtsc bool
+ support_aes bool
+ support_avx bool
+ support_avx2 bool
+ support_bmi1 bool
+ support_bmi2 bool
+ support_erms bool
+ support_osxsave bool
+ support_popcnt bool
+ support_sse2 bool
+ support_sse41 bool
+ support_sse42 bool
+ support_ssse3 bool
goarm uint8 // set by cmd/link on arm systems
framepointer_enabled bool // set by cmd/link
diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index 9febbe6..e9bc256 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -50,6 +50,23 @@
}
}
+var efaceCmp1 interface{}
+var efaceCmp2 interface{}
+
+func BenchmarkEfaceCmpDiff(b *testing.B) {
+ x := 5
+ efaceCmp1 = &x
+ y := 6
+ efaceCmp2 = &y
+ for i := 0; i < b.N; i++ {
+ for j := 0; j < 100; j++ {
+ if efaceCmp1 == efaceCmp2 {
+ b.Fatal("bad comparison")
+ }
+ }
+ }
+}
+
func BenchmarkDefer(b *testing.B) {
for i := 0; i < b.N; i++ {
defer1()
@@ -62,7 +79,6 @@
panic("bad recover")
}
}(1, 2, 3)
- return
}
func BenchmarkDefer10(b *testing.B) {
diff --git a/src/runtime/rwmutex.go b/src/runtime/rwmutex.go
new file mode 100644
index 0000000..7eeb559
--- /dev/null
+++ b/src/runtime/rwmutex.go
@@ -0,0 +1,125 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+)
+
+// This is a copy of sync/rwmutex.go rewritten to work in the runtime.
+
+// An rwmutex is a reader/writer mutual exclusion lock.
+// The lock can be held by an arbitrary number of readers or a single writer.
+// This is a variant of sync.RWMutex, for the runtime package.
+// Like mutex, rwmutex blocks the calling M.
+// It does not interact with the goroutine scheduler.
+type rwmutex struct {
+ rLock mutex // protects readers, readerPass, writer
+ readers muintptr // list of pending readers
+ readerPass uint32 // number of pending readers to skip readers list
+
+ wLock mutex // serializes writers
+ writer muintptr // pending writer waiting for active readers to complete
+
+ readerCount uint32 // number of pending readers
+ readerWait uint32 // number of departing readers
+}
+
+const rwmutexMaxReaders = 1 << 30
+
+// rlock locks rw for reading.
+func (rw *rwmutex) rlock() {
+ // The reader must not be allowed to lose its P or else other
+ // things blocking on the lock may consume all of the Ps and
+ // deadlock (issue #20903). Alternatively, we could drop the P
+ // while sleeping.
+ acquirem()
+ if int32(atomic.Xadd(&rw.readerCount, 1)) < 0 {
+ // A writer is pending. Park on the reader queue.
+ systemstack(func() {
+ lock(&rw.rLock)
+ if rw.readerPass > 0 {
+ // Writer finished.
+ rw.readerPass -= 1
+ unlock(&rw.rLock)
+ } else {
+ // Queue this reader to be woken by
+ // the writer.
+ m := getg().m
+ m.schedlink = rw.readers
+ rw.readers.set(m)
+ unlock(&rw.rLock)
+ notesleep(&m.park)
+ noteclear(&m.park)
+ }
+ })
+ }
+}
+
+// runlock undoes a single rlock call on rw.
+func (rw *rwmutex) runlock() {
+ if r := int32(atomic.Xadd(&rw.readerCount, -1)); r < 0 {
+ if r+1 == 0 || r+1 == -rwmutexMaxReaders {
+ throw("runlock of unlocked rwmutex")
+ }
+ // A writer is pending.
+ if atomic.Xadd(&rw.readerWait, -1) == 0 {
+ // The last reader unblocks the writer.
+ lock(&rw.rLock)
+ w := rw.writer.ptr()
+ if w != nil {
+ notewakeup(&w.park)
+ }
+ unlock(&rw.rLock)
+ }
+ }
+ releasem(getg().m)
+}
+
+// lock locks rw for writing.
+func (rw *rwmutex) lock() {
+ // Resolve competition with other writers and stick to our P.
+ lock(&rw.wLock)
+ m := getg().m
+ // Announce that there is a pending writer.
+ r := int32(atomic.Xadd(&rw.readerCount, -rwmutexMaxReaders)) + rwmutexMaxReaders
+ // Wait for any active readers to complete.
+ lock(&rw.rLock)
+ if r != 0 && atomic.Xadd(&rw.readerWait, r) != 0 {
+ // Wait for reader to wake us up.
+ systemstack(func() {
+ rw.writer.set(m)
+ unlock(&rw.rLock)
+ notesleep(&m.park)
+ noteclear(&m.park)
+ })
+ } else {
+ unlock(&rw.rLock)
+ }
+}
+
+// unlock unlocks rw for writing.
+func (rw *rwmutex) unlock() {
+ // Announce to readers that there is no active writer.
+ r := int32(atomic.Xadd(&rw.readerCount, rwmutexMaxReaders))
+ if r >= rwmutexMaxReaders {
+ throw("unlock of unlocked rwmutex")
+ }
+ // Unblock blocked readers.
+ lock(&rw.rLock)
+ for rw.readers.ptr() != nil {
+ reader := rw.readers.ptr()
+ rw.readers = reader.schedlink
+ reader.schedlink.set(nil)
+ notewakeup(&reader.park)
+ r -= 1
+ }
+ // If r > 0, there are pending readers that aren't on the
+ // queue. Tell them to skip waiting.
+ rw.readerPass += uint32(r)
+ unlock(&rw.rLock)
+ // Allow other writers to proceed.
+ unlock(&rw.wLock)
+}
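
Note on the readerCount arithmetic above (illustrative only, not part of the patch): a writer announces itself by subtracting rwmutexMaxReaders, so readers arriving afterwards observe a negative count and park, while the writer learns from the remainder how many active readers it must wait out. A standalone sketch of just that counting trick, using sync/atomic in ordinary user code:

package main

import (
	"fmt"
	"sync/atomic"
)

const rwmutexMaxReaders = 1 << 30

func main() {
	var readerCount int32

	// Two readers take the lock: the count is simply positive.
	atomic.AddInt32(&readerCount, 1)
	atomic.AddInt32(&readerCount, 1)

	// A writer announces itself. r is the number of active readers
	// it must wait for before it may proceed.
	r := atomic.AddInt32(&readerCount, -rwmutexMaxReaders) + rwmutexMaxReaders
	fmt.Println("writer waits for", r, "readers") // 2

	// A reader arriving now sees a negative count and knows a writer
	// is pending, so it would queue instead of proceeding.
	fmt.Println("late reader parks:", atomic.AddInt32(&readerCount, 1) < 0) // true
}
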
diff --git a/src/runtime/rwmutex_test.go b/src/runtime/rwmutex_test.go
new file mode 100644
index 0000000..a69eca1
--- /dev/null
+++ b/src/runtime/rwmutex_test.go
@@ -0,0 +1,178 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// GOMAXPROCS=10 go test
+
+// This is a copy of sync/rwmutex_test.go rewritten to test the
+// runtime rwmutex.
+
+package runtime_test
+
+import (
+ "fmt"
+ . "runtime"
+ "sync/atomic"
+ "testing"
+)
+
+func parallelReader(m *RWMutex, clocked chan bool, cunlock *uint32, cdone chan bool) {
+ m.RLock()
+ clocked <- true
+ for atomic.LoadUint32(cunlock) == 0 {
+ }
+ m.RUnlock()
+ cdone <- true
+}
+
+func doTestParallelReaders(numReaders int) {
+ GOMAXPROCS(numReaders + 1)
+ var m RWMutex
+ clocked := make(chan bool, numReaders)
+ var cunlock uint32
+ cdone := make(chan bool)
+ for i := 0; i < numReaders; i++ {
+ go parallelReader(&m, clocked, &cunlock, cdone)
+ }
+ // Wait for all parallel RLock()s to succeed.
+ for i := 0; i < numReaders; i++ {
+ <-clocked
+ }
+ atomic.StoreUint32(&cunlock, 1)
+ // Wait for the goroutines to finish.
+ for i := 0; i < numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestParallelRWMutexReaders(t *testing.T) {
+ defer GOMAXPROCS(GOMAXPROCS(-1))
+ doTestParallelReaders(1)
+ doTestParallelReaders(3)
+ doTestParallelReaders(4)
+}
+
+func reader(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.RLock()
+ n := atomic.AddInt32(activity, 1)
+ if n < 1 || n >= 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -1)
+ rwm.RUnlock()
+ }
+ cdone <- true
+}
+
+func writer(rwm *RWMutex, num_iterations int, activity *int32, cdone chan bool) {
+ for i := 0; i < num_iterations; i++ {
+ rwm.Lock()
+ n := atomic.AddInt32(activity, 10000)
+ if n != 10000 {
+ panic(fmt.Sprintf("wlock(%d)\n", n))
+ }
+ for i := 0; i < 100; i++ {
+ }
+ atomic.AddInt32(activity, -10000)
+ rwm.Unlock()
+ }
+ cdone <- true
+}
+
+func HammerRWMutex(gomaxprocs, numReaders, num_iterations int) {
+ GOMAXPROCS(gomaxprocs)
+ // Number of active readers + 10000 * number of active writers.
+ var activity int32
+ var rwm RWMutex
+ cdone := make(chan bool)
+ go writer(&rwm, num_iterations, &activity, cdone)
+ var i int
+ for i = 0; i < numReaders/2; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ go writer(&rwm, num_iterations, &activity, cdone)
+ for ; i < numReaders; i++ {
+ go reader(&rwm, num_iterations, &activity, cdone)
+ }
+ // Wait for the 2 writers and all readers to finish.
+ for i := 0; i < 2+numReaders; i++ {
+ <-cdone
+ }
+}
+
+func TestRWMutex(t *testing.T) {
+ defer GOMAXPROCS(GOMAXPROCS(-1))
+ n := 1000
+ if testing.Short() {
+ n = 5
+ }
+ HammerRWMutex(1, 1, n)
+ HammerRWMutex(1, 3, n)
+ HammerRWMutex(1, 10, n)
+ HammerRWMutex(4, 1, n)
+ HammerRWMutex(4, 3, n)
+ HammerRWMutex(4, 10, n)
+ HammerRWMutex(10, 1, n)
+ HammerRWMutex(10, 3, n)
+ HammerRWMutex(10, 10, n)
+ HammerRWMutex(10, 5, n)
+}
+
+func BenchmarkRWMutexUncontended(b *testing.B) {
+ type PaddedRWMutex struct {
+ RWMutex
+ pad [32]uint32
+ }
+ b.RunParallel(func(pb *testing.PB) {
+ var rwm PaddedRWMutex
+ for pb.Next() {
+ rwm.RLock()
+ rwm.RLock()
+ rwm.RUnlock()
+ rwm.RUnlock()
+ rwm.Lock()
+ rwm.Unlock()
+ }
+ })
+}
+
+func benchmarkRWMutex(b *testing.B, localWork, writeRatio int) {
+ var rwm RWMutex
+ b.RunParallel(func(pb *testing.PB) {
+ foo := 0
+ for pb.Next() {
+ foo++
+ if foo%writeRatio == 0 {
+ rwm.Lock()
+ rwm.Unlock()
+ } else {
+ rwm.RLock()
+ for i := 0; i != localWork; i += 1 {
+ foo *= 2
+ foo /= 2
+ }
+ rwm.RUnlock()
+ }
+ }
+ _ = foo
+ })
+}
+
+func BenchmarkRWMutexWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 0, 100)
+}
+
+func BenchmarkRWMutexWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 0, 10)
+}
+
+func BenchmarkRWMutexWorkWrite100(b *testing.B) {
+ benchmarkRWMutex(b, 100, 100)
+}
+
+func BenchmarkRWMutexWorkWrite10(b *testing.B) {
+ benchmarkRWMutex(b, 100, 10)
+}
diff --git a/src/runtime/select.go b/src/runtime/select.go
index 0d846b1..715cee8 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -11,11 +11,12 @@
"unsafe"
)
-const (
- debugSelect = false
+const debugSelect = false
+const (
// scase.kind
- caseRecv = iota
+ caseNil = iota
+ caseRecv
caseSend
caseDefault
)
@@ -37,10 +38,9 @@
type scase struct {
elem unsafe.Pointer // data element
c *hchan // chan
- pc uintptr // return pc
+ pc uintptr // return pc (for race detector / msan)
kind uint16
- so uint16 // vararg of selected bool
- receivedp *bool // pointer to received bool (recv2)
+ receivedp *bool // pointer to received bool, if any
releasetime int64
}
@@ -72,92 +72,63 @@
}
}
-//go:nosplit
-func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) (selected bool) {
- // nil cases do not compete
- if c != nil {
- selectsendImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel)))
- }
- return
-}
-
-// cut in half to give stack a chance to split
-func selectsendImpl(sel *hselect, c *hchan, pc uintptr, elem unsafe.Pointer, so uintptr) {
+func selectsend(sel *hselect, c *hchan, elem unsafe.Pointer) {
+ pc := getcallerpc(unsafe.Pointer(&sel))
i := sel.ncase
if i >= sel.tcase {
throw("selectsend: too many cases")
}
sel.ncase = i + 1
+ if c == nil {
+ return
+ }
cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0])))
-
cas.pc = pc
cas.c = c
- cas.so = uint16(so)
cas.kind = caseSend
cas.elem = elem
if debugSelect {
- print("selectsend s=", sel, " pc=", hex(cas.pc), " chan=", cas.c, " so=", cas.so, "\n")
+ print("selectsend s=", sel, " pc=", hex(cas.pc), " chan=", cas.c, "\n")
}
}
-//go:nosplit
-func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer) (selected bool) {
- // nil cases do not compete
- if c != nil {
- selectrecvImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, nil, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel)))
- }
- return
-}
-
-//go:nosplit
-func selectrecv2(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) (selected bool) {
- // nil cases do not compete
- if c != nil {
- selectrecvImpl(sel, c, getcallerpc(unsafe.Pointer(&sel)), elem, received, uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel)))
- }
- return
-}
-
-func selectrecvImpl(sel *hselect, c *hchan, pc uintptr, elem unsafe.Pointer, received *bool, so uintptr) {
+func selectrecv(sel *hselect, c *hchan, elem unsafe.Pointer, received *bool) {
+ pc := getcallerpc(unsafe.Pointer(&sel))
i := sel.ncase
if i >= sel.tcase {
throw("selectrecv: too many cases")
}
sel.ncase = i + 1
+ if c == nil {
+ return
+ }
cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0])))
cas.pc = pc
cas.c = c
- cas.so = uint16(so)
cas.kind = caseRecv
cas.elem = elem
cas.receivedp = received
if debugSelect {
- print("selectrecv s=", sel, " pc=", hex(cas.pc), " chan=", cas.c, " so=", cas.so, "\n")
+ print("selectrecv s=", sel, " pc=", hex(cas.pc), " chan=", cas.c, "\n")
}
}
-//go:nosplit
-func selectdefault(sel *hselect) (selected bool) {
- selectdefaultImpl(sel, getcallerpc(unsafe.Pointer(&sel)), uintptr(unsafe.Pointer(&selected))-uintptr(unsafe.Pointer(&sel)))
- return
-}
-
-func selectdefaultImpl(sel *hselect, callerpc uintptr, so uintptr) {
+func selectdefault(sel *hselect) {
+ pc := getcallerpc(unsafe.Pointer(&sel))
i := sel.ncase
if i >= sel.tcase {
throw("selectdefault: too many cases")
}
sel.ncase = i + 1
cas := (*scase)(add(unsafe.Pointer(&sel.scase), uintptr(i)*unsafe.Sizeof(sel.scase[0])))
- cas.pc = callerpc
+ cas.pc = pc
cas.c = nil
- cas.so = uint16(so)
cas.kind = caseDefault
if debugSelect {
- print("selectdefault s=", sel, " pc=", hex(cas.pc), " so=", cas.so, "\n")
+ print("selectdefault s=", sel, " pc=", hex(cas.pc), "\n")
}
}
@@ -181,14 +152,11 @@
// the G that calls select runnable again and schedules it for execution.
// When the G runs on another M, it locks all the locks and frees sel.
// Now if the first M touches sel, it will access freed memory.
- n := len(scases)
- r := 0
- // skip the default case
- if n > 0 && scases[lockorder[0]].c == nil {
- r = 1
- }
- for i := n - 1; i >= r; i-- {
+ for i := len(scases) - 1; i >= 0; i-- {
c := scases[lockorder[i]].c
+ if c == nil {
+ break
+ }
if i > 0 && c == scases[lockorder[i-1]].c {
continue // will unlock it on the next iteration
}
@@ -229,23 +197,15 @@
// *sel is on the current goroutine's stack (regardless of any
// escaping in selectgo).
//
-// selectgo does not return. Instead, it overwrites its return PC and
-// returns directly to the triggered select case. Because of this, it
-// cannot appear at the top of a split stack.
-//
-//go:nosplit
-func selectgo(sel *hselect) {
- pc, offset := selectgoImpl(sel)
- *(*bool)(add(unsafe.Pointer(&sel), uintptr(offset))) = true
- setcallerpc(unsafe.Pointer(&sel), pc)
-}
-
-// selectgoImpl returns scase.pc and scase.so for the select
-// case which fired.
-func selectgoImpl(sel *hselect) (uintptr, uint16) {
+// selectgo returns the index of the chosen scase, which matches the
+// ordinal position of its respective select{recv,send,default} call.
+func selectgo(sel *hselect) int {
if debugSelect {
print("select: sel=", sel, "\n")
}
+ if sel.ncase != sel.tcase {
+ throw("selectgo: case count mismatch")
+ }
scaseslice := slice{unsafe.Pointer(&sel.scase), int(sel.ncase), int(sel.ncase)}
scases := *(*[]scase)(unsafe.Pointer(&scaseslice))
@@ -270,7 +230,7 @@
pollslice := slice{unsafe.Pointer(sel.pollorder), int(sel.ncase), int(sel.ncase)}
pollorder := *(*[]uint16)(unsafe.Pointer(&pollslice))
for i := 1; i < int(sel.ncase); i++ {
- j := int(fastrand()) % (i + 1)
+ j := fastrandn(uint32(i + 1))
pollorder[i] = pollorder[j]
pollorder[j] = uint16(i)
}
@@ -338,13 +298,19 @@
loop:
// pass 1 - look for something already waiting
+ var dfli int
var dfl *scase
+ var casi int
var cas *scase
for i := 0; i < int(sel.ncase); i++ {
- cas = &scases[pollorder[i]]
+ casi = int(pollorder[i])
+ cas = &scases[casi]
c = cas.c
switch cas.kind {
+ case caseNil:
+ continue
+
case caseRecv:
sg = c.sendq.dequeue()
if sg != nil {
@@ -373,12 +339,14 @@
}
case caseDefault:
+ dfli = casi
dfl = cas
}
}
if dfl != nil {
selunlock(scases, lockorder)
+ casi = dfli
cas = dfl
goto retc
}
@@ -391,7 +359,11 @@
}
nextp = &gp.waiting
for _, casei := range lockorder {
- cas = &scases[casei]
+ casi = int(casei)
+ cas = &scases[casi]
+ if cas.kind == caseNil {
+ continue
+ }
c = cas.c
sg := acquireSudog()
sg.g = gp
@@ -420,7 +392,7 @@
// wait for someone to wake us up
gp.param = nil
- gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 2)
+ gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1)
// While we were asleep, some goroutine came along and completed
// one of the cases in the select and woke us up (called ready).
@@ -485,6 +457,7 @@
// otherwise they stack up on quiet channels
// record the successful case, if any.
// We singly-linked up the SudoGs in lock order.
+ casi = -1
cas = nil
sglist = gp.waiting
// Clear all elem before unlinking from gp.waiting.
@@ -497,11 +470,15 @@
for _, casei := range lockorder {
k = &scases[casei]
+ if k.kind == caseNil {
+ continue
+ }
if sglist.releasetime > 0 {
k.releasetime = sglist.releasetime
}
if sg == sglist {
// sg has already been dequeued by the G that woke us up.
+ casi = int(casei)
cas = k
} else {
c = k.c
@@ -609,7 +586,7 @@
recv:
// can receive from sleeping sender (sg)
- recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
+ recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2)
if debugSelect {
print("syncrecv: sel=", sel, " c=", c, "\n")
}
@@ -640,7 +617,7 @@
if msanenabled {
msanread(cas.elem, c.elemtype.size)
}
- send(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
+ send(c, sg, cas.elem, func() { selunlock(scases, lockorder) }, 2)
if debugSelect {
print("syncsend: sel=", sel, " c=", c, "\n")
}
@@ -648,9 +625,9 @@
retc:
if cas.releasetime > 0 {
- blockevent(cas.releasetime-t0, 2)
+ blockevent(cas.releasetime-t0, 1)
}
- return cas.pc, cas.so
+ return casi
sclose:
// send on closed channel
@@ -694,22 +671,15 @@
rc := &cases[i]
switch rc.dir {
case selectDefault:
- selectdefaultImpl(sel, uintptr(i), 0)
+ selectdefault(sel)
case selectSend:
- if rc.ch == nil {
- break
- }
- selectsendImpl(sel, rc.ch, uintptr(i), rc.val, 0)
+ selectsend(sel, rc.ch, rc.val)
case selectRecv:
- if rc.ch == nil {
- break
- }
- selectrecvImpl(sel, rc.ch, uintptr(i), rc.val, r, 0)
+ selectrecv(sel, rc.ch, rc.val, r)
}
}
- pc, _ := selectgoImpl(sel)
- chosen = int(pc)
+ chosen = selectgo(sel)
recvOK = *r
return
}
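
For illustration (not part of the patch): with selectgo returning the chosen case index, reflect_rselect above simply passes that index through, which is what reflect.Select exposes. A minimal sketch of that public surface; the channels and values are made up:

package main

import (
	"fmt"
	"reflect"
)

func main() {
	ch1 := make(chan int, 1)
	ch2 := make(chan string, 1)
	ch1 <- 42

	cases := []reflect.SelectCase{
		{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(ch1)},
		{Dir: reflect.SelectRecv, Chan: reflect.ValueOf(ch2)},
		{Dir: reflect.SelectDefault},
	}
	// chosen is the index into cases, the same ordinal that selectgo
	// now returns to reflect_rselect.
	chosen, v, ok := reflect.Select(cases)
	fmt.Println(chosen, v.Interface(), ok) // 0 42 true (ch1 is ready)
}
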
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 576a1fb..8715e07 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -27,10 +27,19 @@
// Asynchronous semaphore for sync.Mutex.
+// A semaRoot holds a balanced tree of sudog with distinct addresses (s.elem).
+// Each of those sudog may in turn point (through s.waitlink) to a list
+// of other sudogs waiting on the same address.
+// The operations on the inner lists of sudogs with the same address
+// are all O(1). The scanning of the top-level semaRoot list is O(log n),
+// where n is the number of distinct addresses with goroutines blocked
+// on them that hash to the given semaRoot.
+// See golang.org/issue/17953 for a program that worked badly
+// before we introduced the second level of list, and test/locklinear.go
+// for a test that exercises this.
type semaRoot struct {
lock mutex
- head *sudog
- tail *sudog
+ treap *sudog // root of balanced tree of unique waiters.
nwait uint32 // Number of waiters. Read w/o the lock.
}
@@ -44,26 +53,26 @@
//go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
func sync_runtime_Semacquire(addr *uint32) {
- semacquire(addr, semaBlockProfile)
+ semacquire1(addr, false, semaBlockProfile)
}
-//go:linkname net_runtime_Semacquire net.runtime_Semacquire
-func net_runtime_Semacquire(addr *uint32) {
- semacquire(addr, semaBlockProfile)
+//go:linkname poll_runtime_Semacquire internal/poll.runtime_Semacquire
+func poll_runtime_Semacquire(addr *uint32) {
+ semacquire1(addr, false, semaBlockProfile)
}
//go:linkname sync_runtime_Semrelease sync.runtime_Semrelease
-func sync_runtime_Semrelease(addr *uint32) {
- semrelease(addr)
+func sync_runtime_Semrelease(addr *uint32, handoff bool) {
+ semrelease1(addr, handoff)
}
//go:linkname sync_runtime_SemacquireMutex sync.runtime_SemacquireMutex
-func sync_runtime_SemacquireMutex(addr *uint32) {
- semacquire(addr, semaBlockProfile|semaMutexProfile)
+func sync_runtime_SemacquireMutex(addr *uint32, lifo bool) {
+ semacquire1(addr, lifo, semaBlockProfile|semaMutexProfile)
}
-//go:linkname net_runtime_Semrelease net.runtime_Semrelease
-func net_runtime_Semrelease(addr *uint32) {
+//go:linkname poll_runtime_Semrelease internal/poll.runtime_Semrelease
+func poll_runtime_Semrelease(addr *uint32) {
semrelease(addr)
}
@@ -82,7 +91,11 @@
)
// Called from runtime.
-func semacquire(addr *uint32, profile semaProfileFlags) {
+func semacquire(addr *uint32) {
+ semacquire1(addr, false, 0)
+}
+
+func semacquire1(addr *uint32, lifo bool, profile semaProfileFlags) {
gp := getg()
if gp != gp.m.curg {
throw("semacquire not on the G stack")
@@ -104,6 +117,7 @@
t0 := int64(0)
s.releasetime = 0
s.acquiretime = 0
+ s.ticket = 0
if profile&semaBlockProfile != 0 && blockprofilerate > 0 {
t0 = cputicks()
s.releasetime = -1
@@ -126,9 +140,9 @@
}
// Any semrelease after the cansemacquire knows we're waiting
// (we set nwait above), so go to sleep.
- root.queue(addr, s)
+ root.queue(addr, s, lifo)
goparkunlock(&root.lock, "semacquire", traceEvGoBlockSync, 4)
- if cansemacquire(addr) {
+ if s.ticket != 0 || cansemacquire(addr) {
break
}
}
@@ -139,6 +153,10 @@
}
func semrelease(addr *uint32) {
+ semrelease1(addr, false)
+}
+
+func semrelease1(addr *uint32, handoff bool) {
root := semroot(addr)
atomic.Xadd(addr, 1)
@@ -157,27 +175,22 @@
unlock(&root.lock)
return
}
- s := root.head
- for ; s != nil; s = s.next {
- if s.elem == unsafe.Pointer(addr) {
- atomic.Xadd(&root.nwait, -1)
- root.dequeue(s)
- break
- }
- }
+ s, t0 := root.dequeue(addr)
if s != nil {
- if s.acquiretime != 0 {
- t0 := cputicks()
- for x := root.head; x != nil; x = x.next {
- if x.elem == unsafe.Pointer(addr) {
- x.acquiretime = t0
- }
- }
- mutexevent(t0-s.acquiretime, 3)
- }
+ atomic.Xadd(&root.nwait, -1)
}
unlock(&root.lock)
if s != nil { // May be slow, so unlock first
+ acquiretime := s.acquiretime
+ if acquiretime != 0 {
+ mutexevent(t0-acquiretime, 3)
+ }
+ if s.ticket != 0 {
+ throw("corrupted semaphore ticket")
+ }
+ if handoff && cansemacquire(addr) {
+ s.ticket = 1
+ }
readyWithTime(s, 5)
}
}
@@ -198,33 +211,230 @@
}
}
-func (root *semaRoot) queue(addr *uint32, s *sudog) {
+// queue adds s to the blocked goroutines in semaRoot.
+func (root *semaRoot) queue(addr *uint32, s *sudog, lifo bool) {
s.g = getg()
s.elem = unsafe.Pointer(addr)
s.next = nil
- s.prev = root.tail
- if root.tail != nil {
- root.tail.next = s
- } else {
- root.head = s
+ s.prev = nil
+
+ var last *sudog
+ pt := &root.treap
+ for t := *pt; t != nil; t = *pt {
+ if t.elem == unsafe.Pointer(addr) {
+ // Already have addr in list.
+ if lifo {
+ // Substitute s in t's place in treap.
+ *pt = s
+ s.ticket = t.ticket
+ s.acquiretime = t.acquiretime
+ s.parent = t.parent
+ s.prev = t.prev
+ s.next = t.next
+ if s.prev != nil {
+ s.prev.parent = s
+ }
+ if s.next != nil {
+ s.next.parent = s
+ }
+ // Add t first in s's wait list.
+ s.waitlink = t
+ s.waittail = t.waittail
+ if s.waittail == nil {
+ s.waittail = t
+ }
+ t.parent = nil
+ t.prev = nil
+ t.next = nil
+ t.waittail = nil
+ } else {
+ // Add s to end of t's wait list.
+ if t.waittail == nil {
+ t.waitlink = s
+ } else {
+ t.waittail.waitlink = s
+ }
+ t.waittail = s
+ s.waitlink = nil
+ }
+ return
+ }
+ last = t
+ if uintptr(unsafe.Pointer(addr)) < uintptr(t.elem) {
+ pt = &t.prev
+ } else {
+ pt = &t.next
+ }
}
- root.tail = s
+
+ // Add s as new leaf in tree of unique addrs.
+ // The balanced tree is a treap using ticket as the random heap priority.
+ // That is, it is a binary tree ordered according to the elem addresses,
+ // but then among the space of possible binary trees respecting those
+ // addresses, it is kept balanced on average by maintaining a heap ordering
+ // on the ticket: s.ticket <= both s.prev.ticket and s.next.ticket.
+ // https://en.wikipedia.org/wiki/Treap
+ // http://faculty.washington.edu/aragon/pubs/rst89.pdf
+ s.ticket = fastrand()
+ s.parent = last
+ *pt = s
+
+ // Rotate up into tree according to ticket (priority).
+ for s.parent != nil && s.parent.ticket > s.ticket {
+ if s.parent.prev == s {
+ root.rotateRight(s.parent)
+ } else {
+ if s.parent.next != s {
+ panic("semaRoot queue")
+ }
+ root.rotateLeft(s.parent)
+ }
+ }
}
-func (root *semaRoot) dequeue(s *sudog) {
- if s.next != nil {
- s.next.prev = s.prev
- } else {
- root.tail = s.prev
+// dequeue searches for and finds the first goroutine
+// in semaRoot blocked on addr.
+// If the sudog was being profiled, dequeue returns the time
+// at which it was woken up as now. Otherwise now is 0.
+func (root *semaRoot) dequeue(addr *uint32) (found *sudog, now int64) {
+ ps := &root.treap
+ s := *ps
+ for ; s != nil; s = *ps {
+ if s.elem == unsafe.Pointer(addr) {
+ goto Found
+ }
+ if uintptr(unsafe.Pointer(addr)) < uintptr(s.elem) {
+ ps = &s.prev
+ } else {
+ ps = &s.next
+ }
}
- if s.prev != nil {
- s.prev.next = s.next
- } else {
- root.head = s.next
+ return nil, 0
+
+Found:
+ now = int64(0)
+ if s.acquiretime != 0 {
+ now = cputicks()
}
+ if t := s.waitlink; t != nil {
+ // Substitute t, also waiting on addr, for s in root tree of unique addrs.
+ *ps = t
+ t.ticket = s.ticket
+ t.parent = s.parent
+ t.prev = s.prev
+ if t.prev != nil {
+ t.prev.parent = t
+ }
+ t.next = s.next
+ if t.next != nil {
+ t.next.parent = t
+ }
+ if t.waitlink != nil {
+ t.waittail = s.waittail
+ } else {
+ t.waittail = nil
+ }
+ t.acquiretime = now
+ s.waitlink = nil
+ s.waittail = nil
+ } else {
+ // Rotate s down to be leaf of tree for removal, respecting priorities.
+ for s.next != nil || s.prev != nil {
+ if s.next == nil || s.prev != nil && s.prev.ticket < s.next.ticket {
+ root.rotateRight(s)
+ } else {
+ root.rotateLeft(s)
+ }
+ }
+ // Remove s, now a leaf.
+ if s.parent != nil {
+ if s.parent.prev == s {
+ s.parent.prev = nil
+ } else {
+ s.parent.next = nil
+ }
+ } else {
+ root.treap = nil
+ }
+ }
+ s.parent = nil
s.elem = nil
s.next = nil
s.prev = nil
+ s.ticket = 0
+ return s, now
+}
+
+// rotateLeft rotates the tree rooted at node x,
+// turning (x a (y b c)) into (y (x a b) c).
+func (root *semaRoot) rotateLeft(x *sudog) {
+ // p -> (x a (y b c))
+ p := x.parent
+ a, y := x.prev, x.next
+ b, c := y.prev, y.next
+
+ y.prev = x
+ x.parent = y
+ y.next = c
+ if c != nil {
+ c.parent = y
+ }
+ x.prev = a
+ if a != nil {
+ a.parent = x
+ }
+ x.next = b
+ if b != nil {
+ b.parent = x
+ }
+
+ y.parent = p
+ if p == nil {
+ root.treap = y
+ } else if p.prev == x {
+ p.prev = y
+ } else {
+ if p.next != x {
+ throw("semaRoot rotateLeft")
+ }
+ p.next = y
+ }
+}
+
+// rotateRight rotates the tree rooted at node y,
+// turning (y (x a b) c) into (x a (y b c)).
+func (root *semaRoot) rotateRight(y *sudog) {
+ // p -> (y (x a b) c)
+ p := y.parent
+ x, c := y.prev, y.next
+ a, b := x.prev, x.next
+
+ x.prev = a
+ if a != nil {
+ a.parent = x
+ }
+ x.next = y
+ y.parent = x
+ y.prev = b
+ if b != nil {
+ b.parent = y
+ }
+ y.next = c
+ if c != nil {
+ c.parent = y
+ }
+
+ x.parent = p
+ if p == nil {
+ root.treap = x
+ } else if p.prev == y {
+ p.prev = x
+ } else {
+ if p.next != y {
+ throw("semaRoot rotateRight")
+ }
+ p.next = x
+ }
}
// notifyList is a ticket-based notification list used to implement sync.Cond.
@@ -351,10 +561,22 @@
return
}
- // Update the next notify ticket number, and try to find the G that
- // needs to be notified. If it hasn't made it to the list yet we won't
- // find it, but it won't park itself once it sees the new notify number.
+ // Update the next notify ticket number.
atomic.Store(&l.notify, t+1)
+
+ // Try to find the g that needs to be notified.
+ // If it hasn't made it to the list yet we won't find it,
+ // but it won't park itself once it sees the new notify number.
+ //
+ // This scan looks linear but essentially always stops quickly.
+ // Because g's queue separately from taking numbers,
+ // there may be minor reorderings in the list, but we
+ // expect the g we're looking for to be near the front.
+ // The g has others in front of it on the list only to the
+ // extent that it lost the race, so the iteration will not
+ // be too long. This applies even when the g is missing:
+ // it hasn't yet gotten to sleep and has lost the race to
+ // the (few) other g's that we find on the list.
for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
if s.ticket == t {
n := s.next
@@ -382,3 +604,8 @@
throw("bad notifyList size")
}
}
+
+//go:linkname sync_nanotime sync.runtime_nanotime
+func sync_nanotime() int64 {
+ return nanotime()
+}
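
To make the data structure above concrete (illustrative only, not the runtime's code): semaRoot now keeps a treap, a binary search tree ordered by the waiter's address and heap-ordered by the random ticket, so queue and dequeue stay O(log n) on average in the number of distinct addresses. A self-contained sketch of the same insert-with-rotations idea in ordinary Go, with made-up keys standing in for the blocked addresses:

package main

import (
	"fmt"
	"math/rand"
)

// node mirrors the idea behind the semaRoot treap: BST-ordered by key
// (the waiter address in the runtime), min-heap-ordered by a random
// priority (the sudog ticket drawn from fastrand).
type node struct {
	key         uintptr
	priority    uint32
	left, right *node
}

func rotateRight(y *node) *node {
	x := y.left
	y.left = x.right
	x.right = y
	return x
}

func rotateLeft(x *node) *node {
	y := x.right
	x.right = y.left
	y.left = x
	return y
}

// insert keeps BST order on key and min-heap order on priority,
// matching the invariant that a parent's ticket is <= its children's.
func insert(t *node, key uintptr) *node {
	if t == nil {
		return &node{key: key, priority: rand.Uint32()}
	}
	if key < t.key {
		t.left = insert(t.left, key)
		if t.left.priority < t.priority {
			t = rotateRight(t)
		}
	} else {
		t.right = insert(t.right, key)
		if t.right.priority < t.priority {
			t = rotateLeft(t)
		}
	}
	return t
}

func inorder(t *node, out *[]uintptr) {
	if t == nil {
		return
	}
	inorder(t.left, out)
	*out = append(*out, t.key)
	inorder(t.right, out)
}

func main() {
	var root *node
	for _, k := range []uintptr{0x40, 0x10, 0x30, 0x20, 0x50} {
		root = insert(root, k)
	}
	var keys []uintptr
	inorder(root, &keys)
	fmt.Println(keys) // always sorted: [16 32 48 64 80]
}
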
diff --git a/src/runtime/signal_386.go b/src/runtime/signal_386.go
index 8807552..416c7c2 100644
--- a/src/runtime/signal_386.go
+++ b/src/runtime/signal_386.go
@@ -60,7 +60,7 @@
// but we do recognize the top pointer on the stack as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(*(*uintptr)(unsafe.Pointer(sp))) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(*(*uintptr)(unsafe.Pointer(sp))).valid() {
pc = 0
}
diff --git a/src/runtime/signal_amd64x.go b/src/runtime/signal_amd64x.go
index c8a6513..fad5fc0 100644
--- a/src/runtime/signal_amd64x.go
+++ b/src/runtime/signal_amd64x.go
@@ -71,7 +71,7 @@
// but we do recognize the top pointer on the stack as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(*(*uintptr)(unsafe.Pointer(sp))) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(*(*uintptr)(unsafe.Pointer(sp))).valid() {
pc = 0
}
diff --git a/src/runtime/signal_arm.go b/src/runtime/signal_arm.go
index 9748544..d00b225 100644
--- a/src/runtime/signal_arm.go
+++ b/src/runtime/signal_arm.go
@@ -57,7 +57,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.lr())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.lr())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go
index 4c6df42..1db0525 100644
--- a/src/runtime/signal_arm64.go
+++ b/src/runtime/signal_arm64.go
@@ -73,7 +73,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.lr())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.lr())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_linux_s390x.go b/src/runtime/signal_linux_s390x.go
index de71ee9..a31f436 100644
--- a/src/runtime/signal_linux_s390x.go
+++ b/src/runtime/signal_linux_s390x.go
@@ -103,7 +103,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.link())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_mips64x.go b/src/runtime/signal_mips64x.go
index 973ec2d..9546a5a 100644
--- a/src/runtime/signal_mips64x.go
+++ b/src/runtime/signal_mips64x.go
@@ -77,7 +77,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.link())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_mipsx.go b/src/runtime/signal_mipsx.go
index 62df79c..1c545ec 100644
--- a/src/runtime/signal_mipsx.go
+++ b/src/runtime/signal_mipsx.go
@@ -74,7 +74,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.link())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_ppc64x.go b/src/runtime/signal_ppc64x.go
index f09f890..03cb996 100644
--- a/src/runtime/signal_ppc64x.go
+++ b/src/runtime/signal_ppc64x.go
@@ -78,7 +78,7 @@
// but we do recognize the link register as code,
// then assume this was a call to non-code and treat like
// pc == 0, to make unwinding show the context.
- if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+ if pc != 0 && !findfunc(pc).valid() && findfunc(uintptr(c.link())).valid() {
pc = 0
}
diff --git a/src/runtime/signal_sighandler.go b/src/runtime/signal_sighandler.go
index 5af12d7..b2e15a6 100644
--- a/src/runtime/signal_sighandler.go
+++ b/src/runtime/signal_sighandler.go
@@ -101,7 +101,7 @@
if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
// tracebackothers on original m skipped this one; trace it now.
goroutineheader(_g_.m.curg)
- traceback(^uintptr(0), ^uintptr(0), 0, gp)
+ traceback(^uintptr(0), ^uintptr(0), 0, _g_.m.curg)
} else if crashing == 0 {
tracebackothers(gp)
print("\n")
@@ -111,7 +111,7 @@
if docrash {
crashing++
- if crashing < sched.mcount {
+ if crashing < sched.mcount-int32(extraMCount) {
// There are other m's that need to dump their stacks.
// Relay SIGQUIT to the next m by sending it to the current process.
// All m's that have already received SIGQUIT have signal masks blocking
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index 49c7579..539b165 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -7,6 +7,7 @@
package runtime
import (
+ "runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
@@ -31,11 +32,18 @@
// Stores the signal handlers registered before Go installed its own.
// These signal handlers will be invoked in cases where Go doesn't want to
// handle a particular signal (e.g., signal occurred on a non-Go thread).
-// See sigfwdgo() for more information on when the signals are forwarded.
+// See sigfwdgo for more information on when the signals are forwarded.
//
-// Signal forwarding is currently available only on Darwin and Linux.
+// This is read by the signal handler; accesses should use
+// atomic.Loaduintptr and atomic.Storeuintptr.
var fwdSig [_NSIG]uintptr
+// handlingSig is indexed by signal number and is non-zero if we are
+// currently handling the signal. Or, to put it another way, whether
+// the signal handler is currently set to the Go signal handler or not.
+// This is uint32 rather than bool so that we can use atomic instructions.
+var handlingSig [_NSIG]uint32
+
// channels for synchronizing signal mask updates with the signal mask
// thread
var (
@@ -76,6 +84,9 @@
if t.flags == 0 || t.flags&_SigDefault != 0 {
continue
}
+
+ // We don't need to use atomic operations here because
+ // there shouldn't be any other goroutines running yet.
fwdSig[i] = getsig(i)
if !sigInstallGoHandler(i) {
@@ -87,7 +98,7 @@
continue
}
- t.flags |= _SigHandling
+ handlingSig[i] = 1
setsig(i, funcPC(sighandler))
}
}
@@ -100,7 +111,7 @@
// Even these signals can be fetched using the os/signal package.
switch sig {
case _SIGHUP, _SIGINT:
- if fwdSig[sig] == _SIG_IGN {
+ if atomic.Loaduintptr(&fwdSig[sig]) == _SIG_IGN {
return false
}
}
@@ -111,37 +122,52 @@
}
// When built using c-archive or c-shared, only install signal
- // handlers for synchronous signals.
- if (isarchive || islibrary) && t.flags&_SigPanic == 0 {
+ // handlers for synchronous signals and SIGPIPE.
+ if (isarchive || islibrary) && t.flags&_SigPanic == 0 && sig != _SIGPIPE {
return false
}
return true
}
+// sigenable enables the Go signal handler to catch the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.enableSignal and signal_enable.
func sigenable(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
ensureSigM()
enableSigChan <- sig
<-maskUpdatedChan
- if t.flags&_SigHandling == 0 {
- t.flags |= _SigHandling
- fwdSig[sig] = getsig(sig)
+ if atomic.Cas(&handlingSig[sig], 0, 1) {
+ atomic.Storeuintptr(&fwdSig[sig], getsig(sig))
setsig(sig, funcPC(sighandler))
}
}
}
+// sigdisable disables the Go signal handler for the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.disableSignal and signal_disable.
func sigdisable(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
ensureSigM()
@@ -152,25 +178,71 @@
// signal, then to go back to the state before Notify
// we should remove the one we installed.
if !sigInstallGoHandler(sig) {
- t.flags &^= _SigHandling
- setsig(sig, fwdSig[sig])
+ atomic.Store(&handlingSig[sig], 0)
+ setsig(sig, atomic.Loaduintptr(&fwdSig[sig]))
}
}
}
+// sigignore ignores the signal sig.
+// It is only called while holding the os/signal.handlers lock,
+// via os/signal.ignoreSignal and signal_ignore.
func sigignore(sig uint32) {
if sig >= uint32(len(sigtable)) {
return
}
+ // SIGPROF is handled specially for profiling.
+ if sig == _SIGPROF {
+ return
+ }
+
t := &sigtable[sig]
if t.flags&_SigNotify != 0 {
- t.flags &^= _SigHandling
+ atomic.Store(&handlingSig[sig], 0)
setsig(sig, _SIG_IGN)
}
}
-func resetcpuprofiler(hz int32) {
+// clearSignalHandlers clears all signal handlers that are not ignored
+// back to the default. This is called by the child after a fork, so that
+// we can enable the signal mask for the exec without worrying about
+// running a signal handler in the child.
+//go:nosplit
+//go:nowritebarrierrec
+func clearSignalHandlers() {
+ for i := uint32(0); i < _NSIG; i++ {
+ if atomic.Load(&handlingSig[i]) != 0 {
+ setsig(i, _SIG_DFL)
+ }
+ }
+}
+
+// setProcessCPUProfiler is called when the profiling timer changes.
+// It is called with prof.lock held. hz is the new timer, and is 0 if
+// profiling is being disabled. Enable or disable the signal as
+// required for -buildmode=c-archive.
+func setProcessCPUProfiler(hz int32) {
+ if hz != 0 {
+ // Enable the Go signal handler if not enabled.
+ if atomic.Cas(&handlingSig[_SIGPROF], 0, 1) {
+ atomic.Storeuintptr(&fwdSig[_SIGPROF], getsig(_SIGPROF))
+ setsig(_SIGPROF, funcPC(sighandler))
+ }
+ } else {
+ // If the Go signal handler should be disabled by default,
+ // disable it if it is enabled.
+ if !sigInstallGoHandler(_SIGPROF) {
+ if atomic.Cas(&handlingSig[_SIGPROF], 1, 0) {
+ setsig(_SIGPROF, atomic.Loaduintptr(&fwdSig[_SIGPROF]))
+ }
+ }
+ }
+}
+
+// setThreadCPUProfiler makes any thread-specific changes required to
+// implement profiling at a rate of hz.
+func setThreadCPUProfiler(hz int32) {
var it itimerval
if hz == 0 {
setitimer(_ITIMER_PROF, &it, nil)
@@ -252,6 +324,11 @@
}
setg(g.m.gsignal)
+
+ if g.stackguard0 == stackFork {
+ signalDuringFork(sig)
+ }
+
c := &sigctxt{info, ctx}
c.fixsigcode(sig)
sighandler(sig, info, ctx, g)
@@ -348,7 +425,7 @@
if sig >= _NSIG {
handler = _SIG_DFL
} else {
- handler = fwdSig[sig]
+ handler = atomic.Loaduintptr(&fwdSig[sig])
}
// Reset the signal handler and raise the signal.
@@ -463,6 +540,16 @@
throw("non-Go code set up signal handler without SA_ONSTACK flag")
}
+// signalDuringFork is called if we receive a signal while doing a fork.
+// We do not want signals at that time, as a signal sent to the process
+// group may be delivered to the child process, causing confusion.
+// This should never be called, because we block signals across the fork;
+// this function is just a safety check. See issue 18600 for background.
+func signalDuringFork(sig uint32) {
+ println("signal", sig, "received during fork")
+ throw("signal received during fork")
+}
+
// This runs on a foreign stack, without an m or a g. No stack split.
//go:nosplit
//go:norace
@@ -490,7 +577,7 @@
if sig >= uint32(len(sigtable)) {
return false
}
- fwdFn := fwdSig[sig]
+ fwdFn := atomic.Loaduintptr(&fwdSig[sig])
if !signalsOK {
// The only way we can get here is if we are in a
@@ -505,35 +592,44 @@
return true
}
- flags := sigtable[sig].flags
-
// If there is no handler to forward to, no need to forward.
if fwdFn == _SIG_DFL {
return false
}
// If we aren't handling the signal, forward it.
- if flags&_SigHandling == 0 {
+ // Really if we aren't handling the signal, we shouldn't get here,
+ // but on Darwin setsigstack can lead us here because it sets
+ // the sa_tramp field. The sa_tramp field is not returned by
+ // sigaction, so the fix for that is non-obvious.
+ if atomic.Load(&handlingSig[sig]) == 0 {
sigfwd(fwdFn, sig, info, ctx)
return true
}
- // Only forward synchronous signals.
+ flags := sigtable[sig].flags
+
c := &sigctxt{info, ctx}
- if c.sigcode() == _SI_USER || flags&_SigPanic == 0 {
+ // Only forward synchronous signals and SIGPIPE.
+ // Unfortunately, user generated SIGPIPEs will also be forwarded, because si_code
+ // is set to _SI_USER even for a SIGPIPE raised from a write to a closed socket
+ // or pipe.
+ if (c.sigcode() == _SI_USER || flags&_SigPanic == 0) && sig != _SIGPIPE {
return false
}
// Determine if the signal occurred inside Go code. We test that:
// (1) we were in a goroutine (i.e., m.curg != nil), and
- // (2) we weren't in CGO (i.e., m.curg.syscallsp == 0).
+ // (2) we weren't in CGO.
g := getg()
- if g != nil && g.m != nil && g.m.curg != nil && g.m.curg.syscallsp == 0 {
+ if g != nil && g.m != nil && g.m.curg != nil && !g.m.incgo {
return false
}
+
// Signal not handled by Go, forward it.
if fwdFn != _SIG_IGN {
sigfwd(fwdFn, sig, info, ctx)
}
+
return true
}
@@ -645,7 +741,6 @@
stack stack
stackguard0 uintptr
stackguard1 uintptr
- stackAlloc uintptr
stktopsp uintptr
}
@@ -662,7 +757,6 @@
old.stack = g.m.gsignal.stack
old.stackguard0 = g.m.gsignal.stackguard0
old.stackguard1 = g.m.gsignal.stackguard1
- old.stackAlloc = g.m.gsignal.stackAlloc
old.stktopsp = g.m.gsignal.stktopsp
}
stsp := uintptr(unsafe.Pointer(st.ss_sp))
@@ -670,7 +764,6 @@
g.m.gsignal.stack.hi = stsp + st.ss_size
g.m.gsignal.stackguard0 = stsp + _StackGuard
g.m.gsignal.stackguard1 = stsp + _StackGuard
- g.m.gsignal.stackAlloc = st.ss_size
}
// restoreGsignalStack restores the gsignal stack to the value it had
@@ -682,7 +775,6 @@
gp.stack = st.stack
gp.stackguard0 = st.stackguard0
gp.stackguard1 = st.stackguard1
- gp.stackAlloc = st.stackAlloc
gp.stktopsp = st.stktopsp
}
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 0162ffa..236bb29 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -33,6 +33,17 @@
_ "unsafe" // for go:linkname
)
+// sig handles communication between the signal handler and os/signal.
+// Other than the inuse and recv fields, the fields are accessed atomically.
+//
+// The wanted and ignored fields are only written by one goroutine at
+// a time; access is controlled by the handlers Mutex in os/signal.
+// The fields are only read by that one goroutine and by the signal handler.
+// We access them atomically to minimize the race between setting them
+// in the goroutine calling os/signal and the signal handler,
+// which may be running in a different thread. That race is unavoidable,
+// as there is no connection between handling a signal and receiving one,
+// but atomic instructions should minimize it.
var sig struct {
note note
mask [(_NSIG + 31) / 32]uint32
@@ -53,7 +64,11 @@
// Reports whether the signal was sent. If not, the caller typically crashes the program.
func sigsend(s uint32) bool {
bit := uint32(1) << uint(s&31)
- if !sig.inuse || s >= uint32(32*len(sig.wanted)) || sig.wanted[s/32]&bit == 0 {
+ if !sig.inuse || s >= uint32(32*len(sig.wanted)) {
+ return false
+ }
+
+ if w := atomic.Load(&sig.wanted[s/32]); w&bit == 0 {
return false
}
@@ -131,6 +146,23 @@
}
}
+// signalWaitUntilIdle waits until the signal delivery mechanism is idle.
+// This is used to ensure that we do not drop a signal notification due
+// to a race between disabling a signal and receiving a signal.
+// This assumes that signal delivery has already been disabled for
+// the signal(s) in question, and here we are just waiting to make sure
+// that all the signals have been delivered to the user channels
+// by the os/signal package.
+//go:linkname signalWaitUntilIdle os/signal.signalWaitUntilIdle
+func signalWaitUntilIdle() {
+ // Although WaitUntilIdle seems like the right name for this
+ // function, the state we are looking for is sigReceiving, not
+ // sigIdle. The sigIdle state is really more like sigProcessing.
+ for atomic.Load(&sig.state) != sigReceiving {
+ Gosched()
+ }
+}
+
// Must only be called from a single goroutine at a time.
//go:linkname signal_enable os/signal.signal_enable
func signal_enable(s uint32) {
@@ -146,8 +178,15 @@
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] |= 1 << (s & 31)
- sig.ignored[s/32] &^= 1 << (s & 31)
+
+ w := sig.wanted[s/32]
+ w |= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
+
+ i := sig.ignored[s/32]
+ i &^= 1 << (s & 31)
+ atomic.Store(&sig.ignored[s/32], i)
+
sigenable(s)
}
@@ -157,8 +196,11 @@
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] &^= 1 << (s & 31)
sigdisable(s)
+
+ w := sig.wanted[s/32]
+ w &^= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
}
// Must only be called from a single goroutine at a time.
@@ -167,12 +209,19 @@
if s >= uint32(len(sig.wanted)*32) {
return
}
- sig.wanted[s/32] &^= 1 << (s & 31)
- sig.ignored[s/32] |= 1 << (s & 31)
sigignore(s)
+
+ w := sig.wanted[s/32]
+ w &^= 1 << (s & 31)
+ atomic.Store(&sig.wanted[s/32], w)
+
+ i := sig.ignored[s/32]
+ i |= 1 << (s & 31)
+ atomic.Store(&sig.ignored[s/32], i)
}
// Checked by signal handlers.
func signal_ignored(s uint32) bool {
- return sig.ignored[s/32]&(1<<(s&31)) != 0
+ i := atomic.Load(&sig.ignored[s/32])
+ return i&(1<<(s&31)) != 0
}
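
As a rough standalone sketch (not part of this patch, and using sync/atomic rather than the runtime-internal atomic package), the wanted and ignored masks treat signal s as bit s&31 of word s/32, with the single writer using atomic stores and readers using atomic loads:

package sigsketch

import "sync/atomic"

// sigBitSet reports whether signal s is marked in mask, mirroring the
// word/bit layout of sig.wanted and sig.ignored: word s/32, bit s&31.
func sigBitSet(mask []uint32, s uint32) bool {
	return atomic.LoadUint32(&mask[s/32])&(1<<(s&31)) != 0
}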
diff --git a/src/runtime/sigqueue_plan9.go b/src/runtime/sigqueue_plan9.go
index 575d26a..7666804 100644
--- a/src/runtime/sigqueue_plan9.go
+++ b/src/runtime/sigqueue_plan9.go
@@ -110,6 +110,26 @@
}
}
+// signalWaitUntilIdle waits until the signal delivery mechanism is idle.
+// This is used to ensure that we do not drop a signal notification due
+// to a race between disabling a signal and receiving a signal.
+// This assumes that signal delivery has already been disabled for
+// the signal(s) in question, and here we are just waiting to make sure
+// that all the signals have been delivered to the user channels
+// by the os/signal package.
+//go:linkname signalWaitUntilIdle os/signal.signalWaitUntilIdle
+func signalWaitUntilIdle() {
+ for {
+ lock(&sig.lock)
+ sleeping := sig.sleeping
+ unlock(&sig.lock)
+ if sleeping {
+ return
+ }
+ Gosched()
+ }
+}
+
// Must only be called from a single goroutine at a time.
//go:linkname signal_enable os/signal.signal_enable
func signal_enable(s uint32) {
diff --git a/src/runtime/sizeclasses.go b/src/runtime/sizeclasses.go
index e616e95..5366564 100644
--- a/src/runtime/sizeclasses.go
+++ b/src/runtime/sizeclasses.go
@@ -1,4 +1,4 @@
-// AUTO-GENERATED by mksizeclasses.go; DO NOT EDIT
+// Code generated by mksizeclasses.go; DO NOT EDIT.
//go:generate go run mksizeclasses.go
package runtime
diff --git a/src/runtime/softfloat_arm.go b/src/runtime/softfloat_arm.go
index 3cbb4b3..726699d 100644
--- a/src/runtime/softfloat_arm.go
+++ b/src/runtime/softfloat_arm.go
@@ -653,3 +653,12 @@
}
return pc
}
+
+// Stubs to pacify vet. Not safe to call from Go.
+// Calls to these functions are inserted by the compiler or assembler.
+func _sfloat()
+func udiv()
+func _div()
+func _divu()
+func _mod()
+func _modu()
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 0f1a5c1..525d0b1 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -102,15 +102,6 @@
_StackLimit = _StackGuard - _StackSystem - _StackSmall
)
-// Goroutine preemption request.
-// Stored into g->stackguard0 to cause split stack check failure.
-// Must be greater than any real sp.
-// 0xfffffade in hex.
-const (
- _StackPreempt = uintptrMask & -1314
- _StackFork = uintptrMask & -1234
-)
-
const (
// stackDebug == 0: no logging
// == 1: logging of per-stack operations
@@ -121,8 +112,7 @@
stackFromSystem = 0 // allocate stacks from system memory instead of the heap
stackFaultOnFree = 0 // old stacks are mapped noaccess to detect use after free
stackPoisonCopy = 0 // fill stack that should not be accessed with garbage, to detect bad dereferences during copy
-
- stackCache = 1
+ stackNoCache = 0 // disable per-P small stack caches
// check the BP links during traceback.
debugCheckBP = false
@@ -186,30 +176,31 @@
s := list.first
if s == nil {
// no free stacks. Allocate another span worth.
- s = mheap_.allocStack(_StackCacheSize >> _PageShift)
+ s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse)
if s == nil {
throw("out of memory")
}
if s.allocCount != 0 {
throw("bad allocCount")
}
- if s.stackfreelist.ptr() != nil {
- throw("bad stackfreelist")
+ if s.manualFreeList.ptr() != nil {
+ throw("bad manualFreeList")
}
- for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order {
+ s.elemsize = _FixedStack << order
+ for i := uintptr(0); i < _StackCacheSize; i += s.elemsize {
x := gclinkptr(s.base() + i)
- x.ptr().next = s.stackfreelist
- s.stackfreelist = x
+ x.ptr().next = s.manualFreeList
+ s.manualFreeList = x
}
list.insert(s)
}
- x := s.stackfreelist
+ x := s.manualFreeList
if x.ptr() == nil {
throw("span has no free stacks")
}
- s.stackfreelist = x.ptr().next
+ s.manualFreeList = x.ptr().next
s.allocCount++
- if s.stackfreelist.ptr() == nil {
+ if s.manualFreeList.ptr() == nil {
// all stacks in s are allocated.
list.remove(s)
}
@@ -219,15 +210,15 @@
// Adds stack x to the free pool. Must be called with stackpoolmu held.
func stackpoolfree(x gclinkptr, order uint8) {
s := mheap_.lookup(unsafe.Pointer(x))
- if s.state != _MSpanStack {
+ if s.state != _MSpanManual {
throw("freeing stack not in a stack span")
}
- if s.stackfreelist.ptr() == nil {
+ if s.manualFreeList.ptr() == nil {
// s will now have a free stack
stackpool[order].insert(s)
}
- x.ptr().next = s.stackfreelist
- s.stackfreelist = x
+ x.ptr().next = s.manualFreeList
+ s.manualFreeList = x
s.allocCount--
if gcphase == _GCoff && s.allocCount == 0 {
// Span is completely free. Return it to the heap
@@ -246,8 +237,8 @@
//
// By not freeing, we prevent step #4 until GC is done.
stackpool[order].remove(s)
- s.stackfreelist = 0
- mheap_.freeStack(s)
+ s.manualFreeList = 0
+ mheap_.freeManual(s, &memstats.stacks_inuse)
}
}
@@ -320,7 +311,7 @@
// resources and must not split the stack.
//
//go:systemstack
-func stackalloc(n uint32) (stack, []stkbar) {
+func stackalloc(n uint32) stack {
// Stackalloc must be called on scheduler stack, so that we
// never try to grow the stack during the code that stackalloc runs.
// Doing so would cause a deadlock (issue 1547).
@@ -335,28 +326,20 @@
print("stackalloc ", n, "\n")
}
- // Compute the size of stack barrier array.
- maxstkbar := gcMaxStackBarriers(int(n))
- nstkbar := unsafe.Sizeof(stkbar{}) * uintptr(maxstkbar)
- var stkbarSlice slice
-
if debug.efence != 0 || stackFromSystem != 0 {
- v := sysAlloc(round(uintptr(n), _PageSize), &memstats.stacks_sys)
+ n = uint32(round(uintptr(n), physPageSize))
+ v := sysAlloc(uintptr(n), &memstats.stacks_sys)
if v == nil {
throw("out of memory (stackalloc)")
}
- top := uintptr(n) - nstkbar
- if maxstkbar != 0 {
- stkbarSlice = slice{add(v, top), 0, maxstkbar}
- }
- return stack{uintptr(v), uintptr(v) + top}, *(*[]stkbar)(unsafe.Pointer(&stkbarSlice))
+ return stack{uintptr(v), uintptr(v) + uintptr(n)}
}
// Small stacks are allocated with a fixed-size free-list allocator.
// If we need a stack of a bigger size, we fall back on allocating
// a dedicated span.
var v unsafe.Pointer
- if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
+ if n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
order := uint8(0)
n2 := n
for n2 > _FixedStack {
@@ -365,7 +348,7 @@
}
var x gclinkptr
c := thisg.m.mcache
- if c == nil || thisg.m.preemptoff != "" || thisg.m.helpgc != 0 {
+ if stackNoCache != 0 || c == nil || thisg.m.preemptoff != "" || thisg.m.helpgc != 0 {
// c == nil can happen in the guts of exitsyscall or
// procresize. Just get a stack from the global pool.
// Also don't touch stackcache during gc
@@ -398,10 +381,11 @@
if s == nil {
// Allocate a new stack from the heap.
- s = mheap_.allocStack(npage)
+ s = mheap_.allocManual(npage, &memstats.stacks_inuse)
if s == nil {
throw("out of memory")
}
+ s.elemsize = uintptr(n)
}
v = unsafe.Pointer(s.base())
}
@@ -415,11 +399,7 @@
if stackDebug >= 1 {
print(" allocated ", v, "\n")
}
- top := uintptr(n) - nstkbar
- if maxstkbar != 0 {
- stkbarSlice = slice{add(v, top), 0, maxstkbar}
- }
- return stack{uintptr(v), uintptr(v) + top}, *(*[]stkbar)(unsafe.Pointer(&stkbarSlice))
+ return stack{uintptr(v), uintptr(v) + uintptr(n)}
}
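
As a rough standalone sketch (hypothetical names; fixedStack stands in for _FixedStack), the order computation above maps a small stack size to a power-of-two free-list class, so with a 2048-byte base an 8192-byte stack lands in order 2:

package stacksketch

// stackOrder mirrors the loop in stackalloc and stackfree: each order
// doubles the base size, so order = log2(n/fixedStack) for power-of-two n.
func stackOrder(n, fixedStack uintptr) uint8 {
	order := uint8(0)
	for n > fixedStack {
		order++
		n >>= 1
	}
	return order
}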
// stackfree frees an n byte stack allocation at stk.
@@ -428,9 +408,10 @@
// resources and must not split the stack.
//
//go:systemstack
-func stackfree(stk stack, n uintptr) {
+func stackfree(stk stack) {
gp := getg()
v := unsafe.Pointer(stk.lo)
+ n := stk.hi - stk.lo
if n&(n-1) != 0 {
throw("stack not a power of 2")
}
@@ -452,7 +433,7 @@
if msanenabled {
msanfree(v, n)
}
- if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
+ if n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
order := uint8(0)
n2 := n
for n2 > _FixedStack {
@@ -461,7 +442,7 @@
}
x := gclinkptr(v)
c := gp.m.mcache
- if c == nil || gp.m.preemptoff != "" || gp.m.helpgc != 0 {
+ if stackNoCache != 0 || c == nil || gp.m.preemptoff != "" || gp.m.helpgc != 0 {
lock(&stackpoolmu)
stackpoolfree(x, order)
unlock(&stackpoolmu)
@@ -475,14 +456,14 @@
}
} else {
s := mheap_.lookup(v)
- if s.state != _MSpanStack {
+ if s.state != _MSpanManual {
println(hex(s.base()), v)
throw("bad span state")
}
if gcphase == _GCoff {
// Free the stack immediately if we're
// sweeping.
- mheap_.freeStack(s)
+ mheap_.freeManual(s, &memstats.stacks_inuse)
} else {
// If the GC is running, we can't return a
// stack span to the heap because it could be
@@ -581,7 +562,7 @@
// bv describes the memory starting at address scanp.
// Adjust any pointers contained therein.
-func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f *_func) {
+func adjustpointers(scanp unsafe.Pointer, cbv *bitvector, adjinfo *adjustinfo, f funcInfo) {
bv := gobv(*cbv)
minp := adjinfo.old.lo
maxp := adjinfo.old.hi
@@ -601,7 +582,7 @@
pp := (*uintptr)(add(scanp, i*sys.PtrSize))
retry:
p := *pp
- if f != nil && 0 < p && p < minLegalPointer && debug.invalidptr != 0 {
+ if f.valid() && 0 < p && p < minLegalPointer && debug.invalidptr != 0 {
// Looks like a junk value in a pointer slot.
// Live analysis wrong?
getg().m.traceback = 2
@@ -725,7 +706,7 @@
if stackDebug >= 3 {
print(" args\n")
}
- adjustpointers(unsafe.Pointer(frame.argp), &bv, adjinfo, nil)
+ adjustpointers(unsafe.Pointer(frame.argp), &bv, adjinfo, funcInfo{})
}
return true
}
@@ -774,12 +755,6 @@
}
}
-func adjuststkbar(gp *g, adjinfo *adjustinfo) {
- for i := int(gp.stkbarPos); i < len(gp.stkbar); i++ {
- adjustpointer(adjinfo, unsafe.Pointer(&gp.stkbar[i].savedLRPtr))
- }
-}
-
func fillstack(stk stack, b byte) {
for p := stk.lo; p < stk.hi; p++ {
*(*byte)(unsafe.Pointer(p)) = b
@@ -866,12 +841,12 @@
used := old.hi - gp.sched.sp
// allocate new stack
- new, newstkbar := stackalloc(uint32(newsize))
+ new := stackalloc(uint32(newsize))
if stackPoisonCopy != 0 {
fillstack(new, 0xfd)
}
if stackDebug >= 1 {
- print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]/", gp.stackAlloc, " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n")
+ print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]", " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n")
}
// Compute adjustment.
@@ -900,44 +875,30 @@
// Copy the stack (or the rest of it) to the new location
memmove(unsafe.Pointer(new.hi-ncopy), unsafe.Pointer(old.hi-ncopy), ncopy)
- // Disallow sigprof scans of this stack and block if there's
- // one in progress.
- gcLockStackBarriers(gp)
-
// Adjust remaining structures that have pointers into stacks.
// We have to do most of these before we traceback the new
// stack because gentraceback uses them.
adjustctxt(gp, &adjinfo)
adjustdefers(gp, &adjinfo)
adjustpanics(gp, &adjinfo)
- adjuststkbar(gp, &adjinfo)
if adjinfo.sghi != 0 {
adjinfo.sghi += adjinfo.delta
}
- // copy old stack barriers to new stack barrier array
- newstkbar = newstkbar[:len(gp.stkbar)]
- copy(newstkbar, gp.stkbar)
-
// Swap out old stack for new one
gp.stack = new
gp.stackguard0 = new.lo + _StackGuard // NOTE: might clobber a preempt request
gp.sched.sp = new.hi - used
- oldsize := gp.stackAlloc
- gp.stackAlloc = newsize
- gp.stkbar = newstkbar
gp.stktopsp += adjinfo.delta
// Adjust pointers in the new stack.
gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0)
- gcUnlockStackBarriers(gp)
-
// free old stack
if stackPoisonCopy != 0 {
fillstack(old, 0xfc)
}
- stackfree(old, oldsize)
+ stackfree(old)
}
// round x up to a power of 2.
@@ -1082,9 +1043,9 @@
}
// Allocate a bigger segment and move the stack.
- oldsize := int(gp.stackAlloc)
+ oldsize := gp.stack.hi - gp.stack.lo
newsize := oldsize * 2
- if uintptr(newsize) > maxstacksize {
+ if newsize > maxstacksize {
print("runtime: goroutine stack exceeds ", maxstacksize, "-byte limit\n")
throw("stack overflow")
}
@@ -1095,7 +1056,7 @@
// The concurrent GC will not scan the stack while we are doing the copy since
// the gp is in a Gcopystack status.
- copystack(gp, uintptr(newsize), true)
+ copystack(gp, newsize, true)
if stackDebug >= 1 {
print("stack grow done\n")
}
@@ -1129,11 +1090,9 @@
if gp.stack.lo != 0 {
// Free whole stack - it will get reallocated
// if G is used again.
- stackfree(gp.stack, gp.stackAlloc)
+ stackfree(gp.stack)
gp.stack.lo = 0
gp.stack.hi = 0
- gp.stkbar = nil
- gp.stkbarPos = 0
}
return
}
@@ -1153,7 +1112,7 @@
return
}
- oldsize := gp.stackAlloc
+ oldsize := gp.stack.hi - gp.stack.lo
newsize := oldsize / 2
// Don't shrink the allocation below the minimum-sized stack
// allocation.
@@ -1197,8 +1156,8 @@
next := s.next
if s.allocCount == 0 {
list.remove(s)
- s.stackfreelist = 0
- mheap_.freeStack(s)
+ s.manualFreeList = 0
+ mheap_.freeManual(s, &memstats.stacks_inuse)
}
s = next
}
@@ -1212,7 +1171,7 @@
for s := stackLarge.free[i].first; s != nil; {
next := s.next
stackLarge.free[i].remove(s)
- mheap_.freeStack(s)
+ mheap_.freeManual(s, &memstats.stacks_inuse)
s = next
}
}
@@ -1222,6 +1181,6 @@
//go:nosplit
func morestackc() {
systemstack(func() {
- throw("attempt to execute C code on Go stack")
+ throw("attempt to execute system stack code on user stack")
})
}
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index a32b68b..25e8f77 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -8,6 +8,7 @@
. "runtime"
"strings"
"sync"
+ "sync/atomic"
"testing"
"time"
)
@@ -71,7 +72,9 @@
// Test stack growing in different contexts.
func TestStackGrowth(t *testing.T) {
- t.Parallel()
+ // Don't make this test parallel as this makes the 20 second
+ // timeout unreliable on slow builders. (See issue #19381.)
+
var wg sync.WaitGroup
// in a normal goroutine
@@ -97,9 +100,11 @@
go func() {
defer wg.Done()
done := make(chan bool)
+ var started uint32
go func() {
s := new(string)
SetFinalizer(s, func(ss *string) {
+ atomic.StoreUint32(&started, 1)
growStack()
done <- true
})
@@ -111,6 +116,9 @@
select {
case <-done:
case <-time.After(20 * time.Second):
+ if atomic.LoadUint32(&started) == 0 {
+ t.Log("finalizer did not start")
+ }
t.Error("finalizer did not run")
return
}
@@ -315,17 +323,17 @@
defer func() {
// At this point we created a large stack and unwound
// it via recovery. Force a stack walk, which will
- // check the consistency of stack barriers.
+ // check the stack's consistency.
Callers(0, pc)
}()
defer func() {
recover()
}()
useStackAndCall(100, func() {
- // Kick off the GC and make it do something nontrivial
- // to keep stack barriers installed for a while.
+ // Kick off the GC and make it do something nontrivial.
+ // (This used to force stack barriers to stick around.)
xtree = makeTree(18)
- // Give the GC time to install stack barriers.
+ // Give the GC time to start scanning stacks.
time.Sleep(time.Millisecond)
panic(1)
})
@@ -447,3 +455,175 @@
}
return 1 + count(n-1)
}
+
+func BenchmarkStackCopyNoCache(b *testing.B) {
+ c := make(chan bool)
+ for i := 0; i < b.N; i++ {
+ go func() {
+ count1(1000000)
+ c <- true
+ }()
+ <-c
+ }
+}
+
+func count1(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count2(n-1)
+}
+
+func count2(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count3(n-1)
+}
+
+func count3(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count4(n-1)
+}
+
+func count4(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count5(n-1)
+}
+
+func count5(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count6(n-1)
+}
+
+func count6(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count7(n-1)
+}
+
+func count7(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count8(n-1)
+}
+
+func count8(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count9(n-1)
+}
+
+func count9(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count10(n-1)
+}
+
+func count10(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count11(n-1)
+}
+
+func count11(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count12(n-1)
+}
+
+func count12(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count13(n-1)
+}
+
+func count13(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count14(n-1)
+}
+
+func count14(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count15(n-1)
+}
+
+func count15(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count16(n-1)
+}
+
+func count16(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count17(n-1)
+}
+
+func count17(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count18(n-1)
+}
+
+func count18(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count19(n-1)
+}
+
+func count19(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count20(n-1)
+}
+
+func count20(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count21(n-1)
+}
+
+func count21(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count22(n-1)
+}
+
+func count22(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count23(n-1)
+}
+
+func count23(n int) int {
+ if n == 0 {
+ return 0
+ }
+ return 1 + count1(n-1)
+}
diff --git a/src/runtime/string.go b/src/runtime/string.go
index 822adaa..0ccc81e 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -69,7 +69,7 @@
// Buf is a fixed-size buffer for the result,
// it is not nil if the result does not escape.
-func slicebytetostring(buf *tmpBuf, b []byte) string {
+func slicebytetostring(buf *tmpBuf, b []byte) (str string) {
l := len(b)
if l == 0 {
// Turns out to be a relatively common case.
@@ -77,18 +77,26 @@
// you find the indices and convert the subslice to string.
return ""
}
- if raceenabled && l > 0 {
+ if raceenabled {
racereadrangepc(unsafe.Pointer(&b[0]),
uintptr(l),
getcallerpc(unsafe.Pointer(&buf)),
funcPC(slicebytetostring))
}
- if msanenabled && l > 0 {
+ if msanenabled {
msanread(unsafe.Pointer(&b[0]), uintptr(l))
}
- s, c := rawstringtmp(buf, l)
- copy(c, b)
- return s
+
+ var p unsafe.Pointer
+ if buf != nil && len(b) <= len(buf) {
+ p = unsafe.Pointer(buf)
+ } else {
+ p = mallocgc(uintptr(len(b)), nil, false)
+ }
+ stringStructOf(&str).str = p
+ stringStructOf(&str).len = len(b)
+ memmove(p, (*(*slice)(unsafe.Pointer(&b))).array, uintptr(len(b)))
+ return
}
// stringDataOnStack reports whether the string's data is
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index fcfc522..7633cfd 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -6,6 +6,7 @@
import (
"runtime"
+ "strconv"
"strings"
"testing"
)
@@ -89,6 +90,20 @@
}
}
+var escapeString string
+
+func BenchmarkSliceByteToString(b *testing.B) {
+ buf := []byte{'!'}
+ for n := 0; n < 8; n++ {
+ b.Run(strconv.Itoa(len(buf)), func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ escapeString = string(buf)
+ }
+ })
+ buf = append(buf, buf...)
+ }
+}
+
var stringdata = []struct{ name, data string }{
{"ASCII", "01234567890"},
{"Japanese", "日本語日本語日本語"},
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index 107f260..c4f32a8 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -93,8 +93,22 @@
// exported value for testing
var hashLoad = loadFactor
-// in asm_*.s
-func fastrand() uint32
+//go:nosplit
+func fastrand() uint32 {
+ mp := getg().m
+ fr := mp.fastrand
+ mx := uint32(int32(fr)>>31) & 0xa8888eef
+ fr = fr<<1 ^ mx
+ mp.fastrand = fr
+ return fr
+}
+
+//go:nosplit
+func fastrandn(n uint32) uint32 {
+ // This is similar to fastrand() % n, but faster.
+ // See http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+ return uint32(uint64(fastrand()) * uint64(n) >> 32)
+}
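
As a rough standalone sketch (not part of this patch), the multiply-shift reduction used by fastrandn maps a uniform 32-bit value r into [0, n) by keeping the high 32 bits of the 64-bit product r*n, avoiding the divide that r % n would need:

package randsketch

// reduce returns a value in [0, n), distributed like r % n when r is a
// uniform 32-bit value, using one multiply and one shift.
func reduce(r, n uint32) uint32 {
	return uint32(uint64(r) * uint64(n) >> 32)
}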
//go:linkname sync_fastrand sync.fastrand
func sync_fastrand() uint32 { return fastrand() }
@@ -150,7 +164,7 @@
// This function must never be called directly. Call goexit1 instead.
// gentraceback assumes that goexit terminates the stack. A direct
// call on the stack will cause gentraceback to stop walking the stack
-// prematurely and if there are leftover stack barriers it may panic.
+// prematurely and if there is leftover state it may panic.
func goexit(neverCallThisFunction)
// Not all cgocallback_gofunc frames are actually cgocallback_gofunc,
@@ -178,9 +192,6 @@
// data dependency ordering.
func publicationBarrier()
-//go:noescape
-func setcallerpc(argp unsafe.Pointer, pc uintptr)
-
// getcallerpc returns the program counter (PC) of its caller's caller.
// getcallersp returns the stack pointer (SP) of its caller's caller.
// For both, the argp must be a pointer to the caller's first function argument.
@@ -227,13 +238,6 @@
func morestack_noctxt()
func rt0_go()
-// stackBarrier records that the stack has been unwound past a certain
-// point. It is installed over a return PC on the stack. It must
-// retrieve the original return PC from g.stkbuf, increment
-// g.stkbufPos to record that the barrier was hit, and jump to the
-// original return PC.
-func stackBarrier()
-
// return0 is a stub used to return 0 from deferproc.
// It is called at the very end of deferproc to signal
// the calling Go function that it should not jump
@@ -241,9 +245,6 @@
// in asm_*.s
func return0()
-//go:linkname time_now time.now
-func time_now() (sec int64, nsec int32)
-
// in asm_*.s
// not called directly; definitions here supply type information for traceback.
func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
@@ -280,11 +281,6 @@
func prefetcht2(addr uintptr)
func prefetchnta(addr uintptr)
-func unixnanotime() int64 {
- sec, nsec := time_now()
- return sec*1e9 + int64(nsec)
-}
-
// round n up to a multiple of a. a must be a power of 2.
func round(n, a uintptr) uintptr {
return (n + a - 1) &^ (a - 1)
@@ -295,3 +291,10 @@
func memequal_varlen(a, b unsafe.Pointer) bool
func eqstring(s1, s2 string) bool
+
+// bool2int returns 0 if x is false or 1 if x is true.
+func bool2int(x bool) int {
+ // Avoid branches. In the SSA compiler, this compiles to
+ // exactly what you would want it to.
+ return int(uint8(*(*uint8)(unsafe.Pointer(&x))))
+}
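
As a hypothetical usage sketch (the names below are not from this patch), bool2int lets a caller accumulate a condition without branching, e.g. n += bool2int(x > limit) inside a loop; a user-level equivalent can reinterpret the byte backing the bool:

package boolsketch

import "unsafe"

// boolToInt mirrors bool2int: a bool is stored as a single byte holding
// 0 or 1, so reinterpreting that byte avoids a conditional.
func boolToInt(x bool) int {
	return int(*(*uint8)(unsafe.Pointer(&x)))
}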
diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go
index 95db924..8390d8f 100644
--- a/src/runtime/stubs2.go
+++ b/src/runtime/stubs2.go
@@ -18,8 +18,6 @@
func nanotime() int64
func usleep(usec uint32)
-func munmap(addr unsafe.Pointer, n uintptr)
-
//go:noescape
func write(fd uintptr, p unsafe.Pointer, n int32) int32
diff --git a/src/runtime/stubs_linux.go b/src/runtime/stubs_linux.go
new file mode 100644
index 0000000..d10f657
--- /dev/null
+++ b/src/runtime/stubs_linux.go
@@ -0,0 +1,9 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+
+package runtime
+
+func sbrk0() uintptr
diff --git a/src/runtime/stubs_nonlinux.go b/src/runtime/stubs_nonlinux.go
new file mode 100644
index 0000000..e1ea05c
--- /dev/null
+++ b/src/runtime/stubs_nonlinux.go
@@ -0,0 +1,12 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !linux
+
+package runtime
+
+// sbrk0 returns the current process brk, or 0 if not implemented.
+func sbrk0() uintptr {
+ return 0
+}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index ed82783..8e410c4 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -13,108 +13,268 @@
// Frames may be used to get function/file/line information for a
// slice of PC values returned by Callers.
type Frames struct {
+ // callers is a slice of PCs that have not yet been expanded.
callers []uintptr
- // If previous caller in iteration was a panic, then
- // ci.callers[0] is the address of the faulting instruction
- // instead of the return address of the call.
- wasPanic bool
-
- // Frames to return for subsequent calls to the Next method.
- // Used for non-Go frames.
- frames *[]Frame
+ // stackExpander expands callers into a sequence of Frames,
+ // tracking the necessary state across PCs.
+ stackExpander stackExpander
}
// Frame is the information returned by Frames for each call frame.
type Frame struct {
- // Program counter for this frame; multiple frames may have
- // the same PC value.
+ // PC is the program counter for the location in this frame.
+ // For a frame that calls another frame, this will be the
+ // program counter of a call instruction. Because of inlining,
+ // multiple frames may have the same PC value, but different
+ // symbolic information.
PC uintptr
- // Func for this frame; may be nil for non-Go code or fully
- // inlined functions.
+ // Func is the Func value of this call frame. This may be nil
+ // for non-Go code or fully inlined functions.
Func *Func
- // Function name, file name, and line number for this call frame.
- // May be the empty string or zero if not known.
+ // Function is the package path-qualified function name of
+ // this call frame. If non-empty, this string uniquely
+ // identifies a single function in the program.
+ // This may be the empty string if not known.
// If Func is not nil then Function == Func.Name().
Function string
- File string
- Line int
- // Entry point for the function; may be zero if not known.
- // If Func is not nil then Entry == Func.Entry().
+ // File and Line are the file name and line number of the
+ // location in this frame. For non-leaf frames, this will be
+ // the location of a call. These may be the empty string and
+ // zero, respectively, if not known.
+ File string
+ Line int
+
+ // Entry point program counter for the function; may be zero
+ // if not known. If Func is not nil then Entry ==
+ // Func.Entry().
Entry uintptr
}
+// stackExpander expands a call stack of PCs into a sequence of
+// Frames. It tracks state across PCs necessary to perform this
+// expansion.
+//
+// This is the core of the Frames implementation, but is a separate
+// internal API to make it possible to use within the runtime without
+// heap-allocating the PC slice. The only difference with the public
+// Frames API is that the caller is responsible for threading the PC
+// slice between expansion steps in this API. If escape analysis were
+// smarter, we may not need this (though it may have to be a lot
+// smarter).
+type stackExpander struct {
+ // pcExpander expands the current PC into a sequence of Frames.
+ pcExpander pcExpander
+
+ // If previous caller in iteration was a panic, then the next
+ // PC in the call stack is the address of the faulting
+ // instruction instead of the return address of the call.
+ wasPanic bool
+
+ // skip > 0 indicates that skip frames in the expansion of the
+ // first PC should be skipped over and callers[1] should also
+ // be skipped.
+ skip int
+}
+
// CallersFrames takes a slice of PC values returned by Callers and
// prepares to return function/file/line information.
// Do not change the slice until you are done with the Frames.
func CallersFrames(callers []uintptr) *Frames {
- return &Frames{callers: callers}
+ ci := &Frames{}
+ ci.callers = ci.stackExpander.init(callers)
+ return ci
+}
+
+func (se *stackExpander) init(callers []uintptr) []uintptr {
+ if len(callers) >= 1 {
+ pc := callers[0]
+ s := pc - skipPC
+ if s >= 0 && s < sizeofSkipFunction {
+ // Ignore skip frame callers[0] since this means the caller trimmed the PC slice.
+ return callers[1:]
+ }
+ }
+ if len(callers) >= 2 {
+ pc := callers[1]
+ s := pc - skipPC
+ if s > 0 && s < sizeofSkipFunction {
+ // Skip the first s inlined frames when we expand the first PC.
+ se.skip = int(s)
+ }
+ }
+ return callers
}
// Next returns frame information for the next caller.
// If more is false, there are no more callers (the Frame value is valid).
func (ci *Frames) Next() (frame Frame, more bool) {
- if ci.frames != nil {
- // We have saved up frames to return.
- f := (*ci.frames)[0]
- if len(*ci.frames) == 1 {
- ci.frames = nil
- } else {
- *ci.frames = (*ci.frames)[1:]
- }
- return f, ci.frames != nil || len(ci.callers) > 0
- }
-
- if len(ci.callers) == 0 {
- ci.wasPanic = false
- return Frame{}, false
- }
- pc := ci.callers[0]
- ci.callers = ci.callers[1:]
- more = len(ci.callers) > 0
- f := FuncForPC(pc)
- if f == nil {
- ci.wasPanic = false
- if cgoSymbolizer != nil {
- return ci.cgoNext(pc, more)
- }
- return Frame{}, more
- }
-
- entry := f.Entry()
- xpc := pc
- if xpc > entry && !ci.wasPanic {
- xpc--
- }
- file, line := f.FileLine(xpc)
-
- function := f.Name()
- ci.wasPanic = entry == sigpanicPC
-
- frame = Frame{
- PC: xpc,
- Func: f,
- Function: function,
- File: file,
- Line: line,
- Entry: entry,
- }
-
- return frame, more
+ ci.callers, frame, more = ci.stackExpander.next(ci.callers)
+ return
}
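
For reference, a short example of the public API that Next serves; runtime.Callers fills a PC slice and CallersFrames expands it, including the inlined frames handled below:

package framesketch

import (
	"fmt"
	"runtime"
)

// printStack walks the caller's stack one Frame at a time.
func printStack() {
	pc := make([]uintptr, 32)
	n := runtime.Callers(1, pc) // skip runtime.Callers itself
	frames := runtime.CallersFrames(pc[:n])
	for {
		frame, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
		if !more {
			break
		}
	}
}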
-// cgoNext returns frame information for pc, known to be a non-Go function,
-// using the cgoSymbolizer hook.
-func (ci *Frames) cgoNext(pc uintptr, more bool) (Frame, bool) {
+func (se *stackExpander) next(callers []uintptr) (ncallers []uintptr, frame Frame, more bool) {
+ ncallers = callers
+ if !se.pcExpander.more {
+ // Expand the next PC.
+ if len(ncallers) == 0 {
+ se.wasPanic = false
+ return ncallers, Frame{}, false
+ }
+ se.pcExpander.init(ncallers[0], se.wasPanic)
+ ncallers = ncallers[1:]
+ se.wasPanic = se.pcExpander.funcInfo.valid() && se.pcExpander.funcInfo.entry == sigpanicPC
+ if se.skip > 0 {
+ for ; se.skip > 0; se.skip-- {
+ se.pcExpander.next()
+ }
+ se.skip = 0
+ // Drop skipPleaseUseCallersFrames.
+ ncallers = ncallers[1:]
+ }
+ if !se.pcExpander.more {
+ // No symbolic information for this PC.
+ // However, we return at least one frame for
+ // every PC, so return an invalid frame.
+ return ncallers, Frame{}, len(ncallers) > 0
+ }
+ }
+
+ frame = se.pcExpander.next()
+ return ncallers, frame, se.pcExpander.more || len(ncallers) > 0
+}
+
+// A pcExpander expands a single PC into a sequence of Frames.
+type pcExpander struct {
+ // more indicates that the next call to next will return a
+ // valid frame.
+ more bool
+
+ // pc is the pc being expanded.
+ pc uintptr
+
+ // frames is a pre-expanded set of Frames to return from the
+ // iterator. If this is set, then this is everything that will
+ // be returned from the iterator.
+ frames []Frame
+
+ // funcInfo is the funcInfo of the function containing pc.
+ funcInfo funcInfo
+
+ // inlTree is the inlining tree of the function containing pc.
+ inlTree *[1 << 20]inlinedCall
+
+ // file and line are the file name and line number of the next
+ // frame.
+ file string
+ line int32
+
+ // inlIndex is the inlining index of the next frame, or -1 if
+ // the next frame is an outermost frame.
+ inlIndex int32
+}
+
+// init initializes this pcExpander to expand pc. It sets ex.more if
+// pc expands to any Frames.
+//
+// A pcExpander can be reused by calling init again.
+//
+// If pc was a "call" to sigpanic, panicCall should be true. In this
+// case, pc is treated as the address of a faulting instruction
+// instead of the return address of a call.
+func (ex *pcExpander) init(pc uintptr, panicCall bool) {
+ ex.more = false
+
+ ex.funcInfo = findfunc(pc)
+ if !ex.funcInfo.valid() {
+ if cgoSymbolizer != nil {
+ // Pre-expand cgo frames. We could do this
+ // incrementally, too, but there's no way to
+ // avoid allocation in this case anyway.
+ ex.frames = expandCgoFrames(pc)
+ ex.more = len(ex.frames) > 0
+ }
+ return
+ }
+
+ ex.more = true
+ entry := ex.funcInfo.entry
+ ex.pc = pc
+ if ex.pc > entry && !panicCall {
+ ex.pc--
+ }
+
+ // file and line are the innermost position at pc.
+ ex.file, ex.line = funcline1(ex.funcInfo, ex.pc, false)
+
+ // Get inlining tree at pc
+ inldata := funcdata(ex.funcInfo, _FUNCDATA_InlTree)
+ if inldata != nil {
+ ex.inlTree = (*[1 << 20]inlinedCall)(inldata)
+ ex.inlIndex = pcdatavalue(ex.funcInfo, _PCDATA_InlTreeIndex, ex.pc, nil)
+ } else {
+ ex.inlTree = nil
+ ex.inlIndex = -1
+ }
+}
+
+// next returns the next Frame in the expansion of pc and sets ex.more
+// if there are more Frames to follow.
+func (ex *pcExpander) next() Frame {
+ if !ex.more {
+ return Frame{}
+ }
+
+ if len(ex.frames) > 0 {
+ // Return pre-expanded frame.
+ frame := ex.frames[0]
+ ex.frames = ex.frames[1:]
+ ex.more = len(ex.frames) > 0
+ return frame
+ }
+
+ if ex.inlIndex >= 0 {
+ // Return inner inlined frame.
+ call := ex.inlTree[ex.inlIndex]
+ frame := Frame{
+ PC: ex.pc,
+ Func: nil, // nil for inlined functions
+ Function: funcnameFromNameoff(ex.funcInfo, call.func_),
+ File: ex.file,
+ Line: int(ex.line),
+ Entry: ex.funcInfo.entry,
+ }
+ ex.file = funcfile(ex.funcInfo, call.file)
+ ex.line = call.line
+ ex.inlIndex = call.parent
+ return frame
+ }
+
+ // No inlining or pre-expanded frames.
+ ex.more = false
+ return Frame{
+ PC: ex.pc,
+ Func: ex.funcInfo._Func(),
+ Function: funcname(ex.funcInfo),
+ File: ex.file,
+ Line: int(ex.line),
+ Entry: ex.funcInfo.entry,
+ }
+}
+
+// expandCgoFrames expands frame information for pc, known to be
+// a non-Go function, using the cgoSymbolizer hook. expandCgoFrames
+// returns nil if pc could not be expanded.
+func expandCgoFrames(pc uintptr) []Frame {
arg := cgoSymbolizerArg{pc: pc}
callCgoSymbolizer(&arg)
if arg.file == nil && arg.funcName == nil {
// No useful information from symbolizer.
- return Frame{}, more
+ return nil
}
var frames []Frame
@@ -140,24 +300,14 @@
arg.pc = 0
callCgoSymbolizer(&arg)
- if len(frames) == 1 {
- // Return a single frame.
- return frames[0], more
- }
-
- // Return the first frame we saw and store the rest to be
- // returned by later calls to Next.
- rf := frames[0]
- frames = frames[1:]
- ci.frames = new([]Frame)
- *ci.frames = frames
- return rf, true
+ return frames
}
// NOTE: Func does not expose the actual unexported fields, because we return *Func
// values to users, and we want to keep them from being able to overwrite the data
// with (say) *f = Func{}.
-// All code operating on a *Func must call raw to get the *_func instead.
+// All code operating on a *Func must call raw() to get the *_func
+// or funcInfo() to get the funcInfo instead.
// A Func represents a Go function in the running binary.
type Func struct {
@@ -168,11 +318,20 @@
return (*_func)(unsafe.Pointer(f))
}
-// funcdata.h
+func (f *Func) funcInfo() funcInfo {
+ fn := f.raw()
+ return funcInfo{fn, findmoduledatap(fn.entry)}
+}
+
+// PCDATA and FUNCDATA table indexes.
+//
+// See funcdata.h and ../cmd/internal/obj/funcdata.go.
const (
_PCDATA_StackMapIndex = 0
+ _PCDATA_InlTreeIndex = 1
_FUNCDATA_ArgsPointerMaps = 0
_FUNCDATA_LocalsPointerMaps = 1
+ _FUNCDATA_InlTree = 2
_ArgsSizeUnknown = -0x80000000
)
@@ -223,7 +382,7 @@
// at link time and a pointer to the runtime abi hash. These are checked in
// moduledataverify1 below.
//
-// For each loaded plugin, the the pkghashes slice has a modulehash of the
+// For each loaded plugin, the pkghashes slice has a modulehash of the
// newly loaded package that can be used to check the plugin's version of
// a package against any previously loaded version of the package.
// This is done in plugin.lastmoduleinit.
@@ -361,15 +520,15 @@
for i := 0; i < nftab; i++ {
// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
if datap.ftab[i].entry > datap.ftab[i+1].entry {
- f1 := (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff]))
- f2 := (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff]))
+ f1 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff])), datap}
+ f2 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff])), datap}
f2name := "end"
if i+1 < nftab {
f2name = funcname(f2)
}
println("function symbol table not sorted by program counter:", hex(datap.ftab[i].entry), funcname(f1), ">", hex(datap.ftab[i+1].entry), f2name)
for j := 0; j <= i; j++ {
- print("\t", hex(datap.ftab[j].entry), " ", funcname((*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[j].funcoff]))), "\n")
+ print("\t", hex(datap.ftab[j].entry), " ", funcname(funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[j].funcoff])), datap}), "\n")
}
throw("invalid runtime symbol table")
}
@@ -382,10 +541,10 @@
// But don't use the next PC if it corresponds to a foreign object chunk
// (no pcln table, f2.pcln == 0). That chunk might have an alignment
// more than 16 bytes.
- f := (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff]))
+ f := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i].funcoff])), datap}
end := f.entry
if i+1 < nftab {
- f2 := (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff]))
+ f2 := funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[i+1].funcoff])), datap}
if f2.pcln != 0 {
end = f2.entry - 16
if end < f.entry {
@@ -414,13 +573,19 @@
// FuncForPC returns a *Func describing the function that contains the
// given program counter address, or else nil.
+//
+// If pc represents multiple functions because of inlining, it returns
+// the *Func describing the outermost function.
func FuncForPC(pc uintptr) *Func {
- return (*Func)(unsafe.Pointer(findfunc(pc)))
+ return findfunc(pc)._Func()
}
// Name returns the name of the function.
func (f *Func) Name() string {
- return funcname(f.raw())
+ if f == nil {
+ return ""
+ }
+ return funcname(f.funcInfo())
}
// Entry returns the entry address of the function.
@@ -435,7 +600,7 @@
func (f *Func) FileLine(pc uintptr) (file string, line int) {
// Pass strict=false here, because anyone can call this function,
// and they might just be wrong about targetpc belonging to f.
- file, line32 := funcline1(f.raw(), pc, false)
+ file, line32 := funcline1(f.funcInfo(), pc, false)
return file, int(line32)
}
@@ -448,10 +613,23 @@
return nil
}
-func findfunc(pc uintptr) *_func {
+type funcInfo struct {
+ *_func
+ datap *moduledata
+}
+
+func (f funcInfo) valid() bool {
+ return f._func != nil
+}
+
+func (f funcInfo) _Func() *Func {
+ return (*Func)(unsafe.Pointer(f._func))
+}
+
+func findfunc(pc uintptr) funcInfo {
datap := findmoduledatap(pc)
if datap == nil {
- return nil
+ return funcInfo{}
}
const nsub = uintptr(len(findfuncbucket{}.subbuckets))
@@ -487,7 +665,7 @@
idx++
}
}
- return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
+ return funcInfo{(*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff])), datap}
}
type pcvalueCache struct {
@@ -502,7 +680,7 @@
val int32
}
-func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
+func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
if off == 0 {
return -1
}
@@ -514,26 +692,27 @@
// cheaper than doing the hashing for a less associative
// cache.
if cache != nil {
- for _, ent := range cache.entries {
+ for i := range cache.entries {
// We check off first because we're more
// likely to have multiple entries with
// different offsets for the same targetpc
// than the other way around, so we'll usually
// fail in the first clause.
+ ent := &cache.entries[i]
if ent.off == off && ent.targetpc == targetpc {
return ent.val
}
}
}
- datap := findmoduledatap(f.entry) // inefficient
- if datap == nil {
+ if !f.valid() {
if strict && panicking == 0 {
print("runtime: no module data for ", hex(f.entry), "\n")
throw("no module data")
}
return -1
}
+ datap := f.datap
p := datap.pclntable[off:]
pc := f.entry
val := int32(-1)
@@ -549,7 +728,7 @@
// a recursive stack's cycle is slightly
// larger than the cache.
if cache != nil {
- ci := fastrand() % uint32(len(cache.entries))
+ ci := fastrandn(uint32(len(cache.entries)))
cache.entries[ci] = pcvalueCacheEnt{
targetpc: targetpc,
off: off,
@@ -585,24 +764,37 @@
return -1
}
-func cfuncname(f *_func) *byte {
- if f == nil || f.nameoff == 0 {
+func cfuncname(f funcInfo) *byte {
+ if !f.valid() || f.nameoff == 0 {
return nil
}
- datap := findmoduledatap(f.entry) // inefficient
- if datap == nil {
- return nil
- }
- return &datap.pclntable[f.nameoff]
+ return &f.datap.pclntable[f.nameoff]
}
-func funcname(f *_func) string {
+func funcname(f funcInfo) string {
return gostringnocopy(cfuncname(f))
}
-func funcline1(f *_func, targetpc uintptr, strict bool) (file string, line int32) {
- datap := findmoduledatap(f.entry) // inefficient
- if datap == nil {
+func funcnameFromNameoff(f funcInfo, nameoff int32) string {
+ datap := f.datap
+ if !f.valid() {
+ return ""
+ }
+ cstr := &datap.pclntable[nameoff]
+ return gostringnocopy(cstr)
+}
+
+func funcfile(f funcInfo, fileno int32) string {
+ datap := f.datap
+ if !f.valid() {
+ return "?"
+ }
+ return gostringnocopy(&datap.pclntable[datap.filetab[fileno]])
+}
+
+func funcline1(f funcInfo, targetpc uintptr, strict bool) (file string, line int32) {
+ datap := f.datap
+ if !f.valid() {
return "?", 0
}
fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
@@ -615,11 +807,11 @@
return
}
-func funcline(f *_func, targetpc uintptr) (file string, line int32) {
+func funcline(f funcInfo, targetpc uintptr) (file string, line int32) {
return funcline1(f, targetpc, true)
}
-func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
+func funcspdelta(f funcInfo, targetpc uintptr, cache *pcvalueCache) int32 {
x := pcvalue(f, f.pcsp, targetpc, cache, true)
if x&(sys.PtrSize-1) != 0 {
print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
@@ -627,7 +819,7 @@
return x
}
-func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
+func pcdatavalue(f funcInfo, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
if table < 0 || table >= f.npcdata {
return -1
}
@@ -635,14 +827,14 @@
return pcvalue(f, off, targetpc, cache, true)
}
-func funcdata(f *_func, i int32) unsafe.Pointer {
+func funcdata(f funcInfo, i int32) unsafe.Pointer {
if i < 0 || i >= f.nfuncdata {
return nil
}
p := add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(f.npcdata)*4)
if sys.PtrSize == 8 && uintptr(p)&4 != 0 {
- if uintptr(unsafe.Pointer(f))&4 != 0 {
- println("runtime: misaligned func", f)
+ if uintptr(unsafe.Pointer(f._func))&4 != 0 {
+ println("runtime: misaligned func", f._func)
}
p = add(p, 4)
}
@@ -651,35 +843,47 @@
// step advances to the next pc, value pair in the encoded table.
func step(p []byte, pc *uintptr, val *int32, first bool) (newp []byte, ok bool) {
- p, uvdelta := readvarint(p)
+ // For both uvdelta and pcdelta, the common case (~70%)
+ // is that they are a single byte. If so, avoid calling readvarint.
+ uvdelta := uint32(p[0])
if uvdelta == 0 && !first {
return nil, false
}
+ n := uint32(1)
+ if uvdelta&0x80 != 0 {
+ n, uvdelta = readvarint(p)
+ }
+ p = p[n:]
if uvdelta&1 != 0 {
uvdelta = ^(uvdelta >> 1)
} else {
uvdelta >>= 1
}
vdelta := int32(uvdelta)
- p, pcdelta := readvarint(p)
+ pcdelta := uint32(p[0])
+ n = 1
+ if pcdelta&0x80 != 0 {
+ n, pcdelta = readvarint(p)
+ }
+ p = p[n:]
*pc += uintptr(pcdelta * sys.PCQuantum)
*val += vdelta
return p, true
}
// readvarint reads a varint from p.
-func readvarint(p []byte) (newp []byte, val uint32) {
- var v, shift uint32
+func readvarint(p []byte) (read uint32, val uint32) {
+ var v, shift, n uint32
for {
- b := p[0]
- p = p[1:]
- v |= (uint32(b) & 0x7F) << shift
+ b := p[n]
+ n++
+ v |= uint32(b&0x7F) << (shift & 31)
if b&0x80 == 0 {
break
}
shift += 7
}
- return p, v
+ return n, v
}
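
As a rough standalone sketch (hypothetical helper, not part of this patch), the pc/value tables combine a base-128 varint with zig-zag encoding, where the low bit of the decoded value carries the sign; this is what step undoes for each delta:

package pcsketch

// decodeZigZagVarint reads one varint from p and undoes the zig-zag
// encoding, returning the signed value and the number of bytes consumed.
func decodeZigZagVarint(p []byte) (val int32, n int) {
	var uv, shift uint32
	for {
		b := p[n]
		n++
		uv |= uint32(b&0x7F) << shift
		if b&0x80 == 0 {
			break
		}
		shift += 7
	}
	if uv&1 != 0 {
		return int32(^(uv >> 1)), n
	}
	return int32(uv >> 1), n
}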
type stackmap struct {
@@ -693,5 +897,13 @@
if n < 0 || n >= stkmap.n {
throw("stackmapdata: index out of range")
}
- return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)/8))))}
+ return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)>>3))))}
+}
+
+// inlinedCall is the encoding of entries in the FUNCDATA_InlTree table.
+type inlinedCall struct {
+ parent int32 // index of parent in the inltree, or < 0
+ file int32 // fileno index into filetab
+ line int32 // line number of the call site
+ func_ int32 // offset into pclntab for name of called function
}
diff --git a/src/runtime/symtab_test.go b/src/runtime/symtab_test.go
index b15a2e9..01e5002 100644
--- a/src/runtime/symtab_test.go
+++ b/src/runtime/symtab_test.go
@@ -26,10 +26,14 @@
}
}
+// These are marked noinline so that we can use FuncForPC
+// in testCallerBar.
+//go:noinline
func testCallerFoo(t *testing.T) {
testCallerBar(t)
}
+//go:noinline
func testCallerBar(t *testing.T) {
for i := 0; i < 2; i++ {
pc, file, line, ok := runtime.Caller(i)
@@ -88,7 +92,7 @@
} // 33
var intLit = lineNumber() + // 34
lineNumber() + // 35
- lineNumber() // 36
+ lineNumber() // 36
func trythis() { // 37
recordLines(lineNumber(), // 38
lineNumber(), // 39
@@ -150,3 +154,14 @@
}
}
}
+
+func TestNilName(t *testing.T) {
+ defer func() {
+ if ex := recover(); ex != nil {
+ t.Fatalf("expected no nil panic, got=%v", ex)
+ }
+ }()
+ if got := (*runtime.Func)(nil).Name(); got != "" {
+ t.Errorf("Name() = %q, want %q", got, "")
+ }
+}
diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s
index 200961f..5c62bfd 100644
--- a/src/runtime/sys_darwin_386.s
+++ b/src/runtime/sys_darwin_386.s
@@ -114,6 +114,16 @@
// 64-bit unix nanoseconds returned in DX:AX.
// I'd much rather write this in C but we need
// assembly for the 96-bit multiply and RDTSC.
+//
+// Note that we could arrange to return monotonic time here
+// as well, but we don't bother, for two reasons:
+// 1. macOS only supports 64-bit systems, so no one should
+// be using the 32-bit code in production.
+// This code is only maintained to make it easier for developers
+// using Macs to test the 32-bit compiler.
+// 2. On some (probably now unsupported) CPUs,
+// the code falls back to the system call always,
+// so it can't even use the comm page at all.
TEXT runtime·now(SB),NOSPLIT,$40
MOVL $0xffff0000, BP /* comm page base */
@@ -217,9 +227,15 @@
ADCL $0, DX
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$0
+// func now() (sec int64, nsec int32, mono uint64)
+TEXT time·now(SB),NOSPLIT,$0-20
CALL runtime·now(SB)
+ MOVL AX, BX
+ MOVL DX, BP
+ SUBL runtime·startNano(SB), BX
+ SBBL runtime·startNano+4(SB), BP
+ MOVL BX, mono+12(FP)
+ MOVL BP, mono+16(FP)
MOVL $1000000000, CX
DIVL CX
MOVL AX, sec+0(FP)
@@ -230,6 +246,8 @@
// func nanotime() int64
TEXT runtime·nanotime(SB),NOSPLIT,$0
CALL runtime·now(SB)
+ SUBL runtime·startNano(SB), AX
+ SBBL runtime·startNano+4(SB), DX
MOVL AX, ret_lo+0(FP)
MOVL DX, ret_hi+4(FP)
RET
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index 96fa5b9..a8dc700 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -117,14 +117,44 @@
#define gtod_ns_base 0x70
#define gtod_sec_base 0x78
-TEXT nanotime<>(SB), NOSPLIT, $32
+TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+ MOVQ $0x7fffffe00000, BP /* comm page base */
+ // Loop trying to take a consistent snapshot
+ // of the time parameters.
+timeloop:
+ MOVL nt_generation(BP), R9
+ TESTL R9, R9
+ JZ timeloop
+ RDTSC
+ MOVQ nt_tsc_base(BP), R10
+ MOVL nt_scale(BP), R11
+ MOVQ nt_ns_base(BP), R12
+ CMPL nt_generation(BP), R9
+ JNE timeloop
+
+ // Gathered all the data we need. Compute monotonic time:
+ // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
+ // The multiply and shift extracts the top 64 bits of the 96-bit product.
+ SHLQ $32, DX
+ ADDQ DX, AX
+ SUBQ R10, AX
+ MULQ R11
+ SHRQ $32, AX:DX
+ ADDQ R12, AX
+ MOVQ runtime·startNano(SB), CX
+ SUBQ CX, AX
+ MOVQ AX, ret+0(FP)
+ RET
+
+TEXT time·now(SB), NOSPLIT, $32-24
+ // Note: The 32 bytes of stack frame requested on the TEXT line
+ // are used in the systime fallback, as the timeval address
+ // filled in by the system call.
MOVQ $0x7fffffe00000, BP /* comm page base */
// Loop trying to take a consistent snapshot
// of the time parameters.
timeloop:
MOVL gtod_generation(BP), R8
- TESTL R8, R8
- JZ systime
MOVL nt_generation(BP), R9
TESTL R9, R9
JZ timeloop
@@ -139,8 +169,8 @@
CMPL gtod_generation(BP), R8
JNE timeloop
- // Gathered all the data we need. Compute time.
- // ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base - gtod_ns_base + gtod_sec_base*1e9
+ // Gathered all the data we need. Compute:
+ // monotonic_time = ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base
// The multiply and shift extracts the top 64 bits of the 96-bit product.
SHLQ $32, DX
ADDQ DX, AX
@@ -148,9 +178,33 @@
MULQ R11
SHRQ $32, AX:DX
ADDQ R12, AX
+ MOVQ AX, BX
+ MOVQ runtime·startNano(SB), CX
+ SUBQ CX, BX
+ MOVQ BX, monotonic+16(FP)
+
+ // Compute:
+ // wall_time = monotonic time - gtod_ns_base + gtod_sec_base*1e9
+ // or, if gtod_generation==0, invoke the system call.
+ TESTL R8, R8
+ JZ systime
SUBQ R13, AX
IMULQ $1000000000, R14
ADDQ R14, AX
+
+ // Split wall time into sec, nsec.
+ // generated code for
+ // func f(x uint64) (uint64, uint64) { return x/1e9, x%1e9 }
+ // adapted to reduce duplication
+ MOVQ AX, CX
+ SHRQ $9, AX
+ MOVQ $19342813113834067, DX
+ MULQ DX
+ SHRQ $11, DX
+ MOVQ DX, sec+0(FP)
+ IMULQ $1000000000, DX
+ SUBQ DX, CX
+ MOVL CX, nsec+8(FP)
RET
systime:
@@ -166,34 +220,9 @@
MOVL 8(SP), DX
inreg:
// sec is in AX, usec in DX
- // return nsec in AX
- IMULQ $1000000000, AX
IMULQ $1000, DX
- ADDQ DX, AX
- RET
-
-TEXT runtime·nanotime(SB),NOSPLIT,$0-8
- CALL nanotime<>(SB)
- MOVQ AX, ret+0(FP)
- RET
-
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$0-12
- CALL nanotime<>(SB)
-
- // generated code for
- // func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
- // adapted to reduce duplication
- MOVQ AX, CX
- MOVQ $1360296554856532783, AX
- MULQ CX
- ADDQ CX, DX
- RCRQ $1, DX
- SHRQ $29, DX
- MOVQ DX, sec+0(FP)
- IMULQ $1000000000, DX
- SUBQ DX, CX
- MOVL CX, nsec+8(FP)
+ MOVQ AX, sec+0(FP)
+ MOVL DX, nsec+8(FP)
RET
TEXT runtime·sigprocmask(SB),NOSPLIT,$0
@@ -231,14 +260,15 @@
POPQ BP
RET
-TEXT runtime·sigtramp(SB),NOSPLIT,$32
+TEXT runtime·sigtramp(SB),NOSPLIT,$40
MOVL SI, 24(SP) // save infostyle for sigreturn below
+ MOVQ R8, 32(SP) // save ctx
MOVL DX, 0(SP) // sig
MOVQ CX, 8(SP) // info
MOVQ R8, 16(SP) // ctx
MOVQ $runtime·sigtrampgo(SB), AX
CALL AX
- MOVQ 16(SP), DI // ctx
+ MOVQ 32(SP), DI // ctx
MOVL 24(SP), SI // infostyle
MOVL $(0x2000000+184), AX
SYSCALL
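
The rewritten Darwin amd64 routines read the commpage time parameters and compute the monotonic clock as ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base, then derive wall time from it. Below is a hedged Go sketch of that arithmetic; the struct and field names are stand-ins for the commpage offsets used above, not the runtime's actual representation:

    package main

    import "fmt"

    // commpage mirrors the fields the assembly reads; illustrative only.
    type commpage struct {
        ntTSCBase   uint64
        ntScale     uint32
        ntNSBase    uint64
        gtodNSBase  uint64
        gtodSecBase uint64
    }

    // top64 computes (x*scale)>>32 for a 32-bit scale without a 128-bit
    // multiply, i.e. the top 64 bits of the 96-bit product that the
    // MULQ/SHRQ sequence extracts.
    func top64(x uint64, scale uint32) uint64 {
        lo := (x & 0xffffffff) * uint64(scale) >> 32
        hi := (x >> 32) * uint64(scale)
        return hi + lo
    }

    // monotonic follows ((tsc - nt_tsc_base) * nt_scale) >> 32 + nt_ns_base.
    func monotonic(cp *commpage, tsc uint64) uint64 {
        return top64(tsc-cp.ntTSCBase, cp.ntScale) + cp.ntNSBase
    }

    // wall derives wall-clock time the way the time·now path does:
    // mono - gtod_ns_base + gtod_sec_base*1e9, then split into sec/nsec.
    func wall(cp *commpage, mono uint64) (sec uint64, nsec uint32) {
        ns := mono - cp.gtodNSBase + cp.gtodSecBase*1000000000
        return ns / 1000000000, uint32(ns % 1000000000)
    }

    func main() {
        cp := &commpage{ntScale: 1 << 30, gtodSecBase: 1500000000} // 0.25 ns per tick
        m := monotonic(cp, 12345)
        sec, nsec := wall(cp, m)
        fmt.Println(m, sec, nsec)
    }
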
diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s
index 2c03c91..ea559b5 100644
--- a/src/runtime/sys_darwin_arm.s
+++ b/src/runtime/sys_darwin_arm.s
@@ -159,7 +159,7 @@
MOVW R0, ret+12(FP)
RET
-TEXT time·now(SB), 7, $32
+TEXT runtime·walltime(SB), 7, $32
MOVW $8(R13), R0 // timeval
MOVW $0, R1 // zone
MOVW $0, R2 // see issue 16570
@@ -171,9 +171,9 @@
MOVW 12(R13), R1
inreg:
MOVW R1, R2 // usec
- MOVW R0, sec+0(FP)
+ MOVW R0, sec_lo+0(FP)
MOVW $0, R1
- MOVW R1, loc+4(FP)
+ MOVW R1, sec_hi+4(FP)
MOVW $1000, R3
MUL R3, R2
MOVW R2, nsec+8(FP)
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index c02d000..0e91d5b 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -151,7 +151,7 @@
SVC $0x80
RET
-TEXT time·now(SB),NOSPLIT,$40-12
+TEXT runtime·walltime(SB),NOSPLIT,$40-12
MOVD RSP, R0 // timeval
MOVD R0, R9 // this is how dyld calls gettimeofday
MOVW $0, R1 // zone
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index b950b69..f355268 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -148,8 +148,8 @@
SYSCALL
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVL $232, AX // clock_gettime
MOVQ $0, DI // CLOCK_REALTIME
LEAQ 8(SP), SI
diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s
index 8b6ee1f..0f5df21 100644
--- a/src/runtime/sys_freebsd_386.s
+++ b/src/runtime/sys_freebsd_386.s
@@ -159,8 +159,8 @@
INT $0x80
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVL $232, AX // clock_gettime
LEAL 12(SP), BX
MOVL $0, 4(SP) // CLOCK_REALTIME
@@ -170,8 +170,8 @@
MOVL 16(SP), BX // nsec
// sec is in AX, nsec in BX
- MOVL AX, sec+0(FP)
- MOVL $0, sec+4(FP)
+ MOVL AX, sec_lo+0(FP)
+ MOVL $0, sec_hi+4(FP)
MOVL BX, nsec+8(FP)
RET
@@ -398,4 +398,13 @@
NEGL AX
RET
+// func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-28
+ MOVL $487, AX
+ INT $0x80
+ JAE 2(PC)
+ NEGL AX
+ MOVL AX, ret+24(FP)
+ RET
+
GLOBL runtime·tlsoffset(SB),NOPTR,$4
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index 158a60d..5d072a9 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -142,8 +142,8 @@
SYSCALL
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVL $232, AX // clock_gettime
MOVQ $0, DI // CLOCK_REALTIME
LEAQ 8(SP), SI
@@ -354,3 +354,17 @@
MOVL $92, AX // fcntl
SYSCALL
RET
+
+// func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-44
+ MOVQ level+0(FP), DI
+ MOVQ which+8(FP), SI
+ MOVQ id+16(FP), DX
+ MOVQ size+24(FP), R10
+ MOVQ mask+32(FP), R8
+ MOVL $487, AX
+ SYSCALL
+ JCC 2(PC)
+ NEGQ AX
+ MOVL AX, ret+40(FP)
+ RET
diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s
index 3c5a5cb..2851587 100644
--- a/src/runtime/sys_freebsd_arm.s
+++ b/src/runtime/sys_freebsd_arm.s
@@ -39,8 +39,9 @@
#define SYS_thr_kill (SYS_BASE + 433)
#define SYS__umtx_op (SYS_BASE + 454)
#define SYS_thr_new (SYS_BASE + 455)
-#define SYS_mmap (SYS_BASE + 477)
-
+#define SYS_mmap (SYS_BASE + 477)
+#define SYS_cpuset_getaffinity (SYS_BASE + 487)
+
TEXT runtime·sys_umtx_op(SB),NOSPLIT,$0
MOVW addr+0(FP), R0
MOVW mode+4(FP), R1
@@ -166,8 +167,8 @@
SWI $0
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW $0, R0 // CLOCK_REALTIME
MOVW $8(R13), R1
MOVW $SYS_clock_gettime, R7
@@ -376,3 +377,17 @@
TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4
WORD $0xee1d0f70 // mrc p15, 0, r0, c13, c0, 3
RET
+
+// func cpuset_getaffinity(level int, which int, id int64, size int, mask *byte) int32
+TEXT runtime·cpuset_getaffinity(SB), NOSPLIT, $0-28
+ MOVW level+0(FP), R0
+ MOVW which+4(FP), R1
+ MOVW id_lo+8(FP), R2
+ MOVW id_hi+12(FP), R3
+ ADD $20, R13 // Pass size and mask on stack.
+ MOVW $SYS_cpuset_getaffinity, R7
+ SWI $0
+ RSB.CS $0, R0
+ SUB $20, R13
+ MOVW R0, ret+24(FP)
+ RET
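
The three FreeBSD ports gain a cpuset_getaffinity wrapper matching the commented Go signature. A sketch of how a caller might count CPUs from the returned mask follows; the _CPU_LEVEL_WHICH and _CPU_WHICH_PID values are assumptions taken from <sys/cpuset.h>, and the real runtime caller (in os_freebsd.go) is not part of this diff:

    package main

    import "fmt"

    const (
        _CPU_LEVEL_WHICH = 3 // assumed value of CPU_LEVEL_WHICH
        _CPU_WHICH_PID   = 2 // assumed value of CPU_WHICH_PID
        maxCPUs          = 256
    )

    // cpuset_getaffinity stands in for the assembly stub added above;
    // stubbed here so the example is self-contained.
    func cpuset_getaffinity(level, which int, id int64, size int, mask *byte) int32 {
        *mask = 0x0f // pretend CPUs 0-3 are in the set
        return 0
    }

    func countCPUs() int {
        var mask [maxCPUs / 8]byte
        // id -1 selects the current process for CPU_WHICH_PID.
        if cpuset_getaffinity(_CPU_LEVEL_WHICH, _CPU_WHICH_PID, -1, len(mask), &mask[0]) != 0 {
            return 1
        }
        n := 0
        for _, b := range mask {
            for ; b != 0; b &= b - 1 { // clear the lowest set bit
                n++
            }
        }
        return n
    }

    func main() { fmt.Println(countCPUs()) } // 4 with the stub above
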
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 45320c0..6061833 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -98,15 +98,18 @@
MOVL $1000000, CX
DIVL CX
MOVL AX, 0(SP)
+ MOVL $1000, AX // usec to nsec
+ MULL DX
MOVL DX, 4(SP)
- // select(0, 0, 0, 0, &tv)
- MOVL $142, AX
+ // pselect6(0, 0, 0, 0, &ts, 0)
+ MOVL $308, AX
MOVL $0, BX
MOVL $0, CX
MOVL $0, DX
MOVL $0, SI
LEAL 0(SP), DI
+ MOVL $0, BP
INVOKE_SYSCALL
RET
@@ -151,8 +154,8 @@
MOVL AX, ret+12(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVL $265, AX // syscall - clock_gettime
MOVL $0, BX // CLOCK_REALTIME
LEAL 8(SP), CX
@@ -162,8 +165,8 @@
MOVL 12(SP), BX // nsec
// sec is in AX, nsec in BX
- MOVL AX, sec+0(FP)
- MOVL $0, sec+4(FP)
+ MOVL AX, sec_lo+0(FP)
+ MOVL $0, sec_hi+4(FP)
MOVL BX, nsec+8(FP)
RET
@@ -596,3 +599,12 @@
INVOKE_SYSCALL
MOVL AX, ret+12(FP)
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
+ // Implemented as brk(NULL).
+ MOVL $45, AX // syscall - brk
+ MOVL $0, BX // NULL
+ INVOKE_SYSCALL
+ MOVL AX, ret+0(FP)
+ RET
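
usleep on linux/386 now builds a timespec for pselect6 instead of a timeval for select, so the microsecond remainder is scaled to nanoseconds. A tiny Go sketch of that split, illustrative only:

    package main

    import "fmt"

    // splitUsec mirrors what the usleep stubs now do before calling
    // pselect6: whole seconds plus a nanosecond remainder for the timespec.
    func splitUsec(usec uint32) (sec, nsec uint32) {
        return usec / 1000000, (usec % 1000000) * 1000
    }

    func main() {
        sec, nsec := splitUsec(1500000) // 1.5 seconds
        fmt.Println(sec, nsec)          // 1 500000000
    }
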
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index 6ddcb30..e0dc3e1 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -82,15 +82,18 @@
MOVL $1000000, CX
DIVL CX
MOVQ AX, 0(SP)
- MOVQ DX, 8(SP)
+ MOVL $1000, AX // usec to nsec
+ MULL DX
+ MOVQ AX, 8(SP)
- // select(0, 0, 0, 0, &tv)
+ // pselect6(0, 0, 0, 0, &ts, 0)
MOVL $0, DI
MOVL $0, SI
MOVL $0, DX
MOVL $0, R10
MOVQ SP, R8
- MOVL $23, AX
+ MOVL $0, R9
+ MOVL $270, AX
SYSCALL
RET
@@ -135,8 +138,8 @@
MOVL AX, ret+24(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$16
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16
// Be careful. We're calling a function with gcc calling convention here.
// We're guaranteed 128 bytes on entry, and we've taken 16, and the
// call uses another 8.
@@ -390,7 +393,7 @@
MOVQ AX, ret+32(FP)
RET
-TEXT runtime·munmap(SB),NOSPLIT,$0
+TEXT runtime·sysMunmap(SB),NOSPLIT,$0
MOVQ addr+0(FP), DI
MOVQ n+8(FP), SI
MOVQ $11, AX // munmap
@@ -400,6 +403,19 @@
MOVL $0xf1, 0xf1 // crash
RET
+// Call the function stored in _cgo_munmap using the GCC calling convention.
+// This must be called on the system stack.
+TEXT runtime·callCgoMunmap(SB),NOSPLIT,$16-16
+ MOVQ addr+0(FP), DI
+ MOVQ n+8(FP), SI
+ MOVQ _cgo_munmap(SB), AX
+ MOVQ SP, BX
+ ANDQ $~15, SP // alignment as per amd64 psABI
+ MOVQ BX, 0(SP)
+ CALL AX
+ MOVQ 0(SP), SP
+ RET
+
TEXT runtime·madvise(SB),NOSPLIT,$0
MOVQ addr+0(FP), DI
MOVQ n+8(FP), SI
@@ -598,3 +614,12 @@
SYSCALL
MOVL AX, ret+16(FP)
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
+ // Implemented as brk(NULL).
+ MOVQ $0, DI
+ MOVL $12, AX // syscall entry
+ SYSCALL
+ MOVQ AX, ret+0(FP)
+ RET
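
Renaming munmap to sysMunmap and adding callCgoMunmap lets a Go-level wrapper route unmapping through a C hook (_cgo_munmap) when one is registered, mirroring the existing mmap hook; the wrapper itself is not in this diff. A self-contained sketch of that dispatch pattern, with the hook modeled as a plain function variable:

    package main

    import "fmt"

    var cgoMunmapHook func(addr, n uintptr) // stands in for _cgo_munmap

    func sysMunmapStub(addr, n uintptr) { fmt.Println("sysMunmap", addr, n) }

    func munmap(addr, n uintptr) {
        if cgoMunmapHook != nil {
            cgoMunmapHook(addr, n) // the real code switches to the system stack first
            return
        }
        sysMunmapStub(addr, n)
    }

    func main() {
        munmap(0x1000, 4096) // no hook: raw syscall stub
        cgoMunmapHook = func(addr, n uintptr) { fmt.Println("cgo munmap", addr, n) }
        munmap(0x2000, 4096) // hook installed: routed through C
    }
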
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 666b879..64beed8 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -36,7 +36,7 @@
#define SYS_gettid (SYS_BASE + 224)
#define SYS_tkill (SYS_BASE + 238)
#define SYS_sched_yield (SYS_BASE + 158)
-#define SYS_select (SYS_BASE + 142) // newselect
+#define SYS_pselect6 (SYS_BASE + 335)
#define SYS_ugetrlimit (SYS_BASE + 191)
#define SYS_sched_getaffinity (SYS_BASE + 242)
#define SYS_clock_gettime (SYS_BASE + 263)
@@ -48,6 +48,7 @@
#define SYS_access (SYS_BASE + 33)
#define SYS_connect (SYS_BASE + 283)
#define SYS_socket (SYS_BASE + 281)
+#define SYS_brk (SYS_BASE + 45)
#define ARM_BASE (SYS_BASE + 0x0f0000)
@@ -197,7 +198,7 @@
MOVW R0, ret+12(FP)
RET
-TEXT time·now(SB), NOSPLIT, $32
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW $0, R0 // CLOCK_REALTIME
MOVW $8(R13), R1 // timespec
MOVW $SYS_clock_gettime, R7
@@ -206,9 +207,9 @@
MOVW 8(R13), R0 // sec
MOVW 12(R13), R2 // nsec
- MOVW R0, sec+0(FP)
+ MOVW R0, sec_lo+0(FP)
MOVW $0, R1
- MOVW R1, loc+4(FP)
+ MOVW R1, sec_hi+4(FP)
MOVW R2, nsec+8(FP)
RET
@@ -387,13 +388,16 @@
MOVW usec+0(FP), R0
CALL runtime·usplitR0(SB)
MOVW R0, 4(R13)
+ MOVW $1000, R0 // usec to nsec
+ MUL R0, R1
MOVW R1, 8(R13)
MOVW $0, R0
MOVW $0, R1
MOVW $0, R2
MOVW $0, R3
MOVW $4(R13), R4
- MOVW $SYS_select, R7
+ MOVW $0, R5
+ MOVW $SYS_pselect6, R7
SWI $0
RET
@@ -504,3 +508,12 @@
SWI $0
MOVW R0, ret+12(FP)
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
+ // Implemented as brk(NULL).
+ MOVW $0, R0
+ MOVW $SYS_brk, R7
+ SWI $0
+ MOVW R0, ret+0(FP)
+ RET
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index 1b91b44..e921f99 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -46,6 +46,7 @@
#define SYS_faccessat 48
#define SYS_socket 198
#define SYS_connect 203
+#define SYS_brk 214
TEXT runtime·exit(SB),NOSPLIT,$-8-4
MOVW code+0(FP), R0
@@ -182,8 +183,8 @@
MOVW R0, ret+24(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$24-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$24-12
MOVW $0, R0 // CLOCK_REALTIME
MOVD RSP, R1
MOVD $SYS_clock_gettime, R8
@@ -483,3 +484,12 @@
SVC
MOVW R0, ret+16(FP)
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-8
+ // Implemented as brk(NULL).
+ MOVD $0, R0
+ MOVD $SYS_brk, R8
+ SVC
+ MOVD R0, ret+0(FP)
+ RET
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
index 5a75bb8..27de7b0 100644
--- a/src/runtime/sys_linux_mips64x.s
+++ b/src/runtime/sys_linux_mips64x.s
@@ -45,6 +45,7 @@
#define SYS_epoll_wait 5209
#define SYS_clock_gettime 5222
#define SYS_epoll_create1 5285
+#define SYS_brk 5012
TEXT runtime·exit(SB),NOSPLIT,$-8-4
MOVW code+0(FP), R4
@@ -172,8 +173,8 @@
MOVW R2, ret+24(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$16
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVW $0, R4 // CLOCK_REALTIME
MOVV $0(R29), R5
MOVV $SYS_clock_gettime, R2
@@ -426,3 +427,12 @@
MOVV $SYS_fcntl, R2
SYSCALL
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$-8-8
+ // Implemented as brk(NULL).
+ MOVV $0, R4
+ MOVV $SYS_brk, R2
+ SYSCALL
+ MOVV R2, ret+0(FP)
+ RET
diff --git a/src/runtime/sys_linux_mipsx.s b/src/runtime/sys_linux_mipsx.s
index 73ce061..39bd731 100644
--- a/src/runtime/sys_linux_mipsx.s
+++ b/src/runtime/sys_linux_mipsx.s
@@ -45,6 +45,7 @@
#define SYS_epoll_wait 4250
#define SYS_clock_gettime 4263
#define SYS_epoll_create1 4326
+#define SYS_brk 4045
TEXT runtime·exit(SB),NOSPLIT,$0-4
MOVW code+0(FP), R4
@@ -175,8 +176,8 @@
MOVW R2, ret+12(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
MOVW $0, R4 // CLOCK_REALTIME
MOVW $4(R29), R5
MOVW $SYS_clock_gettime, R2
@@ -323,7 +324,7 @@
RET
-// int32 clone(int32 flags, void *stk, M *mm, G *gg, void (*fn)(void));
+// int32 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void));
TEXT runtime·clone(SB),NOSPLIT,$-4-24
MOVW flags+0(FP), R4
MOVW stk+4(FP), R5
@@ -335,9 +336,9 @@
// stack so that any syscall invoked immediately in the new thread won't fail.
ADD $-32, R5
- // Copy mm, gg, fn off parent stack for use by child.
- MOVW mm+8(FP), R16
- MOVW gg+12(FP), R17
+ // Copy mp, gp, fn off parent stack for use by child.
+ MOVW mp+8(FP), R16
+ MOVW gp+12(FP), R17
MOVW fn+16(FP), R18
MOVW $1234, R1
@@ -465,3 +466,12 @@
MOVW $SYS_fcntl, R2
SYSCALL
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT,$0-4
+ // Implemented as brk(NULL).
+ MOVW $0, R4
+ MOVW $SYS_brk, R2
+ SYSCALL
+ MOVW R2, ret+0(FP)
+ RET
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index a40fe3b..2b2aa61 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -21,6 +21,7 @@
#define SYS_close 6
#define SYS_getpid 20
#define SYS_kill 37
+#define SYS_brk 45
#define SYS_fcntl 55
#define SYS_gettimeofday 78
#define SYS_select 82 // always return -ENOSYS
@@ -157,8 +158,8 @@
MOVW R3, ret+24(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$16
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVD $0, R3 // CLOCK_REALTIME
MOVD $0(R1), R4
SYSCALL $SYS_clock_gettime
@@ -189,7 +190,7 @@
MOVW size+24(FP), R6
SYSCALL $SYS_rt_sigprocmask
BVC 2(PC)
- MOVD R0, 0xf1(R0) // crash
+ MOVD R0, 0xf0(R0) // crash
RET
TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
@@ -273,7 +274,7 @@
MOVD n+8(FP), R4
SYSCALL $SYS_munmap
BVC 2(PC)
- MOVD R0, 0xf3(R0)
+ MOVD R0, 0xf0(R0)
RET
TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
@@ -366,7 +367,7 @@
MOVD old+8(FP), R4
SYSCALL $SYS_sigaltstack
BVC 2(PC)
- MOVD R0, 0xf1(R0) // crash
+ MOVD R0, 0xf0(R0) // crash
RET
TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
@@ -422,3 +423,11 @@
MOVD $1, R5 // FD_CLOEXEC
SYSCALL $SYS_fcntl
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0
+ // Implemented as brk(NULL).
+ MOVD $0, R3
+ SYSCALL $SYS_brk
+ MOVD R3, ret+0(FP)
+ RET
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
index 47f34d9..b8099e2 100644
--- a/src/runtime/sys_linux_s390x.s
+++ b/src/runtime/sys_linux_s390x.s
@@ -16,6 +16,7 @@
#define SYS_close 6
#define SYS_getpid 20
#define SYS_kill 37
+#define SYS_brk 45
#define SYS_fcntl 55
#define SYS_gettimeofday 78
#define SYS_mmap 90
@@ -169,8 +170,8 @@
MOVW R2, ret+24(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$16
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVW $0, R2 // CLOCK_REALTIME
MOVD $tp-16(SP), R3
MOVW $SYS_clock_gettime, R1
@@ -434,3 +435,12 @@
MOVW $SYS_fcntl, R1
SYSCALL
RET
+
+// func sbrk0() uintptr
+TEXT runtime·sbrk0(SB),NOSPLIT|NOFRAME,$0-8
+ // Implemented as brk(NULL).
+ MOVD $0, R2
+ MOVW $SYS_brk, R1
+ SYSCALL
+ MOVD R2, ret+0(FP)
+ RET
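
Each Linux port also gains sbrk0, which issues brk with a NULL argument: the kernel reports the current program break without moving it. The Go-side caller is not shown in this diff. A user-space, Linux-only sketch of the same query via the syscall package:

    package main

    import (
        "fmt"
        "syscall"
    )

    // sbrk0 queries the current program break: brk(0) is rejected as a
    // request, so the kernel just returns the existing break address.
    func sbrk0() uintptr {
        brk, _, _ := syscall.Syscall(syscall.SYS_BRK, 0, 0, 0)
        return brk
    }

    func main() {
        fmt.Printf("program break: %#x\n", sbrk0())
    }
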
diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s
index 05de20c..d945453 100644
--- a/src/runtime/sys_nacl_386.s
+++ b/src/runtime/sys_nacl_386.s
@@ -233,7 +233,7 @@
MOVL AX, ret+24(FP)
RET
-TEXT time·now(SB),NOSPLIT,$20
+TEXT runtime·walltime(SB),NOSPLIT,$20
MOVL $0, 0(SP) // real time clock
LEAL 8(SP), AX
MOVL AX, 4(SP) // timespec
@@ -243,13 +243,13 @@
MOVL 16(SP), BX // nsec
// sec is in AX, nsec in BX
- MOVL AX, sec+0(FP)
- MOVL CX, sec+4(FP)
+ MOVL AX, sec_lo+0(FP)
+ MOVL CX, sec_hi+4(FP)
MOVL BX, nsec+8(FP)
RET
TEXT syscall·now(SB),NOSPLIT,$0
- JMP time·now(SB)
+ JMP runtime·walltime(SB)
TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$8
MOVL arg1+0(FP), AX
diff --git a/src/runtime/sys_nacl_amd64p32.s b/src/runtime/sys_nacl_amd64p32.s
index c2a24e8..2a39983 100644
--- a/src/runtime/sys_nacl_amd64p32.s
+++ b/src/runtime/sys_nacl_amd64p32.s
@@ -242,7 +242,7 @@
MOVL AX, ret+24(FP)
RET
-TEXT time·now(SB),NOSPLIT,$16
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVQ runtime·faketime(SB), AX
CMPQ AX, $0
JEQ realtime
@@ -262,13 +262,13 @@
MOVL 8(SP), BX // nsec
// sec is in AX, nsec in BX
- MOVL AX, sec+0(FP)
- MOVL CX, sec+4(FP)
+ MOVL AX, sec_lo+0(FP)
+ MOVL CX, sec_hi+4(FP)
MOVL BX, nsec+8(FP)
RET
TEXT syscall·now(SB),NOSPLIT,$0
- JMP time·now(SB)
+ JMP runtime·walltime(SB)
TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$0
MOVL arg1+0(FP), DI
@@ -366,40 +366,40 @@
// 136(SI) is saved EFLAGS, never to be seen again
JMP SI
-debughandler:
- // print basic information
- LEAL ctxt+0(FP), DI
- MOVL $runtime·sigtrampf(SB), AX
- MOVL AX, 0(SP)
- MOVQ (16*4+16*8)(DI), BX // rip
- MOVQ BX, 8(SP)
- MOVQ (16*4+0*8)(DI), BX // rax
- MOVQ BX, 16(SP)
- MOVQ (16*4+1*8)(DI), BX // rcx
- MOVQ BX, 24(SP)
- MOVQ (16*4+2*8)(DI), BX // rdx
- MOVQ BX, 32(SP)
- MOVQ (16*4+3*8)(DI), BX // rbx
- MOVQ BX, 40(SP)
- MOVQ (16*4+7*8)(DI), BX // rdi
- MOVQ BX, 48(SP)
- MOVQ (16*4+15*8)(DI), BX // r15
- MOVQ BX, 56(SP)
- MOVQ (16*4+4*8)(DI), BX // rsp
- MOVQ 0(BX), BX
- MOVQ BX, 64(SP)
- CALL runtime·printf(SB)
-
- LEAL ctxt+0(FP), DI
- MOVQ (16*4+16*8)(DI), BX // rip
- MOVL BX, 0(SP)
- MOVQ (16*4+4*8)(DI), BX // rsp
- MOVL BX, 4(SP)
- MOVL $0, 8(SP) // lr
- get_tls(CX)
- MOVL g(CX), BX
- MOVL BX, 12(SP) // gp
- CALL runtime·traceback(SB)
+//debughandler:
+ //// print basic information
+ //LEAL ctxt+0(FP), DI
+ //MOVL $runtime·sigtrampf(SB), AX
+ //MOVL AX, 0(SP)
+ //MOVQ (16*4+16*8)(DI), BX // rip
+ //MOVQ BX, 8(SP)
+ //MOVQ (16*4+0*8)(DI), BX // rax
+ //MOVQ BX, 16(SP)
+ //MOVQ (16*4+1*8)(DI), BX // rcx
+ //MOVQ BX, 24(SP)
+ //MOVQ (16*4+2*8)(DI), BX // rdx
+ //MOVQ BX, 32(SP)
+ //MOVQ (16*4+3*8)(DI), BX // rbx
+ //MOVQ BX, 40(SP)
+ //MOVQ (16*4+7*8)(DI), BX // rdi
+ //MOVQ BX, 48(SP)
+ //MOVQ (16*4+15*8)(DI), BX // r15
+ //MOVQ BX, 56(SP)
+ //MOVQ (16*4+4*8)(DI), BX // rsp
+ //MOVQ 0(BX), BX
+ //MOVQ BX, 64(SP)
+ //CALL runtime·printf(SB)
+ //
+ //LEAL ctxt+0(FP), DI
+ //MOVQ (16*4+16*8)(DI), BX // rip
+ //MOVL BX, 0(SP)
+ //MOVQ (16*4+4*8)(DI), BX // rsp
+ //MOVL BX, 4(SP)
+ //MOVL $0, 8(SP) // lr
+ //get_tls(CX)
+ //MOVL g(CX), BX
+ //MOVL BX, 12(SP) // gp
+ //CALL runtime·traceback(SB)
notls:
MOVL 0, AX
diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s
index 6cbc23f..6a6ef4e 100644
--- a/src/runtime/sys_nacl_arm.s
+++ b/src/runtime/sys_nacl_arm.s
@@ -196,20 +196,20 @@
MOVW R0, ret+24(FP)
RET
-TEXT time·now(SB),NOSPLIT,$16
+TEXT runtime·walltime(SB),NOSPLIT,$16
MOVW $0, R0 // real time clock
MOVW $4(R13), R1
NACL_SYSCALL(SYS_clock_gettime)
MOVW 4(R13), R0 // low 32-bit sec
MOVW 8(R13), R1 // high 32-bit sec
MOVW 12(R13), R2 // nsec
- MOVW R0, sec+0(FP)
- MOVW R1, sec+4(FP)
- MOVW R2, sec+8(FP)
+ MOVW R0, sec_lo+0(FP)
+ MOVW R1, sec_hi+4(FP)
+ MOVW R2, nsec+8(FP)
RET
TEXT syscall·now(SB),NOSPLIT,$0
- B time·now(SB)
+ B runtime·walltime(SB)
TEXT runtime·nacl_clock_gettime(SB),NOSPLIT,$0
MOVW arg1+0(FP), R0
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
index 8c4f004..742193c 100644
--- a/src/runtime/sys_netbsd_386.s
+++ b/src/runtime/sys_netbsd_386.s
@@ -134,8 +134,8 @@
INT $0x80
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
LEAL 12(SP), BX
MOVL $0, 4(SP) // arg 1 - clock_id
MOVL BX, 8(SP) // arg 2 - tp
@@ -143,9 +143,9 @@
INT $0x80
MOVL 12(SP), AX // sec - l32
- MOVL AX, sec+0(FP)
+ MOVL AX, sec_lo+0(FP)
MOVL 16(SP), AX // sec - h32
- MOVL AX, sec+4(FP)
+ MOVL AX, sec_hi+4(FP)
MOVL 20(SP), BX // nsec
MOVL BX, nsec+8(FP)
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index 7c7771b..c632a0b 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -169,8 +169,8 @@
SYSCALL
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVQ $0, DI // arg 1 - clock_id
LEAQ 8(SP), SI // arg 2 - tp
MOVL $427, AX // sys_clock_gettime
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index a8914c1..789b12e 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -137,8 +137,8 @@
SWI $0xa001a9 // sys_setitimer
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW $0, R0 // CLOCK_REALTIME
MOVW $8(R13), R1
SWI $0xa001ab // clock_gettime
diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s
index 76d22b0..fb2a688 100644
--- a/src/runtime/sys_openbsd_386.s
+++ b/src/runtime/sys_openbsd_386.s
@@ -140,8 +140,8 @@
INT $0x80
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
LEAL 12(SP), BX
MOVL $0, 4(SP) // arg 1 - clock_id
MOVL BX, 8(SP) // arg 2 - tp
@@ -149,9 +149,9 @@
INT $0x80
MOVL 12(SP), AX // sec - l32
- MOVL AX, sec+0(FP)
+ MOVL AX, sec_lo+0(FP)
MOVL 16(SP), AX // sec - h32
- MOVL AX, sec+4(FP)
+ MOVL AX, sec_hi+4(FP)
MOVL 20(SP), BX // nsec
MOVL BX, nsec+8(FP)
diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s
index cf7a3fb..9a52e5d 100644
--- a/src/runtime/sys_openbsd_amd64.s
+++ b/src/runtime/sys_openbsd_amd64.s
@@ -180,8 +180,8 @@
SYSCALL
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVQ $0, DI // arg 1 - clock_id
LEAQ 8(SP), SI // arg 2 - tp
MOVL $87, AX // sys_clock_gettime
diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s
index f573a02..93a5d5b 100644
--- a/src/runtime/sys_openbsd_arm.s
+++ b/src/runtime/sys_openbsd_arm.s
@@ -150,8 +150,8 @@
SWI $0
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB), NOSPLIT, $32
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB), NOSPLIT, $32
MOVW CLOCK_REALTIME, R0 // arg 1 - clock_id
MOVW $8(R13), R1 // arg 2 - tp
MOVW $87, R12 // sys_clock_gettime
diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s
index 41aa2fd..688bd23 100644
--- a/src/runtime/sys_plan9_386.s
+++ b/src/runtime/sys_plan9_386.s
@@ -102,16 +102,16 @@
MOVL $-1, ret_hi+8(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·nanotime(SB)
MOVL 0(SP), AX
MOVL 4(SP), DX
MOVL $1000000000, CX
DIVL CX
- MOVL AX, sec+0(FP)
- MOVL $0, sec+4(FP)
+ MOVL AX, sec_lo+0(FP)
+ MOVL $0, sec_hi+4(FP)
MOVL DX, nsec+8(FP)
RET
diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s
index 149505f..d7bd92c 100644
--- a/src/runtime/sys_plan9_amd64.s
+++ b/src/runtime/sys_plan9_amd64.s
@@ -92,8 +92,8 @@
MOVQ AX, ret+8(FP)
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·nanotime(SB)
MOVQ 0(SP), AX
diff --git a/src/runtime/sys_plan9_arm.s b/src/runtime/sys_plan9_arm.s
index d54f56f..94a6f63 100644
--- a/src/runtime/sys_plan9_arm.s
+++ b/src/runtime/sys_plan9_arm.s
@@ -54,21 +54,21 @@
//func open(name *byte, mode, perm int32) int32
TEXT runtime·open(SB),NOSPLIT,$0-16
MOVW $SYS_OPEN, R0
- SWI 0
+ SWI $0
MOVW R0, ret+12(FP)
RET
//func pread(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32
TEXT runtime·pread(SB),NOSPLIT,$0-24
MOVW $SYS_PREAD, R0
- SWI 0
+ SWI $0
MOVW R0, ret+20(FP)
RET
//func pwrite(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32
TEXT runtime·pwrite(SB),NOSPLIT,$0-24
MOVW $SYS_PWRITE, R0
- SWI 0
+ SWI $0
MOVW R0, ret+20(FP)
RET
@@ -79,7 +79,7 @@
MOVW R0, 0(R13)
MOVW.W R1, -4(R13)
MOVW $SYS_SEEK, R0
- SWI 0
+ SWI $0
MOVW.W R1, 4(R13)
CMP $-1, R0
MOVW.EQ R0, ret_lo+16(FP)
@@ -89,48 +89,48 @@
//func closefd(fd int32) int32
TEXT runtime·closefd(SB),NOSPLIT,$0-8
MOVW $SYS_CLOSE, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
//func exits(msg *byte)
TEXT runtime·exits(SB),NOSPLIT,$0-4
MOVW $SYS_EXITS, R0
- SWI 0
+ SWI $0
RET
//func brk_(addr unsafe.Pointer) int32
TEXT runtime·brk_(SB),NOSPLIT,$0-8
MOVW $SYS_BRK_, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
//func sleep(ms int32) int32
TEXT runtime·sleep(SB),NOSPLIT,$0-8
MOVW $SYS_SLEEP, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
//func plan9_semacquire(addr *uint32, block int32) int32
TEXT runtime·plan9_semacquire(SB),NOSPLIT,$0-12
MOVW $SYS_SEMACQUIRE, R0
- SWI 0
+ SWI $0
MOVW R0, ret+8(FP)
RET
//func plan9_tsemacquire(addr *uint32, ms int32) int32
TEXT runtime·plan9_tsemacquire(SB),NOSPLIT,$0-12
MOVW $SYS_TSEMACQUIRE, R0
- SWI 0
+ SWI $0
MOVW R0, ret+8(FP)
RET
//func nsec(*int64) int64
TEXT runtime·nsec(SB),NOSPLIT,$-4-12
MOVW $SYS_NSEC, R0
- SWI 0
+ SWI $0
MOVW arg+0(FP), R1
MOVW 0(R1), R0
MOVW R0, ret_lo+4(FP)
@@ -139,12 +139,12 @@
RET
// time.now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$12-12
+TEXT runtime·walltime(SB),NOSPLIT,$12-12
// use nsec system call to get current time in nanoseconds
MOVW $sysnsec_lo-8(SP), R0 // destination addr
MOVW R0,res-12(SP)
MOVW $SYS_NSEC, R0
- SWI 0
+ SWI $0
MOVW sysnsec_lo-8(SP), R1 // R1:R2 = nsec
MOVW sysnsec_hi-4(SP), R2
@@ -181,28 +181,28 @@
//func notify(fn unsafe.Pointer) int32
TEXT runtime·notify(SB),NOSPLIT,$0-8
MOVW $SYS_NOTIFY, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
//func noted(mode int32) int32
TEXT runtime·noted(SB),NOSPLIT,$0-8
MOVW $SYS_NOTED, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
//func plan9_semrelease(addr *uint32, count int32) int32
TEXT runtime·plan9_semrelease(SB),NOSPLIT,$0-12
MOVW $SYS_SEMRELEASE, R0
- SWI 0
+ SWI $0
MOVW R0, ret+8(FP)
RET
//func rfork(flags int32) int32
TEXT runtime·rfork(SB),NOSPLIT,$0-8
MOVW $SYS_RFORK, R0
- SWI 0
+ SWI $0
MOVW R0, ret+4(FP)
RET
@@ -297,7 +297,7 @@
MOVW $ERRMAX, R2
MOVW R2, ret_len+4(FP)
MOVW $SYS_ERRSTR, R0
- SWI 0
+ SWI $0
MOVW R1, R2
MOVBU 0(R2), R0
CMP $0, R0
diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s
index c542db3..aeb2e2c 100644
--- a/src/runtime/sys_solaris_amd64.s
+++ b/src/runtime/sys_solaris_amd64.s
@@ -354,8 +354,8 @@
CALL AX
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
+// func walltime() (sec int64, nsec int32)
+TEXT runtime·walltime(SB),NOSPLIT,$8-12
CALL runtime·nanotime(SB)
MOVQ 0(SP), AX
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index bd5de33..128e8ab 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -152,7 +152,7 @@
// RET 4 (return and pop 4 bytes parameters)
BYTE $0xC2; WORD $4
RET // unreached; make assembler happy
-
+
TEXT runtime·exceptiontramp(SB),NOSPLIT,$0
MOVL $runtime·exceptionhandler(SB), AX
JMP runtime·sigtramp(SB)
@@ -432,15 +432,113 @@
MOVL BP, SP
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
- CALL runtime·unixnano(SB)
- MOVL 0(SP), AX
- MOVL 4(SP), DX
+// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
+// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
+#define _INTERRUPT_TIME 0x7ffe0008
+#define _SYSTEM_TIME 0x7ffe0014
+#define time_lo 0
+#define time_hi1 4
+#define time_hi2 8
+TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+loop:
+ MOVL (_INTERRUPT_TIME+time_hi1), AX
+ MOVL (_INTERRUPT_TIME+time_lo), CX
+ MOVL (_INTERRUPT_TIME+time_hi2), DI
+ CMPL AX, DI
+ JNE loop
+
+ // wintime = DI:CX, multiply by 100
+ MOVL $100, AX
+ MULL CX
+ IMULL $100, DI
+ ADDL DI, DX
+ // wintime*100 = DX:AX, subtract startNano and return
+ SUBL runtime·startNano+0(SB), AX
+ SBBL runtime·startNano+4(SB), DX
+ MOVL AX, ret_lo+0(FP)
+ MOVL DX, ret_hi+4(FP)
+ RET
+useQPC:
+ JMP runtime·nanotimeQPC(SB)
+ RET
+
+TEXT time·now(SB),NOSPLIT,$0-20
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+loop:
+ MOVL (_INTERRUPT_TIME+time_hi1), AX
+ MOVL (_INTERRUPT_TIME+time_lo), CX
+ MOVL (_INTERRUPT_TIME+time_hi2), DI
+ CMPL AX, DI
+ JNE loop
+
+ // w = DI:CX
+ // multiply by 100
+ MOVL $100, AX
+ MULL CX
+ IMULL $100, DI
+ ADDL DI, DX
+ // w*100 = DX:AX
+ // subtract startNano and save for return
+ SUBL runtime·startNano+0(SB), AX
+ SBBL runtime·startNano+4(SB), DX
+ MOVL AX, mono+12(FP)
+ MOVL DX, mono+16(FP)
+
+wall:
+ MOVL (_SYSTEM_TIME+time_hi1), CX
+ MOVL (_SYSTEM_TIME+time_lo), AX
+ MOVL (_SYSTEM_TIME+time_hi2), DX
+ CMPL CX, DX
+ JNE wall
+
+ // w = DX:AX
+ // convert to Unix epoch (but still 100ns units)
+ #define delta 116444736000000000
+ SUBL $(delta & 0xFFFFFFFF), AX
+ SBBL $(delta >> 32), DX
+
+ // nano/100 = DX:AX
+ // split into two decimal halves by div 1e9.
+ // (decimal point is two spots over from correct place,
+ // but we avoid overflow in the high word.)
MOVL $1000000000, CX
DIVL CX
+ MOVL AX, DI
+ MOVL DX, SI
+
+ // DI = nano/100/1e9 = nano/1e11 = sec/100, DX = SI = nano/100%1e9
+ // split DX into seconds and nanoseconds by div 1e7 magic multiply.
+ MOVL DX, AX
+ MOVL $1801439851, CX
+ MULL CX
+ SHRL $22, DX
+ MOVL DX, BX
+ IMULL $10000000, DX
+ MOVL SI, CX
+ SUBL DX, CX
+
+ // DI = sec/100 (still)
+ // BX = (nano/100%1e9)/1e7 = (nano/1e9)%100 = sec%100
+ // CX = (nano/100%1e9)%1e7 = (nano%1e9)/100 = nsec/100
+ // store nsec for return
+ IMULL $100, CX
+ MOVL CX, nsec+8(FP)
+
+ // DI = sec/100 (still)
+ // BX = sec%100
+ // construct DX:AX = 64-bit sec and store for return
+ MOVL $0, DX
+ MOVL $100, AX
+ MULL DI
+ ADDL BX, AX
+ ADCL $0, DX
MOVL AX, sec+0(FP)
- MOVL $0, sec+4(FP)
- MOVL DX, nsec+8(FP)
+ MOVL DX, sec+4(FP)
+ RET
+useQPC:
+ JMP runtime·nowQPC(SB)
RET
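
The new 386 time·now splits the 100 ns remainder with a magic multiply: (x * 1801439851) >> 54 equals x/1e7 for every value that reaches it, because 1801439851 is ceil(2^54 / 1e7) and the inputs are below 1e9. A small Go check of that identity over a sample of the input range:

    package main

    import "fmt"

    func main() {
        const magic = 1801439851 // ceil(2^54 / 1e7)
        for x := uint64(0); x < 1000000000; x += 999983 { // sample the range
            got := (x * magic) >> 54
            if got != x/10000000 {
                fmt.Println("mismatch at", x)
                return
            }
        }
        fmt.Println("magic divide by 1e7 verified on sampled range")
    }
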
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index c61b79d..744e78c 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -465,10 +465,62 @@
MOVQ 32(SP), SP
RET
-// func now() (sec int64, nsec int32)
-TEXT time·now(SB),NOSPLIT,$8-12
- CALL runtime·unixnano(SB)
- MOVQ 0(SP), AX
+// See http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
+// Must read hi1, then lo, then hi2. The snapshot is valid if hi1 == hi2.
+#define _INTERRUPT_TIME 0x7ffe0008
+#define _SYSTEM_TIME 0x7ffe0014
+#define time_lo 0
+#define time_hi1 4
+#define time_hi2 8
+
+TEXT runtime·nanotime(SB),NOSPLIT,$0-8
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+ MOVQ $_INTERRUPT_TIME, DI
+loop:
+ MOVL time_hi1(DI), AX
+ MOVL time_lo(DI), BX
+ MOVL time_hi2(DI), CX
+ CMPL AX, CX
+ JNE loop
+ SHLQ $32, CX
+ ORQ BX, CX
+ IMULQ $100, CX
+ SUBQ runtime·startNano(SB), CX
+ MOVQ CX, ret+0(FP)
+ RET
+useQPC:
+ JMP runtime·nanotimeQPC(SB)
+ RET
+
+TEXT time·now(SB),NOSPLIT,$0-24
+ CMPB runtime·useQPCTime(SB), $0
+ JNE useQPC
+ MOVQ $_INTERRUPT_TIME, DI
+loop:
+ MOVL time_hi1(DI), AX
+ MOVL time_lo(DI), BX
+ MOVL time_hi2(DI), CX
+ CMPL AX, CX
+ JNE loop
+ SHLQ $32, AX
+ ORQ BX, AX
+ IMULQ $100, AX
+ SUBQ runtime·startNano(SB), AX
+ MOVQ AX, mono+16(FP)
+
+ MOVQ $_SYSTEM_TIME, DI
+wall:
+ MOVL time_hi1(DI), AX
+ MOVL time_lo(DI), BX
+ MOVL time_hi2(DI), CX
+ CMPL AX, CX
+ JNE wall
+ SHLQ $32, AX
+ ORQ BX, AX
+ MOVQ $116444736000000000, DI
+ SUBQ DI, AX
+ IMULQ $100, AX
// generated code for
// func f(x uint64) (uint64, uint64) { return x/1000000000, x%100000000 }
@@ -484,4 +536,6 @@
SUBQ DX, CX
MOVL CX, nsec+8(FP)
RET
-
+useQPC:
+ JMP runtime·nowQPC(SB)
+ RET
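
Both Windows ports now read the KUSER_SHARED_DATA clocks directly. The hi1/lo/hi2 reads guard against a torn 64-bit read, the value is in 100 ns units (hence the multiply by 100), and wall time is rebased from the 1601 FILETIME epoch to the Unix epoch via the 116444736000000000 constant. A hedged Go sketch of the snapshot read and the conversion; the struct is only a stand-in for the fixed shared-data addresses:

    package main

    import "fmt"

    // Epoch difference between 1601-01-01 (FILETIME) and 1970-01-01 (Unix),
    // in 100 ns units; this is the constant subtracted above.
    const windowsToUnix = 116444736000000000

    // ksharedTime models the lo/hi1/hi2 layout. The writer updates hi1,
    // then lo, then hi2, so a snapshot with hi1 == hi2 is consistent.
    type ksharedTime struct{ lo, hi1, hi2 uint32 }

    func read64(t *ksharedTime) uint64 {
        for {
            hi1 := t.hi1
            lo := t.lo
            hi2 := t.hi2
            if hi1 == hi2 {
                return uint64(hi1)<<32 | uint64(lo)
            }
        }
    }

    func main() {
        st := &ksharedTime{lo: 0x9a5ed180, hi1: 0x01d2f0a4, hi2: 0x01d2f0a4}
        ft := read64(st)                       // SYSTEM_TIME, 100 ns units since 1601
        unixNsec := (ft - windowsToUnix) * 100 // rebase to 1970, widen to nanoseconds
        fmt.Println(unixNsec/1000000000, unixNsec%1000000000) // sec, nsec
    }
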
diff --git a/src/runtime/syscall_nacl.h b/src/runtime/syscall_nacl.h
index 834ecfc..5ee75ab 100644
--- a/src/runtime/syscall_nacl.h
+++ b/src/runtime/syscall_nacl.h
@@ -1,4 +1,4 @@
-// generated by mknacl.sh - do not edit
+// Code generated by mknacl.sh; DO NOT EDIT.
#define SYS_null 1
#define SYS_nameservice 2
#define SYS_dup 8
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index cd23b8d..ca8ea8b 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -207,3 +207,9 @@
cgocall(asmstdcallAddr, unsafe.Pointer(c))
return c.r1, c.r2, c.err
}
+
+//go:linkname syscall_exit syscall.Exit
+//go:nosplit
+func syscall_exit(code int) {
+ exit(int32(code))
+}
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 11e67df..3da154d 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -1037,7 +1037,7 @@
defer os.RemoveAll(tmpdir)
src := filepath.Join(tmpdir, "main.go")
- err = ioutil.WriteFile(src, []byte(benchmarkRunnigGoProgram), 0666)
+ err = ioutil.WriteFile(src, []byte(benchmarkRunningGoProgram), 0666)
if err != nil {
b.Fatal(err)
}
@@ -1055,14 +1055,16 @@
cmd := exec.Command(exe)
out, err := cmd.CombinedOutput()
if err != nil {
- b.Fatalf("runing main.exe failed: %v\n%s", err, out)
+ b.Fatalf("running main.exe failed: %v\n%s", err, out)
}
}
}
-const benchmarkRunnigGoProgram = `
+const benchmarkRunningGoProgram = `
package main
+import _ "os" // average Go program will use "os" package, do the same here
+
func main() {
}
`
diff --git a/src/runtime/testdata/testprog/numcpu_freebsd.go b/src/runtime/testdata/testprog/numcpu_freebsd.go
new file mode 100644
index 0000000..035c534
--- /dev/null
+++ b/src/runtime/testdata/testprog/numcpu_freebsd.go
@@ -0,0 +1,126 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "bytes"
+ "fmt"
+ "os"
+ "os/exec"
+ "runtime"
+ "strconv"
+ "strings"
+ "syscall"
+)
+
+func init() {
+ register("FreeBSDNumCPU", FreeBSDNumCPU)
+ register("FreeBSDNumCPUHelper", FreeBSDNumCPUHelper)
+}
+
+func FreeBSDNumCPUHelper() {
+ fmt.Printf("%d\n", runtime.NumCPU())
+}
+
+func FreeBSDNumCPU() {
+ _, err := exec.LookPath("cpuset")
+ if err != nil {
+ // Can not test without cpuset command.
+ fmt.Println("OK")
+ return
+ }
+ _, err = exec.LookPath("sysctl")
+ if err != nil {
+ // Can not test without sysctl command.
+ fmt.Println("OK")
+ return
+ }
+ cmd := exec.Command("sysctl", "-n", "kern.smp.active")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ fmt.Printf("fail to launch '%s', error: %s, output: %s\n", strings.Join(cmd.Args, " "), err, output)
+ return
+ }
+ if bytes.Equal(output, []byte("1\n")) == false {
+ // SMP mode deactivated in kernel.
+ fmt.Println("OK")
+ return
+ }
+
+ list, err := getList()
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ err = checkNCPU(list)
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ if len(list) >= 2 {
+ err = checkNCPU(list[:len(list)-1])
+ if err != nil {
+ fmt.Printf("%s\n", err)
+ return
+ }
+ }
+ fmt.Println("OK")
+ return
+}
+
+func getList() ([]string, error) {
+ pid := syscall.Getpid()
+
+ // Launch cpuset to print a list of available CPUs: pid <PID> mask: 0, 1, 2, 3.
+ cmd := exec.Command("cpuset", "-g", "-p", strconv.Itoa(pid))
+ cmdline := strings.Join(cmd.Args, " ")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return nil, fmt.Errorf("fail to execute '%s': %s", cmdline, err)
+ }
+ pos := bytes.IndexRune(output, ':')
+ if pos == -1 {
+ return nil, fmt.Errorf("invalid output from '%s', ':' not found: %s", cmdline, output)
+ }
+
+ var list []string
+ for _, val := range bytes.Split(output[pos+1:], []byte(",")) {
+ index := string(bytes.TrimSpace(val))
+ if len(index) == 0 {
+ continue
+ }
+ list = append(list, index)
+ }
+ if len(list) == 0 {
+ return nil, fmt.Errorf("empty CPU list from '%s': %s", cmdline, output)
+ }
+ return list, nil
+}
+
+func checkNCPU(list []string) error {
+ listString := strings.Join(list, ",")
+ if len(listString) == 0 {
+ return fmt.Errorf("could not check against an empty CPU list")
+ }
+
+ // Launch FreeBSDNumCPUHelper() with specified CPUs list.
+ cmd := exec.Command("cpuset", "-l", listString, os.Args[0], "FreeBSDNumCPUHelper")
+ cmdline := strings.Join(cmd.Args, " ")
+ output, err := cmd.CombinedOutput()
+ if err != nil {
+ return fmt.Errorf("fail to launch child '%s', error: %s, output: %s", cmdline, err, output)
+ }
+
+	// NumCPU from FreeBSDNumCPUHelper comes with a trailing '\n'.
+ output = bytes.TrimSpace(output)
+ n, err := strconv.Atoi(string(output))
+ if err != nil {
+ return fmt.Errorf("fail to parse output from child '%s', error: %s, output: %s", cmdline, err, output)
+ }
+ if n != len(list) {
+		return fmt.Errorf("expected runtime.NumCPU() to be %d, got %d when run with CPU list %s", len(list), n, listString)
+ }
+ return nil
+}
diff --git a/src/runtime/testdata/testprog/panicrace.go b/src/runtime/testdata/testprog/panicrace.go
new file mode 100644
index 0000000..f058994
--- /dev/null
+++ b/src/runtime/testdata/testprog/panicrace.go
@@ -0,0 +1,27 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+ "runtime"
+ "sync"
+)
+
+func init() {
+ register("PanicRace", PanicRace)
+}
+
+func PanicRace() {
+ var wg sync.WaitGroup
+ wg.Add(1)
+ go func() {
+ defer func() {
+ wg.Done()
+ runtime.Gosched()
+ }()
+ panic("crash")
+ }()
+ wg.Wait()
+}
diff --git a/src/runtime/testdata/testprogcgo/cgo.go b/src/runtime/testdata/testprogcgo/cgo.go
index 870d4ef..209524a 100644
--- a/src/runtime/testdata/testprogcgo/cgo.go
+++ b/src/runtime/testdata/testprogcgo/cgo.go
@@ -45,10 +45,13 @@
}()
var s *string
*s = ""
+ fmt.Printf("continued after expected panic\n")
}()
}
}()
time.Sleep(time.Millisecond)
+ start := time.Now()
+ var times []time.Duration
for i := 0; i < 64; i++ {
go func() {
runtime.LockOSThread()
@@ -62,8 +65,9 @@
ping <- false
select {
case <-ping:
+ times = append(times, time.Since(start))
case <-time.After(time.Second):
- fmt.Printf("HANG\n")
+ fmt.Printf("HANG 1 %v\n", times)
return
}
}
@@ -71,7 +75,7 @@
select {
case <-ping:
case <-time.After(time.Second):
- fmt.Printf("HANG\n")
+ fmt.Printf("HANG 2 %v\n", times)
return
}
fmt.Printf("OK\n")
diff --git a/src/runtime/testdata/testprogcgo/numgoroutine.go b/src/runtime/testdata/testprogcgo/numgoroutine.go
new file mode 100644
index 0000000..12fda49
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/numgoroutine.go
@@ -0,0 +1,99 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <stddef.h>
+#include <pthread.h>
+
+extern void CallbackNumGoroutine();
+
+static void* thread2(void* arg __attribute__ ((unused))) {
+ CallbackNumGoroutine();
+ return NULL;
+}
+
+static void CheckNumGoroutine() {
+ pthread_t tid;
+ pthread_create(&tid, NULL, thread2, NULL);
+ pthread_join(tid, NULL);
+}
+*/
+import "C"
+
+import (
+ "fmt"
+ "runtime"
+ "strings"
+)
+
+var baseGoroutines int
+
+func init() {
+ register("NumGoroutine", NumGoroutine)
+}
+
+func NumGoroutine() {
+ // Test that there are just the expected number of goroutines
+ // running. Specifically, test that the spare M's goroutine
+ // doesn't show up.
+ //
+ // On non-Windows platforms there's a signal handling thread
+ // started by os/signal.init in addition to the main
+ // goroutine.
+ if runtime.GOOS != "windows" {
+ baseGoroutines = 1
+ }
+ if _, ok := checkNumGoroutine("first", 1+baseGoroutines); !ok {
+ return
+ }
+
+ // Test that the goroutine for a callback from C appears.
+ if C.CheckNumGoroutine(); !callbackok {
+ return
+ }
+
+ // Make sure we're back to the initial goroutines.
+ if _, ok := checkNumGoroutine("third", 1+baseGoroutines); !ok {
+ return
+ }
+
+ fmt.Println("OK")
+}
+
+func checkNumGoroutine(label string, want int) (string, bool) {
+ n := runtime.NumGoroutine()
+ if n != want {
+ fmt.Printf("%s NumGoroutine: want %d; got %d\n", label, want, n)
+ return "", false
+ }
+
+ sbuf := make([]byte, 32<<10)
+ sbuf = sbuf[:runtime.Stack(sbuf, true)]
+ n = strings.Count(string(sbuf), "goroutine ")
+ if n != want {
+ fmt.Printf("%s Stack: want %d; got %d:\n%s\n", label, want, n, string(sbuf))
+ return "", false
+ }
+ return string(sbuf), true
+}
+
+var callbackok bool
+
+//export CallbackNumGoroutine
+func CallbackNumGoroutine() {
+ stk, ok := checkNumGoroutine("second", 2+baseGoroutines)
+ if !ok {
+ return
+ }
+ if !strings.Contains(stk, "CallbackNumGoroutine") {
+ fmt.Printf("missing CallbackNumGoroutine from stack:\n%s\n", stk)
+ return
+ }
+
+ callbackok = true
+}
diff --git a/src/runtime/testdata/testprognet/signalexec.go b/src/runtime/testdata/testprognet/signalexec.go
new file mode 100644
index 0000000..4a988ef
--- /dev/null
+++ b/src/runtime/testdata/testprognet/signalexec.go
@@ -0,0 +1,70 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd
+
+// This is in testprognet instead of testprog because testprog
+// must not import anything (like net, but also like os/signal)
+// that kicks off background goroutines during init.
+
+package main
+
+import (
+ "fmt"
+ "os"
+ "os/exec"
+ "os/signal"
+ "sync"
+ "syscall"
+ "time"
+)
+
+func init() {
+ register("SignalDuringExec", SignalDuringExec)
+ register("Nop", Nop)
+}
+
+func SignalDuringExec() {
+ pgrp := syscall.Getpgrp()
+
+ const tries = 10
+
+ var wg sync.WaitGroup
+ c := make(chan os.Signal, tries)
+ signal.Notify(c, syscall.SIGWINCH)
+ wg.Add(1)
+ go func() {
+ defer wg.Done()
+ for range c {
+ }
+ }()
+
+ for i := 0; i < tries; i++ {
+ time.Sleep(time.Microsecond)
+ wg.Add(2)
+ go func() {
+ defer wg.Done()
+ cmd := exec.Command(os.Args[0], "Nop")
+ cmd.Stdout = os.Stdout
+ cmd.Stderr = os.Stderr
+ if err := cmd.Run(); err != nil {
+ fmt.Printf("Start failed: %v", err)
+ }
+ }()
+ go func() {
+ defer wg.Done()
+ syscall.Kill(-pgrp, syscall.SIGWINCH)
+ }()
+ }
+
+ signal.Stop(c)
+ close(c)
+ wg.Wait()
+
+ fmt.Println("OK")
+}
+
+func Nop() {
+ // This is just for SignalDuringExec.
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index 604ccde..abf200d 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -31,6 +31,7 @@
created bool
sleeping bool
rescheduling bool
+ sleepUntil int64
waitnote note
t []*timer
}
@@ -50,7 +51,12 @@
return
}
- t := new(timer)
+ t := getg().timer
+ if t == nil {
+ t = new(timer)
+ getg().timer = t
+ }
+ *t = timer{}
t.when = nanotime() + ns
t.f = goroutineReady
t.arg = getg()
@@ -204,6 +210,7 @@
}
// At least one timer pending. Sleep until then.
timers.sleeping = true
+ timers.sleepUntil = now + delta
noteclear(&timers.waitnote)
unlock(&timers.lock)
notetsleepg(&timers.waitnote, delta)
@@ -292,8 +299,8 @@
// Entry points for net, time to call nanotime.
-//go:linkname net_runtimeNano net.runtimeNano
-func net_runtimeNano() int64 {
+//go:linkname poll_runtimeNano internal/poll.runtimeNano
+func poll_runtimeNano() int64 {
return nanotime()
}
@@ -301,3 +308,5 @@
func time_runtimeNano() int64 {
return nanotime()
}
+
+var startNano int64 = nanotime()
diff --git a/src/runtime/timeasm.go b/src/runtime/timeasm.go
new file mode 100644
index 0000000..7474bec
--- /dev/null
+++ b/src/runtime/timeasm.go
@@ -0,0 +1,16 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Declarations for operating systems implementing time.now directly in assembly.
+// Those systems are also expected to have nanotime subtract startNano,
+// so that time.now and nanotime return the same monotonic clock readings.
+
+// +build darwin,amd64 darwin,386 windows
+
+package runtime
+
+import _ "unsafe"
+
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64)
diff --git a/src/runtime/timestub.go b/src/runtime/timestub.go
new file mode 100644
index 0000000..adc3a86
--- /dev/null
+++ b/src/runtime/timestub.go
@@ -0,0 +1,21 @@
+// Copyright 2017 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Declarations for operating systems implementing time.now
+// indirectly, in terms of walltime and nanotime assembly.
+
+// +build !darwin !amd64,!386
+// +build !windows
+
+package runtime
+
+import _ "unsafe" // for go:linkname
+
+func walltime() (sec int64, nsec int32)
+
+//go:linkname time_now time.now
+func time_now() (sec int64, nsec int32, mono int64) {
+ sec, nsec = walltime()
+ return sec, nsec, nanotime() - startNano
+}
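
With time.now returning a third, monotonic result, package time can attach a monotonic reading to every Time value, so Since and Sub are unaffected by wall-clock steps. A short usage example of the resulting behaviour:

    package main

    import (
        "fmt"
        "time"
    )

    func main() {
        // Both readings carry a monotonic clock value (shown by String as
        // e.g. "m=+0.000012345"), and the subtraction uses it even if the
        // wall clock is changed in between.
        start := time.Now()
        time.Sleep(10 * time.Millisecond)
        elapsed := time.Since(start)
        fmt.Println(start)   // wall time plus monotonic reading
        fmt.Println(elapsed) // always >= 10ms, regardless of wall-clock steps
    }
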
diff --git a/src/runtime/tls_arm.s b/src/runtime/tls_arm.s
index 32bfcf8..a5f5003 100644
--- a/src/runtime/tls_arm.s
+++ b/src/runtime/tls_arm.s
@@ -35,7 +35,7 @@
// nothing to do as nacl/arm does not use TLS at all.
MOVW g, R0 // preserve R0 across call to setg<>
RET
-#endif
+#else
// If the host does not support MRC the linker will replace it with
// a call to runtime.read_tls_fallback which jumps to __kuser_get_tls.
// The replacement function saves LR in R11 over the call to read_tls_fallback.
@@ -46,6 +46,7 @@
MOVW g, 0(R0)
MOVW g, R0 // preserve R0 across call to setg<>
RET
+#endif
// load_g loads the g register from pthread-provided
// thread-local memory, for use after calling externally compiled
@@ -54,7 +55,7 @@
#ifdef GOOS_nacl
// nothing to do as nacl/arm does not use TLS at all.
RET
-#endif
+#else
// See save_g
MRC 15, 0, R0, C13, C0, 3 // fetch TLS base pointer
BIC $3, R0 // Darwin/ARM might return unaligned pointer
@@ -62,6 +63,7 @@
ADD R11, R0
MOVW 0(R0), g
RET
+#endif
// This is called from rt0_go, which runs on the system stack
// using the initial stack allocated by the OS.
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index a8f4ab6..826dc9a 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -19,50 +19,52 @@
// Event types in the trace, args are given in square brackets.
const (
- traceEvNone = 0 // unused
- traceEvBatch = 1 // start of per-P batch of events [pid, timestamp]
- traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)]
- traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
- traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
- traceEvProcStart = 5 // start of P [timestamp, thread id]
- traceEvProcStop = 6 // stop of P [timestamp]
- traceEvGCStart = 7 // GC start [timestamp, seq, stack id]
- traceEvGCDone = 8 // GC done [timestamp]
- traceEvGCScanStart = 9 // GC mark termination start [timestamp]
- traceEvGCScanDone = 10 // GC mark termination done [timestamp]
- traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id]
- traceEvGCSweepDone = 12 // GC sweep done [timestamp]
- traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
- traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq]
- traceEvGoEnd = 15 // goroutine ends [timestamp]
- traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack]
- traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack]
- traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack]
- traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack]
- traceEvGoBlock = 20 // goroutine blocks [timestamp, stack]
- traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
- traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack]
- traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack]
- traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack]
- traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
- traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack]
- traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack]
- traceEvGoSysCall = 28 // syscall enter [timestamp, stack]
- traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
- traceEvGoSysBlock = 30 // syscall blocks [timestamp]
- traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
- traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
- traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
- traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
- traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
- traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
- traceEvString = 37 // string dictionary entry [ID, length, string]
- traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
- traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
- traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
- traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
- traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack]
- traceEvCount = 43
+ traceEvNone = 0 // unused
+ traceEvBatch = 1 // start of per-P batch of events [pid, timestamp]
+ traceEvFrequency = 2 // contains tracer timer frequency [frequency (ticks per second)]
+ traceEvStack = 3 // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
+ traceEvGomaxprocs = 4 // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
+ traceEvProcStart = 5 // start of P [timestamp, thread id]
+ traceEvProcStop = 6 // stop of P [timestamp]
+ traceEvGCStart = 7 // GC start [timestamp, seq, stack id]
+ traceEvGCDone = 8 // GC done [timestamp]
+ traceEvGCScanStart = 9 // GC mark termination start [timestamp]
+ traceEvGCScanDone = 10 // GC mark termination done [timestamp]
+ traceEvGCSweepStart = 11 // GC sweep start [timestamp, stack id]
+ traceEvGCSweepDone = 12 // GC sweep done [timestamp, swept, reclaimed]
+ traceEvGoCreate = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
+ traceEvGoStart = 14 // goroutine starts running [timestamp, goroutine id, seq]
+ traceEvGoEnd = 15 // goroutine ends [timestamp]
+ traceEvGoStop = 16 // goroutine stops (like in select{}) [timestamp, stack]
+ traceEvGoSched = 17 // goroutine calls Gosched [timestamp, stack]
+ traceEvGoPreempt = 18 // goroutine is preempted [timestamp, stack]
+ traceEvGoSleep = 19 // goroutine calls Sleep [timestamp, stack]
+ traceEvGoBlock = 20 // goroutine blocks [timestamp, stack]
+ traceEvGoUnblock = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
+ traceEvGoBlockSend = 22 // goroutine blocks on chan send [timestamp, stack]
+ traceEvGoBlockRecv = 23 // goroutine blocks on chan recv [timestamp, stack]
+ traceEvGoBlockSelect = 24 // goroutine blocks on select [timestamp, stack]
+ traceEvGoBlockSync = 25 // goroutine blocks on Mutex/RWMutex [timestamp, stack]
+ traceEvGoBlockCond = 26 // goroutine blocks on Cond [timestamp, stack]
+ traceEvGoBlockNet = 27 // goroutine blocks on network [timestamp, stack]
+ traceEvGoSysCall = 28 // syscall enter [timestamp, stack]
+ traceEvGoSysExit = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
+ traceEvGoSysBlock = 30 // syscall blocks [timestamp]
+ traceEvGoWaiting = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
+ traceEvGoInSyscall = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
+ traceEvHeapAlloc = 33 // memstats.heap_live change [timestamp, heap_alloc]
+ traceEvNextGC = 34 // memstats.next_gc change [timestamp, next_gc]
+ traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
+ traceEvFutileWakeup = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
+ traceEvString = 37 // string dictionary entry [ID, length, string]
+ traceEvGoStartLocal = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
+ traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
+ traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
+ traceEvGoStartLabel = 41 // goroutine starts running with label [timestamp, goroutine id, seq, label string id]
+ traceEvGoBlockGC = 42 // goroutine blocks on GC assist [timestamp, stack]
+ traceEvGCMarkAssistStart = 43 // GC mark assist start [timestamp, stack]
+ traceEvGCMarkAssistDone = 44 // GC mark assist done [timestamp]
+ traceEvCount = 45
)
const (
@@ -311,7 +313,7 @@
// The world is started but we've set trace.shutdown, so new tracing can't start.
// Wait for the trace reader to flush pending buffers and stop.
- semacquire(&trace.shutdownSema, 0)
+ semacquire(&trace.shutdownSema)
if raceenabled {
raceacquire(unsafe.Pointer(&trace.shutdownSema))
}
@@ -380,7 +382,7 @@
trace.headerWritten = true
trace.lockOwner = nil
unlock(&trace.lock)
- return []byte("go 1.8 trace\x00\x00\x00\x00")
+ return []byte("go 1.9 trace\x00\x00\x00\x00")
}
// Wait for new data.
if trace.fullHead == 0 && !trace.shutdown {
@@ -570,12 +572,7 @@
nstk = callers(skip+1, buf[:])
} else if gp != nil {
gp = mp.curg
- // This may happen when tracing a system call,
- // so we must lock the stack.
- if gcTryLockStackBarriers(gp) {
- nstk = gcallers(gp, skip, buf[:])
- gcUnlockStackBarriers(gp)
- }
+ nstk = gcallers(gp, skip, buf[:])
}
if nstk > 0 {
nstk-- // skip runtime.goexit
@@ -767,10 +764,22 @@
return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*sys.PtrSize))
}
+// allFrames returns all of the Frames corresponding to pcs.
+func allFrames(pcs []uintptr) []Frame {
+ frames := make([]Frame, 0, len(pcs))
+ ci := CallersFrames(pcs)
+ for {
+ f, more := ci.Next()
+ frames = append(frames, f)
+ if !more {
+ return frames
+ }
+ }
+}
+
// dump writes all previously cached stacks to trace buffers,
// releases all memory and resets state.
func (tab *traceStackTable) dump() {
- frames := make(map[uintptr]traceFrame)
var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte
buf := traceFlush(0).ptr()
for _, stk := range tab.tab {
@@ -778,11 +787,12 @@
for ; stk != nil; stk = stk.link.ptr() {
tmpbuf := tmp[:0]
tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
- tmpbuf = traceAppend(tmpbuf, uint64(stk.n))
- for _, pc := range stk.stack() {
+ frames := allFrames(stk.stack())
+ tmpbuf = traceAppend(tmpbuf, uint64(len(frames)))
+ for _, f := range frames {
var frame traceFrame
- frame, buf = traceFrameForPC(buf, frames, pc)
- tmpbuf = traceAppend(tmpbuf, uint64(pc))
+ frame, buf = traceFrameForPC(buf, f)
+ tmpbuf = traceAppend(tmpbuf, uint64(f.PC))
tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
@@ -812,26 +822,17 @@
line uint64
}
-func traceFrameForPC(buf *traceBuf, frames map[uintptr]traceFrame, pc uintptr) (traceFrame, *traceBuf) {
- if frame, ok := frames[pc]; ok {
- return frame, buf
- }
-
+func traceFrameForPC(buf *traceBuf, f Frame) (traceFrame, *traceBuf) {
var frame traceFrame
- f := findfunc(pc)
- if f == nil {
- frames[pc] = frame
- return frame, buf
- }
- fn := funcname(f)
+ fn := f.Function
const maxLen = 1 << 10
if len(fn) > maxLen {
fn = fn[len(fn)-maxLen:]
}
frame.funcID, buf = traceString(buf, fn)
- file, line := funcline(f, pc-sys.PCQuantum)
- frame.line = uint64(line)
+ frame.line = uint64(f.Line)
+ file := f.File
if len(file) > maxLen {
file = file[len(file)-maxLen:]
}
@@ -931,12 +932,52 @@
traceEvent(traceEvGCScanDone, -1)
}
+// traceGCSweepStart prepares to trace a sweep loop. This does not
+// emit any events until traceGCSweepSpan is called.
+//
+// traceGCSweepStart must be paired with traceGCSweepDone and there
+// must be no preemption points between these two calls.
func traceGCSweepStart() {
- traceEvent(traceEvGCSweepStart, 1)
+ // Delay the actual GCSweepStart event until the first span
+ // sweep. If we don't sweep anything, don't emit any events.
+ _p_ := getg().m.p.ptr()
+ if _p_.traceSweep {
+ throw("double traceGCSweepStart")
+ }
+ _p_.traceSweep, _p_.traceSwept, _p_.traceReclaimed = true, 0, 0
+}
+
+// traceGCSweepSpan traces the sweep of a single page.
+//
+// This may be called outside a traceGCSweepStart/traceGCSweepDone
+// pair; however, it will not emit any trace events in this case.
+func traceGCSweepSpan(bytesSwept uintptr) {
+ _p_ := getg().m.p.ptr()
+ if _p_.traceSweep {
+ if _p_.traceSwept == 0 {
+ traceEvent(traceEvGCSweepStart, 1)
+ }
+ _p_.traceSwept += bytesSwept
+ }
}
func traceGCSweepDone() {
- traceEvent(traceEvGCSweepDone, -1)
+ _p_ := getg().m.p.ptr()
+ if !_p_.traceSweep {
+ throw("missing traceGCSweepStart")
+ }
+ if _p_.traceSwept != 0 {
+ traceEvent(traceEvGCSweepDone, -1, uint64(_p_.traceSwept), uint64(_p_.traceReclaimed))
+ }
+ _p_.traceSweep = false
+}
+
+func traceGCMarkAssistStart() {
+ traceEvent(traceEvGCMarkAssistStart, 1)
+}
+
+func traceGCMarkAssistDone() {
+ traceEvent(traceEvGCMarkAssistDone, -1)
}
func traceGoCreate(newg *g, pc uintptr) {
@@ -977,7 +1018,7 @@
traceEvent(traceEvGoPreempt, 1)
}
-func traceGoPark(traceEv byte, skip int, gp *g) {
+func traceGoPark(traceEv byte, skip int) {
if traceEv&traceFutileWakeup != 0 {
traceEvent(traceEvFutileWakeup, -1)
}
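
The traceGCSweepStart/traceGCSweepSpan/traceGCSweepDone comments in the hunk above describe a lazy pattern: nothing is written until the first span is actually swept, per-P counters accumulate the work, and the done event carries totals only if any sweeping happened. A minimal, self-contained sketch of that shape, using hypothetical names rather than the runtime's own types and events:

package main

import "fmt"

// lazySweep mirrors the deferred-event pattern above: no "start" event is
// emitted until the first unit of work, and the "done" event carries the
// accumulated total only if any work happened. All names here are
// illustrative stand-ins, not the runtime's own types or events.
type lazySweep struct {
	active bool
	swept  uint64
	emit   func(ev string, args ...uint64) // stand-in for traceEvent
}

func (s *lazySweep) start() {
	if s.active {
		panic("double start")
	}
	s.active, s.swept = true, 0
}

func (s *lazySweep) span(bytes uint64) {
	if !s.active {
		return // outside a start/done pair: emit nothing
	}
	if s.swept == 0 {
		s.emit("SweepStart")
	}
	s.swept += bytes
}

func (s *lazySweep) done() {
	if !s.active {
		panic("missing start")
	}
	if s.swept != 0 {
		s.emit("SweepDone", s.swept)
	}
	s.active = false
}

func main() {
	s := &lazySweep{emit: func(ev string, args ...uint64) { fmt.Println(ev, args) }}
	s.start()
	s.done() // swept nothing: no events at all
	s.start()
	s.span(4096)
	s.span(8192)
	s.done() // prints SweepStart, then SweepDone [12288]
}
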
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
index c37b33d..274cdf7 100644
--- a/src/runtime/trace/trace_stack_test.go
+++ b/src/runtime/trace/trace_stack_test.go
@@ -151,7 +151,7 @@
{"testing.tRunner", 0},
}},
{trace.EvGoCreate, []frame{
- {"runtime/trace_test.TestTraceSymbolize", 39},
+ {"runtime/trace_test.TestTraceSymbolize", 37},
{"testing.tRunner", 0},
}},
{trace.EvGoStop, []frame{
@@ -231,6 +231,7 @@
if runtime.GOOS != "windows" && runtime.GOOS != "plan9" {
want = append(want, []eventDesc{
{trace.EvGoBlockNet, []frame{
+ {"internal/poll.(*FD).Accept", 0},
{"net.(*netFD).accept", 0},
{"net.(*TCPListener).accept", 0},
{"net.(*TCPListener).Accept", 0},
@@ -239,6 +240,7 @@
{trace.EvGoSysCall, []frame{
{"syscall.read", 0},
{"syscall.Read", 0},
+ {"internal/poll.(*FD).Read", 0},
{"os.(*File).read", 0},
{"os.(*File).Read", 0},
{"runtime/trace_test.TestTraceSymbolize.func11", 102},
@@ -274,9 +276,10 @@
continue
}
for _, f := range ev.Stk {
- t.Logf(" %v:%v", f.Fn, f.Line)
+ t.Logf(" %v :: %s:%v", f.Fn, f.File, f.Line)
}
t.Logf("---")
}
+ t.Logf("======")
}
}
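
The allFrames helper added in trace.go above expands raw PCs into logical frames with CallersFrames, which is also why the expected stacks in this test now include frames such as internal/poll.(*FD).Read. A standalone sketch of the same expansion loop, written against the public runtime API rather than the runtime-internal frame handling:

package main

import (
	"fmt"
	"runtime"
)

// expand turns raw return PCs into logical frames, including frames for
// inlined calls, the same way the runtime's new allFrames helper does.
func expand(pcs []uintptr) []runtime.Frame {
	frames := make([]runtime.Frame, 0, len(pcs))
	ci := runtime.CallersFrames(pcs)
	for {
		f, more := ci.Next()
		frames = append(frames, f)
		if !more {
			return frames
		}
	}
}

func main() {
	pcs := make([]uintptr, 32)
	n := runtime.Callers(0, pcs)
	for _, f := range expand(pcs[:n]) {
		fmt.Printf("%s :: %s:%d\n", f.Function, f.File, f.Line)
	}
}
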
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 180489f..c74d438 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -51,8 +51,8 @@
gcBgMarkWorkerPC uintptr
systemstack_switchPC uintptr
systemstackPC uintptr
- stackBarrierPC uintptr
cgocallback_gofuncPC uintptr
+ skipPC uintptr
gogoPC uintptr
@@ -78,8 +78,8 @@
gcBgMarkWorkerPC = funcPC(gcBgMarkWorker)
systemstack_switchPC = funcPC(systemstack_switch)
systemstackPC = funcPC(systemstack)
- stackBarrierPC = funcPC(stackBarrier)
cgocallback_gofuncPC = funcPC(cgocallback_gofunc)
+ skipPC = funcPC(skipPleaseUseCallersFrames)
// used by sigprof handler
gogoPC = funcPC(gogo)
@@ -94,14 +94,14 @@
if fn == nil {
// Defer of nil function. Args don't matter.
frame.pc = 0
- frame.fn = nil
+ frame.fn = funcInfo{}
frame.argp = 0
frame.arglen = 0
frame.argmap = nil
} else {
frame.pc = fn.fn
f := findfunc(frame.pc)
- if f == nil {
+ if !f.valid() {
print("runtime: unknown pc in defer ", hex(frame.pc), "\n")
throw("unknown pc")
}
@@ -116,11 +116,25 @@
}
}
+const sizeofSkipFunction = 256
+
+// This function is defined in asm.s to be sizeofSkipFunction bytes long.
+func skipPleaseUseCallersFrames()
+
// Generic traceback. Handles runtime stack prints (pcbuf == nil),
// the runtime.Callers function (pcbuf != nil), as well as the garbage
// collector (callback != nil). A little clunky to merge these, but avoids
// duplicating the code and all its subtlety.
+//
+// The skip argument is only valid with pcbuf != nil and counts the number
+// of logical frames to skip rather than physical frames (with inlining, a
+// PC in pcbuf can represent multiple calls). If a PC is partially skipped
+// and max > 1, pcbuf[1] will be runtime.skipPleaseUseCallersFrames+N where
+// N indicates the number of logical frames to skip in pcbuf[0].
func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max int, callback func(*stkframe, unsafe.Pointer) bool, v unsafe.Pointer, flags uint) int {
+ if skip > 0 && callback != nil {
+ throw("gentraceback callback cannot be used with non-zero skip")
+ }
if goexitPC == 0 {
throw("gentraceback before goexitPC initialization")
}
@@ -143,11 +157,6 @@
}
level, _, _ := gotraceback()
- // Fix up returns to the stack barrier by fetching the
- // original return PC from gp.stkbar.
- stkbarG := gp
- stkbar := stkbarG.stkbar[stkbarG.stkbarPos:]
-
if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp.
if gp.syscallsp != 0 {
pc0 = gp.syscallpc
@@ -193,35 +202,7 @@
}
f := findfunc(frame.pc)
- if f != nil && f.entry == stackBarrierPC {
- // We got caught in the middle of a stack barrier
- // (presumably by a signal), so stkbar may be
- // inconsistent with the barriers on the stack.
- // Simulate the completion of the barrier.
- //
- // On x86, SP will be exactly one word above
- // savedLRPtr. On LR machines, SP will be above
- // savedLRPtr by some frame size.
- var stkbarPos uintptr
- if len(stkbar) > 0 && stkbar[0].savedLRPtr < sp0 {
- // stackBarrier has not incremented stkbarPos.
- stkbarPos = gp.stkbarPos
- } else if gp.stkbarPos > 0 && gp.stkbar[gp.stkbarPos-1].savedLRPtr < sp0 {
- // stackBarrier has incremented stkbarPos.
- stkbarPos = gp.stkbarPos - 1
- } else {
- printlock()
- print("runtime: failed to unwind through stackBarrier at SP ", hex(sp0), "; ")
- gcPrintStkbars(gp, int(gp.stkbarPos))
- print("\n")
- throw("inconsistent state in stackBarrier")
- }
-
- frame.pc = gp.stkbar[stkbarPos].savedLRVal
- stkbar = gp.stkbar[stkbarPos+1:]
- f = findfunc(frame.pc)
- }
- if f == nil {
+ if !f.valid() {
if callback != nil {
print("runtime: unknown pc ", hex(frame.pc), "\n")
throw("unknown pc")
@@ -257,8 +238,6 @@
if flags&_TraceJumpStack != 0 && f.entry == systemstackPC && gp == g.m.g0 && gp.m.curg != nil {
sp = gp.m.curg.sched.sp
frame.sp = sp
- stkbarG = gp.m.curg
- stkbar = stkbarG.stkbar[stkbarG.stkbarPos:]
cgoCtxt = gp.m.curg.cgoCtxt
}
frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
@@ -267,10 +246,10 @@
frame.fp += sys.RegSize
}
}
- var flr *_func
+ var flr funcInfo
if topofstack(f) {
frame.lr = 0
- flr = nil
+ flr = funcInfo{}
} else if usesLR && f.entry == jmpdeferPC {
// jmpdefer modifies SP/LR/PC non-atomically.
// If a profiling interrupt arrives during jmpdefer,
@@ -295,19 +274,8 @@
frame.lr = uintptr(*(*sys.Uintreg)(unsafe.Pointer(lrPtr)))
}
}
- if frame.lr == stackBarrierPC {
- // Recover original PC.
- if len(stkbar) == 0 || stkbar[0].savedLRPtr != lrPtr {
- print("found next stack barrier at ", hex(lrPtr), "; expected ")
- gcPrintStkbars(stkbarG, len(stkbarG.stkbar)-len(stkbar))
- print("\n")
- throw("missed stack barrier")
- }
- frame.lr = stkbar[0].savedLRVal
- stkbar = stkbar[1:]
- }
flr = findfunc(frame.lr)
- if flr == nil {
+ if !flr.valid() {
// This happens if you get a profiling interrupt at just the wrong time.
// In that context it is okay to stop early.
// But if callback is set, we're doing a garbage collection and must
@@ -366,20 +334,59 @@
_defer = _defer.link
}
- if skip > 0 {
- skip--
- goto skipped
- }
-
- if pcbuf != nil {
- (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = frame.pc
- }
if callback != nil {
if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) {
return n
}
}
+
+ if pcbuf != nil {
+ if skip == 0 {
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = frame.pc
+ } else {
+ // back up to the CALL instruction to read inlining info (same logic as below)
+ tracepc := frame.pc
+ if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry && !waspanic {
+ tracepc--
+ }
+ inldata := funcdata(f, _FUNCDATA_InlTree)
+
+ // no inlining info, skip the physical frame
+ if inldata == nil {
+ skip--
+ goto skipped
+ }
+
+ ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, &cache)
+ inltree := (*[1 << 20]inlinedCall)(inldata)
+ // skip the logical (inlined) frames
+ logicalSkipped := 0
+ for ix >= 0 && skip > 0 {
+ skip--
+ logicalSkipped++
+ ix = inltree[ix].parent
+ }
+
+ // skip the physical frame if there's more to skip
+ if skip > 0 {
+ skip--
+ goto skipped
+ }
+
+ // now we have a partially skipped frame
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = frame.pc
+
+ // if there's room, pcbuf[1] is a skip PC that encodes the number of skipped frames in pcbuf[0]
+ if n+1 < max {
+ n++
+ skipPC := funcPC(skipPleaseUseCallersFrames) + uintptr(logicalSkipped)
+ (*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = skipPC
+ }
+ }
+ }
+
if printing {
+ // assume skip=0 for printing
if (flags&_TraceRuntimeFrames) != 0 || showframe(f, gp, nprint == 0) {
// Print during crash.
// main(0x1, 0x2, 0x3)
@@ -389,6 +396,21 @@
if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry && !waspanic {
tracepc--
}
+ file, line := funcline(f, tracepc)
+ inldata := funcdata(f, _FUNCDATA_InlTree)
+ if inldata != nil {
+ inltree := (*[1 << 20]inlinedCall)(inldata)
+ ix := pcdatavalue(f, _PCDATA_InlTreeIndex, tracepc, nil)
+ for ix != -1 {
+ name := funcnameFromNameoff(f, inltree[ix].func_)
+ print(name, "(...)\n")
+ print("\t", file, ":", line, "\n")
+
+ file = funcfile(f, inltree[ix].file)
+ line = inltree[ix].line
+ ix = inltree[ix].parent
+ }
+ }
name := funcname(f)
if name == "runtime.gopanic" {
name = "panic"
@@ -406,13 +428,12 @@
print(hex(argp[i]))
}
print(")\n")
- file, line := funcline(f, tracepc)
print("\t", file, ":", line)
if frame.pc > f.entry {
print(" +", hex(frame.pc-f.entry))
}
if g.m.throwing > 0 && gp == g.m.curg || level >= 2 {
- print(" fp=", hex(frame.fp), " sp=", hex(frame.sp))
+ print(" fp=", hex(frame.fp), " sp=", hex(frame.sp), " pc=", hex(frame.pc))
}
print("\n")
nprint++
@@ -436,7 +457,7 @@
waspanic = f.entry == sigpanicPC
// Do not unwind past the bottom of the stack.
- if flr == nil {
+ if !flr.valid() {
break
}
@@ -459,7 +480,7 @@
}
f = findfunc(frame.pc)
frame.fn = f
- if f == nil {
+ if !f.valid() {
frame.pc = x
} else if funcspdelta(f, frame.pc, &cache) == 0 {
frame.lr = x
@@ -530,13 +551,6 @@
throw("traceback has leftover defers")
}
- if callback != nil && n < max && len(stkbar) > 0 {
- print("runtime: g", gp.goid, ": leftover stack barriers ")
- gcPrintStkbars(stkbarG, len(stkbarG.stkbar)-len(stkbar))
- print("\n")
- throw("traceback has leftover stack barriers")
- }
-
if callback != nil && n < max && frame.sp != gp.stktopsp {
print("runtime: g", gp.goid, ": frame.sp=", hex(frame.sp), " top=", hex(gp.stktopsp), "\n")
print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "] n=", n, " max=", max, "\n")
@@ -561,7 +575,7 @@
// call, ctxt must be nil (getArgInfo will retrieve what it needs from
// the active stack frame). If this is a deferred call, ctxt must be
// the function object that was deferred.
-func getArgInfo(frame *stkframe, f *_func, needArgMap bool, ctxt *funcval) (arglen uintptr, argmap *bitvector) {
+func getArgInfo(frame *stkframe, f funcInfo, needArgMap bool, ctxt *funcval) (arglen uintptr, argmap *bitvector) {
arglen = uintptr(f.args)
if needArgMap && f.args == _ArgsSizeUnknown {
// Extract argument bitmaps for reflect stubs from the calls they made to reflect.
@@ -633,7 +647,7 @@
// Show what created goroutine, except main goroutine (goid 1).
pc := gp.gopc
f := findfunc(pc)
- if f != nil && showframe(f, gp, false) && gp.goid != 1 {
+ if f.valid() && showframe(f, gp, false) && gp.goid != 1 {
print("created by ", funcname(f), "\n")
tracepc := pc // back up to CALL instruction for funcline.
if pc > f.entry {
@@ -713,7 +727,7 @@
return gentraceback(^uintptr(0), ^uintptr(0), 0, gp, skip, &pcbuf[0], len(pcbuf), nil, nil, 0)
}
-func showframe(f *_func, gp *g, firstFrame bool) bool {
+func showframe(f funcInfo, gp *g, firstFrame bool) bool {
g := getg()
if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) {
return true
@@ -730,7 +744,7 @@
return true
}
- return level > 1 || f != nil && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name))
+ return level > 1 || f.valid() && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name))
}
// isExportedRuntime reports whether name is an exported runtime function.
@@ -821,7 +835,7 @@
}
// Does f mark the top of a goroutine stack?
-func topofstack(f *_func) bool {
+func topofstack(f funcInfo) bool {
pc := f.entry
return pc == goexitPC ||
pc == mstartPC ||
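
The new gentraceback doc comment above specifies that a partially skipped, inlined PC is followed in pcbuf by runtime.skipPleaseUseCallersFrames+N, where the stub is sizeofSkipFunction (256) bytes long. A hedged, self-contained sketch of how a pcbuf consumer could recognize that sentinel and recover N; the stub address and the decodeSkip helper are illustrative, not the runtime's actual decoder:

package main

import "fmt"

// Hypothetical stand-ins for the runtime's skipPC and sizeofSkipFunction:
// the real skipPC is funcPC(skipPleaseUseCallersFrames), a 256-byte stub
// defined in assembly, so any PC in [skipPC, skipPC+256) is the sentinel.
const (
	sizeofSkipFunction = 256
	skipPC             = uintptr(0x1000) // assumed stub address, for illustration
)

// decodeSkip shows how a pcbuf consumer could recover N, the number of
// logical (inlined) frames to drop from the expansion of pcbuf[0], when
// gentraceback has stored skipPC+N in pcbuf[1].
func decodeSkip(pcbuf []uintptr) (pc uintptr, logicalSkip int) {
	pc = pcbuf[0]
	if len(pcbuf) > 1 && pcbuf[1] >= skipPC && pcbuf[1] < skipPC+sizeofSkipFunction {
		logicalSkip = int(pcbuf[1] - skipPC)
	}
	return
}

func main() {
	pc, n := decodeSkip([]uintptr{0x42f0, skipPC + 2})
	fmt.Printf("pc=%#x, drop %d logical frames\n", pc, n)
}
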
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 3ecc54c..bf54d54 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -22,7 +22,7 @@
tflagNamed tflag = 1 << 2
)
-// Needs to be in sync with ../cmd/compile/internal/ld/decodesym.go:/^func.commonsize,
+// Needs to be in sync with ../cmd/link/internal/ld/decodesym.go:/^func.commonsize,
// ../cmd/compile/internal/gc/reflect.go:/^func.dcommontype and
// ../reflect/type.go:/^type.rtype.
type _type struct {
@@ -390,9 +390,13 @@
}
type structfield struct {
- name name
- typ *_type
- offset uintptr
+ name name
+ typ *_type
+ offsetAnon uintptr
+}
+
+func (f *structfield) offset() uintptr {
+ return f.offsetAnon >> 1
}
type structtype struct {
@@ -507,7 +511,8 @@
for _, tl := range md.typelinks {
t := (*_type)(unsafe.Pointer(md.types + uintptr(tl)))
for _, candidate := range typehash[t.hash] {
- if typesEqual(t, candidate) {
+ seen := map[_typePair]struct{}{}
+ if typesEqual(t, candidate, seen) {
t = candidate
break
}
@@ -520,6 +525,11 @@
}
}
+type _typePair struct {
+ t1 *_type
+ t2 *_type
+}
+
// typesEqual reports whether two types are equal.
//
// Everywhere in the runtime and reflect packages, it is assumed that
@@ -532,7 +542,17 @@
// back into earlier ones.
//
// Only typelinksinit needs this function.
-func typesEqual(t, v *_type) bool {
+func typesEqual(t, v *_type, seen map[_typePair]struct{}) bool {
+ tp := _typePair{t, v}
+ if _, ok := seen[tp]; ok {
+ return true
+ }
+
+ // Mark these types as seen, and thus equivalent, which prevents an
+ // infinite loop if the two types are identical but recursively defined
+ // and loaded from different modules.
+ seen[tp] = struct{}{}
+
if t == v {
return true
}
@@ -564,11 +584,11 @@
case kindArray:
at := (*arraytype)(unsafe.Pointer(t))
av := (*arraytype)(unsafe.Pointer(v))
- return typesEqual(at.elem, av.elem) && at.len == av.len
+ return typesEqual(at.elem, av.elem, seen) && at.len == av.len
case kindChan:
ct := (*chantype)(unsafe.Pointer(t))
cv := (*chantype)(unsafe.Pointer(v))
- return ct.dir == cv.dir && typesEqual(ct.elem, cv.elem)
+ return ct.dir == cv.dir && typesEqual(ct.elem, cv.elem, seen)
case kindFunc:
ft := (*functype)(unsafe.Pointer(t))
fv := (*functype)(unsafe.Pointer(v))
@@ -577,13 +597,13 @@
}
tin, vin := ft.in(), fv.in()
for i := 0; i < len(tin); i++ {
- if !typesEqual(tin[i], vin[i]) {
+ if !typesEqual(tin[i], vin[i], seen) {
return false
}
}
tout, vout := ft.out(), fv.out()
for i := 0; i < len(tout); i++ {
- if !typesEqual(tout[i], vout[i]) {
+ if !typesEqual(tout[i], vout[i], seen) {
return false
}
}
@@ -612,7 +632,7 @@
}
tityp := resolveTypeOff(unsafe.Pointer(tm), tm.ityp)
vityp := resolveTypeOff(unsafe.Pointer(vm), vm.ityp)
- if !typesEqual(tityp, vityp) {
+ if !typesEqual(tityp, vityp, seen) {
return false
}
}
@@ -620,15 +640,15 @@
case kindMap:
mt := (*maptype)(unsafe.Pointer(t))
mv := (*maptype)(unsafe.Pointer(v))
- return typesEqual(mt.key, mv.key) && typesEqual(mt.elem, mv.elem)
+ return typesEqual(mt.key, mv.key, seen) && typesEqual(mt.elem, mv.elem, seen)
case kindPtr:
pt := (*ptrtype)(unsafe.Pointer(t))
pv := (*ptrtype)(unsafe.Pointer(v))
- return typesEqual(pt.elem, pv.elem)
+ return typesEqual(pt.elem, pv.elem, seen)
case kindSlice:
st := (*slicetype)(unsafe.Pointer(t))
sv := (*slicetype)(unsafe.Pointer(v))
- return typesEqual(st.elem, sv.elem)
+ return typesEqual(st.elem, sv.elem, seen)
case kindStruct:
st := (*structtype)(unsafe.Pointer(t))
sv := (*structtype)(unsafe.Pointer(v))
@@ -644,13 +664,13 @@
if tf.name.pkgPath() != vf.name.pkgPath() {
return false
}
- if !typesEqual(tf.typ, vf.typ) {
+ if !typesEqual(tf.typ, vf.typ, seen) {
return false
}
if tf.name.tag() != vf.name.tag() {
return false
}
- if tf.offset != vf.offset {
+ if tf.offsetAnon != vf.offsetAnon {
return false
}
}
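
typesEqual now threads a seen map of _typePair through every recursive call; marking a pair before recursing is what lets structurally identical but recursively defined types, loaded from different modules, compare equal without looping forever. A small self-contained sketch of the technique on a toy type graph (node and pair are stand-ins for *_type and _typePair, not the runtime's types):

package main

import "fmt"

// node and pair are toy stand-ins for *_type and _typePair; the point is
// the seen map, which records a pair as (assumed) equal before recursing
// so that identical but recursively defined types terminate.
type node struct {
	name string
	elem *node
}

type pair struct{ a, b *node }

// equal reports whether two possibly cyclic type graphs are structurally equal.
func equal(a, b *node, seen map[pair]struct{}) bool {
	if a == b {
		return true
	}
	p := pair{a, b}
	if _, ok := seen[p]; ok {
		return true
	}
	seen[p] = struct{}{}
	if a.name != b.name {
		return false
	}
	if (a.elem == nil) != (b.elem == nil) {
		return false
	}
	if a.elem == nil {
		return true
	}
	return equal(a.elem, b.elem, seen)
}

func main() {
	x := &node{name: "T"}
	x.elem = x // recursively defined
	y := &node{name: "T"}
	y.elem = y // a second, structurally identical copy
	fmt.Println(equal(x, y, map[pair]struct{}{})) // true, and it terminates
}
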
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index d4c411c..6a602ff 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -28,26 +28,10 @@
#include "funcdata.h"
#include "textflag.h"
-/* replaced use of R10 by R11 because the former can be the data segment base register */
-
-TEXT _mulv(SB), NOSPLIT, $0
- MOVW l0+0(FP), R2 /* l0 */
- MOVW h0+4(FP), R11 /* h0 */
- MOVW l1+8(FP), R4 /* l1 */
- MOVW h1+12(FP), R5 /* h1 */
- MULLU R4, R2, (R7,R6)
- MUL R11, R4, R8
- ADD R8, R7
- MUL R2, R5, R8
- ADD R8, R7
- MOVW R6, ret_lo+16(FP)
- MOVW R7, ret_hi+20(FP)
- RET
-
// trampoline for _sfloat2. passes LR as arg0 and
// saves registers R0-R13 and CPSR on the stack. R0-R12 and CPSR flags can
// be changed by _sfloat2.
-TEXT _sfloat(SB), NOSPLIT, $68-0 // 4 arg + 14*4 saved regs + cpsr + return value
+TEXT runtime·_sfloat(SB), NOSPLIT, $68-0 // 4 arg + 14*4 saved regs + cpsr + return value
MOVW R14, 4(R13)
MOVW R0, 8(R13)
MOVW $12(R13), R0
@@ -106,7 +90,7 @@
MOVW g_sigpc(g), LR
B runtime·sigpanic(SB)
-// func udiv(n, d uint32) (q, r uint32)
+// func runtime·udiv(n, d uint32) (q, r uint32)
// compiler knows the register usage of this function
// Reference:
// Sloss, Andrew et al.; ARM System Developer's Guide: Designing and Optimizing System Software
@@ -118,7 +102,11 @@
#define Ra R11
// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
-TEXT udiv(SB),NOSPLIT,$-4
+TEXT runtime·udiv(SB),NOSPLIT,$-4
+ MOVBU runtime·hardDiv(SB), Ra
+ CMP $0, Ra
+ BNE udiv_hardware
+
CLZ Rq, Rs // find normalizing shift
MOVW.S Rq<<Rs, Ra
MOVW $fast_udiv_tab<>-64(SB), RM
@@ -154,6 +142,14 @@
ADD.PL $2, Rq
RET
+// use hardware divider
+udiv_hardware:
+ DIVUHW Rq, Rr, Rs
+ MUL Rs, Rq, RM
+ RSB Rr, RM, Rr
+ MOVW Rs, Rq
+ RET
+
udiv_by_large_d:
// at this point we know d>=2^(31-6)=2^25
SUB $4, Ra, Ra
@@ -208,7 +204,7 @@
// The linker expects the result in RTMP
#define RTMP R11
-TEXT _divu(SB), NOSPLIT, $16-0
+TEXT runtime·_divu(SB), NOSPLIT, $16-0
// It's not strictly true that there are no local pointers.
// It could be that the saved registers Rq, Rr, Rs, and Rm
// contain pointers. However, the only way this can matter
@@ -229,7 +225,7 @@
MOVW Rn, Rr /* numerator */
MOVW g_m(g), Rq
MOVW m_divmod(Rq), Rq /* denominator */
- BL udiv(SB)
+ BL runtime·udiv(SB)
MOVW Rq, RTMP
MOVW 4(R13), Rq
MOVW 8(R13), Rr
@@ -237,7 +233,7 @@
MOVW 16(R13), RM
RET
-TEXT _modu(SB), NOSPLIT, $16-0
+TEXT runtime·_modu(SB), NOSPLIT, $16-0
NO_LOCAL_POINTERS
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
@@ -247,7 +243,7 @@
MOVW Rn, Rr /* numerator */
MOVW g_m(g), Rq
MOVW m_divmod(Rq), Rq /* denominator */
- BL udiv(SB)
+ BL runtime·udiv(SB)
MOVW Rr, RTMP
MOVW 4(R13), Rq
MOVW 8(R13), Rr
@@ -255,7 +251,7 @@
MOVW 16(R13), RM
RET
-TEXT _div(SB),NOSPLIT,$16-0
+TEXT runtime·_div(SB),NOSPLIT,$16-0
NO_LOCAL_POINTERS
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
@@ -271,7 +267,7 @@
BGE d2
RSB $0, Rq, Rq
d0:
- BL udiv(SB) /* none/both neg */
+ BL runtime·udiv(SB) /* none/both neg */
MOVW Rq, RTMP
B out1
d1:
@@ -279,7 +275,7 @@
BGE d0
RSB $0, Rq, Rq
d2:
- BL udiv(SB) /* one neg */
+ BL runtime·udiv(SB) /* one neg */
RSB $0, Rq, RTMP
out1:
MOVW 4(R13), Rq
@@ -288,7 +284,7 @@
MOVW 16(R13), RM
RET
-TEXT _mod(SB),NOSPLIT,$16-0
+TEXT runtime·_mod(SB),NOSPLIT,$16-0
NO_LOCAL_POINTERS
MOVW Rq, 4(R13)
MOVW Rr, 8(R13)
@@ -302,11 +298,11 @@
CMP $0, Rr
BGE m1
RSB $0, Rr, Rr
- BL udiv(SB) /* neg numerator */
+ BL runtime·udiv(SB) /* neg numerator */
RSB $0, Rr, RTMP
B out
m1:
- BL udiv(SB) /* pos numerator */
+ BL runtime·udiv(SB) /* pos numerator */
MOVW Rr, RTMP
out:
MOVW 4(R13), Rq
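
runtime·udiv above now checks a runtime·hardDiv byte and branches to a DIVUHW-based path when it is set, keeping the shift-and-subtract routine as the fallback. A Go-level sketch of that dispatch shape, assuming hardDiv is populated during CPU feature detection; the software fallback here is a plain restoring divider for illustration, not a transcription of the assembly:

package main

import "fmt"

// hardDiv stands in for runtime·hardDiv: a flag set once at startup when
// CPU feature detection reports integer divide instructions.
var hardDiv = false

// udiv returns the quotient and remainder of n/d (d must be nonzero),
// preferring the hardware divider when available and otherwise falling
// back to a restoring shift-and-subtract loop.
func udiv(n, d uint32) (q, r uint32) {
	if hardDiv {
		return n / d, n % d // uses the divide instruction where the target has one
	}
	for i := 31; i >= 0; i-- {
		r <<= 1
		r |= (n >> uint(i)) & 1
		if r >= d {
			r -= d
			q |= 1 << uint(i)
		}
	}
	return
}

func main() {
	fmt.Println(udiv(100, 7)) // 14 2
}
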
diff --git a/src/runtime/write_err_android.go b/src/runtime/write_err_android.go
index 748dec6..bf99b5f 100644
--- a/src/runtime/write_err_android.go
+++ b/src/runtime/write_err_android.go
@@ -144,7 +144,7 @@
// hdr[3:7] sec unsigned uint32, little endian.
// hdr[7:11] nsec unsigned uint32, little endian.
hdr[0] = 0 // LOG_ID_MAIN
- sec, nsec := time_now()
+ sec, nsec := walltime()
packUint32(hdr[3:7], uint32(sec))
packUint32(hdr[7:11], uint32(nsec))