Update prebuilts to go1.7rc1 ab/3043704

toolchain/go sha ffb9ee3a

Change-Id: I230dd018def6fd975e28add1eb70aae6c54ad5d7
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index c666836..6694349 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -4,33 +4,28 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
-	c0 = uintptr((8-ptrSize)/4*2860486313 + (ptrSize-4)/4*33054211828000289)
-	c1 = uintptr((8-ptrSize)/4*3267000013 + (ptrSize-4)/4*23344194077549503)
+	c0 = uintptr((8-sys.PtrSize)/4*2860486313 + (sys.PtrSize-4)/4*33054211828000289)
+	c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503)
 )
 
 // type algorithms - known to compiler
 const (
-	alg_MEM = iota
+	alg_NOEQ = iota
 	alg_MEM0
 	alg_MEM8
 	alg_MEM16
 	alg_MEM32
 	alg_MEM64
 	alg_MEM128
-	alg_NOEQ
-	alg_NOEQ0
-	alg_NOEQ8
-	alg_NOEQ16
-	alg_NOEQ32
-	alg_NOEQ64
-	alg_NOEQ128
 	alg_STRING
 	alg_INTER
 	alg_NILINTER
-	alg_SLICE
 	alg_FLOAT32
 	alg_FLOAT64
 	alg_CPLX64
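
The new c0/c1 definitions select a word-size-appropriate constant arithmetically instead of via per-arch files: exactly one of (8-sys.PtrSize)/4 and (sys.PtrSize-4)/4 is 1 and the other 0. A minimal sketch of the idiom, assuming only that pointers are 4 or 8 bytes (PtrSize here is a stand-in for sys.PtrSize):

	// PtrSize is 4 on 32-bit targets and 8 on 64-bit targets.
	const PtrSize = 4 << (^uintptr(0) >> 63)

	// pick is 11 on 32-bit targets and 22 on 64-bit targets, chosen at compile time.
	const pick = (8-PtrSize)/4*11 + (PtrSize-4)/4*22
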
@@ -69,29 +64,21 @@
 }
 
 // memhash_varlen is defined in assembly because it needs access
-// to the closure.  It appears here to provide an argument
+// to the closure. It appears here to provide an argument
 // signature for the assembly routine.
 func memhash_varlen(p unsafe.Pointer, h uintptr) uintptr
 
 var algarray = [alg_max]typeAlg{
-	alg_MEM:      {nil, nil}, // not used
+	alg_NOEQ:     {nil, nil},
 	alg_MEM0:     {memhash0, memequal0},
 	alg_MEM8:     {memhash8, memequal8},
 	alg_MEM16:    {memhash16, memequal16},
 	alg_MEM32:    {memhash32, memequal32},
 	alg_MEM64:    {memhash64, memequal64},
 	alg_MEM128:   {memhash128, memequal128},
-	alg_NOEQ:     {nil, nil},
-	alg_NOEQ0:    {nil, nil},
-	alg_NOEQ8:    {nil, nil},
-	alg_NOEQ16:   {nil, nil},
-	alg_NOEQ32:   {nil, nil},
-	alg_NOEQ64:   {nil, nil},
-	alg_NOEQ128:  {nil, nil},
 	alg_STRING:   {strhash, strequal},
 	alg_INTER:    {interhash, interequal},
 	alg_NILINTER: {nilinterhash, nilinterequal},
-	alg_SLICE:    {nil, nil},
 	alg_FLOAT32:  {f32hash, f32equal},
 	alg_FLOAT64:  {f64hash, f64equal},
 	alg_CPLX64:   {c64hash, c64equal},
@@ -159,7 +146,7 @@
 	t := tab._type
 	fn := t.alg.hash
 	if fn == nil {
-		panic(errorString("hash of unhashable type " + *t._string))
+		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
@@ -176,7 +163,7 @@
 	}
 	fn := t.alg.hash
 	if fn == nil {
-		panic(errorString("hash of unhashable type " + *t._string))
+		panic(errorString("hash of unhashable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return c1 * fn(unsafe.Pointer(&a.data), h^c0)
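
Both hunks above swap the *t._string dereference for the new t.string() accessor. The panic itself is what a program observes when a map key's dynamic type has no hash function; a minimal reproduction:

	package main

	func main() {
		m := map[interface{}]bool{}
		m[[]int{1}] = true // runtime panic: hash of unhashable type []int
	}
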
@@ -185,13 +172,6 @@
 	}
 }
 
-func memequal(p, q unsafe.Pointer, size uintptr) bool {
-	if p == q {
-		return true
-	}
-	return memeq(p, q, size)
-}
-
 func memequal0(p, q unsafe.Pointer) bool {
 	return true
 }
@@ -226,18 +206,12 @@
 	return *(*string)(p) == *(*string)(q)
 }
 func interequal(p, q unsafe.Pointer) bool {
-	return ifaceeq(*(*interface {
-		f()
-	})(p), *(*interface {
-		f()
-	})(q))
+	return ifaceeq(*(*iface)(p), *(*iface)(q))
 }
 func nilinterequal(p, q unsafe.Pointer) bool {
-	return efaceeq(*(*interface{})(p), *(*interface{})(q))
+	return efaceeq(*(*eface)(p), *(*eface)(q))
 }
-func efaceeq(p, q interface{}) bool {
-	x := (*eface)(unsafe.Pointer(&p))
-	y := (*eface)(unsafe.Pointer(&q))
+func efaceeq(x, y eface) bool {
 	t := x._type
 	if t != y._type {
 		return false
@@ -247,18 +221,14 @@
 	}
 	eq := t.alg.equal
 	if eq == nil {
-		panic(errorString("comparing uncomparable type " + *t._string))
+		panic(errorString("comparing uncomparable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
 	}
 	return eq(x.data, y.data)
 }
-func ifaceeq(p, q interface {
-	f()
-}) bool {
-	x := (*iface)(unsafe.Pointer(&p))
-	y := (*iface)(unsafe.Pointer(&q))
+func ifaceeq(x, y iface) bool {
 	xtab := x.tab
 	if xtab != y.tab {
 		return false
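
efaceeq and ifaceeq now take the runtime's interface headers directly rather than laundering them through declared interface types and unsafe casts. For orientation, the headers they receive look like this (a sketch matching the fields used above, not necessarily the authoritative definitions):

	type eface struct {
		_type *_type         // descriptor of the dynamic type
		data  unsafe.Pointer // the value, or a pointer to it
	}

	type iface struct {
		tab  *itab // caches the (interface type, concrete type) pairing
		data unsafe.Pointer
	}
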
@@ -269,7 +239,7 @@
 	t := xtab._type
 	eq := t.alg.equal
 	if eq == nil {
-		panic(errorString("comparing uncomparable type " + *t._string))
+		panic(errorString("comparing uncomparable type " + t.string()))
 	}
 	if isDirectIface(t) {
 		return eq(noescape(unsafe.Pointer(&x.data)), noescape(unsafe.Pointer(&y.data)))
@@ -311,7 +281,7 @@
 	memclr(s.array, uintptr(s.len))
 }
 
-const hashRandomBytes = ptrSize / 4 * 64
+const hashRandomBytes = sys.PtrSize / 4 * 64
 
 // used in asm_{386,amd64}.s to seed the hash function
 var aeskeysched [hashRandomBytes]byte
@@ -334,6 +304,9 @@
 		getRandomData(aeskeysched[:])
 		return
 	}
-	getRandomData((*[len(hashkey) * ptrSize]byte)(unsafe.Pointer(&hashkey))[:])
-	hashkey[0] |= 1 // make sure this number is odd
+	getRandomData((*[len(hashkey) * sys.PtrSize]byte)(unsafe.Pointer(&hashkey))[:])
+	hashkey[0] |= 1 // make sure these numbers are odd
+	hashkey[1] |= 1
+	hashkey[2] |= 1
+	hashkey[3] |= 1
 }
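
Making every word of hashkey odd matters because the words are used as multipliers: an odd multiplier is invertible mod 2^w, so multiplying by it permutes the word space instead of collapsing low bits. A one-line illustration of the property being preserved (constant chosen arbitrarily):

	// x*k mod 2^64 is a bijection iff k is odd; an even k would lose low bits.
	const k uint64 = 0x9E3779B97F4A7C15 | 1 // any odd word behaves this way

	func mix(x uint64) uint64 { return x * k }
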
diff --git a/src/runtime/append_test.go b/src/runtime/append_test.go
index a67dc9b..6b8968e 100644
--- a/src/runtime/append_test.go
+++ b/src/runtime/append_test.go
@@ -3,10 +3,59 @@
 // license that can be found in the LICENSE file.
 package runtime_test
 
-import "testing"
+import (
+	"fmt"
+	"testing"
+)
 
 const N = 20
 
+func BenchmarkMakeSlice(b *testing.B) {
+	var x []byte
+	for i := 0; i < b.N; i++ {
+		x = make([]byte, 32)
+		_ = x
+	}
+}
+
+func BenchmarkGrowSliceBytes(b *testing.B) {
+	b.StopTimer()
+	var x = make([]byte, 9)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		_ = append([]byte(nil), x...)
+	}
+}
+
+func BenchmarkGrowSliceInts(b *testing.B) {
+	b.StopTimer()
+	var x = make([]int, 9)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		_ = append([]int(nil), x...)
+	}
+}
+
+func BenchmarkGrowSlicePtr(b *testing.B) {
+	b.StopTimer()
+	var x = make([]*byte, 9)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		_ = append([]*byte(nil), x...)
+	}
+}
+
+type struct24 struct{ a, b, c int64 }
+
+func BenchmarkGrowSliceStruct24Bytes(b *testing.B) {
+	b.StopTimer()
+	var x = make([]struct24, 9)
+	b.StartTimer()
+	for i := 0; i < b.N; i++ {
+		_ = append([]struct24(nil), x...)
+	}
+}
+
 func BenchmarkAppend(b *testing.B) {
 	b.StopTimer()
 	x := make([]int, 0, N)
@@ -38,75 +87,37 @@
 	}
 }
 
-func benchmarkAppendBytes(b *testing.B, length int) {
-	b.StopTimer()
-	x := make([]byte, 0, N)
-	y := make([]byte, length)
-	b.StartTimer()
-	for i := 0; i < b.N; i++ {
-		x = x[0:0]
-		x = append(x, y...)
+func BenchmarkAppendSlice(b *testing.B) {
+	for _, length := range []int{1, 4, 7, 8, 15, 16, 32} {
+		b.Run(fmt.Sprint(length, "Bytes"), func(b *testing.B) {
+			x := make([]byte, 0, N)
+			y := make([]byte, length)
+			for i := 0; i < b.N; i++ {
+				x = x[0:0]
+				x = append(x, y...)
+			}
+		})
 	}
 }
 
-func BenchmarkAppend1Byte(b *testing.B) {
-	benchmarkAppendBytes(b, 1)
-}
-
-func BenchmarkAppend4Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 4)
-}
-
-func BenchmarkAppend7Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 7)
-}
-
-func BenchmarkAppend8Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 8)
-}
-
-func BenchmarkAppend15Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 15)
-}
-
-func BenchmarkAppend16Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 16)
-}
-
-func BenchmarkAppend32Bytes(b *testing.B) {
-	benchmarkAppendBytes(b, 32)
-}
-
-func benchmarkAppendStr(b *testing.B, str string) {
-	b.StopTimer()
-	x := make([]byte, 0, N)
-	b.StartTimer()
-	for i := 0; i < b.N; i++ {
-		x = x[0:0]
-		x = append(x, str...)
+func BenchmarkAppendStr(b *testing.B) {
+	for _, str := range []string{
+		"1",
+		"1234",
+		"12345678",
+		"1234567890123456",
+		"12345678901234567890123456789012",
+	} {
+		b.Run(fmt.Sprint(len(str), "Bytes"), func(b *testing.B) {
+			x := make([]byte, 0, N)
+			for i := 0; i < b.N; i++ {
+				x = x[0:0]
+				x = append(x, str...)
+			}
+		})
 	}
 }
 
-func BenchmarkAppendStr1Byte(b *testing.B) {
-	benchmarkAppendStr(b, "1")
-}
-
-func BenchmarkAppendStr4Bytes(b *testing.B) {
-	benchmarkAppendStr(b, "1234")
-}
-
-func BenchmarkAppendStr8Bytes(b *testing.B) {
-	benchmarkAppendStr(b, "12345678")
-}
-
-func BenchmarkAppendStr16Bytes(b *testing.B) {
-	benchmarkAppendStr(b, "1234567890123456")
-}
-
-func BenchmarkAppendStr32Bytes(b *testing.B) {
-	benchmarkAppendStr(b, "12345678901234567890123456789012")
-}
-
 func BenchmarkAppendSpecialCase(b *testing.B) {
 	b.StopTimer()
 	x := make([]int, 0, N)
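
The rewrites above use the sub-benchmark API new in Go 1.7: names passed to b.Run nest under the parent with a slash, so a single size can still be selected from the command line, e.g. (invocation shown for illustration):

	go test -run=NONE -bench=BenchmarkAppendSlice/8Bytes runtime
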
@@ -149,42 +160,153 @@
 	}
 }
 
-func benchmarkCopySlice(b *testing.B, l int) {
-	s := make([]byte, l)
-	buf := make([]byte, 4096)
-	var n int
-	for i := 0; i < b.N; i++ {
-		n = copy(buf, s)
+func BenchmarkCopy(b *testing.B) {
+	for _, l := range []int{1, 2, 4, 8, 12, 16, 32, 128, 1024} {
+		buf := make([]byte, 4096)
+		b.Run(fmt.Sprint(l, "Byte"), func(b *testing.B) {
+			s := make([]byte, l)
+			var n int
+			for i := 0; i < b.N; i++ {
+				n = copy(buf, s)
+			}
+			b.SetBytes(int64(n))
+		})
+		b.Run(fmt.Sprint(l, "String"), func(b *testing.B) {
+			s := string(make([]byte, l))
+			var n int
+			for i := 0; i < b.N; i++ {
+				n = copy(buf, s)
+			}
+			b.SetBytes(int64(n))
+		})
 	}
-	b.SetBytes(int64(n))
 }
 
-func benchmarkCopyStr(b *testing.B, l int) {
-	s := string(make([]byte, l))
-	buf := make([]byte, 4096)
-	var n int
-	for i := 0; i < b.N; i++ {
-		n = copy(buf, s)
-	}
-	b.SetBytes(int64(n))
+var (
+	sByte []byte
+	s1Ptr []uintptr
+	s2Ptr [][2]uintptr
+	s3Ptr [][3]uintptr
+	s4Ptr [][4]uintptr
+)
+
+// BenchmarkAppendInPlace tests the performance of append
+// when the result is being written back to the same slice.
+// In order for the in-place optimization to occur,
+// the slice must be referred to by address;
+// using a global is an easy way to trigger that.
+// We test the "grow" and "no grow" paths separately,
+// but not the "normal" (occasionally grow) path,
+// because it is a blend of the other two.
+// We use small numbers and small sizes in an attempt
+// to avoid benchmarking memory allocation and copying.
+// We use scalars instead of pointers in an attempt
+// to avoid benchmarking the write barriers.
+// We benchmark four common sizes (byte, pointer, string/interface, slice),
+// and one larger size.
+func BenchmarkAppendInPlace(b *testing.B) {
+	b.Run("NoGrow", func(b *testing.B) {
+		const C = 128
+
+		b.Run("Byte", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				sByte = make([]byte, C)
+				for j := 0; j < C; j++ {
+					sByte = append(sByte, 0x77)
+				}
+			}
+		})
+
+		b.Run("1Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s1Ptr = make([]uintptr, C)
+				for j := 0; j < C; j++ {
+					s1Ptr = append(s1Ptr, 0x77)
+				}
+			}
+		})
+
+		b.Run("2Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s2Ptr = make([][2]uintptr, C)
+				for j := 0; j < C; j++ {
+					s2Ptr = append(s2Ptr, [2]uintptr{0x77, 0x88})
+				}
+			}
+		})
+
+		b.Run("3Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s3Ptr = make([][3]uintptr, C)
+				for j := 0; j < C; j++ {
+					s3Ptr = append(s3Ptr, [3]uintptr{0x77, 0x88, 0x99})
+				}
+			}
+		})
+
+		b.Run("4Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s4Ptr = make([][4]uintptr, C)
+				for j := 0; j < C; j++ {
+					s4Ptr = append(s4Ptr, [4]uintptr{0x77, 0x88, 0x99, 0xAA})
+				}
+			}
+		})
+
+	})
+
+	b.Run("Grow", func(b *testing.B) {
+		const C = 5
+
+		b.Run("Byte", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				sByte = make([]byte, 0)
+				for j := 0; j < C; j++ {
+					sByte = append(sByte, 0x77)
+					sByte = sByte[:cap(sByte)]
+				}
+			}
+		})
+
+		b.Run("1Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s1Ptr = make([]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s1Ptr = append(s1Ptr, 0x77)
+					s1Ptr = s1Ptr[:cap(s1Ptr)]
+				}
+			}
+		})
+
+		b.Run("2Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s2Ptr = make([][2]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s2Ptr = append(s2Ptr, [2]uintptr{0x77, 0x88})
+					s2Ptr = s2Ptr[:cap(s2Ptr)]
+				}
+			}
+		})
+
+		b.Run("3Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s3Ptr = make([][3]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s3Ptr = append(s3Ptr, [3]uintptr{0x77, 0x88, 0x99})
+					s3Ptr = s3Ptr[:cap(s3Ptr)]
+				}
+			}
+		})
+
+		b.Run("4Ptr", func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				s4Ptr = make([][4]uintptr, 0)
+				for j := 0; j < C; j++ {
+					s4Ptr = append(s4Ptr, [4]uintptr{0x77, 0x88, 0x99, 0xAA})
+					s4Ptr = s4Ptr[:cap(s4Ptr)]
+				}
+			}
+		})
+
+	})
 }
-
-func BenchmarkCopy1Byte(b *testing.B)    { benchmarkCopySlice(b, 1) }
-func BenchmarkCopy2Byte(b *testing.B)    { benchmarkCopySlice(b, 2) }
-func BenchmarkCopy4Byte(b *testing.B)    { benchmarkCopySlice(b, 4) }
-func BenchmarkCopy8Byte(b *testing.B)    { benchmarkCopySlice(b, 8) }
-func BenchmarkCopy12Byte(b *testing.B)   { benchmarkCopySlice(b, 12) }
-func BenchmarkCopy16Byte(b *testing.B)   { benchmarkCopySlice(b, 16) }
-func BenchmarkCopy32Byte(b *testing.B)   { benchmarkCopySlice(b, 32) }
-func BenchmarkCopy128Byte(b *testing.B)  { benchmarkCopySlice(b, 128) }
-func BenchmarkCopy1024Byte(b *testing.B) { benchmarkCopySlice(b, 1024) }
-
-func BenchmarkCopy1String(b *testing.B)    { benchmarkCopyStr(b, 1) }
-func BenchmarkCopy2String(b *testing.B)    { benchmarkCopyStr(b, 2) }
-func BenchmarkCopy4String(b *testing.B)    { benchmarkCopyStr(b, 4) }
-func BenchmarkCopy8String(b *testing.B)    { benchmarkCopyStr(b, 8) }
-func BenchmarkCopy12String(b *testing.B)   { benchmarkCopyStr(b, 12) }
-func BenchmarkCopy16String(b *testing.B)   { benchmarkCopyStr(b, 16) }
-func BenchmarkCopy32String(b *testing.B)   { benchmarkCopyStr(b, 32) }
-func BenchmarkCopy128String(b *testing.B)  { benchmarkCopyStr(b, 128) }
-func BenchmarkCopy1024String(b *testing.B) { benchmarkCopyStr(b, 1024) }
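
As the BenchmarkAppendInPlace comment explains, the in-place optimization requires the destination slice to be referred to by address, which a package-level variable guarantees. A minimal sketch of the pattern under test, assuming nothing beyond what that comment states:

	var buf []byte // package-level: append's result is stored back through &buf

	func fill() {
		buf = append(buf, 0x77) // eligible for the in-place fast path
	}
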
diff --git a/src/runtime/arch1_386.go b/src/runtime/arch1_386.go
deleted file mode 100644
index d41696a..0000000
--- a/src/runtime/arch1_386.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '8'
-	_BigEndian     = 0
-	_CacheLineSize = 64
-	_PhysPageSize  = goos_nacl*65536 + (1-goos_nacl)*4096 // 4k normally; 64k on NaCl
-	_PCQuantum     = 1
-	_Int64Align    = 4
-	hugePageSize   = 1 << 21
-)
diff --git a/src/runtime/arch1_amd64.go b/src/runtime/arch1_amd64.go
deleted file mode 100644
index 15f4cc6..0000000
--- a/src/runtime/arch1_amd64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '6'
-	_BigEndian     = 0
-	_CacheLineSize = 64
-	_PhysPageSize  = 4096
-	_PCQuantum     = 1
-	_Int64Align    = 8
-	hugePageSize   = 1 << 21
-)
diff --git a/src/runtime/arch1_amd64p32.go b/src/runtime/arch1_amd64p32.go
deleted file mode 100644
index 3c5456f..0000000
--- a/src/runtime/arch1_amd64p32.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '6'
-	_BigEndian     = 0
-	_CacheLineSize = 64
-	_PhysPageSize  = 65536*goos_nacl + 4096*(1-goos_nacl)
-	_PCQuantum     = 1
-	_Int64Align    = 8
-	hugePageSize   = 1 << 21
-)
diff --git a/src/runtime/arch1_arm.go b/src/runtime/arch1_arm.go
deleted file mode 100644
index 0ec2093..0000000
--- a/src/runtime/arch1_arm.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '5'
-	_BigEndian     = 0
-	_CacheLineSize = 32
-	_PhysPageSize  = 65536*goos_nacl + 4096*(1-goos_nacl)
-	_PCQuantum     = 4
-	_Int64Align    = 4
-	hugePageSize   = 0
-)
diff --git a/src/runtime/arch1_arm64.go b/src/runtime/arch1_arm64.go
deleted file mode 100644
index 29a87db..0000000
--- a/src/runtime/arch1_arm64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '7'
-	_BigEndian     = 0
-	_CacheLineSize = 32
-	_PhysPageSize  = 65536
-	_PCQuantum     = 4
-	_Int64Align    = 8
-	hugePageSize   = 0
-)
diff --git a/src/runtime/arch1_ppc64.go b/src/runtime/arch1_ppc64.go
deleted file mode 100644
index de6dd91..0000000
--- a/src/runtime/arch1_ppc64.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '9'
-	_BigEndian     = 1
-	_CacheLineSize = 64
-	_PhysPageSize  = 65536
-	_PCQuantum     = 4
-	_Int64Align    = 8
-	hugePageSize   = 0
-)
diff --git a/src/runtime/arch1_ppc64le.go b/src/runtime/arch1_ppc64le.go
deleted file mode 100644
index 9a55c71..0000000
--- a/src/runtime/arch1_ppc64le.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	thechar        = '9'
-	_BigEndian     = 0
-	_CacheLineSize = 64
-	_PhysPageSize  = 65536
-	_PCQuantum     = 4
-	_Int64Align    = 8
-	hugePageSize   = 0
-)
diff --git a/src/runtime/arch_386.go b/src/runtime/arch_386.go
deleted file mode 100644
index 79d38c7..0000000
--- a/src/runtime/arch_386.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint32
-type intptr int32 // TODO(rsc): remove
diff --git a/src/runtime/arch_amd64.go b/src/runtime/arch_amd64.go
deleted file mode 100644
index 270cd7b..0000000
--- a/src/runtime/arch_amd64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint64
-type intptr int64 // TODO(rsc): remove
diff --git a/src/runtime/arch_arm.go b/src/runtime/arch_arm.go
deleted file mode 100644
index 79d38c7..0000000
--- a/src/runtime/arch_arm.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint32
-type intptr int32 // TODO(rsc): remove
diff --git a/src/runtime/arch_arm64.go b/src/runtime/arch_arm64.go
deleted file mode 100644
index 270cd7b..0000000
--- a/src/runtime/arch_arm64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint64
-type intptr int64 // TODO(rsc): remove
diff --git a/src/runtime/arch_ppc64.go b/src/runtime/arch_ppc64.go
deleted file mode 100644
index 270cd7b..0000000
--- a/src/runtime/arch_ppc64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint64
-type intptr int64 // TODO(rsc): remove
diff --git a/src/runtime/arch_ppc64le.go b/src/runtime/arch_ppc64le.go
deleted file mode 100644
index 270cd7b..0000000
--- a/src/runtime/arch_ppc64le.go
+++ /dev/null
@@ -1,8 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-type uintreg uint64
-type intptr int64 // TODO(rsc): remove
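
All of the arch1_*.go and arch_*.go deletions above correspond to the per-architecture constants moving into the new runtime/internal/sys package (hence the sys.PtrSize references earlier in this change). A rough sketch of the replacement's shape, using the amd64 values from the deleted arch1_amd64.go; the exact naming and file layout inside runtime/internal/sys is an assumption:

	package sys

	const (
		CacheLineSize = 64
		PhysPageSize  = 4096
		PCQuantum     = 1
		Int64Align    = 8
		HugePageSize  = 1 << 21
	)
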
diff --git a/src/runtime/asm.s b/src/runtime/asm.s
index f1c812b..646dc2f 100644
--- a/src/runtime/asm.s
+++ b/src/runtime/asm.s
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index eb9ca63..ea11b2b 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -26,8 +26,35 @@
 	MOVL	SP, (g_stack+stack_hi)(BP)
 	
 	// find out information about the processor we're on
+#ifdef GOOS_nacl // NaCl doesn't like PUSHFL/POPFL
+	JMP 	has_cpuid
+#else
+	// first see if CPUID instruction is supported.
+	PUSHFL
+	PUSHFL
+	XORL	$(1<<21), 0(SP) // flip ID bit
+	POPFL
+	PUSHFL
+	POPL	AX
+	XORL	0(SP), AX
+	POPFL	// restore EFLAGS
+	TESTL	$(1<<21), AX
+	JNE 	has_cpuid
+#endif
+
+bad_proc: // show that the program requires MMX.
+	MOVL	$2, 0(SP)
+	MOVL	$bad_proc_msg<>(SB), 4(SP)
+	MOVL	$0x3d, 8(SP)
+	CALL	runtime·write(SB)
+	MOVL	$1, 0(SP)
+	CALL	runtime·exit(SB)
+	INT	$3
+
+has_cpuid:
 	MOVL	$0, AX
 	CPUID
+	MOVL	AX, SI
 	CMPL	AX, $0
 	JE	nocpuinfo
 
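The new prologue first proves CPUID itself is available (by toggling EFLAGS bit 21; skipped on NaCl, which rejects PUSHFL/POPFL), then refuses to run unless CPUID leaf 1 reports MMX, since the 386 runtime relies on MMX moves. The bit test, expressed in Go for illustration only:

	const cpuidEDXMMX = 1 << 23 // MMX bit in EDX of CPUID EAX=1

	func hasMMX(edx uint32) bool { return edx&cpuidEDXMMX != 0 }
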
@@ -43,10 +70,25 @@
 	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
 notintel:
 
+	// Load EAX=1 cpuid flags
 	MOVL	$1, AX
 	CPUID
-	MOVL	CX, runtime·cpuid_ecx(SB)
+	MOVL	CX, AX // Move to global variable clobbers CX when generating PIC
+	MOVL	AX, runtime·cpuid_ecx(SB)
 	MOVL	DX, runtime·cpuid_edx(SB)
+
+	// Check for MMX support
+	TESTL	$(1<<23), DX	// MMX
+	JZ 	bad_proc
+
+	// Load EAX=7/ECX=0 cpuid flags
+	CMPL	SI, $7
+	JLT	nocpuinfo
+	MOVL	$7, AX
+	MOVL	$0, CX
+	CPUID
+	MOVL	BX, runtime·cpuid_ebx7(SB)
+
 nocpuinfo:	
 
 	// if there is an _cgo_init, call it to let it
@@ -67,13 +109,15 @@
 	MOVL	AX, g_stackguard0(CX)
 	MOVL	AX, g_stackguard1(CX)
 
+#ifndef GOOS_windows
 	// skip runtime·ldt0setup(SB) and tls test after _cgo_init for non-windows
-	CMPL runtime·iswindows(SB), $0
-	JEQ ok
+	JMP ok
+#endif
 needtls:
+#ifdef GOOS_plan9
 	// skip runtime·ldt0setup(SB) and tls test on Plan 9 in all cases
-	CMPL	runtime·isplan9(SB), $1
-	JEQ	ok
+	JMP	ok
+#endif
 
 	// set up %gs
 	CALL	runtime·ldt0setup(SB)
@@ -81,21 +125,21 @@
 	// store through it, to make sure it works
 	get_tls(BX)
 	MOVL	$0x123, g(BX)
-	MOVL	runtime·tls0(SB), AX
+	MOVL	runtime·m0+m_tls(SB), AX
 	CMPL	AX, $0x123
 	JEQ	ok
 	MOVL	AX, 0	// abort
 ok:
 	// set up m and g "registers"
 	get_tls(BX)
-	LEAL	runtime·g0(SB), CX
-	MOVL	CX, g(BX)
+	LEAL	runtime·g0(SB), DX
+	MOVL	DX, g(BX)
 	LEAL	runtime·m0(SB), AX
 
 	// save m->g0 = g0
-	MOVL	CX, m_g0(AX)
+	MOVL	DX, m_g0(AX)
 	// save g0->m = m0
-	MOVL	AX, g_m(CX)
+	MOVL	AX, g_m(DX)
 
 	CALL	runtime·emptyfunc(SB)	// fault if stack check is wrong
 
@@ -126,6 +170,17 @@
 	INT $3
 	RET
 
+DATA	bad_proc_msg<>+0x00(SB)/8, $"This pro"
+DATA	bad_proc_msg<>+0x08(SB)/8, $"gram can"
+DATA	bad_proc_msg<>+0x10(SB)/8, $" only be"
+DATA	bad_proc_msg<>+0x18(SB)/8, $" run on "
+DATA	bad_proc_msg<>+0x20(SB)/8, $"processo"
+DATA	bad_proc_msg<>+0x28(SB)/8, $"rs with "
+DATA	bad_proc_msg<>+0x30(SB)/8, $"MMX supp"
+DATA	bad_proc_msg<>+0x38(SB)/4, $"ort."
+DATA	bad_proc_msg<>+0x3c(SB)/1, $0xa
+GLOBL	bad_proc_msg<>(SB), RODATA, $0x3d
+
 DATA	runtime·mainPC+0(SB)/4,$runtime·main(SB)
 GLOBL	runtime·mainPC(SB),RODATA,$4
 
@@ -181,13 +236,13 @@
 
 // func mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB), NOSPLIT, $0-4
 	MOVL	fn+0(FP), DI
-	
-	get_tls(CX)
-	MOVL	g(CX), AX	// save state in g->sched
+
+	get_tls(DX)
+	MOVL	g(DX), AX	// save state in g->sched
 	MOVL	0(SP), BX	// caller's PC
 	MOVL	BX, (g_sched+gobuf_pc)(AX)
 	LEAL	fn+0(FP), BX	// caller's SP
@@ -195,14 +250,14 @@
 	MOVL	AX, (g_sched+gobuf_g)(AX)
 
 	// switch to m->g0 & its stack, call fn
-	MOVL	g(CX), BX
+	MOVL	g(DX), BX
 	MOVL	g_m(BX), BX
 	MOVL	m_g0(BX), SI
 	CMPL	SI, AX	// if g == m->g0 call badmcall
 	JNE	3(PC)
 	MOVL	$runtime·badmcall(SB), AX
 	JMP	AX
-	MOVL	SI, g(CX)	// g = m->g0
+	MOVL	SI, g(DX)	// g = m->g0
 	MOVL	(g_sched+gobuf_sp)(SI), SP	// sp = m->g0->sched.sp
 	PUSHL	AX
 	MOVL	DI, DX
@@ -214,7 +269,7 @@
 	RET
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
@@ -246,13 +301,14 @@
 	CALL	AX
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVL	$runtime·systemstack_switch(SB), (g_sched+gobuf_pc)(AX)
 	MOVL	SP, (g_sched+gobuf_sp)(AX)
 	MOVL	AX, (g_sched+gobuf_g)(AX)
 
 	// switch to g0
+	get_tls(CX)
 	MOVL	DX, g(CX)
 	MOVL	(g_sched+gobuf_sp)(DX), BX
 	// make it look like mstart called systemstack on g0, to stop traceback
@@ -467,100 +523,6 @@
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-13
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+12(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-13
-	JMP	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-8
-	JMP	runtime·atomicstore(SB)
-
-// bool runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-21
-	MOVL	ptr+0(FP), BP
-	MOVL	old_lo+4(FP), AX
-	MOVL	old_hi+8(FP), DX
-	MOVL	new_lo+12(FP), BX
-	MOVL	new_hi+16(FP), CX
-	LOCK
-	CMPXCHG8B	0(BP)
-	SETEQ	ret+20(FP)
-	RET
-
-// bool casp(void **p, void *old, void *new)
-// Atomically:
-//	if(*p == old){
-//		*p = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-13
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+12(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	delta+4(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchgp1(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xchg(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -569,69 +531,6 @@
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-// uint64 atomicload64(uint64 volatile* addr);
-TEXT runtime·atomicload64(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), AX
-	TESTL	$7, AX
-	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
-	LEAL	ret_lo+4(FP), BX
-	// MOVQ (%EAX), %MM0
-	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
-	// MOVQ %MM0, 0(%EBX)
-	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
-	// EMMS
-	BYTE $0x0F; BYTE $0x77
-	RET
-
-// void runtime·atomicstore64(uint64 volatile* addr, uint64 v);
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), AX
-	TESTL	$7, AX
-	JZ	2(PC)
-	MOVL	0, AX // crash with nil ptr deref
-	// MOVQ and EMMS were introduced on the Pentium MMX.
-	// MOVQ 0x8(%ESP), %MM0
-	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
-	// MOVQ %MM0, (%EAX)
-	BYTE $0x0f; BYTE $0x7f; BYTE $0x00 
-	// EMMS
-	BYTE $0x0F; BYTE $0x77
-	// This is essentially a no-op, but it provides required memory fencing.
-	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
-	MOVL	$0, AX
-	LOCK
-	XADDL	AX, (SP)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), AX
-	MOVB	val+4(FP), BX
-	LOCK
-	ORB	BX, (AX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), AX
-	MOVB	val+4(FP), BX
-	LOCK
-	ANDB	BX, (AX)
-	RET
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
 	// compile barrier.
@@ -640,13 +539,20 @@
 // void jmpdefer(fn, sp);
 // called from deferreturn.
 // 1. pop the caller
-// 2. sub 5 bytes from the callers return
+// 2. sub 5 bytes (the length of CALL & a 32-bit displacement) from the caller's
+//    return (when building for shared libraries, subtract 16 bytes -- 5 bytes
+//    for CALL & displacement to call __x86.get_pc_thunk.cx, 6 bytes for the
+//    LEAL to load the offset into BX, and finally 5 for the call & displacement)
 // 3. jmp to the argument
 TEXT runtime·jmpdefer(SB), NOSPLIT, $0-8
 	MOVL	fv+0(FP), DX	// fn
 	MOVL	argp+4(FP), BX	// caller sp
 	LEAL	-4(BX), SP	// caller sp after CALL
+#ifdef GOBUILDMODE_shared
+	SUBL	$16, (SP)	// return to CALL again
+#else
 	SUBL	$5, (SP)	// return to CALL again
+#endif
 	MOVL	0(DX), BX
 	JMP	BX	// but first run the deferred function
 
@@ -685,11 +591,13 @@
 	MOVL	m_g0(BP), SI
 	MOVL	g(CX), DI
 	CMPL	SI, DI
-	JEQ	4(PC)
+	JEQ	noswitch
 	CALL	gosave<>(SB)
+	get_tls(CX)
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), SP
 
+noswitch:
 	// Now on a scheduling stack (a pthread-created stack).
 	SUBL	$32, SP
 	ANDL	$~15, SP	// alignment, perhaps unnecessary
@@ -711,23 +619,25 @@
 	MOVL	AX, ret+8(FP)
 	RET
 
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
+TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
 	LEAL	fn+0(FP), AX
 	MOVL	AX, 0(SP)
 	MOVL	frame+4(FP), AX
 	MOVL	AX, 4(SP)
 	MOVL	framesize+8(FP), AX
 	MOVL	AX, 8(SP)
+	MOVL	ctxt+12(FP), AX
+	MOVL	AX, 12(SP)
 	MOVL	$runtime·cgocallback_gofunc(SB), AX
 	CALL	AX
 	RET
 
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
 // See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-12
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
 	NO_LOCAL_POINTERS
 
 	// If g is nil, Go did not create the current thread.
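
cgocallback and cgocallback_gofunc both grow a ctxt word, which is pushed through to cgocallbackg alongside the saved registers. The Go-side signature implied by the C prototype in the comment (the parameter types are an inference, not quoted from the source):

	func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr)
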
@@ -795,17 +705,19 @@
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
 	//
-	// In the new goroutine, 0(SP) holds the saved oldm (DX) register.
-	// 4(SP) and 8(SP) are unused.
+	// In the new goroutine, 4(SP) holds the saved oldm (DX) register.
+	// 8(SP) is unused.
 	MOVL	m_curg(BP), SI
 	MOVL	SI, g(CX)
 	MOVL	(g_sched+gobuf_sp)(SI), DI // prepare stack as DI
 	MOVL	(g_sched+gobuf_pc)(SI), BP
 	MOVL	BP, -4(DI)
+	MOVL	ctxt+12(FP), CX
 	LEAL	-(4+12)(DI), SP
-	MOVL	DX, 0(SP)
+	MOVL	DX, 4(SP)
+	MOVL	CX, 0(SP)
 	CALL	runtime·cgocallbackg(SB)
-	MOVL	0(SP), DX
+	MOVL	4(SP), DX
 
 	// Restore g->sched (== m->curg->sched) from saved values.
 	get_tls(CX)
@@ -887,8 +799,8 @@
 TEXT runtime·setcallerpc(SB),NOSPLIT,$4-8
 	MOVL	argp+0(FP),AX		// addr of first arg
 	MOVL	pc+4(FP), BX
-	MOVL	-4(AX), CX
-	CMPL	CX, runtime·stackBarrierPC(SB)
+	MOVL	-4(AX), DX
+	CMPL	DX, runtime·stackBarrierPC(SB)
 	JEQ	setbar
 	MOVL	BX, -4(AX)		// set calling pc
 	RET
@@ -920,11 +832,11 @@
 	RET
 
 TEXT runtime·ldt0setup(SB),NOSPLIT,$16-0
-	// set up ldt 7 to point at tls0
+	// set up ldt 7 to point at m0.tls
 	// ldt 1 would be fine on Linux, but on OS X, 7 is as low as we can go.
 	// the entry number is just a hint.  setldt will set up GS with what it used.
 	MOVL	$7, 0(SP)
-	LEAL	runtime·tls0(SB), AX
+	LEAL	runtime·m0+m_tls(SB), AX
 	MOVL	AX, 4(SP)
 	MOVL	$32, 8(SP)	// sizeof(tls array)
 	CALL	runtime·setldt(SB)
@@ -956,36 +868,39 @@
 // hash function using AES hardware instructions
 TEXT runtime·aeshash(SB),NOSPLIT,$0-16
 	MOVL	p+0(FP), AX	// ptr to data
-	MOVL	s+8(FP), CX	// size
+	MOVL	s+8(FP), BX	// size
 	LEAL	ret+12(FP), DX
 	JMP	runtime·aeshashbody(SB)
 
 TEXT runtime·aeshashstr(SB),NOSPLIT,$0-12
 	MOVL	p+0(FP), AX	// ptr to string object
-	MOVL	4(AX), CX	// length of string
+	MOVL	4(AX), BX	// length of string
 	MOVL	(AX), AX	// string data
 	LEAL	ret+8(FP), DX
 	JMP	runtime·aeshashbody(SB)
 
 // AX: data
-// CX: length
+// BX: length
 // DX: address to put return value
 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
-	MOVL	h+4(FP), X6	// seed to low 64 bits of xmm6
-	PINSRD	$2, CX, X6	// size to high 64 bits of xmm6
-	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
-	MOVO	runtime·aeskeysched(SB), X7
-	CMPL	CX, $16
+	MOVL	h+4(FP), X0	            // 32 bits of per-table hash seed
+	PINSRW	$4, BX, X0	            // 16 bits of length
+	PSHUFHW	$0, X0, X0	            // replace size with its low 2 bytes repeated 4 times
+	MOVO	X0, X1                      // save unscrambled seed
+	PXOR	runtime·aeskeysched(SB), X0 // xor in per-process seed
+	AESENC	X0, X0                      // scramble seed
+
+	CMPL	BX, $16
 	JB	aes0to15
 	JE	aes16
-	CMPL	CX, $32
+	CMPL	BX, $32
 	JBE	aes17to32
-	CMPL	CX, $64
+	CMPL	BX, $64
 	JBE	aes33to64
 	JMP	aes65plus
 	
 aes0to15:
-	TESTL	CX, CX
+	TESTL	BX, BX
 	JE	aes0
 
 	ADDL	$16, AX
@@ -994,140 +909,158 @@
 
 	// 16 bytes loaded at this address won't cross
 	// a page boundary, so we can load it directly.
-	MOVOU	-16(AX), X0
-	ADDL	CX, CX
-	PAND	masks<>(SB)(CX*8), X0
+	MOVOU	-16(AX), X1
+	ADDL	BX, BX
+	PAND	masks<>(SB)(BX*8), X1
 
-	// scramble 3 times
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVL	X0, (DX)
+final1:	
+	AESENC	X0, X1  // scramble input, xor in seed
+	AESENC	X1, X1  // scramble combo 2 times
+	AESENC	X1, X1
+	MOVL	X1, (DX)
 	RET
 
 endofpage:
-	// address ends in 1111xxxx.  Might be up against
+	// address ends in 1111xxxx. Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
-	MOVOU	-32(AX)(CX*1), X0
-	ADDL	CX, CX
-	PSHUFB	shifts<>(SB)(CX*8), X0
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVL	X0, (DX)
-	RET
+	MOVOU	-32(AX)(BX*1), X1
+	ADDL	BX, BX
+	PSHUFB	shifts<>(SB)(BX*8), X1
+	JMP	final1
 
 aes0:
-	// return input seed
-	MOVL	h+4(FP), AX
-	MOVL	AX, (DX)
+	// Return scrambled input seed
+	AESENC	X0, X0
+	MOVL	X0, (DX)
 	RET
 
 aes16:
-	MOVOU	(AX), X0
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVL	X0, (DX)
-	RET
-
+	MOVOU	(AX), X1
+	JMP	final1
 
 aes17to32:
+	// make second starting seed
+	PXOR	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X1
+	
 	// load data to be hashed
-	MOVOU	(AX), X0
-	MOVOU	-16(AX)(CX*1), X1
+	MOVOU	(AX), X2
+	MOVOU	-16(AX)(BX*1), X3
 
 	// scramble 3 times
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X0
-	AESENC	X7, X1
+	AESENC	X0, X2
+	AESENC	X1, X3
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X2, X2
+	AESENC	X3, X3
 
 	// combine results
-	PXOR	X1, X0
-	MOVL	X0, (DX)
+	PXOR	X3, X2
+	MOVL	X2, (DX)
 	RET
 
 aes33to64:
-	MOVOU	(AX), X0
-	MOVOU	16(AX), X1
-	MOVOU	-32(AX)(CX*1), X2
-	MOVOU	-16(AX)(CX*1), X3
+	// make 3 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
 	
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	runtime·aeskeysched+32(SB), X2
-	AESENC	runtime·aeskeysched+48(SB), X3
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
+	MOVOU	(AX), X4
+	MOVOU	16(AX), X5
+	MOVOU	-32(AX)(BX*1), X6
+	MOVOU	-16(AX)(BX*1), X7
+	
+	AESENC	X0, X4
+	AESENC	X1, X5
+	AESENC	X2, X6
+	AESENC	X3, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
 
-	PXOR	X2, X0
-	PXOR	X3, X1
-	PXOR	X1, X0
-	MOVL	X0, (DX)
+	PXOR	X6, X4
+	PXOR	X7, X5
+	PXOR	X5, X4
+	MOVL	X4, (DX)
 	RET
 
 aes65plus:
+	// make 3 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
+	
 	// start with last (possibly overlapping) block
-	MOVOU	-64(AX)(CX*1), X0
-	MOVOU	-48(AX)(CX*1), X1
-	MOVOU	-32(AX)(CX*1), X2
-	MOVOU	-16(AX)(CX*1), X3
+	MOVOU	-64(AX)(BX*1), X4
+	MOVOU	-48(AX)(BX*1), X5
+	MOVOU	-32(AX)(BX*1), X6
+	MOVOU	-16(AX)(BX*1), X7
 
 	// scramble state once
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	runtime·aeskeysched+32(SB), X2
-	AESENC	runtime·aeskeysched+48(SB), X3
+	AESENC	X0, X4
+	AESENC	X1, X5
+	AESENC	X2, X6
+	AESENC	X3, X7
 
 	// compute number of remaining 64-byte blocks
-	DECL	CX
-	SHRL	$6, CX
+	DECL	BX
+	SHRL	$6, BX
 	
 aesloop:
 	// scramble state, xor in a block
-	MOVOU	(AX), X4
-	MOVOU	16(AX), X5
-	AESENC	X4, X0
-	AESENC	X5, X1
-	MOVOU	32(AX), X4
-	MOVOU	48(AX), X5
-	AESENC	X4, X2
-	AESENC	X5, X3
+	MOVOU	(AX), X0
+	MOVOU	16(AX), X1
+	MOVOU	32(AX), X2
+	MOVOU	48(AX), X3
+	AESENC	X0, X4
+	AESENC	X1, X5
+	AESENC	X2, X6
+	AESENC	X3, X7
 
 	// scramble state
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
 
 	ADDL	$64, AX
-	DECL	CX
+	DECL	BX
 	JNE	aesloop
 
 	// 2 more scrambles to finish
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
 
-	PXOR	X2, X0
-	PXOR	X3, X1
-	PXOR	X1, X0
-	MOVL	X0, (DX)
+	PXOR	X6, X4
+	PXOR	X7, X5
+	PXOR	X5, X4
+	MOVL	X4, (DX)
 	RET
 
 TEXT runtime·aeshash32(SB),NOSPLIT,$0-12
@@ -1233,7 +1166,7 @@
 
 GLOBL masks<>(SB),RODATA,$256
 
-// these are arguments to pshufb.  They move data down from
+// these are arguments to pshufb. They move data down from
 // the high bytes of the register to the low bytes of the register.
 // index is how many bytes to move.
 DATA shifts<>+0x00(SB)/4, $0x00000000
@@ -1318,12 +1251,27 @@
 
 GLOBL shifts<>(SB),RODATA,$256
 
-TEXT runtime·memeq(SB),NOSPLIT,$0-13
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
+	MOVL	$masks<>(SB), AX
+	MOVL	$shifts<>(SB), BX
+	ORL	BX, AX
+	TESTL	$15, AX
+	SETEQ	ret+0(FP)
+	RET
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-13
 	MOVL	a+0(FP), SI
 	MOVL	b+4(FP), DI
+	CMPL	SI, DI
+	JEQ	eq
 	MOVL	size+8(FP), BX
 	LEAL	ret+12(FP), AX
 	JMP	runtime·memeqbody(SB)
+eq:
+	MOVB    $1, ret+12(FP)
+	RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
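
The runtime·memeq body is renamed to runtime·memequal and absorbs the pointer-equality fast path that this change deletes from alg.go. The removed Go wrapper read, and the assembly still effectively behaves as:

	func memequal(p, q unsafe.Pointer, size uintptr) bool {
		if p == q {
			return true // identical addresses: equal without touching memory
		}
		return memeqbody(p, q, size) // stands in for the JMP to runtime·memeqbody
	}
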
@@ -1443,7 +1391,7 @@
 	MOVL	(SI), SI
 	JMP	si_finish
 si_high:
-	// address ends in 111111xx.  Load up to bytes we want, move to correct position.
+	// address ends in 111111xx. Load up to bytes we want, move to correct position.
 	MOVL	-4(SI)(BX*1), SI
 	SHRL	CX, SI
 si_finish:
@@ -1518,7 +1466,8 @@
 TEXT runtime·cmpbody(SB),NOSPLIT,$0-0
 	MOVL	DX, BP
 	SUBL	BX, DX // DX = blen-alen
-	CMOVLGT	BX, BP // BP = min(alen, blen)
+	JLE	2(PC)
+	MOVL	BX, BP // BP = min(alen, blen)
 	CMPL	SI, DI
 	JEQ	allsame
 	CMPL	BP, $4
@@ -1637,7 +1586,8 @@
 	ADDL	DX, DX
 	MOVL	DX, BX
 	XORL	$0x88888eef, DX
-	CMOVLMI	BX, DX
+	JPL	2(PC)
+	MOVL	BX, DX
 	MOVL	DX, m_fastrand(AX)
 	MOVL	DX, ret+0(FP)
 	RET
@@ -1664,23 +1614,26 @@
 	// traceback from goexit1 must hit code range of goexit
 	BYTE	$0x90	// NOP
 
+// Prefetching doesn't seem to help.
 TEXT runtime·prefetcht0(SB),NOSPLIT,$0-4
-	MOVL	addr+0(FP), AX
-	PREFETCHT0	(AX)
 	RET
 
 TEXT runtime·prefetcht1(SB),NOSPLIT,$0-4
-	MOVL	addr+0(FP), AX
-	PREFETCHT1	(AX)
 	RET
 
-
 TEXT runtime·prefetcht2(SB),NOSPLIT,$0-4
-	MOVL	addr+0(FP), AX
-	PREFETCHT2	(AX)
 	RET
 
 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-4
-	MOVL	addr+0(FP), AX
-	PREFETCHNTA	(AX)
 	RET
+
+// Add a module's moduledata to the linked list of moduledata objects. This
+// is called from .init_array by a function generated in the linker and so
+// follows the platform ABI wrt register preservation -- it only touches AX,
+// CX (implicitly) and DX, but it does not follow the ABI wrt arguments:
+// instead the pointer to the moduledata is passed in AX.
+TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
+       MOVL    runtime·lastmoduledatap(SB), DX
+       MOVL    AX, moduledata_next(DX)
+       MOVL    AX, runtime·lastmoduledatap(SB)
+       RET
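
Functionally, addmoduledata is a linked-list append; a direct Go rendering of its three instructions (moduledata and lastmoduledatap are the runtime's existing module bookkeeping):

	func addmoduledata(md *moduledata) {
		lastmoduledatap.next = md
		lastmoduledatap = md
	}
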
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index 3b4ca4d..6103d54 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -28,6 +28,7 @@
 	// find out information about the processor we're on
 	MOVQ	$0, AX
 	CPUID
+	MOVQ	AX, SI
 	CMPQ	AX, $0
 	JE	nocpuinfo
 
@@ -43,10 +44,42 @@
 	MOVB	$1, runtime·lfenceBeforeRdtsc(SB)
 notintel:
 
+	// Load EAX=1 cpuid flags
 	MOVQ	$1, AX
 	CPUID
 	MOVL	CX, runtime·cpuid_ecx(SB)
 	MOVL	DX, runtime·cpuid_edx(SB)
+
+	// Load EAX=7/ECX=0 cpuid flags
+	CMPQ	SI, $7
+	JLT	no7
+	MOVL	$7, AX
+	MOVL	$0, CX
+	CPUID
+	MOVL	BX, runtime·cpuid_ebx7(SB)
+no7:
+	// Detect AVX and AVX2 as per section 14.7.1, "Detection of AVX2", of [1]
+	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+	// http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-manual-325462.pdf
+	MOVL	runtime·cpuid_ecx(SB), CX
+	ANDL    $0x18000000, CX // check for OSXSAVE and AVX bits
+	CMPL    CX, $0x18000000
+	JNE     noavx
+	MOVL    $0, CX
+	// For XGETBV, OSXSAVE bit is required and sufficient
+	XGETBV
+	ANDL    $6, AX
+	CMPL    AX, $6 // Check for OS support of YMM registers
+	JNE     noavx
+	MOVB    $1, runtime·support_avx(SB)
+	TESTL   $(1<<5), runtime·cpuid_ebx7(SB) // check for AVX2 bit
+	JEQ     noavx2
+	MOVB    $1, runtime·support_avx2(SB)
+	JMP     nocpuinfo
+noavx:
+	MOVB    $0, runtime·support_avx(SB)
+noavx2:
+	MOVB    $0, runtime·support_avx2(SB)
 nocpuinfo:	
 	
 	// if there is an _cgo_init, call it.
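
The AVX probe follows the sequence Intel documents: require both OSXSAVE and AVX in CPUID.1:ECX, confirm via XGETBV that the OS actually saves XMM and YMM state, and only then believe the AVX2 bit from leaf 7. The same checks in Go form, for illustration only (the runtime must do this in assembly, before any Go code runs):

	func detectAVX(ecx, xcr0lo, ebx7 uint32) (avx, avx2 bool) {
		const osxsaveAndAVX = 1<<27 | 1<<28 // OSXSAVE and AVX bits in CPUID.1:ECX
		if ecx&osxsaveAndAVX != osxsaveAndAVX {
			return false, false
		}
		if xcr0lo&6 != 6 { // XCR0 bits 1-2: OS saves XMM and YMM registers
			return false, false
		}
		return true, ebx7&(1<<5) != 0 // CPUID.7:EBX bit 5 is AVX2
	}
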
@@ -65,23 +98,26 @@
 	MOVQ	AX, g_stackguard0(CX)
 	MOVQ	AX, g_stackguard1(CX)
 
-	CMPL	runtime·iswindows(SB), $0
-	JEQ ok
+#ifndef GOOS_windows
+	JMP ok
+#endif
 needtls:
+#ifdef GOOS_plan9
 	// skip TLS setup on Plan 9
-	CMPL	runtime·isplan9(SB), $1
-	JEQ ok
+	JMP ok
+#endif
+#ifdef GOOS_solaris
 	// skip TLS setup on Solaris
-	CMPL	runtime·issolaris(SB), $1
-	JEQ ok
+	JMP ok
+#endif
 
-	LEAQ	runtime·tls0(SB), DI
+	LEAQ	runtime·m0+m_tls(SB), DI
 	CALL	runtime·settls(SB)
 
 	// store through it, to make sure it works
 	get_tls(BX)
 	MOVQ	$0x123, g(BX)
-	MOVQ	runtime·tls0(SB), AX
+	MOVQ	runtime·m0+m_tls(SB), AX
 	CMPQ	AX, $0x123
 	JEQ 2(PC)
 	MOVL	AX, 0	// abort
@@ -174,7 +210,7 @@
 
 // func mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB), NOSPLIT, $0-8
 	MOVQ	fn+0(FP), DI
@@ -208,7 +244,7 @@
 	RET
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
@@ -239,7 +275,7 @@
 	CALL	AX
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVQ	$runtime·systemstack_switch(SB), SI
 	MOVQ	SI, (g_sched+gobuf_pc)(AX)
@@ -346,7 +382,13 @@
 	MOVQ	(g_stkbar+slice_array)(CX), DX
 	MOVQ	g_stkbarPos(CX), BX
 	IMULQ	$stkbar__size, BX	// Too big for SIB.
+	MOVQ	stkbar_savedLRPtr(DX)(BX*1), R8
 	MOVQ	stkbar_savedLRVal(DX)(BX*1), BX
+	// Assert that we're popping the right saved LR.
+	ADDQ	$8, R8
+	CMPQ	R8, SP
+	JEQ	2(PC)
+	MOVL	$0, 0
 	// Record that this stack barrier was hit.
 	ADDQ	$1, g_stkbarPos(CX)
 	// Jump to the original return PC.
@@ -461,118 +503,6 @@
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVQ	ptr+0(FP), BX
-	MOVL	old+8(FP), AX
-	MOVL	new+12(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVQ	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-	
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	JMP	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	JMP	runtime·atomicstore64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	MOVQ	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-	MOVQ	ptr+0(FP), BX
-	MOVL	delta+8(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVQ	ptr+0(FP), BX
-	MOVQ	delta+8(FP), AX
-	MOVQ	AX, CX
-	LOCK
-	XADDQ	AX, 0(BX)
-	ADDQ	CX, AX
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadduintptr(SB), NOSPLIT, $0-24
-	JMP	runtime·xadd64(SB)
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-	MOVQ	ptr+0(FP), BX
-	MOVL	new+8(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVQ	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
-	MOVQ	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	JMP	runtime·xchg64(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -581,39 +511,6 @@
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	MOVQ	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVQ	ptr+0(FP), BX
-	MOVL	val+8(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVQ	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
-	MOVQ	ptr+0(FP), AX
-	MOVB	val+8(FP), BX
-	LOCK
-	ORB	BX, (AX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
-	MOVQ	ptr+0(FP), AX
-	MOVB	val+8(FP), BX
-	LOCK
-	ANDB	BX, (AX)
-	RET
 
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
@@ -629,6 +526,7 @@
 	MOVQ	fv+0(FP), DX	// fn
 	MOVQ	argp+8(FP), BX	// caller sp
 	LEAQ	-8(BX), SP	// caller sp after CALL
+	MOVQ	-8(SP), BP	// restore BP as if deferreturn returned (harmless if framepointers not in use)
 	SUBQ	$5, (SP)	// return to CALL again
 	MOVQ	0(DX), BX
 	JMP	BX	// but first run the deferred function
@@ -661,6 +559,8 @@
 	// come in on the m->g0 stack already.
 	get_tls(CX)
 	MOVQ	g(CX), R8
+	CMPQ	R8, $0
+	JEQ	nosave
 	MOVQ	g_m(R8), R8
 	MOVQ	m_g0(R8), SI
 	MOVQ	g(CX), DI
@@ -670,11 +570,11 @@
 	CMPQ	SI, DI
 	JEQ	nosave
 	
+	// Switch to system stack.
 	MOVQ	m_g0(R8), SI
 	CALL	gosave<>(SB)
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), SP
-nosave:
 
 	// Now on a scheduling stack (a pthread-created stack).
 	// Make sure we have enough room for 4 stack-backed fast-call
@@ -700,23 +600,48 @@
 	MOVL	AX, ret+16(FP)
 	RET
 
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+nosave:
+	// Running on a system stack, perhaps even without a g.
+	// Having no g can happen during thread creation or thread teardown
+	// (see needm/dropm on Solaris, for example).
+	// This code is like the above sequence but without saving/restoring g
+	// and without worrying about the stack moving out from under us
+	// (because we're on a system stack, not a goroutine stack).
+	// The above code could be used directly if already on a system stack,
+	// but then the only path through this code would be a rare case on Solaris.
+	// Using this code for all "already on system stack" calls exercises it more,
+	// which should help keep it correct.
+	SUBQ	$64, SP
+	ANDQ	$~15, SP
+	MOVQ	$0, 48(SP)		// where above code stores g, in case someone looks during debugging
+	MOVQ	DX, 40(SP)	// save original stack pointer
+	MOVQ	BX, DI		// DI = first argument in AMD64 ABI
+	MOVQ	BX, CX		// CX = first argument in Win64
+	CALL	AX
+	MOVQ	40(SP), SI	// restore original stack pointer
+	MOVQ	SI, SP
+	MOVL	AX, ret+16(FP)
+	RET
+
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
+TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
 	LEAQ	fn+0(FP), AX
 	MOVQ	AX, 0(SP)
 	MOVQ	frame+8(FP), AX
 	MOVQ	AX, 8(SP)
 	MOVQ	framesize+16(FP), AX
 	MOVQ	AX, 16(SP)
+	MOVQ	ctxt+24(FP), AX
+	MOVQ	AX, 24(SP)
 	MOVQ	$runtime·cgocallback_gofunc(SB), AX
 	CALL	AX
 	RET
 
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
 // See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-24
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
 	NO_LOCAL_POINTERS
 
 	// If g is nil, Go did not create the current thread.
@@ -784,7 +709,7 @@
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
 	//
-	// In the new goroutine, 0(SP) holds the saved R8.
+	// In the new goroutine, 8(SP) holds the saved R8.
 	MOVQ	m_curg(BX), SI
 	MOVQ	SI, g(CX)
 	MOVQ	(g_sched+gobuf_sp)(SI), DI  // prepare stack as DI
@@ -792,16 +717,18 @@
 	MOVQ	BX, -8(DI)
 	// Compute the size of the frame, including return PC and, if
 	// GOEXPERIMENT=framepointer, the saved base pointer
+	MOVQ	ctxt+24(FP), BX
 	LEAQ	fv+0(FP), AX
 	SUBQ	SP, AX
 	SUBQ	AX, DI
 	MOVQ	DI, SP
 
-	MOVQ	R8, 0(SP)
+	MOVQ	R8, 8(SP)
+	MOVQ	BX, 0(SP)
 	CALL	runtime·cgocallbackg(SB)
-	MOVQ	0(SP), R8
+	MOVQ	8(SP), R8
 
-	// Compute the size of the frame again.  FP and SP have
+	// Compute the size of the frame again. FP and SP have
 	// completely different values here than they did above,
 	// but only their difference matters.
 	LEAQ	fv+0(FP), AX
@@ -907,10 +834,10 @@
 TEXT runtime·cputicks(SB),NOSPLIT,$0-0
 	CMPB	runtime·lfenceBeforeRdtsc(SB), $1
 	JNE	mfence
-	BYTE	$0x0f; BYTE $0xae; BYTE $0xe8 // LFENCE
+	LFENCE
 	JMP	done
 mfence:
-	BYTE	$0x0f; BYTE $0xae; BYTE $0xf0 // MFENCE
+	MFENCE
 done:
 	RDTSC
 	SHLQ	$32, DX
@@ -953,10 +880,14 @@
 // CX: length
 // DX: address to put return value
 TEXT runtime·aeshashbody(SB),NOSPLIT,$0-0
-	MOVQ	h+8(FP), X6	// seed to low 64 bits of xmm6
-	PINSRQ	$1, CX, X6	// size to high 64 bits of xmm6
-	PSHUFHW	$0, X6, X6	// replace size with its low 2 bytes repeated 4 times
-	MOVO	runtime·aeskeysched(SB), X7
+	// Fill an SSE register with our seeds.
+	MOVQ	h+8(FP), X0			// 64 bits of per-table hash seed
+	PINSRW	$4, CX, X0			// 16 bits of length
+	PSHUFHW $0, X0, X0			// repeat length 4 times total
+	MOVO	X0, X1				// save unscrambled seed
+	PXOR	runtime·aeskeysched(SB), X0	// xor in per-process seed
+	AESENC	X0, X0				// scramble seed
+
 	CMPQ	CX, $16
 	JB	aes0to15
 	JE	aes16
@@ -978,218 +909,302 @@
 
 	// 16 bytes loaded at this address won't cross
 	// a page boundary, so we can load it directly.
-	MOVOU	-16(AX), X0
+	MOVOU	-16(AX), X1
 	ADDQ	CX, CX
 	MOVQ	$masks<>(SB), AX
-	PAND	(AX)(CX*8), X0
-
-	// scramble 3 times
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVQ	X0, (DX)
+	PAND	(AX)(CX*8), X1
+final1:
+	PXOR	X0, X1	// xor data with seed
+	AESENC	X1, X1	// scramble combo 3 times
+	AESENC	X1, X1
+	AESENC	X1, X1
+	MOVQ	X1, (DX)
 	RET
 
 endofpage:
-	// address ends in 1111xxxx.  Might be up against
+	// address ends in 1111xxxx. Might be up against
 	// a page boundary, so load ending at last byte.
 	// Then shift bytes down using pshufb.
-	MOVOU	-32(AX)(CX*1), X0
+	MOVOU	-32(AX)(CX*1), X1
 	ADDQ	CX, CX
 	MOVQ	$shifts<>(SB), AX
-	PSHUFB	(AX)(CX*8), X0
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVQ	X0, (DX)
-	RET
+	PSHUFB	(AX)(CX*8), X1
+	JMP	final1
 
 aes0:
-	// return input seed
-	MOVQ	h+8(FP), AX
-	MOVQ	AX, (DX)
+	// Return scrambled input seed
+	AESENC	X0, X0
+	MOVQ	X0, (DX)
 	RET
 
 aes16:
-	MOVOU	(AX), X0
-	AESENC	X6, X0
-	AESENC	X7, X0
-	AESENC	X7, X0
-	MOVQ	X0, (DX)
-	RET
+	MOVOU	(AX), X1
+	JMP	final1
 
 aes17to32:
+	// make second starting seed
+	PXOR	runtime·aeskeysched+16(SB), X1
+	AESENC	X1, X1
+	
 	// load data to be hashed
-	MOVOU	(AX), X0
-	MOVOU	-16(AX)(CX*1), X1
+	MOVOU	(AX), X2
+	MOVOU	-16(AX)(CX*1), X3
+
+	// xor with seed
+	PXOR	X0, X2
+	PXOR	X1, X3
 
 	// scramble 3 times
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X0
-	AESENC	X7, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X2, X2
+	AESENC	X3, X3
 
 	// combine results
-	PXOR	X1, X0
-	MOVQ	X0, (DX)
+	PXOR	X3, X2
+	MOVQ	X2, (DX)
 	RET
 
 aes33to64:
-	MOVOU	(AX), X0
-	MOVOU	16(AX), X1
-	MOVOU	-32(AX)(CX*1), X2
-	MOVOU	-16(AX)(CX*1), X3
+	// make 3 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
 	
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	runtime·aeskeysched+32(SB), X2
-	AESENC	runtime·aeskeysched+48(SB), X3
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
+	MOVOU	(AX), X4
+	MOVOU	16(AX), X5
+	MOVOU	-32(AX)(CX*1), X6
+	MOVOU	-16(AX)(CX*1), X7
 
-	PXOR	X2, X0
-	PXOR	X3, X1
-	PXOR	X1, X0
-	MOVQ	X0, (DX)
+	PXOR	X0, X4
+	PXOR	X1, X5
+	PXOR	X2, X6
+	PXOR	X3, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+
+	PXOR	X6, X4
+	PXOR	X7, X5
+	PXOR	X5, X4
+	MOVQ	X4, (DX)
 	RET
 
 aes65to128:
-	MOVOU	(AX), X0
-	MOVOU	16(AX), X1
-	MOVOU	32(AX), X2
-	MOVOU	48(AX), X3
-	MOVOU	-64(AX)(CX*1), X4
-	MOVOU	-48(AX)(CX*1), X5
-	MOVOU	-32(AX)(CX*1), X8
-	MOVOU	-16(AX)(CX*1), X9
-	
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	runtime·aeskeysched+32(SB), X2
-	AESENC	runtime·aeskeysched+48(SB), X3
-	AESENC	runtime·aeskeysched+64(SB), X4
-	AESENC	runtime·aeskeysched+80(SB), X5
-	AESENC	runtime·aeskeysched+96(SB), X8
-	AESENC	runtime·aeskeysched+112(SB), X9
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X4
-	AESENC	X7, X5
-	AESENC	X7, X8
-	AESENC	X7, X9
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X4
-	AESENC	X7, X5
-	AESENC	X7, X8
-	AESENC	X7, X9
+	// make 7 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	MOVO	X1, X4
+	MOVO	X1, X5
+	MOVO	X1, X6
+	MOVO	X1, X7
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	PXOR	runtime·aeskeysched+64(SB), X4
+	PXOR	runtime·aeskeysched+80(SB), X5
+	PXOR	runtime·aeskeysched+96(SB), X6
+	PXOR	runtime·aeskeysched+112(SB), X7
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
 
-	PXOR	X4, X0
-	PXOR	X5, X1
-	PXOR	X8, X2
-	PXOR	X9, X3
-	PXOR	X2, X0
-	PXOR	X3, X1
-	PXOR	X1, X0
-	MOVQ	X0, (DX)
+	// load data
+	MOVOU	(AX), X8
+	MOVOU	16(AX), X9
+	MOVOU	32(AX), X10
+	MOVOU	48(AX), X11
+	MOVOU	-64(AX)(CX*1), X12
+	MOVOU	-48(AX)(CX*1), X13
+	MOVOU	-32(AX)(CX*1), X14
+	MOVOU	-16(AX)(CX*1), X15
+
+	// xor with seed
+	PXOR	X0, X8
+	PXOR	X1, X9
+	PXOR	X2, X10
+	PXOR	X3, X11
+	PXOR	X4, X12
+	PXOR	X5, X13
+	PXOR	X6, X14
+	PXOR	X7, X15
+
+	// scramble 3 times
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+
+	// combine results
+	PXOR	X12, X8
+	PXOR	X13, X9
+	PXOR	X14, X10
+	PXOR	X15, X11
+	PXOR	X10, X8
+	PXOR	X11, X9
+	PXOR	X9, X8
+	MOVQ	X8, (DX)
 	RET
 
 aes129plus:
+	// make 7 more starting seeds
+	MOVO	X1, X2
+	MOVO	X1, X3
+	MOVO	X1, X4
+	MOVO	X1, X5
+	MOVO	X1, X6
+	MOVO	X1, X7
+	PXOR	runtime·aeskeysched+16(SB), X1
+	PXOR	runtime·aeskeysched+32(SB), X2
+	PXOR	runtime·aeskeysched+48(SB), X3
+	PXOR	runtime·aeskeysched+64(SB), X4
+	PXOR	runtime·aeskeysched+80(SB), X5
+	PXOR	runtime·aeskeysched+96(SB), X6
+	PXOR	runtime·aeskeysched+112(SB), X7
+	AESENC	X1, X1
+	AESENC	X2, X2
+	AESENC	X3, X3
+	AESENC	X4, X4
+	AESENC	X5, X5
+	AESENC	X6, X6
+	AESENC	X7, X7
+	
 	// start with last (possibly overlapping) block
-	MOVOU	-128(AX)(CX*1), X0
-	MOVOU	-112(AX)(CX*1), X1
-	MOVOU	-96(AX)(CX*1), X2
-	MOVOU	-80(AX)(CX*1), X3
-	MOVOU	-64(AX)(CX*1), X4
-	MOVOU	-48(AX)(CX*1), X5
-	MOVOU	-32(AX)(CX*1), X8
-	MOVOU	-16(AX)(CX*1), X9
+	MOVOU	-128(AX)(CX*1), X8
+	MOVOU	-112(AX)(CX*1), X9
+	MOVOU	-96(AX)(CX*1), X10
+	MOVOU	-80(AX)(CX*1), X11
+	MOVOU	-64(AX)(CX*1), X12
+	MOVOU	-48(AX)(CX*1), X13
+	MOVOU	-32(AX)(CX*1), X14
+	MOVOU	-16(AX)(CX*1), X15
 
-	// scramble state once
-	AESENC	X6, X0
-	AESENC	runtime·aeskeysched+16(SB), X1
-	AESENC	runtime·aeskeysched+32(SB), X2
-	AESENC	runtime·aeskeysched+48(SB), X3
-	AESENC	runtime·aeskeysched+64(SB), X4
-	AESENC	runtime·aeskeysched+80(SB), X5
-	AESENC	runtime·aeskeysched+96(SB), X8
-	AESENC	runtime·aeskeysched+112(SB), X9
-
+	// xor in seed
+	PXOR	X0, X8
+	PXOR	X1, X9
+	PXOR	X2, X10
+	PXOR	X3, X11
+	PXOR	X4, X12
+	PXOR	X5, X13
+	PXOR	X6, X14
+	PXOR	X7, X15
+	
 	// compute number of remaining 128-byte blocks
 	DECQ	CX
 	SHRQ	$7, CX
 	
 aesloop:
-	// scramble state, xor in a block
-	MOVOU	(AX), X10
-	MOVOU	16(AX), X11
-	MOVOU	32(AX), X12
-	MOVOU	48(AX), X13
-	AESENC	X10, X0
-	AESENC	X11, X1
-	AESENC	X12, X2
-	AESENC	X13, X3
-	MOVOU	64(AX), X10
-	MOVOU	80(AX), X11
-	MOVOU	96(AX), X12
-	MOVOU	112(AX), X13
-	AESENC	X10, X4
-	AESENC	X11, X5
-	AESENC	X12, X8
-	AESENC	X13, X9
-
 	// scramble state
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X4
-	AESENC	X7, X5
-	AESENC	X7, X8
-	AESENC	X7, X9
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+
+	// scramble state, xor in a block
+	MOVOU	(AX), X0
+	MOVOU	16(AX), X1
+	MOVOU	32(AX), X2
+	MOVOU	48(AX), X3
+	AESENC	X0, X8
+	AESENC	X1, X9
+	AESENC	X2, X10
+	AESENC	X3, X11
+	MOVOU	64(AX), X4
+	MOVOU	80(AX), X5
+	MOVOU	96(AX), X6
+	MOVOU	112(AX), X7
+	AESENC	X4, X12
+	AESENC	X5, X13
+	AESENC	X6, X14
+	AESENC	X7, X15
 
 	ADDQ	$128, AX
 	DECQ	CX
 	JNE	aesloop
 
-	// 2 more scrambles to finish
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X4
-	AESENC	X7, X5
-	AESENC	X7, X8
-	AESENC	X7, X9
-	AESENC	X7, X0
-	AESENC	X7, X1
-	AESENC	X7, X2
-	AESENC	X7, X3
-	AESENC	X7, X4
-	AESENC	X7, X5
-	AESENC	X7, X8
-	AESENC	X7, X9
+	// 3 more scrambles to finish
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
+	AESENC	X8, X8
+	AESENC	X9, X9
+	AESENC	X10, X10
+	AESENC	X11, X11
+	AESENC	X12, X12
+	AESENC	X13, X13
+	AESENC	X14, X14
+	AESENC	X15, X15
 
-	PXOR	X4, X0
-	PXOR	X5, X1
-	PXOR	X8, X2
-	PXOR	X9, X3
-	PXOR	X2, X0
-	PXOR	X3, X1
-	PXOR	X1, X0
-	MOVQ	X0, (DX)
+	PXOR	X12, X8
+	PXOR	X13, X9
+	PXOR	X14, X10
+	PXOR	X15, X11
+	PXOR	X10, X8
+	PXOR	X11, X9
+	PXOR	X9, X8
+	MOVQ	X8, (DX)
 	RET
 	
 TEXT runtime·aeshash32(SB),NOSPLIT,$0-24
@@ -1247,7 +1262,16 @@
 DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
 GLOBL masks<>(SB),RODATA,$256
 
-// these are arguments to pshufb.  They move data down from
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	// check that masks<>(SB) and shifts<>(SB) are 16-byte aligned
+	MOVQ	$masks<>(SB), AX
+	MOVQ	$shifts<>(SB), BX
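+	// OR the addresses together; the low 4 bits are zero only if both are 16-byte aligned.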
+	ORQ	BX, AX
+	TESTQ	$15, AX
+	SETEQ	ret+0(FP)
+	RET
+
+// these are arguments to pshufb. They move data down from
 // the high bytes of the register to the low bytes of the register.
 // index is how many bytes to move.
 DATA shifts<>+0x00(SB)/8, $0x0000000000000000
@@ -1284,12 +1308,18 @@
 DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
 GLOBL shifts<>(SB),RODATA,$256
 
-TEXT runtime·memeq(SB),NOSPLIT,$0-25
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
 	MOVQ	a+0(FP), SI
 	MOVQ	b+8(FP), DI
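+	// If the pointers are equal, the memory is trivially equal.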
+	CMPQ	SI, DI
+	JEQ	eq
 	MOVQ	size+16(FP), BX
 	LEAQ	ret+24(FP), AX
 	JMP	runtime·memeqbody(SB)
+eq:
+	MOVB	$1, ret+24(FP)
+	RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
@@ -1328,6 +1358,10 @@
 TEXT runtime·memeqbody(SB),NOSPLIT,$0-0
 	CMPQ	BX, $8
 	JB	small
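+	// Sizes < 64 use the 8-byte loop below; larger sizes use the
+	// 64-byte SSE loop, or the AVX2 loop when the CPU supports it.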
+	CMPQ	BX, $64
+	JB	bigloop
+	CMPB	runtime·support_avx2(SB), $1
+	JE	hugeloop_avx2
 	
 	// 64 bytes at a time using xmm registers
 hugeloop:
@@ -1357,6 +1391,30 @@
 	MOVB	$0, (AX)
 	RET
 
+	// 64 bytes at a time using ymm registers
+hugeloop_avx2:
+	CMPQ	BX, $64
+	JB	bigloop_avx2
+	VMOVDQU	(SI), Y0
+	VMOVDQU	(DI), Y1
+	VMOVDQU	32(SI), Y2
+	VMOVDQU	32(DI), Y3
+	VPCMPEQB	Y1, Y0, Y4
+	VPCMPEQB	Y2, Y3, Y5
+	VPAND	Y4, Y5, Y6
+	VPMOVMSKB Y6, DX
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, BX
+	CMPL	DX, $0xffffffff
+	JEQ	hugeloop_avx2
+	VZEROUPPER
+	MOVB	$0, (AX)
+	RET
+
+bigloop_avx2:
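+	// Fewer than 64 bytes remain; leave AVX2 mode and fall into the 8-byte loop.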
+	VZEROUPPER
+
 	// 8 bytes at a time using 64-bit register
 bigloop:
 	CMPQ	BX, $8
@@ -1393,7 +1451,7 @@
 	MOVQ	(SI), SI
 	JMP	si_finish
 si_high:
-	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
 	MOVQ	-8(SI)(BX*1), SI
 	SHRQ	CX, SI
 si_finish:
@@ -1445,6 +1503,11 @@
 	CMPQ	R8, $8
 	JB	small
 
+	CMPQ	R8, $63
+	JBE	loop
+	CMPB	runtime·support_avx2(SB), $1
+	JEQ	big_loop_avx2
+	JMP	big_loop
 loop:
 	CMPQ	R8, $16
 	JBE	_0through16
@@ -1459,6 +1522,17 @@
 	SUBQ	$16, R8
 	JMP	loop
 	
+diff64:
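+	// diff64/diff48/diff32: a difference was found past the first 16-byte
+	// chunk; advance SI/DI to that chunk so diff16 can locate the exact byte.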
+	ADDQ	$48, SI
+	ADDQ	$48, DI
+	JMP	diff16
+diff48:
+	ADDQ	$32, SI
+	ADDQ	$32, DI
+	JMP	diff16
+diff32:
+	ADDQ	$16, SI
+	ADDQ	$16, DI
 	// AX = bit mask of differences
 diff16:
 	BSFQ	AX, BX	// index of first byte that differs
@@ -1545,6 +1619,285 @@
 	MOVQ	AX, (R9)
 	RET
 
+	// this works for >= 64 bytes of data.
+big_loop:
+	MOVOU	(SI), X0
+	MOVOU	(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff16
+
+	MOVOU	16(SI), X0
+	MOVOU	16(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff32
+
+	MOVOU	32(SI), X0
+	MOVOU	32(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff48
+
+	MOVOU	48(SI), X0
+	MOVOU	48(DI), X1
+	PCMPEQB X0, X1
+	PMOVMSKB X1, AX
+	XORQ	$0xffff, AX
+	JNE	diff64
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JBE	loop
+	JMP	big_loop
+
+	// Compare 64 bytes per loop iteration.
+	// Loop is unrolled and uses AVX2.
+big_loop_avx2:
+	VMOVDQU	(SI), Y2
+	VMOVDQU	(DI), Y3
+	VMOVDQU	32(SI), Y4
+	VMOVDQU	32(DI), Y5
+	VPCMPEQB Y2, Y3, Y0
+	VPMOVMSKB Y0, AX
+	XORL	$0xffffffff, AX
+	JNE	diff32_avx2
+	VPCMPEQB Y4, Y5, Y6
+	VPMOVMSKB Y6, AX
+	XORL	$0xffffffff, AX
+	JNE	diff64_avx2
+
+	ADDQ	$64, SI
+	ADDQ	$64, DI
+	SUBQ	$64, R8
+	CMPQ	R8, $64
+	JB	big_loop_avx2_exit
+	JMP	big_loop_avx2
+
+	// Avoid AVX->SSE transition penalty and search first 32 bytes of 64-byte chunk.
+diff32_avx2:
+	VZEROUPPER
+	JMP diff16
+
+	// Same as diff32_avx2, but for last 32 bytes.
+diff64_avx2:
+	VZEROUPPER
+	JMP diff48
+
+	// For a remainder of <64 bytes, jump to the normal loop.
+big_loop_avx2_exit:
+	VZEROUPPER
+	JMP loop
+
+
+// TODO: Also use this in bytes.Index
+TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
+	MOVQ s+0(FP), DI
+	// We want len in DX and AX, because PCMPESTRI implicitly consumes them
+	MOVQ s_len+8(FP), DX
+	MOVQ c+16(FP), BP
+	MOVQ c_len+24(FP), AX
+	CMPQ AX, DX
+	JA fail
+	CMPQ DX, $16
+	JAE sse42
+no_sse42:
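+	// Dispatch on the length of sep; each case loads sep into registers once
+	// and then scans s.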
+	CMPQ AX, $2
+	JA   _3_or_more
+	MOVW (BP), BP
+	LEAQ -1(DI)(DX*1), DX
+loop2:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop2
+	JMP fail
+_3_or_more:
+	CMPQ AX, $3
+	JA   _4_or_more
+	MOVW 1(BP), BX
+	MOVW (BP), BP
+	LEAQ -2(DI)(DX*1), DX
+loop3:
+	MOVW (DI), SI
+	CMPW SI,BP
+	JZ   partial_success3
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+partial_success3:
+	MOVW 1(DI), SI
+	CMPW SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop3
+	JMP fail
+_4_or_more:
+	CMPQ AX, $4
+	JA   _5_or_more
+	MOVL (BP), BP
+	LEAQ -3(DI)(DX*1), DX
+loop4:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop4
+	JMP fail
+_5_or_more:
+	CMPQ AX, $7
+	JA   _8_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVL -4(BP)(AX*1), BX
+	MOVL (BP), BP
+loop5to7:
+	MOVL (DI), SI
+	CMPL SI,BP
+	JZ   partial_success5to7
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+partial_success5to7:
+	MOVL -4(AX)(DI*1), SI
+	CMPL SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop5to7
+	JMP fail
+_8_or_more:
+	CMPQ AX, $8
+	JA   _9_or_more
+	MOVQ (BP), BP
+	LEAQ -7(DI)(DX*1), DX
+loop8:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop8
+	JMP fail
+_9_or_more:
+	CMPQ AX, $16
+	JA   _16_or_more
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVQ -8(BP)(AX*1), BX
+	MOVQ (BP), BP
+loop9to15:
+	MOVQ (DI), SI
+	CMPQ SI,BP
+	JZ   partial_success9to15
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+partial_success9to15:
+	MOVQ -8(AX)(DI*1), SI
+	CMPQ SI,BX
+	JZ success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop9to15
+	JMP fail
+_16_or_more:
+	CMPQ AX, $16
+	JA   _17_to_31
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), DX
+loop16:
+	MOVOU (DI), X2
+	PCMPEQB X1, X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop16
+	JMP fail
+_17_to_31:
+	LEAQ 1(DI)(DX*1), DX
+	SUBQ AX, DX
+	MOVOU -16(BP)(AX*1), X0
+	MOVOU (BP), X1
+loop17to31:
+	MOVOU (DI), X2
+	PCMPEQB X1,X2
+	PMOVMSKB X2, SI
+	CMPQ  SI, $0xffff
+	JE   partial_success17to31
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+	JMP fail
+partial_success17to31:
+	MOVOU -16(AX)(DI*1), X3
+	PCMPEQB X0, X3
+	PMOVMSKB X3, SI
+	CMPQ  SI, $0xffff
+	JE success
+	ADDQ $1,DI
+	CMPQ DI,DX
+	JB loop17to31
+fail:
+	MOVQ $-1, ret+32(FP)
+	RET
+sse42:
+	MOVL runtime·cpuid_ecx(SB), CX
+	ANDL $0x100000, CX
+	JZ no_sse42
+	CMPQ AX, $12
+	// PCMPESTRI is slower than a normal compare,
+	// so using it makes sense only if we advance 4+ bytes per compare.
+	// This threshold was determined experimentally and is about the same
+	// on Nehalem (the first chip with SSE42) and Haswell.
+	JAE _9_or_more
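+	// If a 16-byte load of sep would cross a page boundary, fall back to no_sse42.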
+	LEAQ 16(BP), SI
+	TESTW $0xff0, SI
+	JEQ no_sse42
+	MOVOU (BP), X1
+	LEAQ -15(DI)(DX*1), SI
+	MOVQ $16, R9
+	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
+	// for equality (bits 2,3 are 11)
+	// result is not masked or inverted (bits 4,5 are 00)
+	// and corresponds to first matching byte (bit 6 is 0)
+	PCMPESTRI $0x0c, (DI), X1
+	// CX == 16 means no match,
+	// CX > R9 means partial match at the end of the string,
+	// otherwise sep is at offset CX from X1 start
+	CMPQ CX, R9
+	JBE sse42_success
+	ADDQ R9, DI
+	CMPQ DI, SI
+	JB loop_sse42
+	PCMPESTRI $0x0c, -1(SI), X1
+	CMPQ CX, R9
+	JA fail
+	LEAQ -1(SI), DI
+sse42_success:
+	ADDQ CX, DI
+success:
+	SUBQ s+0(FP), DI
+	MOVQ DI, ret+32(FP)
+	RET
+
+
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
 	MOVQ s+0(FP), SI
 	MOVQ s_len+8(FP), BX
@@ -1565,88 +1918,125 @@
 //   AL: byte sought
 //   R8: address to put result
 TEXT runtime·indexbytebody(SB),NOSPLIT,$0
-	MOVQ SI, DI
-
-	CMPQ BX, $16
-	JLT small
-
-	// round up to first 16-byte boundary
-	TESTQ $15, SI
-	JZ aligned
-	MOVQ SI, CX
-	ANDQ $~15, CX
-	ADDQ $16, CX
-
-	// search the beginning
-	SUBQ SI, CX
-	REPN; SCASB
-	JZ success
-
-// DI is 16-byte aligned; get ready to search using SSE instructions
-aligned:
-	// round down to last 16-byte boundary
-	MOVQ BX, R11
-	ADDQ SI, R11
-	ANDQ $~15, R11
-
-	// shuffle X0 around so that each byte contains c
+	// Shuffle X0 around so that each byte contains
+	// the character we're looking for.
 	MOVD AX, X0
 	PUNPCKLBW X0, X0
 	PUNPCKLBW X0, X0
 	PSHUFL $0, X0, X0
-	JMP condition
+	
+	CMPQ BX, $16
+	JLT small
 
+	MOVQ SI, DI
+
+	CMPQ BX, $32
+	JA avx2
 sse:
-	// move the next 16-byte chunk of the buffer into X1
-	MOVO (DI), X1
-	// compare bytes in X0 to X1
-	PCMPEQB X0, X1
-	// take the top bit of each byte in X1 and put the result in DX
+	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
+	JMP	sseloopentry
+	
+sseloop:
+	// Move the next 16-byte chunk of the data into X1.
+	MOVOU	(DI), X1
+	// Compare bytes in X0 to X1.
+	PCMPEQB	X0, X1
+	// Take the top bit of each byte in X1 and put the result in DX.
 	PMOVMSKB X1, DX
-	TESTL DX, DX
-	JNZ ssesuccess
-	ADDQ $16, DI
+	// Find first set bit, if any.
+	BSFL	DX, DX
+	JNZ	ssesuccess
+	// Advance to next block.
+	ADDQ	$16, DI
+sseloopentry:
+	CMPQ	DI, AX
+	JB	sseloop
 
-condition:
-	CMPQ DI, R11
-	JLT sse
-
-	// search the end
-	MOVQ SI, CX
-	ADDQ BX, CX
-	SUBQ R11, CX
-	// if CX == 0, the zero flag will be set and we'll end up
-	// returning a false success
-	JZ failure
-	REPN; SCASB
-	JZ success
+	// Search the last 16-byte chunk. This chunk may overlap with the
+	// chunks we've already searched, but that's ok.
+	MOVQ	AX, DI
+	MOVOU	(AX), X1
+	PCMPEQB	X0, X1
+	PMOVMSKB X1, DX
+	BSFL	DX, DX
+	JNZ	ssesuccess
 
 failure:
 	MOVQ $-1, (R8)
 	RET
 
+// We've found a chunk containing the byte.
+// The chunk was loaded from DI.
+// The index of the matching byte in the chunk is DX.
+// The start of the data is SI.
+ssesuccess:
+	SUBQ SI, DI	// Compute offset of chunk within data.
+	ADDQ DX, DI	// Add offset of byte within chunk.
+	MOVQ DI, (R8)
+	RET
+
 // handle for lengths < 16
 small:
-	MOVQ BX, CX
-	REPN; SCASB
-	JZ success
+	TESTQ	BX, BX
+	JEQ	failure
+
+	// Check if we'll load across a page boundary.
+	LEAQ	16(SI), AX
+	TESTW	$0xff0, AX
+	JEQ	endofpage
+
+	MOVOU	(SI), X1 // Load data
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	CMPL	DX, BX
+	JAE	failure	// Match is past end of data.
+	MOVQ	DX, (R8)
+	RET
+
+endofpage:
+	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
+	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
+	PMOVMSKB X1, DX	// Move result bits to integer register.
+	MOVL	BX, CX
+	SHLL	CX, DX
+	SHRL	$16, DX	// Shift desired bits down to bottom of register.
+	BSFL	DX, DX	// Find first set bit.
+	JZ	failure	// No set bit, failure.
+	MOVQ	DX, (R8)
+	RET
+
+avx2:
+	CMPB runtime·support_avx2(SB), $1
+	JNE sse
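+	// 32 bytes at a time using ymm registers.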
+	MOVD AX, X0
+	LEAQ -32(SI)(BX*1), R11
+	VPBROADCASTB  X0, Y1
+avx2_loop:
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	ADDQ $32, DI
+	CMPQ DI, R11
+	JLT avx2_loop
+	MOVQ R11, DI
+	VMOVDQU (DI), Y2
+	VPCMPEQB Y1, Y2, Y3
+	VPTEST Y3, Y3
+	JNZ avx2success
+	VZEROUPPER
 	MOVQ $-1, (R8)
 	RET
 
-// we've found the chunk containing the byte
-// now just figure out which specific byte it is
-ssesuccess:
-	// get the index of the least significant set bit
-	BSFW DX, DX
+avx2success:
+	VPMOVMSKB Y3, DX
+	BSFL DX, DX
 	SUBQ SI, DI
 	ADDQ DI, DX
 	MOVQ DX, (R8)
-	RET
-
-success:
-	SUBQ SI, DI
-	SUBL $1, DI
-	MOVQ DI, (R8)
+	VZEROUPPER
 	RET
 
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
diff --git a/src/runtime/asm_amd64p32.s b/src/runtime/asm_amd64p32.s
index 6e97256..452ce04 100644
--- a/src/runtime/asm_amd64p32.s
+++ b/src/runtime/asm_amd64p32.s
@@ -39,13 +39,13 @@
 nocpuinfo:	
 	
 needtls:
-	LEAL	runtime·tls0(SB), DI
+	LEAL	runtime·m0+m_tls(SB), DI
 	CALL	runtime·settls(SB)
 
 	// store through it, to make sure it works
 	get_tls(BX)
 	MOVQ	$0x123, g(BX)
-	MOVQ	runtime·tls0(SB), AX
+	MOVQ	runtime·m0+m_tls(SB), AX
 	CMPQ	AX, $0x123
 	JEQ 2(PC)
 	MOVL	AX, 0	// abort
@@ -133,7 +133,7 @@
 
 // func mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB), NOSPLIT, $0-4
 	MOVL	fn+0(FP), DI
@@ -166,7 +166,7 @@
 	RET
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
@@ -198,7 +198,7 @@
 	CALL	AX
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVL	$runtime·systemstack_switch(SB), SI
 	MOVL	SI, (g_sched+gobuf_pc)(AX)
@@ -415,118 +415,6 @@
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-17
-	JMP	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·atomicstore(SB)
-
-// bool	runtime·cas64(uint64 *val, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVL	ptr+0(FP), BX
-	MOVQ	old+8(FP), AX
-	MOVQ	new+16(FP), CX
-	LOCK
-	CMPXCHGQ	CX, 0(BX)
-	SETEQ	ret+24(FP)
-	RET
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-17
-	MOVL	ptr+0(FP), BX
-	MOVL	old+4(FP), AX
-	MOVL	new+8(FP), CX
-	LOCK
-	CMPXCHGL	CX, 0(BX)
-	SETEQ	ret+16(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *val, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	delta+4(FP), AX
-	MOVL	AX, CX
-	LOCK
-	XADDL	AX, 0(BX)
-	ADDL	CX, AX
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	delta+8(FP), AX
-	MOVQ	AX, CX
-	LOCK
-	XADDQ	AX, 0(BX)
-	ADDQ	CX, AX
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xadduintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xadd(SB)
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVL	ptr+0(FP), BX
-	MOVQ	new+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	MOVQ	AX, ret+16(FP)
-	RET
-
-TEXT runtime·xchgp1(SB), NOSPLIT, $0-12
-	MOVL	ptr+0(FP), BX
-	MOVL	new+4(FP), AX
-	XCHGL	AX, 0(BX)
-	MOVL	AX, ret+8(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-12
-	JMP	runtime·xchg(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVL	cycles+0(FP), AX
 again:
@@ -535,40 +423,6 @@
 	JNZ	again
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-8
-	MOVL	ptr+0(FP), BX
-	MOVL	val+4(FP), AX
-	XCHGL	AX, 0(BX)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVL	ptr+0(FP), BX
-	MOVQ	val+8(FP), AX
-	XCHGQ	AX, 0(BX)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ORB	AX, 0(BX)
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-5
-	MOVL	ptr+0(FP), BX
-	MOVB	val+4(FP), AX
-	LOCK
-	ANDB	AX, 0(BX)
-	RET
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$0-0
 	// Stores are already ordered on x86, so this is just a
 	// compile barrier.
@@ -627,15 +481,18 @@
 	MOVL	ptr+0(FP), DI
 	MOVL	n+4(FP), CX
 	MOVQ	CX, BX
-	ANDQ	$7, BX
-	SHRQ	$3, CX
+	ANDQ	$3, BX
+	SHRQ	$2, CX
 	MOVQ	$0, AX
 	CLD
 	REP
-	STOSQ
+	STOSL
 	MOVQ	BX, CX
 	REP
 	STOSB
+	// Note: we zero only 4 bytes at a time so that the tail is at most
+	// 3 bytes. That guarantees that we aren't zeroing pointers with STOSB.
+	// See issue 13160.
 	RET
 
 TEXT runtime·getcallerpc(SB),NOSPLIT,$8-12
@@ -716,13 +573,19 @@
 	MOVL	AX, ret+16(FP)
 	RET
 
-TEXT runtime·memeq(SB),NOSPLIT,$0-17
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$0-17
 	MOVL	a+0(FP), SI
 	MOVL	b+4(FP), DI
+	CMPL	SI, DI
+	JEQ	eq
 	MOVL	size+8(FP), BX
 	CALL	runtime·memeqbody(SB)
 	MOVB	AX, ret+16(FP)
 	RET
+eq:
+	MOVB	$1, ret+16(FP)
+	RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
 TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-9
@@ -829,7 +692,7 @@
 	MOVQ	(SI), SI
 	JMP	si_finish
 si_high:
-	// address ends in 11111xxx.  Load up to bytes we want, move to correct position.
+	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
 	MOVQ	BX, DX
 	ADDQ	SI, DX
 	MOVQ	-8(DX), SI
@@ -1155,3 +1018,7 @@
 	MOVL	addr+0(FP), AX
 	PREFETCHNTA	(AX)
 	RET
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
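+	// No assembly-level alignment invariants to verify here; report success.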
+	MOVB	$1, ret+0(FP)
+	RET
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 9c32e42..f02297e 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -31,7 +31,8 @@
 	MOVW	R8, g_m(g)
 
 	// create istack out of the OS stack
-	MOVW	$(-8192+104)(R13), R0
+	// (1MB of system stack is available on iOS and Android)
+	MOVW	$(-64*1024+104)(R13), R0
 	MOVW	R0, g_stackguard0(g)
 	MOVW	R0, g_stackguard1(g)
 	MOVW	R0, (g_stack+stack_lo)(g)
@@ -85,8 +86,12 @@
 #ifdef GOOS_nacl
 	WORD	$0xe125be7f	// BKPT 0x5bef, NACL_INSTR_ARM_BREAKPOINT
 #else
+#ifdef GOOS_plan9
+	WORD	$0xD1200070	// undefined instruction used as armv5 breakpoint in Plan 9
+#else
 	WORD	$0xe7f001f0	// undefined instruction that gdb understands is a software breakpoint
 #endif
+#endif
 	RET
 
 TEXT runtime·asminit(SB),NOSPLIT,$0-0
@@ -148,7 +153,7 @@
 
 // func mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB),NOSPLIT,$-4-4
 	// Save caller state in g->sched.
@@ -180,7 +185,7 @@
 	RET
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
@@ -212,7 +217,7 @@
 	BL	(R0)
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVW	$runtime·systemstack_switch(SB), R3
 #ifdef GOOS_nacl
@@ -525,23 +530,25 @@
 	MOVW	R0, ret+8(FP)
 	RET
 
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$12-12
+TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
 	MOVW	$fn+0(FP), R0
 	MOVW	R0, 4(R13)
 	MOVW	frame+4(FP), R0
 	MOVW	R0, 8(R13)
 	MOVW	framesize+8(FP), R0
 	MOVW	R0, 12(R13)
+	MOVW	ctxt+12(FP), R0
+	MOVW	R0, 16(R13)
 	MOVW	$runtime·cgocallback_gofunc(SB), R0
 	BL	(R0)
 	RET
 
-// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize)
+// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // See cgocall.go for more details.
-TEXT	·cgocallback_gofunc(SB),NOSPLIT,$8-12
+TEXT	·cgocallback_gofunc(SB),NOSPLIT,$8-16
 	NO_LOCAL_POINTERS
 	
 	// Load m and g from thread-local storage.
@@ -555,7 +562,13 @@
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
 	CMP	$0, g
-	B.NE	havem
+	B.EQ	needm
+
+	MOVW	g_m(g), R8
+	MOVW	R8, savedm-4(SP)
+	B	havem
+
+needm:
 	MOVW	g, savedm-4(SP) // g is zero, so is m.
 	MOVW	$runtime·needm(SB), R0
 	BL	(R0)
@@ -576,8 +589,6 @@
 	MOVW	R13, (g_sched+gobuf_sp)(R3)
 
 havem:
-	MOVW	g_m(g), R8
-	MOVW	R8, savedm-4(SP)
 	// Now there's a valid m, and we're running on its m->g0.
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
@@ -602,12 +613,15 @@
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
 	//
-	// In the new goroutine, -8(SP) and -4(SP) are unused.
+	// In the new goroutine, -4(SP) is unused (where SP refers to
+	// m->curg's SP while we're setting it up, before we've adjusted it).
 	MOVW	m_curg(R8), R0
 	BL	setg<>(SB)
 	MOVW	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
 	MOVW	(g_sched+gobuf_pc)(g), R5
 	MOVW	R5, -12(R4)
+	MOVW	ctxt+12(FP), R0
+	MOVW	R0, -8(R4)
 	MOVW	$-12(R4), R13
 	BL	runtime·cgocallbackg(SB)
 
@@ -694,67 +708,10 @@
 	MOVW	$0, R0
 	MOVW	(R0), R1
 
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-//
-// To implement runtime·cas in sys_$GOOS_arm.s
-// using the native instructions, use:
-//
-//	TEXT runtime·cas(SB),NOSPLIT,$0
-//		B	runtime·armcas(SB)
-//
-TEXT runtime·armcas(SB),NOSPLIT,$0-13
-	MOVW	valptr+0(FP), R1
-	MOVW	old+4(FP), R2
-	MOVW	new+8(FP), R3
-casl:
-	LDREX	(R1), R0
-	CMP	R0, R2
-	BNE	casfail
-
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
-	BLT	2(PC)
-	WORD	$0xf57ff05a	// dmb ishst
-
-	STREX	R3, (R1), R0
-	CMP	$0, R0
-	BNE	casl
-	MOVW	$1, R0
-
-	MOVB	runtime·goarm(SB), R11
-	CMP	$7, R11
-	BLT	2(PC)
-	WORD	$0xf57ff05b	// dmb ish
-
-	MOVB	R0, ret+12(FP)
-	RET
-casfail:
-	MOVW	$0, R0
-	MOVB	R0, ret+12(FP)
-	RET
-
-TEXT runtime·casuintptr(SB),NOSPLIT,$0-13
-	B	runtime·cas(SB)
-
-TEXT runtime·atomicloaduintptr(SB),NOSPLIT,$0-8
-	B	runtime·atomicload(SB)
-
-TEXT runtime·atomicloaduint(SB),NOSPLIT,$0-8
-	B	runtime·atomicload(SB)
-
-TEXT runtime·atomicstoreuintptr(SB),NOSPLIT,$0-8
-	B	runtime·atomicstore(SB)
-
 // armPublicationBarrier is a native store/store barrier for ARMv7+.
 // On earlier ARM revisions, armPublicationBarrier is a no-op.
 // This will not work on SMP ARMv6 machines, if any are in use.
-// To implement publiationBarrier in sys_$GOOS_arm.s using the native
+// To implement publicationBarrier in sys_$GOOS_arm.s using the native
 // instructions, use:
 //
 //	TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
@@ -798,13 +755,16 @@
 	MOVW	R0, ret+8(FP)
 	RET
 
-TEXT runtime·memeq(SB),NOSPLIT,$-4-13
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$-4-13
 	MOVW	a+0(FP), R1
 	MOVW	b+4(FP), R2
 	MOVW	size+8(FP), R3
 	ADD	R1, R3, R6
 	MOVW	$1, R0
 	MOVB	R0, ret+12(FP)
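+	// If the pointers are equal, return early; ret is already 1.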
+	CMP	R1, R2
+	RET.EQ
 loop:
 	CMP	R1, R6
 	RET.EQ
@@ -827,7 +787,7 @@
 	MOVW	R0, 4(R13)
 	MOVW	R1, 8(R13)
 	MOVW	R2, 12(R13)
-	BL	runtime·memeq(SB)
+	BL	runtime·memequal(SB)
 	MOVB	16(R13), R0
 	MOVB	R0, ret+8(FP)
 	RET
@@ -862,6 +822,8 @@
 // On exit:
 // R4, R5, and R6 are clobbered
 TEXT runtime·cmpbody(SB),NOSPLIT,$-4-0
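+	// same starting pointers; compare lengths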
+	CMP	R2, R3
+	BEQ	samebytes
 	CMP 	R0, R1
 	MOVW 	R0, R6
 	MOVW.LT	R1, R6	// R6 is min(R0, R1)
@@ -912,7 +874,7 @@
 	MOVB	R8, v+16(FP)
 	RET
 
-// TODO: share code with memeq?
+// TODO: share code with memequal?
 TEXT bytes·Equal(SB),NOSPLIT,$0-25
 	MOVW	a_len+4(FP), R1
 	MOVW	b_len+16(FP), R3
@@ -1016,7 +978,7 @@
 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
 // Must obey the gcc calling convention.
 TEXT _cgo_topofstack(SB),NOSPLIT,$8
-	// R11 and g register are clobbered by load_g.  They are
+	// R11 and g register are clobbered by load_g. They are
 	// callee-save in the gcc calling convention, so save them here.
 	MOVW	R11, saveR11-4(SP)
 	MOVW	g, saveG-8(SP)
@@ -1070,3 +1032,22 @@
 	MULU	R0, R1
 	SUB	R1, R3, R1
 	RET
+
+TEXT runtime·sigreturn(SB),NOSPLIT,$0-4
+        RET
+
+#ifndef GOOS_nacl
+// This is called from .init_array and follows the platform, not Go, ABI.
+TEXT runtime·addmoduledata(SB),NOSPLIT,$0-4
+	MOVW	R9, saver9-4(SP) // The access to global variables below implicitly uses R9, which is callee-save
+	MOVW	runtime·lastmoduledatap(SB), R1
+	MOVW	R0, moduledata_next(R1)
+	MOVW	R0, runtime·lastmoduledatap(SB)
+	MOVW	saver9-4(SP), R9
+	RET
+#endif
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVW	$1, R3
+	MOVB	R3, ret+0(FP)
+	RET
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 9aff9c7..7ebd7ba 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -36,9 +36,9 @@
 	MRS_TPIDR_R0			// load TLS base pointer
 	MOVD	R0, R3			// arg 3: TLS base pointer
 #ifdef TLSG_IS_VARIABLE
-	MOVD	$runtime·tls_g(SB), R2 	// arg 2: tlsg
+	MOVD	$runtime·tls_g(SB), R2 	// arg 2: &tls_g
 #else
-	MOVD	$0x10, R2		// arg 2: tlsg TODO(minux): hardcoded for linux
+	MOVD	$0, R2		        // arg 2: not used when using platform's TLS
 #endif
 	MOVD	$setg_gcc<>(SB), R1	// arg 1: setg
 	MOVD	g, R0			// arg 0: G
@@ -149,7 +149,7 @@
 
 // void mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
 TEXT runtime·mcall(SB), NOSPLIT, $-8-8
 	// Save caller state in g->sched
@@ -178,7 +178,7 @@
 	B	runtime·badmcall2(SB)
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
@@ -211,7 +211,7 @@
 	BL	(R3)
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVD	$runtime·systemstack_switch(SB), R6
 	ADD	$8, R6	// get past prologue
@@ -426,7 +426,6 @@
 
 // These have 8 added to make the overall frame size a multiple of 16,
 // as required by the ABI. (There is another +8 for the saved LR.)
-CALLFN(·call16, 24 )
 CALLFN(·call32, 40 )
 CALLFN(·call64, 72 )
 CALLFN(·call128, 136 )
@@ -454,40 +453,6 @@
 CALLFN(·call536870912, 536870920 )
 CALLFN(·call1073741824, 1073741832 )
 
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVD	ptr+0(FP), R0
-	MOVW	old+8(FP), R1
-	MOVW	new+12(FP), R2
-again:
-	LDAXRW	(R0), R3
-	CMPW	R1, R3
-	BNE	ok
-	STLXRW	R2, (R0), R3
-	CBNZ	R3, again
-ok:
-	CSET	EQ, R0
-	MOVB	R0, ret+16(FP)
-	RET
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	B	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $-8-16
-	B	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $-8-16
-	B	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	B	runtime·atomicstore64(SB)
-
 // AES hashing not implemented for ARM64, issue #10109.
 TEXT runtime·aeshash(SB),NOSPLIT,$-8-0
 	MOVW	$0, R0
@@ -501,17 +466,7 @@
 TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0
 	MOVW	$0, R0
 	MOVW	(R0), R1
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	B runtime·cas64(SB)
-
+	
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	MOVWU	cycles+0(FP), R0
 again:
@@ -587,7 +542,7 @@
 	BL	(R1)
 	MOVD	R0, R9
 
-	// Restore g, stack pointer.  R0 is errno, so don't touch it
+	// Restore g, stack pointer. R0 is errno, so don't touch it
 	MOVD	0(RSP), g
 	BL	runtime·save_g(SB)
 	MOVD	(g_stack+stack_hi)(g), R5
@@ -599,23 +554,25 @@
 	MOVW	R0, ret+16(FP)
 	RET
 
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
+TEXT runtime·cgocallback(SB),NOSPLIT,$40-32
 	MOVD	$fn+0(FP), R0
 	MOVD	R0, 8(RSP)
 	MOVD	frame+8(FP), R0
 	MOVD	R0, 16(RSP)
 	MOVD	framesize+16(FP), R0
 	MOVD	R0, 24(RSP)
+	MOVD	ctxt+24(FP), R0
+	MOVD	R0, 32(RSP)
 	MOVD	$runtime·cgocallback_gofunc(SB), R0
 	BL	(R0)
 	RET
 
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
 // See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-24
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32
 	NO_LOCAL_POINTERS
 
 	// Load g from thread-local storage.
@@ -631,7 +588,13 @@
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
 	CMP	$0, g
-	BNE	havem
+	BEQ	needm
+
+	MOVD	g_m(g), R8
+	MOVD	R8, savedm-8(SP)
+	B	havem
+
+needm:
 	MOVD	g, savedm-8(SP) // g is zero, so is m.
 	MOVD	$runtime·needm(SB), R0
 	BL	(R0)
@@ -653,8 +616,6 @@
 	MOVD	R0, (g_sched+gobuf_sp)(R3)
 
 havem:
-	MOVD	g_m(g), R8
-	MOVD	R8, savedm-8(SP)
 	// Now there's a valid m, and we're running on its m->g0.
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
@@ -681,13 +642,16 @@
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
 	//
-	// In the new goroutine, -16(SP) and -8(SP) are unused.
+	// In the new goroutine, -8(SP) is unused (where SP refers to
+	// m->curg's SP while we're setting it up, before we've adjusted it).
 	MOVD	m_curg(R8), g
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
 	MOVD	(g_sched+gobuf_pc)(g), R5
-	MOVD	R5, -(24+8)(R4)	// maintain 16-byte SP alignment
-	MOVD	$-(24+8)(R4), R0
+	MOVD	R5, -(24+8)(R4)
+	MOVD	ctxt+24(FP), R0
+	MOVD	R0, -(16+8)(R4)
+	MOVD	$-(24+8)(R4), R0 // maintain 16-byte SP alignment
 	MOVD	R0, RSP
 	BL	runtime·cgocallbackg(SB)
 
@@ -806,13 +770,16 @@
 	MOVD	R3, ret+16(FP)
 	RET
 
-TEXT runtime·memeq(SB),NOSPLIT,$-8-25
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$-8-25
 	MOVD	a+0(FP), R1
 	MOVD	b+8(FP), R2
 	MOVD	size+16(FP), R3
 	ADD	R1, R3, R6
 	MOVD	$1, R0
 	MOVB	R0, ret+24(FP)
+	CMP	R1, R2
+	BEQ	done
 loop:
 	CMP	R1, R6
 	BEQ	done
@@ -835,7 +802,7 @@
 	MOVD	R3, 8(RSP)
 	MOVD	R4, 16(RSP)
 	MOVD	R5, 24(RSP)
-	BL	runtime·memeq(SB)
+	BL	runtime·memequal(SB)
 	MOVBU	32(RSP), R3
 	MOVB	R3, ret+16(FP)
 	RET
@@ -870,6 +837,8 @@
 // On exit:
 // R4, R5, and R6 are clobbered
 TEXT runtime·cmpbody<>(SB),NOSPLIT,$-4-0
+	CMP	R2, R3
+	BEQ	samebytes // same starting pointers; compare lengths
 	CMP	R0, R1
 	CSEL    LT, R1, R0, R6 // R6 is min(R0, R1)
 
@@ -968,7 +937,7 @@
 	MOVD	R0, ret+24(FP)
 	RET
 
-// TODO: share code with memeq?
+// TODO: share code with memequal?
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
 	MOVD	a_len+8(FP), R1
 	MOVD	b_len+32(FP), R3
@@ -1027,3 +996,21 @@
 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
 	RET
 
+TEXT runtime·sigreturn(SB),NOSPLIT,$0-8
+        RET
+
+// This is called from .init_array and follows the platform, not Go, ABI.
+TEXT runtime·addmoduledata(SB),NOSPLIT,$0-0
+	SUB	$0x10, RSP
+	MOVD	R27, 8(RSP) // The access to global variables below implicitly uses R27, which is callee-save
+	MOVD	runtime·lastmoduledatap(SB), R1
+	MOVD	R0, moduledata_next(R1)
+	MOVD	R0, runtime·lastmoduledatap(SB)
+	MOVD	8(RSP), R27
+	ADD	$0x10, RSP
+	RET
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVW	$1, R3
+	MOVB	R3, ret+0(FP)
+	RET
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
new file mode 100644
index 0000000..7dd35aa
--- /dev/null
+++ b/src/runtime/asm_mips64x.s
@@ -0,0 +1,895 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+#define	REGCTXT	R22
+
+TEXT runtime·rt0_go(SB),NOSPLIT,$0
+	// R29 = stack; R4 = argc; R5 = argv
+
+	// initialize essential registers
+	JAL	runtime·reginit(SB)
+
+	ADDV	$-24, R29
+	MOVW	R4, 8(R29) // argc
+	MOVV	R5, 16(R29) // argv
+
+	// create istack out of the given (operating system) stack.
+	// _cgo_init may update stackguard.
+	MOVV	$runtime·g0(SB), g
+	MOVV	$(-64*1024), R23
+	ADDV	R23, R29, R1
+	MOVV	R1, g_stackguard0(g)
+	MOVV	R1, g_stackguard1(g)
+	MOVV	R1, (g_stack+stack_lo)(g)
+	MOVV	R29, (g_stack+stack_hi)(g)
+
+	// if there is a _cgo_init, call it using the gcc ABI.
+	MOVV	_cgo_init(SB), R25
+	BEQ	R25, nocgo
+
+	MOVV	R0, R7	// arg 3: not used
+	MOVV	R0, R6	// arg 2: not used
+	MOVV	$setg_gcc<>(SB), R5	// arg 1: setg
+	MOVV	g, R4	// arg 0: G
+	JAL	(R25)
+
+nocgo:
+	// update stackguard after _cgo_init
+	MOVV	(g_stack+stack_lo)(g), R1
+	ADDV	$const__StackGuard, R1
+	MOVV	R1, g_stackguard0(g)
+	MOVV	R1, g_stackguard1(g)
+
+	// set the per-goroutine and per-mach "registers"
+	MOVV	$runtime·m0(SB), R1
+
+	// save m->g0 = g0
+	MOVV	g, m_g0(R1)
+	// save m0 to g0->m
+	MOVV	R1, g_m(g)
+
+	JAL	runtime·check(SB)
+
+	// args are already prepared
+	JAL	runtime·args(SB)
+	JAL	runtime·osinit(SB)
+	JAL	runtime·schedinit(SB)
+
+	// create a new goroutine to start program
+	MOVV	$runtime·mainPC(SB), R1		// entry
+	ADDV	$-24, R29
+	MOVV	R1, 16(R29)
+	MOVV	R0, 8(R29)
+	MOVV	R0, 0(R29)
+	JAL	runtime·newproc(SB)
+	ADDV	$24, R29
+
+	// start this M
+	JAL	runtime·mstart(SB)
+
+	MOVV	R0, 1(R0)
+	RET
+
+DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
+GLOBL	runtime·mainPC(SB),RODATA,$8
+
+TEXT runtime·breakpoint(SB),NOSPLIT,$-8-0
+	MOVV	R0, 2(R0) // TODO: TD
+	RET
+
+TEXT runtime·asminit(SB),NOSPLIT,$-8-0
+	RET
+
+TEXT _cgo_reginit(SB),NOSPLIT,$-8-0
+	// crosscall1 needs to reginit, but can't
+	// get at the 'runtime.reginit' symbol.
+	JMP	runtime·reginit(SB)
+
+TEXT runtime·reginit(SB),NOSPLIT,$-8-0
+	// initialize essential FP registers
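+	// (F24 = 0.0, F26 = 0.5, F28 = 1.0, F30 = 2.0)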
+	MOVD	$0.5, F26
+	SUBD	F26, F26, F24
+	ADDD	F26, F26, F28
+	ADDD	F28, F28, F30
+	RET
+
+/*
+ *  go-routine
+ */
+
+// void gosave(Gobuf*)
+// save state in Gobuf; setjmp
+TEXT runtime·gosave(SB), NOSPLIT, $-8-8
+	MOVV	buf+0(FP), R1
+	MOVV	R29, gobuf_sp(R1)
+	MOVV	R31, gobuf_pc(R1)
+	MOVV	g, gobuf_g(R1)
+	MOVV	R0, gobuf_lr(R1)
+	MOVV	R0, gobuf_ret(R1)
+	MOVV	R0, gobuf_ctxt(R1)
+	RET
+
+// void gogo(Gobuf*)
+// restore state from Gobuf; longjmp
+TEXT runtime·gogo(SB), NOSPLIT, $-8-8
+	MOVV	buf+0(FP), R3
+	MOVV	gobuf_g(R3), g	// make sure g is not nil
+	JAL	runtime·save_g(SB)
+
+	MOVV	0(g), R2
+	MOVV	gobuf_sp(R3), R29
+	MOVV	gobuf_lr(R3), R31
+	MOVV	gobuf_ret(R3), R1
+	MOVV	gobuf_ctxt(R3), REGCTXT
+	MOVV	R0, gobuf_sp(R3)
+	MOVV	R0, gobuf_ret(R3)
+	MOVV	R0, gobuf_lr(R3)
+	MOVV	R0, gobuf_ctxt(R3)
+	MOVV	gobuf_pc(R3), R4
+	JMP	(R4)
+
+// void mcall(fn func(*g))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return. It should gogo(&g->sched)
+// to keep running g.
+TEXT runtime·mcall(SB), NOSPLIT, $-8-8
+	// Save caller state in g->sched
+	MOVV	R29, (g_sched+gobuf_sp)(g)
+	MOVV	R31, (g_sched+gobuf_pc)(g)
+	MOVV	R0, (g_sched+gobuf_lr)(g)
+	MOVV	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOVV	g, R1
+	MOVV	g_m(g), R3
+	MOVV	m_g0(R3), g
+	JAL	runtime·save_g(SB)
+	BNE	g, R1, 2(PC)
+	JMP	runtime·badmcall(SB)
+	MOVV	fn+0(FP), REGCTXT			// context
+	MOVV	0(REGCTXT), R4			// code pointer
+	MOVV	(g_sched+gobuf_sp)(g), R29	// sp = m->g0->sched.sp
+	ADDV	$-16, R29
+	MOVV	R1, 8(R29)
+	MOVV	R0, 0(R29)
+	JAL	(R4)
+	JMP	runtime·badmcall2(SB)
+
+// systemstack_switch is a dummy routine that systemstack leaves at the bottom
+// of the G stack. We need to distinguish the routine that
+// lives at the bottom of the G stack from the one that lives
+// at the top of the system stack because the one at the top of
+// the system stack terminates the stack walk (see topofstack()).
+TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
+	UNDEF
+	JAL	(R31)	// make sure this function is not leaf
+	RET
+
+// func systemstack(fn func())
+TEXT runtime·systemstack(SB), NOSPLIT, $0-8
+	MOVV	fn+0(FP), R1	// R1 = fn
+	MOVV	R1, REGCTXT		// context
+	MOVV	g_m(g), R2	// R2 = m
+
+	MOVV	m_gsignal(R2), R3	// R3 = gsignal
+	BEQ	g, R3, noswitch
+
+	MOVV	m_g0(R2), R3	// R3 = g0
+	BEQ	g, R3, noswitch
+
+	MOVV	m_curg(R2), R4
+	BEQ	g, R4, switch
+
+	// Bad: g is not gsignal, not g0, not curg. What is it?
+	// Hide call from linker nosplit analysis.
+	MOVV	$runtime·badsystemstack(SB), R4
+	JAL	(R4)
+
+switch:
+	// save our state in g->sched. Pretend to
+	// be systemstack_switch if the G stack is scanned.
+	MOVV	$runtime·systemstack_switch(SB), R4
+	ADDV	$8, R4	// get past prologue
+	MOVV	R4, (g_sched+gobuf_pc)(g)
+	MOVV	R29, (g_sched+gobuf_sp)(g)
+	MOVV	R0, (g_sched+gobuf_lr)(g)
+	MOVV	g, (g_sched+gobuf_g)(g)
+
+	// switch to g0
+	MOVV	R3, g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R1
+	// make it look like mstart called systemstack on g0, to stop traceback
+	ADDV	$-8, R1
+	MOVV	$runtime·mstart(SB), R2
+	MOVV	R2, 0(R1)
+	MOVV	R1, R29
+
+	// call target function
+	MOVV	0(REGCTXT), R4	// code pointer
+	JAL	(R4)
+
+	// switch back to g
+	MOVV	g_m(g), R1
+	MOVV	m_curg(R1), g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R29
+	MOVV	R0, (g_sched+gobuf_sp)(g)
+	RET
+
+noswitch:
+	// already on m stack, just call directly
+	MOVV	0(REGCTXT), R4	// code pointer
+	JAL	(R4)
+	RET
+
+/*
+ * support for morestack
+ */
+
+// Called during function prolog when more stack is needed.
+// Caller has already loaded:
+// R1: framesize, R2: argsize, R3: LR
+//
+// The traceback routines see morestack on a g0 as being
+// the top of a stack (for example, morestack calling newstack
+// calling the scheduler calling newm calling gc), so we must
+// record an argument size. For that purpose, it has no arguments.
+TEXT runtime·morestack(SB),NOSPLIT,$-8-0
+	// Cannot grow scheduler stack (m->g0).
+	MOVV	g_m(g), R7
+	MOVV	m_g0(R7), R8
+	BNE	g, R8, 2(PC)
+	JAL	runtime·abort(SB)
+
+	// Cannot grow signal stack (m->gsignal).
+	MOVV	m_gsignal(R7), R8
+	BNE	g, R8, 2(PC)
+	JAL	runtime·abort(SB)
+
+	// Called from f.
+	// Set g->sched to context in f.
+	MOVV	REGCTXT, (g_sched+gobuf_ctxt)(g)
+	MOVV	R29, (g_sched+gobuf_sp)(g)
+	MOVV	R31, (g_sched+gobuf_pc)(g)
+	MOVV	R3, (g_sched+gobuf_lr)(g)
+
+	// Called from f.
+	// Set m->morebuf to f's caller.
+	MOVV	R3, (m_morebuf+gobuf_pc)(R7)	// f's caller's PC
+	MOVV	R29, (m_morebuf+gobuf_sp)(R7)	// f's caller's SP
+	MOVV	g, (m_morebuf+gobuf_g)(R7)
+
+	// Call newstack on m->g0's stack.
+	MOVV	m_g0(R7), g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R29
+	JAL	runtime·newstack(SB)
+
+	// Not reached, but make sure the return PC from the call to newstack
+	// is still in this function, and not the beginning of the next.
+	UNDEF
+
+TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-8-0
+	MOVV	R0, REGCTXT
+	JMP	runtime·morestack(SB)
+
+TEXT runtime·stackBarrier(SB),NOSPLIT,$0
+	// We came here via a RET to an overwritten LR.
+	// R1 may be live. Other registers are available.
+
+	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
+	MOVV	(g_stkbar+slice_array)(g), R2
+	MOVV	g_stkbarPos(g), R3
+	MOVV	$stkbar__size, R4
+	MULVU	R3, R4
+	MOVV	LO, R4
+	ADDV	R2, R4
+	MOVV	stkbar_savedLRVal(R4), R4
+	// Record that this stack barrier was hit.
+	ADDV	$1, R3
+	MOVV	R3, g_stkbarPos(g)
+	// Jump to the original return PC.
+	JMP	(R4)
+
+// reflectcall: call a function with the given argument list
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
+// we don't have variable-sized frames, so we use a small number
+// of constant-sized-frame functions to encode a few bits of size in the pc.
+// Caution: ugly multiline assembly macros in your future!
+
+#define DISPATCH(NAME,MAXSIZE)		\
+	MOVV	$MAXSIZE, R23;		\
+	SGTU	R1, R23, R23;		\
+	BNE	R23, 3(PC);			\
+	MOVV	$NAME(SB), R4;	\
+	JMP	(R4)
+// Note: can't just "BR NAME(SB)" - bad inlining results.
+
+TEXT reflect·call(SB), NOSPLIT, $0-0
+	JMP	·reflectcall(SB)
+
+TEXT ·reflectcall(SB), NOSPLIT, $-8-32
+	MOVWU argsize+24(FP), R1
+	// NOTE(rsc): No call16, because CALLFN needs four words
+	// of argument space to invoke callwritebarrier.
+	DISPATCH(runtime·call32, 32)
+	DISPATCH(runtime·call64, 64)
+	DISPATCH(runtime·call128, 128)
+	DISPATCH(runtime·call256, 256)
+	DISPATCH(runtime·call512, 512)
+	DISPATCH(runtime·call1024, 1024)
+	DISPATCH(runtime·call2048, 2048)
+	DISPATCH(runtime·call4096, 4096)
+	DISPATCH(runtime·call8192, 8192)
+	DISPATCH(runtime·call16384, 16384)
+	DISPATCH(runtime·call32768, 32768)
+	DISPATCH(runtime·call65536, 65536)
+	DISPATCH(runtime·call131072, 131072)
+	DISPATCH(runtime·call262144, 262144)
+	DISPATCH(runtime·call524288, 524288)
+	DISPATCH(runtime·call1048576, 1048576)
+	DISPATCH(runtime·call2097152, 2097152)
+	DISPATCH(runtime·call4194304, 4194304)
+	DISPATCH(runtime·call8388608, 8388608)
+	DISPATCH(runtime·call16777216, 16777216)
+	DISPATCH(runtime·call33554432, 33554432)
+	DISPATCH(runtime·call67108864, 67108864)
+	DISPATCH(runtime·call134217728, 134217728)
+	DISPATCH(runtime·call268435456, 268435456)
+	DISPATCH(runtime·call536870912, 536870912)
+	DISPATCH(runtime·call1073741824, 1073741824)
+	MOVV	$runtime·badreflectcall(SB), R4
+	JMP	(R4)
+
+#define CALLFN(NAME,MAXSIZE)			\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+	NO_LOCAL_POINTERS;			\
+	/* copy arguments to stack */		\
+	MOVV	arg+16(FP), R1;			\
+	MOVWU	argsize+24(FP), R2;			\
+	MOVV	R29, R3;				\
+	ADDV	$8, R3;			\
+	ADDV	R3, R2;				\
+	BEQ	R3, R2, 6(PC);				\
+	MOVBU	(R1), R4;			\
+	ADDV	$1, R1;			\
+	MOVBU	R4, (R3);			\
+	ADDV	$1, R3;			\
+	JMP	-5(PC);				\
+	/* call function */			\
+	MOVV	f+8(FP), REGCTXT;			\
+	MOVV	(REGCTXT), R4;			\
+	PCDATA  $PCDATA_StackMapIndex, $0;	\
+	JAL	(R4);				\
+	/* copy return values back */		\
+	MOVV	arg+16(FP), R1;			\
+	MOVWU	n+24(FP), R2;			\
+	MOVWU	retoffset+28(FP), R4;		\
+	MOVV	R29, R3;				\
+	ADDV	R4, R3; 			\
+	ADDV	R4, R1;				\
+	SUBVU	R4, R2;				\
+	ADDV	$8, R3;			\
+	ADDV	R3, R2;				\
+loop:						\
+	BEQ	R3, R2, end;				\
+	MOVBU	(R3), R4;			\
+	ADDV	$1, R3;			\
+	MOVBU	R4, (R1);			\
+	ADDV	$1, R1;			\
+	JMP	loop;				\
+end:						\
+	/* execute write barrier updates */	\
+	MOVV	argtype+0(FP), R5;		\
+	MOVV	arg+16(FP), R1;			\
+	MOVWU	n+24(FP), R2;			\
+	MOVWU	retoffset+28(FP), R4;		\
+	MOVV	R5, 8(R29);			\
+	MOVV	R1, 16(R29);			\
+	MOVV	R2, 24(R29);			\
+	MOVV	R4, 32(R29);			\
+	JAL	runtime·callwritebarrier(SB);	\
+	RET
+
+CALLFN(·call16, 16)
+CALLFN(·call32, 32)
+CALLFN(·call64, 64)
+CALLFN(·call128, 128)
+CALLFN(·call256, 256)
+CALLFN(·call512, 512)
+CALLFN(·call1024, 1024)
+CALLFN(·call2048, 2048)
+CALLFN(·call4096, 4096)
+CALLFN(·call8192, 8192)
+CALLFN(·call16384, 16384)
+CALLFN(·call32768, 32768)
+CALLFN(·call65536, 65536)
+CALLFN(·call131072, 131072)
+CALLFN(·call262144, 262144)
+CALLFN(·call524288, 524288)
+CALLFN(·call1048576, 1048576)
+CALLFN(·call2097152, 2097152)
+CALLFN(·call4194304, 4194304)
+CALLFN(·call8388608, 8388608)
+CALLFN(·call16777216, 16777216)
+CALLFN(·call33554432, 33554432)
+CALLFN(·call67108864, 67108864)
+CALLFN(·call134217728, 134217728)
+CALLFN(·call268435456, 268435456)
+CALLFN(·call536870912, 536870912)
+CALLFN(·call1073741824, 1073741824)
+
+TEXT runtime·procyield(SB),NOSPLIT,$0-0
+	RET
+
+// void jmpdefer(fv, sp);
+// called from deferreturn.
+// 1. grab stored LR for caller
+// 2. sub 8 bytes to get back to JAL deferreturn
+// 3. JMP to fn
+TEXT runtime·jmpdefer(SB), NOSPLIT, $-8-16
+	MOVV	0(R29), R31
+	ADDV	$-8, R31
+
+	MOVV	fv+0(FP), REGCTXT
+	MOVV	argp+8(FP), R29
+	ADDV	$-8, R29
+	NOR	R0, R0	// prevent scheduling
+	MOVV	0(REGCTXT), R4
+	JMP	(R4)
+
+// Save state of caller into g->sched. Smashes R31.
+TEXT gosave<>(SB),NOSPLIT,$-8
+	MOVV	R31, (g_sched+gobuf_pc)(g)
+	MOVV	R29, (g_sched+gobuf_sp)(g)
+	MOVV	R0, (g_sched+gobuf_lr)(g)
+	MOVV	R0, (g_sched+gobuf_ret)(g)
+	MOVV	R0, (g_sched+gobuf_ctxt)(g)
+	RET
+
+// func asmcgocall(fn, arg unsafe.Pointer) int32
+// Call fn(arg) on the scheduler stack,
+// aligned appropriately for the gcc ABI.
+// See cgocall.go for more details.
+TEXT ·asmcgocall(SB),NOSPLIT,$0-20
+	MOVV	fn+0(FP), R25
+	MOVV	arg+8(FP), R4
+
+	MOVV	R29, R3	// save original stack pointer
+	MOVV	g, R2
+
+	// Figure out if we need to switch to m->g0 stack.
+	// We get called to create new OS threads too, and those
+	// come in on the m->g0 stack already.
+	MOVV	g_m(g), R5
+	MOVV	m_g0(R5), R6
+	BEQ	R6, g, g0
+
+	JAL	gosave<>(SB)
+	MOVV	R6, g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R29
+
+	// Now on a scheduling stack (a pthread-created stack).
+g0:
+	// Save room for two of our pointers.
+	ADDV	$-16, R29
+	MOVV	R2, 0(R29)	// save old g on stack
+	MOVV	(g_stack+stack_hi)(R2), R2
+	SUBVU	R3, R2
+	MOVV	R2, 8(R29)	// save depth in old g stack (can't just save SP, as stack might be copied during a callback)
+	JAL	(R25)
+
+	// Restore g, stack pointer. R2 is return value.
+	MOVV	0(R29), g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_stack+stack_hi)(g), R5
+	MOVV	8(R29), R6
+	SUBVU	R6, R5
+	MOVV	R5, R29
+
+	MOVW	R2, ret+16(FP)
+	RET
+
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
+// Turn the fn into a Go func (by taking its address) and call
+// cgocallback_gofunc.
+TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
+	MOVV	$fn+0(FP), R1
+	MOVV	R1, 8(R29)
+	MOVV	frame+8(FP), R1
+	MOVV	R1, 16(R29)
+	MOVV	framesize+16(FP), R1
+	MOVV	R1, 24(R29)
+	MOVV	ctxt+24(FP), R1
+	MOVV	R1, 32(R29)
+	MOVV	$runtime·cgocallback_gofunc(SB), R1
+	JAL	(R1)
+	RET
+
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// See cgocall.go for more details.
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+	NO_LOCAL_POINTERS
+
+	// Load m and g from thread-local storage.
+	MOVB	runtime·iscgo(SB), R1
+	BEQ	R1, nocgo
+	JAL	runtime·load_g(SB)
+nocgo:
+
+	// If g is nil, Go did not create the current thread.
+	// Call needm to obtain one for temporary use.
+	// In this case, we're running on the thread stack, so there's
+	// lots of space, but the linker doesn't know. Hide the call from
+	// the linker analysis by using an indirect call.
+	BEQ	g, needm
+
+	MOVV	g_m(g), R3
+	MOVV	R3, savedm-8(SP)
+	JMP	havem
+
+needm:
+	MOVV	g, savedm-8(SP) // g is zero, so is m.
+	MOVV	$runtime·needm(SB), R4
+	JAL	(R4)
+
+	// Set m->sched.sp = SP, so that if a panic happens
+	// during the function we are about to execute, it will
+	// have a valid SP to run on the g0 stack.
+	// The next few lines (after the havem label)
+	// will save this SP onto the stack and then write
+	// the same SP back to m->sched.sp. That seems redundant,
+	// but if an unrecovered panic happens, unwindm will
+	// restore the g->sched.sp from the stack location
+	// and then systemstack will try to use it. If we don't set it here,
+	// that restored SP will be uninitialized (typically 0) and
+	// will not be usable.
+	MOVV	g_m(g), R3
+	MOVV	m_g0(R3), R1
+	MOVV	R29, (g_sched+gobuf_sp)(R1)
+
+havem:
+	// Now there's a valid m, and we're running on its m->g0.
+	// Save current m->g0->sched.sp on stack and then set it to SP.
+	// Save current sp in m->g0->sched.sp in preparation for
+	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 8(R29) aka savedsp-16(SP).
+	MOVV	m_g0(R3), R1
+	MOVV	(g_sched+gobuf_sp)(R1), R2
+	MOVV	R2, savedsp-16(SP)
+	MOVV	R29, (g_sched+gobuf_sp)(R1)
+
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
+	// We can restore m->curg->sched.sp easily, because calling
+	// runtime.cgocallbackg leaves SP unchanged upon return.
+	// To save m->curg->sched.pc, we push it onto the stack.
+	// This has the added benefit that it looks to the traceback
+	// routine like cgocallbackg is going to return to that
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
+	// so that the traceback will seamlessly trace back into
+	// the earlier calls.
+	//
+	// In the new goroutine, -8(SP) is unused (where SP refers to
+	// m->curg's SP while we're setting it up, before we've adjusted it).
+	MOVV	m_curg(R3), g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R2 // prepare stack as R2
+	MOVV	(g_sched+gobuf_pc)(g), R4
+	MOVV	R4, -24(R2)
+	MOVV    ctxt+24(FP), R1
+	MOVV    R1, -16(R2)
+	MOVV	$-24(R2), R29
+	JAL	runtime·cgocallbackg(SB)
+
+	// Restore g->sched (== m->curg->sched) from saved values.
+	MOVV	0(R29), R4
+	MOVV	R4, (g_sched+gobuf_pc)(g)
+	MOVV	$24(R29), R2
+	MOVV	R2, (g_sched+gobuf_sp)(g)
+
+	// Switch back to m->g0's stack and restore m->g0->sched.sp.
+	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
+	// so we do not have to restore it.)
+	MOVV	g_m(g), R3
+	MOVV	m_g0(R3), g
+	JAL	runtime·save_g(SB)
+	MOVV	(g_sched+gobuf_sp)(g), R29
+	MOVV	savedsp-16(SP), R2
+	MOVV	R2, (g_sched+gobuf_sp)(g)
+
+	// If the m on entry was nil, we called needm above to borrow an m
+	// for the duration of the call. Since the call is over, return it with dropm.
+	MOVV	savedm-8(SP), R3
+	BNE	R3, droppedm
+	MOVV	$runtime·dropm(SB), R4
+	JAL	(R4)
+droppedm:
+
+	// Done!
+	RET
+
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOVV	gg+0(FP), g
+	// This only happens if iscgo, so jump straight to save_g
+	JAL	runtime·save_g(SB)
+	RET
+
+// void setg_gcc(G*); set g called from gcc with g in R1
+TEXT setg_gcc<>(SB),NOSPLIT,$0-0
+	MOVV	R1, g
+	JAL	runtime·save_g(SB)
+	RET
+
+TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
+	MOVV	16(R29), R1		// LR saved by caller
+	MOVV	runtime·stackBarrierPC(SB), R2
+	BNE	R1, R2, nobar
+	// Get original return PC.
+	JAL	runtime·nextBarrierPC(SB)
+	MOVV	8(R29), R1
+nobar:
+	MOVV	R1, ret+8(FP)
+	RET
+
+TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
+	MOVV	pc+8(FP), R1
+	MOVV	16(R29), R2
+	MOVV	runtime·stackBarrierPC(SB), R3
+	BEQ	R2, R3, setbar
+	MOVV	R1, 16(R29)		// set LR in caller
+	RET
+setbar:
+	// Set the stack barrier return PC.
+	MOVV	R1, 8(R29)
+	JAL	runtime·setNextBarrierPC(SB)
+	RET
+
+TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
+	MOVV	argp+0(FP), R1
+	ADDV	$-8, R1
+	MOVV	R1, ret+8(FP)
+	RET
+
+TEXT runtime·abort(SB),NOSPLIT,$-8-0
+	MOVW	(R0), R0
+	UNDEF
+
+// memhash_varlen(p unsafe.Pointer, h seed) uintptr
+// redirects to memhash(p, h, size) using the size
+// stored in the closure.
+TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24
+	GO_ARGS
+	NO_LOCAL_POINTERS
+	MOVV	p+0(FP), R1
+	MOVV	h+8(FP), R2
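+	// compiler stores size at offset 8 in the closure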
+	MOVV	8(REGCTXT), R3
+	MOVV	R1, 8(R29)
+	MOVV	R2, 16(R29)
+	MOVV	R3, 24(R29)
+	JAL	runtime·memhash(SB)
+	MOVV	32(R29), R1
+	MOVV	R1, ret+16(FP)
+	RET
+
+// AES hashing not implemented for mips64
+TEXT runtime·aeshash(SB),NOSPLIT,$-8-0
+	MOVW	(R0), R1
+TEXT runtime·aeshash32(SB),NOSPLIT,$-8-0
+	MOVW	(R0), R1
+TEXT runtime·aeshash64(SB),NOSPLIT,$-8-0
+	MOVW	(R0), R1
+TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0
+	MOVW	(R0), R1
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT,$-8-25
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	size+16(FP), R3
+	ADDV	R1, R3, R4
+loop:
+	BNE	R1, R4, test
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+test:
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, loop
+
+	MOVB	R0, ret+24(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	RET
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT,$40-17
+	MOVV	a+0(FP), R1
+	MOVV	b+8(FP), R2
+	BEQ	R1, R2, eq
+	MOVV	8(REGCTXT), R3    // compiler stores size at offset 8 in the closure
+	MOVV	R1, 8(R29)
+	MOVV	R2, 16(R29)
+	MOVV	R3, 24(R29)
+	JAL	runtime·memequal(SB)
+	MOVBU	32(R29), R1
+	MOVB	R1, ret+16(FP)
+	RET
+eq:
+	MOVV	$1, R1
+	MOVB	R1, ret+16(FP)
+	RET
+
+// eqstring tests whether two strings are equal.
+// The compiler guarantees that strings passed
+// to eqstring have equal length.
+// See runtime_test.go:eqstring_generic for
+// equivalent Go code.
+TEXT runtime·eqstring(SB),NOSPLIT,$0-33
+	MOVV	s1str+0(FP), R1
+	MOVV	s2str+16(FP), R2
+	MOVV	$1, R3
+	MOVB	R3, ret+32(FP)
+	BNE	R1, R2, 2(PC)
+	RET
+	MOVV	s1len+8(FP), R3
+	ADDV	R1, R3, R4
+loop:
+	BNE	R1, R4, 2(PC)
+	RET
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, loop
+	MOVB	R0, ret+32(FP)
+	RET
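
For readers without the tree at hand, the equivalent Go code the comment points to looks like this (a sketch after runtime_test.go:eqstring_generic; the length check is shown even though the compiler guarantees equal lengths before the assembly version is called):

```go
// eqstring_generic is the reference semantics for the assembly above.
func eqstring_generic(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := 0; i < len(s1); i++ {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}
```
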
+
+// TODO: share code with memequal?
+TEXT bytes·Equal(SB),NOSPLIT,$0-49
+	MOVV	a_len+8(FP), R3
+	MOVV	b_len+32(FP), R4
+	BNE	R3, R4, noteq		// unequal lengths are not equal
+
+	MOVV	a+0(FP), R1
+	MOVV	b+24(FP), R2
+	ADDV	R1, R3		// end
+
+loop:
+	BEQ	R1, R3, equal		// reached the end
+	MOVBU	(R1), R6
+	ADDV	$1, R1
+	MOVBU	(R2), R7
+	ADDV	$1, R2
+	BEQ	R6, R7, loop
+
+noteq:
+	MOVB	R0, ret+48(FP)
+	RET
+
+equal:
+	MOVV	$1, R1
+	MOVB	R1, ret+48(FP)
+	RET
+
+TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
+	MOVV	s+0(FP), R1
+	MOVV	s_len+8(FP), R2
+	MOVBU	c+24(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+32(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+32(FP)
+	RET
+
+TEXT strings·IndexByte(SB),NOSPLIT,$0-32
+	MOVV	p+0(FP), R1
+	MOVV	b_len+8(FP), R2
+	MOVBU	c+16(FP), R3	// byte to find
+	MOVV	R1, R4		// store base for later
+	ADDV	R1, R2		// end
+	ADDV	$-1, R1
+
+loop:
+	ADDV	$1, R1
+	BEQ	R1, R2, notfound
+	MOVBU	(R1), R5
+	BNE	R3, R5, loop
+
+	SUBV	R4, R1		// remove base
+	MOVV	R1, ret+24(FP)
+	RET
+
+notfound:
+	MOVV	$-1, R1
+	MOVV	R1, ret+24(FP)
+	RET
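
Both IndexByte bodies above implement the same scan; in Go terms (a sketch; the assembly forms the result as the current pointer minus the base saved in R4):

```go
// indexByte is what the two loops above compute.
func indexByte(s []byte, c byte) int {
	for i := 0; i < len(s); i++ {
		if s[i] == c {
			return i // SUBV R4, R1: current pointer minus saved base
		}
	}
	return -1
}
```
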
+
+TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
+	MOVV	g_m(g), R2
+	MOVWU	m_fastrand(R2), R1
+	ADDU	R1, R1
+	BGEZ	R1, 2(PC)
+	XOR	$0x88888eef, R1
+	MOVW	R1, m_fastrand(R2)
+	MOVW	R1, ret+0(FP)
+	RET
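
fastrand1 is a one-step xorshift-style update on 32 bits of per-m state: double the value and, when the doubled value has its high bit set, fold a constant back in. A Go sketch of the same update:

```go
// fastrand1Sketch mirrors the generator above.
func fastrand1Sketch(state uint32) uint32 {
	state += state // ADDU R1, R1: shift left by one
	if int32(state) < 0 {
		// BGEZ not taken: high bit set, fold in the constant
		state ^= 0x88888eef
	}
	return state // stored back to m.fastrand and returned
}
```
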
+
+TEXT runtime·return0(SB), NOSPLIT, $0
+	MOVW	$0, R1
+	RET
+
+// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
+// Must obey the gcc calling convention.
+TEXT _cgo_topofstack(SB),NOSPLIT,$16
+	// g (R30) and REGTMP (R23) might be clobbered by load_g. They
+	// are callee-save in the gcc calling convention, so save them.
+	MOVV	R23, savedR23-16(SP)
+	MOVV	g, savedG-8(SP)
+
+	JAL	runtime·load_g(SB)
+	MOVV	g_m(g), R1
+	MOVV	m_curg(R1), R1
+	MOVV	(g_stack+stack_hi)(R1), R2 // return value in R2
+
+	MOVV	savedG-8(SP), g
+	MOVV	savedR23-16(SP), R23
+	RET
+
+// The top-most function running on a goroutine
+// returns to goexit+PCQuantum.
+TEXT runtime·goexit(SB),NOSPLIT,$-8-0
+	NOR	R0, R0	// NOP
+	JAL	runtime·goexit1(SB)	// does not return
+	// traceback from goexit1 must hit code range of goexit
+	NOR	R0, R0	// NOP
+
+TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
+	RET
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVW	$1, R1
+	MOVB	R1, ret+0(FP)
+	RET
diff --git a/src/runtime/asm_ppc64x.h b/src/runtime/asm_ppc64x.h
new file mode 100644
index 0000000..5e55055
--- /dev/null
+++ b/src/runtime/asm_ppc64x.h
@@ -0,0 +1,25 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// FIXED_FRAME defines the size of the fixed part of a stack frame. A stack
+// frame looks like this:
+//
+// +---------------------+
+// | local variable area |
+// +---------------------+
+// | argument area       |
+// +---------------------+ <- R1+FIXED_FRAME
+// | fixed area          |
+// +---------------------+ <- R1
+//
+// So a function that sets up a stack frame at all uses as least FIXED_FRAME
+// bytes of stack. This mostly affects assembly that calls other functions
+// with arguments (the arguments should be stored at FIXED_FRAME+0(R1),
+// FIXED_FRAME+8(R1) etc) and some other low-level places.
+//
+// The reason for using a constant is to make supporting PIC easier (although
+// we only support PIC on ppc64le which has a minimum 32 bytes of stack frame,
+// and currently always use that much, PIC on ppc64 would need to use 48).
+
+#define FIXED_FRAME 32
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 0404124..32c63c2 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -8,6 +8,7 @@
 #include "go_tls.h"
 #include "funcdata.h"
 #include "textflag.h"
+#include "asm_ppc64x.h"
 
 TEXT runtime·rt0_go(SB),NOSPLIT,$0
 	// R1 = stack; R3 = argc; R4 = argv; R13 = C TLS base pointer
@@ -15,9 +16,10 @@
 	// initialize essential registers
 	BL	runtime·reginit(SB)
 
-	SUB	$24, R1
-	MOVW	R3, 8(R1) // argc
-	MOVD	R4, 16(R1) // argv
+	SUB	$(FIXED_FRAME+16), R1
+	MOVD	R2, 24(R1)		// stash the TOC pointer away again now we've created a new frame
+	MOVW	R3, FIXED_FRAME+0(R1)	// argc
+	MOVD	R4, FIXED_FRAME+8(R1)	// argv
 
 	// create istack out of the given (operating system) stack.
 	// _cgo_init may update stackguard.
@@ -44,6 +46,7 @@
 	RLDCR	$0, R1, $~15, R1	// 16-byte align
 	BL	(CTR)			// may clobber R0, R3-R12
 	MOVD	R14, R1			// restore stack
+	MOVD	24(R1), R2
 	XOR	R0, R0			// fix R0
 
 nocgo:
@@ -73,8 +76,11 @@
 	MOVDU	R3, -8(R1)
 	MOVDU	R0, -8(R1)
 	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
 	BL	runtime·newproc(SB)
-	ADD	$24, R1
+	ADD	$(16+FIXED_FRAME), R1
 
 	// start this M
 	BL	runtime·mstart(SB)
@@ -85,19 +91,19 @@
 DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
 GLOBL	runtime·mainPC(SB),RODATA,$8
 
-TEXT runtime·breakpoint(SB),NOSPLIT,$-8-0
+TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R0, 2(R0) // TODO: TD
 	RET
 
-TEXT runtime·asminit(SB),NOSPLIT,$-8-0
+TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0
 	RET
 
-TEXT _cgo_reginit(SB),NOSPLIT,$-8-0
+TEXT _cgo_reginit(SB),NOSPLIT|NOFRAME,$0-0
 	// crosscall_ppc64 and crosscall2 need to reginit, but can't
 	// get at the 'runtime.reginit' symbol.
 	BR	runtime·reginit(SB)
 
-TEXT runtime·reginit(SB),NOSPLIT,$-8-0
+TEXT runtime·reginit(SB),NOSPLIT|NOFRAME,$0-0
 	// set R0 to zero, it's expected by the toolchain
 	XOR R0, R0
 	// initialize essential FP registers
@@ -114,7 +120,7 @@
 
 // void gosave(Gobuf*)
 // save state in Gobuf; setjmp
-TEXT runtime·gosave(SB), NOSPLIT, $-8-8
+TEXT runtime·gosave(SB), NOSPLIT|NOFRAME, $0-8
 	MOVD	buf+0(FP), R3
 	MOVD	R1, gobuf_sp(R3)
 	MOVD	LR, R31
@@ -127,7 +133,7 @@
 
 // void gogo(Gobuf*)
 // restore state from Gobuf; longjmp
-TEXT runtime·gogo(SB), NOSPLIT, $-8-8
+TEXT runtime·gogo(SB), NOSPLIT|NOFRAME, $0-8
 	MOVD	buf+0(FP), R5
 	MOVD	gobuf_g(R5), g	// make sure g is not nil
 	BL	runtime·save_g(SB)
@@ -143,15 +149,15 @@
 	MOVD	R0, gobuf_lr(R5)
 	MOVD	R0, gobuf_ctxt(R5)
 	CMP	R0, R0 // set condition codes for == test, needed by stack split
-	MOVD	gobuf_pc(R5), R31
-	MOVD	R31, CTR
+	MOVD	gobuf_pc(R5), R12
+	MOVD	R12, CTR
 	BR	(CTR)
 
 // void mcall(fn func(*g))
 // Switch to m->g0's stack, call fn(g).
-// Fn must never return.  It should gogo(&g->sched)
+// Fn must never return. It should gogo(&g->sched)
 // to keep running g.
-TEXT runtime·mcall(SB), NOSPLIT, $-8-8
+TEXT runtime·mcall(SB), NOSPLIT|NOFRAME, $0-8
 	// Save caller state in g->sched
 	MOVD	R1, (g_sched+gobuf_sp)(g)
 	MOVD	LR, R31
@@ -168,20 +174,29 @@
 	BNE	2(PC)
 	BR	runtime·badmcall(SB)
 	MOVD	fn+0(FP), R11			// context
-	MOVD	0(R11), R4			// code pointer
-	MOVD	R4, CTR
+	MOVD	0(R11), R12			// code pointer
+	MOVD	R12, CTR
 	MOVD	(g_sched+gobuf_sp)(g), R1	// sp = m->g0->sched.sp
 	MOVDU	R3, -8(R1)
 	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
+	MOVDU	R0, -8(R1)
 	BL	(CTR)
+	MOVD	24(R1), R2
 	BR	runtime·badmcall2(SB)
 
 // systemstack_switch is a dummy routine that systemstack leaves at the bottom
-// of the G stack.  We need to distinguish the routine that
+// of the G stack. We need to distinguish the routine that
 // lives at the bottom of the G stack from the one that lives
 // at the top of the system stack because the one at the top of
 // the system stack terminates the stack walk (see topofstack()).
 TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
+	// We have several undefs here so that 16 bytes past
+	// $runtime·systemstack_switch lies within them whether or not the
+	// instructions that derive r2 from r12 are there.
+	UNDEF
+	UNDEF
 	UNDEF
 	BL	(LR)	// make sure this function is not leaf
 	RET
@@ -206,15 +221,15 @@
 
 	// Bad: g is not gsignal, not g0, not curg. What is it?
 	// Hide call from linker nosplit analysis.
-	MOVD	$runtime·badsystemstack(SB), R3
-	MOVD	R3, CTR
+	MOVD	$runtime·badsystemstack(SB), R12
+	MOVD	R12, CTR
 	BL	(CTR)
 
 switch:
-	// save our state in g->sched.  Pretend to
+	// save our state in g->sched. Pretend to
 	// be systemstack_switch if the G stack is scanned.
 	MOVD	$runtime·systemstack_switch(SB), R6
-	ADD	$8, R6	// get past prologue
+	ADD	$16, R6	// get past prologue (including r2-setting instructions when they're there)
 	MOVD	R6, (g_sched+gobuf_pc)(g)
 	MOVD	R1, (g_sched+gobuf_sp)(g)
 	MOVD	R0, (g_sched+gobuf_lr)(g)
@@ -225,16 +240,23 @@
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R3
 	// make it look like mstart called systemstack on g0, to stop traceback
-	SUB	$8, R3
+	SUB	$FIXED_FRAME, R3
 	MOVD	$runtime·mstart(SB), R4
 	MOVD	R4, 0(R3)
 	MOVD	R3, R1
 
 	// call target function
-	MOVD	0(R11), R3	// code pointer
-	MOVD	R3, CTR
+	MOVD	0(R11), R12	// code pointer
+	MOVD	R12, CTR
 	BL	(CTR)
 
+	// restore TOC pointer. It seems unlikely that we will use systemstack
+	// to call a function defined in another module, but the results of
+	// doing so would be so confusing that it's worth doing this.
+	MOVD	g_m(g), R3
+	MOVD	m_curg(R3), g
+	MOVD	(g_sched+gobuf_sp)(g), R3
+	MOVD	24(R3), R2
 	// switch back to g
 	MOVD	g_m(g), R3
 	MOVD	m_curg(R3), g
@@ -245,9 +267,10 @@
 
 noswitch:
 	// already on m stack, just call directly
-	MOVD	0(R11), R3	// code pointer
-	MOVD	R3, CTR
+	MOVD	0(R11), R12	// code pointer
+	MOVD	R12, CTR
 	BL	(CTR)
+	MOVD	24(R1), R2
 	RET
 
 /*
@@ -262,7 +285,7 @@
 // the top of a stack (for example, morestack calling newstack
 // calling the scheduler calling newm calling gc), so we must
 // record an argument size. For that purpose, it has no arguments.
-TEXT runtime·morestack(SB),NOSPLIT,$-8-0
+TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
 	// Cannot grow scheduler stack (m->g0).
 	MOVD	g_m(g), R7
 	MOVD	m_g0(R7), R8
@@ -300,7 +323,7 @@
 	// is still in this function, and not the beginning of the next.
 	UNDEF
 
-TEXT runtime·morestack_noctxt(SB),NOSPLIT,$-8-0
+TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R0, R11
 	BR	runtime·morestack(SB)
 
@@ -332,15 +355,15 @@
 	MOVD	$MAXSIZE, R31;		\
 	CMP	R3, R31;		\
 	BGT	4(PC);			\
-	MOVD	$NAME(SB), R31;	\
-	MOVD	R31, CTR;		\
+	MOVD	$NAME(SB), R12;		\
+	MOVD	R12, CTR;		\
 	BR	(CTR)
 // Note: can't just "BR NAME(SB)" - bad inlining results.
 
 TEXT reflect·call(SB), NOSPLIT, $0-0
 	BR	·reflectcall(SB)
 
-TEXT ·reflectcall(SB), NOSPLIT, $-8-32
+TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
 	MOVWZ argsize+24(FP), R3
 	// NOTE(rsc): No call16, because CALLFN needs four words
 	// of argument space to invoke callwritebarrier.
@@ -370,8 +393,8 @@
 	DISPATCH(runtime·call268435456, 268435456)
 	DISPATCH(runtime·call536870912, 536870912)
 	DISPATCH(runtime·call1073741824, 1073741824)
-	MOVD	$runtime·badreflectcall(SB), R31
-	MOVD	R31, CTR
+	MOVD	$runtime·badreflectcall(SB), R12
+	MOVD	R12, CTR
 	BR	(CTR)
 
 #define CALLFN(NAME,MAXSIZE)			\
@@ -381,7 +404,7 @@
 	MOVD	arg+16(FP), R3;			\
 	MOVWZ	argsize+24(FP), R4;			\
 	MOVD	R1, R5;				\
-	ADD	$(8-1), R5;			\
+	ADD	$(FIXED_FRAME-1), R5;			\
 	SUB	$1, R3;				\
 	ADD	R5, R4;				\
 	CMP	R5, R4;				\
@@ -391,10 +414,11 @@
 	BR	-4(PC);				\
 	/* call function */			\
 	MOVD	f+8(FP), R11;			\
-	MOVD	(R11), R31;			\
-	MOVD	R31, CTR;			\
+	MOVD	(R11), R12;			\
+	MOVD	R12, CTR;			\
 	PCDATA  $PCDATA_StackMapIndex, $0;	\
 	BL	(CTR);				\
+	MOVD	24(R1), R2;			\
 	/* copy return values back */		\
 	MOVD	arg+16(FP), R3;			\
 	MOVWZ	n+24(FP), R4;			\
@@ -403,7 +427,7 @@
 	ADD	R6, R5; 			\
 	ADD	R6, R3;				\
 	SUB	R6, R4;				\
-	ADD	$(8-1), R5;			\
+	ADD	$(FIXED_FRAME-1), R5;			\
 	SUB	$1, R3;				\
 	ADD	R5, R4;				\
 loop:						\
@@ -418,14 +442,13 @@
 	MOVD	arg+16(FP), R3;			\
 	MOVWZ	n+24(FP), R4;			\
 	MOVWZ	retoffset+28(FP), R6;		\
-	MOVD	R7, 8(R1);			\
-	MOVD	R3, 16(R1);			\
-	MOVD	R4, 24(R1);			\
-	MOVD	R6, 32(R1);			\
+	MOVD	R7, FIXED_FRAME+0(R1);			\
+	MOVD	R3, FIXED_FRAME+8(R1);			\
+	MOVD	R4, FIXED_FRAME+16(R1);			\
+	MOVD	R6, FIXED_FRAME+24(R1);			\
 	BL	runtime·callwritebarrier(SB);	\
 	RET
 
-CALLFN(·call16, 16)
 CALLFN(·call32, 32)
 CALLFN(·call64, 64)
 CALLFN(·call128, 128)
@@ -453,239 +476,32 @@
 CALLFN(·call536870912, 536870912)
 CALLFN(·call1073741824, 1073741824)
 
-// bool cas(uint32 *ptr, uint32 old, uint32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·cas(SB), NOSPLIT, $0-17
-	MOVD	ptr+0(FP), R3
-	MOVWZ	old+8(FP), R4
-	MOVWZ	new+12(FP), R5
-cas_again:
-	SYNC
-	LWAR	(R3), R6
-	CMPW	R6, R4
-	BNE	cas_fail
-	STWCCC	R5, (R3)
-	BNE	cas_again
-	MOVD	$1, R3
-	SYNC
-	ISYNC
-	MOVB	R3, ret+16(FP)
-	RET
-cas_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
-
-// bool	runtime·cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-//	if(*val == *old){
-//		*val = new;
-//		return 1;
-//	} else {
-//		return 0;
-//	}
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVD	ptr+0(FP), R3
-	MOVD	old+8(FP), R4
-	MOVD	new+16(FP), R5
-cas64_again:
-	SYNC
-	LDAR	(R3), R6
-	CMP	R6, R4
-	BNE	cas64_fail
-	STDCCC	R5, (R3)
-	BNE	cas64_again
-	MOVD	$1, R3
-	SYNC
-	ISYNC
-	MOVB	R3, ret+24(FP)
-	RET
-cas64_fail:
-	MOVD	$0, R3
-	BR	-5(PC)
-
-TEXT runtime·casuintptr(SB), NOSPLIT, $0-25
-	BR	runtime·cas64(SB)
-
-TEXT runtime·atomicloaduintptr(SB), NOSPLIT, $-8-16
-	BR	runtime·atomicload64(SB)
-
-TEXT runtime·atomicloaduint(SB), NOSPLIT, $-8-16
-	BR	runtime·atomicload64(SB)
-
-TEXT runtime·atomicstoreuintptr(SB), NOSPLIT, $0-16
-	BR	runtime·atomicstore64(SB)
-
-// bool casp(void **val, void *old, void *new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	} else
-//		return 0;
-TEXT runtime·casp1(SB), NOSPLIT, $0-25
-	BR runtime·cas64(SB)
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-//	*val += delta;
-//	return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-	MOVD	ptr+0(FP), R4
-	MOVW	delta+8(FP), R5
-	SYNC
-	LWAR	(R4), R3
-	ADD	R5, R3
-	STWCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
-	ISYNC
-	MOVW	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-	MOVD	ptr+0(FP), R4
-	MOVD	delta+8(FP), R5
-	SYNC
-	LDAR	(R4), R3
-	ADD	R5, R3
-	STDCCC	R3, (R4)
-	BNE	-4(PC)
-	SYNC
-	ISYNC
-	MOVD	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-	MOVD	ptr+0(FP), R4
-	MOVW	new+8(FP), R5
-	SYNC
-	LWAR	(R4), R3
-	STWCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
-	ISYNC
-	MOVW	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-	MOVD	ptr+0(FP), R4
-	MOVD	new+8(FP), R5
-	SYNC
-	LDAR	(R4), R3
-	STDCCC	R5, (R4)
-	BNE	-3(PC)
-	SYNC
-	ISYNC
-	MOVD	R3, ret+16(FP)
-	RET
-
-TEXT runtime·xchgp1(SB), NOSPLIT, $0-24
-	BR	runtime·xchg64(SB)
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	BR	runtime·xchg64(SB)
-
 TEXT runtime·procyield(SB),NOSPLIT,$0-0
 	RET
 
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	BR	runtime·atomicstore64(SB)
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVD	ptr+0(FP), R3
-	MOVW	val+8(FP), R4
-	SYNC
-	MOVW	R4, 0(R3)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVD	ptr+0(FP), R3
-	MOVD	val+8(FP), R4
-	SYNC
-	MOVD	R4, 0(R3)
-	RET
-
-// void	runtime·atomicor8(byte volatile*, byte);
-TEXT runtime·atomicor8(SB), NOSPLIT, $0-9
-	MOVD	ptr+0(FP), R3
-	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6
-	SLD	R6, R4, R4
-
-again:
-	SYNC
-	LWAR	(R5), R6
-	OR	R4, R6
-	STWCCC	R6, (R5)
-	BNE	again
-	SYNC
-	ISYNC
-	RET
-
-// void	runtime·atomicand8(byte volatile*, byte);
-TEXT runtime·atomicand8(SB), NOSPLIT, $0-9
-	MOVD	ptr+0(FP), R3
-	MOVBZ	val+8(FP), R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	// R5 = (R3 << 0) & ~3
-	RLDCR	$0, R3, $~3, R5
-	// Compute val shift.
-#ifdef GOARCH_ppc64
-	// Big endian.  ptr = ptr ^ 3
-	XOR	$3, R3
-#endif
-	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
-	RLDC	$3, R3, $(3*8), R6
-	// Shift val for aligned ptr.  R4 = val << R6 | ^(0xFF << R6)
-	MOVD	$0xFF, R7
-	SLD	R6, R4
-	SLD	R6, R7
-	XOR $-1, R7
-	OR	R7, R4
-again:
-	SYNC
-	LWAR	(R5), R6
-	AND	R4, R6
-	STWCCC	R6, (R5)
-	BNE	again
-	SYNC
-	ISYNC
-	RET
-
 // void jmpdefer(fv, sp);
 // called from deferreturn.
 // 1. grab stored LR for caller
-// 2. sub 4 bytes to get back to BL deferreturn
+// 2. sub 8 bytes to get back to either nop or toc reload before deferreturn
 // 3. BR to fn
-TEXT runtime·jmpdefer(SB), NOSPLIT, $-8-16
+// When dynamically linking Go, it is not sufficient to rewind to the BL
+// deferreturn -- we might be jumping between modules and so we need to reset
+// the TOC pointer in r2. To do this, codegen inserts MOVD 24(R1), R2 *before*
+// the BL deferreturn and jmpdefer rewinds to that.
+TEXT runtime·jmpdefer(SB), NOSPLIT|NOFRAME, $0-16
 	MOVD	0(R1), R31
-	SUB	$4, R31
+	SUB	$8, R31
 	MOVD	R31, LR
 
 	MOVD	fv+0(FP), R11
 	MOVD	argp+8(FP), R1
-	SUB	$8, R1
-	MOVD	0(R11), R3
-	MOVD	R3, CTR
+	SUB	$FIXED_FRAME, R1
+	MOVD	0(R11), R12
+	MOVD	R12, CTR
 	BR	(CTR)
 
 // Save state of caller into g->sched. Smashes R31.
-TEXT gosave<>(SB),NOSPLIT,$-8
+TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0
 	MOVD	LR, R31
 	MOVD	R31, (g_sched+gobuf_pc)(g)
 	MOVD	R1, (g_sched+gobuf_sp)(g)
@@ -702,7 +518,7 @@
 	MOVD	fn+0(FP), R3
 	MOVD	arg+8(FP), R4
 
-	MOVD	R1, R2		// save original stack pointer
+	MOVD	R1, R7		// save original stack pointer
 	MOVD	g, R5
 
 	// Figure out if we need to switch to m->g0 stack.
@@ -725,7 +541,7 @@
 	RLDCR	$0, R1, $~15, R1	// 16-byte alignment for gcc ABI
 	MOVD	R5, 40(R1)	// save old g on stack
 	MOVD	(g_stack+stack_hi)(R5), R5
-	SUB	R2, R5
+	SUB	R7, R5
 	MOVD	R5, 32(R1)	// save depth in old g stack (can't just save SP, as stack might be copied during a callback)
 	MOVD	R0, 0(R1)	// clear back chain pointer (TODO can we give it real back trace information?)
 	// This is a "global call", so put the global entry point in r12
@@ -737,8 +553,13 @@
 	// C code can clobber R0, so set it back to 0.  F27-F31 are
 	// callee save, so we don't need to recover those.
 	XOR	R0, R0
-	// Restore g, stack pointer.  R3 is errno, so don't touch it
+	// Restore g, stack pointer, toc pointer.
+	// R3 is errno, so don't touch it
 	MOVD	40(R1), g
+	MOVD    (g_stack+stack_hi)(g), R5
+	MOVD    32(R1), R6
+	SUB     R6, R5
+	MOVD    24(R5), R2
 	BL	runtime·save_g(SB)
 	MOVD	(g_stack+stack_hi)(g), R5
 	MOVD	32(R1), R6
@@ -748,24 +569,26 @@
 	MOVW	R3, ret+16(FP)
 	RET
 
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
 // Turn the fn into a Go func (by taking its address) and call
 // cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$24-24
+TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
 	MOVD	$fn+0(FP), R3
-	MOVD	R3, 8(R1)
+	MOVD	R3, FIXED_FRAME+0(R1)
 	MOVD	frame+8(FP), R3
-	MOVD	R3, 16(R1)
+	MOVD	R3, FIXED_FRAME+8(R1)
 	MOVD	framesize+16(FP), R3
-	MOVD	R3, 24(R1)
-	MOVD	$runtime·cgocallback_gofunc(SB), R3
-	MOVD	R3, CTR
+	MOVD	R3, FIXED_FRAME+16(R1)
+	MOVD	ctxt+24(FP), R3
+	MOVD	R3, FIXED_FRAME+24(R1)
+	MOVD	$runtime·cgocallback_gofunc(SB), R12
+	MOVD	R12, CTR
 	BL	(CTR)
 	RET
 
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize)
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
 // See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-24
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
 	NO_LOCAL_POINTERS
 
 	// Load m and g from thread-local storage.
@@ -781,10 +604,16 @@
 	// lots of space, but the linker doesn't know. Hide the call from
 	// the linker analysis by using an indirect call.
 	CMP	g, $0
-	BNE	havem
+	BEQ	needm
+
+	MOVD	g_m(g), R8
+	MOVD	R8, savedm-8(SP)
+	BR	havem
+
+needm:
 	MOVD	g, savedm-8(SP) // g is zero, so is m.
-	MOVD	$runtime·needm(SB), R3
-	MOVD	R3, CTR
+	MOVD	$runtime·needm(SB), R12
+	MOVD	R12, CTR
 	BL	(CTR)
 
 	// Set m->sched.sp = SP, so that if a panic happens
@@ -798,13 +627,11 @@
 	// and then systemstack will try to use it. If we don't set it here,
 	// that restored SP will be uninitialized (typically 0) and
 	// will not be usable.
-	MOVD	g_m(g), R3
-	MOVD	m_g0(R3), R3
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), R3
 	MOVD	R1, (g_sched+gobuf_sp)(R3)
 
 havem:
-	MOVD	g_m(g), R8
-	MOVD	R8, savedm-8(SP)
 	// Now there's a valid m, and we're running on its m->g0.
 	// Save current m->g0->sched.sp on stack and then set it to SP.
 	// Save current sp in m->g0->sched.sp in preparation for
@@ -829,19 +656,22 @@
 	// so that the traceback will seamlessly trace back into
 	// the earlier calls.
 	//
-	// In the new goroutine, -16(SP) and -8(SP) are unused.
+	// In the new goroutine, -8(SP) is unused (where SP refers to
+	// m->curg's SP while we're setting it up, before we've adjusted it).
 	MOVD	m_curg(R8), g
 	BL	runtime·save_g(SB)
 	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
 	MOVD	(g_sched+gobuf_pc)(g), R5
-	MOVD	R5, -24(R4)
-	MOVD	$-24(R4), R1
+	MOVD	R5, -(FIXED_FRAME+16)(R4)
+	MOVD	ctxt+24(FP), R3
+	MOVD	R3, -16(R4)
+	MOVD	$-(FIXED_FRAME+16)(R4), R1
 	BL	runtime·cgocallbackg(SB)
 
 	// Restore g->sched (== m->curg->sched) from saved values.
 	MOVD	0(R1), R5
 	MOVD	R5, (g_sched+gobuf_pc)(g)
-	MOVD	$24(R1), R4
+	MOVD	$(FIXED_FRAME+16)(R1), R4
 	MOVD	R4, (g_sched+gobuf_sp)(g)
 
 	// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -859,8 +689,8 @@
 	MOVD	savedm-8(SP), R6
 	CMP	R6, $0
 	BNE	droppedm
-	MOVD	$runtime·dropm(SB), R3
-	MOVD	R3, CTR
+	MOVD	$runtime·dropm(SB), R12
+	MOVD	R12, CTR
 	BL	(CTR)
 droppedm:
 
@@ -876,7 +706,7 @@
 
 // void setg_gcc(G*); set g in C TLS.
 // Must obey the gcc calling convention.
-TEXT setg_gcc<>(SB),NOSPLIT,$-8-0
+TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0
 	// The standard prologue clobbers R31, which is callee-save in
 	// the C ABI, so we have to use NOFRAME and save LR ourselves.
 	MOVD	LR, R4
@@ -893,38 +723,38 @@
 	RET
 
 TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
-	MOVD	16(R1), R3		// LR saved by caller
+	MOVD	FIXED_FRAME+8(R1), R3		// LR saved by caller
 	MOVD	runtime·stackBarrierPC(SB), R4
 	CMP	R3, R4
 	BNE	nobar
 	// Get original return PC.
 	BL	runtime·nextBarrierPC(SB)
-	MOVD	8(R1), R3
+	MOVD	FIXED_FRAME+0(R1), R3
 nobar:
 	MOVD	R3, ret+8(FP)
 	RET
 
 TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
 	MOVD	pc+8(FP), R3
-	MOVD	16(R1), R4
+	MOVD	FIXED_FRAME+8(R1), R4
 	MOVD	runtime·stackBarrierPC(SB), R5
 	CMP	R4, R5
 	BEQ	setbar
-	MOVD	R3, 16(R1)		// set LR in caller
+	MOVD	R3, FIXED_FRAME+8(R1)		// set LR in caller
 	RET
 setbar:
 	// Set the stack barrier return PC.
-	MOVD	R3, 8(R1)
+	MOVD	R3, FIXED_FRAME+0(R1)
 	BL	runtime·setNextBarrierPC(SB)
 	RET
 
 TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
 	MOVD	argp+0(FP), R3
-	SUB	$8, R3
+	SUB	$FIXED_FRAME, R3
 	MOVD	R3, ret+8(FP)
 	RET
 
-TEXT runtime·abort(SB),NOSPLIT,$-8-0
+TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R0
 	UNDEF
 
@@ -952,44 +782,31 @@
 	MOVD	p+0(FP), R3
 	MOVD	h+8(FP), R4
 	MOVD	8(R11), R5
-	MOVD	R3, 8(R1)
-	MOVD	R4, 16(R1)
-	MOVD	R5, 24(R1)
+	MOVD	R3, FIXED_FRAME+0(R1)
+	MOVD	R4, FIXED_FRAME+8(R1)
+	MOVD	R5, FIXED_FRAME+16(R1)
 	BL	runtime·memhash(SB)
-	MOVD	32(R1), R3
+	MOVD	FIXED_FRAME+24(R1), R3
 	MOVD	R3, ret+16(FP)
 	RET
 
 // AES hashing not implemented for ppc64
-TEXT runtime·aeshash(SB),NOSPLIT,$-8-0
+TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1
-TEXT runtime·aeshash32(SB),NOSPLIT,$-8-0
+TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1
-TEXT runtime·aeshash64(SB),NOSPLIT,$-8-0
+TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1
-TEXT runtime·aeshashstr(SB),NOSPLIT,$-8-0
+TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
 	MOVW	(R0), R1
 
-TEXT runtime·memeq(SB),NOSPLIT,$-8-25
-	MOVD	a+0(FP), R3
-	MOVD	b+8(FP), R4
-	MOVD	size+16(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	test
-	MOVD	$1, R3
-	MOVB	R3, ret+24(FP)
-	RET
-test:
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
+TEXT runtime·memequal(SB),NOSPLIT,$0-25
+	MOVD    a+0(FP), R3
+	MOVD    b+8(FP), R4
+	MOVD    size+16(FP), R5
 
-	MOVB	R0, ret+24(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB    R9, ret+24(FP)
 	RET
 
 // memequal_varlen(a, b unsafe.Pointer) bool
@@ -999,75 +816,129 @@
 	CMP	R3, R4
 	BEQ	eq
 	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
-	MOVD	R3, 8(R1)
-	MOVD	R4, 16(R1)
-	MOVD	R5, 24(R1)
-	BL	runtime·memeq(SB)
-	MOVBZ	32(R1), R3
-	MOVB	R3, ret+16(FP)
+	BL	runtime·memeqbody(SB)
+	MOVB	R9, ret+16(FP)
 	RET
 eq:
 	MOVD	$1, R3
 	MOVB	R3, ret+16(FP)
 	RET
 
+// Do an efficient memequal for ppc64
+// for reuse where possible.
+// R3 = s1
+// R4 = s2
+// R5 = len
+// R9 = return value
+// R6, R7 clobbered
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD    R5,CTR
+	CMP     R5,$8		// only optimize >=8
+	BLT     simplecheck
+	DCBT	(R3)		// cache hint
+	DCBT	(R4)
+	CMP	R5,$32		// optimize >= 32
+	MOVD	R5,R6		// needed by the setup8a branch
+	BLT	setup8a		// 8 byte moves only
+setup32a:                       // 8 byte aligned, >= 32 bytes
+	SRADCC  $5,R5,R6        // number of 32 byte chunks to compare
+	MOVD	R6,CTR
+loop32a:
+	MOVD    0(R3),R6        // doublewords to compare
+	MOVD    0(R4),R7
+	MOVD	8(R3),R8	//
+	MOVD	8(R4),R9
+	CMP     R6,R7           // bytes match?
+	BNE     noteq
+	MOVD	16(R3),R6
+	MOVD	16(R4),R7
+	CMP     R8,R9		// bytes match?
+	MOVD	24(R3),R8
+	MOVD	24(R4),R9
+	BNE     noteq
+	CMP     R6,R7           // bytes match?
+	BNE	noteq
+	ADD     $32,R3		// bump up to next 32
+	ADD     $32,R4
+	CMP     R8,R9           // bytes match?
+	BC      8,2,loop32a	// br ctr and cr
+	BNE	noteq
+	ANDCC	$24,R5,R6       // Any 8 byte chunks?
+	BEQ	leftover	// and result is 0
+setup8a:
+	SRADCC  $3,R6,R6        // get the 8 byte count
+	BEQ	leftover	// shifted value is 0
+	MOVD    R6,CTR
+loop8:
+	MOVD    0(R3),R6        // doublewords to compare
+	ADD	$8,R3
+	MOVD    0(R4),R7
+	ADD     $8,R4
+	CMP     R6,R7           // match?
+	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BNE     noteq
+leftover:
+	ANDCC   $7,R5,R6        // check for leftover bytes
+	BEQ     equal
+	MOVD    R6,CTR
+	BR	simple
+simplecheck:
+	CMP	R5,$0
+	BEQ	equal
+simple:
+	MOVBZ   0(R3), R6
+	ADD	$1,R3
+	MOVBZ   0(R4), R7
+	ADD     $1,R4
+	CMP     R6, R7
+	BNE     noteq
+	BC      8,2,simple
+	BNE	noteq
+	BR	equal
+noteq:
+	MOVD    $0, R9
+	RET
+equal:
+	MOVD    $1, R9
+	RET
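
The structure of memeqbody is easier to see in Go. Below is a sketch of the tiering, collapsing the unrolled 32-byte case (loop32a) into the 8-byte loop; lengths are already known to be equal, and byte order is irrelevant for an equality test, so LittleEndian is used throughout:

```go
package main

import "encoding/binary"

// memeqSketch mirrors the tiering of runtime·memeqbody above:
// doubleword (8-byte) compares while at least 8 bytes remain,
// then a plain byte loop for the leftover.
func memeqSketch(a, b []byte) bool {
	i := 0
	for len(a)-i >= 8 { // loop8 (and, unrolled 4x, loop32a)
		if binary.LittleEndian.Uint64(a[i:]) != binary.LittleEndian.Uint64(b[i:]) {
			return false
		}
		i += 8
	}
	for ; i < len(a); i++ { // "simple": leftover bytes
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
```
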
+
 // eqstring tests whether two strings are equal.
 // The compiler guarantees that strings passed
 // to eqstring have equal length.
 // See runtime_test.go:eqstring_generic for
 // equivalent Go code.
 TEXT runtime·eqstring(SB),NOSPLIT,$0-33
-	MOVD	s1str+0(FP), R3
-	MOVD	s2str+16(FP), R4
-	MOVD	$1, R5
-	MOVB	R5, ret+32(FP)
-	CMP	R3, R4
-	BNE	2(PC)
+	MOVD    s1str+0(FP), R3
+	MOVD    s2str+16(FP), R4
+	MOVD    $1, R5
+	MOVB    R5, ret+32(FP)
+	CMP     R3, R4
+	BNE     2(PC)
 	RET
-	MOVD	s1len+8(FP), R5
-	SUB	$1, R3
-	SUB	$1, R4
-	ADD	R3, R5, R8
-loop:
-	CMP	R3, R8
-	BNE	2(PC)
-	RET
-	MOVBZU	1(R3), R6
-	MOVBZU	1(R4), R7
-	CMP	R6, R7
-	BEQ	loop
-	MOVB	R0, ret+32(FP)
+	MOVD    s1len+8(FP), R5
+	BL      runtime·memeqbody(SB)
+	MOVB    R9, ret+32(FP)
 	RET
 
-// TODO: share code with memeq?
 TEXT bytes·Equal(SB),NOSPLIT,$0-49
-	MOVD	a_len+8(FP), R3
-	MOVD	b_len+32(FP), R4
-
-	CMP	R3, R4		// unequal lengths are not equal
+	MOVD	a_len+8(FP), R4
+	MOVD	b_len+32(FP), R5
+	CMP	R5, R4		// unequal lengths are not equal
 	BNE	noteq
+	MOVD	a+0(FP), R3
+	MOVD	b+24(FP), R4
+	BL	runtime·memeqbody(SB)
 
-	MOVD	a+0(FP), R5
-	MOVD	b+24(FP), R6
-	SUB	$1, R5
-	SUB	$1, R6
-	ADD	R5, R3		// end-1
-
-loop:
-	CMP	R5, R3
-	BEQ	equal		// reached the end
-	MOVBZU	1(R5), R4
-	MOVBZU	1(R6), R7
-	CMP	R4, R7
-	BEQ	loop
+	MOVBZ	R9,ret+48(FP)
+	RET
 
 noteq:
-	MOVBZ	R0, ret+48(FP)
+	MOVBZ	$0,ret+48(FP)
 	RET
 
 equal:
-	MOVD	$1, R3
-	MOVBZ	R3, ret+48(FP)
+	MOVD	$1,R3
+	MOVBZ	R3,ret+48(FP)
 	RET
 
 TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
@@ -1118,6 +989,65 @@
 	MOVD	R3, ret+24(FP)
 	RET
 
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	s1_base+0(FP), R5
+	MOVD	s1_len+8(FP), R3
+	MOVD	s2_base+16(FP), R6
+	MOVD	s2_len+24(FP), R4
+	MOVD	$ret+32(FP), R7
+	BR	runtime·cmpbody<>(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	s1+0(FP), R5
+	MOVD	s1+8(FP), R3
+	MOVD	s2+24(FP), R6
+	MOVD	s2+32(FP), R4
+	MOVD	$ret+48(FP), R7
+	BR	runtime·cmpbody<>(SB)
+
+// On entry:
+// R3 is the length of s1
+// R4 is the length of s2
+// R5 points to the start of s1
+// R6 points to the start of s2
+// R7 points to return value (-1/0/1 will be written here)
+//
+// On exit:
+// R5, R6, R8, R9 and R10 are clobbered
+TEXT runtime·cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
+	CMP	R5, R6
+	BEQ	samebytes // same starting pointers; compare lengths
+	SUB	$1, R5
+	SUB	$1, R6
+	MOVD	R4, R8
+	CMP	R3, R4
+	BGE	2(PC)
+	MOVD	R3, R8	// R8 is min(R3, R4)
+	ADD	R5, R8	// R5 is current byte in s1, R8 is last byte in s1 to compare
+loop:
+	CMP	R5, R8
+	BEQ	samebytes // all compared bytes were the same; compare lengths
+	MOVBZU	1(R5), R9
+	MOVBZU	1(R6), R10
+	CMP	R9, R10
+	BEQ	loop
+	// bytes differed
+	MOVD	$1, R4
+	BGT	2(PC)
+	NEG	R4
+	MOVD	R4, (R7)
+	RET
+samebytes:
+	MOVD	$1, R8
+	CMP	R3, R4
+	BNE	3(PC)
+	MOVD	R0, (R7)
+	RET
+	BGT	2(PC)
+	NEG	R8
+	MOVD	R8, (R7)
+	RET
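
The contract of cmpbody, restated as a Go sketch (the assembly stores the -1/0/1 result through the pointer in R7 rather than returning it):

```go
// cmpSketch: compare the common prefix, then break ties on length.
func cmpSketch(s1, s2 []byte) int {
	n := len(s1)
	if len(s2) < n {
		n = len(s2) // R8 is min(R3, R4)
	}
	for i := 0; i < n; i++ {
		if s1[i] != s2[i] { // bytes differed
			if s1[i] < s2[i] {
				return -1
			}
			return 1
		}
	}
	// samebytes: all compared bytes were the same; shorter sorts first
	if len(s1) < len(s2) {
		return -1
	}
	if len(s1) > len(s2) {
		return 1
	}
	return 0
}
```
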
+
 TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
 	MOVD	g_m(g), R4
 	MOVWZ	m_fastrand(R4), R3
@@ -1135,7 +1065,7 @@
 
 // Called from cgo wrappers, this function returns g->m->curg.stack.hi.
 // Must obey the gcc calling convention.
-TEXT _cgo_topofstack(SB),NOSPLIT,$-8
+TEXT _cgo_topofstack(SB),NOSPLIT|NOFRAME,$0
 	// g (R30) and R31 are callee-save in the C ABI, so save them
 	MOVD	g, R4
 	MOVD	R31, R5
@@ -1153,8 +1083,16 @@
 
 // The top-most function running on a goroutine
 // returns to goexit+PCQuantum.
-TEXT runtime·goexit(SB),NOSPLIT,$-8-0
-	MOVD	R0, R0	// NOP
+//
+// When dynamically linking Go, it can be returned to from a function
+// implemented in a different module and so needs to reload the TOC pointer
+// from the stack (although this function declares that it does not set up a
+// frame, newproc1 does in fact allocate one for goexit and saves the TOC
+// pointer in the correct place).
+// goexit+_PCQuantum is halfway through the usual global entry point prologue
+// that derives r2 from r12 which is a bit silly, but not harmful.
+TEXT runtime·goexit(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	24(R1), R2
 	BL	runtime·goexit1(SB)	// does not return
 	// traceback from goexit1 must hit code range of goexit
 	MOVD	R0, R0	// NOP
@@ -1170,3 +1108,30 @@
 
 TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
 	RET
+
+TEXT runtime·sigreturn(SB),NOSPLIT,$0-8
+	RET
+
+// prepGoExitFrame saves the current TOC pointer (i.e. the TOC pointer for the
+// module containing runtime) to the frame that goexit will execute in when
+// the goroutine exits. It's implemented in assembly mainly because that's the
+// easiest way to get access to R2.
+TEXT runtime·prepGoExitFrame(SB),NOSPLIT,$0-8
+	MOVD	sp+0(FP), R3
+	MOVD	R2, 24(R3)
+	RET
+
+TEXT runtime·addmoduledata(SB),NOSPLIT|NOFRAME,$0-0
+	ADD	$-8, R1
+	MOVD	R31, 0(R1)
+	MOVD	runtime·lastmoduledatap(SB), R4
+	MOVD	R3, moduledata_next(R4)
+	MOVD	R3, runtime·lastmoduledatap(SB)
+	MOVD	0(R1), R31
+	ADD	$8, R1
+	RET
+
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVW	$1, R3
+	MOVB	R3, ret+0(FP)
+	RET
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
new file mode 100644
index 0000000..896ccde
--- /dev/null
+++ b/src/runtime/asm_s390x.s
@@ -0,0 +1,1135 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// Indicates the status of the vector facility:
+// -1:	init value
+//  0:	vector not installed
+//  1:	vector installed and enabled
+//  2:	vector installed but not enabled
+
+DATA runtime·vectorfacility+0x00(SB)/4, $-1
+GLOBL runtime·vectorfacility(SB), NOPTR, $4
+
+TEXT runtime·checkvectorfacility(SB),NOSPLIT,$32-0
+	MOVD    $2, R0
+	MOVD	R1, tmp-32(SP)
+	MOVD    $x-24(SP), R1
+//      STFLE   0(R1)
+	WORD    $0xB2B01000
+	MOVBZ   z-8(SP), R1
+	AND     $0x40, R1
+	BNE     vectorinstalled
+	MOVB    $0, runtime·vectorfacility(SB) // vector not installed
+	MOVD	tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB   $0, $0xF, V16
+	VLGVB   $0, V16, R0
+	CMPBEQ  R0, $0xF, vectorenabled
+	MOVB    $2, runtime·vectorfacility(SB) // vector installed but not enabled
+	MOVD    tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
+vectorenabled:
+	MOVB    $1, runtime·vectorfacility(SB) // vector installed and enabled
+	MOVD    tmp-32(SP), R1
+	MOVD    $0, R0
+	RET
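
The probe is run at most once; callers consult the cached tri-state. A Go sketch of that protocol, with probeVector as a stub standing in for the STFLE/VLEIB sequence above (only the tri-state values come from the assembly):

```go
// Tri-state cache, matching runtime·vectorfacility above:
// -1 unchecked, 0 absent, 1 enabled, 2 installed but not enabled.
var vectorfacility int32 = -1

func probeVector() int32 { return 0 } // stub for the assembly probe

func hasVector() bool {
	if vectorfacility == -1 {
		vectorfacility = probeVector() // one-time check
	}
	return vectorfacility == 1
}
```
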
+
+TEXT runtime·rt0_go(SB),NOSPLIT,$0
+	// R2 = argc; R3 = argv; R11 = temp; R13 = g; R15 = stack pointer
+	// C TLS base pointer in AR0:AR1
+
+	// initialize essential registers
+	XOR	R0, R0
+
+	SUB	$24, R15
+	MOVW	R2, 8(R15) // argc
+	MOVD	R3, 16(R15) // argv
+
+	// create istack out of the given (operating system) stack.
+	// _cgo_init may update stackguard.
+	MOVD	$runtime·g0(SB), g
+	MOVD	R15, R11
+	SUB	$(64*1024), R11
+	MOVD	R11, g_stackguard0(g)
+	MOVD	R11, g_stackguard1(g)
+	MOVD	R11, (g_stack+stack_lo)(g)
+	MOVD	R15, (g_stack+stack_hi)(g)
+
+	// if there is a _cgo_init, call it using the gcc ABI.
+	MOVD	_cgo_init(SB), R11
+	CMPBEQ	R11, $0, nocgo
+	MOVW	AR0, R4			// (AR0 << 32 | AR1) is the TLS base pointer; MOVD is translated to EAR
+	SLD	$32, R4, R4
+	MOVW	AR1, R4			// arg 2: TLS base pointer
+	MOVD	$setg_gcc<>(SB), R3 	// arg 1: setg
+	MOVD	g, R2			// arg 0: G
+	// C functions expect 160 bytes of space on the caller's stack frame
+	// and an 8-byte aligned stack pointer
+	MOVD	R15, R9			// save current stack (R9 is preserved in the Linux ABI)
+	SUB	$160, R15		// reserve 160 bytes
+	MOVD    $~7, R6
+	AND 	R6, R15			// 8-byte align
+	BL	R11			// this call clobbers volatile registers according to Linux ABI (R0-R5, R14)
+	MOVD	R9, R15			// restore stack
+	XOR	R0, R0			// zero R0
+
+nocgo:
+	// update stackguard after _cgo_init
+	MOVD	(g_stack+stack_lo)(g), R2
+	ADD	$const__StackGuard, R2
+	MOVD	R2, g_stackguard0(g)
+	MOVD	R2, g_stackguard1(g)
+
+	// set the per-goroutine and per-mach "registers"
+	MOVD	$runtime·m0(SB), R2
+
+	// save m->g0 = g0
+	MOVD	g, m_g0(R2)
+	// save m0 to g0->m
+	MOVD	R2, g_m(g)
+
+	BL	runtime·check(SB)
+
+	// argc/argv are already prepared on stack
+	BL	runtime·args(SB)
+	BL	runtime·osinit(SB)
+	BL	runtime·schedinit(SB)
+
+	// create a new goroutine to start program
+	MOVD	$runtime·mainPC(SB), R2		// entry
+	SUB     $24, R15
+	MOVD 	R2, 16(R15)
+	MOVD 	R0, 8(R15)
+	MOVD 	R0, 0(R15)
+	BL	runtime·newproc(SB)
+	ADD	$24, R15
+
+	// start this M
+	BL	runtime·mstart(SB)
+
+	MOVD	R0, 1(R0)
+	RET
+
+DATA	runtime·mainPC+0(SB)/8,$runtime·main(SB)
+GLOBL	runtime·mainPC(SB),RODATA,$8
+
+TEXT runtime·breakpoint(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	R0, 2(R0)
+	RET
+
+TEXT runtime·asminit(SB),NOSPLIT|NOFRAME,$0-0
+	RET
+
+/*
+ *  go-routine
+ */
+
+// void gosave(Gobuf*)
+// save state in Gobuf; setjmp
+TEXT runtime·gosave(SB), NOSPLIT, $-8-8
+	MOVD	buf+0(FP), R3
+	MOVD	R15, gobuf_sp(R3)
+	MOVD	LR, gobuf_pc(R3)
+	MOVD	g, gobuf_g(R3)
+	MOVD	$0, gobuf_lr(R3)
+	MOVD	$0, gobuf_ret(R3)
+	MOVD	$0, gobuf_ctxt(R3)
+	RET
+
+// void gogo(Gobuf*)
+// restore state from Gobuf; longjmp
+TEXT runtime·gogo(SB), NOSPLIT, $-8-8
+	MOVD	buf+0(FP), R5
+	MOVD	gobuf_g(R5), g	// make sure g is not nil
+	BL	runtime·save_g(SB)
+
+	MOVD	0(g), R4
+	MOVD	gobuf_sp(R5), R15
+	MOVD	gobuf_lr(R5), LR
+	MOVD	gobuf_ret(R5), R3
+	MOVD	gobuf_ctxt(R5), R12
+	MOVD	$0, gobuf_sp(R5)
+	MOVD	$0, gobuf_ret(R5)
+	MOVD	$0, gobuf_lr(R5)
+	MOVD	$0, gobuf_ctxt(R5)
+	CMP	R0, R0 // set condition codes for == test, needed by stack split
+	MOVD	gobuf_pc(R5), R6
+	BR	(R6)
+
+// void mcall(fn func(*g))
+// Switch to m->g0's stack, call fn(g).
+// Fn must never return.  It should gogo(&g->sched)
+// to keep running g.
+TEXT runtime·mcall(SB), NOSPLIT, $-8-8
+	// Save caller state in g->sched
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	LR, (g_sched+gobuf_pc)(g)
+	MOVD	R0, (g_sched+gobuf_lr)(g)
+	MOVD	g, (g_sched+gobuf_g)(g)
+
+	// Switch to m->g0 & its stack, call fn.
+	MOVD	g, R3
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), g
+	BL	runtime·save_g(SB)
+	CMP	g, R3
+	BNE	2(PC)
+	BR	runtime·badmcall(SB)
+	MOVD	fn+0(FP), R12			// context
+	MOVD	0(R12), R4			// code pointer
+	MOVD	(g_sched+gobuf_sp)(g), R15	// sp = m->g0->sched.sp
+	SUB	$16, R15
+	MOVD	R3, 8(R15)
+	MOVD	$0, 0(R15)
+	BL	(R4)
+	BR	runtime·badmcall2(SB)
+
+// systemstack_switch is a dummy routine that systemstack leaves at the bottom
+// of the G stack.  We need to distinguish the routine that
+// lives at the bottom of the G stack from the one that lives
+// at the top of the system stack because the one at the top of
+// the system stack terminates the stack walk (see topofstack()).
+TEXT runtime·systemstack_switch(SB), NOSPLIT, $0-0
+	UNDEF
+	BL	(LR)	// make sure this function is not leaf
+	RET
+
+// func systemstack(fn func())
+TEXT runtime·systemstack(SB), NOSPLIT, $0-8
+	MOVD	fn+0(FP), R3	// R3 = fn
+	MOVD	R3, R12		// context
+	MOVD	g_m(g), R4	// R4 = m
+
+	MOVD	m_gsignal(R4), R5	// R5 = gsignal
+	CMPBEQ	g, R5, noswitch
+
+	MOVD	m_g0(R4), R5	// R5 = g0
+	CMPBEQ	g, R5, noswitch
+
+	MOVD	m_curg(R4), R6
+	CMPBEQ	g, R6, switch
+
+	// Bad: g is not gsignal, not g0, not curg. What is it?
+	// Hide call from linker nosplit analysis.
+	MOVD	$runtime·badsystemstack(SB), R3
+	BL	(R3)
+
+switch:
+	// save our state in g->sched.  Pretend to
+	// be systemstack_switch if the G stack is scanned.
+	MOVD	$runtime·systemstack_switch(SB), R6
+	ADD	$16, R6	// get past prologue
+	MOVD	R6, (g_sched+gobuf_pc)(g)
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	R0, (g_sched+gobuf_lr)(g)
+	MOVD	g, (g_sched+gobuf_g)(g)
+
+	// switch to g0
+	MOVD	R5, g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R3
+	// make it look like mstart called systemstack on g0, to stop traceback
+	SUB	$8, R3
+	MOVD	$runtime·mstart(SB), R4
+	MOVD	R4, 0(R3)
+	MOVD	R3, R15
+
+	// call target function
+	MOVD	0(R12), R3	// code pointer
+	BL	(R3)
+
+	// switch back to g
+	MOVD	g_m(g), R3
+	MOVD	m_curg(R3), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	MOVD	$0, (g_sched+gobuf_sp)(g)
+	RET
+
+noswitch:
+	// already on m stack, just call directly
+	MOVD	0(R12), R3	// code pointer
+	BL	(R3)
+	RET
+
+/*
+ * support for morestack
+ */
+
+// Called during function prolog when more stack is needed.
+// Caller has already loaded:
+// R3: framesize, R4: argsize, R5: LR
+//
+// The traceback routines see morestack on a g0 as being
+// the top of a stack (for example, morestack calling newstack
+// calling the scheduler calling newm calling gc), so we must
+// record an argument size. For that purpose, it has no arguments.
+TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
+	// Cannot grow scheduler stack (m->g0).
+	MOVD	g_m(g), R7
+	MOVD	m_g0(R7), R8
+	CMPBNE	g, R8, 2(PC)
+	BL	runtime·abort(SB)
+
+	// Cannot grow signal stack (m->gsignal).
+	MOVD	m_gsignal(R7), R8
+	CMP	g, R8
+	BNE	2(PC)
+	BL	runtime·abort(SB)
+
+	// Called from f.
+	// Set g->sched to context in f.
+	MOVD	R12, (g_sched+gobuf_ctxt)(g)
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	LR, R8
+	MOVD	R8, (g_sched+gobuf_pc)(g)
+	MOVD	R5, (g_sched+gobuf_lr)(g)
+
+	// Called from f.
+	// Set m->morebuf to f's caller.
+	MOVD	R5, (m_morebuf+gobuf_pc)(R7)	// f's caller's PC
+	MOVD	R15, (m_morebuf+gobuf_sp)(R7)	// f's caller's SP
+	MOVD	g, (m_morebuf+gobuf_g)(R7)
+
+	// Call newstack on m->g0's stack.
+	MOVD	m_g0(R7), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	BL	runtime·newstack(SB)
+
+	// Not reached, but make sure the return PC from the call to newstack
+	// is still in this function, and not the beginning of the next.
+	UNDEF
+
+TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	$0, R12
+	BR	runtime·morestack(SB)
+
+TEXT runtime·stackBarrier(SB),NOSPLIT,$0
+	// We came here via a RET to an overwritten LR.
+	// R3 may be live. Other registers are available.
+
+	// Get the original return PC, g.stkbar[g.stkbarPos].savedLRVal.
+	MOVD	(g_stkbar+slice_array)(g), R4
+	MOVD	g_stkbarPos(g), R5
+	MOVD	$stkbar__size, R6
+	MULLD	R5, R6
+	ADD	R4, R6
+	MOVD	stkbar_savedLRVal(R6), R6
+	// Record that this stack barrier was hit.
+	ADD	$1, R5
+	MOVD	R5, g_stkbarPos(g)
+	// Jump to the original return PC.
+	BR	(R6)
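
In Go terms the barrier does a slice index and a post-increment. A sketch with trimmed stand-in types (the real fields are g.stkbar, g.stkbarPos and the stkbar entry's savedLRVal):

```go
// stkbarSketch and gSketch are trimmed stand-ins for the runtime's
// stack barrier bookkeeping.
type stkbarSketch struct {
	savedLRPtr uintptr // where the overwritten LR lives on the stack
	savedLRVal uintptr // the original return PC
}

type gSketch struct {
	stkbar    []stkbarSketch
	stkbarPos uintptr
}

// stackBarrierTarget is what the address arithmetic above computes.
func stackBarrierTarget(gp *gSketch) uintptr {
	b := gp.stkbar[gp.stkbarPos] // slice_array + stkbarPos*stkbar__size
	gp.stkbarPos++               // record that this barrier was hit
	return b.savedLRVal          // jumped to via BR (R6) above
}
```
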
+
+// reflectcall: call a function with the given argument list
+// func call(argtype *_type, f *FuncVal, arg *byte, argsize, retoffset uint32).
+// we don't have variable-sized frames, so we use a small number
+// of constant-sized-frame functions to encode a few bits of size in the pc.
+// Caution: ugly multiline assembly macros in your future!
+
+#define DISPATCH(NAME,MAXSIZE)		\
+	MOVD	$MAXSIZE, R4;		\
+	CMP	R3, R4;		\
+	BGT	3(PC);			\
+	MOVD	$NAME(SB), R5;	\
+	BR	(R5)
+// Note: can't just "BR NAME(SB)" - bad inlining results.
+
+TEXT reflect·call(SB), NOSPLIT, $0-0
+	BR	·reflectcall(SB)
+
+TEXT ·reflectcall(SB), NOSPLIT, $-8-32
+	MOVWZ argsize+24(FP), R3
+	// NOTE(rsc): No call16, because CALLFN needs four words
+	// of argument space to invoke callwritebarrier.
+	DISPATCH(runtime·call32, 32)
+	DISPATCH(runtime·call64, 64)
+	DISPATCH(runtime·call128, 128)
+	DISPATCH(runtime·call256, 256)
+	DISPATCH(runtime·call512, 512)
+	DISPATCH(runtime·call1024, 1024)
+	DISPATCH(runtime·call2048, 2048)
+	DISPATCH(runtime·call4096, 4096)
+	DISPATCH(runtime·call8192, 8192)
+	DISPATCH(runtime·call16384, 16384)
+	DISPATCH(runtime·call32768, 32768)
+	DISPATCH(runtime·call65536, 65536)
+	DISPATCH(runtime·call131072, 131072)
+	DISPATCH(runtime·call262144, 262144)
+	DISPATCH(runtime·call524288, 524288)
+	DISPATCH(runtime·call1048576, 1048576)
+	DISPATCH(runtime·call2097152, 2097152)
+	DISPATCH(runtime·call4194304, 4194304)
+	DISPATCH(runtime·call8388608, 8388608)
+	DISPATCH(runtime·call16777216, 16777216)
+	DISPATCH(runtime·call33554432, 33554432)
+	DISPATCH(runtime·call67108864, 67108864)
+	DISPATCH(runtime·call134217728, 134217728)
+	DISPATCH(runtime·call268435456, 268435456)
+	DISPATCH(runtime·call536870912, 536870912)
+	DISPATCH(runtime·call1073741824, 1073741824)
+	MOVD	$runtime·badreflectcall(SB), R5
+	BR	(R5)
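
The DISPATCH chain amounts to picking the first fixed-frame wrapper whose size covers argsize, i.e. rounding up to a power of two starting at 32. As a sketch:

```go
// callWrapperSize selects the wrapper the DISPATCH chain above would
// branch to: powers of two from 32 up to 1<<30, else a bad reflect call.
func callWrapperSize(argsize uint32) (uint32, bool) {
	for size := uint32(32); size <= 1<<30; size *= 2 {
		if argsize <= size {
			return size, true // BR to runtime·call<size>
		}
	}
	return 0, false // runtime·badreflectcall
}
```
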
+
+#define CALLFN(NAME,MAXSIZE)			\
+TEXT NAME(SB), WRAPPER, $MAXSIZE-24;		\
+	NO_LOCAL_POINTERS;			\
+	/* copy arguments to stack */		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	argsize+24(FP), R4;			\
+	MOVD	R15, R5;				\
+	ADD	$(8-1), R5;			\
+	SUB	$1, R3;				\
+	ADD	R5, R4;				\
+	CMP	R5, R4;				\
+	BEQ	6(PC);				\
+	ADD	$1, R3;				\
+	ADD	$1, R5;				\
+	MOVBZ	0(R3), R6;			\
+	MOVBZ	R6, 0(R5);			\
+	BR	-6(PC);				\
+	/* call function */			\
+	MOVD	f+8(FP), R12;			\
+	MOVD	(R12), R8;			\
+	PCDATA  $PCDATA_StackMapIndex, $0;	\
+	BL	(R8);				\
+	/* copy return values back */		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	n+24(FP), R4;			\
+	MOVWZ	retoffset+28(FP), R6;		\
+	MOVD	R15, R5;				\
+	ADD	R6, R5; 			\
+	ADD	R6, R3;				\
+	SUB	R6, R4;				\
+	ADD	$(8-1), R5;			\
+	SUB	$1, R3;				\
+	ADD	R5, R4;				\
+loop:						\
+	CMP	R5, R4;				\
+	BEQ	end;				\
+	ADD	$1, R5;				\
+	ADD	$1, R3;				\
+	MOVBZ	0(R5), R6;			\
+	MOVBZ	R6, 0(R3);			\
+	BR	loop;				\
+end:						\
+	/* execute write barrier updates */	\
+	MOVD	argtype+0(FP), R7;		\
+	MOVD	arg+16(FP), R3;			\
+	MOVWZ	n+24(FP), R4;			\
+	MOVWZ	retoffset+28(FP), R6;		\
+	MOVD	R7, 8(R15);			\
+	MOVD	R3, 16(R15);			\
+	MOVD	R4, 24(R15);			\
+	MOVD	R6, 32(R15);			\
+	BL	runtime·callwritebarrier(SB);	\
+	RET
+
+CALLFN(·call32, 32)
+CALLFN(·call64, 64)
+CALLFN(·call128, 128)
+CALLFN(·call256, 256)
+CALLFN(·call512, 512)
+CALLFN(·call1024, 1024)
+CALLFN(·call2048, 2048)
+CALLFN(·call4096, 4096)
+CALLFN(·call8192, 8192)
+CALLFN(·call16384, 16384)
+CALLFN(·call32768, 32768)
+CALLFN(·call65536, 65536)
+CALLFN(·call131072, 131072)
+CALLFN(·call262144, 262144)
+CALLFN(·call524288, 524288)
+CALLFN(·call1048576, 1048576)
+CALLFN(·call2097152, 2097152)
+CALLFN(·call4194304, 4194304)
+CALLFN(·call8388608, 8388608)
+CALLFN(·call16777216, 16777216)
+CALLFN(·call33554432, 33554432)
+CALLFN(·call67108864, 67108864)
+CALLFN(·call134217728, 134217728)
+CALLFN(·call268435456, 268435456)
+CALLFN(·call536870912, 536870912)
+CALLFN(·call1073741824, 1073741824)
+
+TEXT runtime·procyield(SB),NOSPLIT,$0-0
+	RET
+
+// void jmpdefer(fv, sp);
+// called from deferreturn.
+// 1. grab stored LR for caller
+// 2. sub 6 bytes to get back to BL deferreturn (size of BRASL instruction)
+// 3. BR to fn
+TEXT runtime·jmpdefer(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	0(R15), R1
+	SUB	$6, R1, LR
+
+	MOVD	fv+0(FP), R12
+	MOVD	argp+8(FP), R15
+	SUB	$8, R15
+	MOVD	0(R12), R3
+	BR	(R3)
+
+// Save state of caller into g->sched. Smashes R31.
+TEXT gosave<>(SB),NOSPLIT|NOFRAME,$0
+	MOVD	LR, (g_sched+gobuf_pc)(g)
+	MOVD	R15, (g_sched+gobuf_sp)(g)
+	MOVD	$0, (g_sched+gobuf_lr)(g)
+	MOVD	$0, (g_sched+gobuf_ret)(g)
+	MOVD	$0, (g_sched+gobuf_ctxt)(g)
+	RET
+
+// func asmcgocall(fn, arg unsafe.Pointer) int32
+// Call fn(arg) on the scheduler stack,
+// aligned appropriately for the gcc ABI.
+// See cgocall.go for more details.
+TEXT ·asmcgocall(SB),NOSPLIT,$0-20
+	// R2 = argc; R3 = argv; R11 = temp; R13 = g; R15 = stack pointer
+	// C TLS base pointer in AR0:AR1
+	MOVD	fn+0(FP), R3
+	MOVD	arg+8(FP), R4
+
+	MOVD	R15, R2		// save original stack pointer
+	MOVD	g, R5
+
+	// Figure out if we need to switch to m->g0 stack.
+	// We get called to create new OS threads too, and those
+	// come in on the m->g0 stack already.
+	MOVD	g_m(g), R6
+	MOVD	m_g0(R6), R6
+	CMPBEQ	R6, g, g0
+	BL	gosave<>(SB)
+	MOVD	R6, g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+
+	// Now on a scheduling stack (a pthread-created stack).
+g0:
+	// Reserve room for two of our pointers, plus 160 bytes of
+	// callee-save area that lives on the caller's stack.
+	SUB	$176, R15
+	MOVD	$~7, R6
+	AND	R6, R15                 // 8-byte alignment for gcc ABI
+	MOVD	R5, 168(R15)             // save old g on stack
+	MOVD	(g_stack+stack_hi)(R5), R5
+	SUB	R2, R5
+	MOVD	R5, 160(R15)             // save depth in old g stack (can't just save SP, as stack might be copied during a callback)
+	MOVD	R0, 0(R15)              // clear back chain pointer (TODO can we give it real back trace information?)
+	MOVD	R4, R2                  // arg in R2
+	BL	R3                      // can clobber: R0-R5, R14, F0-F3, F5, F7-F15
+
+	XOR	R0, R0                  // set R0 back to 0.
+	// Restore g, stack pointer.
+	MOVD	168(R15), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_stack+stack_hi)(g), R5
+	MOVD	160(R15), R6
+	SUB	R6, R5
+	MOVD	R5, R15
+
+	MOVW	R2, ret+16(FP)
+	RET
+
+// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
+// Turn the fn into a Go func (by taking its address) and call
+// cgocallback_gofunc.
+TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
+	MOVD	$fn+0(FP), R3
+	MOVD	R3, 8(R15)
+	MOVD	frame+8(FP), R3
+	MOVD	R3, 16(R15)
+	MOVD	framesize+16(FP), R3
+	MOVD	R3, 24(R15)
+	MOVD	ctxt+24(FP), R3
+	MOVD	R3, 32(R15)
+	MOVD	$runtime·cgocallback_gofunc(SB), R3
+	BL	(R3)
+	RET
+
+// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// See cgocall.go for more details.
+TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+	NO_LOCAL_POINTERS
+
+	// Load m and g from thread-local storage.
+	MOVB	runtime·iscgo(SB), R3
+	CMPBEQ	R3, $0, nocgo
+	BL	runtime·load_g(SB)
+
+nocgo:
+	// If g is nil, Go did not create the current thread.
+	// Call needm to obtain one for temporary use.
+	// In this case, we're running on the thread stack, so there's
+	// lots of space, but the linker doesn't know. Hide the call from
+	// the linker analysis by using an indirect call.
+	CMPBEQ	g, $0, needm
+
+	MOVD	g_m(g), R8
+	MOVD	R8, savedm-8(SP)
+	BR	havem
+
+needm:
+	MOVD	g, savedm-8(SP) // g is zero, so is m.
+	MOVD	$runtime·needm(SB), R3
+	BL	(R3)
+
+	// Set m->sched.sp = SP, so that if a panic happens
+	// during the function we are about to execute, it will
+	// have a valid SP to run on the g0 stack.
+	// The next few lines (after the havem label)
+	// will save this SP onto the stack and then write
+	// the same SP back to m->sched.sp. That seems redundant,
+	// but if an unrecovered panic happens, unwindm will
+	// restore the g->sched.sp from the stack location
+	// and then systemstack will try to use it. If we don't set it here,
+	// that restored SP will be uninitialized (typically 0) and
+	// will not be usable.
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), R3
+	MOVD	R15, (g_sched+gobuf_sp)(R3)
+
+havem:
+	// Now there's a valid m, and we're running on its m->g0.
+	// Save current m->g0->sched.sp on stack and then set it to SP.
+	// Save current sp in m->g0->sched.sp in preparation for
+	// switch back to m->curg stack.
+	// NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP).
+	MOVD	m_g0(R8), R3
+	MOVD	(g_sched+gobuf_sp)(R3), R4
+	MOVD	R4, savedsp-16(SP)
+	MOVD	R15, (g_sched+gobuf_sp)(R3)
+
+	// Switch to m->curg stack and call runtime.cgocallbackg.
+	// Because we are taking over the execution of m->curg
+	// but *not* resuming what had been running, we need to
+	// save that information (m->curg->sched) so we can restore it.
+	// We can restore m->curg->sched.sp easily, because calling
+	// runtime.cgocallbackg leaves SP unchanged upon return.
+	// To save m->curg->sched.pc, we push it onto the stack.
+	// This has the added benefit that it looks to the traceback
+	// routine like cgocallbackg is going to return to that
+	// PC (because the frame we allocate below has the same
+	// size as cgocallback_gofunc's frame declared above)
+	// so that the traceback will seamlessly trace back into
+	// the earlier calls.
+	//
+	// In the new goroutine, -8(SP) is unused (where SP refers to
+	// m->curg's SP while we're setting it up, before we've adjusted it).
+	MOVD	m_curg(R8), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R4 // prepare stack as R4
+	MOVD	(g_sched+gobuf_pc)(g), R5
+	MOVD	R5, -24(R4)
+	MOVD	ctxt+24(FP), R5
+	MOVD	R5, -16(R4)
+	MOVD	$-24(R4), R15
+	BL	runtime·cgocallbackg(SB)
+
+	// Restore g->sched (== m->curg->sched) from saved values.
+	MOVD	0(R15), R5
+	MOVD	R5, (g_sched+gobuf_pc)(g)
+	MOVD	$24(R15), R4
+	MOVD	R4, (g_sched+gobuf_sp)(g)
+
+	// Switch back to m->g0's stack and restore m->g0->sched.sp.
+	// (Unlike m->curg, the g0 goroutine never uses sched.pc,
+	// so we do not have to restore it.)
+	MOVD	g_m(g), R8
+	MOVD	m_g0(R8), g
+	BL	runtime·save_g(SB)
+	MOVD	(g_sched+gobuf_sp)(g), R15
+	MOVD	savedsp-16(SP), R4
+	MOVD	R4, (g_sched+gobuf_sp)(g)
+
+	// If the m on entry was nil, we called needm above to borrow an m
+	// for the duration of the call. Since the call is over, return it with dropm.
+	MOVD	savedm-8(SP), R6
+	CMPBNE	R6, $0, droppedm
+	MOVD	$runtime·dropm(SB), R3
+	BL	(R3)
+droppedm:
+
+	// Done!
+	RET
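
The savedm dance above is a borrow/return protocol. A hedged Go sketch, with stubs standing in for the runtime internals (needm and dropm are the real function names; mSketch and currentM are illustrative):

```go
type mSketch struct{}

func currentM() *mSketch { return nil } // stub: g_m(g); nil on a C-created thread
func needm()             {}            // stub: borrow an m (and g0 stack)
func dropm()             {}            // stub: return the borrowed m

// cgocallbackSketch mirrors the savedm protocol above.
func cgocallbackSketch(callback func()) {
	savedm := currentM() // savedm-8(SP) in the assembly
	if savedm == nil {
		needm() // C created this thread; borrow an m for the callback
	}
	callback() // runtime.cgocallbackg, run on m->curg's stack
	if savedm == nil {
		dropm() // the call is over; return the borrowed m
	}
}
```
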
+
+// void setg(G*); set g. for use by needm.
+TEXT runtime·setg(SB), NOSPLIT, $0-8
+	MOVD	gg+0(FP), g
+	// This only happens if iscgo, so jump straight to save_g
+	BL	runtime·save_g(SB)
+	RET
+
+// void setg_gcc(G*); set g in C TLS.
+// Must obey the gcc calling convention.
+TEXT setg_gcc<>(SB),NOSPLIT|NOFRAME,$0-0
+	// The standard prologue clobbers LR (R14), which is callee-save in
+	// the C ABI, so we have to use NOFRAME and save LR ourselves.
+	MOVD	LR, R1
+	// Also save g, R10, and R11 since they're callee-save in C ABI
+	MOVD	R10, R3
+	MOVD	g, R4
+	MOVD	R11, R5
+
+	MOVD	R2, g
+	BL	runtime·save_g(SB)
+
+	MOVD	R5, R11
+	MOVD	R4, g
+	MOVD	R3, R10
+	MOVD	R1, LR
+	RET
+
+TEXT runtime·getcallerpc(SB),NOSPLIT,$8-16
+	MOVD	16(R15), R3		// LR saved by caller
+	MOVD	runtime·stackBarrierPC(SB), R4
+	CMPBNE	R3, R4, nobar
+	// Get original return PC.
+	BL	runtime·nextBarrierPC(SB)
+	MOVD	8(R15), R3
+nobar:
+	MOVD	R3, ret+8(FP)
+	RET
+
+TEXT runtime·setcallerpc(SB),NOSPLIT,$8-16
+	MOVD	pc+8(FP), R3
+	MOVD	16(R15), R4
+	MOVD	runtime·stackBarrierPC(SB), R5
+	CMPBEQ	R4, R5, setbar
+	MOVD	R3, 16(R15)		// set LR in caller
+	RET
+setbar:
+	// Set the stack barrier return PC.
+	MOVD	R3, 8(R15)
+	BL	runtime·setNextBarrierPC(SB)
+	RET
+
+TEXT runtime·getcallersp(SB),NOSPLIT,$0-16
+	MOVD	argp+0(FP), R3
+	SUB	$8, R3
+	MOVD	R3, ret+8(FP)
+	RET
+
+TEXT runtime·abort(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R0
+	UNDEF
+
+// int64 runtime·cputicks(void)
+TEXT runtime·cputicks(SB),NOSPLIT,$0-8
+	// The TOD clock on s390 counts from the year 1900 in ~250ps intervals.
+	// This means that since about 1972 the msb has been set, making the
+	// result of a call to STORE CLOCK (stck) a negative number.
+	// We clear the msb to make it positive.
+	STCK	ret+0(FP)      // serialises before and after call
+	MOVD	ret+0(FP), R3  // R3 will wrap to 0 in the year 2043
+	SLD	$1, R3
+	SRD	$1, R3
+	MOVD	R3, ret+0(FP)
+	RET
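
The shift pair is just a sign-bit clear; in Go (a small sketch):

```go
// clearTODSign: SLD $1 then SRD $1 clears the top bit of the TOD value.
func clearTODSign(tod uint64) int64 {
	return int64(tod &^ (1 << 63)) // same as (tod << 1) >> 1, unsigned
}
```
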
+
+// memhash_varlen(p unsafe.Pointer, h seed) uintptr
+// redirects to memhash(p, h, size) using the size
+// stored in the closure.
+TEXT runtime·memhash_varlen(SB),NOSPLIT,$40-24
+	GO_ARGS
+	NO_LOCAL_POINTERS
+	MOVD	p+0(FP), R3
+	MOVD	h+8(FP), R4
+	MOVD	8(R12), R5
+	MOVD	R3, 8(R15)
+	MOVD	R4, 16(R15)
+	MOVD	R5, 24(R15)
+	BL	runtime·memhash(SB)
+	MOVD	32(R15), R3
+	MOVD	R3, ret+16(FP)
+	RET
+
+// AES hashing not implemented for s390x
+TEXT runtime·aeshash(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshash32(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshash64(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+TEXT runtime·aeshashstr(SB),NOSPLIT|NOFRAME,$0-0
+	MOVW	(R0), R15
+
+// memequal(p, q unsafe.Pointer, size uintptr) bool
+TEXT runtime·memequal(SB),NOSPLIT|NOFRAME,$0-25
+	MOVD	p+0(FP), R3
+	MOVD	q+8(FP), R5
+	MOVD	size+16(FP), R6
+	LA	ret+24(FP), R7
+	BR	runtime·memeqbody(SB)
+
+// memequal_varlen(a, b unsafe.Pointer) bool
+TEXT runtime·memequal_varlen(SB),NOSPLIT|NOFRAME,$0-17
+	MOVD	a+0(FP), R3
+	MOVD	b+8(FP), R5
+	MOVD	8(R12), R6    // compiler stores size at offset 8 in the closure
+	LA	ret+16(FP), R7
+	BR	runtime·memeqbody(SB)
+
+// eqstring tests whether two strings are equal.
+// The compiler guarantees that strings passed
+// to eqstring have equal length.
+// See runtime_test.go:eqstring_generic for
+// equivalent Go code.
+TEXT runtime·eqstring(SB),NOSPLIT|NOFRAME,$0-33
+	MOVD	s1str+0(FP), R3
+	MOVD	s1len+8(FP), R6
+	MOVD	s2str+16(FP), R5
+	LA	ret+32(FP), R7
+	BR	runtime·memeqbody(SB)
+
+TEXT bytes·Equal(SB),NOSPLIT|NOFRAME,$0-49
+	MOVD	a_len+8(FP), R2
+	MOVD	b_len+32(FP), R6
+	MOVD	a+0(FP), R3
+	MOVD	b+24(FP), R5
+	LA	ret+48(FP), R7
+	CMPBNE	R2, R6, notequal
+	BR	runtime·memeqbody(SB)
+notequal:
+	MOVB	$0, ret+48(FP)
+	RET
+
+// input:
+//   R3 = a
+//   R5 = b
+//   R6 = len
+//   R7 = address of output byte (stores 0 or 1 here)
+//   a and b have the same length
+TEXT runtime·memeqbody(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, equal
+loop:
+	CMPBEQ	R6, $0, equal
+	CMPBLT	R6, $32, tiny
+	CMP	R6, $256
+	BLT	tail
+	CLC	$256, 0(R3), 0(R5)
+	BNE	notequal
+	SUB	$256, R6
+	LA	256(R3), R3
+	LA	256(R5), R5
+	BR	loop
+tail:
+	SUB	$1, R6, R8
+	EXRL	$runtime·memeqbodyclc(SB), R8
+	BEQ	equal
+notequal:
+	MOVB	$0, 0(R7)
+	RET
+equal:
+	MOVB	$1, 0(R7)
+	RET
+tiny:
+	MOVD	$0, R2
+	CMPBLT	R6, $16, lt16
+	MOVD	0(R3), R8
+	MOVD	0(R5), R9
+	CMPBNE	R8, R9, notequal
+	MOVD	8(R3), R8
+	MOVD	8(R5), R9
+	CMPBNE	R8, R9, notequal
+	LA	16(R2), R2
+	SUB	$16, R6
+lt16:
+	CMPBLT	R6, $8, lt8
+	MOVD	0(R3)(R2*1), R8
+	MOVD	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	8(R2), R2
+	SUB	$8, R6
+lt8:
+	CMPBLT	R6, $4, lt4
+	MOVWZ	0(R3)(R2*1), R8
+	MOVWZ	0(R5)(R2*1), R9
+	CMPBNE	R8, R9, notequal
+	LA	4(R2), R2
+	SUB	$4, R6
+lt4:
+#define CHECK(n) \
+	CMPBEQ	R6, $n, equal \
+	MOVB	n(R3)(R2*1), R8 \
+	MOVB	n(R5)(R2*1), R9 \
+	CMPBNE	R8, R9, notequal
+	CHECK(0)
+	CHECK(1)
+	CHECK(2)
+	CHECK(3)
+	BR	equal
+
+TEXT runtime·memeqbodyclc(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
+
+TEXT runtime·fastrand1(SB), NOSPLIT, $0-4
+	MOVD	g_m(g), R4
+	MOVWZ	m_fastrand(R4), R3
+	ADD	R3, R3
+	CMPW	R3, $0
+	BGE	2(PC)
+	XOR	$0x88888eef, R3
+	MOVW	R3, m_fastrand(R4)
+	MOVW	R3, ret+0(FP)
+	RET
+
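fastrand1 is one step of a 32-bit linear feedback shift register; as Go:

	// fastrandSketch doubles the state and, when the doubled value
	// is negative as an int32 (bit 31 set), xors in the feedback
	// constant, matching the ADD/CMPW/XOR sequence above.
	func fastrandSketch(x uint32) uint32 {
		x += x
		if int32(x) < 0 {
			x ^= 0x88888eef
		}
		return x
	}
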
+TEXT bytes·IndexByte(SB),NOSPLIT,$0-40
+	MOVD	s+0(FP), R3     // s => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+24(FP), R5    // c => R5
+	MOVD	$ret+32(FP), R2 // &ret => R2
+	BR	runtime·indexbytebody(SB)
+
+TEXT strings·IndexByte(SB),NOSPLIT,$0-32
+	MOVD	s+0(FP), R3     // s => R3
+	MOVD	s_len+8(FP), R4 // s_len => R4
+	MOVBZ	c+16(FP), R5    // c => R5
+	MOVD	$ret+24(FP), R2 // &ret => R2
+	BR	runtime·indexbytebody(SB)
+
+// input:
+// R3: s
+// R4: s_len
+// R5: c -- byte sought
+// R2: &ret -- address to put index into
+TEXT runtime·indexbytebody(SB),NOSPLIT,$0
+	CMPBEQ	R4, $0, notfound
+	MOVD	R3, R6          // store base for later
+	ADD	R3, R4, R8      // the address after the end of the string
+	// if the length is small, use a byte loop; otherwise use a vector or SRST search
+	CMPBGE	R4, $16, large
+
+residual:
+	CMPBEQ	R3, R8, notfound
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, residual
+
+found:
+	SUB	R6, R3
+	SUB	$1, R3
+	MOVD	R3, 0(R2)
+	RET
+
+notfound:
+	MOVD	$-1, 0(R2)
+	RET
+
+large:
+	MOVB	runtime·vectorfacility(SB), R1
+	CMPBEQ	R1, $-1, checkvector	// vectorfacility = -1, vector not checked yet
+vectorchecked:
+	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+
+srstimpl:                       // vectorfacility != 1: vector not supported or not enabled
+	MOVBZ	R5, R0          // c must be in R0 for SRST; set it as late as possible, since R0 is expected to be 0 elsewhere
+srstloop:
+	WORD	$0xB25E0083     // srst %r8, %r3 (search the range [R3, R8))
+	BVS	srstloop        // interrupted - continue
+	BGT	notfoundr0
+foundr0:
+	XOR	R0, R0          // reset R0
+	SUB	R6, R8          // remove base
+	MOVD	R8, 0(R2)
+	RET
+notfoundr0:
+	XOR	R0, R0          // reset R0
+	MOVD	$-1, 0(R2)
+	RET
+
+vectorimpl:
+	// if the address is not 16-byte aligned, use a byte loop for the header
+	AND	$15, R3, R8
+	CMPBGT	R8, $0, notaligned
+
+aligned:
+	ADD	R6, R4, R8
+	AND	$-16, R8, R7
+	// replicate c across V17
+	VLVGB	$0, R5, V19
+	VREPB	$0, V19, V17
+
+vectorloop:
+	CMPBGE	R3, R7, residual
+	VL	0(R3), V16    // load string to be searched into V16
+	ADD	$16, R3
+	VFEEBS	V16, V17, V18 // search V17 in V16 and set conditional code accordingly
+	BVS	vectorloop
+
+	// the vector search found c in the string; compute its index
+	VLGVB	$7, V18, R7   // load 7th element of V18 containing index into R7
+	SUB	$16, R3
+	SUB	R6, R3
+	ADD	R3, R7
+	MOVD	R7, 0(R2)
+	RET
+
+notaligned:
+	AND	$-16, R3, R8
+	ADD     $16, R8
+notalignedloop:
+	CMPBEQ	R3, R8, aligned
+	MOVBZ	0(R3), R7
+	LA	1(R3), R3
+	CMPBNE	R7, R5, notalignedloop
+	BR	found
+
+checkvector:
+	CALL	runtime·checkvectorfacility(SB)
+	MOVB    runtime·vectorfacility(SB), R1
+	BR	vectorchecked
+
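All three search paths (the residual byte loop, SRST, and the VFEEB vector loop) implement the same contract, sketched here in Go:

	// indexByteSketch returns the index of the first occurrence of
	// c in s, or -1 if c is absent.
	func indexByteSketch(s []byte, c byte) int {
		for i, b := range s {
			if b == c {
				return i
			}
		}
		return -1
	}
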
+TEXT runtime·return0(SB), NOSPLIT, $0
+	MOVW	$0, R3
+	RET
+
+// Called from cgo wrappers, this function returns g->m->curg.stack.hi.
+// Must obey the gcc calling convention.
+TEXT _cgo_topofstack(SB),NOSPLIT|NOFRAME,$0
+	// g (R13), R10, R11 and LR (R14) are callee-save in the C ABI, so save them
+	MOVD	g, R1
+	MOVD	R10, R3
+	MOVD	LR, R4
+	MOVD	R11, R5
+
+	BL	runtime·load_g(SB)	// clobbers g (R13), R10, R11
+	MOVD	g_m(g), R2
+	MOVD	m_curg(R2), R2
+	MOVD	(g_stack+stack_hi)(R2), R2
+
+	MOVD	R1, g
+	MOVD	R3, R10
+	MOVD	R4, LR
+	MOVD	R5, R11
+	RET
+
+// The top-most function running on a goroutine
+// returns to goexit+PCQuantum.
+TEXT runtime·goexit(SB),NOSPLIT|NOFRAME,$0-0
+	BYTE $0x07; BYTE $0x00; // 2-byte nop
+	BL	runtime·goexit1(SB)	// does not return
+	// traceback from goexit1 must hit code range of goexit
+	BYTE $0x07; BYTE $0x00; // 2-byte nop
+
+TEXT runtime·prefetcht0(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht1(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetcht2(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·prefetchnta(SB),NOSPLIT,$0-8
+	RET
+
+TEXT runtime·sigreturn(SB),NOSPLIT,$0-8
+	RET
+
+TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
+	SYNC
+	RET
+
+TEXT runtime·cmpstring(SB),NOSPLIT|NOFRAME,$0-40
+	MOVD	s1_base+0(FP), R3
+	MOVD	s1_len+8(FP), R4
+	MOVD	s2_base+16(FP), R5
+	MOVD	s2_len+24(FP), R6
+	LA	ret+32(FP), R7
+	BR	runtime·cmpbody(SB)
+
+TEXT bytes·Compare(SB),NOSPLIT|NOFRAME,$0-56
+	MOVD	s1+0(FP), R3
+	MOVD	s1+8(FP), R4
+	MOVD	s2+24(FP), R5
+	MOVD	s2+32(FP), R6
+	LA	res+48(FP), R7
+	BR	runtime·cmpbody(SB)
+
+// input:
+//   R3 = a
+//   R4 = alen
+//   R5 = b
+//   R6 = blen
+//   R7 = address of output word (stores -1/0/1 here)
+TEXT runtime·cmpbody(SB),NOSPLIT|NOFRAME,$0-0
+	CMPBEQ	R3, R5, cmplengths
+	MOVD	R4, R8
+	CMPBLE	R4, R6, amin
+	MOVD	R6, R8
+amin:
+	CMPBEQ	R8, $0, cmplengths
+	CMP	R8, $256
+	BLE	tail
+loop:
+	CLC	$256, 0(R3), 0(R5)
+	BGT	gt
+	BLT	lt
+	SUB	$256, R8
+	CMP	R8, $256
+	BGT	loop
+tail:
+	SUB	$1, R8
+	EXRL	$runtime·cmpbodyclc(SB), R8
+	BGT	gt
+	BLT	lt
+cmplengths:
+	CMP	R4, R6
+	BEQ	eq
+	BLT	lt
+gt:
+	MOVD	$1, 0(R7)
+	RET
+lt:
+	MOVD	$-1, 0(R7)
+	RET
+eq:
+	MOVD	$0, 0(R7)
+	RET
+
+TEXT runtime·cmpbodyclc(SB),NOSPLIT|NOFRAME,$0-0
+	CLC	$1, 0(R3), 0(R5)
+	RET
+
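cmpbody is a standard three-way comparison: compare the common prefix (256 bytes at a time via CLC), then fall back to comparing lengths. A Go sketch of the semantics:

	func cmpSketch(a, b []byte) int {
		n := len(a)
		if len(b) < n {
			n = len(b)
		}
		for i := 0; i < n; i++ {
			switch {
			case a[i] < b[i]:
				return -1
			case a[i] > b[i]:
				return 1
			}
		}
		switch {
		case len(a) < len(b):
			return -1
		case len(a) > len(b):
			return 1
		}
		return 0
	}
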
+// This is called from .init_array and follows the platform, not Go, ABI.
+// We are overly conservative: we could save only the registers we use.
+// However, since this function is called only once per loaded module,
+// performance is unimportant.
+TEXT runtime·addmoduledata(SB),NOSPLIT|NOFRAME,$0-0
+	// Save R6-R15, F0, F2, F4 and F6 in the
+	// register save area of the calling function
+	STMG	R6, R15, 48(R15)
+	FMOVD	F0, 128(R15)
+	FMOVD	F2, 136(R15)
+	FMOVD	F4, 144(R15)
+	FMOVD	F6, 152(R15)
+
+	// append the argument (passed in R2, as per the ELF ABI) to the
+	// moduledata linked list.
+	MOVD	runtime·lastmoduledatap(SB), R1
+	MOVD	R2, moduledata_next(R1)
+	MOVD	R2, runtime·lastmoduledatap(SB)
+
+	// Restore R6-R15, F0, F2, F4 and F6
+	LMG	48(R15), R6, R15
+	FMOVD	128(R15), F0
+	FMOVD	136(R15), F2
+	FMOVD	144(R15), F4
+	FMOVD	152(R15), F6
+	RET
+
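The two MOVDs in the middle of addmoduledata are a plain linked-list append; in Go (moduledata and lastmoduledatap are the runtime's own names):

	// appendModule mirrors the update above: link the new module
	// after the current tail, then advance the tail pointer.
	func appendModule(md *moduledata) {
		lastmoduledatap.next = md
		lastmoduledatap = md
	}
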
+TEXT ·checkASM(SB),NOSPLIT,$0-1
+	MOVB	$1, ret+0(FP)
+	RET
diff --git a/src/runtime/atomic_386.go b/src/runtime/atomic_386.go
deleted file mode 100644
index f8d589e..0000000
--- a/src/runtime/atomic_386.go
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// The calls to nop are to keep these functions from being inlined.
-// If they are inlined we have no guarantee that later rewrites of the
-// code by optimizers will preserve the relative order of memory accesses.
-
-//go:nosplit
-func atomicload(ptr *uint32) uint32 {
-	nop()
-	return *ptr
-}
-
-//go:nosplit
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer {
-	nop()
-	return *(*unsafe.Pointer)(ptr)
-}
-
-//go:nosplit
-func xadd64(ptr *uint64, delta int64) uint64 {
-	for {
-		old := *ptr
-		if cas64(ptr, old, old+uint64(delta)) {
-			return old + uint64(delta)
-		}
-	}
-}
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:nosplit
-func xchg64(ptr *uint64, new uint64) uint64 {
-	for {
-		old := *ptr
-		if cas64(ptr, old, new) {
-			return old
-		}
-	}
-}
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_amd64x.go b/src/runtime/atomic_amd64x.go
deleted file mode 100644
index edcc6d6..0000000
--- a/src/runtime/atomic_amd64x.go
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build amd64 amd64p32
-
-package runtime
-
-import "unsafe"
-
-// The calls to nop are to keep these functions from being inlined.
-// If they are inlined we have no guarantee that later rewrites of the
-// code by optimizers will preserve the relative order of memory accesses.
-
-//go:nosplit
-func atomicload(ptr *uint32) uint32 {
-	nop()
-	return *ptr
-}
-
-//go:nosplit
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer {
-	nop()
-	return *(*unsafe.Pointer)(ptr)
-}
-
-//go:nosplit
-func atomicload64(ptr *uint64) uint64 {
-	nop()
-	return *ptr
-}
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_arm.go b/src/runtime/atomic_arm.go
deleted file mode 100644
index 02a1f35..0000000
--- a/src/runtime/atomic_arm.go
+++ /dev/null
@@ -1,175 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-var locktab [57]struct {
-	l   mutex
-	pad [_CacheLineSize - unsafe.Sizeof(mutex{})]byte
-}
-
-func addrLock(addr *uint64) *mutex {
-	return &locktab[(uintptr(unsafe.Pointer(addr))>>3)%uintptr(len(locktab))].l
-}
-
-// Atomic add and return new value.
-//go:nosplit
-func xadd(val *uint32, delta int32) uint32 {
-	for {
-		oval := *val
-		nval := oval + uint32(delta)
-		if cas(val, oval, nval) {
-			return nval
-		}
-	}
-}
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:nosplit
-func xchg(addr *uint32, v uint32) uint32 {
-	for {
-		old := *addr
-		if cas(addr, old, v) {
-			return old
-		}
-	}
-}
-
-//go:nosplit
-func xchgp1(addr unsafe.Pointer, v unsafe.Pointer) unsafe.Pointer {
-	for {
-		old := *(*unsafe.Pointer)(addr)
-		if casp1((*unsafe.Pointer)(addr), old, v) {
-			return old
-		}
-	}
-}
-
-//go:nosplit
-func xchguintptr(addr *uintptr, v uintptr) uintptr {
-	return uintptr(xchg((*uint32)(unsafe.Pointer(addr)), uint32(v)))
-}
-
-//go:nosplit
-func atomicload(addr *uint32) uint32 {
-	return xadd(addr, 0)
-}
-
-//go:nosplit
-func atomicloadp(addr unsafe.Pointer) unsafe.Pointer {
-	return unsafe.Pointer(uintptr(xadd((*uint32)(addr), 0)))
-}
-
-//go:nosplit
-func atomicstorep1(addr unsafe.Pointer, v unsafe.Pointer) {
-	for {
-		old := *(*unsafe.Pointer)(addr)
-		if casp1((*unsafe.Pointer)(addr), old, v) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicstore(addr *uint32, v uint32) {
-	for {
-		old := *addr
-		if cas(addr, old, v) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func cas64(addr *uint64, old, new uint64) bool {
-	var ok bool
-	systemstack(func() {
-		lock(addrLock(addr))
-		if *addr == old {
-			*addr = new
-			ok = true
-		}
-		unlock(addrLock(addr))
-	})
-	return ok
-}
-
-//go:nosplit
-func xadd64(addr *uint64, delta int64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr + uint64(delta)
-		*addr = r
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func xchg64(addr *uint64, v uint64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr
-		*addr = v
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func atomicload64(addr *uint64) uint64 {
-	var r uint64
-	systemstack(func() {
-		lock(addrLock(addr))
-		r = *addr
-		unlock(addrLock(addr))
-	})
-	return r
-}
-
-//go:nosplit
-func atomicstore64(addr *uint64, v uint64) {
-	systemstack(func() {
-		lock(addrLock(addr))
-		*addr = v
-		unlock(addrLock(addr))
-	})
-}
-
-//go:nosplit
-func atomicor8(addr *uint8, v uint8) {
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8) // little endian
-	for {
-		old := *addr32
-		if cas(addr32, old, old|word) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicand8(addr *uint8, v uint8) {
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
-	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
-	word |= ^mask
-	for {
-		old := *addr32
-		if cas(addr32, old, old&word) {
-			return
-		}
-	}
-}
diff --git a/src/runtime/atomic_arm64.go b/src/runtime/atomic_arm64.go
deleted file mode 100644
index a377e3e..0000000
--- a/src/runtime/atomic_arm64.go
+++ /dev/null
@@ -1,82 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd64
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload(ptr *uint32) uint32
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer
-
-//go:nosplit
-func atomicor8(addr *uint8, v uint8) {
-	// TODO(dfc) implement this in asm.
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8) // little endian
-	for {
-		old := *addr32
-		if cas(addr32, old, old|word) {
-			return
-		}
-	}
-}
-
-//go:nosplit
-func atomicand8(addr *uint8, v uint8) {
-	// TODO(dfc) implement this in asm.
-	// Align down to 4 bytes and use 32-bit CAS.
-	uaddr := uintptr(unsafe.Pointer(addr))
-	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
-	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
-	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
-	word |= ^mask
-	for {
-		old := *addr32
-		if cas(addr32, old, old&word) {
-			return
-		}
-	}
-}
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_arm64.s b/src/runtime/atomic_arm64.s
index d3ab2a1..4704aa6 100644
--- a/src/runtime/atomic_arm64.s
+++ b/src/runtime/atomic_arm64.s
@@ -4,114 +4,6 @@
 
 #include "textflag.h"
 
-// uint32 runtime·atomicload(uint32 volatile* addr)
-TEXT ·atomicload(SB),NOSPLIT,$-8-12
-	MOVD	ptr+0(FP), R0
-	LDARW	(R0), R0
-	MOVW	R0, ret+8(FP)
-	RET
-
-// uint64 runtime·atomicload64(uint64 volatile* addr)
-TEXT ·atomicload64(SB),NOSPLIT,$-8-16
-	MOVD	ptr+0(FP), R0
-	LDAR	(R0), R0
-	MOVD	R0, ret+8(FP)
-	RET
-
-// void *runtime·atomicloadp(void *volatile *addr)
-TEXT ·atomicloadp(SB),NOSPLIT,$-8-16
-	MOVD	ptr+0(FP), R0
-	LDAR	(R0), R0
-	MOVD	R0, ret+8(FP)
-	RET
-
-TEXT runtime·atomicstorep1(SB), NOSPLIT, $0-16
-	B	runtime·atomicstore64(SB)
-
-TEXT runtime·atomicstore(SB), NOSPLIT, $0-12
-	MOVD	ptr+0(FP), R0
-	MOVW	val+8(FP), R1
-	STLRW	R1, (R0)
-	RET
-
-TEXT runtime·atomicstore64(SB), NOSPLIT, $0-16
-	MOVD	ptr+0(FP), R0
-	MOVD	val+8(FP), R1
-	STLR	R1, (R0)
-	RET
-
-TEXT runtime·xchg(SB), NOSPLIT, $0-20
-again:
-	MOVD	ptr+0(FP), R0
-	MOVW	new+8(FP), R1
-	LDAXRW	(R0), R2
-	STLXRW	R1, (R0), R3
-	CBNZ	R3, again
-	MOVW	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xchg64(SB), NOSPLIT, $0-24
-again:
-	MOVD	ptr+0(FP), R0
-	MOVD	new+8(FP), R1
-	LDAXR	(R0), R2
-	STLXR	R1, (R0), R3
-	CBNZ	R3, again
-	MOVD	R2, ret+16(FP)
-	RET
-
-// bool runtime·cas64(uint64 *ptr, uint64 old, uint64 new)
-// Atomically:
-//      if(*val == *old){
-//              *val = new;
-//              return 1;
-//      } else {
-//              return 0;
-//      }
-TEXT runtime·cas64(SB), NOSPLIT, $0-25
-	MOVD	ptr+0(FP), R0
-	MOVD	old+8(FP), R1
-	MOVD	new+16(FP), R2
-again:
-	LDAXR	(R0), R3
-	CMP	R1, R3
-	BNE	ok
-	STLXR	R2, (R0), R3
-	CBNZ	R3, again
-ok:
-	CSET	EQ, R0
-	MOVB	R0, ret+24(FP)
-	RET
-
-// uint32 xadd(uint32 volatile *ptr, int32 delta)
-// Atomically:
-//      *val += delta;
-//      return *val;
-TEXT runtime·xadd(SB), NOSPLIT, $0-20
-again:
-	MOVD	ptr+0(FP), R0
-	MOVW	delta+8(FP), R1
-	LDAXRW	(R0), R2
-	ADDW	R2, R1, R2
-	STLXRW	R2, (R0), R3
-	CBNZ	R3, again
-	MOVW	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xadd64(SB), NOSPLIT, $0-24
-again:
-	MOVD	ptr+0(FP), R0
-	MOVD	delta+8(FP), R1
-	LDAXR	(R0), R2
-	ADD	R2, R1, R2
-	STLXR	R2, (R0), R3
-	CBNZ	R3, again
-	MOVD	R2, ret+16(FP)
-	RET
-
-TEXT runtime·xchguintptr(SB), NOSPLIT, $0-24
-	B	runtime·xchg64(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-8-0
 	DMB	$0xe	// DMB ST
 	RET
diff --git a/src/runtime/atomic_mips64x.s b/src/runtime/atomic_mips64x.s
new file mode 100644
index 0000000..0f849ca
--- /dev/null
+++ b/src/runtime/atomic_mips64x.s
@@ -0,0 +1,13 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+#define SYNC	WORD $0xf
+
+TEXT ·publicationBarrier(SB),NOSPLIT,$-8-0
+	SYNC
+	RET
diff --git a/src/runtime/atomic_pointer.go b/src/runtime/atomic_pointer.go
index f84afe0..4fe3340 100644
--- a/src/runtime/atomic_pointer.go
+++ b/src/runtime/atomic_pointer.go
@@ -4,34 +4,29 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // These functions cannot have go:noescape annotations,
 // because while ptr does not escape, new does.
 // If new is marked as not escaping, the compiler will make incorrect
 // escape analysis decisions about the pointer value being stored.
-// Instead, these are wrappers around the actual atomics (xchgp1 and so on)
+// Instead, these are wrappers around the actual atomics (casp1 and so on)
 // that use noescape to convey which arguments do not escape.
-//
-// Additionally, these functions must update the shadow heap for
-// write barrier checking.
 
+// atomicstorep performs *ptr = new atomically and invokes a write barrier.
+//
 //go:nosplit
 func atomicstorep(ptr unsafe.Pointer, new unsafe.Pointer) {
-	atomicstorep1(noescape(ptr), new)
+	atomic.StorepNoWB(noescape(ptr), new)
 	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
 }
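The noescape helper these wrappers lean on is essentially an identity function whose result escape analysis cannot trace back to its argument; a sketch of the idiom (the runtime defines it much like this):

	//go:nosplit
	func noescape(p unsafe.Pointer) unsafe.Pointer {
		x := uintptr(p)
		return unsafe.Pointer(x ^ 0) // ^0 breaks the data-flow link
	}
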
 
 //go:nosplit
-func xchgp(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
-	old := xchgp1(noescape(ptr), new)
-	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
-	return old
-}
-
-//go:nosplit
 func casp(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool {
-	if !casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) {
+	if !atomic.Casp1((*unsafe.Pointer)(noescape(unsafe.Pointer(ptr))), noescape(old), new) {
 		return false
 	}
 	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
@@ -49,7 +44,6 @@
 //go:nosplit
 func sync_atomic_StorePointer(ptr *unsafe.Pointer, new unsafe.Pointer) {
 	sync_atomic_StoreUintptr((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
-	atomicstorep1(noescape(unsafe.Pointer(ptr)), new)
 	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
 }
 
@@ -58,9 +52,9 @@
 
 //go:linkname sync_atomic_SwapPointer sync/atomic.SwapPointer
 //go:nosplit
-func sync_atomic_SwapPointer(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
-	old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(ptr)), uintptr(new)))
-	writebarrierptr_nostore((*uintptr)(ptr), uintptr(new))
+func sync_atomic_SwapPointer(ptr *unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer {
+	old := unsafe.Pointer(sync_atomic_SwapUintptr((*uintptr)(noescape(unsafe.Pointer(ptr))), uintptr(new)))
+	writebarrierptr_nostore((*uintptr)(unsafe.Pointer(ptr)), uintptr(new))
 	return old
 }
 
diff --git a/src/runtime/atomic_ppc64x.go b/src/runtime/atomic_ppc64x.go
deleted file mode 100644
index b58ee5a..0000000
--- a/src/runtime/atomic_ppc64x.go
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright 2014 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-
-package runtime
-
-import "unsafe"
-
-//go:noescape
-func xadd(ptr *uint32, delta int32) uint32
-
-//go:noescape
-func xadd64(ptr *uint64, delta int64) uint64
-
-//go:noescape
-//go:linkname xadduintptr runtime.xadd64
-func xadduintptr(ptr *uintptr, delta uintptr) uintptr
-
-//go:noescape
-func xchg(ptr *uint32, new uint32) uint32
-
-//go:noescape
-func xchg64(ptr *uint64, new uint64) uint64
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func xchgp1(ptr unsafe.Pointer, new unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func xchguintptr(ptr *uintptr, new uintptr) uintptr
-
-//go:noescape
-func atomicload(ptr *uint32) uint32
-
-//go:noescape
-func atomicload64(ptr *uint64) uint64
-
-//go:noescape
-func atomicloadp(ptr unsafe.Pointer) unsafe.Pointer
-
-//go:noescape
-func atomicand8(ptr *uint8, val uint8)
-
-//go:noescape
-func atomicor8(ptr *uint8, val uint8)
-
-// NOTE: Do not add atomicxor8 (XOR is not idempotent).
-
-//go:noescape
-func cas64(ptr *uint64, old, new uint64) bool
-
-//go:noescape
-func atomicstore(ptr *uint32, val uint32)
-
-//go:noescape
-func atomicstore64(ptr *uint64, val uint64)
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func atomicstorep1(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/atomic_ppc64x.s b/src/runtime/atomic_ppc64x.s
index 28c5bf3..57f672f 100644
--- a/src/runtime/atomic_ppc64x.s
+++ b/src/runtime/atomic_ppc64x.s
@@ -6,42 +6,9 @@
 
 #include "textflag.h"
 
-// uint32 runtime·atomicload(uint32 volatile* addr)
-TEXT ·atomicload(SB),NOSPLIT,$-8-12
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVWZ	0(R3), R3
-	CMPW	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVW	R3, ret+8(FP)
-	RET
-
-// uint64 runtime·atomicload64(uint64 volatile* addr)
-TEXT ·atomicload64(SB),NOSPLIT,$-8-16
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVD	0(R3), R3
-	CMP	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVD	R3, ret+8(FP)
-	RET
-
-// void *runtime·atomicloadp(void *volatile *addr)
-TEXT ·atomicloadp(SB),NOSPLIT,$-8-16
-	MOVD	addr+0(FP), R3
-	SYNC
-	MOVD	0(R3), R3
-	CMP	R3, R3, CR7
-	BC	4, 30, 1(PC) // bne- cr7,0x4
-	ISYNC
-	MOVD	R3, ret+8(FP)
-	RET
-
-TEXT ·publicationBarrier(SB),NOSPLIT,$-8-0
+TEXT ·publicationBarrier(SB),NOSPLIT|NOFRAME,$0-0
 	// LWSYNC is the "export" barrier recommended by Power ISA
 	// v2.07 book II, appendix B.2.2.2.
 	// LWSYNC is a load/load, load/store, and store/store barrier.
-	WORD $0x7c2004ac	// LWSYNC
+	LWSYNC
 	RET
diff --git a/src/runtime/callers_test.go b/src/runtime/callers_test.go
new file mode 100644
index 0000000..ad83f99
--- /dev/null
+++ b/src/runtime/callers_test.go
@@ -0,0 +1,83 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"runtime"
+	"strings"
+	"testing"
+)
+
+func f1(pan bool) []uintptr {
+	return f2(pan) // line 14
+}
+
+func f2(pan bool) []uintptr {
+	return f3(pan) // line 18
+}
+
+func f3(pan bool) []uintptr {
+	if pan {
+		panic("f3") // line 23
+	}
+	ret := make([]uintptr, 20)
+	return ret[:runtime.Callers(0, ret)] // line 26
+}
+
+func testCallers(t *testing.T, pcs []uintptr, pan bool) {
+	m := make(map[string]int, len(pcs))
+	frames := runtime.CallersFrames(pcs)
+	for {
+		frame, more := frames.Next()
+		if frame.Function != "" {
+			m[frame.Function] = frame.Line
+		}
+		if !more {
+			break
+		}
+	}
+
+	var seen []string
+	for k := range m {
+		seen = append(seen, k)
+	}
+	t.Logf("functions seen: %s", strings.Join(seen, " "))
+
+	var f3Line int
+	if pan {
+		f3Line = 23
+	} else {
+		f3Line = 26
+	}
+	want := []struct {
+		name string
+		line int
+	}{
+		{"f1", 14},
+		{"f2", 18},
+		{"f3", f3Line},
+	}
+	for _, w := range want {
+		if got := m["runtime_test."+w.name]; got != w.line {
+			t.Errorf("%s is line %d, want %d", w.name, got, w.line)
+		}
+	}
+}
+
+func TestCallers(t *testing.T) {
+	testCallers(t, f1(false), false)
+}
+
+func TestCallersPanic(t *testing.T) {
+	defer func() {
+		if r := recover(); r == nil {
+			t.Fatal("did not panic")
+		}
+		pcs := make([]uintptr, 20)
+		pcs = pcs[:runtime.Callers(0, pcs)]
+		testCallers(t, pcs, true)
+	}()
+	f1(true)
+}
diff --git a/src/runtime/cgo.go b/src/runtime/cgo.go
index 169a31d..9cf7b58 100644
--- a/src/runtime/cgo.go
+++ b/src/runtime/cgo.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -11,19 +11,19 @@
 // Filled in by runtime/cgo when linked into binary.
 
 //go:linkname _cgo_init _cgo_init
-//go:linkname _cgo_malloc _cgo_malloc
-//go:linkname _cgo_free _cgo_free
 //go:linkname _cgo_thread_start _cgo_thread_start
 //go:linkname _cgo_sys_thread_create _cgo_sys_thread_create
 //go:linkname _cgo_notify_runtime_init_done _cgo_notify_runtime_init_done
+//go:linkname _cgo_callers _cgo_callers
+//go:linkname _cgo_set_context_function _cgo_set_context_function
 
 var (
 	_cgo_init                     unsafe.Pointer
-	_cgo_malloc                   unsafe.Pointer
-	_cgo_free                     unsafe.Pointer
 	_cgo_thread_start             unsafe.Pointer
 	_cgo_sys_thread_create        unsafe.Pointer
 	_cgo_notify_runtime_init_done unsafe.Pointer
+	_cgo_callers                  unsafe.Pointer
+	_cgo_set_context_function     unsafe.Pointer
 )
 
 // iscgo is set to true by the runtime/cgo package
diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s
index a895083..dc8897d 100644
--- a/src/runtime/cgo/asm_386.s
+++ b/src/runtime/cgo/asm_386.s
@@ -1,13 +1,12 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
 
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
 TEXT crosscall2(SB),NOSPLIT,$0
 	PUSHL	BP
 	MOVL	SP, BP
@@ -15,14 +14,16 @@
 	PUSHL	SI
 	PUSHL	DI
 	
-	SUBL	$8, SP
+	SUBL	$12, SP
+	MOVL	20(BP), AX
+	MOVL	AX, 8(SP)
 	MOVL	16(BP), AX
 	MOVL	AX, 4(SP)
 	MOVL	12(BP), AX
 	MOVL	AX, 0(SP)
 	MOVL	8(BP), AX
 	CALL	AX
-	ADDL	$8, SP
+	ADDL	$12, SP
 	
 	POPL	DI
 	POPL	SI
diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s
index 6095bd1..541bd9e 100644
--- a/src/runtime/cgo/asm_amd64.s
+++ b/src/runtime/cgo/asm_amd64.s
@@ -1,47 +1,76 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
 
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
 TEXT crosscall2(SB),NOSPLIT,$0
+#ifndef GOOS_windows
 	SUBQ	$0x58, SP	/* keeps stack pointer 32-byte aligned */
-	MOVQ	BX, 0x10(SP)
-	MOVQ	BP, 0x18(SP)
-	MOVQ	R12, 0x20(SP)
-	MOVQ	R13, 0x28(SP)
-	MOVQ	R14, 0x30(SP)
-	MOVQ	R15, 0x38(SP)
+#else
+	SUBQ	$0x118, SP	/* also need to save xmm6 - xmm15 */
+#endif
+	MOVQ	BX, 0x18(SP)
+	MOVQ	BP, 0x20(SP)
+	MOVQ	R12, 0x28(SP)
+	MOVQ	R13, 0x30(SP)
+	MOVQ	R14, 0x38(SP)
+	MOVQ	R15, 0x40(SP)
 
 #ifdef GOOS_windows
-	// Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, and R15
-	MOVQ	DI, 0x40(SP)
-	MOVQ	SI, 0x48(SP)
+	// Win64 save RBX, RBP, RDI, RSI, RSP, R12, R13, R14, R15 and XMM6 -- XMM15.
+	MOVQ	DI, 0x48(SP)
+	MOVQ	SI, 0x50(SP)
+	MOVUPS	X6, 0x60(SP)
+	MOVUPS	X7, 0x70(SP)
+	MOVUPS	X8, 0x80(SP)
+	MOVUPS	X9, 0x90(SP)
+	MOVUPS	X10, 0xa0(SP)
+	MOVUPS	X11, 0xb0(SP)
+	MOVUPS	X12, 0xc0(SP)
+	MOVUPS	X13, 0xd0(SP)
+	MOVUPS	X14, 0xe0(SP)
+	MOVUPS	X15, 0xf0(SP)
 
-	MOVQ	DX, 0(SP)	/* arg */
-	MOVQ	R8, 8(SP)	/* argsize (includes padding) */
+	MOVQ	DX, 0x0(SP)	/* arg */
+	MOVQ	R8, 0x8(SP)	/* argsize (includes padding) */
+	MOVQ	R9, 0x10(SP)	/* ctxt */
 	
 	CALL	CX	/* fn */
 	
-	MOVQ	0x40(SP), DI
-	MOVQ	0x48(SP), SI
+	MOVQ	0x48(SP), DI
+	MOVQ	0x50(SP), SI
+	MOVUPS	0x60(SP), X6
+	MOVUPS	0x70(SP), X7
+	MOVUPS	0x80(SP), X8
+	MOVUPS	0x90(SP), X9
+	MOVUPS	0xa0(SP), X10
+	MOVUPS	0xb0(SP), X11
+	MOVUPS	0xc0(SP), X12
+	MOVUPS	0xd0(SP), X13
+	MOVUPS	0xe0(SP), X14
+	MOVUPS	0xf0(SP), X15
 #else
-	MOVQ	SI, 0(SP)	/* arg */
-	MOVQ	DX, 8(SP)	/* argsize (includes padding) */
+	MOVQ	SI, 0x0(SP)	/* arg */
+	MOVQ	DX, 0x8(SP)	/* argsize (includes padding) */
+	MOVQ	CX, 0x10(SP)	/* ctxt */
 
 	CALL	DI	/* fn */
 #endif
 
-	MOVQ	0x10(SP), BX
-	MOVQ	0x18(SP), BP
-	MOVQ	0x20(SP), R12
-	MOVQ	0x28(SP), R13
-	MOVQ	0x30(SP), R14
-	MOVQ	0x38(SP), R15
+	MOVQ	0x18(SP), BX
+	MOVQ	0x20(SP), BP
+	MOVQ	0x28(SP), R12
+	MOVQ	0x30(SP), R13
+	MOVQ	0x38(SP), R14
+	MOVQ	0x40(SP), R15
 	
+#ifndef GOOS_windows
 	ADDQ	$0x58, SP
+#else
+	ADDQ	$0x118, SP
+#endif
 	RET
diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s
index 9aeaf9a..0f35422 100644
--- a/src/runtime/cgo/asm_arm.s
+++ b/src/runtime/cgo/asm_arm.s
@@ -1,24 +1,55 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
 
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
 TEXT crosscall2(SB),NOSPLIT,$-4
 	/* 
 	 * We still need to save all callee save register as before, and then
-	 *  push 2 args for fn (R1 and R2).
+	 *  push 3 args for fn (R1, R2, R3).
 	 * Also note that at procedure entry in gc world, 4(R13) will be the
 	 *  first arg, so we must push another dummy reg (R0) for 0(R13).
 	 *  Additionally, runtime·load_g will clobber R0, so we need to save R0
 	 *  nevertheless.
 	 */
-	MOVM.WP	[R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+	SUB	$(8*9), R13 // Reserve space for the floating point registers.
+	MOVM.WP	[R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+
+	// Skip floating point registers on GOARM < 6.
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfpsave
+	MOVD	F8, (14*4+8*1)(R13)
+	MOVD	F9, (14*4+8*2)(R13)
+	MOVD	F10, (14*4+8*3)(R13)
+	MOVD	F11, (14*4+8*4)(R13)
+	MOVD	F12, (14*4+8*5)(R13)
+	MOVD	F13, (14*4+8*6)(R13)
+	MOVD	F14, (14*4+8*7)(R13)
+	MOVD	F15, (14*4+8*8)(R13)
+
+skipfpsave:
 	BL	runtime·load_g(SB)
 	MOVW	R15, R14 // R15 is PC.
 	MOVW	0(R13), R15
-	MOVM.IAW	(R13), [R0, R1, R2, R4, R5, R6, R7, R8, R9, g, R11, R12, R15]
+
+	MOVB    runtime·goarm(SB), R11
+	CMP $6, R11
+	BLT skipfprest
+	MOVD	(14*4+8*1)(R13), F8
+	MOVD	(14*4+8*2)(R13), F9
+	MOVD	(14*4+8*3)(R13), F10
+	MOVD	(14*4+8*4)(R13), F11
+	MOVD	(14*4+8*5)(R13), F12
+	MOVD	(14*4+8*6)(R13), F13
+	MOVD	(14*4+8*7)(R13), F14
+	MOVD	(14*4+8*8)(R13), F15
+
+skipfprest:
+	MOVM.IAW	(R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14]
+	ADD	$(8*9), R13
+	MOVW	R14, R15
diff --git a/src/runtime/cgo/asm_arm64.s b/src/runtime/cgo/asm_arm64.s
index c6f98fa..e55a70f 100644
--- a/src/runtime/cgo/asm_arm64.s
+++ b/src/runtime/cgo/asm_arm64.s
@@ -1,36 +1,44 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
 
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
 TEXT crosscall2(SB),NOSPLIT,$-8
 	/*
 	 * We still need to save all callee save register as before, and then
-	 *  push 2 args for fn (R1 and R2).
+	 *  push 3 args for fn (R1, R2, R3).
 	 * Also note that at procedure entry in gc world, 8(RSP) will be the
 	 *  first arg.
 	 * TODO(minux): use LDP/STP here if it matters.
 	 */
-	SUB	$128, RSP
+	SUB	$(8*24), RSP
 	MOVD	R1, (8*1)(RSP)
 	MOVD	R2, (8*2)(RSP)
-	MOVD	R19, (8*3)(RSP)
-	MOVD	R20, (8*4)(RSP)
-	MOVD	R21, (8*5)(RSP)
-	MOVD	R22, (8*6)(RSP)
-	MOVD	R23, (8*7)(RSP)
-	MOVD	R24, (8*8)(RSP)
-	MOVD	R25, (8*9)(RSP)
-	MOVD	R26, (8*10)(RSP)
-	MOVD	R27, (8*11)(RSP)
-	MOVD	g, (8*12)(RSP)
-	MOVD	R29, (8*13)(RSP)
-	MOVD	R30, (8*14)(RSP)
+	MOVD	R3, (8*3)(RSP)
+	MOVD	R19, (8*4)(RSP)
+	MOVD	R20, (8*5)(RSP)
+	MOVD	R21, (8*6)(RSP)
+	MOVD	R22, (8*7)(RSP)
+	MOVD	R23, (8*8)(RSP)
+	MOVD	R24, (8*9)(RSP)
+	MOVD	R25, (8*10)(RSP)
+	MOVD	R26, (8*11)(RSP)
+	MOVD	R27, (8*12)(RSP)
+	MOVD	g, (8*13)(RSP)
+	MOVD	R29, (8*14)(RSP)
+	MOVD	R30, (8*15)(RSP)
+	FMOVD	F8, (8*16)(RSP)
+	FMOVD	F9, (8*17)(RSP)
+	FMOVD	F10, (8*18)(RSP)
+	FMOVD	F11, (8*19)(RSP)
+	FMOVD	F12, (8*20)(RSP)
+	FMOVD	F13, (8*21)(RSP)
+	FMOVD	F14, (8*22)(RSP)
+	FMOVD	F15, (8*23)(RSP)
 
 	MOVD	R0, R19
 
@@ -41,17 +49,26 @@
 
 	MOVD	(8*1)(RSP), R1
 	MOVD	(8*2)(RSP), R2
-	MOVD	(8*3)(RSP), R19
-	MOVD	(8*4)(RSP), R20
-	MOVD	(8*5)(RSP), R21
-	MOVD	(8*6)(RSP), R22
-	MOVD	(8*7)(RSP), R23
-	MOVD	(8*8)(RSP), R24
-	MOVD	(8*9)(RSP), R25
-	MOVD	(8*10)(RSP), R26
-	MOVD	(8*11)(RSP), R27
-	MOVD	(8*12)(RSP), g
-	MOVD	(8*13)(RSP), R29
-	MOVD	(8*14)(RSP), R30
-	ADD	$128, RSP
+	MOVD	(8*3)(RSP), R3
+	MOVD	(8*4)(RSP), R19
+	MOVD	(8*5)(RSP), R20
+	MOVD	(8*6)(RSP), R21
+	MOVD	(8*7)(RSP), R22
+	MOVD	(8*8)(RSP), R23
+	MOVD	(8*9)(RSP), R24
+	MOVD	(8*10)(RSP), R25
+	MOVD	(8*11)(RSP), R26
+	MOVD	(8*12)(RSP), R27
+	MOVD	(8*13)(RSP), g
+	MOVD	(8*14)(RSP), R29
+	MOVD	(8*15)(RSP), R30
+	FMOVD	(8*16)(RSP), F8
+	FMOVD	(8*17)(RSP), F9
+	FMOVD	(8*18)(RSP), F10
+	FMOVD	(8*19)(RSP), F11
+	FMOVD	(8*20)(RSP), F12
+	FMOVD	(8*21)(RSP), F13
+	FMOVD	(8*22)(RSP), F14
+	FMOVD	(8*23)(RSP), F15
+	ADD	$(8*24), RSP
 	RET
diff --git a/src/runtime/cgo/asm_mips64x.s b/src/runtime/cgo/asm_mips64x.s
new file mode 100644
index 0000000..19e9014
--- /dev/null
+++ b/src/runtime/cgo/asm_mips64x.s
@@ -0,0 +1,76 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+/*
+ * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr)
+ * Save registers and call fn with three arguments.
+ */
+TEXT crosscall2(SB),NOSPLIT,$-8
+	/*
+	 * We still need to save all callee-saved registers as before, and then
+	 *  push 3 args for fn (R5, R6, R7).
+	 * Also note that at procedure entry in gc world, 8(R29) will be the
+	 *  first arg.
+	 */
+	ADDV	$(-8*23), R29
+	MOVV	R5, (8*1)(R29)
+	MOVV	R6, (8*2)(R29)
+	MOVV	R7, (8*3)(R29)
+	MOVV	R16, (8*4)(R29)
+	MOVV	R17, (8*5)(R29)
+	MOVV	R18, (8*6)(R29)
+	MOVV	R19, (8*7)(R29)
+	MOVV	R20, (8*8)(R29)
+	MOVV	R21, (8*9)(R29)
+	MOVV	R22, (8*10)(R29)
+	MOVV	R23, (8*11)(R29)
+	MOVV	RSB, (8*12)(R29)
+	MOVV	g, (8*13)(R29)
+	MOVV	R31, (8*14)(R29)
+	MOVD	F24, (8*15)(R29)
+	MOVD	F25, (8*16)(R29)
+	MOVD	F26, (8*17)(R29)
+	MOVD	F27, (8*18)(R29)
+	MOVD	F28, (8*19)(R29)
+	MOVD	F29, (8*20)(R29)
+	MOVD	F30, (8*21)(R29)
+	MOVD	F31, (8*22)(R29)
+
+	// Initialize Go ABI environment
+	// prepare SB register = PC & 0xffffffff00000000
+	BGEZAL	R0, 1(PC)
+	SRLV	$32, R31, RSB
+	SLLV	$32, RSB
+	JAL	runtime·reginit(SB)
+	JAL	runtime·load_g(SB)
+	JAL	(R4)
+
+	MOVV	(8*1)(R29), R5
+	MOVV	(8*2)(R29), R6
+	MOVV	(8*3)(R29), R7
+	MOVV	(8*4)(R29), R16
+	MOVV	(8*5)(R29), R17
+	MOVV	(8*6)(R29), R18
+	MOVV	(8*7)(R29), R19
+	MOVV	(8*8)(R29), R20
+	MOVV	(8*9)(R29), R21
+	MOVV	(8*10)(R29), R22
+	MOVV	(8*11)(R29), R23
+	MOVV	(8*12)(R29), RSB
+	MOVV	(8*13)(R29), g
+	MOVV	(8*14)(R29), R31
+	MOVD	(8*15)(R29), F24
+	MOVD	(8*16)(R29), F25
+	MOVD	(8*17)(R29), F26
+	MOVD	(8*18)(R29), F27
+	MOVD	(8*19)(R29), F28
+	MOVD	(8*20)(R29), F29
+	MOVD	(8*21)(R29), F30
+	MOVD	(8*22)(R29), F31
+	ADDV	$(8*23), R29
+	RET
diff --git a/src/runtime/cgo/asm_nacl_amd64p32.s b/src/runtime/cgo/asm_nacl_amd64p32.s
index eb92014..82aaecd 100644
--- a/src/runtime/cgo/asm_nacl_amd64p32.s
+++ b/src/runtime/cgo/asm_nacl_amd64p32.s
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/asm_ppc64x.s b/src/runtime/cgo/asm_ppc64x.s
index 0c08a1d..dded1be 100644
--- a/src/runtime/cgo/asm_ppc64x.s
+++ b/src/runtime/cgo/asm_ppc64x.s
@@ -1,17 +1,16 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // +build ppc64 ppc64le
 
 #include "textflag.h"
+#include "asm_ppc64x.h"
 
-/*
- * void crosscall2(void (*fn)(void*, int32), void*, int32)
- * Save registers and call fn with two arguments.
- * crosscall2 obeys the C ABI; fn obeys the Go ABI.
- */
-TEXT crosscall2(SB),NOSPLIT,$-8
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
+TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
 	// TODO(austin): ABI v1 (fn is probably a function descriptor)
 
 	// Start with standard C stack frame layout and linkage
@@ -21,18 +20,20 @@
 
 	BL	saveregs2<>(SB)
 
-	MOVDU	R1, (-288-3*8)(R1)
+	MOVDU	R1, (-288-3*8-FIXED_FRAME)(R1)
 
 	// Initialize Go ABI environment
 	BL	runtime·reginit(SB)
 	BL	runtime·load_g(SB)
 
+	MOVD	R3, R12
 	MOVD	R3, CTR
-	MOVD	R4, 8(R1)
-	MOVD	R5, 16(R1)
+	MOVD	R4, FIXED_FRAME+0(R1)
+	MOVD	R5, FIXED_FRAME+8(R1)
+	MOVD	R6, FIXED_FRAME+16(R1)
 	BL	(CTR)
 
-	ADD	$(288+3*8), R1
+	ADD	$(288+3*8+FIXED_FRAME), R1
 
 	BL	restoreregs2<>(SB)
 
@@ -41,7 +42,7 @@
 	MOVD	R0, LR
 	RET
 
-TEXT saveregs2<>(SB),NOSPLIT,$-8
+TEXT saveregs2<>(SB),NOSPLIT|NOFRAME,$0
 	// O=-288; for R in R{14..31}; do echo "\tMOVD\t$R, $O(R1)"|sed s/R30/g/; ((O+=8)); done; for F in F{14..31}; do echo "\tFMOVD\t$F, $O(R1)"; ((O+=8)); done
 	MOVD	R14, -288(R1)
 	MOVD	R15, -280(R1)
@@ -82,7 +83,7 @@
 
 	RET
 
-TEXT restoreregs2<>(SB),NOSPLIT,$-8
+TEXT restoreregs2<>(SB),NOSPLIT|NOFRAME,$0
 	// O=-288; for R in R{14..31}; do echo "\tMOVD\t$O(R1), $R"|sed s/R30/g/; ((O+=8)); done; for F in F{14..31}; do echo "\tFMOVD\t$O(R1), $F"; ((O+=8)); done
 	MOVD	-288(R1), R14
 	MOVD	-280(R1), R15
diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s
new file mode 100644
index 0000000..ae688b6
--- /dev/null
+++ b/src/runtime/cgo/asm_s390x.s
@@ -0,0 +1,43 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls fn with three arguments.
+TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
+	// Start with standard C stack frame layout and linkage
+
+	// Save R6-R15, F0, F2, F4 and F6 in the
+	// register save area of the calling function
+	STMG	R6, R15, 48(R15)
+	FMOVD	F0, 128(R15)
+	FMOVD	F2, 136(R15)
+	FMOVD	F4, 144(R15)
+	FMOVD	F6, 152(R15)
+
+	// Initialize Go ABI environment
+	XOR	R0, R0
+	BL	runtime·load_g(SB)
+
+	// Allocate 32 bytes on the stack
+	SUB	$32, R15
+
+	MOVD	R3, 8(R15)  // arg1
+	MOVW	R4, 16(R15) // arg2
+	MOVD	R5, 24(R15) // arg3
+	BL	(R2)        // fn(arg1, arg2, arg3)
+
+	ADD	$32, R15
+
+	// Restore R6-R15, F0, F2, F4 and F6
+	LMG	48(R15), R6, R15
+	FMOVD	128(R15), F0
+	FMOVD	136(R15), F2
+	FMOVD	144(R15), F4
+	FMOVD	152(R15), F6
+
+	RET
+
diff --git a/src/runtime/cgo/callbacks.go b/src/runtime/cgo/callbacks.go
index 08f230d..9bde5a9 100644
--- a/src/runtime/cgo/callbacks.go
+++ b/src/runtime/cgo/callbacks.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -11,7 +11,7 @@
 
 // cgocallback is defined in runtime
 //go:linkname _runtime_cgocallback runtime.cgocallback
-func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr)
+func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr)
 
 // The declaration of crosscall2 is:
 //   void crosscall2(void (*fn)(void *, int), void *, int);
@@ -19,10 +19,14 @@
 // We need to export the symbol crosscall2 in order to support
 // callbacks from shared libraries. This applies regardless of
 // linking mode.
+//
+// Compatibility note: crosscall2 actually takes four arguments, but
+// it works to call it with three arguments when calling _cgo_panic.
+// That is supported for backward compatibility.
 //go:cgo_export_static crosscall2
 //go:cgo_export_dynamic crosscall2
 
-// Panic.  The argument is converted into a Go string.
+// Panic. The argument is converted into a Go string.
 
 // Call like this in code compiled with gcc:
 //   struct { const char *p; } a;
@@ -39,7 +43,7 @@
 //go:nosplit
 //go:norace
 func _cgo_panic(a unsafe.Pointer, n int32) {
-	_runtime_cgocallback(unsafe.Pointer(&_runtime_cgo_panic_internal), a, uintptr(n))
+	_runtime_cgocallback(unsafe.Pointer(&_runtime_cgo_panic_internal), a, uintptr(n), 0)
 }
 
 //go:cgo_import_static x_cgo_init
@@ -48,18 +52,6 @@
 var x_cgo_init byte
 var _cgo_init = &x_cgo_init
 
-//go:cgo_import_static x_cgo_malloc
-//go:linkname x_cgo_malloc x_cgo_malloc
-//go:linkname _cgo_malloc _cgo_malloc
-var x_cgo_malloc byte
-var _cgo_malloc = &x_cgo_malloc
-
-//go:cgo_import_static x_cgo_free
-//go:linkname x_cgo_free x_cgo_free
-//go:linkname _cgo_free _cgo_free
-var x_cgo_free byte
-var _cgo_free = &x_cgo_free
-
 //go:cgo_import_static x_cgo_thread_start
 //go:linkname x_cgo_thread_start x_cgo_thread_start
 //go:linkname _cgo_thread_start _cgo_thread_start
@@ -69,7 +61,7 @@
 // Creates a new system thread without updating any Go state.
 //
 // This method is invoked during shared library loading to create a new OS
-// thread to perform the runtime initialization.  This method is similar to
+// thread to perform the runtime initialization. This method is similar to
 // _cgo_sys_thread_start except that it doesn't update any Go state.
 
 //go:cgo_import_static x_cgo_sys_thread_create
@@ -78,11 +70,11 @@
 var x_cgo_sys_thread_create byte
 var _cgo_sys_thread_create = &x_cgo_sys_thread_create
 
-// Notifies that the runtime has been intialized.
+// Notifies that the runtime has been initialized.
 //
 // We currently block at every CGO entry point (via _cgo_wait_runtime_init_done)
 // to ensure that the runtime has been initialized before the CGO call is
+// executed. This is necessary for shared libraries where we kick off runtime
+// executed. This is necessary for shared libraries where we kickoff runtime
 // initialization in a separate thread and return without waiting for this
 // thread to complete the init.
 
@@ -92,5 +84,13 @@
 var x_cgo_notify_runtime_init_done byte
 var _cgo_notify_runtime_init_done = &x_cgo_notify_runtime_init_done
 
+// Sets the traceback context function. See runtime.SetCgoTraceback.
+
+//go:cgo_import_static x_cgo_set_context_function
+//go:linkname x_cgo_set_context_function x_cgo_set_context_function
+//go:linkname _cgo_set_context_function _cgo_set_context_function
+var x_cgo_set_context_function byte
+var _cgo_set_context_function = &x_cgo_set_context_function
+
 //go:cgo_export_static _cgo_topofstack
 //go:cgo_export_dynamic _cgo_topofstack
diff --git a/src/runtime/cgo/callbacks_traceback.go b/src/runtime/cgo/callbacks_traceback.go
new file mode 100644
index 0000000..f754846
--- /dev/null
+++ b/src/runtime/cgo/callbacks_traceback.go
@@ -0,0 +1,17 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+
+package cgo
+
+import _ "unsafe" // for go:linkname
+
+// Calls the traceback function passed to SetCgoTraceback.
+
+//go:cgo_import_static x_cgo_callers
+//go:linkname x_cgo_callers x_cgo_callers
+//go:linkname _cgo_callers _cgo_callers
+var x_cgo_callers byte
+var _cgo_callers = &x_cgo_callers
diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go
index cb24678..ce0e6a3 100644
--- a/src/runtime/cgo/cgo.go
+++ b/src/runtime/cgo/cgo.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -20,7 +20,9 @@
 #cgo !android,linux LDFLAGS: -lpthread
 #cgo netbsd LDFLAGS: -lpthread
 #cgo openbsd LDFLAGS: -lpthread
-#cgo windows LDFLAGS: -lm -mthreads
+// we must explicitly link msvcrt, because runtime needs ntdll, and ntdll
+// exports some incompatible libc functions. See golang.org/issue/12030.
+#cgo windows LDFLAGS: -lmsvcrt -lm -mthreads
 
 #cgo CFLAGS: -Wall -Werror
 
diff --git a/src/runtime/cgo/dragonfly.go b/src/runtime/cgo/dragonfly.go
index 69d52b5..d6d6918 100644
--- a/src/runtime/cgo/dragonfly.go
+++ b/src/runtime/cgo/dragonfly.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/freebsd.go b/src/runtime/cgo/freebsd.go
index 99cf3fb..5c9ddbd 100644
--- a/src/runtime/cgo/freebsd.go
+++ b/src/runtime/cgo/freebsd.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_386.S b/src/runtime/cgo/gcc_386.S
index bf41427..ff55b2c 100644
--- a/src/runtime/cgo/gcc_386.S
+++ b/src/runtime/cgo/gcc_386.S
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -35,11 +35,6 @@
 	popl %ebp
 	ret
 
-.globl EXT(__stack_chk_fail_local)
-EXT(__stack_chk_fail_local):
-1:
-	jmp 1b
-
 #ifdef __ELF__
 .section .note.GNU-stack,"",@progbits
 #endif
diff --git a/src/runtime/cgo/gcc_amd64.S b/src/runtime/cgo/gcc_amd64.S
index 32d0200..17d9d47 100644
--- a/src/runtime/cgo/gcc_amd64.S
+++ b/src/runtime/cgo/gcc_amd64.S
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_android.c b/src/runtime/cgo/gcc_android.c
index be27725..b756ede 100644
--- a/src/runtime/cgo/gcc_android.c
+++ b/src/runtime/cgo/gcc_android.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_android_386.c b/src/runtime/cgo/gcc_android_386.c
new file mode 100644
index 0000000..23a15f1
--- /dev/null
+++ b/src/runtime/cgo/gcc_android_386.c
@@ -0,0 +1,87 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h> /* for strerror */
+#include <pthread.h>
+#include <signal.h>
+#include "libcgo.h"
+
+static void* threadentry(void*);
+static pthread_key_t k1;
+
+#define magic1 (0x23581321U)
+
+static void
+inittls(void)
+{
+	uint32 x;
+	pthread_key_t tofree[128], k;
+	int i, ntofree;
+
+	/*
+	 * Same logic and code as gcc_android_amd64.c:/inittls.
+	 * Note that this is a temporary hack that should be fixed soon.
+	 *
+	 * TODO: fix this.
+	 *
+	 * The linker and runtime hard-code this constant offset
+	 * from %gs where we expect to find g. Disgusting.
+	 *
+	 * Known to src/cmd/link/internal/ld/sym.go:/0xf8
+	 * and to src/runtime/sys_linux_386.s:/0xf8 or /GOOS_android.
+	 * TODO(hyangah): check 0xb0 works with API23+
+	 *
+	 * As disgusting as on darwin/386 and darwin/amd64.
+	 */
+	ntofree = 0;
+	for(;;) {
+		if(pthread_key_create(&k, nil) < 0) {
+			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
+			abort();
+		}
+		pthread_setspecific(k, (void*)magic1);
+		asm volatile("movl %%gs:0xf8, %0" : "=r"(x));
+		pthread_setspecific(k, 0);
+		if (x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
+	}
+	// TODO: output to stderr is not useful for apps.
+	// Can we fall back to Android's log library?
+
+	/*
+	 * We got the key we wanted. Free the others.
+	 */
+	for(i=0; i<ntofree; i++) {
+		pthread_key_delete(tofree[i]);
+	}
+}
+
+
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;
+	free(v);
+
+	pthread_setspecific(k1, (void*)ts.g);
+
+	crosscall_386(ts.fn);
+	return nil;
+}
+
+void (*x_cgo_inittls)(void) = inittls;
+void* (*x_cgo_threadentry)(void*) = threadentry;
diff --git a/src/runtime/cgo/gcc_android_amd64.c b/src/runtime/cgo/gcc_android_amd64.c
new file mode 100644
index 0000000..e006c49
--- /dev/null
+++ b/src/runtime/cgo/gcc_android_amd64.c
@@ -0,0 +1,92 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <string.h> /* for strerror */
+#include <pthread.h>
+#include <signal.h>
+#include "libcgo.h"
+
+static void* threadentry(void*);
+static pthread_key_t k1;
+
+#define magic1 (0x23581321345589ULL)
+
+static void
+inittls(void)
+{
+	uint64 x;
+	pthread_key_t tofree[128], k;
+	int i, ntofree;
+
+	/*
+	 * Same logic and code as gcc_darwin_386.c:/inittls.
+	 * Note that this is a temporary hack that should be fixed soon.
+	 * Android-L and M bionic's pthread implementation differ
+	 * significantly, and can change any time.
+	 * https://android-review.googlesource.com/#/c/134202
+	 *
+	 * We chose %fs:0x1d0 which seems to work in testing with Android
+	 * emulators (API22, API23) but it may break any time.
+	 *
+	 * TODO: fix this.
+	 *
+	 * The linker and runtime hard-code this constant offset
+	 * from %fs where we expect to find g. Disgusting.
+	 *
+	 * Known to src/cmd/link/internal/ld/sym.go:/0x1d0
+	 * and to src/runtime/sys_linux_amd64.s:/0x1d0 or /GOOS_android.
+	 *
+	 * As disgusting as on darwin/386 and darwin/amd64.
+	 */
+	ntofree = 0;
+	for(;;) {
+		if(pthread_key_create(&k, nil) < 0) {
+			fprintf(stderr, "runtime/cgo: pthread_key_create failed\n");
+			abort();
+		}
+		pthread_setspecific(k, (void*)magic1);
+		asm volatile("movq %%fs:0x1d0, %0" : "=r"(x));
+		pthread_setspecific(k, 0);
+		if(x == magic1) {
+			k1 = k;
+			break;
+		}
+		if(ntofree >= nelem(tofree)) {
+			fprintf(stderr, "runtime/cgo: could not obtain pthread_keys\n");
+			fprintf(stderr, "\ttried");
+			for(i=0; i<ntofree; i++)
+				fprintf(stderr, " %#x", (unsigned)tofree[i]);
+			fprintf(stderr, "\n");
+			abort();
+		}
+		tofree[ntofree++] = k;
+	}
+	// TODO: output to stderr is not useful for apps.
+	// Can we fall back to Android's log library?
+
+	/*
+	 * We got the key we wanted. Free the others.
+	 */
+	for(i=0; i<ntofree; i++) {
+		pthread_key_delete(tofree[i]);
+	}
+}
+
+
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;
+	free(v);
+
+	pthread_setspecific(k1, (void*)ts.g);
+
+	crosscall_amd64(ts.fn);
+	return nil;
+}
+
+void (*x_cgo_inittls)(void) = inittls;
+void* (*x_cgo_threadentry)(void*) = threadentry;
diff --git a/src/runtime/cgo/gcc_android_arm.c b/src/runtime/cgo/gcc_android_arm.c
index 85cd244..c7b13f9 100644
--- a/src/runtime/cgo/gcc_android_arm.c
+++ b/src/runtime/cgo/gcc_android_arm.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_android_arm64.c b/src/runtime/cgo/gcc_android_arm64.c
new file mode 100644
index 0000000..f8ad684
--- /dev/null
+++ b/src/runtime/cgo/gcc_android_arm64.c
@@ -0,0 +1,38 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <sys/limits.h>
+#include "libcgo.h"
+
+#define magic1 (0x23581321345589ULL)
+
+// inittls allocates a thread-local storage slot for g.
+//
+// It finds the first available slot using pthread_key_create and uses
+// it as the offset value for runtime.tlsg.
+static void
+inittls(void **tlsg, void **tlsbase)
+{
+	pthread_key_t k;
+	int i, err;
+
+	err = pthread_key_create(&k, nil);
+	if(err != 0) {
+		fatalf("pthread_key_create failed: %d", err);
+	}
+	pthread_setspecific(k, (void*)magic1);
+	for (i=0; i<PTHREAD_KEYS_MAX; i++) {
+		if (*(tlsbase+i) == (void*)magic1) {
+			*tlsg = (void*)(i*sizeof(void *));
+			pthread_setspecific(k, 0);
+			return;
+		}
+	}
+	fatalf("could not find pthread key");
+}
+
+void (*x_cgo_inittls)(void **tlsg, void **tlsbase) = inittls;
diff --git a/src/runtime/cgo/gcc_arm.S b/src/runtime/cgo/gcc_arm.S
index 980ab57..fe1c48b 100644
--- a/src/runtime/cgo/gcc_arm.S
+++ b/src/runtime/cgo/gcc_arm.S
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -36,10 +36,6 @@
 
 	pop {r4, r5, r6, r7, r8, r9, r10, r11, ip, pc}
 
-.globl EXT(__stack_chk_fail_local)
-EXT(__stack_chk_fail_local):
-1:
-	b 1b
 
 #ifdef __ELF__
 .section .note.GNU-stack,"",%progbits
diff --git a/src/runtime/cgo/gcc_arm64.S b/src/runtime/cgo/gcc_arm64.S
index b7379d1..59dce08 100644
--- a/src/runtime/cgo/gcc_arm64.S
+++ b/src/runtime/cgo/gcc_arm64.S
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -47,10 +47,6 @@
 	ldp x19, x20, [sp], #16
 	ret
 
-.globl EXT(__stack_chk_fail_local)
-EXT(__stack_chk_fail_local):
-1:
-	b 1b
 
 #ifdef __ELF__
 .section .note.GNU-stack,"",%progbits
diff --git a/src/runtime/cgo/gcc_context.c b/src/runtime/cgo/gcc_context.c
new file mode 100644
index 0000000..1e6cf7e
--- /dev/null
+++ b/src/runtime/cgo/gcc_context.c
@@ -0,0 +1,21 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build cgo
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris windows
+
+#include "libcgo.h"
+
+// Releases the cgo traceback context.
+void _cgo_release_context(uintptr_t ctxt) {
+	void (*pfn)(struct context_arg*);
+
+	pfn = _cgo_get_context_function();
+	if (ctxt != 0 && pfn != nil) {
+		struct context_arg arg;
+
+		arg.Context = ctxt;
+		(*pfn)(&arg);
+	}
+}
diff --git a/src/runtime/cgo/gcc_darwin_386.c b/src/runtime/cgo/gcc_darwin_386.c
index 6668ba4..effbcdf 100644
--- a/src/runtime/cgo/gcc_darwin_386.c
+++ b/src/runtime/cgo/gcc_darwin_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_darwin_amd64.c b/src/runtime/cgo/gcc_darwin_amd64.c
index dc679ac..15396b0 100644
--- a/src/runtime/cgo/gcc_darwin_amd64.c
+++ b/src/runtime/cgo/gcc_darwin_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_darwin_arm.c b/src/runtime/cgo/gcc_darwin_arm.c
index c303b91..dbf88c3 100644
--- a/src/runtime/cgo/gcc_darwin_arm.c
+++ b/src/runtime/cgo/gcc_darwin_arm.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c
index b64a063..a9eb4f2 100644
--- a/src/runtime/cgo/gcc_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_darwin_arm64.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_dragonfly_amd64.c b/src/runtime/cgo/gcc_dragonfly_amd64.c
index f79f652..b534dcc 100644
--- a/src/runtime/cgo/gcc_dragonfly_amd64.c
+++ b/src/runtime/cgo/gcc_dragonfly_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -56,6 +56,7 @@
 threadentry(void *v)
 {
 	ThreadStart ts;
+	stack_t ss;
 
 	ts = *(ThreadStart*)v;
 	free(v);
@@ -65,6 +66,17 @@
 	 */
 	setg_gcc((void*)ts.g);
 
+	// On DragonFly, a new thread inherits the signal stack of the
+	// creating thread. That confuses minit, so we remove that
+	// signal stack here before calling the regular mstart. It's
+	// a bit baroque to remove a signal stack here only to add one
+	// in minit, but it's a simple change that keeps DragonFly
+	// working like other OS's. At this point all signals are
+	// blocked, so there is no race.
+	memset(&ss, 0, sizeof ss);
+	ss.ss_flags = SS_DISABLE;
+	sigaltstack(&ss, nil);
+
 	crosscall_amd64(ts.fn);
 	return nil;
 }
diff --git a/src/runtime/cgo/gcc_fatalf.c b/src/runtime/cgo/gcc_fatalf.c
index 21c1acf..5ac419b 100644
--- a/src/runtime/cgo/gcc_fatalf.c
+++ b/src/runtime/cgo/gcc_fatalf.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_freebsd_386.c b/src/runtime/cgo/gcc_freebsd_386.c
index 074418f..d288666 100644
--- a/src/runtime/cgo/gcc_freebsd_386.c
+++ b/src/runtime/cgo/gcc_freebsd_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_freebsd_amd64.c b/src/runtime/cgo/gcc_freebsd_amd64.c
index f79f652..e532ad6 100644
--- a/src/runtime/cgo/gcc_freebsd_amd64.c
+++ b/src/runtime/cgo/gcc_freebsd_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_freebsd_arm.c b/src/runtime/cgo/gcc_freebsd_arm.c
index 2a86a91..c4e7574 100644
--- a/src/runtime/cgo/gcc_freebsd_arm.c
+++ b/src/runtime/cgo/gcc_freebsd_arm.c
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -51,7 +51,7 @@
 
 	// Not sure why the memset is necessary here,
 	// but without it, we get a bogus stack size
-	// out of pthread_attr_getstacksize.  C'est la Linux.
+	// out of pthread_attr_getstacksize. C'est la Linux.
 	memset(&attr, 0, sizeof attr);
 	pthread_attr_init(&attr);
 	size = 0;
diff --git a/src/runtime/cgo/gcc_libinit.c b/src/runtime/cgo/gcc_libinit.c
index c3e94f5..0bdf40a 100644
--- a/src/runtime/cgo/gcc_libinit.c
+++ b/src/runtime/cgo/gcc_libinit.c
@@ -1,19 +1,23 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build cgo
 // +build darwin dragonfly freebsd linux netbsd solaris
-// +build !ppc64,!ppc64le
 
 #include <pthread.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h> // strerror
+#include "libcgo.h"
 
 static pthread_cond_t runtime_init_cond = PTHREAD_COND_INITIALIZER;
 static pthread_mutex_t runtime_init_mu = PTHREAD_MUTEX_INITIALIZER;
 static int runtime_init_done;
 
+// The context function, used when tracing back C calls into Go.
+static void (*cgo_context_function)(struct context_arg*);
+
 void
 x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
 	pthread_t p;
@@ -24,13 +28,35 @@
 	}
 }
 
-void
+uintptr_t
 _cgo_wait_runtime_init_done() {
+	void (*pfn)(struct context_arg*);
+
 	pthread_mutex_lock(&runtime_init_mu);
 	while (runtime_init_done == 0) {
 		pthread_cond_wait(&runtime_init_cond, &runtime_init_mu);
 	}
+
+	// TODO(iant): For the case of a new C thread calling into Go, such
+	// as when using -buildmode=c-archive, we know that Go runtime
+	// initialization is complete but we do not know that all Go init
+	// functions have been run. We should not fetch cgo_context_function
+	// until they have been, because that is where a call to
+	// SetCgoTraceback is likely to occur. We are going to wait for Go
+	// initialization to be complete anyhow, later, by waiting for
+	// main_init_done to be closed in cgocallbackg1. We should wait here
+	// instead. See also issue #15943.
+	pfn = cgo_context_function;
+
 	pthread_mutex_unlock(&runtime_init_mu);
+	if (pfn != nil) {
+		struct context_arg arg;
+
+		arg.Context = 0;
+		(*pfn)(&arg);
+		return arg.Context;
+	}
+	return 0;
 }
 
 void
@@ -40,3 +66,21 @@
 	pthread_cond_broadcast(&runtime_init_cond);
 	pthread_mutex_unlock(&runtime_init_mu);
 }
+
+// Sets the context function to call to record the traceback context
+// when calling a Go function from C code. Called from runtime.SetCgoTraceback.
+void x_cgo_set_context_function(void (*context)(struct context_arg*)) {
+	pthread_mutex_lock(&runtime_init_mu);
+	cgo_context_function = context;
+	pthread_mutex_unlock(&runtime_init_mu);
+}
+
+// Gets the context function.
+void (*(_cgo_get_context_function(void)))(struct context_arg*) {
+	void (*ret)(struct context_arg*);
+
+	pthread_mutex_lock(&runtime_init_mu);
+	ret = cgo_context_function;
+	pthread_mutex_unlock(&runtime_init_mu);
+	return ret;
+}
diff --git a/src/runtime/cgo/gcc_libinit_linux_ppc64x.c b/src/runtime/cgo/gcc_libinit_linux_ppc64x.c
deleted file mode 100644
index 82413a5..0000000
--- a/src/runtime/cgo/gcc_libinit_linux_ppc64x.c
+++ /dev/null
@@ -1,26 +0,0 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// TODO: see issue #10410
-// +build linux
-// +build ppc64 ppc64le
-
-#include <stdio.h>
-#include <stdlib.h>
-
-void
-x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
-	fprintf(stderr, "x_cgo_sys_thread_create not implemented");
-	abort();
-}
-
-void
-_cgo_wait_runtime_init_done() {
-	// TODO(spetrovic): implement this method.
-}
-
-void
-x_cgo_notify_runtime_init_done(void* dummy) {
-	// TODO(spetrovic): implement this method.
-}
\ No newline at end of file
diff --git a/src/runtime/cgo/gcc_libinit_openbsd.c b/src/runtime/cgo/gcc_libinit_openbsd.c
index 7e5b646..626bf8a 100644
--- a/src/runtime/cgo/gcc_libinit_openbsd.c
+++ b/src/runtime/cgo/gcc_libinit_openbsd.c
@@ -1,9 +1,13 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include <stdio.h>
 #include <stdlib.h>
+#include "libcgo.h"
+
+// The context function, used when tracing back C calls into Go.
+static void (*cgo_context_function)(struct context_arg*);
 
 void
 x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
@@ -11,12 +15,36 @@
 	abort();
 }
 
-void
+uintptr_t
 _cgo_wait_runtime_init_done() {
+	void (*pfn)(struct context_arg*);
+
 	// TODO(spetrovic): implement this method.
+
+	pfn = _cgo_get_context_function();
+	if (pfn != nil) {
+		struct context_arg arg;
+
+		arg.Context = 0;
+		(*pfn)(&arg);
+		return arg.Context;
+	}
+	return 0;
 }
 
 void
 x_cgo_notify_runtime_init_done(void* dummy) {
 	// TODO(spetrovic): implement this method.
-}
\ No newline at end of file
+}
+
+// Sets the context function to call to record the traceback context
+// when calling a Go function from C code. Called from runtime.SetCgoTraceback.
+void x_cgo_set_context_function(void (*context)(struct context_arg*)) {
+	// TODO(iant): Needs synchronization.
+	cgo_context_function = context;
+}
+
+// Gets the context function.
+void (*(_cgo_get_context_function(void)))(struct context_arg*) {
+	return cgo_context_function;
+}
diff --git a/src/runtime/cgo/gcc_libinit_windows.c b/src/runtime/cgo/gcc_libinit_windows.c
index 7e5b646..0824e20 100644
--- a/src/runtime/cgo/gcc_libinit_windows.c
+++ b/src/runtime/cgo/gcc_libinit_windows.c
@@ -1,22 +1,123 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build cgo
+
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#include <process.h>
+
 #include <stdio.h>
 #include <stdlib.h>
 
+#include "libcgo.h"
+
+static volatile long runtime_init_once_gate = 0;
+static volatile long runtime_init_once_done = 0;
+
+static CRITICAL_SECTION runtime_init_cs;
+
+static HANDLE runtime_init_wait;
+static int runtime_init_done;
+
+// Pre-initialize the runtime synchronization objects
 void
-x_cgo_sys_thread_create(void* (*func)(void*), void* arg) {
-	fprintf(stderr, "x_cgo_sys_thread_create not implemented");
-	abort();
+_cgo_preinit_init() {
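+	// Create a manual-reset event: once set it stays signaled, so
+	// every waiter in _cgo_wait_runtime_init_done is released.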
+	runtime_init_wait = CreateEvent(NULL, TRUE, FALSE, NULL);
+	if (runtime_init_wait == NULL) {
+		fprintf(stderr, "runtime: failed to create runtime initialization wait event.\n");
+		abort();
+	}
+
+	InitializeCriticalSection(&runtime_init_cs);
+}
+
+// Make sure that the preinit sequence has run.
+void
+_cgo_maybe_run_preinit() {
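+	// Threads may race to get here. The first to bump the gate
+	// from 0 to 1 performs the one-time initialization and then
+	// publishes runtime_init_once_done; the rest spin until done
+	// becomes nonzero.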
+	if (!InterlockedExchangeAdd(&runtime_init_once_done, 0)) {
+		if (InterlockedIncrement(&runtime_init_once_gate) == 1) {
+			_cgo_preinit_init();
+			InterlockedIncrement(&runtime_init_once_done);
+		} else {
+			// Decrement to avoid overflow.
+			InterlockedDecrement(&runtime_init_once_gate);
+			while (!InterlockedExchangeAdd(&runtime_init_once_done, 0)) {
+				Sleep(0);
+			}
+		}
+	}
 }
 
 void
+x_cgo_sys_thread_create(void (*func)(void*), void* arg) {
+	uintptr_t thandle;
+
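+	// Use the C runtime's _beginthread so that CRT per-thread
+	// state is initialized for the new thread.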
+	thandle = _beginthread(func, 0, arg);
+	if(thandle == -1) {
+		fprintf(stderr, "runtime: failed to create new OS thread (%d)\n", errno);
+		abort();
+	}
+}
+
+int
+_cgo_is_runtime_initialized() {
+	EnterCriticalSection(&runtime_init_cs);
+	int status = runtime_init_done;
+	LeaveCriticalSection(&runtime_init_cs);
+	return status;
+}
+
+uintptr_t
 _cgo_wait_runtime_init_done() {
-	// TODO(spetrovic): implement this method.
+	void (*pfn)(struct context_arg*);
+
+	_cgo_maybe_run_preinit();
+	while (!_cgo_is_runtime_initialized()) {
+		WaitForSingleObject(runtime_init_wait, INFINITE);
+	}
+	pfn = _cgo_get_context_function();
+	if (pfn != nil) {
+		struct context_arg arg;
+
+		arg.Context = 0;
+		(*pfn)(&arg);
+		return arg.Context;
+	}
+	return 0;
 }
 
 void
 x_cgo_notify_runtime_init_done(void* dummy) {
-	// TODO(spetrovic): implement this method.
-}
\ No newline at end of file
+	_cgo_maybe_run_preinit();
+
+	EnterCriticalSection(&runtime_init_cs);
+	runtime_init_done = 1;
+	LeaveCriticalSection(&runtime_init_cs);
+
+	if (!SetEvent(runtime_init_wait)) {
+		fprintf(stderr, "runtime: failed to signal runtime initialization complete.\n");
+		abort();
+	}
+}
+
+// The context function, used when tracing back C calls into Go.
+static void (*cgo_context_function)(struct context_arg*);
+
+// Sets the context function to call to record the traceback context
+// when calling a Go function from C code. Called from runtime.SetCgoTraceback.
+void x_cgo_set_context_function(void (*context)(struct context_arg*)) {
+	EnterCriticalSection(&runtime_init_cs);
+	cgo_context_function = context;
+	LeaveCriticalSection(&runtime_init_cs);
+}
+
+// Gets the context function.
+void (*(_cgo_get_context_function(void)))(struct context_arg*) {
+	void (*ret)(struct context_arg*);
+
+	EnterCriticalSection(&runtime_init_cs);
+	ret = cgo_context_function;
+	LeaveCriticalSection(&runtime_init_cs);
+	return ret;
+}
diff --git a/src/runtime/cgo/gcc_linux_386.c b/src/runtime/cgo/gcc_linux_386.c
index 9801c87..30fe92b 100644
--- a/src/runtime/cgo/gcc_linux_386.c
+++ b/src/runtime/cgo/gcc_linux_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -10,6 +10,10 @@
 static void *threadentry(void*);
 static void (*setg_gcc)(void*);
 
+// These will be set in gcc_android_386.c for android-specific customization.
+void (*x_cgo_inittls)(void);
+void* (*x_cgo_threadentry)(void*);
+
 void
 x_cgo_init(G *g, void (*setg)(void*))
 {
@@ -21,6 +25,10 @@
 	pthread_attr_getstacksize(&attr, &size);
 	g->stacklo = (uintptr)&attr - size + 4096;
 	pthread_attr_destroy(&attr);
+
+	if (x_cgo_inittls) {
+		x_cgo_inittls();
+	}
 }
 
 
@@ -38,7 +46,7 @@
 
 	// Not sure why the memset is necessary here,
 	// but without it, we get a bogus stack size
-	// out of pthread_attr_getstacksize.  C'est la Linux.
+	// out of pthread_attr_getstacksize. C'est la Linux.
 	memset(&attr, 0, sizeof attr);
 	pthread_attr_init(&attr);
 	size = 0;
@@ -57,6 +65,10 @@
 static void*
 threadentry(void *v)
 {
+	if (x_cgo_threadentry) {
+		return x_cgo_threadentry(v);
+	}
+
 	ThreadStart ts;
 
 	ts = *(ThreadStart*)v;
diff --git a/src/runtime/cgo/gcc_linux_amd64.c b/src/runtime/cgo/gcc_linux_amd64.c
index 275d5dd..0c34c66 100644
--- a/src/runtime/cgo/gcc_linux_amd64.c
+++ b/src/runtime/cgo/gcc_linux_amd64.c
@@ -1,26 +1,56 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include <pthread.h>
+#include <errno.h>
 #include <string.h> // strerror
 #include <signal.h>
+#include <stdlib.h>
 #include "libcgo.h"
 
 static void* threadentry(void*);
 static void (*setg_gcc)(void*);
 
+// These will be set in gcc_android_amd64.c for android-specific customization.
+void (*x_cgo_inittls)(void);
+void* (*x_cgo_threadentry)(void*);
+
 void
 x_cgo_init(G* g, void (*setg)(void*))
 {
-	pthread_attr_t attr;
+	pthread_attr_t *attr;
 	size_t size;
 
+	/* The memory sanitizer distributed with versions of clang
+	   before 3.8 has a bug: if you call mmap before malloc, mmap
+	   may return an address that is later overwritten by the msan
+	   library. Avoid this problem by forcing a call to malloc
+	   here, before we ever call mmap.
+
+	   This is only required for the memory sanitizer, so it's
+	   unfortunate that we always run it. It should be possible
+	   to remove this when we no longer care about versions of
+	   clang before 3.8. The test for this is
+	   misc/cgo/testsanitizers.
+
+	   GCC works hard to eliminate a seemingly unnecessary call to
+	   malloc, so we actually use the memory we allocate. */
+
 	setg_gcc = setg;
-	pthread_attr_init(&attr);
-	pthread_attr_getstacksize(&attr, &size);
-	g->stacklo = (uintptr)&attr - size + 4096;
-	pthread_attr_destroy(&attr);
+	attr = (pthread_attr_t*)malloc(sizeof *attr);
+	if (attr == NULL) {
+		fatalf("malloc failed: %s", strerror(errno));
+	}
+	pthread_attr_init(attr);
+	pthread_attr_getstacksize(attr, &size);
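+	// attr now lives on the heap, so take the address of the
+	// stack-local size to approximate the current stack pointer.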
+	g->stacklo = (uintptr)&size - size + 4096;
+	pthread_attr_destroy(attr);
+	free(attr);
+
+	if (x_cgo_inittls) {
+		x_cgo_inittls();
+	}
 }
 
 
@@ -52,10 +82,16 @@
 static void*
 threadentry(void *v)
 {
+	if (x_cgo_threadentry) {
+		return x_cgo_threadentry(v);
+	}
+
 	ThreadStart ts;
 
 	ts = *(ThreadStart*)v;
+	_cgo_tsan_acquire();
 	free(v);
+	_cgo_tsan_release();
 
 	/*
 	 * Set specific keys.
diff --git a/src/runtime/cgo/gcc_linux_arm.c b/src/runtime/cgo/gcc_linux_arm.c
index 7d4b4d6..945c3f1 100644
--- a/src/runtime/cgo/gcc_linux_arm.c
+++ b/src/runtime/cgo/gcc_linux_arm.c
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -26,7 +26,7 @@
 
 	// Not sure why the memset is necessary here,
 	// but without it, we get a bogus stack size
-	// out of pthread_attr_getstacksize.  C'est la Linux.
+	// out of pthread_attr_getstacksize. C'est la Linux.
 	memset(&attr, 0, sizeof attr);
 	pthread_attr_init(&attr);
 	size = 0;
diff --git a/src/runtime/cgo/gcc_linux_arm64.c b/src/runtime/cgo/gcc_linux_arm64.c
index ea11cf5..ca9ba0b 100644
--- a/src/runtime/cgo/gcc_linux_arm64.c
+++ b/src/runtime/cgo/gcc_linux_arm64.c
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -26,7 +26,7 @@
 
 	// Not sure why the memset is necessary here,
 	// but without it, we get a bogus stack size
-	// out of pthread_attr_getstacksize.  C'est la Linux.
+	// out of pthread_attr_getstacksize. C'est la Linux.
 	memset(&attr, 0, sizeof attr);
 	pthread_attr_init(&attr);
 	size = 0;
diff --git a/src/runtime/cgo/gcc_linux_mips64x.c b/src/runtime/cgo/gcc_linux_mips64x.c
new file mode 100644
index 0000000..5bf5197
--- /dev/null
+++ b/src/runtime/cgo/gcc_linux_mips64x.c
@@ -0,0 +1,77 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build cgo
+// +build linux
+// +build mips64 mips64le
+
+#include <pthread.h>
+#include <string.h>
+#include <signal.h>
+#include "libcgo.h"
+
+static void *threadentry(void*);
+
+void (*x_cgo_inittls)(void **tlsg, void **tlsbase);
+void (*setg_gcc)(void*);
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+	pthread_attr_t attr;
+	sigset_t ign, oset;
+	pthread_t p;
+	size_t size;
+	int err;
+
+	sigfillset(&ign);
+	pthread_sigmask(SIG_SETMASK, &ign, &oset);
+
+	// Not sure why the memset is necessary here,
+	// but without it, we get a bogus stack size
+	// out of pthread_attr_getstacksize. C'est la Linux.
+	memset(&attr, 0, sizeof attr);
+	pthread_attr_init(&attr);
+	size = 0;
+	pthread_attr_getstacksize(&attr, &size);
+	// Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+	ts->g->stackhi = size;
+	err = pthread_create(&p, &attr, threadentry, ts);
+
+	pthread_sigmask(SIG_SETMASK, &oset, nil);
+
+	if (err != 0) {
+		fatalf("pthread_create failed: %s", strerror(err));
+	}
+}
+
+extern void crosscall1(void (*fn)(void), void (*setg_gcc)(void*), void *g);
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;
+	free(v);
+
+	crosscall1(ts.fn, setg_gcc, (void*)ts.g);
+	return nil;
+}
+
+void
+x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
+{
+	pthread_attr_t attr;
+	size_t size;
+
+	setg_gcc = setg;
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	g->stacklo = (uintptr)&attr - size + 4096;
+	pthread_attr_destroy(&attr);
+
+	if (x_cgo_inittls) {
+		x_cgo_inittls(tlsg, tlsbase);
+	}
+}
diff --git a/src/runtime/cgo/gcc_linux_ppc64x.c b/src/runtime/cgo/gcc_linux_ppc64x.c
index b176295..fb19805 100644
--- a/src/runtime/cgo/gcc_linux_ppc64x.c
+++ b/src/runtime/cgo/gcc_linux_ppc64x.c
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_linux_s390x.c b/src/runtime/cgo/gcc_linux_s390x.c
new file mode 100644
index 0000000..81e3b33
--- /dev/null
+++ b/src/runtime/cgo/gcc_linux_s390x.c
@@ -0,0 +1,68 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <pthread.h>
+#include <string.h>
+#include <signal.h>
+#include "libcgo.h"
+
+static void *threadentry(void*);
+
+void (*x_cgo_inittls)(void **tlsg, void **tlsbase);
+static void (*setg_gcc)(void*);
+
+void
+x_cgo_init(G *g, void (*setg)(void*), void **tlsbase)
+{
+	pthread_attr_t attr;
+	size_t size;
+
+	setg_gcc = setg;
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	g->stacklo = (uintptr)&attr - size + 4096;
+	pthread_attr_destroy(&attr);
+}
+
+void
+_cgo_sys_thread_start(ThreadStart *ts)
+{
+	pthread_attr_t attr;
+	sigset_t ign, oset;
+	pthread_t p;
+	size_t size;
+	int err;
+
+	sigfillset(&ign);
+	pthread_sigmask(SIG_SETMASK, &ign, &oset);
+
+	pthread_attr_init(&attr);
+	pthread_attr_getstacksize(&attr, &size);
+	// Leave stacklo=0 and set stackhi=size; mstack will do the rest.
+	ts->g->stackhi = size;
+	err = pthread_create(&p, &attr, threadentry, ts);
+
+	pthread_sigmask(SIG_SETMASK, &oset, nil);
+
+	if (err != 0) {
+		fatalf("pthread_create failed: %s", strerror(err));
+	}
+}
+
+extern void crosscall_s390x(void (*fn)(void), void *g);
+
+static void*
+threadentry(void *v)
+{
+	ThreadStart ts;
+
+	ts = *(ThreadStart*)v;
+	free(v);
+
+	// Save g for this thread in C TLS
+	setg_gcc((void*)ts.g);
+
+	crosscall_s390x(ts.fn, (void*)ts.g);
+	return nil;
+}
diff --git a/src/runtime/cgo/gcc_mips64x.S b/src/runtime/cgo/gcc_mips64x.S
new file mode 100644
index 0000000..adeb7ae
--- /dev/null
+++ b/src/runtime/cgo/gcc_mips64x.S
@@ -0,0 +1,79 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+/*
+ * void crosscall1(void (*fn)(void), void (*setg_gcc)(void *g), void *g)
+ *
+ * Calling into the gc tool chain, where all registers are caller save.
+ * Called from standard MIPS N64 ABI, where $16-$23, $28, $30, and $f24-$f31
+ * are callee-save, so they must be saved explicitly, along with $31 (LR).
+ */
+.globl crosscall1
+.set noat
+crosscall1:
+	daddiu	$29, $29, -160
+	sd	$31, 0($29)
+	sd	$16, 8($29)
+	sd	$17, 16($29)
+	sd	$18, 24($29)
+	sd	$19, 32($29)
+	sd	$20, 40($29)
+	sd	$21, 48($29)
+	sd	$22, 56($29)
+	sd	$23, 64($29)
+	sd	$28, 72($29)
+	sd	$30, 80($29)
+	sdc1	$f24, 88($29)
+	sdc1	$f25, 96($29)
+	sdc1	$f26, 104($29)
+	sdc1	$f27, 112($29)
+	sdc1	$f28, 120($29)
+	sdc1	$f29, 128($29)
+	sdc1	$f30, 136($29)
+	sdc1	$f31, 144($29)
+
+	dla	$23,_cgo_reginit
+
+	// prepare SB register = pc & 0xffffffff00000000
+	bal	1f
+1:
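+	// bal set $31 to its return address, i.e. (roughly) the current PC.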
+	dsrl	$28, $31, 32
+	dsll	$28, $28, 32
+
+	move	$20, $4 // save R4
+	jalr	$23	// call _cgo_reginit, set up Go ABI constant registers
+	move	$1, $6
+	jalr	$5	// call setg_gcc (clobbers R4)
+	jalr	$20	// call fn
+
+	ld	$16, 8($29)
+	ld	$17, 16($29)
+	ld	$18, 24($29)
+	ld	$19, 32($29)
+	ld	$20, 40($29)
+	ld	$21, 48($29)
+	ld	$22, 56($29)
+	ld	$23, 64($29)
+	ld	$28, 72($29)
+	ld	$30, 80($29)
+	ldc1	$f24, 88($29)
+	ldc1	$f25, 96($29)
+	ldc1	$f26, 104($29)
+	ldc1	$f27, 112($29)
+	ldc1	$f28, 120($29)
+	ldc1	$f29, 128($29)
+	ldc1	$f30, 136($29)
+	ldc1	$f31, 144($29)
+	ld	$31, 0($29)
+
+	daddiu	$29, $29, 160
+	jr	$31
+
+.set at
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/runtime/cgo/gcc_mmap.c b/src/runtime/cgo/gcc_mmap.c
new file mode 100644
index 0000000..088bcb2
--- /dev/null
+++ b/src/runtime/cgo/gcc_mmap.c
@@ -0,0 +1,25 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux,amd64
+
+#include <errno.h>
+#include <stdint.h>
+#include <sys/mman.h>
+
+#include "libcgo.h"
+
+void *
+x_cgo_mmap(void *addr, uintptr_t length, int32_t prot, int32_t flags, int32_t fd, uint32_t offset) {
+	void *p;
+
+	_cgo_tsan_acquire();
+	p = mmap(addr, length, prot, flags, fd, offset);
+	_cgo_tsan_release();
+	if (p == MAP_FAILED) {
+		/* This is what the Go code expects on failure.  */
+		p = (void *) (uintptr_t) errno;
+	}
+	return p;
+}
diff --git a/src/runtime/cgo/gcc_netbsd_386.c b/src/runtime/cgo/gcc_netbsd_386.c
index 2505e6d..99558ea 100644
--- a/src/runtime/cgo/gcc_netbsd_386.c
+++ b/src/runtime/cgo/gcc_netbsd_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -55,6 +55,7 @@
 threadentry(void *v)
 {
 	ThreadStart ts;
+	stack_t ss;
 
 	ts = *(ThreadStart*)v;
 	free(v);
@@ -64,6 +65,17 @@
 	 */
 	setg_gcc((void*)ts.g);
 
+	// On NetBSD, a new thread inherits the signal stack of the
+	// creating thread. That confuses minit, so we remove that
+	// signal stack here before calling the regular mstart. It's
+	// a bit baroque to remove a signal stack here only to add one
+	// in minit, but it's a simple change that keeps NetBSD
+	// working like other OS's. At this point all signals are
+	// blocked, so there is no race.
+	memset(&ss, 0, sizeof ss);
+	ss.ss_flags = SS_DISABLE;
+	sigaltstack(&ss, nil);
+
 	crosscall_386(ts.fn);
 	return nil;
 }
diff --git a/src/runtime/cgo/gcc_netbsd_amd64.c b/src/runtime/cgo/gcc_netbsd_amd64.c
index 8f64650..f5c8b1e 100644
--- a/src/runtime/cgo/gcc_netbsd_amd64.c
+++ b/src/runtime/cgo/gcc_netbsd_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -56,6 +56,7 @@
 threadentry(void *v)
 {
 	ThreadStart ts;
+	stack_t ss;
 
 	ts = *(ThreadStart*)v;
 	free(v);
@@ -65,6 +66,17 @@
 	 */
 	setg_gcc((void*)ts.g);
 
+	// On NetBSD, a new thread inherits the signal stack of the
+	// creating thread. That confuses minit, so we remove that
+	// signal stack here before calling the regular mstart. It's
+	// a bit baroque to remove a signal stack here only to add one
+	// in minit, but it's a simple change that keeps NetBSD
+	// working like other OS's. At this point all signals are
+	// blocked, so there is no race.
+	memset(&ss, 0, sizeof ss);
+	ss.ss_flags = SS_DISABLE;
+	sigaltstack(&ss, nil);
+
 	crosscall_amd64(ts.fn);
 	return nil;
 }
diff --git a/src/runtime/cgo/gcc_netbsd_arm.c b/src/runtime/cgo/gcc_netbsd_arm.c
index 7a98c0d..97ce908 100644
--- a/src/runtime/cgo/gcc_netbsd_arm.c
+++ b/src/runtime/cgo/gcc_netbsd_arm.c
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -57,10 +57,22 @@
 threadentry(void *v)
 {
 	ThreadStart ts;
+	stack_t ss;
 
 	ts = *(ThreadStart*)v;
 	free(v);
 
+	// On NetBSD, a new thread inherits the signal stack of the
+	// creating thread. That confuses minit, so we remove that
+	// signal stack here before calling the regular mstart. It's
+	// a bit baroque to remove a signal stack here only to add one
+	// in minit, but it's a simple change that keeps NetBSD
+	// working like other OS's. At this point all signals are
+	// blocked, so there is no race.
+	memset(&ss, 0, sizeof ss);
+	ss.ss_flags = SS_DISABLE;
+	sigaltstack(&ss, nil);
+
 	crosscall_arm1(ts.fn, setg_gcc, (void*)ts.g);
 	return nil;
 }
diff --git a/src/runtime/cgo/gcc_openbsd_386.c b/src/runtime/cgo/gcc_openbsd_386.c
index c4be9a0..1bc61ff 100644
--- a/src/runtime/cgo/gcc_openbsd_386.c
+++ b/src/runtime/cgo/gcc_openbsd_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -13,9 +13,15 @@
 static void* threadentry(void*);
 static void (*setg_gcc)(void*);
 
-// TCB_SIZE is sizeof(struct thread_control_block),
-// as defined in /usr/src/lib/librthread/tcb.h
+// TCB_SIZE is sizeof(struct thread_control_block), as defined in
+// /usr/src/lib/librthread/tcb.h on OpenBSD 5.9 and earlier.
 #define TCB_SIZE (4 * sizeof(void *))
+
+// TIB_SIZE is sizeof(struct tib), as defined in
+// /usr/include/tib.h on OpenBSD 6.0 and later.
+#define TIB_SIZE (4 * sizeof(void *) + 6 * sizeof(int))
+
+// TLS_SIZE is the size of TLS needed for Go.
 #define TLS_SIZE (2 * sizeof(void *))
 
 void *__get_tcb(void);
@@ -29,25 +35,38 @@
 	void *arg;
 };
 
+static int has_tib = 0;
+
 static void
 tcb_fixup(int mainthread)
 {
-	void *newtcb, *oldtcb;
+	void *tls, *newtcb, *oldtcb;
+	size_t tls_size, tcb_size;
+
+	// TODO(jsing): Remove once OpenBSD 6.1 is released and OpenBSD 5.9 is
+	// no longer supported.
 
 	// The OpenBSD ld.so(1) does not currently support PT_TLS. As a result,
 	// we need to allocate our own TLS space while preserving the existing
-	// TCB that has been setup via librthread.
+	// TCB or TIB that has been setup via librthread.
 
-	newtcb = malloc(TCB_SIZE + TLS_SIZE);
-	if(newtcb == NULL)
+	tcb_size = has_tib ? TIB_SIZE : TCB_SIZE;
+	tls_size = TLS_SIZE + tcb_size;
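+	// Layout of the allocation: TLS_SIZE bytes of zeroed TLS slots,
+	// followed by a copy of the existing TCB or TIB.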
+	tls = malloc(tls_size);
+	if(tls == NULL)
 		abort();
 
 	// The signal trampoline expects the TLS slots to be zeroed.
-	bzero(newtcb, TLS_SIZE);
+	bzero(tls, TLS_SIZE);
 
 	oldtcb = __get_tcb();
-	bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE);
-	__set_tcb(newtcb + TLS_SIZE);
+	newtcb = tls + TLS_SIZE;
+	bcopy(oldtcb, newtcb, tcb_size);
+	if(has_tib) {
+		 // Fix up self pointer.
+		// Fix up self pointer.
+	}
+	__set_tcb(newtcb);
 
 	// NOTE(jsing, minux): we can't free oldtcb without causing double-free
 	// problem. so newtcb will be memory leaks. Get rid of this when OpenBSD
@@ -79,6 +98,10 @@
 		fprintf(stderr, "runtime/cgo: dlsym failed to find pthread_create: %s\n", dlerror());
 		abort();
 	}
+	// _rthread_init is hidden in versions of OpenBSD librthread that have the TIB.
+	if(dlsym(handle, "_rthread_init") == NULL) {
+		has_tib = 1;
+	}
 	dlclose(handle);
 }
 
@@ -144,6 +167,7 @@
 
 	pthread_attr_init(&attr);
 	pthread_attr_getstacksize(&attr, &size);
+
 	// Leave stacklo=0 and set stackhi=size; mstack will do the rest.
 	ts->g->stackhi = size;
 	err = sys_pthread_create(&p, &attr, threadentry, ts);
diff --git a/src/runtime/cgo/gcc_openbsd_amd64.c b/src/runtime/cgo/gcc_openbsd_amd64.c
index 8522cd4..4d4d143 100644
--- a/src/runtime/cgo/gcc_openbsd_amd64.c
+++ b/src/runtime/cgo/gcc_openbsd_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -13,9 +13,15 @@
 static void* threadentry(void*);
 static void (*setg_gcc)(void*);
 
-// TCB_SIZE is sizeof(struct thread_control_block),
-// as defined in /usr/src/lib/librthread/tcb.h
+// TCB_SIZE is sizeof(struct thread_control_block), as defined in
+// /usr/src/lib/librthread/tcb.h on OpenBSD 5.9 and earlier.
 #define TCB_SIZE (4 * sizeof(void *))
+
+// TIB_SIZE is sizeof(struct tib), as defined in
+// /usr/include/tib.h on OpenBSD 6.0 and later.
+#define TIB_SIZE (4 * sizeof(void *) + 6 * sizeof(int))
+
+// TLS_SIZE is the size of TLS needed for Go.
 #define TLS_SIZE (2 * sizeof(void *))
 
 void *__get_tcb(void);
@@ -29,25 +35,38 @@
 	void *arg;
 };
 
+static int has_tib = 0;
+
 static void
 tcb_fixup(int mainthread)
 {
-	void *newtcb, *oldtcb;
+	void *tls, *newtcb, *oldtcb;
+	size_t tls_size, tcb_size;
+
+	// TODO(jsing): Remove once OpenBSD 6.1 is released and OpenBSD 5.9 is
+	// no longer supported.
 
 	// The OpenBSD ld.so(1) does not currently support PT_TLS. As a result,
 	// we need to allocate our own TLS space while preserving the existing
-	// TCB that has been setup via librthread.
+	// TCB or TIB that has been setup via librthread.
 
-	newtcb = malloc(TCB_SIZE + TLS_SIZE);
-	if(newtcb == NULL)
+	tcb_size = has_tib ? TIB_SIZE : TCB_SIZE;
+	tls_size = TLS_SIZE + tcb_size;
+	tls = malloc(tls_size);
+	if(tls == NULL)
 		abort();
 
 	// The signal trampoline expects the TLS slots to be zeroed.
-	bzero(newtcb, TLS_SIZE);
+	bzero(tls, TLS_SIZE);
 
 	oldtcb = __get_tcb();
-	bcopy(oldtcb, newtcb + TLS_SIZE, TCB_SIZE);
-	__set_tcb(newtcb + TLS_SIZE);
+	newtcb = tls + TLS_SIZE;
+	bcopy(oldtcb, newtcb, tcb_size);
+	if(has_tib) {
+		// Fix up self pointer.
+		*(uintptr_t *)(newtcb) = (uintptr_t)newtcb;
+	}
+	__set_tcb(newtcb);
 
 	// NOTE(jsing, minux): we can't free oldtcb without causing double-free
 	// problem. so newtcb will be memory leaks. Get rid of this when OpenBSD
@@ -79,6 +98,10 @@
 		fprintf(stderr, "runtime/cgo: dlsym failed to find pthread_create: %s\n", dlerror());
 		abort();
 	}
+	// _rthread_init is hidden in versions of OpenBSD librthread that have the TIB.
+	if(dlsym(handle, "_rthread_init") == NULL) {
+		has_tib = 1;
+	}
 	dlclose(handle);
 }
 
diff --git a/src/runtime/cgo/gcc_ppc64x.S b/src/runtime/cgo/gcc_ppc64x.S
index 05af070..5f37a8b 100644
--- a/src/runtime/cgo/gcc_ppc64x.S
+++ b/src/runtime/cgo/gcc_ppc64x.S
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -31,11 +31,13 @@
 
 	// Set up Go ABI constant registers
 	bl	_cgo_reginit
+	nop
 
 	// Restore g pointer (r30 in Go ABI, which may have been clobbered by C)
 	mr	%r30, %r4
 
 	// Call fn
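+	// The ELFv2 ABI expects the entry address of the called
+	// function in r12 when calling through a pointer.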
+	mr	%r12, %r3
 	mtctr	%r3
 	bctrl
 
@@ -129,11 +131,6 @@
 
 	blr
 
-.globl EXT(__stack_chk_fail_local)
-EXT(__stack_chk_fail_local):
-1:
-	// TODO(austin)
-	b 1b
 
 #ifdef __ELF__
 .section .note.GNU-stack,"",%progbits
diff --git a/src/runtime/cgo/gcc_s390x.S b/src/runtime/cgo/gcc_s390x.S
new file mode 100644
index 0000000..022f82d
--- /dev/null
+++ b/src/runtime/cgo/gcc_s390x.S
@@ -0,0 +1,46 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+ * void crosscall_s390x(void (*fn)(void), void *g)
+ *
+ * Calling into the go tool chain, where all registers are caller save.
+ * Called from standard s390x C ABI, where r6-r13, r15, and f0, f2, f4 and f6 are
+ * callee-save, so they must be saved explicitly.
+ */
+.globl crosscall_s390x
+crosscall_s390x:
+	/*
+	 * save r6-r15, f0, f2, f4 and f6 in the
+	 * register save area of the calling function
+	 */
+	stmg	%r6, %r15, 48(%r15)
+	stdy	%f0, 128(%r15)
+	stdy	%f2, 136(%r15)
+	stdy	%f4, 144(%r15)
+	stdy	%f6, 152(%r15)
+
+	/* set r0 to 0 */
+	xgr	%r0, %r0
+
+	/* restore g pointer */
+	lgr	%r13, %r3
+
+	/* grow stack 8 bytes and call fn */
+	agfi    %r15, -8
+	basr    %r14, %r2
+	agfi	%r15, 8
+
+	/* restore registers */
+	lmg	%r6, %r15, 48(%r15)
+	ldy	%f0, 128(%r15)
+	ldy	%f2, 136(%r15)
+	ldy	%f4, 144(%r15)
+	ldy	%f6, 152(%r15)
+
+	br      %r14 /* restored by lmg */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/src/runtime/cgo/gcc_setenv.c b/src/runtime/cgo/gcc_setenv.c
index ca29dcb..8708d40 100644
--- a/src/runtime/cgo/gcc_setenv.c
+++ b/src/runtime/cgo/gcc_setenv.c
@@ -1,7 +1,8 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build cgo
 // +build darwin dragonfly freebsd linux netbsd openbsd solaris
 
 #include "libcgo.h"
diff --git a/src/runtime/cgo/gcc_signal_darwin_armx.c b/src/runtime/cgo/gcc_signal_darwin_armx.c
index e36fe26..02c54d8 100644
--- a/src/runtime/cgo/gcc_signal_darwin_armx.c
+++ b/src/runtime/cgo/gcc_signal_darwin_armx.c
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -182,6 +182,7 @@
 	int ret;
 	pthread_t thr = NULL;
 	pthread_attr_t attr;
+	sigset_t ign, oset;
 
 	ret = mach_port_allocate(
 		mach_task_self(),
@@ -192,11 +193,18 @@
 		abort();
 	}
 
+	// Block all signals to the exception handler thread
+	sigfillset(&ign);
+	pthread_sigmask(SIG_SETMASK, &ign, &oset);
+
 	// Start a thread to handle exceptions.
 	uintptr_t port_set = (uintptr_t)mach_exception_handler_port_set;
 	pthread_attr_init(&attr);
 	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
 	ret = pthread_create(&thr, &attr, mach_exception_handler, (void*)port_set);
+
+	pthread_sigmask(SIG_SETMASK, &oset, nil);
+
 	if (ret) {
 		fprintf(stderr, "runtime/cgo: pthread_create failed: %d\n", ret);
 		abort();
diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_darwin_lldb.c
index b26315f..12cc388 100644
--- a/src/runtime/cgo/gcc_signal_darwin_lldb.c
+++ b/src/runtime/cgo/gcc_signal_darwin_lldb.c
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_solaris_amd64.c b/src/runtime/cgo/gcc_solaris_amd64.c
index 72ace56..98a1a8b 100644
--- a/src/runtime/cgo/gcc_solaris_amd64.c
+++ b/src/runtime/cgo/gcc_solaris_amd64.c
@@ -20,6 +20,12 @@
 	if (getcontext(&ctx) != 0)
 		perror("runtime/cgo: getcontext failed");
 	g->stacklo = (uintptr_t)ctx.uc_stack.ss_sp;
+
+	// Solaris processes report a tiny stack when run with "ulimit -s unlimited".
+	// Correct that as best we can: assume it's at least 1 MB.
+	// See golang.org/issue/12210.
+	if(ctx.uc_stack.ss_size < 1024*1024)
+		g->stacklo -= 1024*1024 - ctx.uc_stack.ss_size;
 }
 
 void
diff --git a/src/runtime/cgo/gcc_traceback.c b/src/runtime/cgo/gcc_traceback.c
new file mode 100644
index 0000000..667ea4c
--- /dev/null
+++ b/src/runtime/cgo/gcc_traceback.c
@@ -0,0 +1,31 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build cgo
+// +build linux
+
+#include <stdint.h>
+
+struct cgoTracebackArg {
+	uintptr_t  Context;
+	uintptr_t  SigContext;
+	uintptr_t* Buf;
+	uintptr_t  Max;
+};
+
+// Call the user's traceback function and then call sigtramp.
+// The runtime signal handler will jump to this code.
+// We do it this way so that the user's traceback function will be called
+// by a C function with proper unwind info.
+void
+x_cgo_callers(uintptr_t sig, void *info, void *context, void (*cgoTraceback)(struct cgoTracebackArg*), uintptr_t* cgoCallers, void (*sigtramp)(uintptr_t, void*, void*)) {
+	struct cgoTracebackArg arg;
+
+	arg.Context = 0;
+	arg.SigContext = (uintptr_t)(context);
+	arg.Buf = cgoCallers;
+	arg.Max = 32; // must match len(runtime.cgoCallers)
+	(*cgoTraceback)(&arg);
+	sigtramp(sig, info, context);
+}
diff --git a/src/runtime/cgo/gcc_util.c b/src/runtime/cgo/gcc_util.c
index 143734e..99af021 100644
--- a/src/runtime/cgo/gcc_util.c
+++ b/src/runtime/cgo/gcc_util.c
@@ -1,34 +1,9 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "libcgo.h"
 
-/* Stub for calling malloc from Go */
-void
-x_cgo_malloc(void *p)
-{
-	struct a {
-		long long n;
-		void *ret;
-	} *a = p;
-
-	a->ret = malloc(a->n);
-	if(a->ret == NULL && a->n == 0)
-		a->ret = malloc(1);
-}
-
-/* Stub for calling free from Go */
-void
-x_cgo_free(void *p)
-{
-	struct a {
-		void *arg;
-	} *a = p;
-
-	free(a->arg);
-}
-
 /* Stub for creating a new thread */
 void
 x_cgo_thread_start(ThreadStart *arg)
@@ -36,7 +11,9 @@
 	ThreadStart *ts;
 
 	/* Make our own copy that can persist after we return. */
+	_cgo_tsan_acquire();
 	ts = malloc(sizeof *ts);
+	_cgo_tsan_release();
 	if(ts == nil) {
 		fprintf(stderr, "runtime/cgo: out of memory in thread_start\n");
 		abort();
diff --git a/src/runtime/cgo/gcc_windows_386.c b/src/runtime/cgo/gcc_windows_386.c
index acd038c..fa0c69b 100644
--- a/src/runtime/cgo/gcc_windows_386.c
+++ b/src/runtime/cgo/gcc_windows_386.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/gcc_windows_amd64.c b/src/runtime/cgo/gcc_windows_amd64.c
index ce7e06b..a3c3896 100644
--- a/src/runtime/cgo/gcc_windows_amd64.c
+++ b/src/runtime/cgo/gcc_windows_amd64.c
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/iscgo.go b/src/runtime/cgo/iscgo.go
index 54f0a13..e12d0f4 100644
--- a/src/runtime/cgo/iscgo.go
+++ b/src/runtime/cgo/iscgo.go
@@ -1,12 +1,12 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // The runtime package contains an uninitialized definition
-// for runtime·iscgo.  Override it to tell the runtime we're here.
+// for runtime·iscgo. Override it to tell the runtime we're here.
 // There are various function pointers that should be set too,
 // but those depend on dynamic linker magic to get initialized
-// correctly, and sometimes they break.  This variable is a
+// correctly, and sometimes they break. This variable is a
 // backup: it depends only on old C style static linking rules.
 
 package cgo
diff --git a/src/runtime/cgo/libcgo.h b/src/runtime/cgo/libcgo.h
index bda2499..01f9e72 100644
--- a/src/runtime/cgo/libcgo.h
+++ b/src/runtime/cgo/libcgo.h
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -57,8 +57,10 @@
 
 /*
  * Waits for the Go runtime to be initialized (OS dependent).
+ * If runtime.SetCgoTraceback is used to set a context function,
+ * calls the context function and returns the context value.
  */
-void _cgo_wait_runtime_init_done();
+uintptr_t _cgo_wait_runtime_init_done();
 
 /*
  * Call fn in the 6c world.
@@ -84,3 +86,50 @@
  * Starts a mach message server processing EXC_BAD_ACCESS.
  */
 void darwin_arm_init_mach_exception_handler(void);
+
+/*
+ * The cgo context function. See runtime.SetCgoTraceback.
+ */
+struct context_arg {
+	uintptr_t Context;
+};
+extern void (*(_cgo_get_context_function(void)))(struct context_arg*);
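+/* (That is: _cgo_get_context_function takes no arguments and returns
+   a pointer to a function taking a struct context_arg* and returning
+   void.) */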
+
+/*
+ * TSAN support. This is only useful when building with
+ *   CGO_CFLAGS="-fsanitize=thread" CGO_LDFLAGS="-fsanitize=thread" go install
+ */
+#undef CGO_TSAN
+#if defined(__has_feature)
+# if __has_feature(thread_sanitizer)
+#  define CGO_TSAN
+# endif
+#elif defined(__SANITIZE_THREAD__)
+# define CGO_TSAN
+#endif
+
+#ifdef CGO_TSAN
+
+// These must match the definitions in yesTsanProlog in cmd/cgo/out.go.
+
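+// _cgo_sync exists only as an address for __tsan_acquire/__tsan_release
+// to pair on, giving TSAN a happens-before edge between Go and C.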
+long long _cgo_sync __attribute__ ((common));
+
+extern void __tsan_acquire(void*);
+extern void __tsan_release(void*);
+
+__attribute__ ((unused))
+static void _cgo_tsan_acquire() {
+	__tsan_acquire(&_cgo_sync);
+}
+
+__attribute__ ((unused))
+static void _cgo_tsan_release() {
+	__tsan_release(&_cgo_sync);
+}
+
+#else // !defined(CGO_TSAN)
+
+#define _cgo_tsan_acquire()
+#define _cgo_tsan_release()
+
+#endif // !defined(CGO_TSAN)
diff --git a/src/runtime/cgo/mmap.go b/src/runtime/cgo/mmap.go
new file mode 100644
index 0000000..ff98359
--- /dev/null
+++ b/src/runtime/cgo/mmap.go
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux,amd64
+
+package cgo
+
+// Import "unsafe" because we use go:linkname.
+import _ "unsafe"
+
+// When using cgo, call the C library for mmap, so that we call into
+// any sanitizer interceptors. This supports using the memory
+// sanitizer with Go programs. The memory sanitizer only applies to
+// C/C++ code; this permits that code to see the Go code as normal
+// program addresses that have been initialized.
+
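+// x_cgo_mmap is defined in C (runtime/cgo/gcc_mmap.c). Declaring it
+// here as a byte lets us take its address; the runtime calls through
+// _cgo_mmap only when that pointer is non-nil.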
+//go:cgo_import_static x_cgo_mmap
+//go:linkname x_cgo_mmap x_cgo_mmap
+//go:linkname _cgo_mmap _cgo_mmap
+var x_cgo_mmap byte
+var _cgo_mmap = &x_cgo_mmap
diff --git a/src/runtime/cgo/netbsd.go b/src/runtime/cgo/netbsd.go
index ac6b18a..2cecd0c 100644
--- a/src/runtime/cgo/netbsd.go
+++ b/src/runtime/cgo/netbsd.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/openbsd.go b/src/runtime/cgo/openbsd.go
index 61af3a8..5c70dbd 100644
--- a/src/runtime/cgo/openbsd.go
+++ b/src/runtime/cgo/openbsd.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/setenv.go b/src/runtime/cgo/setenv.go
index 20d5703..fab4339 100644
--- a/src/runtime/cgo/setenv.go
+++ b/src/runtime/cgo/setenv.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgo/signal_darwin_armx.go b/src/runtime/cgo/signal_darwin_armx.go
index 9c1ba5d..9f6741e 100644
--- a/src/runtime/cgo/signal_darwin_armx.go
+++ b/src/runtime/cgo/signal_darwin_armx.go
@@ -13,10 +13,14 @@
 //go:linkname x_cgo_panicmem x_cgo_panicmem
 var x_cgo_panicmem uintptr
 
+// Use a pointer to avoid a relocation against an external symbol in
+// __TEXT; this keeps the linker happy.
+var _cgo_panicmem = &x_cgo_panicmem
+
 // TODO(crawshaw): move this into x_cgo_init, it will not run until
 // runtime has finished loading, which may be after its use.
 func init() {
-	x_cgo_panicmem = funcPC(panicmem)
+	*_cgo_panicmem = funcPC(panicmem)
 }
 
 func funcPC(f interface{}) uintptr {
diff --git a/src/runtime/cgo_mips64x.go b/src/runtime/cgo_mips64x.go
new file mode 100644
index 0000000..f718e92
--- /dev/null
+++ b/src/runtime/cgo_mips64x.go
@@ -0,0 +1,12 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+package runtime
+
+// crosscall1 calls into the runtime to set up the registers the
+// Go runtime expects and so the symbol it calls needs to be exported
+// for external linking to work.
+//go:cgo_export_static _cgo_reginit
diff --git a/src/runtime/cgo_mmap.go b/src/runtime/cgo_mmap.go
new file mode 100644
index 0000000..a23cc79
--- /dev/null
+++ b/src/runtime/cgo_mmap.go
@@ -0,0 +1,41 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Support for memory sanitizer. See runtime/cgo/mmap.go.
+
+// +build linux,amd64
+
+package runtime
+
+import "unsafe"
+
+// _cgo_mmap is filled in by runtime/cgo when it is linked into the
+// program, so it is only non-nil when using cgo.
+//go:linkname _cgo_mmap _cgo_mmap
+var _cgo_mmap unsafe.Pointer
+
+func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
+	if _cgo_mmap != nil {
+		// Make ret a uintptr so that writing to it in the
+		// function literal does not trigger a write barrier.
+		// A write barrier here could break because of the way
+		// that mmap uses the same value both as a pointer and
+		// an errno value.
+		// TODO: Fix mmap to return two values.
+		var ret uintptr
+		systemstack(func() {
+			ret = callCgoMmap(addr, n, prot, flags, fd, off)
+		})
+		return unsafe.Pointer(ret)
+	}
+	return sysMmap(addr, n, prot, flags, fd, off)
+}
+
+// sysMmap calls the mmap system call. It is implemented in assembly.
+func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
+
+// callCgoMmap calls the mmap function in the runtime/cgo package
+// using the GCC calling convention. It is implemented in assembly.
+func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) uintptr
diff --git a/src/runtime/cgo_ppc64x.go b/src/runtime/cgo_ppc64x.go
new file mode 100644
index 0000000..fb2da32
--- /dev/null
+++ b/src/runtime/cgo_ppc64x.go
@@ -0,0 +1,12 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package runtime
+
+// crosscall_ppc64 calls into the runtime to set up the registers the
+// Go runtime expects and so the symbol it calls needs to be exported
+// for external linking to work.
+//go:cgo_export_static _cgo_reginit
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index f09a66a..0f8386b 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -28,7 +28,7 @@
 // and then unlocks g from m.
 //
 // The above description skipped over the possibility of the gcc-compiled
-// function f calling back into Go.  If that happens, we continue down
+// function f calling back into Go. If that happens, we continue down
 // the rabbit hole during the execution of f.
 //
 // To make it possible for gcc-compiled C code to call a Go function p.GoF,
@@ -38,9 +38,9 @@
 // GoF calls crosscall2(_cgoexp_GoF, frame, framesize).  Crosscall2
 // (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument
 // adapter from the gcc function call ABI to the 6c function call ABI.
-// It is called from gcc to call 6c functions.  In this case it calls
+// It is called from gcc to call 6c functions. In this case it calls
 // _cgoexp_GoF(frame, framesize), still running on m->g0's stack
-// and outside the $GOMAXPROCS limit.  Thus, this code cannot yet
+// and outside the $GOMAXPROCS limit. Thus, this code cannot yet
 // call arbitrary Go code directly and must be careful not to allocate
 // memory or use up m->g0's stack.
 //
@@ -79,7 +79,14 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Addresses collected in a cgo backtrace when crashing.
+// Length must match arg.Max in x_cgo_callers in runtime/cgo/gcc_traceback.c.
+type cgoCallers [32]uintptr
 
 // Call from Go to C.
 //go:nosplit
@@ -106,6 +113,9 @@
 	mp.ncgo++
 	defer endcgo(mp)
 
+	// Reset traceback.
+	mp.cgoCallers[0] = 0
+
 	/*
 	 * Announce we are entering a system call
 	 * so that the scheduler knows to create another
@@ -135,28 +145,9 @@
 	unlockOSThread() // invalidates mp
 }
 
-// Helper functions for cgo code.
-
-func cmalloc(n uintptr) unsafe.Pointer {
-	var args struct {
-		n   uint64
-		ret unsafe.Pointer
-	}
-	args.n = uint64(n)
-	cgocall(_cgo_malloc, unsafe.Pointer(&args))
-	if args.ret == nil {
-		throw("C malloc failed")
-	}
-	return args.ret
-}
-
-func cfree(p unsafe.Pointer) {
-	cgocall(_cgo_free, p)
-}
-
 // Call from C back to Go.
 //go:nosplit
-func cgocallbackg() {
+func cgocallbackg(ctxt uintptr) {
 	gp := getg()
 	if gp != gp.m.curg {
 		println("runtime: bad g in cgocallback")
@@ -174,20 +165,43 @@
 	savedsp := unsafe.Pointer(gp.syscallsp)
 	savedpc := gp.syscallpc
 	exitsyscall(0) // coming out of cgo call
-	cgocallbackg1()
+
+	cgocallbackg1(ctxt)
+
 	// going back to cgo call
 	reentersyscall(savedpc, uintptr(savedsp))
 
 	gp.m.syscall = syscall
 }
 
-func cgocallbackg1() {
+func cgocallbackg1(ctxt uintptr) {
 	gp := getg()
 	if gp.m.needextram {
 		gp.m.needextram = false
 		systemstack(newextram)
 	}
 
+	if ctxt != 0 {
+		s := append(gp.cgoCtxt, ctxt)
+
+		// Now we need to set gp.cgoCtxt = s, but we could get
+		// a SIGPROF signal while manipulating the slice, and
+		// the SIGPROF handler could pick up gp.cgoCtxt while
+		// tracing up the stack. We need to ensure that the
+		// handler always sees a valid slice, so set the
+		// values in an order such that it always does.
+		p := (*slice)(unsafe.Pointer(&gp.cgoCtxt))
+		atomicstorep(unsafe.Pointer(&p.array), unsafe.Pointer(&s[0]))
+		p.cap = cap(s)
+		p.len = len(s)
+
+		defer func(gp *g) {
+			// Decrease the length of the slice by one, safely.
+			p := (*slice)(unsafe.Pointer(&gp.cgoCtxt))
+			p.len--
+		}(gp)
+	}
+
 	if gp.m.ncgo == 0 {
 		// The C call to Go came from a thread not currently running
 		// any Go. In the case of -buildmode=c-archive or c-shared,
@@ -220,27 +234,43 @@
 	case "arm":
 		// On arm, stack frame is two words and there's a saved LR between
 		// SP and the stack frame and between the stack frame and the arguments.
-		cb = (*args)(unsafe.Pointer(sp + 4*ptrSize))
+		cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
 	case "arm64":
 		// On arm64, stack frame is four words and there's a saved LR between
 		// SP and the stack frame and between the stack frame and the arguments.
-		cb = (*args)(unsafe.Pointer(sp + 5*ptrSize))
+		cb = (*args)(unsafe.Pointer(sp + 5*sys.PtrSize))
 	case "amd64":
-		// On amd64, stack frame is one word, plus caller PC.
+		// On amd64, stack frame is two words, plus caller PC.
 		if framepointer_enabled {
 			// In this case, there's also saved BP.
-			cb = (*args)(unsafe.Pointer(sp + 3*ptrSize))
+			cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
 			break
 		}
-		cb = (*args)(unsafe.Pointer(sp + 2*ptrSize))
+		cb = (*args)(unsafe.Pointer(sp + 3*sys.PtrSize))
 	case "386":
 		// On 386, stack frame is three words, plus caller PC.
-		cb = (*args)(unsafe.Pointer(sp + 4*ptrSize))
-	case "ppc64", "ppc64le":
-		// On ppc64, stack frame is two words and there's a
-		// saved LR between SP and the stack frame and between
-		// the stack frame and the arguments.
-		cb = (*args)(unsafe.Pointer(sp + 4*ptrSize))
+		cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
+	case "ppc64", "ppc64le", "s390x":
+		// On ppc64 and s390x, the callback arguments are in the arguments area of
+		// cgocallback's stack frame. The stack looks like this:
+		// +--------------------+------------------------------+
+		// |                    | ...                          |
+		// | cgoexp_$fn         +------------------------------+
+		// |                    | fixed frame area             |
+		// +--------------------+------------------------------+
+		// |                    | arguments area               |
+		// | cgocallback        +------------------------------+ <- sp + 2*minFrameSize + 2*ptrSize
+		// |                    | fixed frame area             |
+		// +--------------------+------------------------------+ <- sp + minFrameSize + 2*ptrSize
+		// |                    | local variables (2 pointers) |
+		// | cgocallback_gofunc +------------------------------+ <- sp + minFrameSize
+		// |                    | fixed frame area             |
+		// +--------------------+------------------------------+ <- sp
+		cb = (*args)(unsafe.Pointer(sp + 2*sys.MinFrameSize + 2*sys.PtrSize))
+	case "mips64", "mips64le":
+		// On mips64x, stack frame is two words and there's a saved LR between
+		// SP and the stack frame and between the stack frame and the arguments.
+		cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
 	}
 
 	// Invoke callback.
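As a worked example of the offsets above (illustrative constants only, assuming amd64's 8-byte pointers): the callback arguments live past cgocallback's two-word fixed frame and the caller's PC, plus a saved BP when frame pointers are enabled.

	const ptrSize = 8 // sys.PtrSize on amd64

	const (
		frameWords  = 2                          // cgocallback's fixed frame
		offDefault  = (frameWords + 1) * ptrSize // + caller PC      = sp + 3*PtrSize
		offFramePtr = (frameWords + 2) * ptrSize // + caller PC + BP = sp + 4*PtrSize
	)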
@@ -249,11 +279,18 @@
 	// For cgo, cb.arg points into a C stack frame and therefore doesn't
 	// hold any pointers that the GC can find anyway - the write barrier
 	// would be a no-op.
-	reflectcall(nil, unsafe.Pointer(cb.fn), unsafe.Pointer(cb.arg), uint32(cb.argsize), 0)
+	reflectcall(nil, unsafe.Pointer(cb.fn), cb.arg, uint32(cb.argsize), 0)
 
 	if raceenabled {
 		racereleasemerge(unsafe.Pointer(&racecgosync))
 	}
+	if msanenabled {
+		// Tell msan that we wrote to the entire argument block.
+		// This tells msan that we set the results.
+		// Since we have already called the function it doesn't
+		// matter that we are writing to the non-result parameters.
+		msanwrite(cb.arg, cb.argsize)
+	}
 
 	// Do not unwind m->g0->sched.sp.
 	// Our caller, cgocallback, will do that.
@@ -271,14 +308,10 @@
 	switch GOARCH {
 	default:
 		throw("unwindm not implemented")
-	case "386", "amd64":
-		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp))
-	case "arm":
-		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 4))
+	case "386", "amd64", "arm", "ppc64", "ppc64le", "mips64", "mips64le", "s390x":
+		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + sys.MinFrameSize))
 	case "arm64":
 		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 16))
-	case "ppc64", "ppc64le":
-		sched.sp = *(*uintptr)(unsafe.Pointer(sched.sp + 8))
 	}
 	releasem(mp)
 }
@@ -294,3 +327,290 @@
 }
 
 var racecgosync uint64 // represents possible synchronization in C code
+
+// Pointer checking for cgo code.
+
+// We want to detect all cases where a program that does not use
+// unsafe makes a cgo call passing a Go pointer to memory that
+// contains a Go pointer. Here a Go pointer is defined as a pointer
+// to memory allocated by the Go runtime. Programs that use unsafe
+// can evade this restriction easily, so we don't try to catch them.
+// The cgo program will rewrite all possibly bad pointer arguments to
+// call cgoCheckPointer, where we can catch cases of a Go pointer
+// pointing to a Go pointer.
+
+// Complicating matters, taking the address of a slice or array
+// element permits the C program to access all elements of the slice
+// or array. In that case we will see a pointer to a single element,
+// but we need to check the entire data structure.
+
+// The cgoCheckPointer call takes additional arguments indicating that
+// it was called on an address expression. An additional argument of
+// true means that it only needs to check a single element. An
+// additional argument of a slice or array means that it needs to
+// check the entire slice/array, but nothing else. Otherwise, the
+// pointer could be anything, and we check the entire heap object,
+// which is conservative but safe.
+
+// When and if we implement a moving garbage collector,
+// cgoCheckPointer will pin the pointer for the duration of the cgo
+// call. (This is necessary but not sufficient; the cgo program will
+// also have to change to pin Go pointers that cannot point to Go
+// pointers.)
+
+// cgoCheckPointer checks if the argument contains a Go pointer that
+// points to a Go pointer, and panics if it does. It returns the pointer.
+func cgoCheckPointer(ptr interface{}, args ...interface{}) interface{} {
+	if debug.cgocheck == 0 {
+		return ptr
+	}
+
+	ep := (*eface)(unsafe.Pointer(&ptr))
+	t := ep._type
+
+	top := true
+	if len(args) > 0 && (t.kind&kindMask == kindPtr || t.kind&kindMask == kindUnsafePointer) {
+		p := ep.data
+		if t.kind&kindDirectIface == 0 {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return ptr
+		}
+		aep := (*eface)(unsafe.Pointer(&args[0]))
+		switch aep._type.kind & kindMask {
+		case kindBool:
+			if t.kind&kindMask == kindUnsafePointer {
+				// We don't know the type of the element.
+				break
+			}
+			pt := (*ptrtype)(unsafe.Pointer(t))
+			cgoCheckArg(pt.elem, p, true, false, cgoCheckPointerFail)
+			return ptr
+		case kindSlice:
+			// Check the slice rather than the pointer.
+			ep = aep
+			t = ep._type
+		case kindArray:
+			// Check the array rather than the pointer.
+			// Pass top as false since we have a pointer
+			// to the array.
+			ep = aep
+			t = ep._type
+			top = false
+		default:
+			throw("can't happen")
+		}
+	}
+
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, top, cgoCheckPointerFail)
+	return ptr
+}
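A hypothetical cgo program (not part of this patch) that this check rejects: the argument &p is a Go pointer, which is allowed at top level, but its pointee holds another Go pointer, so the kindPtr case above panics under the default GODEBUG=cgocheck=1.

	package main

	/*
	static void f(void **p) {}
	*/
	import "C"
	import "unsafe"

	func main() {
		x := 1
		p := unsafe.Pointer(&x) // a Go pointer to Go memory
		C.f(&p)                 // panics: "cgo argument has Go pointer to Go pointer"
	}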
+
+const cgoCheckPointerFail = "cgo argument has Go pointer to Go pointer"
+const cgoResultFail = "cgo result has Go pointer"
+
+// cgoCheckArg is the real work of cgoCheckPointer. The argument p
+// is either a pointer to the value (of type t), or the value itself,
+// depending on indir. The top parameter is whether we are at the top
+// level, where Go pointers are allowed.
+func cgoCheckArg(t *_type, p unsafe.Pointer, indir, top bool, msg string) {
+	if t.kind&kindNoPointers != 0 {
+		// If the type has no pointers there is nothing to do.
+		return
+	}
+
+	switch t.kind & kindMask {
+	default:
+		throw("can't happen")
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(t))
+		if !indir {
+			if at.len != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(at.elem, p, at.elem.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for i := uintptr(0); i < at.len; i++ {
+			cgoCheckArg(at.elem, p, true, top, msg)
+			p = add(p, at.elem.size)
+		}
+	case kindChan, kindMap:
+		// These types contain internal pointers that will
+		// always be allocated in the Go heap. It's never OK
+		// to pass them to C.
+		panic(errorString(msg))
+	case kindFunc:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		panic(errorString(msg))
+	case kindInterface:
+		it := *(**_type)(p)
+		if it == nil {
+			return
+		}
+		// A type known at compile time is OK since it's
+		// constant. A type not known at compile time will be
+		// in the heap and will not be OK.
+		if inheap(uintptr(unsafe.Pointer(it))) {
+			panic(errorString(msg))
+		}
+		p = *(*unsafe.Pointer)(add(p, sys.PtrSize))
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		cgoCheckArg(it, p, it.kind&kindDirectIface == 0, false, msg)
+	case kindSlice:
+		st := (*slicetype)(unsafe.Pointer(t))
+		s := (*slice)(p)
+		p = s.array
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+		if st.elem.kind&kindNoPointers != 0 {
+			return
+		}
+		for i := 0; i < s.cap; i++ {
+			cgoCheckArg(st.elem, p, true, false, msg)
+			p = add(p, st.elem.size)
+		}
+	case kindString:
+		ss := (*stringStruct)(p)
+		if !cgoIsGoPointer(ss.str) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		if !indir {
+			if len(st.fields) != 1 {
+				throw("can't happen")
+			}
+			cgoCheckArg(st.fields[0].typ, p, st.fields[0].typ.kind&kindDirectIface == 0, top, msg)
+			return
+		}
+		for _, f := range st.fields {
+			cgoCheckArg(f.typ, add(p, f.offset), true, top, msg)
+		}
+	case kindPtr, kindUnsafePointer:
+		if indir {
+			p = *(*unsafe.Pointer)(p)
+		}
+
+		if !cgoIsGoPointer(p) {
+			return
+		}
+		if !top {
+			panic(errorString(msg))
+		}
+
+		cgoCheckUnknownPointer(p, msg)
+	}
+}
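The per-kind rules above are observable from user code. A hypothetical example of the kindStruct and top-level cases (again assuming the default cgocheck=1):

	package main

	/*
	static void g(void *p) {}
	*/
	import "C"
	import "unsafe"

	type T struct{ p *int }

	func main() {
		x := 1
		C.g(unsafe.Pointer(&x)) // ok: a top-level Go pointer whose pointee has no pointers
		t := T{p: &x}
		C.g(unsafe.Pointer(&t)) // panics: the struct field is a Go pointer below top level
	}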
+
+// cgoCheckUnknownPointer is called for an arbitrary pointer into Go
+// memory. It checks whether that Go memory contains any other
+// pointer into Go memory. If it does, we panic.
+// The return values are unused but useful to see in panic tracebacks.
+func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
+	if cgoInRange(p, mheap_.arena_start, mheap_.arena_used) {
+		if !inheap(uintptr(p)) {
+			// On 32-bit systems it is possible for C's allocated memory
+			// to have addresses between arena_start and arena_used.
+			// Either this pointer is a stack or an unused span or it's
+			// a C allocation. Escape analysis should prevent the first,
+			// garbage collection should prevent the second,
+			// and the third is completely OK.
+			return
+		}
+
+		b, hbits, span, _ := heapBitsForObject(uintptr(p), 0, 0)
+		base = b
+		if base == 0 {
+			return
+		}
+		n := span.elemsize
+		for i = uintptr(0); i < n; i += sys.PtrSize {
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				// No more possible pointers.
+				break
+			}
+			if hbits.isPointer() {
+				if cgoIsGoPointer(*(*unsafe.Pointer)(unsafe.Pointer(base + i))) {
+					panic(errorString(msg))
+				}
+			}
+			hbits = hbits.next()
+		}
+
+		return
+	}
+
+	for datap := &firstmoduledata; datap != nil; datap = datap.next {
+		if cgoInRange(p, datap.data, datap.edata) || cgoInRange(p, datap.bss, datap.ebss) {
+			// We have no way to know the size of the object.
+			// We have to assume that it might contain a pointer.
+			panic(errorString(msg))
+		}
+		// In the text or noptr sections, we know that the
+		// pointer does not point to a Go pointer.
+	}
+
+	return
+}
+
+// cgoIsGoPointer returns whether the pointer is a Go pointer--a
+// pointer to Go memory. We only care about Go memory that might
+// contain pointers.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoIsGoPointer(p unsafe.Pointer) bool {
+	if p == nil {
+		return false
+	}
+
+	if inHeapOrStack(uintptr(p)) {
+		return true
+	}
+
+	for datap := &firstmoduledata; datap != nil; datap = datap.next {
+		if cgoInRange(p, datap.data, datap.edata) || cgoInRange(p, datap.bss, datap.ebss) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// cgoInRange returns whether p is between start and end.
+//go:nosplit
+//go:nowritebarrierrec
+func cgoInRange(p unsafe.Pointer, start, end uintptr) bool {
+	return start <= uintptr(p) && uintptr(p) < end
+}
+
+// cgoCheckResult is called to check the result parameter of an
+// exported Go function. It panics if the result is or contains a Go
+// pointer.
+func cgoCheckResult(val interface{}) {
+	if debug.cgocheck == 0 {
+		return
+	}
+
+	ep := (*eface)(unsafe.Pointer(&val))
+	t := ep._type
+	cgoCheckArg(t, ep.data, t.kind&kindDirectIface == 0, false, cgoResultFail)
+}
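cgoCheckResult fires from the wrapper that cmd/cgo generates around exported functions, so it triggers only when C invokes the export. A hypothetical example:

	package main

	import "C"

	//export Leak
	func Leak() *int {
		x := 42
		return &x // when called from C: panics "cgo result has Go pointer"
	}

	func main() {}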
diff --git a/src/runtime/cgocallback.go b/src/runtime/cgocallback.go
index f93acab..59953f1 100644
--- a/src/runtime/cgocallback.go
+++ b/src/runtime/cgocallback.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/cgocheck.go b/src/runtime/cgocheck.go
new file mode 100644
index 0000000..2d06414
--- /dev/null
+++ b/src/runtime/cgocheck.go
@@ -0,0 +1,257 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Code to check that pointer writes follow the cgo rules.
+// These functions are invoked via the write barrier when debug.cgocheck > 1.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const cgoWriteBarrierFail = "Go pointer stored into non-Go memory"
+
+// cgoCheckWriteBarrier is called whenever a pointer is stored into memory.
+// It throws if the program is storing a Go pointer into non-Go memory.
+//go:nosplit
+//go:nowritebarrier
+func cgoCheckWriteBarrier(dst *uintptr, src uintptr) {
+	if !cgoIsGoPointer(unsafe.Pointer(src)) {
+		return
+	}
+	if cgoIsGoPointer(unsafe.Pointer(dst)) {
+		return
+	}
+
+	// If we are running on the system stack then dst might be an
+	// address on the stack, which is OK.
+	g := getg()
+	if g == g.m.g0 || g == g.m.gsignal {
+		return
+	}
+
+	// Allocating memory can write to various mfixalloc structs
+	// that look like they are non-Go memory.
+	if g.m.mallocing != 0 {
+		return
+	}
+
+	systemstack(func() {
+		println("write of Go pointer", hex(src), "to non-Go memory", hex(uintptr(unsafe.Pointer(dst))))
+		throw(cgoWriteBarrierFail)
+	})
+}
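These write-barrier checks run only when the program is started with GODEBUG=cgocheck=2; the default cgocheck=1 performs just the argument and result checks in cgocall.go. A hypothetical program the stricter mode catches:

	package main

	/*
	#include <stdlib.h>
	*/
	import "C"
	import "unsafe"

	func main() {
		x := 1
		// A pointer-sized slot in C memory.
		p := (**int)(C.malloc(C.size_t(unsafe.Sizeof(uintptr(0)))))
		*p = &x // under cgocheck=2: "Go pointer stored into non-Go memory"
		C.free(unsafe.Pointer(p))
	}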
+
+// cgoCheckMemmove is called when moving a block of memory.
+// dst and src point off bytes into the value to copy.
+// size is the number of bytes to copy.
+// It throws if the program is copying a block that contains a Go pointer
+// into non-Go memory.
+//go:nosplit
+//go:nowritebarrier
+func cgoCheckMemmove(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
+	if typ.kind&kindNoPointers != 0 {
+		return
+	}
+	if !cgoIsGoPointer(src) {
+		return
+	}
+	if cgoIsGoPointer(dst) {
+		return
+	}
+	cgoCheckTypedBlock(typ, src, off, size)
+}
+
+// cgoCheckSliceCopy is called when copying n elements of a slice from
+// src to dst. typ is the element type of the slice.
+// It throws if the program is copying slice elements that contain Go pointers
+// into non-Go memory.
+//go:nosplit
+//go:nowritebarrier
+func cgoCheckSliceCopy(typ *_type, dst, src slice, n int) {
+	if typ.kind&kindNoPointers != 0 {
+		return
+	}
+	if !cgoIsGoPointer(src.array) {
+		return
+	}
+	if cgoIsGoPointer(dst.array) {
+		return
+	}
+	p := src.array
+	for i := 0; i < n; i++ {
+		cgoCheckTypedBlock(typ, p, 0, typ.size)
+		p = add(p, typ.size)
+	}
+}
+
+// cgoCheckTypedBlock checks the block of memory at src, for up to size bytes,
+// and throws if it finds a Go pointer. The type of the memory is typ,
+// and src is off bytes into that type.
+//go:nosplit
+//go:nowritebarrier
+func cgoCheckTypedBlock(typ *_type, src unsafe.Pointer, off, size uintptr) {
+	// Anything past typ.ptrdata is not a pointer.
+	if typ.ptrdata <= off {
+		return
+	}
+	if ptrdataSize := typ.ptrdata - off; size > ptrdataSize {
+		size = ptrdataSize
+	}
+
+	if typ.kind&kindGCProg == 0 {
+		cgoCheckBits(src, typ.gcdata, off, size)
+		return
+	}
+
+	// The type has a GC program. Try to find GC bits somewhere else.
+	for datap := &firstmoduledata; datap != nil; datap = datap.next {
+		if cgoInRange(src, datap.data, datap.edata) {
+			doff := uintptr(src) - datap.data
+			cgoCheckBits(add(src, -doff), datap.gcdatamask.bytedata, off+doff, size)
+			return
+		}
+		if cgoInRange(src, datap.bss, datap.ebss) {
+			boff := uintptr(src) - datap.bss
+			cgoCheckBits(add(src, -boff), datap.gcbssmask.bytedata, off+boff, size)
+			return
+		}
+	}
+
+	aoff := uintptr(src) - mheap_.arena_start
+	idx := aoff >> _PageShift
+	s := h_spans[idx]
+	if s.state == _MSpanStack {
+		// There are no heap bits for a value stored on the stack.
+		// For a channel receive src might be on the stack of some
+		// other goroutine, so we can't unwind the stack even if
+		// we wanted to.
+		// We can't expand the GC program without extra storage
+		// space we can't easily get.
+		// Fortunately we have the type information.
+		systemstack(func() {
+			cgoCheckUsingType(typ, src, off, size)
+		})
+		return
+	}
+
+	// src must be in the regular heap.
+
+	hbits := heapBitsForAddr(uintptr(src))
+	for i := uintptr(0); i < off+size; i += sys.PtrSize {
+		bits := hbits.bits()
+		if i >= off && bits&bitPointer != 0 {
+			v := *(*unsafe.Pointer)(add(src, i))
+			if cgoIsGoPointer(v) {
+				systemstack(func() {
+					throw(cgoWriteBarrierFail)
+				})
+			}
+		}
+		hbits = hbits.next()
+	}
+}
+
+// cgoCheckBits checks the block of memory at src, for up to size
+// bytes, and throws if it finds a Go pointer. The gcbits mark each
+// pointer value. The src pointer is off bytes into the gcbits.
+//go:nosplit
+//go:nowritebarrier
+func cgoCheckBits(src unsafe.Pointer, gcbits *byte, off, size uintptr) {
+	skipMask := off / sys.PtrSize / 8
+	skipBytes := skipMask * sys.PtrSize * 8
+	ptrmask := addb(gcbits, skipMask)
+	src = add(src, skipBytes)
+	off -= skipBytes
+	size += off
+	var bits uint32
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if i&(sys.PtrSize*8-1) == 0 {
+			bits = uint32(*ptrmask)
+			ptrmask = addb(ptrmask, 1)
+		} else {
+			bits >>= 1
+		}
+		if off > 0 {
+			off -= sys.PtrSize
+		} else {
+			if bits&1 != 0 {
+				v := *(*unsafe.Pointer)(add(src, i))
+				if cgoIsGoPointer(v) {
+					systemstack(func() {
+						throw(cgoWriteBarrierFail)
+					})
+				}
+			}
+		}
+	}
+}
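A minimal sketch of the mask layout cgoCheckBits walks, as a hypothetical helper outside the runtime: gcbits stores one bit per pointer-sized word, least-significant bit first within each byte.

	// pointerWords reports which of the first n words of an object a
	// gcbits-style mask marks as pointers.
	func pointerWords(mask []byte, n int) []int {
		var idx []int
		for i := 0; i < n; i++ {
			if mask[i/8]>>(uint(i)&7)&1 != 0 {
				idx = append(idx, i)
			}
		}
		return idx
	}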
+
+// cgoCheckUsingType is like cgoCheckTypedBlock, but is a last ditch
+// fall back to look for pointers in src using the type information.
+// We only use this when looking at a value on the stack when the type
+// uses a GC program, because otherwise it's more efficient to use the
+// GC bits. This is called on the system stack.
+//go:nowritebarrier
+//go:systemstack
+func cgoCheckUsingType(typ *_type, src unsafe.Pointer, off, size uintptr) {
+	if typ.kind&kindNoPointers != 0 {
+		return
+	}
+
+	// Anything past typ.ptrdata is not a pointer.
+	if typ.ptrdata <= off {
+		return
+	}
+	if ptrdataSize := typ.ptrdata - off; size > ptrdataSize {
+		size = ptrdataSize
+	}
+
+	if typ.kind&kindGCProg == 0 {
+		cgoCheckBits(src, typ.gcdata, off, size)
+		return
+	}
+	switch typ.kind & kindMask {
+	default:
+		throw("can't happen")
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(typ))
+		for i := uintptr(0); i < at.len; i++ {
+			if off < at.elem.size {
+				cgoCheckUsingType(at.elem, src, off, size)
+			}
+			src = add(src, at.elem.size)
+			skipped := off
+			if skipped > at.elem.size {
+				skipped = at.elem.size
+			}
+			checked := at.elem.size - skipped
+			off -= skipped
+			if size <= checked {
+				return
+			}
+			size -= checked
+		}
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(typ))
+		for _, f := range st.fields {
+			if off < f.typ.size {
+				cgoCheckUsingType(f.typ, src, off, size)
+			}
+			src = add(src, f.typ.size)
+			skipped := off
+			if skipped > f.typ.size {
+				skipped = f.typ.size
+			}
+			checked := f.typ.size - skipped
+			off -= skipped
+			if size <= checked {
+				return
+			}
+			size -= checked
+		}
+	}
+}
diff --git a/src/runtime/chan.go b/src/runtime/chan.go
index cfee12a..712ad8c 100644
--- a/src/runtime/chan.go
+++ b/src/runtime/chan.go
@@ -6,7 +6,15 @@
 
 // This file contains the implementation of Go channels.
 
-import "unsafe"
+// Invariants:
+//  At least one of c.sendq and c.recvq is empty.
+// For buffered channels, also:
+//  c.qcount > 0 implies that c.recvq is empty.
+//  c.qcount < c.dataqsiz implies that c.sendq is empty.
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	maxAlign  = 8
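From the user's side, the invariants above mean a non-empty buffered channel never blocks a receiver and a non-full one never blocks a sender; for instance:

	package main

	func main() {
		c := make(chan int, 2)
		c <- 1   // qcount < dataqsiz, so sendq is empty: completes immediately
		v := <-c // qcount > 0, so recvq is empty: completes immediately
		_ = v
	}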
@@ -25,7 +33,14 @@
 	recvx    uint   // receive index
 	recvq    waitq  // list of recv waiters
 	sendq    waitq  // list of send waiters
-	lock     mutex
+
+	// lock protects all fields in hchan, as well as several
+	// fields in sudogs blocked on this channel.
+	//
+	// Do not change another G's status while holding this lock
+	// (in particular, do not ready a G), as this can deadlock
+	// with stack shrinking.
+	lock mutex
 }
 
 type waitq struct {
@@ -48,8 +63,8 @@
 	if hchanSize%maxAlign != 0 || elem.align > maxAlign {
 		throw("makechan: bad alignment")
 	}
-	if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/uintptr(elem.size)) {
-		panic("makechan: size out of range")
+	if size < 0 || int64(uintptr(size)) != size || (elem.size > 0 && uintptr(size) > (_MaxMem-hchanSize)/elem.size) {
+		panic(plainError("makechan: size out of range"))
 	}
 
 	var c *hchan
@@ -59,7 +74,7 @@
 		// buf points into the same allocation, elemtype is persistent.
 		// SudoG's are referenced from their owning thread so they can't be collected.
 		// TODO(dvyukov,rlh): Rethink when collector can move allocated objects.
-		c = (*hchan)(mallocgc(hchanSize+uintptr(size)*uintptr(elem.size), nil, flagNoScan))
+		c = (*hchan)(mallocgc(hchanSize+uintptr(size)*elem.size, nil, true))
 		if size > 0 && elem.size != 0 {
 			c.buf = add(unsafe.Pointer(c), hchanSize)
 		} else {
@@ -69,7 +84,7 @@
 		}
 	} else {
 		c = new(hchan)
-		c.buf = newarray(elem, uintptr(size))
+		c.buf = newarray(elem, int(size))
 	}
 	c.elemsize = uint16(elem.size)
 	c.elemtype = elem
@@ -108,6 +123,9 @@
 	if raceenabled {
 		raceReadObjectPC(t.elem, ep, callerpc, funcPC(chansend))
 	}
+	if msanenabled {
+		msanread(ep, t.elem.size)
+	}
 
 	if c == nil {
 		if !block {
@@ -150,135 +168,121 @@
 	}
 
 	lock(&c.lock)
+
 	if c.closed != 0 {
 		unlock(&c.lock)
-		panic("send on closed channel")
+		panic(plainError("send on closed channel"))
 	}
 
-	if c.dataqsiz == 0 { // synchronous channel
-		sg := c.recvq.dequeue()
-		if sg != nil { // found a waiting receiver
-			if raceenabled {
-				racesync(c, sg)
-			}
-			unlock(&c.lock)
-
-			recvg := sg.g
-			if sg.elem != nil {
-				syncsend(c, sg, ep)
-			}
-			recvg.param = unsafe.Pointer(sg)
-			if sg.releasetime != 0 {
-				sg.releasetime = cputicks()
-			}
-			goready(recvg, 3)
-			return true
-		}
-
-		if !block {
-			unlock(&c.lock)
-			return false
-		}
-
-		// no receiver available: block on this channel.
-		gp := getg()
-		mysg := acquireSudog()
-		mysg.releasetime = 0
-		if t0 != 0 {
-			mysg.releasetime = -1
-		}
-		mysg.elem = ep
-		mysg.waitlink = nil
-		gp.waiting = mysg
-		mysg.g = gp
-		mysg.selectdone = nil
-		gp.param = nil
-		c.sendq.enqueue(mysg)
-		goparkunlock(&c.lock, "chan send", traceEvGoBlockSend, 3)
-
-		// someone woke us up.
-		if mysg != gp.waiting {
-			throw("G waiting list is corrupted!")
-		}
-		gp.waiting = nil
-		if gp.param == nil {
-			if c.closed == 0 {
-				throw("chansend: spurious wakeup")
-			}
-			panic("send on closed channel")
-		}
-		gp.param = nil
-		if mysg.releasetime > 0 {
-			blockevent(int64(mysg.releasetime)-t0, 2)
-		}
-		releaseSudog(mysg)
+	if sg := c.recvq.dequeue(); sg != nil {
+		// Found a waiting receiver. We pass the value we want to send
+		// directly to the receiver, bypassing the channel buffer (if any).
+		send(c, sg, ep, func() { unlock(&c.lock) })
 		return true
 	}
 
-	// asynchronous channel
-	// wait for some space to write our data
-	var t1 int64
-	for futile := byte(0); c.qcount >= c.dataqsiz; futile = traceFutileWakeup {
-		if !block {
-			unlock(&c.lock)
-			return false
+	if c.qcount < c.dataqsiz {
+		// Space is available in the channel buffer. Enqueue the element to send.
+		qp := chanbuf(c, c.sendx)
+		if raceenabled {
+			raceacquire(qp)
+			racerelease(qp)
 		}
-		gp := getg()
-		mysg := acquireSudog()
-		mysg.releasetime = 0
-		if t0 != 0 {
-			mysg.releasetime = -1
+		typedmemmove(c.elemtype, qp, ep)
+		c.sendx++
+		if c.sendx == c.dataqsiz {
+			c.sendx = 0
 		}
-		mysg.g = gp
-		mysg.elem = nil
-		mysg.selectdone = nil
-		c.sendq.enqueue(mysg)
-		goparkunlock(&c.lock, "chan send", traceEvGoBlockSend|futile, 3)
-
-		// someone woke us up - try again
-		if mysg.releasetime > 0 {
-			t1 = mysg.releasetime
-		}
-		releaseSudog(mysg)
-		lock(&c.lock)
-		if c.closed != 0 {
-			unlock(&c.lock)
-			panic("send on closed channel")
-		}
-	}
-
-	// write our data into the channel buffer
-	if raceenabled {
-		raceacquire(chanbuf(c, c.sendx))
-		racerelease(chanbuf(c, c.sendx))
-	}
-	typedmemmove(c.elemtype, chanbuf(c, c.sendx), ep)
-	c.sendx++
-	if c.sendx == c.dataqsiz {
-		c.sendx = 0
-	}
-	c.qcount++
-
-	// wake up a waiting receiver
-	sg := c.recvq.dequeue()
-	if sg != nil {
-		recvg := sg.g
+		c.qcount++
 		unlock(&c.lock)
-		if sg.releasetime != 0 {
-			sg.releasetime = cputicks()
-		}
-		goready(recvg, 3)
-	} else {
+		return true
+	}
+
+	if !block {
 		unlock(&c.lock)
+		return false
 	}
-	if t1 > 0 {
-		blockevent(t1-t0, 2)
+
+	// Block on the channel. Some receiver will complete our operation for us.
+	gp := getg()
+	mysg := acquireSudog()
+	mysg.releasetime = 0
+	if t0 != 0 {
+		mysg.releasetime = -1
 	}
+	// No stack splits between assigning elem and enqueuing mysg
+	// on gp.waiting where copystack can find it.
+	mysg.elem = ep
+	mysg.waitlink = nil
+	mysg.g = gp
+	mysg.selectdone = nil
+	mysg.c = c
+	gp.waiting = mysg
+	gp.param = nil
+	c.sendq.enqueue(mysg)
+	goparkunlock(&c.lock, "chan send", traceEvGoBlockSend, 3)
+
+	// someone woke us up.
+	if mysg != gp.waiting {
+		throw("G waiting list is corrupted")
+	}
+	gp.waiting = nil
+	if gp.param == nil {
+		if c.closed == 0 {
+			throw("chansend: spurious wakeup")
+		}
+		panic(plainError("send on closed channel"))
+	}
+	gp.param = nil
+	if mysg.releasetime > 0 {
+		blockevent(mysg.releasetime-t0, 2)
+	}
+	mysg.c = nil
+	releaseSudog(mysg)
 	return true
 }
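The rewritten chansend has three outcomes: hand the value directly to a waiting receiver, enqueue it in the buffer, or park. The block=false path is what a select with a default clause compiles down to, for example:

	// trySend reports whether v was sent without blocking.
	func trySend(c chan int, v int) bool {
		select {
		case c <- v: // a receiver was waiting, or the buffer had space
			return true
		default: // chansend(..., block=false) returned false
			return false
		}
	}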
 
-func syncsend(c *hchan, sg *sudog, elem unsafe.Pointer) {
-	// Send on unbuffered channel is the only operation
+// send processes a send operation on an empty channel c.
+// The value ep sent by the sender is copied to the receiver sg.
+// The receiver is then woken up to go on its merry way.
+// Channel c must be empty and locked. send unlocks c with unlockf.
+// sg must already be dequeued from c.
+// ep must be non-nil and point to the heap or the caller's stack.
+func send(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+	if raceenabled {
+		if c.dataqsiz == 0 {
+			racesync(c, sg)
+		} else {
+			// Pretend we go through the buffer, even though
+			// we copy directly. Note that we need to increment
+			// the head/tail locations only when raceenabled.
+			qp := chanbuf(c, c.recvx)
+			raceacquire(qp)
+			racerelease(qp)
+			raceacquireg(sg.g, qp)
+			racereleaseg(sg.g, qp)
+			c.recvx++
+			if c.recvx == c.dataqsiz {
+				c.recvx = 0
+			}
+			c.sendx = c.recvx // c.sendx = (c.sendx+1) % c.dataqsiz
+		}
+	}
+	if sg.elem != nil {
+		sendDirect(c.elemtype, sg, ep)
+		sg.elem = nil
+	}
+	gp := sg.g
+	unlockf()
+	gp.param = unsafe.Pointer(sg)
+	if sg.releasetime != 0 {
+		sg.releasetime = cputicks()
+	}
+	goready(gp, 4)
+}
+
+func sendDirect(t *_type, sg *sudog, src unsafe.Pointer) {
+	// Send on an unbuffered or empty-buffered channel is the only operation
 	// in the entire runtime where one goroutine
 	// writes to the stack of another goroutine. The GC assumes that
 	// stack writes only happen when the goroutine is running and are
@@ -287,20 +291,24 @@
 	// typedmemmove will call heapBitsBulkBarrier, but the target bytes
 	// are not in the heap, so that will not help. We arrange to call
 	// memmove and typeBitsBulkBarrier instead.
-	memmove(sg.elem, elem, c.elemtype.size)
-	typeBitsBulkBarrier(c.elemtype, uintptr(sg.elem), c.elemtype.size)
-	sg.elem = nil
+
+	// Once we read sg.elem out of sg, it will no longer
+	// be updated if the destination's stack gets copied (shrunk).
+	// So make sure that no preemption points can happen between read & use.
+	dst := sg.elem
+	memmove(dst, src, t.size)
+	typeBitsBulkBarrier(t, uintptr(dst), t.size)
 }
 
 func closechan(c *hchan) {
 	if c == nil {
-		panic("close of nil channel")
+		panic(plainError("close of nil channel"))
 	}
 
 	lock(&c.lock)
 	if c.closed != 0 {
 		unlock(&c.lock)
-		panic("close of closed channel")
+		panic(plainError("close of closed channel"))
 	}
 
 	if raceenabled {
@@ -311,36 +319,57 @@
 
 	c.closed = 1
 
+	var glist *g
+
 	// release all readers
 	for {
 		sg := c.recvq.dequeue()
 		if sg == nil {
 			break
 		}
-		gp := sg.g
-		sg.elem = nil
-		gp.param = nil
+		if sg.elem != nil {
+			memclr(sg.elem, uintptr(c.elemsize))
+			sg.elem = nil
+		}
 		if sg.releasetime != 0 {
 			sg.releasetime = cputicks()
 		}
-		goready(gp, 3)
+		gp := sg.g
+		gp.param = nil
+		if raceenabled {
+			raceacquireg(gp, unsafe.Pointer(c))
+		}
+		gp.schedlink.set(glist)
+		glist = gp
 	}
 
-	// release all writers
+	// release all writers (they will panic)
 	for {
 		sg := c.sendq.dequeue()
 		if sg == nil {
 			break
 		}
-		gp := sg.g
 		sg.elem = nil
-		gp.param = nil
 		if sg.releasetime != 0 {
 			sg.releasetime = cputicks()
 		}
-		goready(gp, 3)
+		gp := sg.g
+		gp.param = nil
+		if raceenabled {
+			raceacquireg(gp, unsafe.Pointer(c))
+		}
+		gp.schedlink.set(glist)
+		glist = gp
 	}
 	unlock(&c.lock)
+
+	// Ready all Gs now that we've dropped the channel lock.
+	for glist != nil {
+		gp := glist
+		glist = glist.schedlink.ptr()
+		gp.schedlink = 0
+		goready(gp, 3)
+	}
 }
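The user-visible effect of closechan: parked receivers are released with a zeroed element (the memclr above) and a false received flag, while parked senders panic. For example:

	package main

	import "fmt"

	func main() {
		c := make(chan int)
		done := make(chan struct{})
		go func() {
			v, ok := <-c       // released by the close below
			fmt.Println(v, ok) // prints: 0 false
			close(done)
		}()
		close(c)
		<-done
		// c <- 1 // would panic: "send on closed channel"
	}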
 
 // entry points for <- c from compiled code
@@ -360,8 +389,10 @@
 // If block == false and no elements are available, returns (false, false).
 // Otherwise, if c is closed, zeros *ep and returns (true, false).
 // Otherwise, fills in *ep with an element and returns (true, true).
+// A non-nil ep must point to the heap or the caller's stack.
 func chanrecv(t *chantype, c *hchan, ep unsafe.Pointer, block bool) (selected, received bool) {
-	// raceenabled: don't need to check ep, as it is always on the stack.
+	// raceenabled: don't need to check ep, as it is always on the stack
+	// or is new memory allocated by reflect.
 
 	if debugChan {
 		print("chanrecv: chan=", c, "\n")
@@ -388,8 +419,8 @@
 	// The order of operations is important here: reversing the operations can lead to
 	// incorrect behavior when racing with a close.
 	if !block && (c.dataqsiz == 0 && c.sendq.first == nil ||
-		c.dataqsiz > 0 && atomicloaduint(&c.qcount) == 0) &&
-		atomicload(&c.closed) == 0 {
+		c.dataqsiz > 0 && atomic.Loaduint(&c.qcount) == 0) &&
+		atomic.Load(&c.closed) == 0 {
 		return
 	}
 
@@ -399,167 +430,142 @@
 	}
 
 	lock(&c.lock)
-	if c.dataqsiz == 0 { // synchronous channel
-		if c.closed != 0 {
-			return recvclosed(c, ep)
+
+	if c.closed != 0 && c.qcount == 0 {
+		if raceenabled {
+			raceacquire(unsafe.Pointer(c))
 		}
-
-		sg := c.sendq.dequeue()
-		if sg != nil {
-			if raceenabled {
-				racesync(c, sg)
-			}
-			unlock(&c.lock)
-
-			if ep != nil {
-				typedmemmove(c.elemtype, ep, sg.elem)
-			}
-			sg.elem = nil
-			gp := sg.g
-			gp.param = unsafe.Pointer(sg)
-			if sg.releasetime != 0 {
-				sg.releasetime = cputicks()
-			}
-			goready(gp, 3)
-			selected = true
-			received = true
-			return
-		}
-
-		if !block {
-			unlock(&c.lock)
-			return
-		}
-
-		// no sender available: block on this channel.
-		gp := getg()
-		mysg := acquireSudog()
-		mysg.releasetime = 0
-		if t0 != 0 {
-			mysg.releasetime = -1
-		}
-		mysg.elem = ep
-		mysg.waitlink = nil
-		gp.waiting = mysg
-		mysg.g = gp
-		mysg.selectdone = nil
-		gp.param = nil
-		c.recvq.enqueue(mysg)
-		goparkunlock(&c.lock, "chan receive", traceEvGoBlockRecv, 3)
-
-		// someone woke us up
-		if mysg != gp.waiting {
-			throw("G waiting list is corrupted!")
-		}
-		gp.waiting = nil
-		if mysg.releasetime > 0 {
-			blockevent(mysg.releasetime-t0, 2)
-		}
-		haveData := gp.param != nil
-		gp.param = nil
-		releaseSudog(mysg)
-
-		if haveData {
-			// a sender sent us some data. It already wrote to ep.
-			selected = true
-			received = true
-			return
-		}
-
-		lock(&c.lock)
-		if c.closed == 0 {
-			throw("chanrecv: spurious wakeup")
-		}
-		return recvclosed(c, ep)
-	}
-
-	// asynchronous channel
-	// wait for some data to appear
-	var t1 int64
-	for futile := byte(0); c.qcount <= 0; futile = traceFutileWakeup {
-		if c.closed != 0 {
-			selected, received = recvclosed(c, ep)
-			if t1 > 0 {
-				blockevent(t1-t0, 2)
-			}
-			return
-		}
-
-		if !block {
-			unlock(&c.lock)
-			return
-		}
-
-		// wait for someone to send an element
-		gp := getg()
-		mysg := acquireSudog()
-		mysg.releasetime = 0
-		if t0 != 0 {
-			mysg.releasetime = -1
-		}
-		mysg.elem = nil
-		mysg.g = gp
-		mysg.selectdone = nil
-
-		c.recvq.enqueue(mysg)
-		goparkunlock(&c.lock, "chan receive", traceEvGoBlockRecv|futile, 3)
-
-		// someone woke us up - try again
-		if mysg.releasetime > 0 {
-			t1 = mysg.releasetime
-		}
-		releaseSudog(mysg)
-		lock(&c.lock)
-	}
-
-	if raceenabled {
-		raceacquire(chanbuf(c, c.recvx))
-		racerelease(chanbuf(c, c.recvx))
-	}
-	if ep != nil {
-		typedmemmove(c.elemtype, ep, chanbuf(c, c.recvx))
-	}
-	memclr(chanbuf(c, c.recvx), uintptr(c.elemsize))
-
-	c.recvx++
-	if c.recvx == c.dataqsiz {
-		c.recvx = 0
-	}
-	c.qcount--
-
-	// ping a sender now that there is space
-	sg := c.sendq.dequeue()
-	if sg != nil {
-		gp := sg.g
 		unlock(&c.lock)
-		if sg.releasetime != 0 {
-			sg.releasetime = cputicks()
+		if ep != nil {
+			memclr(ep, uintptr(c.elemsize))
 		}
-		goready(gp, 3)
-	} else {
-		unlock(&c.lock)
+		return true, false
 	}
 
-	if t1 > 0 {
-		blockevent(t1-t0, 2)
+	if sg := c.sendq.dequeue(); sg != nil {
+		// Found a waiting sender. If buffer is size 0, receive value
+		// directly from sender. Otherwise, receive from head of queue
+		// and add sender's value to the tail of the queue (both map to
+		// the same buffer slot because the queue is full).
+		recv(c, sg, ep, func() { unlock(&c.lock) })
+		return true, true
 	}
-	selected = true
-	received = true
-	return
+
+	if c.qcount > 0 {
+		// Receive directly from queue
+		qp := chanbuf(c, c.recvx)
+		if raceenabled {
+			raceacquire(qp)
+			racerelease(qp)
+		}
+		if ep != nil {
+			typedmemmove(c.elemtype, ep, qp)
+		}
+		memclr(qp, uintptr(c.elemsize))
+		c.recvx++
+		if c.recvx == c.dataqsiz {
+			c.recvx = 0
+		}
+		c.qcount--
+		unlock(&c.lock)
+		return true, true
+	}
+
+	if !block {
+		unlock(&c.lock)
+		return false, false
+	}
+
+	// no sender available: block on this channel.
+	gp := getg()
+	mysg := acquireSudog()
+	mysg.releasetime = 0
+	if t0 != 0 {
+		mysg.releasetime = -1
+	}
+	// No stack splits between assigning elem and enqueuing mysg
+	// on gp.waiting where copystack can find it.
+	mysg.elem = ep
+	mysg.waitlink = nil
+	gp.waiting = mysg
+	mysg.g = gp
+	mysg.selectdone = nil
+	mysg.c = c
+	gp.param = nil
+	c.recvq.enqueue(mysg)
+	goparkunlock(&c.lock, "chan receive", traceEvGoBlockRecv, 3)
+
+	// someone woke us up
+	if mysg != gp.waiting {
+		throw("G waiting list is corrupted")
+	}
+	gp.waiting = nil
+	if mysg.releasetime > 0 {
+		blockevent(mysg.releasetime-t0, 2)
+	}
+	closed := gp.param == nil
+	gp.param = nil
+	mysg.c = nil
+	releaseSudog(mysg)
+	return true, !closed
 }
 
-// recvclosed is a helper function for chanrecv.  Handles cleanup
-// when the receiver encounters a closed channel.
-// Caller must hold c.lock, recvclosed will release the lock.
-func recvclosed(c *hchan, ep unsafe.Pointer) (selected, recevied bool) {
-	if raceenabled {
-		raceacquire(unsafe.Pointer(c))
+// recv processes a receive operation on a full channel c.
+// There are 2 parts:
+// 1) The value sent by the sender sg is put into the channel
+//    and the sender is woken up to go on its merry way.
+// 2) The value received by the receiver (the current G) is
+//    written to ep.
+// For synchronous channels, both values are the same.
+// For asynchronous channels, the receiver gets its data from
+// the channel buffer and the sender's data is put in the
+// channel buffer.
+// Channel c must be full and locked. recv unlocks c with unlockf.
+// sg must already be dequeued from c.
+// A non-nil ep must point to the heap or the caller's stack.
+func recv(c *hchan, sg *sudog, ep unsafe.Pointer, unlockf func()) {
+	if c.dataqsiz == 0 {
+		if raceenabled {
+			racesync(c, sg)
+		}
+		if ep != nil {
+			// copy data from sender
+			// ep points to our own stack or heap, so nothing
+			// special (ala sendDirect) needed here.
+			typedmemmove(c.elemtype, ep, sg.elem)
+		}
+	} else {
+		// Queue is full. Take the item at the
+		// head of the queue. Make the sender enqueue
+		// its item at the tail of the queue. Since the
+		// queue is full, those are both the same slot.
+		qp := chanbuf(c, c.recvx)
+		if raceenabled {
+			raceacquire(qp)
+			racerelease(qp)
+			raceacquireg(sg.g, qp)
+			racereleaseg(sg.g, qp)
+		}
+		// copy data from queue to receiver
+		if ep != nil {
+			typedmemmove(c.elemtype, ep, qp)
+		}
+		// copy data from sender to queue
+		typedmemmove(c.elemtype, qp, sg.elem)
+		c.recvx++
+		if c.recvx == c.dataqsiz {
+			c.recvx = 0
+		}
+		c.sendx = c.recvx // c.sendx = (c.sendx+1) % c.dataqsiz
 	}
-	unlock(&c.lock)
-	if ep != nil {
-		memclr(ep, uintptr(c.elemsize))
+	sg.elem = nil
+	gp := sg.g
+	unlockf()
+	gp.param = unsafe.Pointer(sg)
+	if sg.releasetime != 0 {
+		sg.releasetime = cputicks()
 	}
-	return true, false
+	goready(gp, 4)
 }
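The head/tail trick in recv preserves FIFO order even when a sender is parked on a full buffer; an illustration (the Sleep is only a crude way to let the sender park before we receive):

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		c := make(chan int, 2)
		c <- 1
		c <- 2
		go func() { c <- 3 }()            // parks: buffer is full
		time.Sleep(10 * time.Millisecond) // let the sender block
		fmt.Println(<-c, <-c, <-c)        // 1 2 3: head first, parked value last
	}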
 
 // compiler implements
@@ -692,7 +698,7 @@
 		// if sgp participates in a select and is already signaled, ignore it
 		if sgp.selectdone != nil {
 			// claim the right to signal
-			if *sgp.selectdone != 0 || !cas(sgp.selectdone, 0, 1) {
+			if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) {
 				continue
 			}
 		}
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index 497e87f..8e8c47b 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -573,7 +573,7 @@
 		}
 		e <- 9
 	}()
-	time.Sleep(time.Millisecond) // make sure goroutine A gets qeueued first on c
+	time.Sleep(time.Millisecond) // make sure goroutine A gets queued first on c
 
 	// goroutine B
 	go func() {
@@ -586,6 +586,84 @@
 	c <- 8 // wake up B.  This operation used to fail because c.recvq was corrupted (it tries to wake up an already running G instead of B)
 }
 
+var selectSink interface{}
+
+func TestSelectStackAdjust(t *testing.T) {
+	// Test that channel receive slots that contain local stack
+	// pointers are adjusted correctly by stack shrinking.
+	c := make(chan *int)
+	d := make(chan *int)
+	ready1 := make(chan bool)
+	ready2 := make(chan bool)
+
+	f := func(ready chan bool, dup bool) {
+		// Temporarily grow the stack to 10K.
+		stackGrowthRecursive((10 << 10) / (128 * 8))
+
+		// We're ready to trigger GC and stack shrink.
+		ready <- true
+
+		val := 42
+		var cx *int
+		cx = &val
+
+		var c2 chan *int
+		var d2 chan *int
+		if dup {
+			c2 = c
+			d2 = d
+		}
+
+		// Receive from d. cx won't be affected.
+		select {
+		case cx = <-c:
+		case <-c2:
+		case <-d:
+		case <-d2:
+		}
+
+		// Check that pointer in cx was adjusted correctly.
+		if cx != &val {
+			t.Error("cx no longer points to val")
+		} else if val != 42 {
+			t.Error("val changed")
+		} else {
+			*cx = 43
+			if val != 43 {
+				t.Error("changing *cx failed to change val")
+			}
+		}
+		ready <- true
+	}
+
+	go f(ready1, false)
+	go f(ready2, true)
+
+	// Let the goroutines get into the select.
+	<-ready1
+	<-ready2
+	time.Sleep(10 * time.Millisecond)
+
+	// Force concurrent GC a few times.
+	var before, after runtime.MemStats
+	runtime.ReadMemStats(&before)
+	for i := 0; i < 100; i++ {
+		selectSink = new([1 << 20]byte)
+		runtime.ReadMemStats(&after)
+		if after.NumGC-before.NumGC >= 2 {
+			goto done
+		}
+	}
+	t.Fatal("failed to trigger concurrent GC")
+done:
+	selectSink = nil
+
+	// Wake selects.
+	close(d)
+	<-ready1
+	<-ready2
+}
+
 func BenchmarkChanNonblocking(b *testing.B) {
 	myc := make(chan int)
 	b.RunParallel(func(pb *testing.PB) {
@@ -716,7 +794,7 @@
 	})
 }
 
-func BenchmarkChanSync(b *testing.B) {
+func benchmarkChanSync(b *testing.B, work int) {
 	const CallsPerSched = 1000
 	procs := 2
 	N := int32(b.N / CallsPerSched / procs * procs)
@@ -732,10 +810,14 @@
 				for g := 0; g < CallsPerSched; g++ {
 					if i%2 == 0 {
 						<-myc
+						localWork(work)
 						myc <- 0
+						localWork(work)
 					} else {
 						myc <- 0
+						localWork(work)
 						<-myc
+						localWork(work)
 					}
 				}
 			}
@@ -747,6 +829,14 @@
 	}
 }
 
+func BenchmarkChanSync(b *testing.B) {
+	benchmarkChanSync(b, 0)
+}
+
+func BenchmarkChanSyncWork(b *testing.B) {
+	benchmarkChanSync(b, 1000)
+}
+
 func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
@@ -920,3 +1010,18 @@
 	}
 	wg.Wait()
 }
+
+var (
+	alwaysFalse = false
+	workSink    = 0
+)
+
+func localWork(w int) {
+	foo := 0
+	for i := 0; i < w; i++ {
+		foo /= (foo + 1)
+	}
+	if alwaysFalse {
+		workSink += foo
+	}
+}
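The alwaysFalse/workSink pair above is a standard benchmarking idiom: the compiler cannot prove the branch dead, so the loop's result stays live and the work is not optimized away, without paying for a store on the hot path. A generic sketch:

	var never bool // package-level, so its (false) value is opaque to the compiler
	var sink int

	// keepAlive forces v, and whatever computed it, to be materialized.
	func keepAlive(v int) {
		if never {
			sink = v
		}
	}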
diff --git a/src/runtime/chanbarrier_test.go b/src/runtime/chanbarrier_test.go
index 770b850..b6029fb 100644
--- a/src/runtime/chanbarrier_test.go
+++ b/src/runtime/chanbarrier_test.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/compiler.go b/src/runtime/compiler.go
index 5f1e8d8..1ebc62d 100644
--- a/src/runtime/compiler.go
+++ b/src/runtime/compiler.go
@@ -1,11 +1,11 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
 // Compiler is the name of the compiler toolchain that built the
-// running binary.  Known toolchains are:
+// running binary. Known toolchains are:
 //
 //	gc      Also known as cmd/compile.
 //	gccgo   The gccgo front end, part of the GCC compiler suite.
diff --git a/src/runtime/cpuprof.go b/src/runtime/cpuprof.go
index 0790852..5308200 100644
--- a/src/runtime/cpuprof.go
+++ b/src/runtime/cpuprof.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -12,30 +12,30 @@
 // writes to an operating system file.
 //
 // The signal handler for the profiling clock tick adds a new stack trace
-// to a hash table tracking counts for recent traces.  Most clock ticks
-// hit in the cache.  In the event of a cache miss, an entry must be
+// to a hash table tracking counts for recent traces. Most clock ticks
+// hit in the cache. In the event of a cache miss, an entry must be
 // evicted from the hash table, copied to a log that will eventually be
-// written as profile data.  The google-perftools code flushed the
-// log itself during the signal handler.  This code cannot do that, because
+// written as profile data. The google-perftools code flushed the
+// log itself during the signal handler. This code cannot do that, because
 // the io.Writer might block or need system calls or locks that are not
-// safe to use from within the signal handler.  Instead, we split the log
+// safe to use from within the signal handler. Instead, we split the log
 // into two halves and let the signal handler fill one half while a goroutine
-// is writing out the other half.  When the signal handler fills its half, it
-// offers to swap with the goroutine.  If the writer is not done with its half,
+// is writing out the other half. When the signal handler fills its half, it
+// offers to swap with the goroutine. If the writer is not done with its half,
 // we lose the stack trace for this clock tick (and record that loss).
 // The goroutine interacts with the signal handler by calling getprofile() to
 // get the next log piece to write, implicitly handing back the last log
 // piece it obtained.
 //
 // The state of this dance between the signal handler and the goroutine
-// is encoded in the Profile.handoff field.  If handoff == 0, then the goroutine
+// is encoded in the Profile.handoff field. If handoff == 0, then the goroutine
 // is not using either log half and is waiting (or will soon be waiting) for
 // a new piece by calling notesleep(&p.wait).  If the signal handler
 // changes handoff from 0 to non-zero, it must call notewakeup(&p.wait)
-// to wake the goroutine.  The value indicates the number of entries in the
-// log half being handed off.  The goroutine leaves the non-zero value in
+// to wake the goroutine. The value indicates the number of entries in the
+// log half being handed off. The goroutine leaves the non-zero value in
 // place until it has finished processing the log half and then flips the number
-// back to zero.  Setting the high bit in handoff means that the profiling is over,
+// back to zero. Setting the high bit in handoff means that the profiling is over,
 // and the goroutine is now in charge of flushing the data left in the hash table
 // to the log and returning that data.
 //
@@ -44,13 +44,16 @@
 // then the signal handler owns it and can change it to non-zero.
 // If handoff != 0 then the goroutine owns it and can change it to zero.
 // If that were the end of the story then we would not need to manipulate
-// handoff using atomic operations.  The operations are needed, however,
+// handoff using atomic operations. The operations are needed, however,
 // in order to let the log closer set the high bit to indicate "EOF" safely
 // in the situation when normally the goroutine "owns" handoff.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 const (
 	numBuckets      = 1 << 10
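A toy model of the handoff ownership protocol described at the top of this file (hypothetical names; the real field also reserves the high bit to signal EOF): handoff == 0 is owned by the signal handler, and any non-zero value by the writing goroutine.

	import "sync/atomic"

	var handoff uint32

	// tryHandOff, on the handler side, publishes n log entries if the
	// goroutine has handed the half back.
	func tryHandOff(n uint32) bool {
		return n != 0 && atomic.CompareAndSwapUint32(&handoff, 0, n)
	}

	// take, on the goroutine side, claims the published half and returns
	// its entry count.
	func take() uint32 {
		for {
			if n := atomic.LoadUint32(&handoff); n != 0 &&
				atomic.CompareAndSwapUint32(&handoff, n, 0) {
				return n
			}
		}
	}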
@@ -146,7 +149,7 @@
 
 		cpuprof.on = true
 		// pprof binary header format.
-		// http://code.google.com/p/google-perftools/source/browse/trunk/src/profiledata.cc#117
+		// https://github.com/gperftools/gperftools/blob/master/src/profiledata.cc#L119
 		p := &cpuprof.log[0]
 		p[0] = 0                 // count for header
 		p[1] = 3                 // depth for header
@@ -173,7 +176,7 @@
 			if n&0x80000000 != 0 {
 				print("runtime: setcpuprofile(off) twice\n")
 			}
-			if cas(&cpuprof.handoff, n, n|0x80000000) {
+			if atomic.Cas(&cpuprof.handoff, n, n|0x80000000) {
 				if n == 0 {
 					// we did the transition from 0 -> nonzero so we wake getprofile
 					notewakeup(&cpuprof.wait)
@@ -189,8 +192,21 @@
 // It is called from signal handlers and other limited environments
 // and cannot allocate memory or acquire locks that might be
 // held at the time of the signal, nor can it use substantial amounts
-// of stack.  It is allowed to call evict.
+// of stack. It is allowed to call evict.
+//go:nowritebarrierrec
 func (p *cpuProfile) add(pc []uintptr) {
+	p.addWithFlushlog(pc, p.flushlog)
+}
+
+// addWithFlushlog implements add and addNonGo.
+// It is called from signal handlers and other limited environments
+// and cannot allocate memory or acquire locks that might be
+// held at the time of the signal, nor can it use substantial amounts
+// of stack. It may be called by a signal handler with no g or m.
+// It is allowed to call evict, passing the flushlog parameter.
+//go:nosplit
+//go:nowritebarrierrec
+func (p *cpuProfile) addWithFlushlog(pc []uintptr, flushlog func() bool) {
 	if len(pc) > maxCPUProfStack {
 		pc = pc[:maxCPUProfStack]
 	}
@@ -228,8 +244,8 @@
 		}
 	}
 	if e.count > 0 {
-		if !p.evict(e) {
-			// Could not evict entry.  Record lost stack.
+		if !p.evict(e, flushlog) {
+			// Could not evict entry. Record lost stack.
 			p.lost++
 			return
 		}
@@ -245,15 +261,17 @@
 // evict copies the given entry's data into the log, so that
 // the entry can be reused.  evict is called from add, which
 // is called from the profiling signal handler, so it must not
-// allocate memory or block.  It is safe to call flushlog.
-// evict returns true if the entry was copied to the log,
-// false if there was no room available.
-func (p *cpuProfile) evict(e *cpuprofEntry) bool {
+// allocate memory or block, and it may be called with no g or m.
+// It is safe to call flushlog. evict returns true if the entry was
+// copied to the log, false if there was no room available.
+//go:nosplit
+//go:nowritebarrierrec
+func (p *cpuProfile) evict(e *cpuprofEntry, flushlog func() bool) bool {
 	d := e.depth
 	nslot := d + 2
 	log := &p.log[p.toggle]
 	if p.nlog+nslot > len(log) {
-		if !p.flushlog() {
+		if !flushlog() {
 			return false
 		}
 		log = &p.log[p.toggle]
@@ -273,10 +291,11 @@
 
 // flushlog tries to flush the current log and switch to the other one.
 // flushlog is called from evict, called from add, called from the signal handler,
-// so it cannot allocate memory or block.  It can try to swap logs with
+// so it cannot allocate memory or block. It can try to swap logs with
 // the writing goroutine, as explained in the comment at the top of this file.
+//go:nowritebarrierrec
 func (p *cpuProfile) flushlog() bool {
-	if !cas(&p.handoff, 0, uint32(p.nlog)) {
+	if !atomic.Cas(&p.handoff, 0, uint32(p.nlog)) {
 		return false
 	}
 	notewakeup(&p.wait)
@@ -296,8 +315,18 @@
 	return true
 }
 
+// addNonGo is like add, but runs on a non-Go thread.
+// It can't do anything that might need a g or an m.
+// With this entry point, we don't try to flush the log when evicting an
+// old entry. Instead, we just drop the stack trace if we're out of space.
+//go:nosplit
+//go:nowritebarrierrec
+func (p *cpuProfile) addNonGo(pc []uintptr) {
+	p.addWithFlushlog(pc, func() bool { return false })
+}
+
 // getprofile blocks until the next block of profiling data is available
-// and returns it as a []byte.  It is called from the writing goroutine.
+// and returns it as a []byte. It is called from the writing goroutine.
 func (p *cpuProfile) getprofile() []byte {
 	if p == nil {
 		return nil
@@ -318,7 +347,7 @@
 				p.flushing = true
 				goto Flush
 			}
-			if cas(&p.handoff, n, 0) {
+			if atomic.Cas(&p.handoff, n, 0) {
 				break
 			}
 		}
@@ -355,7 +384,7 @@
 	}
 
 	// In flush mode.
-	// Add is no longer being called.  We own the log.
+	// Add is no longer being called. We own the log.
 	// Also, p.handoff is non-zero, so flushlog will return false.
 	// Evict the hash table into the log and return it.
 Flush:
@@ -363,8 +392,8 @@
 		b := &p.hash[i]
 		for j := range b.entry {
 			e := &b.entry[j]
-			if e.count > 0 && !p.evict(e) {
-				// Filled the log.  Stop the loop and return what we've got.
+			if e.count > 0 && !p.evict(e, p.flushlog) {
+				// Filled the log. Stop the loop and return what we've got.
 				break Flush
 			}
 		}
@@ -387,9 +416,9 @@
 		return uintptrBytes(eod[:])
 	}
 
-	// Finally done.  Clean up and return nil.
+	// Finally done. Clean up and return nil.
 	p.flushing = false
-	if !cas(&p.handoff, p.handoff, 0) {
+	if !atomic.Cas(&p.handoff, p.handoff, 0) {
 		print("runtime: profile flush racing with something\n")
 	}
 	return nil
@@ -407,7 +436,7 @@
 }
 
 // CPUProfile returns the next chunk of binary CPU profiling stack trace data,
-// blocking until data is available.  If profiling is turned off and all the profile
+// blocking until data is available. If profiling is turned off and all the profile
 // data accumulated while it was on has been returned, CPUProfile returns nil.
 // The caller must save the returned data before calling CPUProfile again.
 //
diff --git a/src/runtime/cputicks.go b/src/runtime/cputicks.go
index 162e026..9162746 100644
--- a/src/runtime/cputicks.go
+++ b/src/runtime/cputicks.go
@@ -1,12 +1,14 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // +build !arm
 // +build !arm64
+// +build !mips64
+// +build !mips64le
 
 package runtime
 
 // careful: cputicks is not guaranteed to be monotonic!  In particular, we have
-// noticed drift between cpus on certain os/arch combinations.  See issue 8976.
+// noticed drift between cpus on certain os/arch combinations. See issue 8976.
 func cputicks() int64
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 2e65e4c..2504bd0 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -7,10 +7,15 @@
 package runtime_test
 
 import (
+	"bytes"
+	"fmt"
+	"internal/testenv"
+	"os"
 	"os/exec"
 	"runtime"
 	"strings"
 	"testing"
+	"time"
 )
 
 func TestCgoCrashHandler(t *testing.T) {
@@ -21,18 +26,18 @@
 	if testing.Short() && runtime.GOOS == "windows" {
 		t.Skip("Skipping in short mode") // takes up to 64 seconds
 	}
-	got := executeTest(t, cgoSignalDeadlockSource, nil)
+	got := runTestProg(t, "testprogcgo", "CgoSignalDeadlock")
 	want := "OK\n"
 	if got != want {
-		t.Fatalf("expected %q, but got %q", want, got)
+		t.Fatalf("expected %q, but got:\n%s", want, got)
 	}
 }
 
 func TestCgoTraceback(t *testing.T) {
-	got := executeTest(t, cgoTracebackSource, nil)
+	got := runTestProg(t, "testprogcgo", "CgoTraceback")
 	want := "OK\n"
 	if got != want {
-		t.Fatalf("expected %q, but got %q", want, got)
+		t.Fatalf("expected %q, but got:\n%s", want, got)
 	}
 }
 
@@ -40,13 +45,20 @@
 	if runtime.GOOS == "plan9" || runtime.GOOS == "windows" {
 		t.Skipf("no pthreads on %s", runtime.GOOS)
 	}
-	if testing.Short() && runtime.GOOS == "dragonfly" {
-		t.Skip("see golang.org/issue/11990")
+	if testing.Short() {
+		switch {
+		case runtime.GOOS == "dragonfly":
+			t.Skip("see golang.org/issue/11990")
+		case runtime.GOOS == "linux" && runtime.GOARCH == "arm":
+			t.Skip("too slow for arm builders")
+		case runtime.GOOS == "linux" && (runtime.GOARCH == "mips64" || runtime.GOARCH == "mips64le"):
+			t.Skip("too slow for mips64x builders")
+		}
 	}
-	got := executeTest(t, cgoCallbackGCSource, nil)
+	got := runTestProg(t, "testprogcgo", "CgoCallbackGC")
 	want := "OK\n"
 	if got != want {
-		t.Fatalf("expected %q, but got %q", want, got)
+		t.Fatalf("expected %q, but got:\n%s", want, got)
 	}
 }
 
@@ -54,11 +66,7 @@
 	if runtime.GOOS == "plan9" {
 		t.Skipf("no pthreads on %s", runtime.GOOS)
 	}
-	csrc := cgoExternalThreadPanicC
-	if runtime.GOOS == "windows" {
-		csrc = cgoExternalThreadPanicC_windows
-	}
-	got := executeTest(t, cgoExternalThreadPanicSource, nil, "main.c", csrc)
+	got := runTestProg(t, "testprogcgo", "CgoExternalThreadPanic")
 	want := "panic: BOOM"
 	if !strings.Contains(got, want) {
 		t.Fatalf("want failure containing %q. output:\n%s\n", want, got)
@@ -84,15 +92,15 @@
 			}
 		}
 	}
-	if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" {
+	if runtime.GOARCH == "ppc64" {
 		// TODO(austin) External linking not implemented on
 		// ppc64 (issue #8912)
 		t.Skipf("no external linking on ppc64")
 	}
-	got := executeTest(t, cgoExternalThreadSIGPROFSource, nil)
+	got := runTestProg(t, "testprogcgo", "CgoExternalThreadSIGPROF")
 	want := "OK\n"
 	if got != want {
-		t.Fatalf("expected %q, but got %q", want, got)
+		t.Fatalf("expected %q, but got:\n%s", want, got)
 	}
 }
 
@@ -102,10 +110,10 @@
 	case "plan9", "windows":
 		t.Skipf("no pthreads on %s", runtime.GOOS)
 	}
-	got := executeTest(t, cgoExternalThreadSignalSource, nil)
+	got := runTestProg(t, "testprogcgo", "CgoExternalThreadSignal")
 	want := "OK\n"
 	if got != want {
-		t.Fatalf("expected %q, but got %q", want, got)
+		t.Fatalf("expected %q, but got:\n%s", want, got)
 	}
 }
 
@@ -114,368 +122,169 @@
 	if runtime.GOOS != "windows" {
 		t.Skip("skipping windows specific test")
 	}
-	got := executeTest(t, cgoDLLImportsMainSource, nil, "a/a.go", cgoDLLImportsPkgSource)
+	got := runTestProg(t, "testprogcgo", "CgoDLLImportsMain")
 	want := "OK\n"
 	if got != want {
 		t.Fatalf("expected %q, but got %v", want, got)
 	}
 }
 
-const cgoSignalDeadlockSource = `
-package main
-
-import "C"
-
-import (
-	"fmt"
-	"runtime"
-	"time"
-)
-
-func main() {
-	runtime.GOMAXPROCS(100)
-	ping := make(chan bool)
-	go func() {
-		for i := 0; ; i++ {
-			runtime.Gosched()
-			select {
-			case done := <-ping:
-				if done {
-					ping <- true
-					return
-				}
-				ping <- true
-			default:
-			}
-			func() {
-				defer func() {
-					recover()
-				}()
-				var s *string
-				*s = ""
-			}()
-		}
-	}()
-	time.Sleep(time.Millisecond)
-	for i := 0; i < 64; i++ {
-		go func() {
-			runtime.LockOSThread()
-			select {}
-		}()
-		go func() {
-			runtime.LockOSThread()
-			select {}
-		}()
-		time.Sleep(time.Millisecond)
-		ping <- false
-		select {
-		case <-ping:
-		case <-time.After(time.Second):
-			fmt.Printf("HANG\n")
-			return
-		}
+func TestCgoExecSignalMask(t *testing.T) {
+	// Test issue 13164.
+	switch runtime.GOOS {
+	case "windows", "plan9":
+		t.Skipf("skipping signal mask test on %s", runtime.GOOS)
 	}
-	ping <- true
-	select {
-	case <-ping:
-	case <-time.After(time.Second):
-		fmt.Printf("HANG\n")
-		return
-	}
-	fmt.Printf("OK\n")
-}
-`
-
-const cgoTracebackSource = `
-package main
-
-/* void foo(void) {} */
-import "C"
-
-import (
-	"fmt"
-	"runtime"
-)
-
-func main() {
-	C.foo()
-	buf := make([]byte, 1)
-	runtime.Stack(buf, true)
-	fmt.Printf("OK\n")
-}
-`
-
-const cgoCallbackGCSource = `
-package main
-
-import "runtime"
-
-/*
-#include <pthread.h>
-
-void go_callback();
-
-static void *thr(void *arg) {
-    go_callback();
-    return 0;
-}
-
-static void foo() {
-    pthread_t th;
-    pthread_create(&th, 0, thr, 0);
-    pthread_join(th, 0);
-}
-*/
-import "C"
-import "fmt"
-
-//export go_callback
-func go_callback() {
-	runtime.GC()
-	grow()
-	runtime.GC()
-}
-
-var cnt int
-
-func grow() {
-	x := 10000
-	sum := 0
-	if grow1(&x, &sum) == 0 {
-		panic("bad")
+	got := runTestProg(t, "testprogcgo", "CgoExecSignalMask")
+	want := "OK\n"
+	if got != want {
+		t.Errorf("expected %q, got %v", want, got)
 	}
 }
 
-func grow1(x, sum *int) int {
-	if *x == 0 {
-		return *sum + 1
+func TestEnsureDropM(t *testing.T) {
+	// Test for issue 13881.
+	switch runtime.GOOS {
+	case "windows", "plan9":
+		t.Skipf("skipping dropm test on %s", runtime.GOOS)
 	}
-	*x--
-	sum1 := *sum + *x
-	return grow1(x, &sum1)
-}
-
-func main() {
-	const P = 100
-	done := make(chan bool)
-	// allocate a bunch of stack frames and spray them with pointers
-	for i := 0; i < P; i++ {
-		go func() {
-			grow()
-			done <- true
-		}()
+	got := runTestProg(t, "testprogcgo", "EnsureDropM")
+	want := "OK\n"
+	if got != want {
+		t.Errorf("expected %q, got %v", want, got)
 	}
-	for i := 0; i < P; i++ {
-		<-done
+}
+
+// Test for issue 14387.
+// Test that a program that doesn't need any cgo pointer checking
+// takes about the same amount of time with the check as without it.
+func TestCgoCheckBytes(t *testing.T) {
+	// Make sure we don't count the build time as part of the run time.
+	testenv.MustHaveGoBuild(t)
+	exe, err := buildTestProg(t, "testprogcgo")
+	if err != nil {
+		t.Fatal(err)
 	}
-	// now give these stack frames to cgo callbacks
-	for i := 0; i < P; i++ {
-		go func() {
-			C.foo()
-			done <- true
-		}()
-	}
-	for i := 0; i < P; i++ {
-		<-done
-	}
-	fmt.Printf("OK\n")
-}
-`
 
-const cgoExternalThreadPanicSource = `
-package main
+	// Try it 10 times to avoid flakiness.
+	const tries = 10
+	var tot1, tot2 time.Duration
+	for i := 0; i < tries; i++ {
+		cmd := testEnv(exec.Command(exe, "CgoCheckBytes"))
+		cmd.Env = append(cmd.Env, "GODEBUG=cgocheck=0", fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i))
 
-// void start(void);
-import "C"
+		start := time.Now()
+		cmd.Run()
+		d1 := time.Since(start)
 
-func main() {
-	C.start()
-	select {}
-}
+		cmd = testEnv(exec.Command(exe, "CgoCheckBytes"))
+		cmd.Env = append(cmd.Env, fmt.Sprintf("GO_CGOCHECKBYTES_TRY=%d", i))
 
-//export gopanic
-func gopanic() {
-	panic("BOOM")
-}
-`
+		start = time.Now()
+		cmd.Run()
+		d2 := time.Since(start)
 
-const cgoExternalThreadPanicC = `
-#include <stdlib.h>
-#include <stdio.h>
-#include <pthread.h>
-
-void gopanic(void);
-
-static void*
-die(void* x)
-{
-	gopanic();
-	return 0;
-}
-
-void
-start(void)
-{
-	pthread_t t;
-	if(pthread_create(&t, 0, die, 0) != 0)
-		printf("pthread_create failed\n");
-}
-`
-
-const cgoExternalThreadPanicC_windows = `
-#include <stdlib.h>
-#include <stdio.h>
-
-void gopanic(void);
-
-static void*
-die(void* x)
-{
-	gopanic();
-	return 0;
-}
-
-void
-start(void)
-{
-	if(_beginthreadex(0, 0, die, 0, 0, 0) != 0)
-		printf("_beginthreadex failed\n");
-}
-`
-
-const cgoExternalThreadSIGPROFSource = `
-package main
-
-/*
-#include <stdint.h>
-#include <signal.h>
-#include <pthread.h>
-
-volatile int32_t spinlock;
-
-static void *thread1(void *p) {
-	(void)p;
-	while (spinlock == 0)
-		;
-	pthread_kill(pthread_self(), SIGPROF);
-	spinlock = 0;
-	return NULL;
-}
-__attribute__((constructor)) void issue9456() {
-	pthread_t tid;
-	pthread_create(&tid, 0, thread1, NULL);
-}
-*/
-import "C"
-
-import (
-	"runtime"
-	"sync/atomic"
-	"unsafe"
-)
-
-func main() {
-	// This test intends to test that sending SIGPROF to foreign threads
-	// before we make any cgo call will not abort the whole process, so
-	// we cannot make any cgo call here. See https://golang.org/issue/9456.
-	atomic.StoreInt32((*int32)(unsafe.Pointer(&C.spinlock)), 1)
-	for atomic.LoadInt32((*int32)(unsafe.Pointer(&C.spinlock))) == 1 {
-		runtime.Gosched()
-	}
-	println("OK")
-}
-`
-
-const cgoExternalThreadSignalSource = `
-package main
-
-/*
-#include <pthread.h>
-
-void **nullptr;
-
-void *crash(void *p) {
-	*nullptr = p;
-	return 0;
-}
-
-int start_crashing_thread(void) {
-	pthread_t tid;
-	return pthread_create(&tid, 0, crash, 0);
-}
-*/
-import "C"
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"time"
-)
-
-func main() {
-	if len(os.Args) > 1 && os.Args[1] == "crash" {
-		i := C.start_crashing_thread()
-		if i != 0 {
-			fmt.Println("pthread_create failed:", i)
-			// Exit with 0 because parent expects us to crash.
+		if d1*20 > d2 {
+			// The slow version (d2) was less than 20 times
+			// slower than the fast version (d1), so OK.
 			return
 		}
 
-		// We should crash immediately, but give it plenty of
-		// time before failing (by exiting 0) in case we are
-		// running on a slow system.
-		time.Sleep(5 * time.Second)
-		return
+		tot1 += d1
+		tot2 += d2
 	}
 
-	out, err := exec.Command(os.Args[0], "crash").CombinedOutput()
-	if err == nil {
-		fmt.Println("C signal did not crash as expected\n")
-		fmt.Printf("%s\n", out)
-		os.Exit(1)
+	t.Errorf("cgo check too slow: got %v, expected at most %v", tot2/tries, (tot1/tries)*20)
+}
+
+func TestCgoPanicDeadlock(t *testing.T) {
+	// test issue 14432
+	got := runTestProg(t, "testprogcgo", "CgoPanicDeadlock")
+	want := "panic: cgo error\n\n"
+	if !strings.HasPrefix(got, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, got)
+	}
+}
+
+func TestCgoCCodeSIGPROF(t *testing.T) {
+	got := runTestProg(t, "testprogcgo", "CgoCCodeSIGPROF")
+	want := "OK\n"
+	if got != want {
+		t.Errorf("expected %q got %v", want, got)
+	}
+}
+
+func TestCgoCrashTraceback(t *testing.T) {
+	if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" {
+		t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH)
+	}
+	got := runTestProg(t, "testprogcgo", "CrashTraceback")
+	for i := 1; i <= 3; i++ {
+		if !strings.Contains(got, fmt.Sprintf("cgo symbolizer:%d", i)) {
+			t.Errorf("missing cgo symbolizer:%d", i)
+		}
+	}
+}
+
+func TestCgoTracebackContext(t *testing.T) {
+	got := runTestProg(t, "testprogcgo", "TracebackContext")
+	want := "OK\n"
+	if got != want {
+		t.Errorf("expected %q got %v", want, got)
+	}
+}
+
+func testCgoPprof(t *testing.T, buildArg, runArg string) {
+	if runtime.GOOS != "linux" || runtime.GOARCH != "amd64" {
+		t.Skipf("not yet supported on %s/%s", runtime.GOOS, runtime.GOARCH)
+	}
+	testenv.MustHaveGoRun(t)
+
+	exe, err := buildTestProg(t, "testprogcgo", buildArg)
+	if err != nil {
+		t.Fatal(err)
 	}
 
-	fmt.Println("OK")
+	got, err := testEnv(exec.Command(exe, runArg)).CombinedOutput()
+	if err != nil {
+		t.Fatal(err)
+	}
+	fn := strings.TrimSpace(string(got))
+	defer os.Remove(fn)
+
+	cmd := testEnv(exec.Command("go", "tool", "pprof", "-top", "-nodecount=1", exe, fn))
+
+	found := false
+	for i, e := range cmd.Env {
+		if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+			cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+			found = true
+			break
+		}
+	}
+	if !found {
+		cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
+	}
+
+	top, err := cmd.CombinedOutput()
+	t.Logf("%s", top)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !bytes.Contains(top, []byte("cpuHog")) {
+		t.Error("missing cpuHog in pprof output")
+	}
 }
-`
 
-const cgoDLLImportsMainSource = `
-package main
-
-/*
-#include <windows.h>
-
-DWORD getthread() {
-	return GetCurrentThreadId();
+func TestCgoPprof(t *testing.T) {
+	testCgoPprof(t, "", "CgoPprof")
 }
-*/
-import "C"
 
-import "./a"
-
-func main() {
-	C.getthread()
-	a.GetThread()
-	println("OK")
+func TestCgoPprofPIE(t *testing.T) {
+	testCgoPprof(t, "-ldflags=-extldflags=-pie", "CgoPprof")
 }
-`
 
-const cgoDLLImportsPkgSource = `
-package a
-
-/*
-#cgo CFLAGS: -mnop-fun-dllimport
-
-#include <windows.h>
-
-DWORD agetthread() {
-	return GetCurrentThreadId();
+func TestCgoPprofThread(t *testing.T) {
+	testCgoPprof(t, "", "CgoPprofThread")
 }
-*/
-import "C"
-
-func GetThread() uint32 {
-	return uint32(C.agetthread())
-}
-`
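
The cgo tests above now delegate to programs under testdata/testprogcgo, whose sources are not part of this diff. As a rough sketch of the contract testCgoPprof relies on — the program writes a CPU profile, prints the profile's path as its only output, and keeps the hot work in a function named cpuHog so that `go tool pprof -top -nodecount=1` reports it — a pure-Go stand-in might look like this (all names illustrative, not the actual testprogcgo source):

	package main

	import (
		"fmt"
		"io/ioutil"
		"log"
		"runtime/pprof"
	)

	func cpuHog() {
		x := 0
		for i := 0; i < 1e8; i++ {
			x += i % 7 // burn CPU so cpuHog dominates the profile
		}
		_ = x
	}

	func main() {
		f, err := ioutil.TempFile("", "prof")
		if err != nil {
			log.Fatal(err)
		}
		if err := pprof.StartCPUProfile(f); err != nil {
			log.Fatal(err)
		}
		cpuHog()
		pprof.StopCPUProfile()
		name := f.Name()
		f.Close()
		fmt.Println(name) // the test feeds this path to go tool pprof
	}
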
diff --git a/src/runtime/crash_nonunix_test.go b/src/runtime/crash_nonunix_test.go
new file mode 100644
index 0000000..2ce995c
--- /dev/null
+++ b/src/runtime/crash_nonunix_test.go
@@ -0,0 +1,13 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build windows plan9 nacl
+
+package runtime_test
+
+import "os"
+
+// sigquit is the signal to send to kill a hanging testdata program.
+// On Unix we send SIGQUIT, but on non-Unix we only have os.Kill.
+var sigquit = os.Kill
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 8efce4d..a2f7ff7 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -5,6 +5,7 @@
 package runtime_test
 
 import (
+	"bytes"
 	"fmt"
 	"internal/testenv"
 	"io/ioutil"
@@ -13,12 +14,23 @@
 	"path/filepath"
 	"regexp"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"testing"
-	"text/template"
+	"time"
 )
 
+var toRemove []string
+
+func TestMain(m *testing.M) {
+	status := m.Run()
+	for _, file := range toRemove {
+		os.RemoveAll(file)
+	}
+	os.Exit(status)
+}
+
 func testEnv(cmd *exec.Cmd) *exec.Cmd {
 	if cmd.Env != nil {
 		panic("environment already set")
@@ -38,55 +50,104 @@
 	return cmd
 }
 
-func executeTest(t *testing.T, templ string, data interface{}, extra ...string) string {
+var testprog struct {
+	sync.Mutex
+	dir    string
+	target map[string]buildexe
+}
+
+type buildexe struct {
+	exe string
+	err error
+}
+
+func runTestProg(t *testing.T, binary, name string) string {
 	testenv.MustHaveGoBuild(t)
 
-	checkStaleRuntime(t)
-
-	st := template.Must(template.New("crashSource").Parse(templ))
-
-	dir, err := ioutil.TempDir("", "go-build")
+	exe, err := buildTestProg(t, binary)
 	if err != nil {
-		t.Fatalf("failed to create temp directory: %v", err)
-	}
-	defer os.RemoveAll(dir)
-
-	src := filepath.Join(dir, "main.go")
-	f, err := os.Create(src)
-	if err != nil {
-		t.Fatalf("failed to create file: %v", err)
-	}
-	err = st.Execute(f, data)
-	if err != nil {
-		f.Close()
-		t.Fatalf("failed to execute template: %v", err)
-	}
-	if err := f.Close(); err != nil {
-		t.Fatalf("failed to close file: %v", err)
+		t.Fatal(err)
 	}
 
-	for i := 0; i < len(extra); i += 2 {
-		fname := extra[i]
-		contents := extra[i+1]
-		if d, _ := filepath.Split(fname); d != "" {
-			if err := os.Mkdir(filepath.Join(dir, d), 0755); err != nil {
-				t.Fatal(err)
+	cmd := testEnv(exec.Command(exe, name))
+	var b bytes.Buffer
+	cmd.Stdout = &b
+	cmd.Stderr = &b
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("starting %s %s: %v", binary, name, err)
+	}
+
+	// If the process doesn't complete within 1 minute,
+	// assume it is hanging and kill it to get a stack trace.
+	p := cmd.Process
+	done := make(chan bool)
+	go func() {
+		scale := 1
+		// This GOARCH/GOOS test is copied from cmd/dist/test.go.
+		// TODO(iant): Have cmd/dist update the environment variable.
+		if runtime.GOARCH == "arm" || runtime.GOOS == "windows" {
+			scale = 2
+		}
+		if s := os.Getenv("GO_TEST_TIMEOUT_SCALE"); s != "" {
+			if sc, err := strconv.Atoi(s); err == nil {
+				scale = sc
 			}
 		}
-		if err := ioutil.WriteFile(filepath.Join(dir, fname), []byte(contents), 0666); err != nil {
-			t.Fatal(err)
+
+		select {
+		case <-done:
+		case <-time.After(time.Duration(scale) * time.Minute):
+			p.Signal(sigquit)
 		}
+	}()
+
+	if err := cmd.Wait(); err != nil {
+		t.Logf("%s %s exit status: %v", binary, name, err)
+	}
+	close(done)
+
+	return b.String()
+}
+
+func buildTestProg(t *testing.T, binary string, flags ...string) (string, error) {
+	checkStaleRuntime(t)
+
+	testprog.Lock()
+	defer testprog.Unlock()
+	if testprog.dir == "" {
+		dir, err := ioutil.TempDir("", "go-build")
+		if err != nil {
+			t.Fatalf("failed to create temp directory: %v", err)
+		}
+		testprog.dir = dir
+		toRemove = append(toRemove, dir)
 	}
 
-	cmd := exec.Command("go", "build", "-o", "a.exe")
-	cmd.Dir = dir
+	if testprog.target == nil {
+		testprog.target = make(map[string]buildexe)
+	}
+	name := binary
+	if len(flags) > 0 {
+		name += "_" + strings.Join(flags, "_")
+	}
+	target, ok := testprog.target[name]
+	if ok {
+		return target.exe, target.err
+	}
+
+	exe := filepath.Join(testprog.dir, name+".exe")
+	cmd := exec.Command("go", append([]string{"build", "-o", exe}, flags...)...)
+	cmd.Dir = "testdata/" + binary
 	out, err := testEnv(cmd).CombinedOutput()
 	if err != nil {
-		t.Fatalf("building source: %v\n%s", err, out)
+		exe = ""
+		target.err = fmt.Errorf("building %s %v: %v\n%s", binary, flags, err, out)
+		testprog.target[name] = target
+		return "", target.err
 	}
-
-	got, _ := testEnv(exec.Command(filepath.Join(dir, "a.exe"))).CombinedOutput()
-	return string(got)
+	target.exe = exe
+	testprog.target[name] = target
+	return exe, nil
 }
 
 var (
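
Note on runTestProg's watchdog above: the child gets one minute (doubled on arm and windows, further scaled by GO_TEST_TIMEOUT_SCALE) and is then signaled, so a hung program produces a stack trace rather than a silent test timeout. A freestanding sketch of the same pattern, with `sleep` standing in for the built test binary:

	package main

	import (
		"os"
		"os/exec"
		"time"
	)

	func main() {
		cmd := exec.Command("sleep", "120") // stand-in for the test binary
		if err := cmd.Start(); err != nil {
			panic(err)
		}
		done := make(chan error, 1)
		go func() { done <- cmd.Wait() }()
		select {
		case <-done:
			// the child finished before the deadline
		case <-time.After(2 * time.Second):
			// deadline exceeded: signal the child so it dies visibly;
			// the runtime tests send SIGQUIT on Unix to force a stack dump
			cmd.Process.Signal(os.Kill)
			<-done
		}
	}
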
@@ -115,7 +176,12 @@
 	type crashTest struct {
 		Cgo bool
 	}
-	output := executeTest(t, crashSource, &crashTest{Cgo: cgo})
+	var output string
+	if cgo {
+		output = runTestProg(t, "testprogcgo", "Crash")
+	} else {
+		output = runTestProg(t, "testprog", "Crash")
+	}
 	want := "main: recovered done\nnew-thread: recovered done\nsecond-new-thread: recovered done\nmain-again: recovered done\n"
 	if output != want {
 		t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
@@ -126,8 +192,8 @@
 	testCrashHandler(t, false)
 }
 
-func testDeadlock(t *testing.T, source string) {
-	output := executeTest(t, source, nil)
+func testDeadlock(t *testing.T, name string) {
+	output := runTestProg(t, "testprog", name)
 	want := "fatal error: all goroutines are asleep - deadlock!\n"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
@@ -135,23 +201,23 @@
 }
 
 func TestSimpleDeadlock(t *testing.T) {
-	testDeadlock(t, simpleDeadlockSource)
+	testDeadlock(t, "SimpleDeadlock")
 }
 
 func TestInitDeadlock(t *testing.T) {
-	testDeadlock(t, initDeadlockSource)
+	testDeadlock(t, "InitDeadlock")
 }
 
 func TestLockedDeadlock(t *testing.T) {
-	testDeadlock(t, lockedDeadlockSource)
+	testDeadlock(t, "LockedDeadlock")
 }
 
 func TestLockedDeadlock2(t *testing.T) {
-	testDeadlock(t, lockedDeadlockSource2)
+	testDeadlock(t, "LockedDeadlock2")
 }
 
 func TestGoexitDeadlock(t *testing.T) {
-	output := executeTest(t, goexitDeadlockSource, nil)
+	output := runTestProg(t, "testprog", "GoexitDeadlock")
 	want := "no goroutines (main called runtime.Goexit) - deadlock!"
 	if !strings.Contains(output, want) {
 		t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
@@ -159,15 +225,15 @@
 }
 
 func TestStackOverflow(t *testing.T) {
-	output := executeTest(t, stackOverflowSource, nil)
-	want := "runtime: goroutine stack exceeds 4194304-byte limit\nfatal error: stack overflow"
+	output := runTestProg(t, "testprog", "StackOverflow")
+	want := "runtime: goroutine stack exceeds 1474560-byte limit\nfatal error: stack overflow"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
 }
 
 func TestThreadExhaustion(t *testing.T) {
-	output := executeTest(t, threadExhaustionSource, nil)
+	output := runTestProg(t, "testprog", "ThreadExhaustion")
 	want := "runtime: program exceeds 10-thread limit\nfatal error: thread exhaustion"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
@@ -175,7 +241,7 @@
 }
 
 func TestRecursivePanic(t *testing.T) {
-	output := executeTest(t, recursivePanicSource, nil)
+	output := runTestProg(t, "testprog", "RecursivePanic")
 	want := `wrap: bad
 panic: again
 
@@ -187,7 +253,7 @@
 }
 
 func TestGoexitCrash(t *testing.T) {
-	output := executeTest(t, goexitExitSource, nil)
+	output := runTestProg(t, "testprog", "GoexitExit")
 	want := "no goroutines (main called runtime.Goexit) - deadlock!"
 	if !strings.Contains(output, want) {
 		t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
@@ -211,15 +277,15 @@
 }
 
 func TestGoNil(t *testing.T) {
-	output := executeTest(t, goNilSource, nil)
+	output := runTestProg(t, "testprog", "GoNil")
 	want := "go of nil func value"
 	if !strings.Contains(output, want) {
 		t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
 	}
 }
 
-func TestMainGoroutineId(t *testing.T) {
-	output := executeTest(t, mainGoroutineIdSource, nil)
+func TestMainGoroutineID(t *testing.T) {
+	output := runTestProg(t, "testprog", "MainGoroutineID")
 	want := "panic: test\n\ngoroutine 1 [running]:\n"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
@@ -227,7 +293,7 @@
 }
 
 func TestNoHelperGoroutines(t *testing.T) {
-	output := executeTest(t, noHelperGoroutinesSource, nil)
+	output := runTestProg(t, "testprog", "NoHelperGoroutines")
 	matches := regexp.MustCompile(`goroutine [0-9]+ \[`).FindAllStringSubmatch(output, -1)
 	if len(matches) != 1 || matches[0][0] != "goroutine 1 [" {
 		t.Fatalf("want to see only goroutine 1, see:\n%s", output)
@@ -235,318 +301,92 @@
 }
 
 func TestBreakpoint(t *testing.T) {
-	output := executeTest(t, breakpointSource, nil)
+	output := runTestProg(t, "testprog", "Breakpoint")
 	want := "runtime.Breakpoint()"
 	if !strings.Contains(output, want) {
 		t.Fatalf("output:\n%s\n\nwant output containing: %s", output, want)
 	}
 }
 
-const crashSource = `
-package main
-
-import (
-	"fmt"
-	"runtime"
-)
-
-{{if .Cgo}}
-import "C"
-{{end}}
-
-func test(name string) {
-	defer func() {
-		if x := recover(); x != nil {
-			fmt.Printf(" recovered")
-		}
-		fmt.Printf(" done\n")
-	}()
-	fmt.Printf("%s:", name)
-	var s *string
-	_ = *s
-	fmt.Print("SHOULD NOT BE HERE")
-}
-
-func testInNewThread(name string) {
-	c := make(chan bool)
-	go func() {
-		runtime.LockOSThread()
-		test(name)
-		c <- true
-	}()
-	<-c
-}
-
-func main() {
-	runtime.LockOSThread()
-	test("main")
-	testInNewThread("new-thread")
-	testInNewThread("second-new-thread")
-	test("main-again")
-}
-`
-
-const simpleDeadlockSource = `
-package main
-func main() {
-	select {}
-}
-`
-
-const initDeadlockSource = `
-package main
-func init() {
-	select {}
-}
-func main() {
-}
-`
-
-const lockedDeadlockSource = `
-package main
-import "runtime"
-func main() {
-	runtime.LockOSThread()
-	select {}
-}
-`
-
-const lockedDeadlockSource2 = `
-package main
-import (
-	"runtime"
-	"time"
-)
-func main() {
-	go func() {
-		runtime.LockOSThread()
-		select {}
-	}()
-	time.Sleep(time.Millisecond)
-	select {}
-}
-`
-
-const goexitDeadlockSource = `
-package main
-import (
-      "runtime"
-)
-
-func F() {
-      for i := 0; i < 10; i++ {
-      }
-}
-
-func main() {
-      go F()
-      go F()
-      runtime.Goexit()
-}
-`
-
-const stackOverflowSource = `
-package main
-
-import "runtime/debug"
-
-func main() {
-	debug.SetMaxStack(4<<20)
-	f(make([]byte, 10))
-}
-
-func f(x []byte) byte {
-	var buf [64<<10]byte
-	return x[0] + f(buf[:])
-}
-`
-
-const threadExhaustionSource = `
-package main
-
-import (
-	"runtime"
-	"runtime/debug"
-)
-
-func main() {
-	debug.SetMaxThreads(10)
-	c := make(chan int)
-	for i := 0; i < 100; i++ {
-		go func() {
-			runtime.LockOSThread()
-			c <- 0
-			select{}
-		}()
-		<-c
-	}
-}
-`
-
-const recursivePanicSource = `
-package main
-
-import (
-	"fmt"
-)
-
-func main() {
-	func() {
-		defer func() {
-			fmt.Println(recover())
-		}()
-		var x [8192]byte
-		func(x [8192]byte) {
-			defer func() {
-				if err := recover(); err != nil {
-					panic("wrap: " + err.(string))
-				}
-			}()
-			panic("bad")
-		}(x)
-	}()
-	panic("again")
-}
-`
-
-const goexitExitSource = `
-package main
-
-import (
-	"runtime"
-	"time"
-)
-
-func main() {
-	go func() {
-		time.Sleep(time.Millisecond)
-	}()
-	i := 0
-	runtime.SetFinalizer(&i, func(p *int) {})
-	runtime.GC()
-	runtime.Goexit()
-}
-`
-
-const goNilSource = `
-package main
-
-func main() {
-	defer func() {
-		recover()
-	}()
-	var f func()
-	go f()
-	select{}
-}
-`
-
-const mainGoroutineIdSource = `
-package main
-func main() {
-	panic("test")
-}
-`
-
-const noHelperGoroutinesSource = `
-package main
-import (
-	"runtime"
-	"time"
-)
-func init() {
-	i := 0
-	runtime.SetFinalizer(&i, func(p *int) {})
-	time.AfterFunc(time.Hour, func() {})
-	panic("oops")
-}
-func main() {
-}
-`
-
-const breakpointSource = `
-package main
-import "runtime"
-func main() {
-	runtime.Breakpoint()
-}
-`
-
 func TestGoexitInPanic(t *testing.T) {
 	// see issue 8774: this code used to trigger an infinite recursion
-	output := executeTest(t, goexitInPanicSource, nil)
+	output := runTestProg(t, "testprog", "GoexitInPanic")
 	want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
 }
 
-const goexitInPanicSource = `
-package main
-import "runtime"
-func main() {
-	go func() {
-		defer func() {
-			runtime.Goexit()
-		}()
-		panic("hello")
-	}()
-	runtime.Goexit()
+// Issue 14965: Runtime panics should be of type runtime.Error
+func TestRuntimePanicWithRuntimeError(t *testing.T) {
+	testCases := [...]func(){
+		0: func() {
+			var m map[uint64]bool
+			m[1234] = true
+		},
+		1: func() {
+			ch := make(chan struct{})
+			close(ch)
+			close(ch)
+		},
+		2: func() {
+			var ch = make(chan struct{})
+			close(ch)
+			ch <- struct{}{}
+		},
+		3: func() {
+			var s = make([]int, 2)
+			_ = s[2]
+		},
+		4: func() {
+			n := -1
+			_ = make(chan bool, n)
+		},
+		5: func() {
+			close((chan bool)(nil))
+		},
+	}
+
+	for i, fn := range testCases {
+		got := panicValue(fn)
+		if _, ok := got.(runtime.Error); !ok {
+			t.Errorf("test #%d: recovered value %v (type %T) does not implement runtime.Error", i, got, got)
+		}
+	}
 }
-`
+
+func panicValue(fn func()) (recovered interface{}) {
+	defer func() {
+		recovered = recover()
+	}()
+	fn()
+	return
+}
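
For example, elsewhere in this file one could use panicValue to turn a runtime panic into an inspectable value (the out-of-range index is just one trigger):

	got := panicValue(func() {
		var s []int
		_ = s[3] // out-of-range slice index panics at run time
	})
	_, ok := got.(runtime.Error)
	fmt.Println(ok) // true: index panics satisfy runtime.Error
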
 
 func TestPanicAfterGoexit(t *testing.T) {
 	// an uncaught panic should still work after goexit
-	output := executeTest(t, panicAfterGoexitSource, nil)
+	output := runTestProg(t, "testprog", "PanicAfterGoexit")
 	want := "panic: hello"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
 }
 
-const panicAfterGoexitSource = `
-package main
-import "runtime"
-func main() {
-	defer func() {
-		panic("hello")
-	}()
-	runtime.Goexit()
-}
-`
-
 func TestRecoveredPanicAfterGoexit(t *testing.T) {
-	output := executeTest(t, recoveredPanicAfterGoexitSource, nil)
+	output := runTestProg(t, "testprog", "RecoveredPanicAfterGoexit")
 	want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
 	if !strings.HasPrefix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
 }
 
-const recoveredPanicAfterGoexitSource = `
-package main
-import "runtime"
-func main() {
-	defer func() {
-		defer func() {
-			r := recover()
-			if r == nil {
-				panic("bad recover")
-			}
-		}()
-		panic("hello")
-	}()
-	runtime.Goexit()
-}
-`
-
 func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
 	// 1. defer a function that recovers
 	// 2. defer a function that panics
 	// 3. call goexit
-	// Goexit should run the #2 defer.  Its panic
+	// Goexit should run the #2 defer. Its panic
 	// should be caught by the #1 defer, and execution
-	// should resume in the caller.  Like the Goexit
+	// should resume in the caller. Like the Goexit
 	// never happened!
 	defer func() {
 		r := recover()
@@ -561,29 +401,84 @@
 }
 
 func TestNetpollDeadlock(t *testing.T) {
-	output := executeTest(t, netpollDeadlockSource, nil)
+	output := runTestProg(t, "testprognet", "NetpollDeadlock")
 	want := "done\n"
 	if !strings.HasSuffix(output, want) {
 		t.Fatalf("output does not start with %q:\n%s", want, output)
 	}
 }
 
-const netpollDeadlockSource = `
-package main
-import (
-	"fmt"
-	"net"
-)
-func init() {
-	fmt.Println("dialing")
-	c, err := net.Dial("tcp", "localhost:14356")
-	if err == nil {
-		c.Close()
-	} else {
-		fmt.Println("error: ", err)
+func TestPanicTraceback(t *testing.T) {
+	output := runTestProg(t, "testprog", "PanicTraceback")
+	want := "panic: hello"
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
+
+	// Check functions in the traceback.
+	fns := []string{"panic", "main.pt1.func1", "panic", "main.pt2.func1", "panic", "main.pt2", "main.pt1"}
+	for _, fn := range fns {
+		re := regexp.MustCompile(`(?m)^` + regexp.QuoteMeta(fn) + `\(.*\n`)
+		idx := re.FindStringIndex(output)
+		if idx == nil {
+			t.Fatalf("expected %q function in traceback:\n%s", fn, output)
+		}
+		output = output[idx[1]:]
 	}
 }
-func main() {
-	fmt.Println("done")
+
+func testPanicDeadlock(t *testing.T, name string, want string) {
+	// test issue 14432
+	output := runTestProg(t, "testprog", name)
+	if !strings.HasPrefix(output, want) {
+		t.Fatalf("output does not start with %q:\n%s", want, output)
+	}
 }
-`
+
+func TestPanicDeadlockGosched(t *testing.T) {
+	testPanicDeadlock(t, "GoschedInPanic", "panic: errorThatGosched\n\n")
+}
+
+func TestPanicDeadlockSyscall(t *testing.T) {
+	testPanicDeadlock(t, "SyscallInPanic", "1\n2\npanic: 3\n\n")
+}
+
+func TestMemPprof(t *testing.T) {
+	testenv.MustHaveGoRun(t)
+
+	exe, err := buildTestProg(t, "testprog")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	got, err := testEnv(exec.Command(exe, "MemProf")).CombinedOutput()
+	if err != nil {
+		t.Fatal(err)
+	}
+	fn := strings.TrimSpace(string(got))
+	defer os.Remove(fn)
+
+	cmd := testEnv(exec.Command("go", "tool", "pprof", "-alloc_space", "-top", exe, fn))
+
+	found := false
+	for i, e := range cmd.Env {
+		if strings.HasPrefix(e, "PPROF_TMPDIR=") {
+			cmd.Env[i] = "PPROF_TMPDIR=" + os.TempDir()
+			found = true
+			break
+		}
+	}
+	if !found {
+		cmd.Env = append(cmd.Env, "PPROF_TMPDIR="+os.TempDir())
+	}
+
+	top, err := cmd.CombinedOutput()
+	t.Logf("%s", top)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !bytes.Contains(top, []byte("MemProf")) {
+		t.Error("missing MemProf in pprof output")
+	}
+}
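
TestMemPprof expects the testprog MemProf entry (not included in this diff) to print the path of a heap profile containing MemProf near the top. A hypothetical, self-contained version of that side of the protocol:

	package main

	import (
		"fmt"
		"io/ioutil"
		"log"
		"runtime"
		"runtime/pprof"
	)

	var sink []byte

	func MemProf() {
		for i := 0; i < 1000; i++ {
			sink = make([]byte, 1<<20) // allocations attributed to MemProf
		}
		f, err := ioutil.TempFile("", "memprof")
		if err != nil {
			log.Fatal(err)
		}
		runtime.GC() // flush recent allocation data into the heap profile
		if err := pprof.WriteHeapProfile(f); err != nil {
			log.Fatal(err)
		}
		name := f.Name()
		f.Close()
		fmt.Println(name) // the test feeds this path to go tool pprof
	}

	func main() { MemProf() }
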
diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go
index b925d02..6e4d04b 100644
--- a/src/runtime/crash_unix_test.go
+++ b/src/runtime/crash_unix_test.go
@@ -14,10 +14,15 @@
 	"os/exec"
 	"path/filepath"
 	"runtime"
+	"strings"
 	"syscall"
 	"testing"
 )
 
+// sigquit is the signal to send to kill a hanging testdata program.
+// Send SIGQUIT to get a stack trace.
+var sigquit = syscall.SIGQUIT
+
 func TestCrashDumpsAllThreads(t *testing.T) {
 	switch runtime.GOOS {
 	case "darwin", "dragonfly", "freebsd", "linux", "netbsd", "openbsd", "solaris":
@@ -52,6 +57,18 @@
 	cmd = exec.Command(filepath.Join(dir, "a.exe"))
 	cmd = testEnv(cmd)
 	cmd.Env = append(cmd.Env, "GOTRACEBACK=crash")
+
+	// Set GOGC=off. Because of golang.org/issue/10958, the tight
+	// loops in the test program are not preemptible. If GC kicks
+	// in, it may lock up and prevent main from saying it's ready.
+	newEnv := []string{}
+	for _, s := range cmd.Env {
+		if !strings.HasPrefix(s, "GOGC=") {
+			newEnv = append(newEnv, s)
+		}
+	}
+	cmd.Env = append(newEnv, "GOGC=off")
+
 	var outbuf bytes.Buffer
 	cmd.Stdout = &outbuf
 	cmd.Stderr = &outbuf
@@ -133,3 +150,29 @@
 	}
 }
 `
+
+func TestSignalExitStatus(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	exe, err := buildTestProg(t, "testprog")
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = testEnv(exec.Command(exe, "SignalExitStatus")).Run()
+	if err == nil {
+		t.Error("test program succeeded unexpectedly")
+	} else if ee, ok := err.(*exec.ExitError); !ok {
+		t.Errorf("error (%v) has type %T; expected exec.ExitError", err, err)
+	} else if ws, ok := ee.Sys().(syscall.WaitStatus); !ok {
+		t.Errorf("error.Sys (%v) has type %T; expected syscall.WaitStatus", ee.Sys(), ee.Sys())
+	} else if !ws.Signaled() || ws.Signal() != syscall.SIGTERM {
+		t.Errorf("got %v; expected SIGTERM", ee)
+	}
+}
+
+func TestSignalIgnoreSIGTRAP(t *testing.T) {
+	output := runTestProg(t, "testprognet", "SignalIgnoreSIGTRAP")
+	want := "OK\n"
+	if output != want {
+		t.Fatalf("want %q, got %q", want, output)
+	}
+}
diff --git a/src/runtime/debug.go b/src/runtime/debug.go
index b7e7971..0e798fc 100644
--- a/src/runtime/debug.go
+++ b/src/runtime/debug.go
@@ -4,10 +4,13 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // GOMAXPROCS sets the maximum number of CPUs that can be executing
-// simultaneously and returns the previous setting.  If n < 1, it does not
+// simultaneously and returns the previous setting. If n < 1, it does not
 // change the current setting.
 // The number of logical CPUs on the local machine can be queried with NumCPU.
 // This call will go away when the scheduler improves.
@@ -32,6 +35,10 @@
 }
 
 // NumCPU returns the number of logical CPUs usable by the current process.
+//
+// The set of available CPUs is checked by querying the operating system
+// at process startup. Changes to operating system CPU allocation after
+// process startup are not reflected.
 func NumCPU() int {
 	return int(ncpu)
 }
@@ -39,7 +46,7 @@
 // NumCgoCall returns the number of cgo calls made by the current process.
 func NumCgoCall() int64 {
 	var n int64
-	for mp := (*m)(atomicloadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+	for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
 		n += int64(mp.ncgocall)
 	}
 	return n
diff --git a/src/runtime/debug/debug.s b/src/runtime/debug/debug.s
index a7292c4..6aae33a 100644
--- a/src/runtime/debug/debug.s
+++ b/src/runtime/debug/debug.s
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/debug/garbage.go b/src/runtime/debug/garbage.go
index 41202f9..8144497 100644
--- a/src/runtime/debug/garbage.go
+++ b/src/runtime/debug/garbage.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -155,5 +155,22 @@
 
 // WriteHeapDump writes a description of the heap and the objects in
 // it to the given file descriptor.
-// The heap dump format is defined at https://golang.org/s/go13heapdump.
+//
+// WriteHeapDump suspends the execution of all goroutines until the heap
+// dump is completely written. Thus, the file descriptor must not be
+// connected to a pipe or socket whose other end is in the same Go
+// process; instead, use a temporary file or network socket.
+//
+// The heap dump format is defined at https://golang.org/s/go15heapdump.
 func WriteHeapDump(fd uintptr)
+
+// SetTraceback sets the amount of detail printed by the runtime in
+// the traceback it prints before exiting due to an unrecovered panic
+// or an internal runtime error.
+// The level argument takes the same values as the GOTRACEBACK
+// environment variable. For example, SetTraceback("all") ensures
+// that the program prints all goroutines when it crashes.
+// See the package runtime documentation for details.
+// If SetTraceback is called with a level lower than that of the
+// environment variable, the call is ignored.
+func SetTraceback(level string)
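
For illustration, a minimal use of the new API (the panic is only there to trigger a traceback):

	package main

	import "runtime/debug"

	func main() {
		// "all" prints every user goroutine on crash; a level lower
		// than the GOTRACEBACK environment setting is ignored
		debug.SetTraceback("all")
		panic("boom")
	}
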
diff --git a/src/runtime/debug/garbage_test.go b/src/runtime/debug/garbage_test.go
index 3e3483d..6ec94aa 100644
--- a/src/runtime/debug/garbage_test.go
+++ b/src/runtime/debug/garbage_test.go
@@ -1,11 +1,12 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package debug
+package debug_test
 
 import (
 	"runtime"
+	. "runtime/debug"
 	"testing"
 	"time"
 )
@@ -88,7 +89,7 @@
 var big = make([]byte, 1<<20)
 
 func TestFreeOSMemory(t *testing.T) {
-	if runtime.GOARCH == "arm64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" ||
+	if runtime.GOARCH == "arm64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "mips64" || runtime.GOARCH == "mips64le" ||
 		runtime.GOOS == "nacl" {
 		t.Skip("issue 9993; scavenger temporarily disabled on systems with physical pages larger than logical pages")
 	}
diff --git a/src/runtime/debug/heapdump_test.go b/src/runtime/debug/heapdump_test.go
index cb2f2f0..7d5b950 100644
--- a/src/runtime/debug/heapdump_test.go
+++ b/src/runtime/debug/heapdump_test.go
@@ -2,12 +2,13 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package debug
+package debug_test
 
 import (
 	"io/ioutil"
 	"os"
 	"runtime"
+	. "runtime/debug"
 	"testing"
 )
 
@@ -37,7 +38,7 @@
 }
 
 func objfin(x *Obj) {
-	println("finalized", x)
+	//println("finalized", x)
 }
 
 func TestWriteHeapDumpFinalizers(t *testing.T) {
diff --git a/src/runtime/debug/stack.go b/src/runtime/debug/stack.go
index ab12bff..5d810af 100644
--- a/src/runtime/debug/stack.go
+++ b/src/runtime/debug/stack.go
@@ -7,92 +7,24 @@
 package debug
 
 import (
-	"bytes"
-	"fmt"
-	"io/ioutil"
 	"os"
 	"runtime"
 )
 
-var (
-	dunno     = []byte("???")
-	centerDot = []byte("·")
-	dot       = []byte(".")
-	slash     = []byte("/")
-)
-
-// PrintStack prints to standard error the stack trace returned by Stack.
+// PrintStack prints to standard error the stack trace returned by runtime.Stack.
 func PrintStack() {
-	os.Stderr.Write(stack())
+	os.Stderr.Write(Stack())
 }
 
 // Stack returns a formatted stack trace of the goroutine that calls it.
-// For each routine, it includes the source line information and PC value,
-// then attempts to discover, for Go functions, the calling function or
-// method and the text of the line containing the invocation.
-//
-// Deprecated: Use package runtime's Stack instead.
+// It calls runtime.Stack with a large enough buffer to capture the entire trace.
 func Stack() []byte {
-	return stack()
-}
-
-// stack implements Stack, skipping 2 frames
-func stack() []byte {
-	buf := new(bytes.Buffer) // the returned data
-	// As we loop, we open files and read them. These variables record the currently
-	// loaded file.
-	var lines [][]byte
-	var lastFile string
-	for i := 2; ; i++ { // Caller we care about is the user, 2 frames up
-		pc, file, line, ok := runtime.Caller(i)
-		if !ok {
-			break
+	buf := make([]byte, 1024)
+	for {
+		n := runtime.Stack(buf, false)
+		if n < len(buf) {
+			return buf[:n]
 		}
-		// Print this much at least.  If we can't find the source, it won't show.
-		fmt.Fprintf(buf, "%s:%d (0x%x)\n", file, line, pc)
-		if file != lastFile {
-			data, err := ioutil.ReadFile(file)
-			if err != nil {
-				continue
-			}
-			lines = bytes.Split(data, []byte{'\n'})
-			lastFile = file
-		}
-		line-- // in stack trace, lines are 1-indexed but our array is 0-indexed
-		fmt.Fprintf(buf, "\t%s: %s\n", function(pc), source(lines, line))
+		buf = make([]byte, 2*len(buf))
 	}
-	return buf.Bytes()
-}
-
-// source returns a space-trimmed slice of the n'th line.
-func source(lines [][]byte, n int) []byte {
-	if n < 0 || n >= len(lines) {
-		return dunno
-	}
-	return bytes.Trim(lines[n], " \t")
-}
-
-// function returns, if possible, the name of the function containing the PC.
-func function(pc uintptr) []byte {
-	fn := runtime.FuncForPC(pc)
-	if fn == nil {
-		return dunno
-	}
-	name := []byte(fn.Name())
-	// The name includes the path name to the package, which is unnecessary
-	// since the file name is already included.  Plus, it has center dots.
-	// That is, we see
-	//	runtime/debug.*T·ptrmethod
-	// and want
-	//	*T.ptrmethod
-	// Since the package path might contains dots (e.g. code.google.com/...),
-	// we first remove the path prefix if there is one.
-	if lastslash := bytes.LastIndex(name, slash); lastslash >= 0 {
-		name = name[lastslash+1:]
-	}
-	if period := bytes.Index(name, dot); period >= 0 {
-		name = name[period+1:]
-	}
-	name = bytes.Replace(name, centerDot, dot, -1)
-	return name
 }
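
The rewritten Stack is now a thin retry loop over runtime.Stack, so callers always receive a complete, untruncated trace. Minimal usage:

	package main

	import (
		"os"
		"runtime/debug"
	)

	func main() {
		os.Stderr.Write(debug.Stack()) // full trace; buffer doubled as needed
	}
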
diff --git a/src/runtime/debug/stack_test.go b/src/runtime/debug/stack_test.go
index 28691ee..9376e82 100644
--- a/src/runtime/debug/stack_test.go
+++ b/src/runtime/debug/stack_test.go
@@ -2,9 +2,10 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package debug
+package debug_test
 
 import (
+	. "runtime/debug"
 	"strings"
 	"testing"
 )
@@ -22,16 +23,19 @@
 	The traceback should look something like this, modulo line numbers and hex constants.
 	Don't worry much about the base levels, but check the ones in our own package.
 
-		/Users/r/go/src/runtime/debug/stack_test.go:15 (0x13878)
-			(*T).ptrmethod: return Stack()
-		/Users/r/go/src/runtime/debug/stack_test.go:18 (0x138dd)
-			T.method: return t.ptrmethod()
-		/Users/r/go/src/runtime/debug/stack_test.go:23 (0x13920)
-			TestStack: b := T(0).method()
-		/Users/r/go/src/testing/testing.go:132 (0x14a7a)
-			tRunner: test.F(t)
-		/Users/r/go/src/runtime/proc.c:145 (0xc970)
-			???: runtime·unlock(&runtime·sched);
+		goroutine 10 [running]:
+		runtime/debug.Stack(0x0, 0x0, 0x0)
+			/Users/r/go/src/runtime/debug/stack.go:28 +0x80
+		runtime/debug.(*T).ptrmethod(0xc82005ee70, 0x0, 0x0, 0x0)
+			/Users/r/go/src/runtime/debug/stack_test.go:15 +0x29
+		runtime/debug.T.method(0x0, 0x0, 0x0, 0x0)
+			/Users/r/go/src/runtime/debug/stack_test.go:18 +0x32
+		runtime/debug.TestStack(0xc8201ce000)
+			/Users/r/go/src/runtime/debug/stack_test.go:37 +0x38
+		testing.tRunner(0xc8201ce000, 0x664b58)
+			/Users/r/go/src/testing/testing.go:456 +0x98
+		created by testing.RunTests
+			/Users/r/go/src/testing/testing.go:561 +0x86d
 */
 func TestStack(t *testing.T) {
 	b := T(0).method()
@@ -41,22 +45,21 @@
 	}
 	n := 0
 	frame := func(line, code string) {
+		check(t, lines[n], code)
+		n++
 		check(t, lines[n], line)
 		n++
-		// The source might not be available while running the test.
-		if strings.HasPrefix(lines[n], "\t") {
-			check(t, lines[n], code)
-			n++
-		}
 	}
-	frame("src/runtime/debug/stack_test.go", "\t(*T).ptrmethod: return Stack()")
-	frame("src/runtime/debug/stack_test.go", "\tT.method: return t.ptrmethod()")
-	frame("src/runtime/debug/stack_test.go", "\tTestStack: b := T(0).method()")
+	n++
+	frame("src/runtime/debug/stack.go", "runtime/debug.Stack")
+	frame("src/runtime/debug/stack_test.go", "runtime/debug_test.(*T).ptrmethod")
+	frame("src/runtime/debug/stack_test.go", "runtime/debug_test.T.method")
+	frame("src/runtime/debug/stack_test.go", "runtime/debug_test.TestStack")
 	frame("src/testing/testing.go", "")
 }
 
 func check(t *testing.T, line, has string) {
-	if strings.Index(line, has) < 0 {
+	if !strings.Contains(line, has) {
 		t.Errorf("expected %q in %q", has, line)
 	}
 }
diff --git a/src/runtime/debug/stubs.go b/src/runtime/debug/stubs.go
index 95b33e4..2cba136 100644
--- a/src/runtime/debug/stubs.go
+++ b/src/runtime/debug/stubs.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -8,12 +8,10 @@
 	"time"
 )
 
-// Uses assembly to call corresponding runtime-internal functions.
+// Implemented in package runtime.
+func readGCStats(*[]time.Duration)
+func freeOSMemory()
 func setMaxStack(int) int
 func setGCPercent(int32) int32
 func setPanicOnFault(bool) bool
 func setMaxThreads(int) int
-
-// Implemented in package runtime.
-func readGCStats(*[]time.Duration)
-func freeOSMemory()
diff --git a/src/runtime/debug/stubs.s b/src/runtime/debug/stubs.s
deleted file mode 100644
index 9dc8e54..0000000
--- a/src/runtime/debug/stubs.s
+++ /dev/null
@@ -1,30 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-#include "textflag.h"
-
-#ifdef GOARCH_arm
-#define JMP B
-#endif
-#ifdef GOARCH_arm64
-#define JMP B
-#endif
-#ifdef GOARCH_ppc64
-#define JMP BR
-#endif
-#ifdef GOARCH_ppc64le
-#define JMP BR
-#endif
-
-TEXT ·setMaxStack(SB),NOSPLIT,$0-0
-  JMP runtime·setMaxStack(SB)
-
-TEXT ·setGCPercent(SB),NOSPLIT,$0-0
-  JMP runtime·setGCPercent(SB)
-
-TEXT ·setPanicOnFault(SB),NOSPLIT,$0-0
-  JMP runtime·setPanicOnFault(SB)
-
-TEXT ·setMaxThreads(SB),NOSPLIT,$0-0
-  JMP runtime·setMaxThreads(SB)
diff --git a/src/runtime/defs2_linux.go b/src/runtime/defs2_linux.go
index 980df9e..9dea6a1 100644
--- a/src/runtime/defs2_linux.go
+++ b/src/runtime/defs2_linux.go
@@ -32,7 +32,7 @@
 #include <linux/eventpoll.h>
 
 // This is the sigaction structure from the Linux 2.1.68 kernel which
-//   is used with the rt_sigaction system call.  For 386 this is not
+//   is used with the rt_sigaction system call. For 386 this is not
 //   defined in any public header file.
 
 struct kernel_sigaction {
diff --git a/src/runtime/defs_darwin.go b/src/runtime/defs_darwin.go
index 722013b..78df4e7 100644
--- a/src/runtime/defs_darwin.go
+++ b/src/runtime/defs_darwin.go
@@ -152,7 +152,7 @@
 type Sighandler C.union___sigaction_u
 
 type Sigaction C.struct___sigaction // used in syscalls
-// type Sigaction C.struct_sigaction	// used by the C library
+type Usigaction C.struct_sigaction  // used as the second argument to sigaction
 type Sigval C.union_sigval
 type Siginfo C.siginfo_t
 type Timeval C.struct_timeval
diff --git a/src/runtime/defs_darwin_386.go b/src/runtime/defs_darwin_386.go
index e051301..1a5967b 100644
--- a/src/runtime/defs_darwin_386.go
+++ b/src/runtime/defs_darwin_386.go
@@ -167,6 +167,12 @@
 	sa_flags      int32
 }
 
+type usigactiont struct {
+	__sigaction_u [4]byte
+	sa_mask       uint32
+	sa_flags      int32
+}
+
 type siginfo struct {
 	si_signo  int32
 	si_errno  int32
diff --git a/src/runtime/defs_darwin_amd64.go b/src/runtime/defs_darwin_amd64.go
index d9d9fc5..a4ab090 100644
--- a/src/runtime/defs_darwin_amd64.go
+++ b/src/runtime/defs_darwin_amd64.go
@@ -168,6 +168,12 @@
 	sa_flags      int32
 }
 
+type usigactiont struct {
+	__sigaction_u [8]byte
+	sa_mask       uint32
+	sa_flags      int32
+}
+
 type siginfo struct {
 	si_signo  int32
 	si_errno  int32
diff --git a/src/runtime/defs_darwin_arm.go b/src/runtime/defs_darwin_arm.go
index b53336c..3f8dbbf 100644
--- a/src/runtime/defs_darwin_arm.go
+++ b/src/runtime/defs_darwin_arm.go
@@ -169,6 +169,12 @@
 	sa_flags      int32
 }
 
+type usigactiont struct {
+	__sigaction_u [4]byte
+	sa_mask       uint32
+	sa_flags      int32
+}
+
 type siginfo struct {
 	si_signo  int32
 	si_errno  int32
diff --git a/src/runtime/defs_darwin_arm64.go b/src/runtime/defs_darwin_arm64.go
index 3cc77c1..c25a41b 100644
--- a/src/runtime/defs_darwin_arm64.go
+++ b/src/runtime/defs_darwin_arm64.go
@@ -168,6 +168,12 @@
 	sa_flags      int32
 }
 
+type usigactiont struct {
+	__sigaction_u [8]byte
+	sa_mask       uint32
+	sa_flags      int32
+}
+
 type siginfo struct {
 	si_signo  int32
 	si_errno  int32
diff --git a/src/runtime/defs_linux_386.go b/src/runtime/defs_linux_386.go
index 7cf57c8..44d2fd1 100644
--- a/src/runtime/defs_linux_386.go
+++ b/src/runtime/defs_linux_386.go
@@ -90,6 +90,10 @@
 	_EPOLL_CTL_ADD = 0x1
 	_EPOLL_CTL_DEL = 0x2
 	_EPOLL_CTL_MOD = 0x3
+
+	_AF_UNIX    = 0x1
+	_F_SETFL    = 0x4
+	_SOCK_DGRAM = 0x2
 )
 
 type fpreg struct {
@@ -218,3 +222,8 @@
 	events uint32
 	data   [8]byte // to match amd64
 }
+
+type sockaddr_un struct {
+	family uint16
+	path   [108]byte
+}
diff --git a/src/runtime/defs_linux_amd64.go b/src/runtime/defs_linux_amd64.go
index 48aeb80..1936285 100644
--- a/src/runtime/defs_linux_amd64.go
+++ b/src/runtime/defs_linux_amd64.go
@@ -87,6 +87,10 @@
 	_EPOLL_CTL_ADD = 0x1
 	_EPOLL_CTL_DEL = 0x2
 	_EPOLL_CTL_MOD = 0x3
+
+	_AF_UNIX    = 0x1
+	_F_SETFL    = 0x4
+	_SOCK_DGRAM = 0x2
 )
 
 type timespec struct {
@@ -253,3 +257,8 @@
 	fpstate     *fpstate1
 	__reserved1 [8]uint64
 }
+
+type sockaddr_un struct {
+	family uint16
+	path   [108]byte
+}
diff --git a/src/runtime/defs_linux_arm64.go b/src/runtime/defs_linux_arm64.go
index 1a4d884..d1b1a36 100644
--- a/src/runtime/defs_linux_arm64.go
+++ b/src/runtime/defs_linux_arm64.go
@@ -87,6 +87,10 @@
 	_EPOLL_CTL_ADD = 0x1
 	_EPOLL_CTL_DEL = 0x2
 	_EPOLL_CTL_MOD = 0x3
+
+	_AF_UNIX    = 0x1
+	_F_SETFL    = 0x4
+	_SOCK_DGRAM = 0x2
 )
 
 type timespec struct {
@@ -167,6 +171,11 @@
 	__reserved [4096]byte
 }
 
+type sockaddr_un struct {
+	family uint16
+	path   [108]byte
+}
+
 type ucontext struct {
 	uc_flags    uint64
 	uc_link     *ucontext
diff --git a/src/runtime/defs_linux_mips64x.go b/src/runtime/defs_linux_mips64x.go
new file mode 100644
index 0000000..bb3cd98
--- /dev/null
+++ b/src/runtime/defs_linux_mips64x.go
@@ -0,0 +1,183 @@
+// +build mips64 mips64le
+// +build linux
+
+package runtime
+
+const (
+	_EINTR  = 0x4
+	_EAGAIN = 0xb
+	_ENOMEM = 0xc
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x800
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_DONTNEED   = 0x4
+	_MADV_HUGEPAGE   = 0xe
+	_MADV_NOHUGEPAGE = 0xf
+
+	_SA_RESTART = 0x10000000
+	_SA_ONSTACK = 0x8000000
+	_SA_SIGINFO = 0x8
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGEMT    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGBUS    = 0xa
+	_SIGSEGV   = 0xb
+	_SIGSYS    = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGUSR1   = 0x10
+	_SIGUSR2   = 0x11
+	_SIGCHLD   = 0x12
+	_SIGPWR    = 0x13
+	_SIGWINCH  = 0x14
+	_SIGURG    = 0x15
+	_SIGIO     = 0x16
+	_SIGSTOP   = 0x17
+	_SIGTSTP   = 0x18
+	_SIGCONT   = 0x19
+	_SIGTTIN   = 0x1a
+	_SIGTTOU   = 0x1b
+	_SIGVTALRM = 0x1c
+	_SIGPROF   = 0x1d
+	_SIGXCPU   = 0x1e
+	_SIGXFSZ   = 0x1f
+
+	_FPE_INTDIV = 0x1
+	_FPE_INTOVF = 0x2
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EPOLLIN       = 0x1
+	_EPOLLOUT      = 0x4
+	_EPOLLERR      = 0x8
+	_EPOLLHUP      = 0x10
+	_EPOLLRDHUP    = 0x2000
+	_EPOLLET       = 0x80000000
+	_EPOLL_CLOEXEC = 0x80000
+	_EPOLL_CTL_ADD = 0x1
+	_EPOLL_CTL_DEL = 0x2
+	_EPOLL_CTL_MOD = 0x3
+)
+
+//struct Sigset {
+//	uint64	sig[1];
+//};
+//typedef uint64 Sigset;
+
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+func (ts *timespec) set_sec(x int64) {
+	ts.tv_sec = x
+}
+
+func (ts *timespec) set_nsec(x int32) {
+	ts.tv_nsec = int64(x)
+}
+
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type sigactiont struct {
+	sa_flags   uint32
+	sa_handler uintptr
+	sa_mask    [2]uint64
+	// The Linux header does not have an sa_restorer field,
+	// but it is used in setsig(); it does no harm to put it here.
+	sa_restorer uintptr
+}
+
+type siginfo struct {
+	si_signo int32
+	si_code  int32
+	si_errno int32
+	__pad0   [1]int32
+	// below here is a union; si_addr is the only field we use
+	si_addr uint64
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type epollevent struct {
+	events    uint32
+	pad_cgo_0 [4]byte
+	data      [8]byte // unaligned uintptr
+}
+
+const (
+	_O_RDONLY    = 0x0
+	_O_CLOEXEC   = 0x80000
+	_SA_RESTORER = 0
+)
+
+type sigaltstackt struct {
+	ss_sp    *byte
+	ss_size  uintptr
+	ss_flags int32
+}
+
+type sigcontext struct {
+	sc_regs      [32]uint64
+	sc_fpregs    [32]uint64
+	sc_mdhi      uint64
+	sc_hi1       uint64
+	sc_hi2       uint64
+	sc_hi3       uint64
+	sc_mdlo      uint64
+	sc_lo1       uint64
+	sc_lo2       uint64
+	sc_lo3       uint64
+	sc_pc        uint64
+	sc_fpc_csr   uint32
+	sc_used_math uint32
+	sc_dsp       uint32
+	sc_reserved  uint32
+}
+
+type ucontext struct {
+	uc_flags    uint64
+	uc_link     *ucontext
+	uc_stack    sigaltstackt
+	uc_mcontext sigcontext
+	uc_sigmask  uint64
+}
diff --git a/src/runtime/defs_linux_s390x.go b/src/runtime/defs_linux_s390x.go
new file mode 100644
index 0000000..5f55d5a
--- /dev/null
+++ b/src/runtime/defs_linux_s390x.go
@@ -0,0 +1,167 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_EINTR  = 0x4
+	_EAGAIN = 0xb
+	_ENOMEM = 0xc
+
+	_PROT_NONE  = 0x0
+	_PROT_READ  = 0x1
+	_PROT_WRITE = 0x2
+	_PROT_EXEC  = 0x4
+
+	_MAP_ANON    = 0x20
+	_MAP_PRIVATE = 0x2
+	_MAP_FIXED   = 0x10
+
+	_MADV_DONTNEED   = 0x4
+	_MADV_HUGEPAGE   = 0xe
+	_MADV_NOHUGEPAGE = 0xf
+
+	_SA_RESTART = 0x10000000
+	_SA_ONSTACK = 0x8000000
+	_SA_SIGINFO = 0x4
+
+	_SIGHUP    = 0x1
+	_SIGINT    = 0x2
+	_SIGQUIT   = 0x3
+	_SIGILL    = 0x4
+	_SIGTRAP   = 0x5
+	_SIGABRT   = 0x6
+	_SIGBUS    = 0x7
+	_SIGFPE    = 0x8
+	_SIGKILL   = 0x9
+	_SIGUSR1   = 0xa
+	_SIGSEGV   = 0xb
+	_SIGUSR2   = 0xc
+	_SIGPIPE   = 0xd
+	_SIGALRM   = 0xe
+	_SIGSTKFLT = 0x10
+	_SIGCHLD   = 0x11
+	_SIGCONT   = 0x12
+	_SIGSTOP   = 0x13
+	_SIGTSTP   = 0x14
+	_SIGTTIN   = 0x15
+	_SIGTTOU   = 0x16
+	_SIGURG    = 0x17
+	_SIGXCPU   = 0x18
+	_SIGXFSZ   = 0x19
+	_SIGVTALRM = 0x1a
+	_SIGPROF   = 0x1b
+	_SIGWINCH  = 0x1c
+	_SIGIO     = 0x1d
+	_SIGPWR    = 0x1e
+	_SIGSYS    = 0x1f
+
+	_FPE_INTDIV = 0x1
+	_FPE_INTOVF = 0x2
+	_FPE_FLTDIV = 0x3
+	_FPE_FLTOVF = 0x4
+	_FPE_FLTUND = 0x5
+	_FPE_FLTRES = 0x6
+	_FPE_FLTINV = 0x7
+	_FPE_FLTSUB = 0x8
+
+	_BUS_ADRALN = 0x1
+	_BUS_ADRERR = 0x2
+	_BUS_OBJERR = 0x3
+
+	_SEGV_MAPERR = 0x1
+	_SEGV_ACCERR = 0x2
+
+	_ITIMER_REAL    = 0x0
+	_ITIMER_VIRTUAL = 0x1
+	_ITIMER_PROF    = 0x2
+
+	_EPOLLIN       = 0x1
+	_EPOLLOUT      = 0x4
+	_EPOLLERR      = 0x8
+	_EPOLLHUP      = 0x10
+	_EPOLLRDHUP    = 0x2000
+	_EPOLLET       = 0x80000000
+	_EPOLL_CLOEXEC = 0x80000
+	_EPOLL_CTL_ADD = 0x1
+	_EPOLL_CTL_DEL = 0x2
+	_EPOLL_CTL_MOD = 0x3
+)
+
+type timespec struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+func (ts *timespec) set_sec(x int64) {
+	ts.tv_sec = x
+}
+
+func (ts *timespec) set_nsec(x int32) {
+	ts.tv_nsec = int64(x)
+}
+
+type timeval struct {
+	tv_sec  int64
+	tv_usec int64
+}
+
+func (tv *timeval) set_usec(x int32) {
+	tv.tv_usec = int64(x)
+}
+
+type sigactiont struct {
+	sa_handler  uintptr
+	sa_flags    uint64
+	sa_restorer uintptr
+	sa_mask     uint64
+}
+
+type siginfo struct {
+	si_signo int32
+	si_errno int32
+	si_code  int32
+	// below here is a union; si_addr is the only field we use
+	si_addr uint64
+}
+
+type itimerval struct {
+	it_interval timeval
+	it_value    timeval
+}
+
+type epollevent struct {
+	events    uint32
+	pad_cgo_0 [4]byte
+	data      [8]byte // unaligned uintptr
+}
+
+const (
+	_O_RDONLY    = 0x0
+	_O_CLOEXEC   = 0x80000
+	_SA_RESTORER = 0
+)
+
+type sigaltstackt struct {
+	ss_sp    *byte
+	ss_flags int32
+	ss_size  uintptr
+}
+
+type sigcontext struct {
+	psw_mask uint64
+	psw_addr uint64
+	gregs    [16]uint64
+	aregs    [16]uint32
+	fpc      uint32
+	fpregs   [16]uint64
+}
+
+type ucontext struct {
+	uc_flags    uint64
+	uc_link     *ucontext
+	uc_stack    sigaltstackt
+	uc_mcontext sigcontext
+	uc_sigmask  uint64
+}
diff --git a/src/runtime/defs_plan9_386.go b/src/runtime/defs_plan9_386.go
index 3574cb6..54ace48 100644
--- a/src/runtime/defs_plan9_386.go
+++ b/src/runtime/defs_plan9_386.go
@@ -30,9 +30,13 @@
 
 func (c *sigctxt) pc() uintptr { return uintptr(c.u.pc) }
 func (c *sigctxt) sp() uintptr { return uintptr(c.u.sp) }
+func (c *sigctxt) lr() uintptr { return uintptr(0) }
 
 func (c *sigctxt) setpc(x uintptr) { c.u.pc = uint32(x) }
 func (c *sigctxt) setsp(x uintptr) { c.u.sp = uint32(x) }
+func (c *sigctxt) setlr(x uintptr) {}
+
+func (c *sigctxt) savelr(x uintptr) {}
 
 func dumpregs(u *ureg) {
 	print("ax    ", hex(u.ax), "\n")
@@ -49,3 +53,5 @@
 	print("fs    ", hex(u.fs), "\n")
 	print("gs    ", hex(u.gs), "\n")
 }
+
+func sigpanictramp() {}
diff --git a/src/runtime/defs_plan9_amd64.go b/src/runtime/defs_plan9_amd64.go
index 54b4d39..1633ec1 100644
--- a/src/runtime/defs_plan9_amd64.go
+++ b/src/runtime/defs_plan9_amd64.go
@@ -39,9 +39,13 @@
 
 func (c *sigctxt) pc() uintptr { return uintptr(c.u.ip) }
 func (c *sigctxt) sp() uintptr { return uintptr(c.u.sp) }
+func (c *sigctxt) lr() uintptr { return uintptr(0) }
 
 func (c *sigctxt) setpc(x uintptr) { c.u.ip = uint64(x) }
 func (c *sigctxt) setsp(x uintptr) { c.u.sp = uint64(x) }
+func (c *sigctxt) setlr(x uintptr) {}
+
+func (c *sigctxt) savelr(x uintptr) {}
 
 func dumpregs(u *ureg) {
 	print("ax    ", hex(u.ax), "\n")
@@ -66,3 +70,5 @@
 	print("fs    ", hex(u.fs), "\n")
 	print("gs    ", hex(u.gs), "\n")
 }
+
+func sigpanictramp() {}
diff --git a/src/runtime/defs_plan9_arm.go b/src/runtime/defs_plan9_arm.go
new file mode 100644
index 0000000..9c700ae
--- /dev/null
+++ b/src/runtime/defs_plan9_arm.go
@@ -0,0 +1,63 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const _PAGESIZE = 0x1000
+
+type ureg struct {
+	r0   uint32 /* general registers */
+	r1   uint32 /* ... */
+	r2   uint32 /* ... */
+	r3   uint32 /* ... */
+	r4   uint32 /* ... */
+	r5   uint32 /* ... */
+	r6   uint32 /* ... */
+	r7   uint32 /* ... */
+	r8   uint32 /* ... */
+	r9   uint32 /* ... */
+	r10  uint32 /* ... */
+	r11  uint32 /* ... */
+	r12  uint32 /* ... */
+	sp   uint32
+	link uint32 /* ... */
+	trap uint32 /* trap type */
+	psr  uint32
+	pc   uint32 /* interrupted addr */
+}
+
+type sigctxt struct {
+	u *ureg
+}
+
+func (c *sigctxt) pc() uintptr { return uintptr(c.u.pc) }
+func (c *sigctxt) sp() uintptr { return uintptr(c.u.sp) }
+func (c *sigctxt) lr() uintptr { return uintptr(c.u.link) }
+
+func (c *sigctxt) setpc(x uintptr)  { c.u.pc = uint32(x) }
+func (c *sigctxt) setsp(x uintptr)  { c.u.sp = uint32(x) }
+func (c *sigctxt) setlr(x uintptr)  { c.u.link = uint32(x) }
+func (c *sigctxt) savelr(x uintptr) { c.u.r0 = uint32(x) }
+
+func dumpregs(u *ureg) {
+	print("r0    ", hex(u.r0), "\n")
+	print("r1    ", hex(u.r1), "\n")
+	print("r2    ", hex(u.r2), "\n")
+	print("r3    ", hex(u.r3), "\n")
+	print("r4    ", hex(u.r4), "\n")
+	print("r5    ", hex(u.r5), "\n")
+	print("r6    ", hex(u.r6), "\n")
+	print("r7    ", hex(u.r7), "\n")
+	print("r8    ", hex(u.r8), "\n")
+	print("r9    ", hex(u.r9), "\n")
+	print("r10   ", hex(u.r10), "\n")
+	print("r11   ", hex(u.r11), "\n")
+	print("r12   ", hex(u.r12), "\n")
+	print("sp    ", hex(u.sp), "\n")
+	print("link  ", hex(u.link), "\n")
+	print("pc    ", hex(u.pc), "\n")
+	print("psr   ", hex(u.psr), "\n")
+}
+
+func sigpanictramp()
diff --git a/src/runtime/duff_amd64.s b/src/runtime/duff_amd64.s
index 0b51228..6ed7f65 100644
--- a/src/runtime/duff_amd64.s
+++ b/src/runtime/duff_amd64.s
@@ -5,837 +5,423 @@
 #include "textflag.h"
 
 TEXT runtime·duffzero(SB), NOSPLIT, $0-0
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
+	MOVUPS	X0,(DI)
+	MOVUPS	X0,16(DI)
+	MOVUPS	X0,32(DI)
+	MOVUPS	X0,48(DI)
+	ADDQ	$64,DI
 
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	MOVQ	AX,(DI)
-	MOVQ	AX,8(DI)
-	MOVQ	AX,16(DI)
-	MOVQ	AX,24(DI)
-	ADDQ	$32,DI
-
-	STOSQ
-	STOSQ
-	STOSQ
-	STOSQ
 	RET
 
 TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
-
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
-	MOVQ	(SI), CX
-	ADDQ	$8, SI
-	MOVQ	CX, (DI)
-	ADDQ	$8, DI
+	MOVUPS	(SI), X0
+	ADDQ	$16, SI
+	MOVUPS	X0, (DI)
+	ADDQ	$16, DI
 
 	RET
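// The rewrite above replaces each 32-byte block of four 8-byte MOVQ stores
// with a 64-byte block of four unaligned 16-byte MOVUPS stores, so the
// unrolled routines shrink to roughly half their length while handling
// twice as much data per block. The compiler still enters the straight-line
// body at a computed offset to zero or copy exactly the size it needs,
// which is why there is no loop. A minimal Go model of one duffzero block
// (illustrative only; the real entry points are offsets into the assembly):
//
//	func zeroBlock64(p unsafe.Pointer) unsafe.Pointer {
//		for i := uintptr(0); i < 64; i += 16 {
//			*(*[16]byte)(unsafe.Pointer(uintptr(p) + i)) = [16]byte{} // MOVUPS X0, i(DI), with X0 == 0
//		}
//		return unsafe.Pointer(uintptr(p) + 64) // ADDQ $64, DI
//	}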
diff --git a/src/runtime/duff_mips64x.s b/src/runtime/duff_mips64x.s
new file mode 100644
index 0000000..062e04a
--- /dev/null
+++ b/src/runtime/duff_mips64x.s
@@ -0,0 +1,268 @@
+// AUTO-GENERATED by mkduff.go
+// Run go generate from src/runtime to update.
+// See mkduff.go for comments.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+TEXT runtime·duffzero(SB), NOSPLIT, $-8-0
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	MOVV	R0, 8(R1)
+	ADDV	$8, R1
+	RET
+
+// TODO: Implement runtime·duffcopy.
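// Note the addressing pattern: each MOVV/ADDV pair stores the zero register
// to 8(R1) and only then advances R1, so the caller is expected to enter
// with R1 pointing one word before the region to clear (an assumption drawn
// from the listing itself). In Go terms each pair behaves like:
//
//	*(*uint64)(unsafe.Pointer(r1 + 8)) = 0 // MOVV R0, 8(R1); R0 is hardwired to zero on mips64
//	r1 += 8                                // ADDV $8, R1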
diff --git a/src/runtime/duff_ppc64x.s b/src/runtime/duff_ppc64x.s
index 14bc33e..c8204c4 100644
--- a/src/runtime/duff_ppc64x.s
+++ b/src/runtime/duff_ppc64x.s
@@ -6,7 +6,7 @@
 
 #include "textflag.h"
 
-TEXT runtime·duffzero(SB), NOSPLIT, $-8-0
+TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
 	MOVDU	R0, 8(R3)
 	MOVDU	R0, 8(R3)
 	MOVDU	R0, 8(R3)
diff --git a/src/runtime/env_plan9.go b/src/runtime/env_plan9.go
index 0e2588b..c95b5db 100644
--- a/src/runtime/env_plan9.go
+++ b/src/runtime/env_plan9.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -40,8 +40,8 @@
 	}
 
 	var s string
-	sp := (*_string)(unsafe.Pointer(&s))
-	sp.str = &p[0]
+	sp := stringStructOf(&s)
+	sp.str = unsafe.Pointer(&p[0])
 	sp.len = int(r)
 	return s
 }
diff --git a/src/runtime/env_posix.go b/src/runtime/env_posix.go
index 5e49287..da34425 100644
--- a/src/runtime/env_posix.go
+++ b/src/runtime/env_posix.go
@@ -32,7 +32,7 @@
 		return
 	}
 	arg := [2]unsafe.Pointer{cstring(k), cstring(v)}
-	asmcgocall(unsafe.Pointer(_cgo_setenv), unsafe.Pointer(&arg))
+	asmcgocall(_cgo_setenv, unsafe.Pointer(&arg))
 }
 
 // Update the C environment if cgo is loaded.
@@ -43,12 +43,11 @@
 		return
 	}
 	arg := [1]unsafe.Pointer{cstring(k)}
-	asmcgocall(unsafe.Pointer(_cgo_unsetenv), unsafe.Pointer(&arg))
+	asmcgocall(_cgo_unsetenv, unsafe.Pointer(&arg))
 }
 
 func cstring(s string) unsafe.Pointer {
 	p := make([]byte, len(s)+1)
-	sp := (*_string)(unsafe.Pointer(&s))
-	memmove(unsafe.Pointer(&p[0]), unsafe.Pointer(sp.str), uintptr(len(s)))
+	copy(p, s)
 	return unsafe.Pointer(&p[0])
 }
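// Replacing the memmove through a *_string header with copy is equivalent
// and simpler: make zeroes all len(s)+1 bytes, copy fills the first len(s),
// and the untouched final byte serves as the NUL terminator. A standalone
// sketch of the same idea (hypothetical main, not part of the runtime):

package main

import (
	"fmt"
	"unsafe"
)

func cstring(s string) unsafe.Pointer {
	p := make([]byte, len(s)+1) // zeroed, including the terminator slot
	copy(p, s)                  // leaves p[len(s)] == 0
	return unsafe.Pointer(&p[0])
}

func main() {
	p := (*[8]byte)(cstring("GODEBUG")) // 7 letters + NUL
	fmt.Println(p[6], p[7])             // 'G' == 71, then 0
}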
diff --git a/src/runtime/error.go b/src/runtime/error.go
index 4280306..0238c5e 100644
--- a/src/runtime/error.go
+++ b/src/runtime/error.go
@@ -4,8 +4,6 @@
 
 package runtime
 
-import "unsafe"
-
 // The Error interface identifies a run time error.
 type Error interface {
 	error
@@ -52,13 +50,24 @@
 	return "runtime error: " + string(e)
 }
 
+// plainError represents a runtime error described by a plain string,
+// without the "runtime error: " prefix that errorString.Error adds.
+// See Issue #14965.
+type plainError string
+
+func (e plainError) RuntimeError() {}
+
+func (e plainError) Error() string {
+	return string(e)
+}
+
 type stringer interface {
 	String() string
 }
 
 func typestring(x interface{}) string {
-	e := (*eface)(unsafe.Pointer(&x))
-	return *e._type._string
+	e := efaceOf(&x)
+	return e._type.string()
 }
 
 // For calling from C.
@@ -84,5 +93,5 @@
 
 // called from generated code
 func panicwrap(pkg, typ, meth string) {
-	panic("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer")
+	panic(plainError("value method " + pkg + "." + typ + "." + meth + " called using nil *" + typ + " pointer"))
 }
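// panicwrap now panics with a plainError rather than a bare string, so the
// value satisfies runtime.Error (it has a RuntimeError method) without
// picking up the "runtime error: " prefix that errorString adds. Side by
// side, using the two types defined above (the message text is an example):
//
//	errorString("index out of range").Error()
//	// -> "runtime error: index out of range"
//
//	plainError("value method main.T.M called using nil *T pointer").Error()
//	// -> "value method main.T.M called using nil *T pointer"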
diff --git a/src/runtime/export_arm_test.go b/src/runtime/export_arm_test.go
index 446d264..b8a89fc 100644
--- a/src/runtime/export_arm_test.go
+++ b/src/runtime/export_arm_test.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/export_futex_test.go b/src/runtime/export_futex_test.go
index 96281f6..5e27236 100644
--- a/src/runtime/export_futex_test.go
+++ b/src/runtime/export_futex_test.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/export_linux_test.go b/src/runtime/export_linux_test.go
index c8b9746..ef0c111 100644
--- a/src/runtime/export_linux_test.go
+++ b/src/runtime/export_linux_test.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -7,3 +7,4 @@
 package runtime
 
 var NewOSProc0 = newosproc0
+var Mincore = mincore
diff --git a/src/runtime/export_mmap_test.go b/src/runtime/export_mmap_test.go
new file mode 100644
index 0000000..bc8191e
--- /dev/null
+++ b/src/runtime/export_mmap_test.go
@@ -0,0 +1,15 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+
+// Export guts for testing.
+
+package runtime
+
+var Mmap = mmap
+
+const ENOMEM = _ENOMEM
+const MAP_ANON = _MAP_ANON
+const MAP_PRIVATE = _MAP_PRIVATE
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index f14dc30..199a049 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,7 +6,11 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 var Fadd64 = fadd64
 var Fsub64 = fsub64
@@ -22,10 +26,12 @@
 var Entersyscall = entersyscall
 var Exitsyscall = exitsyscall
 var LockedOSThread = lockedOSThread
-var Xadduintptr = xadduintptr
+var Xadduintptr = atomic.Xadduintptr
 
 var FuncPC = funcPC
 
+var Fastlog2 = fastlog2
+
 type LFNode struct {
 	Next    uint64
 	Pushcnt uintptr
@@ -39,42 +45,6 @@
 	return (*LFNode)(unsafe.Pointer(lfstackpop(head)))
 }
 
-type ParFor struct {
-	body   func(*ParFor, uint32)
-	done   uint32
-	Nthr   uint32
-	thrseq uint32
-	Cnt    uint32
-	wait   bool
-}
-
-func NewParFor(nthrmax uint32) *ParFor {
-	var desc *ParFor
-	systemstack(func() {
-		desc = (*ParFor)(unsafe.Pointer(parforalloc(nthrmax)))
-	})
-	return desc
-}
-
-func ParForSetup(desc *ParFor, nthr, n uint32, wait bool, body func(*ParFor, uint32)) {
-	systemstack(func() {
-		parforsetup((*parfor)(unsafe.Pointer(desc)), nthr, n, wait,
-			*(*func(*parfor, uint32))(unsafe.Pointer(&body)))
-	})
-}
-
-func ParForDo(desc *ParFor) {
-	systemstack(func() {
-		parfordo((*parfor)(unsafe.Pointer(desc)))
-	})
-}
-
-func ParForIters(desc *ParFor, tid uint32) (uint32, uint32) {
-	desc1 := (*parfor)(unsafe.Pointer(desc))
-	pos := desc1.thr[tid].pos
-	return uint32(pos), uint32(pos >> 32)
-}
-
 func GCMask(x interface{}) (ret []byte) {
 	systemstack(func() {
 		ret = getgcmask(x)
@@ -83,10 +53,100 @@
 }
 
 func RunSchedLocalQueueTest() {
-	testSchedLocalQueue()
+	_p_ := new(p)
+	gs := make([]g, len(_p_.runq))
+	for i := 0; i < len(_p_.runq); i++ {
+		if g, _ := runqget(_p_); g != nil {
+			throw("runq is not empty initially")
+		}
+		for j := 0; j < i; j++ {
+			runqput(_p_, &gs[i], false)
+		}
+		for j := 0; j < i; j++ {
+			if g, _ := runqget(_p_); g != &gs[i] {
+				print("bad element at iter ", i, "/", j, "\n")
+				throw("bad element")
+			}
+		}
+		if g, _ := runqget(_p_); g != nil {
+			throw("runq is not empty afterwards")
+		}
+	}
 }
+
 func RunSchedLocalQueueStealTest() {
-	testSchedLocalQueueSteal()
+	p1 := new(p)
+	p2 := new(p)
+	gs := make([]g, len(p1.runq))
+	for i := 0; i < len(p1.runq); i++ {
+		for j := 0; j < i; j++ {
+			gs[j].sig = 0
+			runqput(p1, &gs[j], false)
+		}
+		gp := runqsteal(p2, p1, true)
+		s := 0
+		if gp != nil {
+			s++
+			gp.sig++
+		}
+		for {
+			gp, _ = runqget(p2)
+			if gp == nil {
+				break
+			}
+			s++
+			gp.sig++
+		}
+		for {
+			gp, _ = runqget(p1)
+			if gp == nil {
+				break
+			}
+			gp.sig++
+		}
+		for j := 0; j < i; j++ {
+			if gs[j].sig != 1 {
+				print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
+				throw("bad element")
+			}
+		}
+		if s != i/2 && s != i/2+1 {
+			print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
+			throw("bad steal")
+		}
+	}
+}
+
+func RunSchedLocalQueueEmptyTest(iters int) {
+	// Test that runq is not spuriously reported as empty.
+	// Runq emptiness affects scheduling decisions and spurious emptiness
+	// can lead to underutilization (both runnable Gs and idle Ps coexist
+	// for an arbitrarily long time).
+	done := make(chan bool, 1)
+	p := new(p)
+	gs := make([]g, 2)
+	ready := new(uint32)
+	for i := 0; i < iters; i++ {
+		*ready = 0
+		next0 := (i & 1) == 0
+		next1 := (i & 2) == 0
+		runqput(p, &gs[0], next0)
+		go func() {
+			for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
+			}
+			if runqempty(p) {
+				println("next:", next0, next1)
+				throw("queue is empty")
+			}
+			done <- true
+		}()
+		for atomic.Xadd(ready, 1); atomic.Load(ready) != 2; {
+		}
+		runqput(p, &gs[1], next1)
+		runqget(p)
+		<-done
+		runqget(p)
+	}
 }
 
 var StringHash = stringHash
@@ -110,7 +170,7 @@
 var Gostringnocopy = gostringnocopy
 var Maxstring = &maxstring
 
-type Uintreg uintreg
+type Uintreg sys.Uintreg
 
 var Open = open
 var Close = closefd
@@ -120,21 +180,21 @@
 func Envs() []string     { return envs }
 func SetEnvs(e []string) { envs = e }
 
-var BigEndian = _BigEndian
+var BigEndian = sys.BigEndian
 
 // For benchmarking.
 
 func BenchSetType(n int, x interface{}) {
-	e := *(*eface)(unsafe.Pointer(&x))
+	e := *efaceOf(&x)
 	t := e._type
 	var size uintptr
 	var p unsafe.Pointer
 	switch t.kind & kindMask {
-	case _KindPtr:
+	case kindPtr:
 		t = (*ptrtype)(unsafe.Pointer(t)).elem
 		size = t.size
 		p = e.data
-	case _KindSlice:
+	case kindSlice:
 		slice := *(*struct {
 			ptr      unsafe.Pointer
 			len, cap uintptr
@@ -151,7 +211,36 @@
 	})
 }
 
-const PtrSize = ptrSize
+const PtrSize = sys.PtrSize
 
 var TestingAssertE2I2GC = &testingAssertE2I2GC
 var TestingAssertE2T2GC = &testingAssertE2T2GC
+
+var ForceGCPeriod = &forcegcperiod
+
+// SetTracebackEnv is like runtime/debug.SetTraceback, but it raises
+// the "environment" traceback level, so later calls to
+// debug.SetTraceback (e.g., from testing timeouts) can't lower it.
+func SetTracebackEnv(level string) {
+	setTraceback(level)
+	traceback_env = traceback_cache
+}
+
+var ReadUnaligned32 = readUnaligned32
+var ReadUnaligned64 = readUnaligned64
+
+func CountPagesInUse() (pagesInUse, counted uintptr) {
+	stopTheWorld("CountPagesInUse")
+
+	pagesInUse = uintptr(mheap_.pagesInUse)
+
+	for _, s := range h_allspans {
+		if s.state == mSpanInUse {
+			counted += s.npages
+		}
+	}
+
+	startTheWorld()
+
+	return
+}
diff --git a/src/runtime/export_windows_test.go b/src/runtime/export_windows_test.go
index 61fcef9..536b398 100644
--- a/src/runtime/export_windows_test.go
+++ b/src/runtime/export_windows_test.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,4 +6,20 @@
 
 package runtime
 
-var TestingWER = &testingWER
+import "unsafe"
+
+var (
+	TestingWER              = &testingWER
+	OsYield                 = osyield
+	TimeBeginPeriodRetValue = &timeBeginPeriodRetValue
+)
+
+func NumberOfProcessors() int32 {
+	var info systeminfo
+	stdcall1(_GetSystemInfo, uintptr(unsafe.Pointer(&info)))
+	return int32(info.dwnumberofprocessors)
+}
+
+func LoadLibraryExStatus() (useEx, haveEx, haveFlags bool) {
+	return useLoadLibraryEx, _LoadLibraryExW != nil, _AddDllDirectory != nil
+}
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index cdb66ba..441dcd9 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -27,6 +27,13 @@
 	allocfreetrace: setting allocfreetrace=1 causes every allocation to be
 	profiled and a stack trace printed on each object's allocation and free.
 
+	cgocheck: setting cgocheck=0 disables all checks for packages
+	using cgo to incorrectly pass Go pointers to non-Go code.
+	Setting cgocheck=1 (the default) enables relatively cheap
+	checks that may miss some errors. Setting cgocheck=2 enables
+	expensive checks that should not miss any errors, but will
+	cause your program to run slower.
+
 	efence: setting efence=1 causes the allocator to run in a mode
 	where each object is allocated on a unique page and addresses are
 	never recycled.
@@ -59,7 +66,7 @@
 	length of the pause. Setting gctrace=2 emits the same summary but also
 	repeats each collection. The format of this line is subject to change.
 	Currently, it is:
-		gc # @#s #%: #+...+# ms clock, #+...+# ms cpu, #->#-># MB, # MB goal, # P
+		gc # @#s #%: #+#+# ms clock, #+#/#/#+# ms cpu, #->#-># MB, # MB goal, # P
 	where the fields are as follows:
 		gc #        the GC number, incremented at each GC
 		@#s         time in seconds since program start
@@ -68,13 +75,28 @@
 		#->#-># MB  heap size at GC start, at GC end, and live heap
 		# MB goal   goal heap size
 		# P         number of processors used
-	The phases are stop-the-world (STW) sweep termination, scan,
-	synchronize Ps, mark, and STW mark termination. The CPU times
-	for mark are broken down in to assist time (GC performed in
+	The phases are stop-the-world (STW) sweep termination, concurrent
+	mark and scan, and STW mark termination. The CPU times
+	for mark/scan are broken down into assist time (GC performed in
 	line with allocation), background GC time, and idle GC time.
 	If the line ends with "(forced)", this GC was forced by a
 	runtime.GC() call and all phases are STW.
 
+	Setting gctrace to any value > 0 also causes the garbage collector
+	to emit a summary when memory is released back to the system.
+	This process of returning memory to the system is called scavenging.
+	The format of this summary is subject to change.
+	Currently it is:
+		scvg#: # MB released  printed only if non-zero
+		scvg#: inuse: # idle: # sys: # released: # consumed: # (MB)
+	where the fields are as follows:
+		scvg#        the scavenge cycle number, incremented at each scavenge
+		inuse: #     MB used or partially used spans
+		idle: #      MB spans pending scavenging
+		sys: #       MB mapped from the system
+		released: #  MB released to the system
+		consumed: #  MB allocated from the system
+
 	memprofilerate: setting memprofilerate=X will update the value of runtime.MemProfileRate.
 	When set to 0 memory profiling is disabled.  Refer to the description of
 	MemProfileRate for the default value.
@@ -98,6 +120,9 @@
 	schedtrace: setting schedtrace=X causes the scheduler to emit a single line to standard
 	error every X milliseconds, summarizing the scheduler state.
 
+The net and net/http packages also refer to debugging variables in GODEBUG.
+See the documentation for those packages for details.
+
 The GOMAXPROCS variable limits the number of operating system threads that
 can execute user-level Go code simultaneously. There is no limit to the number of threads
 that can be blocked in system calls on behalf of Go code; those do not count against
@@ -106,15 +131,24 @@
 
 The GOTRACEBACK variable controls the amount of output generated when a Go
 program fails due to an unrecovered panic or an unexpected runtime condition.
-By default, a failure prints a stack trace for every extant goroutine, eliding functions
-internal to the run-time system, and then exits with exit code 2.
-If GOTRACEBACK=0, the per-goroutine stack traces are omitted entirely.
-If GOTRACEBACK=1, the default behavior is used.
-If GOTRACEBACK=2, the per-goroutine stack traces include run-time functions.
-If GOTRACEBACK=crash, the per-goroutine stack traces include run-time functions,
-and if possible the program crashes in an operating-specific manner instead of
-exiting. For example, on Unix systems, the program raises SIGABRT to trigger a
-core dump.
+By default, a failure prints a stack trace for the current goroutine,
+eliding functions internal to the run-time system, and then exits with exit code 2.
+The failure prints stack traces for all goroutines if there is no current goroutine
+or the failure is internal to the run-time.
+GOTRACEBACK=none omits the goroutine stack traces entirely.
+GOTRACEBACK=single (the default) behaves as described above.
+GOTRACEBACK=all adds stack traces for all user-created goroutines.
+GOTRACEBACK=system is like ``all'' but adds stack frames for run-time functions
+and shows goroutines created internally by the run-time.
+GOTRACEBACK=crash is like ``system'' but crashes in an operating system-specific
+manner instead of exiting. For example, on Unix systems, the crash raises
+SIGABRT to trigger a core dump.
+For historical reasons, the GOTRACEBACK settings 0, 1, and 2 are synonyms for
+none, all, and system, respectively.
+The runtime/debug package's SetTraceback function allows increasing the
+amount of output at run time, but it cannot reduce the amount below that
+specified by the environment variable.
+See https://golang.org/pkg/runtime/debug/#SetTraceback.
 
 The GOARCH, GOOS, GOPATH, and GOROOT environment variables complete
 the set of Go environment variables. They influence the building of Go programs
@@ -125,12 +159,14 @@
 */
 package runtime
 
+import "runtime/internal/sys"
+
 // Caller reports file and line number information about function invocations on
-// the calling goroutine's stack.  The argument skip is the number of stack frames
+// the calling goroutine's stack. The argument skip is the number of stack frames
 // to ascend, with 0 identifying the caller of Caller.  (For historical reasons the
 // meaning of skip differs between Caller and Callers.) The return values report the
 // program counter, file name, and line number within the file of the corresponding
-// call.  The boolean ok is false if it was not possible to recover the information.
+// call. The boolean ok is false if it was not possible to recover the information.
 func Caller(skip int) (pc uintptr, file string, line int, ok bool) {
 	// Ask for two PCs: the one we were asked for
 	// and what it called, so that we can see if it
@@ -163,22 +199,19 @@
 }
 
 // Callers fills the slice pc with the return program counters of function invocations
-// on the calling goroutine's stack.  The argument skip is the number of stack frames
+// on the calling goroutine's stack. The argument skip is the number of stack frames
 // to skip before recording in pc, with 0 identifying the frame for Callers itself and
 // 1 identifying the caller of Callers.
 // It returns the number of entries written to pc.
 //
 // Note that since each slice entry pc[i] is a return program counter,
 // looking up the file and line for pc[i] (for example, using (*Func).FileLine)
-// will return the file and line number of the instruction immediately
+// will normally return the file and line number of the instruction immediately
 // following the call.
-// To look up the file and line number of the call itself, use pc[i]-1.
-// As an exception to this rule, if pc[i-1] corresponds to the function
-// runtime.sigpanic, then pc[i] is the program counter of a faulting
-// instruction and should be used without any subtraction.
+// To easily look up file/line information for the call sequence, use Frames.
 func Callers(skip int, pc []uintptr) int {
 	// runtime.callers uses pc.array==nil as a signal
-	// to print a stack trace.  Pick off 0-length pc here
+	// to print a stack trace. Pick off 0-length pc here
 	// so that we don't let a nil pc slice get to it.
 	if len(pc) == 0 {
 		return 0
@@ -194,20 +227,20 @@
 	if s != "" {
 		return s
 	}
-	return defaultGoroot
+	return sys.DefaultGoroot
 }
 
 // Version returns the Go tree's version string.
 // It is either the commit hash and date at the time of the build or,
 // when possible, a release tag like "go1.3".
 func Version() string {
-	return theVersion
+	return sys.TheVersion
 }
 
 // GOOS is the running program's operating system target:
 // one of darwin, freebsd, linux, and so on.
-const GOOS string = theGoos
+const GOOS string = sys.GOOS
 
 // GOARCH is the running program's architecture target:
-// 386, amd64, or arm.
-const GOARCH string = theGoarch
+// 386, amd64, arm, or s390x.
+const GOARCH string = sys.GOARCH
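// Two illustrations for the documentation updates above. First, a
// gctrace=1 line shaped like the documented format (all numbers here are
// invented for illustration):
//
//	$ GODEBUG=gctrace=1 ./prog
//	gc 1 @0.021s 2%: 0.15+1.2+0.17 ms clock, 0.62+0.38/1.0/2.1+0.69 ms cpu, 4->4->1 MB, 5 MB goal, 4 P
//
// Second, the Callers comment now points at Frames for file/line lookup; a
// minimal sketch of that pairing:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	pc := make([]uintptr, 16)
	n := runtime.Callers(1, pc) // skip == 1 starts at the caller of Callers
	frames := runtime.CallersFrames(pc[:n])
	for {
		frame, more := frames.Next()
		fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
		if !more {
			break
		}
	}
}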
diff --git a/src/runtime/fastlog2.go b/src/runtime/fastlog2.go
new file mode 100644
index 0000000..b22e825
--- /dev/null
+++ b/src/runtime/fastlog2.go
@@ -0,0 +1,33 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// fastlog2 implements a fast approximation to the base 2 log of a
+// float64. This is used to compute a geometric distribution for heap
+// sampling, without introducing a dependency on package math. This
+// uses a very rough approximation using the float64 exponent and the
+// first 25 bits of the mantissa. The top 5 bits of the mantissa are
+// used to load limits from a table of constants and the rest are used
+// to scale linearly between them.
+func fastlog2(x float64) float64 {
+	const fastlogScaleBits = 20
+	const fastlogScaleRatio = 1.0 / (1 << fastlogScaleBits)
+
+	xBits := float64bits(x)
+	// Extract the exponent from the IEEE float64, and index a constant
+	// table with the top fastlogNumBits (5) bits of the mantissa.
+	xExp := int64((xBits>>52)&0x7FF) - 1023
+	xManIndex := (xBits >> (52 - fastlogNumBits)) % (1 << fastlogNumBits)
+	xManScale := (xBits >> (52 - fastlogNumBits - fastlogScaleBits)) % (1 << fastlogScaleBits)
+
+	low, high := fastlog2Table[xManIndex], fastlog2Table[xManIndex+1]
+	return float64(xExp) + low + (high-low)*float64(xManScale)*fastlogScaleRatio
+}
+
+// float64bits returns the IEEE 754 binary representation of f.
+// Taken from math.Float64bits to avoid a dependency on package math.
+func float64bits(f float64) uint64 { return *(*uint64)(unsafe.Pointer(&f)) }
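// A worked example, assuming the table values in fastlog2table.go below:
// for x = 3, the exponent field yields xExp = 1 and the mantissa is 1.5,
// whose top fastlogNumBits (5) bits index entry 16 of the table,
// 0.5849625007211563 = log2(1.5). The remaining scale bits are zero, so
//
//	fastlog2(3) = 1 + 0.5849625007211563 ≈ log2(3)
//
// For mantissas that land between table entries, the next 20 bits drive the
// linear interpolation term (high-low)*float64(xManScale)*fastlogScaleRatio.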
diff --git a/src/runtime/fastlog2_test.go b/src/runtime/fastlog2_test.go
new file mode 100644
index 0000000..ae0f40b
--- /dev/null
+++ b/src/runtime/fastlog2_test.go
@@ -0,0 +1,34 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"math"
+	"runtime"
+	"testing"
+)
+
+func TestFastLog2(t *testing.T) {
+	// Compute the Euclidean distance between math.Log2 and the FastLog2
+	// implementation over the range of interest for heap sampling.
+	const randomBitCount = 26
+	var e float64
+
+	inc := 1
+	if testing.Short() {
+		// Check 1K total values, down from 64M.
+		inc = 1 << 16
+	}
+	for i := 1; i < 1<<randomBitCount; i += inc {
+		l, fl := math.Log2(float64(i)), runtime.Fastlog2(float64(i))
+		d := l - fl
+		e += d * d
+	}
+	e = math.Sqrt(e)
+
+	if e > 1.0 {
+		t.Fatalf("imprecision on fastlog2 implementation, want <=1.0, got %f", e)
+	}
+}
diff --git a/src/runtime/fastlog2table.go b/src/runtime/fastlog2table.go
new file mode 100644
index 0000000..c36d583
--- /dev/null
+++ b/src/runtime/fastlog2table.go
@@ -0,0 +1,43 @@
+// AUTO-GENERATED by mkfastlog2table.go
+// Run go generate from src/runtime to update.
+// See mkfastlog2table.go for comments.
+
+package runtime
+
+const fastlogNumBits = 5
+
+var fastlog2Table = [1<<fastlogNumBits + 1]float64{
+	0,
+	0.0443941193584535,
+	0.08746284125033943,
+	0.12928301694496647,
+	0.16992500144231248,
+	0.2094533656289499,
+	0.24792751344358555,
+	0.28540221886224837,
+	0.3219280948873623,
+	0.3575520046180837,
+	0.39231742277876036,
+	0.4262647547020979,
+	0.4594316186372973,
+	0.4918530963296748,
+	0.5235619560570128,
+	0.5545888516776374,
+	0.5849625007211563,
+	0.6147098441152082,
+	0.6438561897747247,
+	0.6724253419714956,
+	0.7004397181410922,
+	0.7279204545631992,
+	0.7548875021634686,
+	0.7813597135246596,
+	0.8073549220576042,
+	0.8328900141647417,
+	0.8579809951275721,
+	0.8826430493618412,
+	0.9068905956085185,
+	0.9307373375628862,
+	0.9541963103868752,
+	0.9772799234999164,
+	1,
+}
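// Each entry i above is log2(1 + i/32): entry 0 is log2(1) = 0, entry 16 is
// log2(1.5) = 0.5849..., and entry 32 is log2(2) = 1. A sketch of the
// generator (mkfastlog2table.go is the authoritative source):

package main

import (
	"fmt"
	"math"
)

func main() {
	for i := 0; i <= 1<<5; i++ { // 1<<fastlogNumBits entries, plus one
		fmt.Printf("\t%v,\n", math.Log2(1+float64(i)/32))
	}
}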
diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h
index ce62dab..82992e2 100644
--- a/src/runtime/funcdata.h
+++ b/src/runtime/funcdata.h
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -14,7 +14,6 @@
 
 #define FUNCDATA_ArgsPointerMaps 0 /* garbage collector blocks */
 #define FUNCDATA_LocalsPointerMaps 1
-#define FUNCDATA_DeadValueMaps 2
 
 // Pseudo-assembly statements.
 
@@ -37,7 +36,7 @@
 // GO_RESULTS_INITIALIZED indicates that the assembly function
 // has initialized the stack space for its results and that those results
 // should be considered live for the remainder of the function.
-#define GO_RESULTS_INITIALIZED	FUNCDATA PCDATA $PCDATA_StackMapIndex, 1
+#define GO_RESULTS_INITIALIZED	PCDATA $PCDATA_StackMapIndex, $1
 
 // NO_LOCAL_POINTERS indicates that the assembly function stores
 // no pointers to heap objects in its local stack variables.
@@ -48,13 +47,3 @@
 // assembly code without an explicit specification).
 // This value is generated by the compiler, assembler, or linker.
 #define ArgsSizeUnknown 0x80000000
-
-/*c2go
-enum {
-	PCDATA_StackMapIndex = 0,
-	FUNCDATA_ArgsPointerMaps = 0,
-	FUNCDATA_LocalsPointerMaps = 1,
-	FUNCDATA_DeadValueMaps = 2,
-	ArgsSizeUnknown = 0x80000000,
-};
-*/
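// The GO_RESULTS_INITIALIZED change is a correctness fix: the macro used to
// expand to the invalid pseudo-instruction "FUNCDATA PCDATA ..." and passed
// the stack-map index without the $ an immediate requires. A hedged sketch
// of where the marker sits in an assembly function (the function and callee
// names are illustrative, not real runtime symbols):
//
//	TEXT ·answer(SB), NOSPLIT, $16-8
//		NO_LOCAL_POINTERS
//		MOVQ	$42, ret+0(FP)
//		GO_RESULTS_INITIALIZED	// ret is live across the call below
//		CALL	runtime·helper(SB)
//		RET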
diff --git a/src/runtime/futex_test.go b/src/runtime/futex_test.go
index b85249a..0738f8f 100644
--- a/src/runtime/futex_test.go
+++ b/src/runtime/futex_test.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -13,6 +13,8 @@
 
 import (
 	"runtime"
+	"sync"
+	"sync/atomic"
 	"testing"
 	"time"
 )
@@ -21,12 +23,12 @@
 	mtx uint32
 	ns  int64
 	msg string
-	ch  chan futexsleepTest
+	ch  chan *futexsleepTest
 }
 
 var futexsleepTests = []futexsleepTest{
-	beforeY2038: {mtx: 0, ns: 86400 * 1e9, msg: "before the year 2038", ch: make(chan futexsleepTest, 1)},
-	afterY2038:  {mtx: 0, ns: (1<<31 + 100) * 1e9, msg: "after the year 2038", ch: make(chan futexsleepTest, 1)},
+	beforeY2038: {mtx: 0, ns: 86400 * 1e9, msg: "before the year 2038"},
+	afterY2038:  {mtx: 0, ns: (1<<31 + 100) * 1e9, msg: "after the year 2038"},
 }
 
 const (
@@ -42,12 +44,18 @@
 	}
 
 	start := time.Now()
-	for _, tt := range futexsleepTests {
-		go func(tt futexsleepTest) {
+	var wg sync.WaitGroup
+	for i := range futexsleepTests {
+		tt := &futexsleepTests[i]
+		tt.mtx = 0
+		tt.ch = make(chan *futexsleepTest, 1)
+		wg.Add(1)
+		go func(tt *futexsleepTest) {
 			runtime.Entersyscall(0)
-			runtime.Futexsleep(&tt.mtx, tt.mtx, tt.ns)
+			runtime.Futexsleep(&tt.mtx, 0, tt.ns)
 			runtime.Exitsyscall(0)
 			tt.ch <- tt
+			wg.Done()
 		}(tt)
 	}
 loop:
@@ -71,7 +79,10 @@
 			break loop
 		}
 	}
-	for _, tt := range futexsleepTests {
+	for i := range futexsleepTests {
+		tt := &futexsleepTests[i]
+		atomic.StoreUint32(&tt.mtx, 1)
 		runtime.Futexwakeup(&tt.mtx, 1)
 	}
+	wg.Wait()
 }
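// The new atomic.StoreUint32(&tt.mtx, 1) before Futexwakeup matters because
// a futex sleeper only goes to sleep while *addr still equals the val it
// passed (0 here); changing the word first ensures a goroutine that has not
// yet blocked sees the new value and skips sleeping, so the wakeup cannot
// be lost. The protocol, reduced to its two halves (a sketch):
//
//	// sleeper goroutine                 // waking goroutine
//	runtime.Futexsleep(&mtx, 0, ns)      atomic.StoreUint32(&mtx, 1)
//	                                     runtime.Futexwakeup(&mtx, 1)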
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 6c9b314..d53d3ee 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -19,59 +19,13 @@
 	if os.Getenv("GOGC") == "off" {
 		t.Skip("skipping test; GOGC=off in environment")
 	}
-	data := struct{ Short bool }{testing.Short()}
-	got := executeTest(t, testGCSysSource, &data)
+	got := runTestProg(t, "testprog", "GCSys")
 	want := "OK\n"
 	if got != want {
 		t.Fatalf("expected %q, but got %q", want, got)
 	}
 }
 
-const testGCSysSource = `
-package main
-
-import (
-	"fmt"
-	"runtime"
-)
-
-func main() {
-	runtime.GOMAXPROCS(1)
-	memstats := new(runtime.MemStats)
-	runtime.GC()
-	runtime.ReadMemStats(memstats)
-	sys := memstats.Sys
-
-	runtime.MemProfileRate = 0 // disable profiler
-
-	itercount := 1000000
-{{if .Short}}
-	itercount = 100000
-{{end}}
-	for i := 0; i < itercount; i++ {
-		workthegc()
-	}
-
-	// Should only be using a few MB.
-	// We allocated 100 MB or (if not short) 1 GB.
-	runtime.ReadMemStats(memstats)
-	if sys > memstats.Sys {
-		sys = 0
-	} else {
-		sys = memstats.Sys - sys
-	}
-	if sys > 16<<20 {
-		fmt.Printf("using too much memory: %d bytes\n", sys)
-		return
-	}
-	fmt.Printf("OK\n")
-}
-
-func workthegc() []byte {
-	return make([]byte, 1029)
-}
-`
-
 func TestGcDeepNesting(t *testing.T) {
 	type T [2][2][2][2][2][2][2][2][2][2]*int
 	a := new(T)
@@ -199,6 +153,37 @@
 	}
 }
 
+func TestPeriodicGC(t *testing.T) {
+	// Make sure we're not in the middle of a GC.
+	runtime.GC()
+
+	var ms1, ms2 runtime.MemStats
+	runtime.ReadMemStats(&ms1)
+
+	// Make periodic GC run continuously.
+	orig := *runtime.ForceGCPeriod
+	*runtime.ForceGCPeriod = 0
+
+	// Let some periodic GCs happen. In a heavily loaded system,
+	// it's possible these will be delayed, so this is designed to
+	// succeed quickly if things are working, but to give it some
+	// slack if things are slow.
+	var numGCs uint32
+	const want = 2
+	for i := 0; i < 20 && numGCs < want; i++ {
+		time.Sleep(5 * time.Millisecond)
+
+		// Test that periodic GC actually happened.
+		runtime.ReadMemStats(&ms2)
+		numGCs = ms2.NumGC - ms1.NumGC
+	}
+	*runtime.ForceGCPeriod = orig
+
+	if numGCs < want {
+		t.Fatalf("no periodic GC: got %v GCs, want >= 2", numGCs)
+	}
+}
+
 func BenchmarkSetTypePtr(b *testing.B) {
 	benchSetType(b, new(*byte))
 }
@@ -480,9 +465,28 @@
 	testIfaceEqual(io.EOF)
 }
 
+var a bool
+
+//go:noinline
 func testIfaceEqual(x interface{}) {
 	if x == "abc" {
-		// Prevent inlining
-		panic("")
+		a = true
+	}
+}
+
+func TestPageAccounting(t *testing.T) {
+	// Grow the heap in small increments. This used to drop the
+	// pages-in-use count below zero because of a rounding
+	// mismatch (golang.org/issue/15022).
+	const blockSize = 64 << 10
+	blocks := make([]*[blockSize]byte, (64<<20)/blockSize)
+	for i := range blocks {
+		blocks[i] = new([blockSize]byte)
+	}
+
+	// Check that the running page count matches reality.
+	pagesInUse, counted := runtime.CountPagesInUse()
+	if pagesInUse != counted {
+		t.Fatalf("mheap_.pagesInUse is %d, but direct count is %d", pagesInUse, counted)
 	}
 }
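// TestPeriodicGC works by shrinking the period that sysmon checks against:
// ForceGCPeriod (exported from forcegcperiod in export_test.go above) is
// normally two minutes, so the test drops it to zero, waits for a couple of
// forced collections to show up in MemStats.NumGC, and restores it. The
// core idiom, as a sketch:
//
//	orig := *runtime.ForceGCPeriod
//	*runtime.ForceGCPeriod = 0 // any sysmon wakeup may now force a GC
//	defer func() { *runtime.ForceGCPeriod = orig }()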
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index f330bf2..011f005 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -59,14 +59,14 @@
 
 func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
 	mask := runtime.GCMask(p)
-	if bytes.Compare(mask, mask0) != 0 {
+	if !bytes.Equal(mask, mask0) {
 		t.Errorf("bad GC program for %v:\nwant %+v\ngot  %+v", name, mask0, mask)
 		return
 	}
 }
 
 func padDead(mask []byte) []byte {
-	// Because the dead bit isn't encoded until the third word,
+	// Because the dead bit isn't encoded in the second word,
 	// and because on 32-bit systems a one-word allocation
 	// uses a two-word block, the pointer info for a one-word
 	// object needs to be expanded to include an extra scalar
@@ -81,6 +81,9 @@
 	for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
 		mask = mask[:len(mask)-1]
 	}
+	if len(mask) == 2 && mask[0] == typeScalar && mask[1] == typeScalar {
+		mask = mask[:0]
+	}
 	return mask
 }
 
@@ -144,7 +147,7 @@
 			typeScalar, typeScalar, typeScalar, typeScalar, // t int; y uint16; u uint64
 			typePointer, typeScalar, // i string
 		}
-	case "arm64", "amd64", "ppc64", "ppc64le":
+	case "arm64", "amd64", "mips64", "mips64le", "ppc64", "ppc64le", "s390x":
 		return []byte{
 			typePointer,                        // q *int
 			typeScalar, typeScalar, typeScalar, // w byte; e [17]byte
diff --git a/src/runtime/go_tls.h b/src/runtime/go_tls.h
index 6a707cf..61f7dbe 100644
--- a/src/runtime/go_tls.h
+++ b/src/runtime/go_tls.h
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/hash32.go b/src/runtime/hash32.go
index 79fb15c..2b7c5c0 100644
--- a/src/runtime/hash32.go
+++ b/src/runtime/hash32.go
@@ -52,9 +52,9 @@
 		h = rotl_15(h*m1) * m2
 	default:
 		v1 := h
-		v2 := uint32(hashkey[1])
-		v3 := uint32(hashkey[2])
-		v4 := uint32(hashkey[3])
+		v2 := uint32(seed * hashkey[1])
+		v3 := uint32(seed * hashkey[2])
+		v4 := uint32(seed * hashkey[3])
 		for s >= 16 {
 			v1 ^= readUnaligned32(p)
 			v1 = rotl_15(v1*m1) * m2
diff --git a/src/runtime/hash64.go b/src/runtime/hash64.go
index 716db61..d61f114 100644
--- a/src/runtime/hash64.go
+++ b/src/runtime/hash64.go
@@ -6,7 +6,7 @@
 //   xxhash: https://code.google.com/p/xxhash/
 // cityhash: https://code.google.com/p/cityhash/
 
-// +build amd64 amd64p32 arm64 ppc64 ppc64le
+// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x
 
 package runtime
 
@@ -53,9 +53,9 @@
 		h = rotl_31(h*m1) * m2
 	default:
 		v1 := h
-		v2 := uint64(hashkey[1])
-		v3 := uint64(hashkey[2])
-		v4 := uint64(hashkey[3])
+		v2 := uint64(seed * hashkey[1])
+		v3 := uint64(seed * hashkey[2])
+		v4 := uint64(seed * hashkey[3])
 		for s >= 32 {
 			v1 ^= readUnaligned64(p)
 			v1 = rotl_31(v1*m1) * m2
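// Folding the seed into v2..v4 (here and in the hash32.go hunk above)
// matters on the long-input path: previously only v1 depended on the seed,
// so two different seeds ran the bulk loop with three identical lanes and
// produced correlated outputs. That is what TestArrayHash below exercises,
// because hashing an array of strings feeds each element's hash in as the
// seed for the next element. A quick property check, as a sketch against
// the exported test hook (BytesHash comes from export_test.go):
//
//	var b [64]byte
//	h1 := BytesHash(b[:], 1)
//	h2 := BytesHash(b[:], 2)
//	// h1 and h2 should look unrelated even though the input bytes match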
diff --git a/src/runtime/hash_test.go b/src/runtime/hash_test.go
index 6b229bd..3108b3b 100644
--- a/src/runtime/hash_test.go
+++ b/src/runtime/hash_test.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -11,13 +11,14 @@
 	. "runtime"
 	"strings"
 	"testing"
+	"unsafe"
 )
 
 // Smhasher is a torture test for hash functions.
 // https://code.google.com/p/smhasher/
 // This code is a port of some of the Smhasher tests to Go.
 //
-// The current AES hash function passes Smhasher.  Our fallback
+// The current AES hash function passes Smhasher. Our fallback
 // hash functions don't, so we only enable the difficult tests when
 // we know the AES implementation is available.
 
@@ -74,7 +75,7 @@
 	pairs := int64(s.n) * int64(s.n-1) / 2
 	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
 	stddev := math.Sqrt(expected)
-	if float64(collisions) > expected+SLOP*3*stddev {
+	if float64(collisions) > expected+SLOP*(3*stddev+1) {
 		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
 	}
 }
@@ -349,7 +350,7 @@
 	k.i = uint64(r.Int63())
 }
 func (k *EfaceKey) bits() int {
-	// use 64 bits.  This tests inlined interfaces
+	// use 64 bits. This tests inlined interfaces
 	// on 64-bit targets and indirect interfaces on
 	// 32-bit targets.
 	return 64
@@ -381,7 +382,7 @@
 	k.i = fInter(r.Int63())
 }
 func (k *IfaceKey) bits() int {
-	// use 64 bits.  This tests inlined interfaces
+	// use 64 bits. This tests inlined interfaces
 	// on 64-bit targets and indirect interfaces on
 	// 32-bit targets.
 	return 64
@@ -443,7 +444,7 @@
 
 	// Each entry in the grid should be about REP/2.
 	// More precisely, we did N = k.bits() * hashSize experiments where
-	// each is the sum of REP coin flips.  We want to find bounds on the
+	// each is the sum of REP coin flips. We want to find bounds on the
 	// sum of coin flips such that a truly random experiment would have
 	// all sums inside those bounds with 99% probability.
 	N := n * hashSize
@@ -561,3 +562,142 @@
 func BenchmarkHash64(b *testing.B)    { benchmarkHash(b, 64) }
 func BenchmarkHash1024(b *testing.B)  { benchmarkHash(b, 1024) }
 func BenchmarkHash65536(b *testing.B) { benchmarkHash(b, 65536) }
+
+func TestArrayHash(t *testing.T) {
+	// Make sure that "" in arrays hash correctly. The hash
+	// should at least scramble the input seed so that, e.g.,
+	// {"","foo"} and {"foo",""} have different hashes.
+
+	// If the hash is bad, then all (8 choose 4) = 70 keys
+	// have the same hash. If so, we allocate 70/8 = 8
+	// overflow buckets. If the hash is good we don't
+	// normally allocate any overflow buckets, and the
+	// probability of even one or two overflows goes down rapidly.
+	// (There is always 1 allocation of the bucket array. The map
+	// header is allocated on the stack.)
+	f := func() {
+		// Make the key type at most 128 bytes. Otherwise,
+		// we get an allocation per key.
+		type key [8]string
+		m := make(map[key]bool, 70)
+
+		// fill m with keys that have 4 "foo"s and 4 ""s.
+		for i := 0; i < 256; i++ {
+			var k key
+			cnt := 0
+			for j := uint(0); j < 8; j++ {
+				if i>>j&1 != 0 {
+					k[j] = "foo"
+					cnt++
+				}
+			}
+			if cnt == 4 {
+				m[k] = true
+			}
+		}
+		if len(m) != 70 {
+			t.Errorf("bad test: (8 choose 4) should be 70, not %d", len(m))
+		}
+	}
+	if n := testing.AllocsPerRun(10, f); n > 6 {
+		t.Errorf("too many allocs %f - hash not balanced", n)
+	}
+}
+func TestStructHash(t *testing.T) {
+	// See the comment in TestArrayHash.
+	f := func() {
+		type key struct {
+			a, b, c, d, e, f, g, h string
+		}
+		m := make(map[key]bool, 70)
+
+		// fill m with keys that have 4 "foo"s and 4 ""s.
+		for i := 0; i < 256; i++ {
+			var k key
+			cnt := 0
+			if i&1 != 0 {
+				k.a = "foo"
+				cnt++
+			}
+			if i&2 != 0 {
+				k.b = "foo"
+				cnt++
+			}
+			if i&4 != 0 {
+				k.c = "foo"
+				cnt++
+			}
+			if i&8 != 0 {
+				k.d = "foo"
+				cnt++
+			}
+			if i&16 != 0 {
+				k.e = "foo"
+				cnt++
+			}
+			if i&32 != 0 {
+				k.f = "foo"
+				cnt++
+			}
+			if i&64 != 0 {
+				k.g = "foo"
+				cnt++
+			}
+			if i&128 != 0 {
+				k.h = "foo"
+				cnt++
+			}
+			if cnt == 4 {
+				m[k] = true
+			}
+		}
+		if len(m) != 70 {
+			t.Errorf("bad test: (8 choose 4) should be 70, not %d", len(m))
+		}
+	}
+	if n := testing.AllocsPerRun(10, f); n > 6 {
+		t.Errorf("too many allocs %f - hash not balanced", n)
+	}
+}
+
+var sink uint64
+
+func BenchmarkAlignedLoad(b *testing.B) {
+	var buf [16]byte
+	p := unsafe.Pointer(&buf[0])
+	var s uint64
+	for i := 0; i < b.N; i++ {
+		s += ReadUnaligned64(p)
+	}
+	sink = s
+}
+
+func BenchmarkUnalignedLoad(b *testing.B) {
+	var buf [16]byte
+	p := unsafe.Pointer(&buf[1])
+	var s uint64
+	for i := 0; i < b.N; i++ {
+		s += ReadUnaligned64(p)
+	}
+	sink = s
+}
+
+func TestCollisions(t *testing.T) {
+	for i := 0; i < 16; i++ {
+		for j := 0; j < 16; j++ {
+			if j == i {
+				continue
+			}
+			var a [16]byte
+			m := make(map[uint16]struct{}, 1<<16)
+			for n := 0; n < 1<<16; n++ {
+				a[i] = byte(n)
+				a[j] = byte(n >> 8)
+				m[uint16(BytesHash(a[:], 0))] = struct{}{}
+			}
+			if len(m) <= 1<<15 {
+				t.Errorf("too many collisions i=%d j=%d outputs=%d out of 65536\n", i, j, len(m))
+			}
+		}
+	}
+}
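
A note on TestCollisions' threshold: pushing all 2^16 two-byte patterns through a random function into 2^16 possible truncated outputs should leave about m*(1-(1-1/m)^n), roughly 65536*(1-1/e) ≈ 41000, distinct values, so requiring more than 2^15 leaves a wide margin and only flags badly broken mixing. The estimate, as a quick check:

	package main

	import (
		"fmt"
		"math"
	)

	// expectedDistinct is the balls-into-bins estimate of distinct
	// outputs when n uniform values land in m bins.
	func expectedDistinct(n, m float64) float64 {
		return m * (1 - math.Pow(1-1/m, n))
	}

	func main() {
		fmt.Println(expectedDistinct(1<<16, 1<<16)) // ~41427, well above 1<<15
	}
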
diff --git a/src/runtime/hashmap.go b/src/runtime/hashmap.go
index 917ed21..509cab2 100644
--- a/src/runtime/hashmap.go
+++ b/src/runtime/hashmap.go
@@ -6,10 +6,10 @@
 
 // This file contains the implementation of Go's map type.
 //
-// A map is just a hash table.  The data is arranged
-// into an array of buckets.  Each bucket contains up to
-// 8 key/value pairs.  The low-order bits of the hash are
-// used to select a bucket.  Each bucket contains a few
+// A map is just a hash table. The data is arranged
+// into an array of buckets. Each bucket contains up to
+// 8 key/value pairs. The low-order bits of the hash are
+// used to select a bucket. Each bucket contains a few
 // high-order bits of each hash to distinguish the entries
 // within a single bucket.
 //
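
A minimal sketch of that bit split, matching the lookup code later in this patch: the low B bits of the hash pick the bucket, and the top byte becomes the per-entry tophash, bumped past the reserved sentinel values:

	package main

	import (
		"fmt"
		"unsafe"
	)

	const minTopHash = 4 // values below this are reserved sentinels

	// bucketAndTop splits a hash the way the map code does: low-order
	// bits select one of 2^B buckets, and the high byte is stored per
	// entry so most misses are rejected without running key equality.
	func bucketAndTop(hash uintptr, B uint8) (bucket uintptr, top uint8) {
		bucket = hash & (uintptr(1)<<B - 1)
		top = uint8(hash >> (unsafe.Sizeof(hash)*8 - 8)) // sys.PtrSize in the runtime
		if top < minTopHash {
			top += minTopHash
		}
		return
	}

	func main() {
		fmt.Println(bucketAndTop(0x7f4a7c15, 5))
	}
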
@@ -17,7 +17,7 @@
 // extra buckets.
 //
 // When the hashtable grows, we allocate a new array
-// of buckets twice as big.  Buckets are incrementally
+// of buckets twice as big. Buckets are incrementally
 // copied from the old bucket array to the new bucket array.
 //
 // Map iterators walk through the array of buckets and
@@ -31,7 +31,7 @@
 // to the new table.
 
 // Picking loadFactor: too large and we have lots of overflow
-// buckets, too small and we waste a lot of space.  I wrote
+// buckets, too small and we waste a lot of space. I wrote
 // a simple program to check some stats for different loads:
 // (64-bit, 8 byte keys and values)
 //  loadFactor    %overflow  bytes/entry     hitprobe    missprobe
@@ -51,9 +51,11 @@
 // missprobe   = # of entries to check when looking up an absent key
 //
 // Keep in mind this data is for maximally loaded tables, i.e. just
-// before the table grows.  Typical tables will be somewhat less loaded.
+// before the table grows. Typical tables will be somewhat less loaded.
 
 import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -73,14 +75,14 @@
 	maxValueSize = 128
 
 	// data offset should be the size of the bmap struct, but needs to be
-	// aligned correctly.  For amd64p32 this means 64-bit alignment
+	// aligned correctly. For amd64p32 this means 64-bit alignment
 	// even though pointers are 32 bit.
 	dataOffset = unsafe.Offsetof(struct {
 		b bmap
 		v int64
 	}{}.v)
 
-	// Possible tophash values.  We reserve a few possibilities for special marks.
+	// Possible tophash values. We reserve a few possibilities for special marks.
 	// Each bucket (including its overflow buckets, if any) will have either all or none of its
 	// entries in the evacuated* states (except during the evacuate() method, which only happens
 	// during map writes and thus no one else can observe the map during that time).
@@ -93,15 +95,16 @@
 	// flags
 	iterator    = 1 // there may be an iterator using buckets
 	oldIterator = 2 // there may be an iterator using oldbuckets
+	hashWriting = 4 // a goroutine is writing to the map
 
 	// sentinel bucket ID for iterator checks
-	noCheck = 1<<(8*ptrSize) - 1
+	noCheck = 1<<(8*sys.PtrSize) - 1
 )
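
The new hashWriting flag gives cheap, best-effort detection of unsynchronized map use rather than synchronization: writers set it on entry and clear it on exit, and readers throw if they ever observe it set. The discipline this patch threads through assign and delete, shown in isolation:

	package main

	import "fmt"

	const hashWriting = 4

	type hdr struct{ flags uint8 }

	// Writers bracket their work with begin/end; a second writer or a
	// concurrent reader observes the flag and reports the race. This
	// detects misuse, it does not make the map safe for concurrency.
	func (h *hdr) beginWrite() {
		if h.flags&hashWriting != 0 {
			panic("concurrent map writes")
		}
		h.flags |= hashWriting
	}

	func (h *hdr) endWrite() {
		if h.flags&hashWriting == 0 {
			panic("concurrent map writes")
		}
		h.flags &^= hashWriting
	}

	func (h *hdr) read() {
		if h.flags&hashWriting != 0 {
			panic("concurrent map read and map write")
		}
	}

	func main() {
		var h hdr
		h.beginWrite()
		h.endWrite()
		h.read()
		fmt.Println("ok")
	}
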
 
 // A header for a Go map.
 type hmap struct {
 	// Note: the format of the Hmap is encoded in ../../cmd/internal/gc/reflect.go and
-	// ../reflect/type.go.  Don't change this structure without also changing that code!
+	// ../reflect/type.go. Don't change this structure without also changing that code!
 	count int // # live cells == size of map.  Must be first (used by len() builtin)
 	flags uint8
 	B     uint8  // log_2 of # of buckets (can hold up to loadFactor * 2^B items)
@@ -159,7 +162,7 @@
 }
 
 func (b *bmap) overflow(t *maptype) *bmap {
-	return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-ptrSize))
+	return *(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize))
 }
 
 func (h *hmap) setoverflow(t *maptype, b, ovf *bmap) {
@@ -167,7 +170,7 @@
 		h.createOverflow()
 		*h.overflow[0] = append(*h.overflow[0], ovf)
 	}
-	*(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-ptrSize)) = ovf
+	*(**bmap)(add(unsafe.Pointer(b), uintptr(t.bucketsize)-sys.PtrSize)) = ovf
 }
 
 func (h *hmap) createOverflow() {
@@ -185,13 +188,13 @@
 // If h != nil, the map can be created directly in h.
 // If bucket != nil, bucket can be used as the first bucket.
 func makemap(t *maptype, hint int64, h *hmap, bucket unsafe.Pointer) *hmap {
-	if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != uintptr(t.hmap.size) {
+	if sz := unsafe.Sizeof(hmap{}); sz > 48 || sz != t.hmap.size {
 		println("runtime: sizeof(hmap) =", sz, ", t.hmap.size =", t.hmap.size)
 		throw("bad hmap size")
 	}
 
 	if hint < 0 || int64(int32(hint)) != hint {
-		panic("makemap: size out of range")
+		panic(plainError("makemap: size out of range"))
 		// TODO: make hint an int, then none of this nonsense
 	}
 
@@ -200,16 +203,16 @@
 	}
 
 	// check compiler's and reflect's math
-	if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(ptrSize)) ||
+	if t.key.size > maxKeySize && (!t.indirectkey || t.keysize != uint8(sys.PtrSize)) ||
 		t.key.size <= maxKeySize && (t.indirectkey || t.keysize != uint8(t.key.size)) {
 		throw("key size wrong")
 	}
-	if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(ptrSize)) ||
+	if t.elem.size > maxValueSize && (!t.indirectvalue || t.valuesize != uint8(sys.PtrSize)) ||
 		t.elem.size <= maxValueSize && (t.indirectvalue || t.valuesize != uint8(t.elem.size)) {
 		throw("value size wrong")
 	}
 
-	// invariants we depend on.  We should probably check these at compile time
+	// invariants we depend on. We should probably check these at compile time
 	// somewhere, but for now we'll do it here.
 	if t.key.align > bucketCnt {
 		throw("key align too big")
@@ -217,10 +220,10 @@
 	if t.elem.align > bucketCnt {
 		throw("value align too big")
 	}
-	if uintptr(t.key.size)%uintptr(t.key.align) != 0 {
+	if t.key.size%uintptr(t.key.align) != 0 {
 		throw("key size not a multiple of key align")
 	}
-	if uintptr(t.elem.size)%uintptr(t.elem.align) != 0 {
+	if t.elem.size%uintptr(t.elem.align) != 0 {
 		throw("value size not a multiple of value align")
 	}
 	if bucketCnt < 8 {
@@ -233,9 +236,6 @@
 		throw("need padding in bucket (value)")
 	}
 
-	// make sure zero of element type is available.
-	mapzero(t.elem)
-
 	// find size parameter which will hold the requested # of elements
 	B := uint8(0)
 	for ; hint > bucketCnt && float32(hint) > loadFactor*float32(uintptr(1)<<B); B++ {
@@ -246,7 +246,7 @@
 	// If hint is large zeroing this memory could take a while.
 	buckets := bucket
 	if B != 0 {
-		buckets = newarray(t.bucket, uintptr(1)<<B)
+		buckets = newarray(t.bucket, 1<<B)
 	}
 
 	// initialize Hmap
@@ -276,8 +276,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, pc)
 		raceReadObjectPC(t.key, key, callerpc, pc)
 	}
+	if msanenabled && h != nil {
+		msanread(key, t.key.size)
+	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero)
+		return unsafe.Pointer(&zeroVal[0])
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
@@ -289,7 +295,7 @@
 			b = oldb
 		}
 	}
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -312,7 +318,7 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero)
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -324,8 +330,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, pc)
 		raceReadObjectPC(t.key, key, callerpc, pc)
 	}
+	if msanenabled && h != nil {
+		msanread(key, t.key.size)
+	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero), false
+		return unsafe.Pointer(&zeroVal[0]), false
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
@@ -337,7 +349,7 @@
 			b = oldb
 		}
 	}
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -360,16 +372,19 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
 
-// returns both key and value.  Used by map iterator
+// returns both key and value. Used by map iterator
 func mapaccessK(t *maptype, h *hmap, key unsafe.Pointer) (unsafe.Pointer, unsafe.Pointer) {
 	if h == nil || h.count == 0 {
 		return nil, nil
 	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
+	}
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
 	m := uintptr(1)<<h.B - 1
@@ -380,7 +395,7 @@
 			b = oldb
 		}
 	}
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -408,9 +423,25 @@
 	}
 }
 
+func mapaccess1_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) unsafe.Pointer {
+	v := mapaccess1(t, h, key)
+	if v == unsafe.Pointer(&zeroVal[0]) {
+		return zero
+	}
+	return v
+}
+
+func mapaccess2_fat(t *maptype, h *hmap, key, zero unsafe.Pointer) (unsafe.Pointer, bool) {
+	v := mapaccess1(t, h, key)
+	if v == unsafe.Pointer(&zeroVal[0]) {
+		return zero, false
+	}
+	return v, true
+}
+
 func mapassign1(t *maptype, h *hmap, key unsafe.Pointer, val unsafe.Pointer) {
 	if h == nil {
-		panic("assignment to entry in nil map")
+		panic(plainError("assignment to entry in nil map"))
 	}
 	if raceenabled {
 		callerpc := getcallerpc(unsafe.Pointer(&t))
@@ -419,6 +450,14 @@
 		raceReadObjectPC(t.key, key, callerpc, pc)
 		raceReadObjectPC(t.elem, val, callerpc, pc)
 	}
+	if msanenabled {
+		msanread(key, t.key.size)
+		msanread(val, t.elem.size)
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map writes")
+	}
+	h.flags |= hashWriting
 
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
@@ -433,7 +472,7 @@
 		growWork(t, h, bucket)
 	}
 	b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -459,15 +498,17 @@
 			if !alg.equal(key, k2) {
 				continue
 			}
-			// already have a mapping for key.  Update it.
-			typedmemmove(t.key, k2, key)
+			// already have a mapping for key. Update it.
+			if t.needkeyupdate {
+				typedmemmove(t.key, k2, key)
+			}
 			v := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+i*uintptr(t.valuesize))
 			v2 := v
 			if t.indirectvalue {
 				v2 = *((*unsafe.Pointer)(v2))
 			}
 			typedmemmove(t.elem, v2, val)
-			return
+			goto done
 		}
 		ovf := b.overflow(t)
 		if ovf == nil {
@@ -476,7 +517,7 @@
 		b = ovf
 	}
 
-	// did not find mapping for key.  Allocate new cell & add entry.
+	// did not find mapping for key. Allocate new cell & add entry.
 	if float32(h.count) >= loadFactor*float32((uintptr(1)<<h.B)) && h.count >= bucketCnt {
 		hashGrow(t, h)
 		goto again // Growing the table invalidates everything, so try again
@@ -506,6 +547,12 @@
 	typedmemmove(t.elem, insertv, val)
 	*inserti = top
 	h.count++
+
+done:
+	if h.flags&hashWriting == 0 {
+		throw("concurrent map writes")
+	}
+	h.flags &^= hashWriting
 }
 
 func mapdelete(t *maptype, h *hmap, key unsafe.Pointer) {
@@ -515,9 +562,17 @@
 		racewritepc(unsafe.Pointer(h), callerpc, pc)
 		raceReadObjectPC(t.key, key, callerpc, pc)
 	}
+	if msanenabled && h != nil {
+		msanread(key, t.key.size)
+	}
 	if h == nil || h.count == 0 {
 		return
 	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map writes")
+	}
+	h.flags |= hashWriting
+
 	alg := t.key.alg
 	hash := alg.hash(key, uintptr(h.hash0))
 	bucket := hash & (uintptr(1)<<h.B - 1)
@@ -525,7 +580,7 @@
 		growWork(t, h, bucket)
 	}
 	b := (*bmap)(unsafe.Pointer(uintptr(h.buckets) + bucket*uintptr(t.bucketsize)))
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -547,13 +602,19 @@
 			memclr(v, uintptr(t.valuesize))
 			b.tophash[i] = empty
 			h.count--
-			return
+			goto done
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return
+			goto done
 		}
 	}
+
+done:
+	if h.flags&hashWriting == 0 {
+		throw("concurrent map writes")
+	}
+	h.flags &^= hashWriting
 }
 
 func mapiterinit(t *maptype, h *hmap, it *hiter) {
@@ -578,7 +639,7 @@
 		return
 	}
 
-	if unsafe.Sizeof(hiter{})/ptrSize != 12 {
+	if unsafe.Sizeof(hiter{})/sys.PtrSize != 12 {
 		throw("hash_iter size incorrect") // see ../../cmd/internal/gc/reflect.go
 	}
 	it.t = t
@@ -612,7 +673,7 @@
 	// Remember we have an iterator.
 	// Can run concurrently with another hash_iter_init().
 	if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator {
-		atomicor8(&h.flags, iterator|oldIterator)
+		atomic.Or8(&h.flags, iterator|oldIterator)
 	}
 
 	mapiternext(it)
@@ -670,9 +731,9 @@
 		if b.tophash[offi] != empty && b.tophash[offi] != evacuatedEmpty {
 			if checkBucket != noCheck {
 				// Special case: iterator was started during a grow and the
-				// grow is not done yet.  We're working on a bucket whose
-				// oldbucket has not been evacuated yet.  Or at least, it wasn't
-				// evacuated when we started the bucket.  So we're iterating
+				// grow is not done yet. We're working on a bucket whose
+				// oldbucket has not been evacuated yet. Or at least, it wasn't
+				// evacuated when we started the bucket. So we're iterating
 				// through the oldbucket, skipping any keys that will go
 				// to the other new bucket (each oldbucket expands to two
 				// buckets during a grow).
@@ -690,7 +751,7 @@
 				} else {
 					// Hash isn't repeatable if k != k (NaNs).  We need a
 					// repeatable and randomish choice of which direction
-					// to send NaNs during evacuation.  We'll use the low
+					// to send NaNs during evacuation. We'll use the low
 					// bit of tophash to decide which way NaNs go.
 					// NOTE: this case is why we need two evacuate tophash
 					// values, evacuatedX and evacuatedY, that differ in
@@ -731,7 +792,7 @@
 					it.value = rv
 				} else {
 					// if key!=key then the entry can't be deleted or
-					// updated, so we can just return it.  That's lucky for
+					// updated, so we can just return it. That's lucky for
 					// us because when key!=key we can't look it up
 					// successfully in the current table.
 					it.key = k2
@@ -742,7 +803,9 @@
 				}
 			}
 			it.bucket = bucket
-			it.bptr = b
+			if it.bptr != b { // avoid unnecessary write barrier; see issue 14921
+				it.bptr = b
+			}
 			it.i = i + 1
 			it.checkBucket = checkBucket
 			return
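
The it.bptr guard is the fix for issue 14921: a store to a pointer field goes through the write barrier even when the stored value is unchanged, and this store runs once per yielded entry. Comparing first skips the barrier in the common case of several entries in the same bucket. The pattern on its own (barriers are compiler-inserted and invisible in source):

	package main

	import "fmt"

	type bucket struct{ n int }

	type iter struct {
		bptr *bucket // pointer field: stores incur a write barrier
	}

	// setBucket stores only on change, so iterating a bucket's eight
	// entries pays for one barrier instead of eight.
	func (it *iter) setBucket(b *bucket) {
		if it.bptr != b { // avoid unnecessary write barrier; see issue 14921
			it.bptr = b
		}
	}

	func main() {
		b := &bucket{n: 1}
		var it iter
		for i := 0; i < 8; i++ {
			it.setBucket(b)
		}
		fmt.Println(it.bptr.n)
	}
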
@@ -758,7 +821,7 @@
 		throw("evacuation not done in time")
 	}
 	oldbuckets := h.buckets
-	newbuckets := newarray(t.bucket, uintptr(1)<<(h.B+1))
+	newbuckets := newarray(t.bucket, 1<<(h.B+1))
 	flags := h.flags &^ (iterator | oldIterator)
 	if h.flags&iterator != 0 {
 		flags |= oldIterator
@@ -834,12 +897,12 @@
 				if h.flags&iterator != 0 {
 					if !t.reflexivekey && !alg.equal(k2, k2) {
 						// If key != key (NaNs), then the hash could be (and probably
-						// will be) entirely different from the old hash.  Moreover,
-						// it isn't reproducible.  Reproducibility is required in the
+						// will be) entirely different from the old hash. Moreover,
+						// it isn't reproducible. Reproducibility is required in the
 						// presence of iterators, as our evacuation decision must
 						// match whatever decision the iterator made.
 						// Fortunately, we have the freedom to send these keys either
-						// way.  Also, tophash is meaningless for these kinds of keys.
+						// way. Also, tophash is meaningless for these kinds of keys.
 						// We let the low bit of tophash drive the evacuation decision.
 						// We recompute a new random tophash for the next level so
 						// these keys will get evenly distributed across all buckets
@@ -849,7 +912,7 @@
 						} else {
 							hash &^= newbit
 						}
-						top = uint8(hash >> (ptrSize*8 - 8))
+						top = uint8(hash >> (sys.PtrSize*8 - 8))
 						if top < minTopHash {
 							top += minTopHash
 						}
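
The NaN handling above has a user-visible side: since NaN != NaN, the runtime deliberately randomizes the hash of such keys (they could never be found again anyway), which is also why evacuation cannot re-hash them reproducibly and falls back to the tophash bit. For example:

	package main

	import (
		"fmt"
		"math"
	)

	func main() {
		nan := math.NaN()
		m := map[float64]int{}
		m[nan] = 1
		m[nan] = 2 // NaN != NaN: each assignment makes a fresh entry

		fmt.Println(len(m)) // 2
		_, ok := m[nan]
		fmt.Println(ok) // false: NaN keys can never be looked up again
	}
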
@@ -917,7 +980,7 @@
 	if oldbucket == h.nevacuate {
 		h.nevacuate = oldbucket + 1
 		if oldbucket+1 == newbit { // newbit == # of oldbuckets
-			// Growing is all done.  Free old main bucket array.
+			// Growing is all done. Free old main bucket array.
 			h.oldbuckets = nil
 			// Can discard old overflow buckets as well.
 			// If they are still referenced by an iterator,
@@ -933,7 +996,7 @@
 	return t.alg.hash != nil
 }
 
-// Reflect stubs.  Called from ../reflect/asm_*.s
+// Reflect stubs. Called from ../reflect/asm_*.s
 
 //go:linkname reflect_makemap reflect.makemap
 func reflect_makemap(t *maptype) *hmap {
@@ -994,59 +1057,5 @@
 	return ismapkey(t)
 }
 
-var zerobuf struct {
-	lock mutex
-	p    *byte
-	size uintptr
-}
-
-var zerotiny [1024]byte
-
-// mapzero ensures that t.zero points at a zero value for type t.
-// Types known to the compiler are in read-only memory and all point
-// to a single zero in the bss of a large enough size.
-// Types allocated by package reflect are in writable memory and
-// start out with zero set to nil; we initialize those on demand.
-func mapzero(t *_type) {
-	// On ARM, atomicloadp is implemented as xadd(p, 0),
-	// so we cannot use atomicloadp on read-only memory.
-	// Check whether the pointer is in the heap; if not, it's not writable
-	// so the zero value must already be set.
-	if GOARCH == "arm" && !inheap(uintptr(unsafe.Pointer(t))) {
-		if t.zero == nil {
-			print("runtime: map element ", *t._string, " missing zero value\n")
-			throw("mapzero")
-		}
-		return
-	}
-
-	// Already done?
-	// Check without lock, so must use atomicload to sync with atomicstore in allocation case below.
-	if atomicloadp(unsafe.Pointer(&t.zero)) != nil {
-		return
-	}
-
-	// Small enough for static buffer?
-	if t.size <= uintptr(len(zerotiny)) {
-		atomicstorep(unsafe.Pointer(&t.zero), unsafe.Pointer(&zerotiny[0]))
-		return
-	}
-
-	// Use allocated buffer.
-	lock(&zerobuf.lock)
-	if zerobuf.size < t.size {
-		if zerobuf.size == 0 {
-			zerobuf.size = 4 * 1024
-		}
-		for zerobuf.size < t.size {
-			zerobuf.size *= 2
-			if zerobuf.size == 0 {
-				// need >2GB zero on 32-bit machine
-				throw("map element too large")
-			}
-		}
-		zerobuf.p = (*byte)(persistentalloc(zerobuf.size, 64, &memstats.other_sys))
-	}
-	atomicstorep(unsafe.Pointer(&t.zero), unsafe.Pointer(zerobuf.p))
-	unlock(&zerobuf.lock)
-}
+const maxZero = 1024 // must match value in ../cmd/compile/internal/gc/walk.go
+var zeroVal [maxZero]byte
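
All of the removed machinery above collapses into this one shared 1KB zeroed global: a miss on any built-in map can return &zeroVal[0] as long as the value type fits in maxZero bytes, and the compiler routes wider value types through the new mapaccess*_fat variants, passing roughly a pointer to a zeroed global of the right size. Either way the language-level behavior is unchanged:

	package main

	import "fmt"

	// big is wider than maxZero (1024), so lookups go through the
	// *_fat access paths rather than the shared zeroVal buffer.
	type big [2048]byte

	func main() {
		small := map[string]int{}
		large := map[string]big{}

		v1 := small["missing"] // zero read via &zeroVal[0]
		v2 := large["missing"] // zero read via a _fat variant
		fmt.Println(v1, v2[0], v2[2047]) // 0 0 0
	}
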
diff --git a/src/runtime/hashmap_fast.go b/src/runtime/hashmap_fast.go
index 02c51a2..8f9bb5a 100644
--- a/src/runtime/hashmap_fast.go
+++ b/src/runtime/hashmap_fast.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -14,11 +15,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero)
+		return unsafe.Pointer(&zeroVal[0])
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	var b *bmap
 	if h.B == 0 {
-		// One-bucket table.  No need to hash.
+		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
 		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
@@ -45,7 +49,7 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero)
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -56,11 +60,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast32))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero), false
+		return unsafe.Pointer(&zeroVal[0]), false
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	var b *bmap
 	if h.B == 0 {
-		// One-bucket table.  No need to hash.
+		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
 		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
@@ -87,7 +94,7 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
@@ -98,11 +105,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero)
+		return unsafe.Pointer(&zeroVal[0])
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	var b *bmap
 	if h.B == 0 {
-		// One-bucket table.  No need to hash.
+		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
 		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
@@ -129,7 +139,7 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero)
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -140,11 +150,14 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_fast64))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero), false
+		return unsafe.Pointer(&zeroVal[0]), false
+	}
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
 	}
 	var b *bmap
 	if h.B == 0 {
-		// One-bucket table.  No need to hash.
+		// One-bucket table. No need to hash.
 		b = (*bmap)(h.buckets)
 	} else {
 		hash := t.key.alg.hash(noescape(unsafe.Pointer(&key)), uintptr(h.hash0))
@@ -171,7 +184,7 @@
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
@@ -182,9 +195,12 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess1_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero)
+		return unsafe.Pointer(&zeroVal[0])
 	}
-	key := (*stringStruct)(unsafe.Pointer(&ky))
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
+	}
+	key := stringStructOf(&ky)
 	if h.B == 0 {
 		// One-bucket table.
 		b := (*bmap)(h.buckets)
@@ -195,15 +211,15 @@
 				if x == empty {
 					continue
 				}
-				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 				if k.len != key.len {
 					continue
 				}
-				if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) {
-					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize))
+				if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) {
+					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
 				}
 			}
-			return unsafe.Pointer(t.elem.zero)
+			return unsafe.Pointer(&zeroVal[0])
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -212,12 +228,12 @@
 			if x == empty {
 				continue
 			}
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 			if k.len != key.len {
 				continue
 			}
 			if k.str == key.str {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize))
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
 			}
 			// check first 4 bytes
 			// TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
@@ -230,18 +246,18 @@
 				continue
 			}
 			if keymaybe != bucketCnt {
-				// Two keys are potential matches.  Use hash to distinguish them.
+				// Two keys are potential matches. Use hash to distinguish them.
 				goto dohash
 			}
 			keymaybe = i
 		}
 		if keymaybe != bucketCnt {
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize))
-			if memeq(k.str, key.str, uintptr(key.len)) {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize))
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*sys.PtrSize))
+			if memequal(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+keymaybe*uintptr(t.valuesize))
 			}
 		}
-		return unsafe.Pointer(t.elem.zero)
+		return unsafe.Pointer(&zeroVal[0])
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -253,7 +269,7 @@
 			b = oldb
 		}
 	}
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -263,17 +279,17 @@
 			if x != top {
 				continue
 			}
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 			if k.len != key.len {
 				continue
 			}
-			if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize))
+			if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
 			}
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero)
+			return unsafe.Pointer(&zeroVal[0])
 		}
 	}
 }
@@ -284,9 +300,12 @@
 		racereadpc(unsafe.Pointer(h), callerpc, funcPC(mapaccess2_faststr))
 	}
 	if h == nil || h.count == 0 {
-		return unsafe.Pointer(t.elem.zero), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
-	key := (*stringStruct)(unsafe.Pointer(&ky))
+	if h.flags&hashWriting != 0 {
+		throw("concurrent map read and map write")
+	}
+	key := stringStructOf(&ky)
 	if h.B == 0 {
 		// One-bucket table.
 		b := (*bmap)(h.buckets)
@@ -297,15 +316,15 @@
 				if x == empty {
 					continue
 				}
-				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+				k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 				if k.len != key.len {
 					continue
 				}
-				if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) {
-					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true
+				if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) {
+					return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true
 				}
 			}
-			return unsafe.Pointer(t.elem.zero), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 		// long key, try not to do more comparisons than necessary
 		keymaybe := uintptr(bucketCnt)
@@ -314,12 +333,12 @@
 			if x == empty {
 				continue
 			}
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 			if k.len != key.len {
 				continue
 			}
 			if k.str == key.str {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true
 			}
 			// check first 4 bytes
 			if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
@@ -330,18 +349,18 @@
 				continue
 			}
 			if keymaybe != bucketCnt {
-				// Two keys are potential matches.  Use hash to distinguish them.
+				// Two keys are potential matches. Use hash to distinguish them.
 				goto dohash
 			}
 			keymaybe = i
 		}
 		if keymaybe != bucketCnt {
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*ptrSize))
-			if memeq(k.str, key.str, uintptr(key.len)) {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+keymaybe*uintptr(t.valuesize)), true
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+keymaybe*2*sys.PtrSize))
+			if memequal(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+keymaybe*uintptr(t.valuesize)), true
 			}
 		}
-		return unsafe.Pointer(t.elem.zero), false
+		return unsafe.Pointer(&zeroVal[0]), false
 	}
 dohash:
 	hash := t.key.alg.hash(noescape(unsafe.Pointer(&ky)), uintptr(h.hash0))
@@ -353,7 +372,7 @@
 			b = oldb
 		}
 	}
-	top := uint8(hash >> (ptrSize*8 - 8))
+	top := uint8(hash >> (sys.PtrSize*8 - 8))
 	if top < minTopHash {
 		top += minTopHash
 	}
@@ -363,17 +382,17 @@
 			if x != top {
 				continue
 			}
-			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*ptrSize))
+			k := (*stringStruct)(add(unsafe.Pointer(b), dataOffset+i*2*sys.PtrSize))
 			if k.len != key.len {
 				continue
 			}
-			if k.str == key.str || memeq(k.str, key.str, uintptr(key.len)) {
-				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*ptrSize+i*uintptr(t.valuesize)), true
+			if k.str == key.str || memequal(k.str, key.str, uintptr(key.len)) {
+				return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize)), true
 			}
 		}
 		b = b.overflow(t)
 		if b == nil {
-			return unsafe.Pointer(t.elem.zero), false
+			return unsafe.Pointer(&zeroVal[0]), false
 		}
 	}
 }
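
The single-bucket string path above stages its comparisons from cheapest to dearest: reject on length, accept on identical string pointers, probe a few leading and trailing bytes, and only then run memequal; if two candidates survive the probes, it gives up and hashes after all (the dohash label). The same ordering in ordinary Go, with == standing in for memequal on the backing bytes:

	package main

	import "fmt"

	// quickEq mirrors the staged comparison: length first, cheap prefix
	// and suffix probes next, the full comparison only as a last resort.
	func quickEq(k, key string) bool {
		if len(k) != len(key) {
			return false
		}
		if len(k) >= 8 {
			if k[:4] != key[:4] || k[len(k)-4:] != key[len(key)-4:] {
				return false
			}
		}
		return k == key
	}

	func main() {
		fmt.Println(quickEq("mapaccess1_faststr", "mapaccess1_faststr")) // true
		fmt.Println(quickEq("mapaccess1_faststr", "mapaccess2_faststr")) // false
	}
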
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 492ea92..c317b5f 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -2,16 +2,19 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Implementation of runtime/debug.WriteHeapDump.  Writes all
+// Implementation of runtime/debug.WriteHeapDump. Writes all
 // objects in the heap plus additional info (roots, threads,
 // finalizers, etc.) to a file.
 
 // The format of the dumped file is described at
-// https://golang.org/s/go14heapdump.
+// https://golang.org/s/go15heapdump.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 //go:linkname runtime_debug_WriteHeapDump runtime/debug.WriteHeapDump
 func runtime_debug_WriteHeapDump(fd uintptr) {
@@ -70,7 +73,7 @@
 		return
 	}
 
-	write(dumpfd, (unsafe.Pointer)(&buf), int32(nbuf))
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
 	if len >= bufSize {
 		write(dumpfd, data, int32(len))
 		nbuf = 0
@@ -85,7 +88,7 @@
 }
 
 func flush() {
-	write(dumpfd, (unsafe.Pointer)(&buf), int32(nbuf))
+	write(dumpfd, unsafe.Pointer(&buf), int32(nbuf))
 	nbuf = 0
 }
 
@@ -94,7 +97,7 @@
 // Inside a bucket, we keep a list of types that
 // have been serialized so far, most recently used first.
 // Note: when a bucket overflows we may end up
-// serializing a type more than once.  That's ok.
+// serializing a type more than once. That's ok.
 const (
 	typeCacheBuckets = 256
 	typeCacheAssoc   = 4
@@ -142,7 +145,7 @@
 }
 
 func dumpstr(s string) {
-	sp := (*stringStruct)(unsafe.Pointer(&s))
+	sp := stringStructOf(&s)
 	dumpmemrange(sp.str, uintptr(sp.len))
 }
 
@@ -169,7 +172,7 @@
 		}
 	}
 
-	// Might not have been dumped yet.  Dump it and
+	// Might not have been dumped yet. Dump it and
 	// remember we did so.
 	for j := typeCacheAssoc - 1; j > 0; j-- {
 		b.t[j] = b.t[j-1]
@@ -180,11 +183,13 @@
 	dumpint(tagType)
 	dumpint(uint64(uintptr(unsafe.Pointer(t))))
 	dumpint(uint64(t.size))
-	if t.x == nil || t.x.pkgpath == nil || t.x.name == nil {
-		dumpstr(*t._string)
+	if x := t.uncommon(); x == nil || t.nameOff(x.pkgpath).name() == "" {
+		dumpstr(t.string())
 	} else {
-		pkgpath := (*stringStruct)(unsafe.Pointer(&t.x.pkgpath))
-		name := (*stringStruct)(unsafe.Pointer(&t.x.name))
+		pkgpathstr := t.nameOff(x.pkgpath).name()
+		pkgpath := stringStructOf(&pkgpathstr)
+		namestr := t.name()
+		name := stringStructOf(&namestr)
 		dumpint(uint64(uintptr(pkgpath.len) + 1 + uintptr(name.len)))
 		dwrite(pkgpath.str, uintptr(pkgpath.len))
 		dwritebyte('.')
@@ -230,10 +235,10 @@
 // dump kinds & offsets of interesting fields in bv
 func dumpbv(cbv *bitvector, offset uintptr) {
 	bv := gobv(*cbv)
-	for i := uintptr(0); i < uintptr(bv.n); i++ {
+	for i := uintptr(0); i < bv.n; i++ {
 		if bv.bytedata[i/8]>>(i%8)&1 == 1 {
 			dumpint(fieldKindPtr)
-			dumpint(uint64(offset + i*ptrSize))
+			dumpint(uint64(offset + i*sys.PtrSize))
 		}
 	}
 }
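
dumpbv above emits one (fieldKindPtr, byte offset) pair per set bit in the stack map's bitvector, one bit per pointer-sized word. Reduced to its core, with an illustrative tag value:

	package main

	import "fmt"

	const fieldKindPtr = 1 // stand-in for the dump format's tag value

	// dumpPtrOffsets walks a pointer bitmap the way dumpbv does: bit i
	// set means "pointer at offset + i words".
	func dumpPtrOffsets(bits []byte, n, offset, ptrSize uintptr) {
		for i := uintptr(0); i < n; i++ {
			if bits[i/8]>>(i%8)&1 == 1 {
				fmt.Println(fieldKindPtr, offset+i*ptrSize)
			}
		}
	}

	func main() {
		dumpPtrOffsets([]byte{0x0a}, 8, 0, 8) // bits 1 and 3: offsets 8 and 24
	}
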
@@ -247,10 +252,10 @@
 	if pc != f.entry {
 		pc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, pc, nil)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
-		// stackmap for this function.  It is likely that we are looking
+		// stackmap for this function. It is likely that we are looking
 		// at the function prologue, assume so and hope for the best.
 		pcdata = 0
 	}
@@ -263,7 +268,7 @@
 	var bv bitvector
 	if stkmap != nil && stkmap.n > 0 {
 		bv = stackmapdata(stkmap, pcdata)
-		dumpbvtypes(&bv, unsafe.Pointer(s.varp-uintptr(bv.n*ptrSize)))
+		dumpbvtypes(&bv, unsafe.Pointer(s.varp-uintptr(bv.n*sys.PtrSize)))
 	} else {
 		bv.n = -1
 	}
@@ -288,7 +293,7 @@
 		dumpbv(&child.args, child.argoff)
 	} else {
 		// conservative - everything might be a pointer
-		for off := child.argoff; off < child.argoff+child.arglen; off += ptrSize {
+		for off := child.argoff; off < child.argoff+child.arglen; off += sys.PtrSize {
 			dumpint(fieldKindPtr)
 			dumpint(uint64(off))
 		}
@@ -297,21 +302,21 @@
 	// Dump fields in the local vars section
 	if stkmap == nil {
 		// No locals information, dump everything.
-		for off := child.arglen; off < s.varp-s.sp; off += ptrSize {
+		for off := child.arglen; off < s.varp-s.sp; off += sys.PtrSize {
 			dumpint(fieldKindPtr)
 			dumpint(uint64(off))
 		}
 	} else if stkmap.n < 0 {
 		// Locals size information, dump just the locals.
 		size := uintptr(-stkmap.n)
-		for off := s.varp - size - s.sp; off < s.varp-s.sp; off += ptrSize {
+		for off := s.varp - size - s.sp; off < s.varp-s.sp; off += sys.PtrSize {
 			dumpint(fieldKindPtr)
 			dumpint(uint64(off))
 		}
 	} else if stkmap.n > 0 {
 		// Locals bitmap information, scan just the pointers in
 		// locals.
-		dumpbv(&bv, s.varp-uintptr(bv.n)*ptrSize-s.sp)
+		dumpbv(&bv, s.varp-uintptr(bv.n)*sys.PtrSize-s.sp)
 	}
 	dumpint(fieldKindEol)
 
@@ -379,7 +384,7 @@
 		dumpint(tagPanic)
 		dumpint(uint64(uintptr(unsafe.Pointer(p))))
 		dumpint(uint64(uintptr(unsafe.Pointer(gp))))
-		eface := (*eface)(unsafe.Pointer(&p.arg))
+		eface := efaceOf(&p.arg)
 		dumpint(uint64(uintptr(unsafe.Pointer(eface._type))))
 		dumpint(uint64(uintptr(unsafe.Pointer(eface.data))))
 		dumpint(0) // was p->defer, no longer recorded
@@ -442,7 +447,7 @@
 					continue
 				}
 				spf := (*specialfinalizer)(unsafe.Pointer(sp))
-				p := unsafe.Pointer((uintptr(s.start) << _PageShift) + uintptr(spf.special.offset))
+				p := unsafe.Pointer(s.base() + uintptr(spf.special.offset))
 				dumpfinalizer(p, spf.fn, spf.fint, spf.ot)
 			}
 		}
@@ -462,15 +467,19 @@
 		if s.state != _MSpanInUse {
 			continue
 		}
-		p := uintptr(s.start << _PageShift)
+		p := s.base()
 		size := s.elemsize
 		n := (s.npages << _PageShift) / size
 		if n > uintptr(len(freemark)) {
 			throw("freemark array doesn't have enough entries")
 		}
-		for l := s.freelist; l.ptr() != nil; l = l.ptr().next {
-			freemark[(uintptr(l)-p)/size] = true
+
+		for freeIndex := s.freeindex; freeIndex < s.nelems; freeIndex++ {
+			if s.isFree(freeIndex) {
+				freemark[freeIndex] = true
+			}
 		}
+
 		for j := uintptr(0); j < n; j, p = j+1, p+size {
 			if freemark[j] {
 				freemark[j] = false
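
Free objects are now found by consulting the span's allocation bitmap (freeindex plus isFree) instead of chasing the old linked free list, matching the Go 1.7 allocator rework. A rough bitmap-backed sketch, assuming one bit per object slot (the real mspan also caches bitmap words in allocCache, which this ignores):

	package main

	import "fmt"

	// span stands in for the pieces of mspan used here.
	type span struct {
		allocBits []byte // one bit per object slot; 1 = allocated
		freeindex uintptr
		nelems    uintptr
	}

	// isFree reports whether slot index is unallocated, as the heap
	// dumper's freemark loop now expects.
	func (s *span) isFree(index uintptr) bool {
		return s.allocBits[index/8]>>(index%8)&1 == 0
	}

	func main() {
		s := &span{allocBits: []byte{0x05}, nelems: 8} // slots 0 and 2 allocated
		for i := s.freeindex; i < s.nelems; i++ {
			if s.isFree(i) {
				fmt.Println("free slot:", i)
			}
		}
	}
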
@@ -489,38 +498,20 @@
 	} else {
 		dumpbool(true) // big-endian ptrs
 	}
-	dumpint(ptrSize)
+	dumpint(sys.PtrSize)
 	dumpint(uint64(mheap_.arena_start))
 	dumpint(uint64(mheap_.arena_used))
-	dumpint(thechar)
-	dumpstr(goexperiment)
+	dumpstr(sys.GOARCH)
+	dumpstr(sys.Goexperiment)
 	dumpint(uint64(ncpu))
 }
 
 func itab_callback(tab *itab) {
 	t := tab._type
-	// Dump a map from itab* to the type of its data field.
-	// We want this map so we can deduce types of interface referents.
-	if t.kind&kindDirectIface == 0 {
-		// indirect - data slot is a pointer to t.
-		dumptype(t.ptrto)
-		dumpint(tagItab)
-		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
-		dumpint(uint64(uintptr(unsafe.Pointer(t.ptrto))))
-	} else if t.kind&kindNoPointers == 0 {
-		// t is pointer-like - data slot is a t.
-		dumptype(t)
-		dumpint(tagItab)
-		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
-		dumpint(uint64(uintptr(unsafe.Pointer(t))))
-	} else {
-		// Data slot is a scalar.  Dump type just for fun.
-		// With pointer-only interfaces, this shouldn't happen.
-		dumptype(t)
-		dumpint(tagItab)
-		dumpint(uint64(uintptr(unsafe.Pointer(tab))))
-		dumpint(uint64(uintptr(unsafe.Pointer(t))))
-	}
+	dumptype(t)
+	dumpint(tagItab)
+	dumpint(uint64(uintptr(unsafe.Pointer(tab))))
+	dumpint(uint64(uintptr(unsafe.Pointer(t))))
 }
 
 func dumpitabs() {
@@ -628,7 +619,7 @@
 				continue
 			}
 			spp := (*specialprofile)(unsafe.Pointer(sp))
-			p := uintptr(s.start<<_PageShift) + uintptr(spp.special.offset)
+			p := s.base() + uintptr(spp.special.offset)
 			dumpint(tagAllocSample)
 			dumpint(uint64(p))
 			dumpint(uint64(uintptr(unsafe.Pointer(spp.b))))
@@ -636,14 +627,14 @@
 	}
 }
 
-var dumphdr = []byte("go1.5 heap dump\n")
+var dumphdr = []byte("go1.7 heap dump\n")
 
 func mdump() {
 	// make sure we're done sweeping
 	for i := uintptr(0); i < uintptr(mheap_.nspan); i++ {
 		s := h_allspans[i]
 		if s.state == _MSpanInUse {
-			mSpan_EnsureSwept(s)
+			s.ensureSwept()
 		}
 	}
 	memclr(unsafe.Pointer(&typecache), unsafe.Sizeof(typecache))
@@ -693,8 +684,8 @@
 }
 
 // The heap dump reader needs to be able to disambiguate
-// Eface entries.  So it needs to know every type that might
-// appear in such an entry.  The following routine accomplishes that.
+// Eface entries. So it needs to know every type that might
+// appear in such an entry. The following routine accomplishes that.
 // TODO(rsc, khr): Delete - no longer possible.
 
 // Dump all the types that appear in the type field of
@@ -704,7 +695,7 @@
 
 func makeheapobjbv(p uintptr, size uintptr) bitvector {
 	// Extend the temp buffer if necessary.
-	nptr := size / ptrSize
+	nptr := size / sys.PtrSize
 	if uintptr(len(tmpbuf)) < nptr/8+1 {
 		if tmpbuf != nil {
 			sysFree(unsafe.Pointer(&tmpbuf[0]), uintptr(len(tmpbuf)), &memstats.other_sys)
@@ -723,7 +714,7 @@
 	i := uintptr(0)
 	hbits := heapBitsForAddr(p)
 	for ; i < nptr; i++ {
-		if i >= 2 && !hbits.isMarked() {
+		if i != 1 && !hbits.morePointers() {
 			break // end of object
 		}
 		if hbits.isPointer() {
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 332b7d5..1690147 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -4,7 +4,11 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
 	hashSize = 1009
@@ -15,11 +19,12 @@
 	hash      [hashSize]*itab
 )
 
-// fInterface is our standard non-empty interface.  We use it instead
-// of interface{f()} in function prototypes because gofmt insists on
-// putting lots of newlines in the otherwise concise interface{f()}.
-type fInterface interface {
-	f()
+func itabhash(inter *interfacetype, typ *_type) uint32 {
+	// compiler has provided some good hash codes for us.
+	h := inter.typ.hash
+	h += 17 * typ.hash
+	// TODO(rsc): h += 23 * x.mhash ?
+	return h % hashSize
 }
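
itabhash folds the compiler-generated hash codes of the interface type and the concrete type into one of hashSize (1009) chains, and getitab below reads a chain twice: once with an atomic load and no lock, then again under ifaceLock before concluding it must build a new itab. That double-checked shape, with a sync.Map standing in for the runtime's array of atomically published chains:

	package main

	import (
		"fmt"
		"sync"
	)

	// cache imitates getitab's lookup discipline: an optimistic read
	// with no lock, then one locked retry that may insert and publish.
	type cache struct {
		mu sync.Mutex
		m  sync.Map
	}

	func (c *cache) get(key string, build func() string) string {
		if v, ok := c.m.Load(key); ok { // first pass: no lock
			return v.(string)
		}
		c.mu.Lock() // second pass: serialize builders
		defer c.mu.Unlock()
		if v, ok := c.m.Load(key); ok {
			return v.(string)
		}
		v := build()
		c.m.Store(key, v) // publish for future lock-free readers
		return v
	}

	func main() {
		var c cache
		fmt.Println(c.get("io.Reader/*bytes.Buffer", func() string { return "itab" }))
	}
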
 
 func getitab(inter *interfacetype, typ *_type, canfail bool) *itab {
@@ -28,19 +33,15 @@
 	}
 
 	// easy case
-	x := typ.x
-	if x == nil {
+	if typ.tflag&tflagUncommon == 0 {
 		if canfail {
 			return nil
 		}
-		panic(&TypeAssertionError{"", *typ._string, *inter.typ._string, *inter.mhdr[0].name})
+		name := inter.typ.nameOff(inter.mhdr[0].name)
+		panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), name.name()})
 	}
 
-	// compiler has provided some good hash codes for us.
-	h := inter.typ.hash
-	h += 17 * typ.hash
-	// TODO(rsc): h += 23 * x.mhash ?
-	h %= hashSize
+	h := itabhash(inter, typ)
 
 	// look twice - once without lock, once with.
 	// common case will be no lock contention.
@@ -50,20 +51,19 @@
 		if locked != 0 {
 			lock(&ifaceLock)
 		}
-		for m = (*itab)(atomicloadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link {
+		for m = (*itab)(atomic.Loadp(unsafe.Pointer(&hash[h]))); m != nil; m = m.link {
 			if m.inter == inter && m._type == typ {
 				if m.bad != 0 {
-					m = nil
 					if !canfail {
 						// this can only happen if the conversion
 						// was already done once using the , ok form
 						// and we have a cached negative result.
 						// the cached result doesn't record which
-						// interface function was missing, so jump
-						// down to the interface check, which will
-						// do more work but give a better error.
-						goto search
+						// interface function was missing, so try
+						// adding the itab again, which will throw an error.
+						additab(m, locked != 0, false)
 					}
+					m = nil
 				}
 				if locked != 0 {
 					unlock(&ifaceLock)
@@ -73,48 +73,10 @@
 		}
 	}
 
-	m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*ptrSize, 0, &memstats.other_sys))
+	m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*sys.PtrSize, 0, &memstats.other_sys))
 	m.inter = inter
 	m._type = typ
-
-search:
-	// both inter and typ have method sorted by name,
-	// and interface names are unique,
-	// so can iterate over both in lock step;
-	// the loop is O(ni+nt) not O(ni*nt).
-	ni := len(inter.mhdr)
-	nt := len(x.mhdr)
-	j := 0
-	for k := 0; k < ni; k++ {
-		i := &inter.mhdr[k]
-		iname := i.name
-		ipkgpath := i.pkgpath
-		itype := i._type
-		for ; j < nt; j++ {
-			t := &x.mhdr[j]
-			if t.mtyp == itype && (t.name == iname || *t.name == *iname) && t.pkgpath == ipkgpath {
-				if m != nil {
-					*(*unsafe.Pointer)(add(unsafe.Pointer(&m.fun[0]), uintptr(k)*ptrSize)) = t.ifn
-				}
-				goto nextimethod
-			}
-		}
-		// didn't find method
-		if !canfail {
-			if locked != 0 {
-				unlock(&ifaceLock)
-			}
-			panic(&TypeAssertionError{"", *typ._string, *inter.typ._string, *iname})
-		}
-		m.bad = 1
-		break
-	nextimethod:
-	}
-	if locked == 0 {
-		throw("invalid itab locking")
-	}
-	m.link = hash[h]
-	atomicstorep(unsafe.Pointer(&hash[h]), unsafe.Pointer(m))
+	additab(m, true, canfail)
 	unlock(&ifaceLock)
 	if m.bad != 0 {
 		return nil
@@ -122,109 +84,170 @@
 	return m
 }
 
-func typ2Itab(t *_type, inter *interfacetype, cache **itab) *itab {
-	tab := getitab(inter, t, false)
-	atomicstorep(unsafe.Pointer(cache), unsafe.Pointer(tab))
-	return tab
+func additab(m *itab, locked, canfail bool) {
+	inter := m.inter
+	typ := m._type
+	x := typ.uncommon()
+
+	// both inter and typ have method sorted by name,
+	// and interface names are unique,
+	// so can iterate over both in lock step;
+	// the loop is O(ni+nt) not O(ni*nt).
+	ni := len(inter.mhdr)
+	nt := int(x.mcount)
+	xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt]
+	j := 0
+	for k := 0; k < ni; k++ {
+		i := &inter.mhdr[k]
+		itype := inter.typ.typeOff(i.ityp)
+		name := inter.typ.nameOff(i.name)
+		iname := name.name()
+		ipkg := name.pkgPath()
+		if ipkg == "" {
+			ipkg = inter.pkgpath.name()
+		}
+		for ; j < nt; j++ {
+			t := &xmhdr[j]
+			tname := typ.nameOff(t.name)
+			if typ.typeOff(t.mtyp) == itype && tname.name() == iname {
+				pkgPath := tname.pkgPath()
+				if pkgPath == "" {
+					pkgPath = typ.nameOff(x.pkgpath).name()
+				}
+				if tname.isExported() || pkgPath == ipkg {
+					if m != nil {
+						ifn := typ.textOff(t.ifn)
+						*(*unsafe.Pointer)(add(unsafe.Pointer(&m.fun[0]), uintptr(k)*sys.PtrSize)) = ifn
+					}
+					goto nextimethod
+				}
+			}
+		}
+		// didn't find method
+		if !canfail {
+			if locked {
+				unlock(&ifaceLock)
+			}
+			panic(&TypeAssertionError{"", typ.string(), inter.typ.string(), iname})
+		}
+		m.bad = 1
+		break
+	nextimethod:
+	}
+	if !locked {
+		throw("invalid itab locking")
+	}
+	h := itabhash(inter, typ)
+	m.link = hash[h]
+	atomicstorep(unsafe.Pointer(&hash[h]), unsafe.Pointer(m))
 }
 
-func convT2E(t *_type, elem unsafe.Pointer, x unsafe.Pointer) (e interface{}) {
-	ep := (*eface)(unsafe.Pointer(&e))
-	if isDirectIface(t) {
-		ep._type = t
-		typedmemmove(t, unsafe.Pointer(&ep.data), elem)
-	} else {
-		if x == nil {
-			x = newobject(t)
+func itabsinit() {
+	lock(&ifaceLock)
+	for m := &firstmoduledata; m != nil; m = m.next {
+		for _, i := range m.itablinks {
+			additab(i, true, false)
 		}
-		// TODO: We allocate a zeroed object only to overwrite it with
-		// actual data.  Figure out how to avoid zeroing.  Also below in convT2I.
-		typedmemmove(t, x, elem)
-		ep._type = t
-		ep.data = x
 	}
+	unlock(&ifaceLock)
+}
+
+func convT2E(t *_type, elem unsafe.Pointer, x unsafe.Pointer) (e eface) {
+	if raceenabled {
+		raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&t)), funcPC(convT2E))
+	}
+	if msanenabled {
+		msanread(elem, t.size)
+	}
+	if isDirectIface(t) {
+		throw("direct convT2E")
+	}
+	if x == nil {
+		x = newobject(t)
+		// TODO: We allocate a zeroed object only to overwrite it with
+		// actual data. Figure out how to avoid zeroing. Also below in convT2I.
+	}
+	typedmemmove(t, x, elem)
+	e._type = t
+	e.data = x
 	return
 }
 
-func convT2I(t *_type, inter *interfacetype, cache **itab, elem unsafe.Pointer, x unsafe.Pointer) (i fInterface) {
-	tab := (*itab)(atomicloadp(unsafe.Pointer(cache)))
-	if tab == nil {
-		tab = getitab(inter, t, false)
-		atomicstorep(unsafe.Pointer(cache), unsafe.Pointer(tab))
+func convT2I(tab *itab, elem unsafe.Pointer, x unsafe.Pointer) (i iface) {
+	t := tab._type
+	if raceenabled {
+		raceReadObjectPC(t, elem, getcallerpc(unsafe.Pointer(&tab)), funcPC(convT2I))
 	}
-	pi := (*iface)(unsafe.Pointer(&i))
+	if msanenabled {
+		msanread(elem, t.size)
+	}
 	if isDirectIface(t) {
-		pi.tab = tab
-		typedmemmove(t, unsafe.Pointer(&pi.data), elem)
-	} else {
-		if x == nil {
-			x = newobject(t)
-		}
-		typedmemmove(t, x, elem)
-		pi.tab = tab
-		pi.data = x
+		throw("direct convT2I")
 	}
+	if x == nil {
+		x = newobject(t)
+	}
+	typedmemmove(t, x, elem)
+	i.tab = tab
+	i.data = x
 	return
 }
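
convT2E and convT2I now throw on direct-interface types because only pointer-shaped values are stored directly in the interface data word, and the compiler constructs those interfaces inline; the runtime only ever sees the allocating, indirect case. At the language level:

	package main

	import "fmt"

	func main() {
		p := new(int)
		var direct interface{} = p // pointer-shaped: data word holds p itself

		n := 42
		var indirect interface{} = n // non-pointer: n is copied to the heap

		fmt.Println(direct, indirect)
	}
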
 
 func panicdottype(have, want, iface *_type) {
 	haveString := ""
 	if have != nil {
-		haveString = *have._string
+		haveString = have.string()
 	}
-	panic(&TypeAssertionError{*iface._string, haveString, *want._string, ""})
+	panic(&TypeAssertionError{iface.string(), haveString, want.string(), ""})
 }
 
-func assertI2T(t *_type, i fInterface, r unsafe.Pointer) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2T(t *_type, i iface, r unsafe.Pointer) {
+	tab := i.tab
 	if tab == nil {
-		panic(&TypeAssertionError{"", "", *t._string, ""})
+		panic(&TypeAssertionError{"", "", t.string(), ""})
 	}
 	if tab._type != t {
-		panic(&TypeAssertionError{*tab.inter.typ._string, *tab._type._string, *t._string, ""})
+		panic(&TypeAssertionError{tab.inter.typ.string(), tab._type.string(), t.string(), ""})
 	}
 	if r != nil {
 		if isDirectIface(t) {
-			writebarrierptr((*uintptr)(r), uintptr(ip.data))
+			writebarrierptr((*uintptr)(r), uintptr(i.data))
 		} else {
-			typedmemmove(t, r, ip.data)
+			typedmemmove(t, r, i.data)
 		}
 	}
 }
 
-func assertI2T2(t *_type, i fInterface, r unsafe.Pointer) bool {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2T2(t *_type, i iface, r unsafe.Pointer) bool {
+	tab := i.tab
 	if tab == nil || tab._type != t {
 		if r != nil {
-			memclr(r, uintptr(t.size))
+			memclr(r, t.size)
 		}
 		return false
 	}
 	if r != nil {
 		if isDirectIface(t) {
-			writebarrierptr((*uintptr)(r), uintptr(ip.data))
+			writebarrierptr((*uintptr)(r), uintptr(i.data))
 		} else {
-			typedmemmove(t, r, ip.data)
+			typedmemmove(t, r, i.data)
 		}
 	}
 	return true
 }
 
-func assertE2T(t *_type, e interface{}, r unsafe.Pointer) {
-	ep := (*eface)(unsafe.Pointer(&e))
-	if ep._type == nil {
-		panic(&TypeAssertionError{"", "", *t._string, ""})
+func assertE2T(t *_type, e eface, r unsafe.Pointer) {
+	if e._type == nil {
+		panic(&TypeAssertionError{"", "", t.string(), ""})
 	}
-	if ep._type != t {
-		panic(&TypeAssertionError{"", *ep._type._string, *t._string, ""})
+	if e._type != t {
+		panic(&TypeAssertionError{"", e._type.string(), t.string(), ""})
 	}
 	if r != nil {
 		if isDirectIface(t) {
-			writebarrierptr((*uintptr)(r), uintptr(ep.data))
+			writebarrierptr((*uintptr)(r), uintptr(e.data))
 		} else {
-			typedmemmove(t, r, ep.data)
+			typedmemmove(t, r, e.data)
 		}
 	}
 }
@@ -232,101 +255,89 @@
 var testingAssertE2T2GC bool
 
 // The compiler ensures that r is non-nil.
-func assertE2T2(t *_type, e interface{}, r unsafe.Pointer) bool {
+func assertE2T2(t *_type, e eface, r unsafe.Pointer) bool {
 	if testingAssertE2T2GC {
 		GC()
 	}
-	ep := (*eface)(unsafe.Pointer(&e))
-	if ep._type != t {
-		memclr(r, uintptr(t.size))
+	if e._type != t {
+		memclr(r, t.size)
 		return false
 	}
 	if isDirectIface(t) {
-		writebarrierptr((*uintptr)(r), uintptr(ep.data))
+		writebarrierptr((*uintptr)(r), uintptr(e.data))
 	} else {
-		typedmemmove(t, r, ep.data)
+		typedmemmove(t, r, e.data)
 	}
 	return true
 }
 
-func convI2E(i fInterface) (r interface{}) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func convI2E(i iface) (r eface) {
+	tab := i.tab
 	if tab == nil {
 		return
 	}
-	rp := (*eface)(unsafe.Pointer(&r))
-	rp._type = tab._type
-	rp.data = ip.data
+	r._type = tab._type
+	r.data = i.data
 	return
 }
 
-func assertI2E(inter *interfacetype, i fInterface, r *interface{}) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2E(inter *interfacetype, i iface, r *eface) {
+	tab := i.tab
 	if tab == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", *inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
-	rp := (*eface)(unsafe.Pointer(r))
-	rp._type = tab._type
-	rp.data = ip.data
+	r._type = tab._type
+	r.data = i.data
 	return
 }
 
 // The compiler ensures that r is non-nil.
-func assertI2E2(inter *interfacetype, i fInterface, r *interface{}) bool {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2E2(inter *interfacetype, i iface, r *eface) bool {
+	tab := i.tab
 	if tab == nil {
 		return false
 	}
-	rp := (*eface)(unsafe.Pointer(r))
-	rp._type = tab._type
-	rp.data = ip.data
+	r._type = tab._type
+	r.data = i.data
 	return true
 }
 
-func convI2I(inter *interfacetype, i fInterface) (r fInterface) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func convI2I(inter *interfacetype, i iface) (r iface) {
+	tab := i.tab
 	if tab == nil {
 		return
 	}
-	rp := (*iface)(unsafe.Pointer(&r))
 	if tab.inter == inter {
-		rp.tab = tab
-		rp.data = ip.data
+		r.tab = tab
+		r.data = i.data
 		return
 	}
-	rp.tab = getitab(inter, tab._type, false)
-	rp.data = ip.data
+	r.tab = getitab(inter, tab._type, false)
+	r.data = i.data
 	return
 }
 
-func assertI2I(inter *interfacetype, i fInterface, r *fInterface) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2I(inter *interfacetype, i iface, r *iface) {
+	tab := i.tab
 	if tab == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", *inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
-	rp := (*iface)(unsafe.Pointer(r))
 	if tab.inter == inter {
-		rp.tab = tab
-		rp.data = ip.data
+		r.tab = tab
+		r.data = i.data
 		return
 	}
-	rp.tab = getitab(inter, tab._type, false)
-	rp.data = ip.data
+	r.tab = getitab(inter, tab._type, false)
+	r.data = i.data
 }
 
-func assertI2I2(inter *interfacetype, i fInterface, r *fInterface) bool {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
+func assertI2I2(inter *interfacetype, i iface, r *iface) bool {
+	tab := i.tab
 	if tab == nil {
 		if r != nil {
-			*r = nil
+			*r = iface{}
 		}
 		return false
 	}
@@ -334,103 +345,78 @@
 		tab = getitab(inter, tab._type, true)
 		if tab == nil {
 			if r != nil {
-				*r = nil
+				*r = iface{}
 			}
 			return false
 		}
 	}
 	if r != nil {
-		rp := (*iface)(unsafe.Pointer(r))
-		rp.tab = tab
-		rp.data = ip.data
+		r.tab = tab
+		r.data = i.data
 	}
 	return true
 }
 
-func assertE2I(inter *interfacetype, e interface{}, r *fInterface) {
-	ep := (*eface)(unsafe.Pointer(&e))
-	t := ep._type
+func assertE2I(inter *interfacetype, e eface, r *iface) {
+	t := e._type
 	if t == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", *inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
-	rp := (*iface)(unsafe.Pointer(r))
-	rp.tab = getitab(inter, t, false)
-	rp.data = ep.data
+	r.tab = getitab(inter, t, false)
+	r.data = e.data
 }
 
 var testingAssertE2I2GC bool
 
-func assertE2I2(inter *interfacetype, e interface{}, r *fInterface) bool {
+func assertE2I2(inter *interfacetype, e eface, r *iface) bool {
 	if testingAssertE2I2GC {
 		GC()
 	}
-	ep := (*eface)(unsafe.Pointer(&e))
-	t := ep._type
+	t := e._type
 	if t == nil {
 		if r != nil {
-			*r = nil
+			*r = iface{}
 		}
 		return false
 	}
 	tab := getitab(inter, t, true)
 	if tab == nil {
 		if r != nil {
-			*r = nil
+			*r = iface{}
 		}
 		return false
 	}
 	if r != nil {
-		rp := (*iface)(unsafe.Pointer(r))
-		rp.tab = tab
-		rp.data = ep.data
+		r.tab = tab
+		r.data = e.data
 	}
 	return true
 }
 
 //go:linkname reflect_ifaceE2I reflect.ifaceE2I
-func reflect_ifaceE2I(inter *interfacetype, e interface{}, dst *fInterface) {
+func reflect_ifaceE2I(inter *interfacetype, e eface, dst *iface) {
 	assertE2I(inter, e, dst)
 }
 
-func assertE2E(inter *interfacetype, e interface{}, r *interface{}) {
-	ep := (*eface)(unsafe.Pointer(&e))
-	if ep._type == nil {
+func assertE2E(inter *interfacetype, e eface, r *eface) {
+	if e._type == nil {
 		// explicit conversions require non-nil interface value.
-		panic(&TypeAssertionError{"", "", *inter.typ._string, ""})
+		panic(&TypeAssertionError{"", "", inter.typ.string(), ""})
 	}
 	*r = e
 }
 
 // The compiler ensures that r is non-nil.
-func assertE2E2(inter *interfacetype, e interface{}, r *interface{}) bool {
-	ep := (*eface)(unsafe.Pointer(&e))
-	if ep._type == nil {
-		*r = nil
+func assertE2E2(inter *interfacetype, e eface, r *eface) bool {
+	if e._type == nil {
+		*r = eface{}
 		return false
 	}
 	*r = e
 	return true
 }
 
-func ifacethash(i fInterface) uint32 {
-	ip := (*iface)(unsafe.Pointer(&i))
-	tab := ip.tab
-	if tab == nil {
-		return 0
-	}
-	return tab._type.hash
-}
-
-func efacethash(e interface{}) uint32 {
-	ep := (*eface)(unsafe.Pointer(&e))
-	t := ep._type
-	if t == nil {
-		return 0
-	}
-	return t.hash
-}
-
 func iterate_itabs(fn func(*itab)) {
 	for _, h := range &hash {
 		for ; h != nil; h = h.link {
diff --git a/src/runtime/internal/atomic/asm.s b/src/runtime/internal/atomic/asm.s
new file mode 100644
index 0000000..8488585
--- /dev/null
+++ b/src/runtime/internal/atomic/asm.s
@@ -0,0 +1,8 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·nop(SB),NOSPLIT,$0-0
+	RET
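
asm.s carries only the shared nop; the per-arch files that follow implement the Go API of the new runtime/internal/atomic package. For orientation, the 386 routines below correspond to Go declarations of roughly this shape (assembly-backed, so they are declared without bodies in the package's Go files):

	// Sketch of the Go-side declarations the assembly implements; the
	// real package adds //go:noescape and more variants per arch.
	package atomic

	import "unsafe"

	func Cas(ptr *uint32, old, new uint32) bool
	func Cas64(ptr *uint64, old, new uint64) bool
	func Load(ptr *uint32) uint32
	func Load64(ptr *uint64) uint64
	func Store(ptr *uint32, val uint32)
	func Store64(ptr *uint64, val uint64)
	func Xadd(ptr *uint32, delta int32) uint32
	func Xchg(ptr *uint32, new uint32) uint32
	func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
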
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
new file mode 100644
index 0000000..ebecd0b
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -0,0 +1,166 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+12(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13
+	JMP	runtime∕internal∕atomic·Cas(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8
+	JMP	runtime∕internal∕atomic·Store(SB)
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-8
+	JMP runtime∕internal∕atomic·Xadd(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	JMP runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-16
+	JMP runtime∕internal∕atomic·Xadd64(SB)
+
+
+// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
+	MOVL	ptr+0(FP), BP
+	MOVL	old_lo+4(FP), AX
+	MOVL	old_hi+8(FP), DX
+	MOVL	new_lo+12(FP), BX
+	MOVL	new_hi+16(FP), CX
+	LOCK
+	CMPXCHG8B	0(BP)
+	SETEQ	ret+20(FP)
+	RET
+
+// bool Casp(void **p, void *old, void *new)
+// Atomically:
+//	if(*p == old){
+//		*p = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+12(FP)
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	delta+4(FP), AX
+	MOVL	AX, CX
+	LOCK
+	XADDL	AX, 0(BX)
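+	// XADDL leaves the old value in AX; add the saved delta (CX)
+	// to return the new value.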
+	ADDL	CX, AX
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	new+4(FP), AX
+	XCHGL	AX, 0(BX)
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xchg(SB)
+
+
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+// uint64 atomicload64(uint64 volatile* addr);
+TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), AX
+	TESTL	$7, AX
+	JZ	2(PC)
+	MOVL	0, AX // crash with nil ptr deref
+	LEAL	ret_lo+4(FP), BX
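+	// 386 has no 64-bit integer registers, so the access is done
+	// with an MMX MOVQ, which is atomic for 8-byte-aligned addresses.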
+	// MOVQ (%EAX), %MM0
+	BYTE $0x0f; BYTE $0x6f; BYTE $0x00
+	// MOVQ %MM0, 0(%EBX)
+	BYTE $0x0f; BYTE $0x7f; BYTE $0x03
+	// EMMS
+	BYTE $0x0F; BYTE $0x77
+	RET
+
+// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v);
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), AX
+	TESTL	$7, AX
+	JZ	2(PC)
+	MOVL	0, AX // crash with nil ptr deref
+	// MOVQ and EMMS were introduced on the Pentium MMX.
+	// MOVQ 0x8(%ESP), %MM0
+	BYTE $0x0f; BYTE $0x6f; BYTE $0x44; BYTE $0x24; BYTE $0x08
+	// MOVQ %MM0, (%EAX)
+	BYTE $0x0f; BYTE $0x7f; BYTE $0x00
+	// EMMS
+	BYTE $0x0F; BYTE $0x77
+	// This is essentially a no-op, but it provides required memory fencing.
+	// It can be replaced with MFENCE, but MFENCE was introduced only on the Pentium4 (SSE2).
+	MOVL	$0, AX
+	LOCK
+	XADDL	AX, (SP)
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), AX
+	MOVB	val+4(FP), BX
+	LOCK
+	ANDB	BX, (AX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
new file mode 100644
index 0000000..94d4ac2
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -0,0 +1,150 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0-17
+	MOVQ	ptr+0(FP), BX
+	MOVL	old+8(FP), AX
+	MOVL	new+12(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+16(FP)
+	RET
+
+// bool	runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVQ	ptr+0(FP), BX
+	MOVQ	old+8(FP), AX
+	MOVQ	new+16(FP), CX
+	LOCK
+	CMPXCHGQ	CX, 0(BX)
+	SETEQ	ret+24(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	JMP	runtime∕internal∕atomic·Cas64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-16
+	JMP	runtime∕internal∕atomic·Xadd64(SB)
+
+// bool Casp(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	MOVQ	ptr+0(FP), BX
+	MOVQ	old+8(FP), AX
+	MOVQ	new+16(FP), CX
+	LOCK
+	CMPXCHGQ	CX, 0(BX)
+	SETEQ	ret+24(FP)
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+	MOVQ	ptr+0(FP), BX
+	MOVL	delta+8(FP), AX
+	MOVL	AX, CX
+	LOCK
+	XADDL	AX, 0(BX)
+	ADDL	CX, AX
+	MOVL	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVQ	ptr+0(FP), BX
+	MOVQ	delta+8(FP), AX
+	MOVQ	AX, CX
+	LOCK
+	XADDQ	AX, 0(BX)
+	ADDQ	CX, AX
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xadd64(SB)
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+	MOVQ	ptr+0(FP), BX
+	MOVL	new+8(FP), AX
+	XCHGL	AX, 0(BX)
+	MOVL	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVQ	ptr+0(FP), BX
+	MOVQ	new+8(FP), AX
+	XCHGQ	AX, 0(BX)
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xchg64(SB)
+
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), BX
+	MOVQ	val+8(FP), AX
+	XCHGQ	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVQ	ptr+0(FP), BX
+	MOVL	val+8(FP), AX
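+	// XCHG with a memory operand implies LOCK, making the store
+	// sequentially consistent without an explicit fence.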
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVQ	ptr+0(FP), BX
+	MOVQ	val+8(FP), AX
+	XCHGQ	AX, 0(BX)
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
+	MOVQ	ptr+0(FP), AX
+	MOVB	val+8(FP), BX
+	LOCK
+	ORB	BX, (AX)
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
+	MOVQ	ptr+0(FP), AX
+	MOVB	val+8(FP), BX
+	LOCK
+	ANDB	BX, (AX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_amd64p32.s b/src/runtime/internal/atomic/asm_amd64p32.s
new file mode 100644
index 0000000..74c79d0
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_amd64p32.s
@@ -0,0 +1,150 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-17
+	JMP	runtime∕internal∕atomic·Cas(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Store(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-24
+	JMP	runtime∕internal∕atomic·Xadd64(SB)
+
+// bool	runtime∕internal∕atomic·cas64(uint64 *val, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVL	ptr+0(FP), BX
+	MOVQ	old+8(FP), AX
+	MOVQ	new+16(FP), CX
+	LOCK
+	CMPXCHGQ	CX, 0(BX)
+	SETEQ	ret+24(FP)
+	RET
+
+// bool Casp(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-17
+	MOVL	ptr+0(FP), BX
+	MOVL	old+4(FP), AX
+	MOVL	new+8(FP), CX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	SETEQ	ret+16(FP)
+	RET
+
+// uint32 Xadd(uint32 volatile *val, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	delta+4(FP), AX
+	MOVL	AX, CX
+	LOCK
+	XADDL	AX, 0(BX)
+	ADDL	CX, AX
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVL	ptr+0(FP), BX
+	MOVQ	delta+8(FP), AX
+	MOVQ	AX, CX
+	LOCK
+	XADDQ	AX, 0(BX)
+	ADDQ	CX, AX
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xadd(SB)
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+	MOVL	ptr+0(FP), BX
+	MOVL	new+4(FP), AX
+	XCHGL	AX, 0(BX)
+	MOVL	AX, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVL	ptr+0(FP), BX
+	MOVQ	new+8(FP), AX
+	XCHGQ	AX, 0(BX)
+	MOVQ	AX, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
+	JMP	runtime∕internal∕atomic·Xchg(SB)
+
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+	MOVL	ptr+0(FP), BX
+	MOVL	val+4(FP), AX
+	XCHGL	AX, 0(BX)
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVL	ptr+0(FP), BX
+	MOVQ	val+8(FP), AX
+	XCHGQ	AX, 0(BX)
+	RET
+
+// void	runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), BX
+	MOVB	val+4(FP), AX
+	LOCK
+	ORB	AX, 0(BX)
+	RET
+
+// void	runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+	MOVL	ptr+0(FP), BX
+	MOVB	val+4(FP), AX
+	LOCK
+	ANDB	AX, 0(BX)
+	RET
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s
new file mode 100644
index 0000000..235e8bf
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_arm.s
@@ -0,0 +1,71 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+//
+// To implement runtime∕internal∕atomic·cas in sys_$GOOS_arm.s
+// using the native instructions, use:
+//
+//	TEXT runtime∕internal∕atomic·cas(SB),NOSPLIT,$0
+//		B	runtime∕internal∕atomic·armcas(SB)
+//
+TEXT runtime∕internal∕atomic·armcas(SB),NOSPLIT,$0-13
+	MOVW	valptr+0(FP), R1
+	MOVW	old+4(FP), R2
+	MOVW	new+8(FP), R3
+casl:
+	LDREX	(R1), R0
+	CMP	R0, R2
+	BNE	casfail
+
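+	// DMB is only available on ARMv7 and later; on older cores the
+	// BLT skips the barrier.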
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	WORD	$0xf57ff05a	// dmb ishst
+
+	STREX	R3, (R1), R0
+	CMP	$0, R0
+	BNE	casl
+	MOVW	$1, R0
+
+	MOVB	runtime·goarm(SB), R11
+	CMP	$7, R11
+	BLT	2(PC)
+	WORD	$0xf57ff05b	// dmb ish
+
+	MOVB	R0, ret+12(FP)
+	RET
+casfail:
+	MOVW	$0, R0
+	MOVB	R0, ret+12(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13
+	B	runtime∕internal∕atomic·Cas(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Load(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Store(SB)
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-8
+	B	runtime∕internal∕atomic·Xadd(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-16
+	B	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-16
+	B	runtime∕internal∕atomic·Xadd64(SB)
diff --git a/src/runtime/internal/atomic/asm_arm64.s b/src/runtime/internal/atomic/asm_arm64.s
new file mode 100644
index 0000000..c255677
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_arm64.s
@@ -0,0 +1,58 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// bool Cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R0
+	MOVW	old+8(FP), R1
+	MOVW	new+12(FP), R2
+again:
+	LDAXRW	(R0), R3
+	CMPW	R1, R3
+	BNE	ok
+	STLXRW	R2, (R0), R3
+	CBNZ	R3, again
+ok:
+	CSET	EQ, R0
+	MOVB	R0, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	B	runtime∕internal∕atomic·Cas64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $-8-16
+	B	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $-8-16
+	B	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Xadd64(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Xadd64(SB)
+
+// bool Casp(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	B runtime∕internal∕atomic·Cas64(SB)
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
new file mode 100644
index 0000000..80b178d
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -0,0 +1,231 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+#define LL(base, rt)	WORD	$((060<<26)|((base)<<21)|((rt)<<16))
+#define LLV(base, rt)	WORD	$((064<<26)|((base)<<21)|((rt)<<16))
+#define SC(base, rt)	WORD	$((070<<26)|((base)<<21)|((rt)<<16))
+#define SCV(base, rt)	WORD	$((074<<26)|((base)<<21)|((rt)<<16))
+#define SYNC	WORD $0xf
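+// The macros above hand-assemble LL/SC, which this assembler lacks:
+// opcode in the top six bits, then the base and target register fields.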
+
+// bool cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT ·Cas(SB), NOSPLIT, $0-17
+	MOVV	ptr+0(FP), R1
+	MOVW	old+8(FP), R2
+	MOVW	new+12(FP), R5
+	SYNC
+cas_again:
+	MOVV	R5, R3
+	LL(1, 4)	// R4 = *R1
+	BNE	R2, R4, cas_fail
+	SC(1, 3)	// *R1 = R3
+	BEQ	R3, cas_again
+	MOVV	$1, R1
+	MOVB	R1, ret+16(FP)
+	SYNC
+	RET
+cas_fail:
+	MOVV	$0, R1
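+	// Jump back into the MOVB/SYNC/RET tail above with R1 = 0.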
+	JMP	-4(PC)
+
+// bool	cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+	MOVV	ptr+0(FP), R1
+	MOVV	old+8(FP), R2
+	MOVV	new+16(FP), R5
+	SYNC
+cas64_again:
+	MOVV	R5, R3
+	LLV(1, 4)	// R4 = *R1
+	BNE	R2, R4, cas64_fail
+	SCV(1, 3)	// *R1 = R3
+	BEQ	R3, cas64_again
+	MOVV	$1, R1
+	MOVB	R1, ret+24(FP)
+	SYNC
+	RET
+cas64_fail:
+	MOVV	$0, R1
+	JMP	-4(PC)
+
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+	JMP	·Cas64(SB)
+
+TEXT ·Loaduintptr(SB),  NOSPLIT|NOFRAME, $0-16
+	JMP	·Load64(SB)
+
+TEXT ·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
+	JMP	·Load64(SB)
+
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+	JMP	·Xadd64(SB)
+
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+	JMP	·Load64(SB)
+
+TEXT ·Xaddint64(SB), NOSPLIT, $0-24
+	JMP	·Xadd64(SB)
+
+// bool casp(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+	JMP runtime∕internal∕atomic·Cas64(SB)
+
+// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+	MOVV	ptr+0(FP), R2
+	MOVW	delta+8(FP), R3
+	SYNC
+	LL(2, 1)	// R1 = *R2
+	ADDU	R1, R3, R4
+	MOVV	R4, R1
+	SC(2, 4)	// *R2 = R4
+	BEQ	R4, -4(PC)
+	MOVW	R1, ret+16(FP)
+	SYNC
+	RET
+
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+	MOVV	ptr+0(FP), R2
+	MOVV	delta+8(FP), R3
+	SYNC
+	LLV(2, 1)	// R1 = *R2
+	ADDVU	R1, R3, R4
+	MOVV	R4, R1
+	SCV(2, 4)	// *R2 = R4
+	BEQ	R4, -4(PC)
+	MOVV	R1, ret+16(FP)
+	SYNC
+	RET
+
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+	MOVV	ptr+0(FP), R2
+	MOVW	new+8(FP), R5
+
+	SYNC
+	MOVV	R5, R3
+	LL(2, 1)	// R1 = *R2
+	SC(2, 3)	// *R2 = R3
+	BEQ	R3, -3(PC)
+	MOVW	R1, ret+16(FP)
+	SYNC
+	RET
+
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+	MOVV	ptr+0(FP), R2
+	MOVV	new+8(FP), R5
+
+	SYNC
+	MOVV	R5, R3
+	LLV(2, 1)	// R1 = *R2
+	SCV(2, 3)	// *R2 = R3
+	BEQ	R3, -3(PC)
+	MOVV	R1, ret+16(FP)
+	SYNC
+	RET
+
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+	JMP	·Xchg64(SB)
+
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
+	JMP	·Store64(SB)
+
+TEXT ·Store(SB), NOSPLIT, $0-12
+	MOVV	ptr+0(FP), R1
+	MOVW	val+8(FP), R2
+	SYNC
+	MOVW	R2, 0(R1)
+	SYNC
+	RET
+
+TEXT ·Store64(SB), NOSPLIT, $0-16
+	MOVV	ptr+0(FP), R1
+	MOVV	val+8(FP), R2
+	SYNC
+	MOVV	R2, 0(R1)
+	SYNC
+	RET
+
+// void	Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-9
+	MOVV	ptr+0(FP), R1
+	MOVBU	val+8(FP), R2
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	MOVV	$~3, R3
+	AND	R1, R3
+	// Compute val shift.
+#ifdef GOARCH_mips64
+	// Big endian. ptr = ptr ^ 3
+	XOR	$3, R1
+#endif
+	// R4 = ((ptr & 3) * 8)
+	AND	$3, R1, R4
+	SLLV	$3, R4
+	// Shift val for aligned ptr. R2 = val << R4
+	SLLV	R4, R2
+
+	SYNC
+	LL(3, 4)	// R4 = *R3
+	OR	R2, R4
+	SC(3, 4)	// *R3 = R4
+	BEQ	R4, -4(PC)
+	SYNC
+	RET
+
+// void	And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-9
+	MOVV	ptr+0(FP), R1
+	MOVBU	val+8(FP), R2
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	MOVV	$~3, R3
+	AND	R1, R3
+	// Compute val shift.
+#ifdef GOARCH_mips64
+	// Big endian. ptr = ptr ^ 3
+	XOR	$3, R1
+#endif
+	// R4 = ((ptr & 3) * 8)
+	AND	$3, R1, R4
+	SLLV	$3, R4
+	// Shift val for aligned ptr. R2 = val << R4 | ^(0xFF << R4)
+	MOVV	$0xFF, R5
+	SLLV	R4, R2
+	SLLV	R4, R5
+	NOR	R0, R5
+	OR	R5, R2
+
+	SYNC
+	LL(3, 4)	// R4 = *R3
+	AND	R2, R4
+	SC(3, 4)	// *R3 = R4
+	BEQ	R4, -4(PC)
+	SYNC
+	RET
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
new file mode 100644
index 0000000..de4f895
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -0,0 +1,231 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+// bool cas(uint32 *ptr, uint32 old, uint32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R3
+	MOVWZ	old+8(FP), R4
+	MOVWZ	new+12(FP), R5
+	SYNC
+cas_again:
+	LWAR	(R3), R6
+	CMPW	R6, R4
+	BNE	cas_fail
+	STWCCC	R5, (R3)
+	BNE	cas_again
+	MOVD	$1, R3
+	ISYNC
+	MOVB	R3, ret+16(FP)
+	RET
+cas_fail:
+	MOVB	R0, ret+16(FP)
+	RET
+
+// bool	runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else {
+//		return 0;
+//	}
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R3
+	MOVD	old+8(FP), R4
+	MOVD	new+16(FP), R5
+	SYNC
+cas64_again:
+	LDAR	(R3), R6
+	CMP	R6, R4
+	BNE	cas64_fail
+	STDCCC	R5, (R3)
+	BNE	cas64_again
+	MOVD	$1, R3
+	ISYNC
+	MOVB	R3, ret+24(FP)
+	RET
+cas64_fail:
+	MOVB	R0, ret+24(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
+	BR	runtime∕internal∕atomic·Cas64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduintptr(SB),  NOSPLIT|NOFRAME, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
+	BR	runtime∕internal∕atomic·Xadd64(SB)
+
+TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Load64(SB)
+
+TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Xadd64(SB)
+
+// bool casp(void **val, void *old, void *new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	} else
+//		return 0;
+TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-25
+	BR runtime∕internal∕atomic·Cas64(SB)
+
+// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+//	*val += delta;
+//	return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	delta+8(FP), R5
+	SYNC
+	LWAR	(R4), R3
+	ADD	R5, R3
+	STWCCC	R3, (R4)
+	BNE	-3(PC)
+	ISYNC
+	MOVW	R3, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	delta+8(FP), R5
+	SYNC
+	LDAR	(R4), R3
+	ADD	R5, R3
+	STDCCC	R3, (R4)
+	BNE	-3(PC)
+	ISYNC
+	MOVD	R3, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	new+8(FP), R5
+	SYNC
+	LWAR	(R4), R3
+	STWCCC	R5, (R4)
+	BNE	-2(PC)
+	ISYNC
+	MOVW	R3, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	new+8(FP), R5
+	SYNC
+	LDAR	(R4), R3
+	STDCCC	R5, (R4)
+	BNE	-2(PC)
+	ISYNC
+	MOVD	R3, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	BR	runtime∕internal∕atomic·Xchg64(SB)
+
+
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
+	BR	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R3
+	MOVW	val+8(FP), R4
+	SYNC
+	MOVW	R4, 0(R3)
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVD	ptr+0(FP), R3
+	MOVD	val+8(FP), R4
+	SYNC
+	MOVD	R4, 0(R3)
+	RET
+
+// void runtime∕internal∕atomic·Or8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R3
+	MOVBZ	val+8(FP), R4
+#ifdef  GOARCH_ppc64
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R5 = (R3 << 0) & ~3
+	RLDCR	$0, R3, $~3, R5
+	// Compute val shift.
+	// Big endian. ptr = ptr ^ 3
+	XOR	$3, R3
+	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
+	RLDC	$3, R3, $(3*8), R6
+	// Shift val for aligned ptr. R4 = val << R6
+	SLD	R6, R4, R4
+	SYNC
+
+again:
+	LWAR	(R5), R6
+	OR	R4, R6
+	STWCCC	R6, (R5)
+	BNE	again
+#else
+	SYNC
+again:
+	LBAR	(R3), R6
+	OR	R4, R6
+	STBCCC	R6, (R3)
+	BNE	again
+#endif
+	ISYNC
+	RET
+
+// void runtime∕internal∕atomic·And8(byte volatile*, byte);
+TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
+	MOVD	ptr+0(FP), R3
+	MOVBZ	val+8(FP), R4
+#ifdef  GOARCH_ppc64
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	// R5 = (R3 << 0) & ~3
+	RLDCR	$0, R3, $~3, R5
+	// Compute val shift.
+	// Big endian. ptr = ptr ^ 3
+	XOR	$3, R3
+	// R6 = ((ptr & 3) * 8) = (ptr << 3) & (3*8)
+	RLDC	$3, R3, $(3*8), R6
+	// Shift val for aligned ptr. R4 = val << R6 | ^(0xFF << R6)
+	MOVD	$0xFF, R7
+	SLD	R6, R4
+	SLD	R6, R7
+	XOR	$-1, R7
+	OR	R7, R4
+	SYNC
+again:
+	LWAR	(R5), R6
+	AND	R4, R6
+	STWCCC	R6, (R5)
+	BNE	again
+#else
+	SYNC
+again:
+	LBAR	(R3),R6
+	AND	R4,R6
+	STBCCC	R6,(R3)
+	BNE	again
+#endif
+	ISYNC
+	RET
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
new file mode 100644
index 0000000..c84718c
--- /dev/null
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -0,0 +1,174 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func Cas(ptr *uint32, old, new uint32) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Cas(SB), NOSPLIT, $0-17
+	MOVD	ptr+0(FP), R3
+	MOVWZ	old+8(FP), R4
+	MOVWZ	new+12(FP), R5
+	CS	R4, R5, 0(R3)    //  if (R4 == 0(R3)) then 0(R3)= R5
+	BNE	cas_fail
+	MOVB	$1, ret+16(FP)
+	RET
+cas_fail:
+	MOVB	$0, ret+16(FP)
+	RET
+
+// func Cas64(ptr *uint64, old, new uint64) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R3
+	MOVD	old+8(FP), R4
+	MOVD	new+16(FP), R5
+	CSG	R4, R5, 0(R3)    //  if (R4 == 0(R3)) then 0(R3)= R5
+	BNE	cas64_fail
+	MOVB	$1, ret+24(FP)
+	RET
+cas64_fail:
+	MOVB	$0, ret+24(FP)
+	RET
+
+// func Casuintptr(ptr *uintptr, old, new uintptr) bool
+TEXT ·Casuintptr(SB), NOSPLIT, $0-25
+	BR	·Cas64(SB)
+
+// func Loaduintptr(ptr *uintptr) uintptr
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Loaduint(ptr *uint) uint
+TEXT ·Loaduint(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Storeuintptr(ptr *uintptr, new uintptr)
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-16
+	BR	·Store64(SB)
+
+// func Loadint64(ptr *int64) int64
+TEXT ·Loadint64(SB), NOSPLIT, $0-16
+	BR	·Load64(SB)
+
+// func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-24
+	BR	·Xadd64(SB)
+
+// func Xaddint64(ptr *int64, delta int64) int64
+TEXT ·Xaddint64(SB), NOSPLIT, $0-16
+	BR	·Xadd64(SB)
+
+// func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
+// Atomically:
+//	if *ptr == old {
+//		*ptr = new
+//		return 1
+//	} else {
+//		return 0
+//	}
+TEXT ·Casp1(SB), NOSPLIT, $0-25
+	BR ·Cas64(SB)
+
+// func Xadd(ptr *uint32, delta int32) uint32
+// Atomically:
+//	*ptr += delta
+//	return *ptr
+TEXT ·Xadd(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	delta+8(FP), R5
+	MOVW	(R4), R3
+repeat:
+	ADD	R5, R3, R6
+	CS	R3, R6, (R4) // if R3==(R4) then (R4)=R6 else R3=(R4)
+	BNE	repeat
+	MOVW	R6, ret+16(FP)
+	RET
+
+// func Xadd64(ptr *uint64, delta int64) uint64
+TEXT ·Xadd64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	delta+8(FP), R5
+	MOVD	(R4), R3
+repeat:
+	ADD	R5, R3, R6
+	CSG	R3, R6, (R4) // if R3==(R4) then (R4)=R6 else R3=(R4)
+	BNE	repeat
+	MOVD	R6, ret+16(FP)
+	RET
+
+// func Xchg(ptr *uint32, new uint32) uint32
+TEXT ·Xchg(SB), NOSPLIT, $0-20
+	MOVD	ptr+0(FP), R4
+	MOVW	new+8(FP), R3
+	MOVW	(R4), R6
+repeat:
+	CS	R6, R3, (R4) // if R6==(R4) then (R4)=R3 else R6=(R4)
+	BNE	repeat
+	MOVW	R6, ret+16(FP)
+	RET
+
+// func Xchg64(ptr *uint64, new uint64) uint64
+TEXT ·Xchg64(SB), NOSPLIT, $0-24
+	MOVD	ptr+0(FP), R4
+	MOVD	new+8(FP), R3
+	MOVD	(R4), R6
+repeat:
+	CSG	R6, R3, (R4) // if R6==(R4) then (R4)=R3 else R6=(R4)
+	BNE	repeat
+	MOVD	R6, ret+16(FP)
+	RET
+
+// func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
+	BR	·Xchg64(SB)
+
+// func Or8(addr *uint8, v uint8)
+TEXT ·Or8(SB), NOSPLIT, $0-9
+	MOVD    ptr+0(FP), R3
+	MOVBZ   val+8(FP), R4
+	// Calculate shift.
+	AND	$3, R3, R5
+	XOR	$3, R5 // big endian - flip direction
+	SLD	$3, R5 // MUL $8, R5
+	SLD	R5, R4
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	AND	$-4, R3
+	MOVWZ	0(R3), R6
+again:
+	OR	R4, R6, R7
+	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
+	BNE	again
+	RET
+
+// func And8(addr *uint8, v uint8)
+TEXT ·And8(SB), NOSPLIT, $0-9
+	MOVD    ptr+0(FP), R3
+	MOVBZ   val+8(FP), R4
+	// Calculate shift.
+	AND	$3, R3, R5
+	XOR	$3, R5 // big endian - flip direction
+	SLD	$3, R5 // MUL $8, R5
+	OR	$-256, R4 // create 0xffffffffffffffxx
+	RLLG	R5, R4
+	// Align ptr down to 4 bytes so we can use 32-bit load/store.
+	AND	$-4, R3
+	MOVWZ	0(R3), R6
+again:
+	AND	R4, R6, R7
+	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
+	BNE	again
+	RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
new file mode 100644
index 0000000..23a8479
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -0,0 +1,76 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386
+
+package atomic
+
+import "unsafe"
+
+//go:nosplit
+//go:noinline
+func Load(ptr *uint32) uint32 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	return *(*unsafe.Pointer)(ptr)
+}
+
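+// 386 has no 64-bit XADD or XCHG, so Xadd64 and Xchg64 are emulated
+// with Cas64 loops.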
+//go:nosplit
+func Xadd64(ptr *uint64, delta int64) uint64 {
+	for {
+		old := *ptr
+		if Cas64(ptr, old, old+uint64(delta)) {
+			return old + uint64(delta)
+		}
+	}
+}
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:nosplit
+func Xchg64(ptr *uint64, new uint64) uint64 {
+	for {
+		old := *ptr
+		if Cas64(ptr, old, new) {
+			return old
+		}
+	}
+}
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Load64(ptr *uint64) uint64
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_amd64x.go b/src/runtime/internal/atomic/atomic_amd64x.go
new file mode 100644
index 0000000..54851d3
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_amd64x.go
@@ -0,0 +1,68 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 amd64p32
+
+package atomic
+
+import "unsafe"
+
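+// On amd64 the TSO memory model makes aligned loads atomic with
+// acquire ordering, so Load, Loadp, and Load64 are plain dereferences;
+// //go:noinline preserves the call boundary as a compiler barrier.
+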
+//go:nosplit
+//go:noinline
+func Load(ptr *uint32) uint32 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	return *(*unsafe.Pointer)(ptr)
+}
+
+//go:nosplit
+//go:noinline
+func Load64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// StorepNoWB performs *ptr = val atomically and without a write
+// barrier.
+//
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
new file mode 100644
index 0000000..244237d
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -0,0 +1,183 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm
+
+package atomic
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type spinlock struct {
+	v uint32
+}
+
+//go:nosplit
+func (l *spinlock) lock() {
+	for {
+		if Cas(&l.v, 0, 1) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func (l *spinlock) unlock() {
+	Store(&l.v, 0)
+}
+
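+// 64-bit atomics are emulated with spinlocks: the lock for an address
+// is chosen by hashing the address into locktab, and each entry is
+// padded out to a cache line to avoid false sharing.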
+var locktab [57]struct {
+	l   spinlock
+	pad [sys.CacheLineSize - unsafe.Sizeof(spinlock{})]byte
+}
+
+func addrLock(addr *uint64) *spinlock {
+	return &locktab[(uintptr(unsafe.Pointer(addr))>>3)%uintptr(len(locktab))].l
+}
+
+// Atomic add and return new value.
+//go:nosplit
+func Xadd(val *uint32, delta int32) uint32 {
+	for {
+		oval := *val
+		nval := oval + uint32(delta)
+		if Cas(val, oval, nval) {
+			return nval
+		}
+	}
+}
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:nosplit
+func Xchg(addr *uint32, v uint32) uint32 {
+	for {
+		old := *addr
+		if Cas(addr, old, v) {
+			return old
+		}
+	}
+}
+
+//go:nosplit
+func Xchguintptr(addr *uintptr, v uintptr) uintptr {
+	return uintptr(Xchg((*uint32)(unsafe.Pointer(addr)), uint32(v)))
+}
+
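+// Load is an Xadd of zero: the atomic add returns the current value
+// without modifying it.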
+//go:nosplit
+func Load(addr *uint32) uint32 {
+	return Xadd(addr, 0)
+}
+
+// Should be a built-in for unsafe.Pointer?
+//go:nosplit
+func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(p) + x)
+}
+
+//go:nosplit
+func Loadp(addr unsafe.Pointer) unsafe.Pointer {
+	return unsafe.Pointer(uintptr(Xadd((*uint32)(addr), 0)))
+}
+
+//go:nosplit
+func StorepNoWB(addr unsafe.Pointer, v unsafe.Pointer) {
+	for {
+		old := *(*unsafe.Pointer)(addr)
+		if Casp1((*unsafe.Pointer)(addr), old, v) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func Store(addr *uint32, v uint32) {
+	for {
+		old := *addr
+		if Cas(addr, old, v) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func Cas64(addr *uint64, old, new uint64) bool {
+	var ok bool
+	addrLock(addr).lock()
+	if *addr == old {
+		*addr = new
+		ok = true
+	}
+	addrLock(addr).unlock()
+	return ok
+}
+
+//go:nosplit
+func Xadd64(addr *uint64, delta int64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr + uint64(delta)
+	*addr = r
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Xchg64(addr *uint64, v uint64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr
+	*addr = v
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Load64(addr *uint64) uint64 {
+	var r uint64
+	addrLock(addr).lock()
+	r = *addr
+	addrLock(addr).unlock()
+	return r
+}
+
+//go:nosplit
+func Store64(addr *uint64, v uint64) {
+	addrLock(addr).lock()
+	*addr = v
+	addrLock(addr).unlock()
+}
+
+//go:nosplit
+func Or8(addr *uint8, v uint8) {
+	// Align down to 4 bytes and use 32-bit CAS.
+	uaddr := uintptr(unsafe.Pointer(addr))
+	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
+	word := uint32(v) << ((uaddr & 3) * 8) // little endian
+	for {
+		old := *addr32
+		if Cas(addr32, old, old|word) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func And8(addr *uint8, v uint8) {
+	// Align down to 4 bytes and use 32-bit CAS.
+	uaddr := uintptr(unsafe.Pointer(addr))
+	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
+	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
+	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
+	word |= ^mask
+	for {
+		old := *addr32
+		if Cas(addr32, old, old&word) {
+			return
+		}
+	}
+}
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
new file mode 100644
index 0000000..dc82c33
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -0,0 +1,80 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm64
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Load(ptr *uint32) uint32
+
+//go:noescape
+func Load64(ptr *uint64) uint64
+
+//go:noescape
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer
+
+//go:nosplit
+func Or8(addr *uint8, v uint8) {
+	// TODO(dfc) implement this in asm.
+	// Align down to 4 bytes and use 32-bit CAS.
+	uaddr := uintptr(unsafe.Pointer(addr))
+	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
+	word := uint32(v) << ((uaddr & 3) * 8) // little endian
+	for {
+		old := *addr32
+		if Cas(addr32, old, old|word) {
+			return
+		}
+	}
+}
+
+//go:nosplit
+func And8(addr *uint8, v uint8) {
+	// TODO(dfc) implement this in asm.
+	// Align down to 4 bytes and use 32-bit CAS.
+	uaddr := uintptr(unsafe.Pointer(addr))
+	addr32 := (*uint32)(unsafe.Pointer(uaddr &^ 3))
+	word := uint32(v) << ((uaddr & 3) * 8)    // little endian
+	mask := uint32(0xFF) << ((uaddr & 3) * 8) // little endian
+	word |= ^mask
+	for {
+		old := *addr32
+		if Cas(addr32, old, old&word) {
+			return
+		}
+	}
+}
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
new file mode 100644
index 0000000..eb32f37
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -0,0 +1,113 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// uint32 runtime∕internal∕atomic·Load(uint32 volatile* addr)
+TEXT ·Load(SB),NOSPLIT,$-8-12
+	MOVD	ptr+0(FP), R0
+	LDARW	(R0), R0
+	MOVW	R0, ret+8(FP)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* addr)
+TEXT ·Load64(SB),NOSPLIT,$-8-16
+	MOVD	ptr+0(FP), R0
+	LDAR	(R0), R0
+	MOVD	R0, ret+8(FP)
+	RET
+
+// void *runtime∕internal∕atomic·Loadp(void *volatile *addr)
+TEXT ·Loadp(SB),NOSPLIT,$-8-16
+	MOVD	ptr+0(FP), R0
+	LDAR	(R0), R0
+	MOVD	R0, ret+8(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
+	B	runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
+	MOVD	ptr+0(FP), R0
+	MOVW	val+8(FP), R1
+	STLRW	R1, (R0)
+	RET
+
+TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-16
+	MOVD	ptr+0(FP), R0
+	MOVD	val+8(FP), R1
+	STLR	R1, (R0)
+	RET
+
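+// Exchange via a load-acquire/store-release exclusive pair, retried
+// until the store-exclusive succeeds.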
+TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-20
+again:
+	MOVD	ptr+0(FP), R0
+	MOVW	new+8(FP), R1
+	LDAXRW	(R0), R2
+	STLXRW	R1, (R0), R3
+	CBNZ	R3, again
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchg64(SB), NOSPLIT, $0-24
+again:
+	MOVD	ptr+0(FP), R0
+	MOVD	new+8(FP), R1
+	LDAXR	(R0), R2
+	STLXR	R1, (R0), R3
+	CBNZ	R3, again
+	MOVD	R2, ret+16(FP)
+	RET
+
+// bool runtime∕internal∕atomic·Cas64(uint64 *ptr, uint64 old, uint64 new)
+// Atomically:
+//      if(*val == old){
+//              *val = new;
+//              return 1;
+//      } else {
+//              return 0;
+//      }
+TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-25
+	MOVD	ptr+0(FP), R0
+	MOVD	old+8(FP), R1
+	MOVD	new+16(FP), R2
+again:
+	LDAXR	(R0), R3
+	CMP	R1, R3
+	BNE	ok
+	STLXR	R2, (R0), R3
+	CBNZ	R3, again
+ok:
+	CSET	EQ, R0
+	MOVB	R0, ret+24(FP)
+	RET
+
+// uint32 xadd(uint32 volatile *ptr, int32 delta)
+// Atomically:
+//      *val += delta;
+//      return *val;
+TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-20
+again:
+	MOVD	ptr+0(FP), R0
+	MOVW	delta+8(FP), R1
+	LDAXRW	(R0), R2
+	ADDW	R2, R1, R2
+	STLXRW	R2, (R0), R3
+	CBNZ	R3, again
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-24
+again:
+	MOVD	ptr+0(FP), R0
+	MOVD	delta+8(FP), R1
+	LDAXR	(R0), R2
+	ADD	R2, R1, R2
+	STLXR	R2, (R0), R3
+	CBNZ	R3, again
+	MOVD	R2, ret+16(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-24
+	B	runtime∕internal∕atomic·Xchg64(SB)
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
new file mode 100644
index 0000000..d06ea48
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -0,0 +1,56 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Load(ptr *uint32) uint32
+
+//go:noescape
+func Load64(ptr *uint64) uint64
+
+//go:noescape
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s
new file mode 100644
index 0000000..71d3f7f
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_mips64x.s
@@ -0,0 +1,36 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+#define SYNC	WORD $0xf
+
+// uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
+TEXT ·Load(SB),NOSPLIT,$-8-12
+	MOVV	ptr+0(FP), R1
+	SYNC
+	MOVWU	0(R1), R1
+	SYNC
+	MOVW	R1, ret+8(FP)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* ptr)
+TEXT ·Load64(SB),NOSPLIT,$-8-16
+	MOVV	ptr+0(FP), R1
+	SYNC
+	MOVV	0(R1), R1
+	SYNC
+	MOVV	R1, ret+8(FP)
+	RET
+
+// void *runtime∕internal∕atomic·Loadp(void *volatile *ptr)
+TEXT ·Loadp(SB),NOSPLIT,$-8-16
+	MOVV	ptr+0(FP), R1
+	SYNC
+	MOVV	0(R1), R1
+	SYNC
+	MOVV	R1, ret+8(FP)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
new file mode 100644
index 0000000..72c98eb
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -0,0 +1,56 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Load(ptr *uint32) uint32
+
+//go:noescape
+func Load64(ptr *uint64) uint64
+
+//go:noescape
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
+
+//go:noescape
+func Store(ptr *uint32, val uint32)
+
+//go:noescape
+func Store64(ptr *uint64, val uint64)
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s
new file mode 100644
index 0000000..1a7537e
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_ppc64x.s
@@ -0,0 +1,40 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
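+// Each load below is followed by a compare of the result against
+// itself, a never-taken branch, and ISYNC; this cmp/bne/isync
+// sequence gives the load acquire ordering.
+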
+// uint32 runtime∕internal∕atomic·Load(uint32 volatile* addr)
+TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
+	MOVD	addr+0(FP), R3
+	SYNC
+	MOVWZ	0(R3), R3
+	CMPW	R3, R3, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	MOVW	R3, ret+8(FP)
+	RET
+
+// uint64 runtime∕internal∕atomic·Load64(uint64 volatile* addr)
+TEXT ·Load64(SB),NOSPLIT|NOFRAME,$-8-16
+	MOVD	addr+0(FP), R3
+	SYNC
+	MOVD	0(R3), R3
+	CMP	R3, R3, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	MOVD	R3, ret+8(FP)
+	RET
+
+// void *runtime∕internal∕atomic·Loadp(void *volatile *addr)
+TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$-8-16
+	MOVD	addr+0(FP), R3
+	SYNC
+	MOVD	0(R3), R3
+	CMP	R3, R3, CR7
+	BC	4, 30, 1(PC) // bne- cr7,0x4
+	ISYNC
+	MOVD	R3, ret+8(FP)
+	RET
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
new file mode 100644
index 0000000..9343853
--- /dev/null
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -0,0 +1,73 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import "unsafe"
+
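+// s390x is strongly ordered, so plain aligned loads and stores are
+// already atomic; //go:noinline keeps each access behind a call so
+// the compiler cannot reorder or combine it.
+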
+//go:nosplit
+//go:noinline
+func Load(ptr *uint32) uint32 {
+	return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func Loadp(ptr unsafe.Pointer) unsafe.Pointer {
+	return *(*unsafe.Pointer)(ptr)
+}
+
+//go:nosplit
+//go:noinline
+func Load64(ptr *uint64) uint64 {
+	return *ptr
+}
+
+//go:noinline
+//go:nosplit
+func Store(ptr *uint32, val uint32) {
+	*ptr = val
+}
+
+//go:noinline
+//go:nosplit
+func Store64(ptr *uint64, val uint64) {
+	*ptr = val
+}
+
+// NO go:noescape annotation; see atomic_pointer.go.
+//go:noinline
+//go:nosplit
+func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer) {
+	*(*uintptr)(ptr) = uintptr(val)
+}
+
+//go:noescape
+func And8(ptr *uint8, val uint8)
+
+//go:noescape
+func Or8(ptr *uint8, val uint8)
+
+// NOTE: Do not add atomicxor8 (XOR is not idempotent).
+
+//go:noescape
+func Xadd(ptr *uint32, delta int32) uint32
+
+//go:noescape
+func Xadd64(ptr *uint64, delta int64) uint64
+
+//go:noescape
+func Xadduintptr(ptr *uintptr, delta uintptr) uintptr
+
+//go:noescape
+func Xchg(ptr *uint32, new uint32) uint32
+
+//go:noescape
+func Xchg64(ptr *uint64, new uint64) uint64
+
+//go:noescape
+func Xchguintptr(ptr *uintptr, new uintptr) uintptr
+
+//go:noescape
+func Cas64(ptr *uint64, old, new uint64) bool
diff --git a/src/runtime/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
similarity index 79%
rename from src/runtime/atomic_test.go
rename to src/runtime/internal/atomic/atomic_test.go
index 2699103..d5dc552 100644
--- a/src/runtime/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -2,10 +2,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package runtime_test
+package atomic_test
 
 import (
 	"runtime"
+	"runtime/internal/atomic"
 	"testing"
 	"unsafe"
 )
@@ -32,34 +33,34 @@
 	inc := uintptr(100)
 	total := uintptr(0)
 	runParallel(N, iter, func() {
-		runtime.Xadduintptr(&total, inc)
+		atomic.Xadduintptr(&total, inc)
 	})
 	if want := uintptr(N * iter * inc); want != total {
 		t.Fatalf("xadduintpr error, want %d, got %d", want, total)
 	}
 	total = 0
 	runParallel(N, iter, func() {
-		runtime.Xadduintptr(&total, inc)
-		runtime.Xadduintptr(&total, uintptr(-int64(inc)))
+		atomic.Xadduintptr(&total, inc)
+		atomic.Xadduintptr(&total, uintptr(-int64(inc)))
 	})
 	if total != 0 {
 		t.Fatalf("xadduintpr total error, want %d, got %d", 0, total)
 	}
 }
 
-// Tests that xadduintptr correctly updates 64-bit values.  The place where
+// Tests that xadduintptr correctly updates 64-bit values. The place where
 // we actually do so is mstats.go, functions mSysStat{Inc,Dec}.
 func TestXadduintptrOnUint64(t *testing.T) {
-	if runtime.BigEndian != 0 {
+	/*	if runtime.BigEndian != 0 {
 		// On big endian architectures, we never use xadduintptr to update
 		// 64-bit values and hence we skip the test.  (Note that functions
 		// mSysStat{Inc,Dec} in mstats.go have explicit checks for
 		// big-endianness.)
 		return
-	}
+	}*/
 	const inc = 100
 	val := uint64(0)
-	runtime.Xadduintptr((*uintptr)(unsafe.Pointer(&val)), inc)
+	atomic.Xadduintptr((*uintptr)(unsafe.Pointer(&val)), inc)
 	if inc != val {
 		t.Fatalf("xadduintptr should increase lower-order bits, want %d, got %d", inc, val)
 	}
diff --git a/src/runtime/internal/atomic/stubs.go b/src/runtime/internal/atomic/stubs.go
new file mode 100644
index 0000000..497b980
--- /dev/null
+++ b/src/runtime/internal/atomic/stubs.go
@@ -0,0 +1,33 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+import "unsafe"
+
+//go:noescape
+func Cas(ptr *uint32, old, new uint32) bool
+
+// NO go:noescape annotation; see atomic_pointer.go.
+func Casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
+
+//go:noescape
+func Casuintptr(ptr *uintptr, old, new uintptr) bool
+
+//go:noescape
+func Storeuintptr(ptr *uintptr, new uintptr)
+
+//go:noescape
+func Loaduintptr(ptr *uintptr) uintptr
+
+//go:noescape
+func Loaduint(ptr *uint) uint
+
+// TODO(matloob): Should these functions have the go:noescape annotation?
+
+//go:noescape
+func Loadint64(ptr *int64) int64
+
+//go:noescape
+func Xaddint64(ptr *int64, delta int64) int64
diff --git a/src/runtime/internal/atomic/sys_darwin_arm.s b/src/runtime/internal/atomic/sys_darwin_arm.s
new file mode 100644
index 0000000..01b7aef
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_darwin_arm.s
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
diff --git a/src/runtime/internal/atomic/sys_freebsd_arm.s b/src/runtime/internal/atomic/sys_freebsd_arm.s
new file mode 100644
index 0000000..30d49b8
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_freebsd_arm.s
@@ -0,0 +1,19 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// TODO(minux): this is only valid for ARMv6+
+// bool armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
diff --git a/src/runtime/internal/atomic/sys_linux_arm.s b/src/runtime/internal/atomic/sys_linux_arm.s
new file mode 100644
index 0000000..f8de2a2
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_linux_arm.s
@@ -0,0 +1,42 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Use kernel version instead of native armcas in asm_arm.s.
+// See ../../../sync/atomic/asm_linux_arm.s for details.
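+// 0xffff0fc0 is the entry point of the kernel's __kuser_cmpxchg
+// helper in the vector page mapped into every process.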
+TEXT cas<>(SB),NOSPLIT,$0
+	MOVW	$0xffff0fc0, R15 // R15 is hardware PC.
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	MOVW	ptr+0(FP), R2
+	MOVW	old+4(FP), R0
+loop:
+	MOVW	new+8(FP), R1
+	BL	cas<>(SB)
+	BCC	check
+	MOVW	$1, R0
+	MOVB	R0, ret+12(FP)
+	RET
+check:
+	// Kernel lies; double-check.
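+	// The helper may report failure spuriously even when *ptr == old,
+	// so retry as long as the values still match.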
+	MOVW	ptr+0(FP), R2
+	MOVW	old+4(FP), R0
+	MOVW	0(R2), R3
+	CMP	R0, R3
+	BEQ	loop
+	MOVW	$0, R0
+	MOVB	R0, ret+12(FP)
+	RET
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
+
+// As for cas, memory barriers are complicated on ARM, but the kernel
+// provides a user helper. ARMv5 does not support SMP and has no
+// memory barrier instruction at all. ARMv6 added SMP support and has
+// a memory barrier, but it requires writing to a coprocessor
+// register. ARMv7 introduced the DMB instruction, but it's expensive
+// even on single-core devices. The kernel helper takes care of all of
+// this for us.
\ No newline at end of file
diff --git a/src/runtime/internal/atomic/sys_nacl_arm.s b/src/runtime/internal/atomic/sys_nacl_arm.s
new file mode 100644
index 0000000..efa9604
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_nacl_arm.s
@@ -0,0 +1,16 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Casp(SB),NOSPLIT,$0
+	B	runtime·cas(SB)
+
+// This is only valid for ARMv6+, however, NaCl/ARM is only defined
+// for ARMv7A anyway.
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
diff --git a/src/runtime/internal/atomic/sys_netbsd_arm.s b/src/runtime/internal/atomic/sys_netbsd_arm.s
new file mode 100644
index 0000000..3277d94
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_netbsd_arm.s
@@ -0,0 +1,21 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// TODO(minux): this is only valid for ARMv6+
+// bool Armcas(int32 *val, int32 old, int32 new)
+// Atomically:
+//	if(*val == old){
+//		*val = new;
+//		return 1;
+//	}else
+//		return 0;
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
+
+
diff --git a/src/runtime/internal/atomic/sys_openbsd_arm.s b/src/runtime/internal/atomic/sys_openbsd_arm.s
new file mode 100644
index 0000000..01b7aef
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_openbsd_arm.s
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
diff --git a/src/runtime/internal/atomic/sys_plan9_arm.s b/src/runtime/internal/atomic/sys_plan9_arm.s
new file mode 100644
index 0000000..01b7aef
--- /dev/null
+++ b/src/runtime/internal/atomic/sys_plan9_arm.s
@@ -0,0 +1,11 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕atomic·Cas(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·armcas(SB)
+
+TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0
+	B	runtime∕internal∕atomic·Cas(SB)
diff --git a/src/runtime/internal/sys/arch.go b/src/runtime/internal/sys/arch.go
new file mode 100644
index 0000000..c175704
--- /dev/null
+++ b/src/runtime/internal/sys/arch.go
@@ -0,0 +1,17 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+type ArchFamilyType int
+
+const (
+	AMD64 ArchFamilyType = iota
+	ARM
+	ARM64
+	I386
+	MIPS64
+	PPC64
+	S390X
+)
diff --git a/src/runtime/internal/sys/arch_386.go b/src/runtime/internal/sys/arch_386.go
new file mode 100644
index 0000000..48c42f7
--- /dev/null
+++ b/src/runtime/internal/sys/arch_386.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = I386
+	BigEndian     = 0
+	CacheLineSize = 64
+	PhysPageSize  = GoosNacl*65536 + (1-GoosNacl)*4096 // 4k normally; 64k on NaCl
+	PCQuantum     = 1
+	Int64Align    = 4
+	HugePageSize  = 1 << 21
+	MinFrameSize  = 0
+)
+
+type Uintreg uint32
diff --git a/src/runtime/internal/sys/arch_amd64.go b/src/runtime/internal/sys/arch_amd64.go
new file mode 100644
index 0000000..1bbdb99
--- /dev/null
+++ b/src/runtime/internal/sys/arch_amd64.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = AMD64
+	BigEndian     = 0
+	CacheLineSize = 64
+	PhysPageSize  = 4096
+	PCQuantum     = 1
+	Int64Align    = 8
+	HugePageSize  = 1 << 21
+	MinFrameSize  = 0
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_amd64p32.go b/src/runtime/internal/sys/arch_amd64p32.go
new file mode 100644
index 0000000..b7011a4
--- /dev/null
+++ b/src/runtime/internal/sys/arch_amd64p32.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = AMD64
+	BigEndian     = 0
+	CacheLineSize = 64
+	PhysPageSize  = 65536*GoosNacl + 4096*(1-GoosNacl)
+	PCQuantum     = 1
+	Int64Align    = 8
+	HugePageSize  = 1 << 21
+	MinFrameSize  = 0
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_arm.go b/src/runtime/internal/sys/arch_arm.go
new file mode 100644
index 0000000..f90f52d
--- /dev/null
+++ b/src/runtime/internal/sys/arch_arm.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = ARM
+	BigEndian     = 0
+	CacheLineSize = 32
+	PhysPageSize  = 65536*GoosNacl + 4096*(1-GoosNacl)
+	PCQuantum     = 4
+	Int64Align    = 4
+	HugePageSize  = 0
+	MinFrameSize  = 4
+)
+
+type Uintreg uint32
diff --git a/src/runtime/internal/sys/arch_arm64.go b/src/runtime/internal/sys/arch_arm64.go
new file mode 100644
index 0000000..aaaa4b0
--- /dev/null
+++ b/src/runtime/internal/sys/arch_arm64.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = ARM64
+	BigEndian     = 0
+	CacheLineSize = 32
+	PhysPageSize  = 65536
+	PCQuantum     = 4
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 8
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_mips64.go b/src/runtime/internal/sys/arch_mips64.go
new file mode 100644
index 0000000..d567259
--- /dev/null
+++ b/src/runtime/internal/sys/arch_mips64.go
@@ -0,0 +1,18 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = MIPS64
+	BigEndian     = 1
+	CacheLineSize = 32
+	PhysPageSize  = 16384
+	PCQuantum     = 4
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 8
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_mips64le.go b/src/runtime/internal/sys/arch_mips64le.go
new file mode 100644
index 0000000..f8cdf2b
--- /dev/null
+++ b/src/runtime/internal/sys/arch_mips64le.go
@@ -0,0 +1,18 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = MIPS64
+	BigEndian     = 0
+	CacheLineSize = 32
+	PhysPageSize  = 16384
+	PCQuantum     = 4
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 8
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_ppc64.go b/src/runtime/internal/sys/arch_ppc64.go
new file mode 100644
index 0000000..cdec63f
--- /dev/null
+++ b/src/runtime/internal/sys/arch_ppc64.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = PPC64
+	BigEndian     = 1
+	CacheLineSize = 64
+	PhysPageSize  = 65536
+	PCQuantum     = 4
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 32
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_ppc64le.go b/src/runtime/internal/sys/arch_ppc64le.go
new file mode 100644
index 0000000..4fd68f9
--- /dev/null
+++ b/src/runtime/internal/sys/arch_ppc64le.go
@@ -0,0 +1,18 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = PPC64
+	BigEndian     = 0
+	CacheLineSize = 64
+	PhysPageSize  = 65536
+	PCQuantum     = 4
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 32
+)
+
+type Uintreg uint64
diff --git a/src/runtime/internal/sys/arch_s390x.go b/src/runtime/internal/sys/arch_s390x.go
new file mode 100644
index 0000000..ca1cb86
--- /dev/null
+++ b/src/runtime/internal/sys/arch_s390x.go
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+const (
+	ArchFamily    = S390X
+	BigEndian     = 1
+	CacheLineSize = 256
+	PhysPageSize  = 4096
+	PCQuantum     = 2
+	Int64Align    = 8
+	HugePageSize  = 0
+	MinFrameSize  = 8
+)
+
+type Uintreg uint64
diff --git a/src/runtime/gengoos.go b/src/runtime/internal/sys/gengoos.go
similarity index 79%
rename from src/runtime/gengoos.go
rename to src/runtime/internal/sys/gengoos.go
index 06621c8..4c45c0a 100644
--- a/src/runtime/gengoos.go
+++ b/src/runtime/internal/sys/gengoos.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -18,7 +18,7 @@
 var gooses, goarches []string
 
 func main() {
-	data, err := ioutil.ReadFile("../go/build/syslist.go")
+	data, err := ioutil.ReadFile("../../../go/build/syslist.go")
 	if err != nil {
 		log.Fatal(err)
 	}
@@ -49,14 +49,14 @@
 		if target == "linux" {
 			fmt.Fprintf(&buf, "// +build !android\n\n") // must explicitly exclude android for linux
 		}
-		fmt.Fprintf(&buf, "package runtime\n\n")
-		fmt.Fprintf(&buf, "const theGoos = `%s`\n\n", target)
+		fmt.Fprintf(&buf, "package sys\n\n")
+		fmt.Fprintf(&buf, "const GOOS = `%s`\n\n", target)
 		for _, goos := range gooses {
 			value := 0
 			if goos == target {
 				value = 1
 			}
-			fmt.Fprintf(&buf, "const goos_%s = %d\n", goos, value)
+			fmt.Fprintf(&buf, "const Goos%s = %d\n", strings.Title(goos), value)
 		}
 		err := ioutil.WriteFile("zgoos_"+target+".go", buf.Bytes(), 0666)
 		if err != nil {
@@ -67,14 +67,14 @@
 	for _, target := range goarches {
 		var buf bytes.Buffer
 		fmt.Fprintf(&buf, "// generated by gengoos.go using 'go generate'\n\n")
-		fmt.Fprintf(&buf, "package runtime\n\n")
-		fmt.Fprintf(&buf, "const theGoarch = `%s`\n\n", target)
+		fmt.Fprintf(&buf, "package sys\n\n")
+		fmt.Fprintf(&buf, "const GOARCH = `%s`\n\n", target)
 		for _, goarch := range goarches {
 			value := 0
 			if goarch == target {
 				value = 1
 			}
-			fmt.Fprintf(&buf, "const goarch_%s = %d\n", goarch, value)
+			fmt.Fprintf(&buf, "const Goarch%s = %d\n", strings.Title(goarch), value)
 		}
 		err := ioutil.WriteFile("zgoarch_"+target+".go", buf.Bytes(), 0666)
 		if err != nil {
diff --git a/src/runtime/internal/sys/intrinsics.go b/src/runtime/internal/sys/intrinsics.go
new file mode 100644
index 0000000..08a062f
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics.go
@@ -0,0 +1,116 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !386
+
+package sys
+
+// Using techniques from http://supertech.csail.mit.edu/papers/debruijn.pdf
+
+const deBruijn64 = 0x0218a392cd3d5dbf
+
+var deBruijnIdx64 = [64]byte{
+	0, 1, 2, 7, 3, 13, 8, 19,
+	4, 25, 14, 28, 9, 34, 20, 40,
+	5, 17, 26, 38, 15, 46, 29, 48,
+	10, 31, 35, 54, 21, 50, 41, 57,
+	63, 6, 12, 18, 24, 27, 33, 39,
+	16, 37, 45, 47, 30, 53, 49, 56,
+	62, 11, 23, 32, 36, 44, 52, 55,
+	61, 22, 43, 51, 60, 42, 59, 58,
+}
+
+const deBruijn32 = 0x04653adf
+
+var deBruijnIdx32 = [32]byte{
+	0, 1, 2, 6, 3, 11, 7, 16,
+	4, 14, 12, 21, 8, 23, 17, 26,
+	31, 5, 10, 15, 13, 20, 22, 25,
+	30, 9, 19, 24, 29, 18, 28, 27,
+}
+
+const deBruijn16 = 0x09af
+
+var deBruijnIdx16 = [16]byte{
+	0, 1, 2, 5, 3, 9, 6, 11,
+	15, 4, 8, 10, 14, 7, 13, 12,
+}
+
+const deBruijn8 = 0x17
+
+var deBruijnIdx8 = [8]byte{
+	0, 1, 2, 4, 7, 3, 6, 5,
+}
+
+// Ctz64 counts trailing (low-order) zeroes,
+// and if all are zero, then 64.
+func Ctz64(x uint64) uint64 {
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn64 >> 58    // extract part of deBruijn sequence
+	y = uint64(deBruijnIdx64[y]) // convert to bit index
+	z := (x - 1) >> 57 & 64      // adjustment if zero
+	return y + z
+}
+
+// Ctz32 counts trailing (low-order) zeroes,
+// and if all are zero, then 32.
+func Ctz32(x uint32) uint32 {
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn32 >> 27    // extract part of deBruijn sequence
+	y = uint32(deBruijnIdx32[y]) // convert to bit index
+	z := (x - 1) >> 26 & 32      // adjustment if zero
+	return y + z
+}
+
+// Ctz16 counts trailing (low-order) zeroes,
+// and if all are zero, then 16.
+func Ctz16(x uint16) uint16 {
+	x &= -x                      // isolate low-order bit
+	y := x * deBruijn16 >> 12    // extract part of deBruijn sequence
+	y = uint16(deBruijnIdx16[y]) // convert to bit index
+	z := (x - 1) >> 11 & 16      // adjustment if zero
+	return y + z
+}
+
+// Ctz8 counts trailing (low-order) zeroes,
+// and if all are zero, then 8.
+func Ctz8(x uint8) uint8 {
+	x &= -x                    // isolate low-order bit
+	y := x * deBruijn8 >> 5    // extract part of deBruijn sequence
+	y = uint8(deBruijnIdx8[y]) // convert to bit index
+	z := (x - 1) >> 4 & 8      // adjustment if zero
+	return y + z
+}
+
+// Bswap64 returns its input with byte order reversed
+// 0x0102030405060708 -> 0x0807060504030201
+func Bswap64(x uint64) uint64 {
+	c8 := uint64(0x00ff00ff00ff00ff)
+	a := x >> 8 & c8
+	b := (x & c8) << 8
+	x = a | b
+	c16 := uint64(0x0000ffff0000ffff)
+	a = x >> 16 & c16
+	b = (x & c16) << 16
+	x = a | b
+	c32 := uint64(0x00000000ffffffff)
+	a = x >> 32 & c32
+	b = (x & c32) << 32
+	x = a | b
+	return x
+}
+
+// Bswap32 returns its input with byte order reversed
+// 0x01020304 -> 0x04030201
+func Bswap32(x uint32) uint32 {
+	c8 := uint32(0x00ff00ff)
+	a := x >> 8 & c8
+	b := (x & c8) << 8
+	x = a | b
+	c16 := uint32(0x0000ffff)
+	a = x >> 16 & c16
+	b = (x & c16) << 16
+	x = a | b
+	return x
+}
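
A worked instance of the de Bruijn trick used above: x &= -x reduces the input to its lowest set bit 1<<k, multiplying by the constant rotates a unique window of the sequence into the top bits, and the index table maps that window back to k; the (x-1)>>26&32 term contributes the 32 needed when the input was zero. A self-contained copy of the 32-bit case, since runtime/internal/sys is not importable from user code:

	package main

	import "fmt"

	// Stand-alone duplicate of the 32-bit table and logic from intrinsics.go.
	const deBruijn32 = 0x04653adf

	var deBruijnIdx32 = [32]byte{
		0, 1, 2, 6, 3, 11, 7, 16,
		4, 14, 12, 21, 8, 23, 17, 26,
		31, 5, 10, 15, 13, 20, 22, 25,
		30, 9, 19, 24, 29, 18, 28, 27,
	}

	func ctz32(x uint32) uint32 {
		x &= -x                   // keep only the lowest set bit: 1<<k (or 0)
		y := x * deBruijn32 >> 27 // top 5 bits: unique window of the sequence
		z := (x - 1) >> 26 & 32   // adds 32 exactly when the input was 0
		return uint32(deBruijnIdx32[y]) + z
	}

	func main() {
		fmt.Println(ctz32(80)) // 80 = 0x50, lowest set bit is bit 4: prints 4
		fmt.Println(ctz32(0))  // no bits set: prints 32
	}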
diff --git a/src/runtime/internal/sys/intrinsics_386.s b/src/runtime/internal/sys/intrinsics_386.s
new file mode 100644
index 0000000..1f48e26
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics_386.s
@@ -0,0 +1,68 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT runtime∕internal∕sys·Ctz64(SB), NOSPLIT, $0-16
+	MOVL	$0, ret_hi+12(FP)
+
+	// Try low 32 bits.
+	MOVL	x_lo+0(FP), AX
+	BSFL	AX, AX
+	JZ	tryhigh
+	MOVL	AX, ret_lo+8(FP)
+	RET
+
+tryhigh:
+	// Try high 32 bits.
+	MOVL	x_hi+4(FP), AX
+	BSFL	AX, AX
+	JZ	none
+	ADDL	$32, AX
+	MOVL	AX, ret_lo+8(FP)
+	RET
+
+none:
+	// No bits are set.
+	MOVL	$64, ret_lo+8(FP)
+	RET
+
+TEXT runtime∕internal∕sys·Ctz32(SB), NOSPLIT, $0-8
+	MOVL	x+0(FP), AX
+	BSFL	AX, AX
+	JNZ	2(PC)
+	MOVL	$32, AX
+	MOVL	AX, ret+4(FP)
+	RET
+
+TEXT runtime∕internal∕sys·Ctz16(SB), NOSPLIT, $0-6
+	MOVW	x+0(FP), AX
+	BSFW	AX, AX
+	JNZ	2(PC)
+	MOVW	$16, AX
+	MOVW	AX, ret+4(FP)
+	RET
+
+TEXT runtime∕internal∕sys·Ctz8(SB), NOSPLIT, $0-5
+	MOVBLZX	x+0(FP), AX
+	BSFL	AX, AX
+	JNZ	2(PC)
+	MOVB	$8, AX
+	MOVB	AX, ret+4(FP)
+	RET
+
+TEXT runtime∕internal∕sys·Bswap64(SB), NOSPLIT, $0-16
+	MOVL	x_lo+0(FP), AX
+	MOVL	x_hi+4(FP), BX
+	BSWAPL	AX
+	BSWAPL	BX
+	MOVL	BX, ret_lo+8(FP)
+	MOVL	AX, ret_hi+12(FP)
+	RET
+
+TEXT runtime∕internal∕sys·Bswap32(SB), NOSPLIT, $0-8
+	MOVL	x+0(FP), AX
+	BSWAPL	AX
+	MOVL	AX, ret+4(FP)
+	RET
diff --git a/src/runtime/internal/sys/intrinsics_stubs.go b/src/runtime/internal/sys/intrinsics_stubs.go
new file mode 100644
index 0000000..079844f
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics_stubs.go
@@ -0,0 +1,14 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386
+
+package sys
+
+func Ctz64(x uint64) uint64
+func Ctz32(x uint32) uint32
+func Ctz16(x uint16) uint16
+func Ctz8(x uint8) uint8
+func Bswap64(x uint64) uint64
+func Bswap32(x uint32) uint32
diff --git a/src/runtime/internal/sys/intrinsics_test.go b/src/runtime/internal/sys/intrinsics_test.go
new file mode 100644
index 0000000..097631b
--- /dev/null
+++ b/src/runtime/internal/sys/intrinsics_test.go
@@ -0,0 +1,54 @@
+package sys_test
+
+import (
+	"runtime/internal/sys"
+	"testing"
+)
+
+func TestCtz64(t *testing.T) {
+	for i := uint(0); i <= 64; i++ {
+		x := uint64(5) << i
+		if got := sys.Ctz64(x); got != uint64(i) {
+			t.Errorf("Ctz64(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz32(t *testing.T) {
+	for i := uint(0); i <= 32; i++ {
+		x := uint32(5) << i
+		if got := sys.Ctz32(x); got != uint32(i) {
+			t.Errorf("Ctz32(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz16(t *testing.T) {
+	for i := uint(0); i <= 16; i++ {
+		x := uint16(5) << i
+		if got := sys.Ctz16(x); got != uint16(i) {
+			t.Errorf("Ctz16(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+func TestCtz8(t *testing.T) {
+	for i := uint(0); i <= 8; i++ {
+		x := uint8(5) << i
+		if got := sys.Ctz8(x); got != uint8(i) {
+			t.Errorf("Ctz8(%d)=%d, want %d", x, got, i)
+		}
+	}
+}
+
+func TestBswap64(t *testing.T) {
+	x := uint64(0x1122334455667788)
+	y := sys.Bswap64(x)
+	if y != 0x8877665544332211 {
+		t.Errorf("Bswap(%x)=%x, want 0x8877665544332211", x, y)
+	}
+}
+func TestBswap32(t *testing.T) {
+	x := uint32(0x11223344)
+	y := sys.Bswap32(x)
+	if y != 0x44332211 {
+		t.Errorf("Bswap(%x)=%x, want 0x44332211", x, y)
+	}
+}
diff --git a/src/runtime/internal/sys/stubs.go b/src/runtime/internal/sys/stubs.go
new file mode 100644
index 0000000..0a94502
--- /dev/null
+++ b/src/runtime/internal/sys/stubs.go
@@ -0,0 +1,11 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package sys
+
+// Declarations for runtime services implemented in C or assembly.
+
+const PtrSize = 4 << (^uintptr(0) >> 63)           // unsafe.Sizeof(uintptr(0)) but an ideal const
+const RegSize = 4 << (^Uintreg(0) >> 63)           // unsafe.Sizeof(uintreg(0)) but an ideal const
+const SpAlign = 1*(1-GoarchArm64) + 16*GoarchArm64 // SP alignment: 1 normally, 16 for ARM64
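
The PtrSize expression is a branch-free constant trick worth spelling out: ^uintptr(0) is all ones, so shifting it right by 63 yields 1 when uintptr is 64 bits wide and 0 when it is 32, making the whole expression 8 or 4 as an untyped ("ideal") constant. The same idiom stand-alone:

	package main

	import "fmt"

	// 4 << 1 == 8 on a 64-bit uintptr; 4 << 0 == 4 on a 32-bit one. Because
	// it is a constant expression, it can size arrays and derive other
	// constants, which a runtime value of unsafe.Sizeof could not.
	const ptrSize = 4 << (^uintptr(0) >> 63)

	func main() {
		fmt.Println(ptrSize)
	}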
diff --git a/src/runtime/internal/sys/sys.go b/src/runtime/internal/sys/sys.go
new file mode 100644
index 0000000..586a763
--- /dev/null
+++ b/src/runtime/internal/sys/sys.go
@@ -0,0 +1,15 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package sys contains system-, configuration-, and architecture-specific
+// constants used by the runtime.
+package sys
+
+// The next line makes 'go generate' write the zgoos_*.go and zgoarch_*.go
+// files with per-OS and per-arch information, including constants
+// named Goos$GOOS and Goarch$GOARCH for every
+// known GOOS and GOARCH. Each constant is 1 on the
+// current system and 0 otherwise; multiplying by them is
+// useful for defining GOOS- or GOARCH-specific constants.
+//go:generate go run gengoos.go
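
These generated 0/1 flags exist so platform-specific constants can be expressed as arithmetic instead of per-file build tags, exactly how arch_386.go above computes PhysPageSize. The pattern, sketched with hypothetical local flags in place of the real sys.Goos* constants:

	package main

	import "fmt"

	// Hypothetical stand-ins for the generated sys.Goos* flags;
	// exactly one of them is 1 on any given build.
	const (
		goosNacl  = 0
		goosLinux = 1
	)

	// Branch-free selection: 65536 when building for NaCl, 4096 otherwise,
	// mirroring PhysPageSize = GoosNacl*65536 + (1-GoosNacl)*4096 above.
	const physPageSize = goosNacl*65536 + (1-goosNacl)*4096

	func main() {
		fmt.Println(physPageSize) // 4096 with the flags above
	}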
diff --git a/src/runtime/internal/sys/zgoarch_386.go b/src/runtime/internal/sys/zgoarch_386.go
new file mode 100644
index 0000000..3bcf83b
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_386.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `386`
+
+const Goarch386 = 1
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_amd64.go b/src/runtime/internal/sys/zgoarch_amd64.go
new file mode 100644
index 0000000..699f191
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_amd64.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `amd64`
+
+const Goarch386 = 0
+const GoarchAmd64 = 1
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_amd64p32.go b/src/runtime/internal/sys/zgoarch_amd64p32.go
new file mode 100644
index 0000000..cc2d658
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_amd64p32.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `amd64p32`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 1
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_arm.go b/src/runtime/internal/sys/zgoarch_arm.go
new file mode 100644
index 0000000..a5fd789
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_arm.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `arm`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 1
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_arm64.go b/src/runtime/internal/sys/zgoarch_arm64.go
new file mode 100644
index 0000000..084d2c7
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_arm64.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `arm64`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 1
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_mips64.go b/src/runtime/internal/sys/zgoarch_mips64.go
new file mode 100644
index 0000000..2ad62bd
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_mips64.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `mips64`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 1
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_mips64le.go b/src/runtime/internal/sys/zgoarch_mips64le.go
new file mode 100644
index 0000000..047c8b4
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_mips64le.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `mips64le`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 1
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_ppc64.go b/src/runtime/internal/sys/zgoarch_ppc64.go
new file mode 100644
index 0000000..748b5b5
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_ppc64.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `ppc64`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 1
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_ppc64le.go b/src/runtime/internal/sys/zgoarch_ppc64le.go
new file mode 100644
index 0000000..d3dcba4
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_ppc64le.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `ppc64le`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 1
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 0
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoarch_s390x.go b/src/runtime/internal/sys/zgoarch_s390x.go
new file mode 100644
index 0000000..1ead5d5
--- /dev/null
+++ b/src/runtime/internal/sys/zgoarch_s390x.go
@@ -0,0 +1,26 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOARCH = `s390x`
+
+const Goarch386 = 0
+const GoarchAmd64 = 0
+const GoarchAmd64p32 = 0
+const GoarchArm = 0
+const GoarchArmbe = 0
+const GoarchArm64 = 0
+const GoarchArm64be = 0
+const GoarchPpc64 = 0
+const GoarchPpc64le = 0
+const GoarchMips = 0
+const GoarchMipsle = 0
+const GoarchMips64 = 0
+const GoarchMips64le = 0
+const GoarchMips64p32 = 0
+const GoarchMips64p32le = 0
+const GoarchPpc = 0
+const GoarchS390 = 0
+const GoarchS390x = 1
+const GoarchSparc = 0
+const GoarchSparc64 = 0
diff --git a/src/runtime/internal/sys/zgoos_android.go b/src/runtime/internal/sys/zgoos_android.go
new file mode 100644
index 0000000..6503b15
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_android.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `android`
+
+const GoosAndroid = 1
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_darwin.go b/src/runtime/internal/sys/zgoos_darwin.go
new file mode 100644
index 0000000..6a28598
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_darwin.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `darwin`
+
+const GoosAndroid = 0
+const GoosDarwin = 1
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_dragonfly.go b/src/runtime/internal/sys/zgoos_dragonfly.go
new file mode 100644
index 0000000..886ac26
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_dragonfly.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `dragonfly`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 1
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_freebsd.go b/src/runtime/internal/sys/zgoos_freebsd.go
new file mode 100644
index 0000000..0bf2403
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_freebsd.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `freebsd`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 1
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_linux.go b/src/runtime/internal/sys/zgoos_linux.go
new file mode 100644
index 0000000..c8664db
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_linux.go
@@ -0,0 +1,19 @@
+// generated by gengoos.go using 'go generate'
+
+// +build !android
+
+package sys
+
+const GOOS = `linux`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 1
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_nacl.go b/src/runtime/internal/sys/zgoos_nacl.go
new file mode 100644
index 0000000..0541226
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_nacl.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `nacl`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 1
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_netbsd.go b/src/runtime/internal/sys/zgoos_netbsd.go
new file mode 100644
index 0000000..5c509a1
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_netbsd.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `netbsd`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 1
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_openbsd.go b/src/runtime/internal/sys/zgoos_openbsd.go
new file mode 100644
index 0000000..dc43157
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_openbsd.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `openbsd`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 1
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_plan9.go b/src/runtime/internal/sys/zgoos_plan9.go
new file mode 100644
index 0000000..4b0934f
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_plan9.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `plan9`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 1
+const GoosSolaris = 0
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_solaris.go b/src/runtime/internal/sys/zgoos_solaris.go
new file mode 100644
index 0000000..42511a3
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_solaris.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `solaris`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 1
+const GoosWindows = 0
diff --git a/src/runtime/internal/sys/zgoos_windows.go b/src/runtime/internal/sys/zgoos_windows.go
new file mode 100644
index 0000000..d77f62c
--- /dev/null
+++ b/src/runtime/internal/sys/zgoos_windows.go
@@ -0,0 +1,17 @@
+// generated by gengoos.go using 'go generate'
+
+package sys
+
+const GOOS = `windows`
+
+const GoosAndroid = 0
+const GoosDarwin = 0
+const GoosDragonfly = 0
+const GoosFreebsd = 0
+const GoosLinux = 0
+const GoosNacl = 0
+const GoosNetbsd = 0
+const GoosOpenbsd = 0
+const GoosPlan9 = 0
+const GoosSolaris = 0
+const GoosWindows = 1
diff --git a/src/runtime/internal/sys/zversion.go b/src/runtime/internal/sys/zversion.go
new file mode 100644
index 0000000..c34564c
--- /dev/null
+++ b/src/runtime/internal/sys/zversion.go
@@ -0,0 +1,9 @@
+// auto generated by go tool dist
+
+package sys
+
+const DefaultGoroot = `./prebuilts/go/linux-x86`
+const TheVersion = `go1.7rc1`
+const Goexperiment = ``
+const StackGuardMultiplier = 1
+
diff --git a/src/runtime/lfstack.go b/src/runtime/lfstack.go
index 5838c1d..8e33ce1 100644
--- a/src/runtime/lfstack.go
+++ b/src/runtime/lfstack.go
@@ -3,23 +3,29 @@
 // license that can be found in the LICENSE file.
 
 // Lock-free stack.
-// The following code runs only on g0 stack.
+// Initialize head to 0, compare with 0 to test for emptiness.
+// The stack does not keep pointers to nodes,
+// so they can be garbage collected if there are no other pointers to nodes.
+// The following code runs only in non-preemptible contexts.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 func lfstackpush(head *uint64, node *lfnode) {
 	node.pushcnt++
 	new := lfstackPack(node, node.pushcnt)
-	if node1, _ := lfstackUnpack(new); node1 != node {
-		println("runtime: lfstackpush invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
+	if node1 := lfstackUnpack(new); node1 != node {
+		print("runtime: lfstackpush invalid packing: node=", node, " cnt=", hex(node.pushcnt), " packed=", hex(new), " -> node=", node1, "\n")
 		throw("lfstackpush")
 	}
 	for {
-		old := atomicload64(head)
+		old := atomic.Load64(head)
 		node.next = old
-		if cas64(head, old, new) {
+		if atomic.Cas64(head, old, new) {
 			break
 		}
 	}
@@ -27,13 +33,13 @@
 
 func lfstackpop(head *uint64) unsafe.Pointer {
 	for {
-		old := atomicload64(head)
+		old := atomic.Load64(head)
 		if old == 0 {
 			return nil
 		}
-		node, _ := lfstackUnpack(old)
-		next := atomicload64(&node.next)
-		if cas64(head, old, next) {
+		node := lfstackUnpack(old)
+		next := atomic.Load64(&node.next)
+		if atomic.Cas64(head, old, next) {
 			return unsafe.Pointer(node)
 		}
 	}
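
Push packs a node pointer together with its per-node push count into one uint64 (the count is what defends the CAS against ABA when a node is popped and immediately pushed again), and pop swings head to the node's saved next. A sketch of how runtime code drives the pair; workbuf here is a hypothetical stand-in for the GC's real work-buffer type, not code from this change:

	// Illustrative only: this shape only works inside the runtime package.
	type workbuf struct {
		node lfnode // must be first: lfstackpop returns a pointer to it
		obj  [64]uintptr
	}

	var pool uint64 // lock-free stack head; 0 means empty

	func putbuf(b *workbuf) {
		lfstackpush(&pool, &b.node)
	}

	func getbuf() *workbuf {
		p := lfstackpop(&pool)
		if p == nil {
			return nil // stack was empty
		}
		return (*workbuf)(p)
	}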
diff --git a/src/runtime/lfstack_32bit.go b/src/runtime/lfstack_32bit.go
index 4b8bcba..2f59e02 100644
--- a/src/runtime/lfstack_32bit.go
+++ b/src/runtime/lfstack_32bit.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -14,8 +14,6 @@
 	return uint64(uintptr(unsafe.Pointer(node)))<<32 | uint64(cnt)
 }
 
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> 32)))
-	cnt = uintptr(val)
-	return
+func lfstackUnpack(val uint64) *lfnode {
+	return (*lfnode)(unsafe.Pointer(uintptr(val >> 32)))
 }
diff --git a/src/runtime/lfstack_64bit.go b/src/runtime/lfstack_64bit.go
new file mode 100644
index 0000000..5367f08
--- /dev/null
+++ b/src/runtime/lfstack_64bit.go
@@ -0,0 +1,48 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 mips64 mips64le ppc64 ppc64le s390x
+
+package runtime
+
+import "unsafe"
+
+const (
+	// addrBits is the number of bits needed to represent a virtual address.
+	//
+	// In Linux the user address space for each architecture is limited as
+	// follows (taken from the processor.h file for the architecture):
+	//
+	// Architecture  Name              Maximum Value (exclusive)
+	// ---------------------------------------------------------------------
+	// arm64         TASK_SIZE_64      Depends on configuration.
+	// ppc64{,le}    TASK_SIZE_USER64  0x400000000000UL (46 bit addresses)
+	// mips64{,le}   TASK_SIZE64       0x010000000000UL (40 bit addresses)
+	// s390x         TASK_SIZE         0x020000000000UL (41 bit addresses)
+	//
+	// These values may increase over time.
+	//
+	// On AMD64, virtual addresses are 48-bit numbers sign extended to 64.
+	// We shift the address left 16 to eliminate the sign extended part and make
+	// room in the bottom for the count.
+	addrBits = 48
+
+	// In addition to the 16 bits taken from the top, we can take 3 from the
+	// bottom, because node must be pointer-aligned, giving a total of 19 bits
+	// of count.
+	cntBits = 64 - addrBits + 3
+)
+
+func lfstackPack(node *lfnode, cnt uintptr) uint64 {
+	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
+}
+
+func lfstackUnpack(val uint64) *lfnode {
+	if GOARCH == "amd64" {
+		// amd64 systems can place the stack above the VA hole, so we need to sign extend
+		// val before unpacking.
+		return (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> cntBits << 3)))
+	}
+	return (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
+}
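
Concretely, with addrBits = 48 and cntBits = 19: pack shifts the pointer left 16 (discarding the sign-extension bits) and ORs in the low 19 bits of the count; unpack shifts right 19, arithmetically on amd64 to regenerate the sign extension, then left 3 to restore the 8-byte alignment. The round trip on plain integers, as a checkable sketch with an integer standing in for the real *lfnode:

	package main

	import "fmt"

	const (
		addrBits = 48
		cntBits  = 64 - addrBits + 3 // 19
	)

	func pack(addr uint64, cnt uintptr) uint64 {
		return addr<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
	}

	func unpackAmd64(val uint64) uint64 {
		// Arithmetic shift restores the sign-extended top 16 bits;
		// <<3 restores the pointer alignment consumed by the pack.
		return uint64(int64(val) >> cntBits << 3)
	}

	func main() {
		addr := uint64(0xc000001230) // 8-byte aligned, fits in 48 bits
		v := pack(addr, 5)
		fmt.Printf("%#x -> %#x\n", v, unpackAmd64(v)) // round-trips to addr
	}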
diff --git a/src/runtime/lfstack_amd64.go b/src/runtime/lfstack_amd64.go
deleted file mode 100644
index 84e2851..0000000
--- a/src/runtime/lfstack_amd64.go
+++ /dev/null
@@ -1,24 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// On AMD64, virtual addresses are 48-bit numbers sign extended to 64.
-// We shift the address left 16 to eliminate the sign extended part and make
-// room in the bottom for the count.
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<16 | uint64(cnt&(1<<19-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(int64(val) >> 19 << 3)))
-	cnt = uintptr(val & (1<<19 - 1))
-	return
-}
diff --git a/src/runtime/lfstack_darwin_arm64.go b/src/runtime/lfstack_darwin_arm64.go
deleted file mode 100644
index 54cae39..0000000
--- a/src/runtime/lfstack_darwin_arm64.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lfstack_linux_arm64.go b/src/runtime/lfstack_linux_arm64.go
deleted file mode 100644
index 54cae39..0000000
--- a/src/runtime/lfstack_linux_arm64.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lfstack_linux_ppc64x.go b/src/runtime/lfstack_linux_ppc64x.go
deleted file mode 100644
index 7ed5025..0000000
--- a/src/runtime/lfstack_linux_ppc64x.go
+++ /dev/null
@@ -1,32 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ppc64 ppc64le
-// +build linux
-
-package runtime
-
-import "unsafe"
-
-// On ppc64, Linux limits the user address space to 46 bits (see
-// TASK_SIZE_USER64 in the Linux kernel).  This has grown over time,
-// so here we allow 48 bit addresses.
-//
-// In addition to the 16 bits taken from the top, we can take 3 from the
-// bottom, because node must be pointer-aligned, giving a total of 19 bits
-// of count.
-const (
-	addrBits = 48
-	cntBits  = 64 - addrBits + 3
-)
-
-func lfstackPack(node *lfnode, cnt uintptr) uint64 {
-	return uint64(uintptr(unsafe.Pointer(node)))<<(64-addrBits) | uint64(cnt&(1<<cntBits-1))
-}
-
-func lfstackUnpack(val uint64) (node *lfnode, cnt uintptr) {
-	node = (*lfnode)(unsafe.Pointer(uintptr(val >> cntBits << 3)))
-	cnt = uintptr(val & (1<<cntBits - 1))
-	return
-}
diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 768fd57..073136a 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -6,17 +6,20 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // This implementation depends on OS-specific implementations of
 //
-//	runtime·futexsleep(uint32 *addr, uint32 val, int64 ns)
+//	futexsleep(addr *uint32, val uint32, ns int64)
 //		Atomically,
-//			if(*addr == val) sleep
+//			if *addr == val { sleep }
 //		Might be woken up spuriously; that's allowed.
 //		Don't sleep longer than ns; ns < 0 means forever.
 //
-//	runtime·futexwakeup(uint32 *addr, uint32 cnt)
+//	futexwakeup(addr *uint32, cnt uint32)
 //		If any procs are sleeping on addr, wake up at most cnt.
 
 const (
@@ -48,14 +51,14 @@
 	gp.m.locks++
 
 	// Speculative grab for lock.
-	v := xchg(key32(&l.key), mutex_locked)
+	v := atomic.Xchg(key32(&l.key), mutex_locked)
 	if v == mutex_unlocked {
 		return
 	}
 
 	// wait is either MUTEX_LOCKED or MUTEX_SLEEPING
 	// depending on whether there is a thread sleeping
-	// on this mutex.  If we ever change l->key from
+	// on this mutex. If we ever change l->key from
 	// MUTEX_SLEEPING to some other value, we must be
 	// careful to change it back to MUTEX_SLEEPING before
 	// returning, to ensure that the sleeping thread gets
@@ -72,7 +75,7 @@
 		// Try for lock, spinning.
 		for i := 0; i < spin; i++ {
 			for l.key == mutex_unlocked {
-				if cas(key32(&l.key), mutex_unlocked, wait) {
+				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
 					return
 				}
 			}
@@ -82,7 +85,7 @@
 		// Try for lock, rescheduling.
 		for i := 0; i < passive_spin; i++ {
 			for l.key == mutex_unlocked {
-				if cas(key32(&l.key), mutex_unlocked, wait) {
+				if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
 					return
 				}
 			}
@@ -90,7 +93,7 @@
 		}
 
 		// Sleep.
-		v = xchg(key32(&l.key), mutex_sleeping)
+		v = atomic.Xchg(key32(&l.key), mutex_sleeping)
 		if v == mutex_unlocked {
 			return
 		}
@@ -100,7 +103,7 @@
 }
 
 func unlock(l *mutex) {
-	v := xchg(key32(&l.key), mutex_unlocked)
+	v := atomic.Xchg(key32(&l.key), mutex_unlocked)
 	if v == mutex_unlocked {
 		throw("unlock of unlocked lock")
 	}
@@ -124,7 +127,7 @@
 }
 
 func notewakeup(n *note) {
-	old := xchg(key32(&n.key), 1)
+	old := atomic.Xchg(key32(&n.key), 1)
 	if old != 0 {
 		print("notewakeup - double wakeup (", old, ")\n")
 		throw("notewakeup - double wakeup")
@@ -137,7 +140,7 @@
 	if gp != gp.m.g0 {
 		throw("notesleep not on g0")
 	}
-	for atomicload(key32(&n.key)) == 0 {
+	for atomic.Load(key32(&n.key)) == 0 {
 		gp.m.blocked = true
 		futexsleep(key32(&n.key), 0, -1)
 		gp.m.blocked = false
@@ -153,7 +156,7 @@
 	gp := getg()
 
 	if ns < 0 {
-		for atomicload(key32(&n.key)) == 0 {
+		for atomic.Load(key32(&n.key)) == 0 {
 			gp.m.blocked = true
 			futexsleep(key32(&n.key), 0, -1)
 			gp.m.blocked = false
@@ -161,7 +164,7 @@
 		return true
 	}
 
-	if atomicload(key32(&n.key)) != 0 {
+	if atomic.Load(key32(&n.key)) != 0 {
 		return true
 	}
 
@@ -170,7 +173,7 @@
 		gp.m.blocked = true
 		futexsleep(key32(&n.key), 0, ns)
 		gp.m.blocked = false
-		if atomicload(key32(&n.key)) != 0 {
+		if atomic.Load(key32(&n.key)) != 0 {
 			break
 		}
 		now := nanotime()
@@ -179,7 +182,7 @@
 		}
 		ns = deadline - now
 	}
-	return atomicload(key32(&n.key)) != 0
+	return atomic.Load(key32(&n.key)) != 0
 }
 
 func notetsleep(n *note, ns int64) bool {
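
lock/unlock above implement the classic three-state futex mutex (unlocked, locked, locked-with-possible-sleepers), where unlock only pays for a wakeup syscall in the third state. The state machine minus the spinning phases, as a hedged sketch; futexSleep/futexWake are hypothetical stand-ins for the OS-specific futexsleep/futexwakeup this file depends on:

	package main

	import "sync/atomic"

	const (
		mutexUnlocked = 0
		mutexLocked   = 1
		mutexSleeping = 2
	)

	// Hypothetical shims for the OS-specific futex calls.
	func futexSleep(addr *uint32, val uint32) {} // sleep while *addr == val
	func futexWake(addr *uint32, cnt uint32)  {} // wake up to cnt sleepers

	func lock(key *uint32) {
		if atomic.SwapUint32(key, mutexLocked) == mutexUnlocked {
			return // fast path: lock was free
		}
		// Contended: advertise a sleeper so unlock knows to wake someone.
		for atomic.SwapUint32(key, mutexSleeping) != mutexUnlocked {
			futexSleep(key, mutexSleeping)
		}
	}

	func unlock(key *uint32) {
		if atomic.SwapUint32(key, mutexUnlocked) == mutexSleeping {
			futexWake(key, 1) // someone may be asleep; wake one waiter
		}
	}

	func main() {
		var key uint32
		lock(&key)
		unlock(&key)
	}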
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index d9d91c9..0fa0481 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,22 +6,23 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // This implementation depends on OS-specific implementations of
 //
-//	uintptr runtime·semacreate(void)
-//		Create a semaphore, which will be assigned to m->waitsema.
-//		The zero value is treated as absence of any semaphore,
-//		so be sure to return a non-zero value.
+//	func semacreate(mp *m)
+//		Create a semaphore for mp, if it does not already have one.
 //
-//	int32 runtime·semasleep(int64 ns)
-//		If ns < 0, acquire m->waitsema and return 0.
-//		If ns >= 0, try to acquire m->waitsema for at most ns nanoseconds.
+//	func semasleep(ns int64) int32
+//		If ns < 0, acquire m's semaphore and return 0.
+//		If ns >= 0, try to acquire m's semaphore for at most ns nanoseconds.
 //		Return 0 if the semaphore was acquired, -1 if interrupted or timed out.
 //
-//	int32 runtime·semawakeup(M *mp)
-//		Wake up mp, which is or will soon be sleeping on mp->waitsema.
+//	func semawakeup(mp *m)
+//		Wake up mp, which is or will soon be sleeping on its semaphore.
 //
 const (
 	locked uintptr = 1
@@ -39,12 +40,10 @@
 	gp.m.locks++
 
 	// Speculative grab for lock.
-	if casuintptr(&l.key, 0, locked) {
+	if atomic.Casuintptr(&l.key, 0, locked) {
 		return
 	}
-	if gp.m.waitsema == 0 {
-		gp.m.waitsema = semacreate()
-	}
+	semacreate(gp.m)
 
 	// On uniprocessors, there is no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
@@ -54,10 +53,10 @@
 	}
 Loop:
 	for i := 0; ; i++ {
-		v := atomicloaduintptr(&l.key)
+		v := atomic.Loaduintptr(&l.key)
 		if v&locked == 0 {
 			// Unlocked. Try to lock.
-			if casuintptr(&l.key, v, v|locked) {
+			if atomic.Casuintptr(&l.key, v, v|locked) {
 				return
 			}
 			i = 0
@@ -73,16 +72,16 @@
 			// Queue this M.
 			for {
 				gp.m.nextwaitm = v &^ locked
-				if casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) {
+				if atomic.Casuintptr(&l.key, v, uintptr(unsafe.Pointer(gp.m))|locked) {
 					break
 				}
-				v = atomicloaduintptr(&l.key)
+				v = atomic.Loaduintptr(&l.key)
 				if v&locked == 0 {
 					continue Loop
 				}
 			}
 			if v&locked != 0 {
-				// Queued.  Wait.
+				// Queued. Wait.
 				semasleep(-1)
 				i = 0
 			}
@@ -96,16 +95,16 @@
 	gp := getg()
 	var mp *m
 	for {
-		v := atomicloaduintptr(&l.key)
+		v := atomic.Loaduintptr(&l.key)
 		if v == locked {
-			if casuintptr(&l.key, locked, 0) {
+			if atomic.Casuintptr(&l.key, locked, 0) {
 				break
 			}
 		} else {
 			// Other M's are waiting for the lock.
 			// Dequeue an M.
-			mp = (*m)((unsafe.Pointer)(v &^ locked))
-			if casuintptr(&l.key, v, mp.nextwaitm) {
+			mp = (*m)(unsafe.Pointer(v &^ locked))
+			if atomic.Casuintptr(&l.key, v, mp.nextwaitm) {
 				// Dequeued an M.  Wake it.
 				semawakeup(mp)
 				break
@@ -129,8 +128,8 @@
 func notewakeup(n *note) {
 	var v uintptr
 	for {
-		v = atomicloaduintptr(&n.key)
-		if casuintptr(&n.key, v, locked) {
+		v = atomic.Loaduintptr(&n.key)
+		if atomic.Casuintptr(&n.key, v, locked) {
 			break
 		}
 	}
@@ -144,7 +143,7 @@
 		// Two notewakeups!  Not allowed.
 		throw("notewakeup - double wakeup")
 	default:
-		// Must be the waiting m.  Wake it up.
+		// Must be the waiting m. Wake it up.
 		semawakeup((*m)(unsafe.Pointer(v)))
 	}
 }
@@ -154,17 +153,15 @@
 	if gp != gp.m.g0 {
 		throw("notesleep not on g0")
 	}
-	if gp.m.waitsema == 0 {
-		gp.m.waitsema = semacreate()
-	}
-	if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
+	semacreate(gp.m)
+	if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
 		// Must be locked (got wakeup).
 		if n.key != locked {
 			throw("notesleep - waitm out of sync")
 		}
 		return
 	}
-	// Queued.  Sleep.
+	// Queued. Sleep.
 	gp.m.blocked = true
 	semasleep(-1)
 	gp.m.blocked = false
@@ -179,7 +176,7 @@
 	gp = getg()
 
 	// Register for wakeup on n->waitm.
-	if !casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
+	if !atomic.Casuintptr(&n.key, 0, uintptr(unsafe.Pointer(gp.m))) {
 		// Must be locked (got wakeup).
 		if n.key != locked {
 			throw("notetsleep - waitm out of sync")
@@ -187,7 +184,7 @@
 		return true
 	}
 	if ns < 0 {
-		// Queued.  Sleep.
+		// Queued. Sleep.
 		gp.m.blocked = true
 		semasleep(-1)
 		gp.m.blocked = false
@@ -196,7 +193,7 @@
 
 	deadline = nanotime() + ns
 	for {
-		// Registered.  Sleep.
+		// Registered. Sleep.
 		gp.m.blocked = true
 		if semasleep(ns) >= 0 {
 			gp.m.blocked = false
@@ -205,24 +202,24 @@
 			return true
 		}
 		gp.m.blocked = false
-		// Interrupted or timed out.  Still registered.  Semaphore not acquired.
+		// Interrupted or timed out. Still registered. Semaphore not acquired.
 		ns = deadline - nanotime()
 		if ns <= 0 {
 			break
 		}
-		// Deadline hasn't arrived.  Keep sleeping.
+		// Deadline hasn't arrived. Keep sleeping.
 	}
 
-	// Deadline arrived.  Still registered.  Semaphore not acquired.
+	// Deadline arrived. Still registered. Semaphore not acquired.
 	// Want to give up and return, but have to unregister first,
 	// so that any notewakeup racing with the return does not
 	// try to grant us the semaphore when we don't expect it.
 	for {
-		v := atomicloaduintptr(&n.key)
+		v := atomic.Loaduintptr(&n.key)
 		switch v {
 		case uintptr(unsafe.Pointer(gp.m)):
 			// No wakeup yet; unregister if possible.
-			if casuintptr(&n.key, v, 0) {
+			if atomic.Casuintptr(&n.key, v, 0) {
 				return false
 			}
 		case locked:
@@ -245,9 +242,7 @@
 	if gp != gp.m.g0 && gp.m.preemptoff != "" {
 		throw("notetsleep not on g0")
 	}
-	if gp.m.waitsema == 0 {
-		gp.m.waitsema = semacreate()
-	}
+	semacreate(gp.m)
 	return notetsleep_internal(n, ns, nil, 0)
 }
 
@@ -258,9 +253,7 @@
 	if gp == gp.m.g0 {
 		throw("notetsleepg on g0")
 	}
-	if gp.m.waitsema == 0 {
-		gp.m.waitsema = semacreate()
-	}
+	semacreate(gp.m)
 	entersyscallblock(0)
 	ok := notetsleep_internal(n, ns, nil, 0)
 	exitsyscall(0)
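
On these semaphore-based platforms a note parks at most one m: notesleep publishes the sleeper's m in n.key and blocks on its semaphore; notewakeup swaps in the locked sentinel and wakes whatever m it displaced. The handshake reduced to its CAS/swap skeleton, with semaSleep/semaWake as hypothetical stand-ins for the per-OS semasleep/semawakeup:

	package main

	import "sync/atomic"

	const noteLocked uintptr = 1 // same role as the "locked" constant above

	// Hypothetical shims for the OS-specific semaphore operations.
	func semaSleep()          {} // block on the calling m's semaphore
	func semaWake(mp uintptr) {} // post the semaphore of the published m

	// noteSleep publishes the caller (self) and parks, unless a wakeup
	// already happened, in which case the key is already noteLocked.
	func noteSleep(key *uintptr, self uintptr) {
		if !atomic.CompareAndSwapUintptr(key, 0, self) {
			return // wakeup beat us; nothing to wait for
		}
		semaSleep()
	}

	// noteWakeup marks the note signaled and wakes a parked m, if any.
	func noteWakeup(key *uintptr) {
		v := atomic.SwapUintptr(key, noteLocked)
		if v != 0 && v != noteLocked {
			semaWake(v) // v is the m that noteSleep published
		}
	}

	func main() {
		var key uintptr
		noteWakeup(&key)    // signal first
		noteSleep(&key, 42) // returns immediately: already signaled
	}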
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 353f840..b079a07 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -65,8 +65,8 @@
 // directly, bypassing the MCache and MCentral free lists.
 //
 // The small objects on the MCache and MCentral free lists
-// may or may not be zeroed.  They are zeroed if and only if
-// the second word of the object is zero.  A span in the
+// may or may not be zeroed. They are zeroed if and only if
+// the second word of the object is zero. A span in the
 // page heap is zeroed unless s->needzero is set. When a span
 // is allocated to break into small objects, it is zeroed if needed
 // and s->needzero is set. There are two main benefits to delaying the
@@ -76,20 +76,17 @@
 //	   or the page heap can avoid zeroing altogether.
 //	2. the cost of zeroing when reusing a small object is
 //	   charged to the mutator, not the garbage collector.
-//
-// This code was written with an eye toward translating to Go
-// in the future.  Methods have the form Type_Method(Type *t, ...).
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
 	debugMalloc = false
 
-	flagNoScan = _FlagNoScan
-	flagNoZero = _FlagNoZero
-
 	maxTinySize   = _TinySize
 	tinySizeClass = _TinySizeClass
 	maxSmallSize  = _MaxSmallSize
@@ -97,6 +94,9 @@
 	pageShift = _PageShift
 	pageSize  = _PageSize
 	pageMask  = _PageMask
+	// By construction, single page spans of the smallest object class
+	// have the most objects per span.
+	maxObjsPerSpan = pageSize / 8
 
 	mSpanInUse = _MSpanInUse
 
@@ -113,9 +113,9 @@
 	// _64bit = 1 on 64-bit systems, 0 on 32-bit systems
 	_64bit = 1 << (^uintptr(0) >> 63) / 2
 
-	// Computed constant.  The definition of MaxSmallSize and the
+	// Computed constant. The definition of MaxSmallSize and the
 	// algorithm in msize.go produces some number of different allocation
-	// size classes.  NumSizeClasses is that number.  It's needed here
+	// size classes. NumSizeClasses is that number. It's needed here
 	// because there are static arrays of this length; when msize runs its
 	// size choosing algorithm it double-checks that NumSizeClasses agrees.
 	_NumSizeClasses = 67
@@ -134,9 +134,9 @@
 	// Per-P, per order stack segment cache size.
 	_StackCacheSize = 32 * 1024
 
-	// Number of orders that get caching.  Order 0 is FixedStack
+	// Number of orders that get caching. Order 0 is FixedStack
 	// and each successive order is twice as large.
-	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks.  Larger stacks
+	// We want to cache 2KB, 4KB, 8KB, and 16KB stacks. Larger stacks
 	// will be allocated directly.
 	// Since FixedStack is different on different systems, we
 	// must vary NumStackOrders to keep the same maximum cached size.
@@ -146,7 +146,7 @@
 	//   windows/32       | 4KB        | 3
 	//   windows/64       | 8KB        | 2
 	//   plan9            | 4KB        | 3
-	_NumStackOrders = 4 - ptrSize/4*goos_windows - 1*goos_plan9
+	_NumStackOrders = 4 - sys.PtrSize/4*sys.GoosWindows - 1*sys.GoosPlan9
 
 	// Number of bits in page to span calculations (4k pages).
 	// On Windows 64-bit we limit the arena to 32GB or 35 bits.
@@ -158,22 +158,19 @@
 	// On Darwin/arm64, we cannot reserve more than ~5GB of virtual memory,
 	// but as most devices have less than 4GB of physical memory anyway, we
 	// try to be conservative here, and only ask for a 2GB heap.
-	_MHeapMap_TotalBits = (_64bit*goos_windows)*35 + (_64bit*(1-goos_windows)*(1-goos_darwin*goarch_arm64))*39 + goos_darwin*goarch_arm64*31 + (1-_64bit)*32
+	_MHeapMap_TotalBits = (_64bit*sys.GoosWindows)*35 + (_64bit*(1-sys.GoosWindows)*(1-sys.GoosDarwin*sys.GoarchArm64))*39 + sys.GoosDarwin*sys.GoarchArm64*31 + (1-_64bit)*32
 	_MHeapMap_Bits      = _MHeapMap_TotalBits - _PageShift
 
 	_MaxMem = uintptr(1<<_MHeapMap_TotalBits - 1)
 
 	// Max number of threads to run garbage collection.
 	// 2, 3, and 4 are all plausible maximums depending
-	// on the hardware details of the machine.  The garbage
+	// on the hardware details of the machine. The garbage
 	// collector scales well to 32 cpus.
 	_MaxGcproc = 32
 )
 
-// Page number (address>>pageShift)
-type pageID uintptr
-
-const _MaxArena32 = 2 << 30
+const _MaxArena32 = 1<<32 - 1
 
 // OS-defined helpers:
 //
@@ -192,14 +189,14 @@
 //
 // SysFree returns it unconditionally; this is only used if
 // an out-of-memory error has been detected midway through
-// an allocation.  It is okay if SysFree is a no-op.
+// an allocation. It is okay if SysFree is a no-op.
 //
 // SysReserve reserves address space without allocating memory.
 // If the pointer passed to it is non-nil, the caller wants the
 // reservation there, but SysReserve can still choose another
-// location if that one is unavailable.  On some systems and in some
+// location if that one is unavailable. On some systems and in some
 // cases SysReserve will simply check that the address space is
-// available and not actually reserve it.  If SysReserve returns
+// available and not actually reserve it. If SysReserve returns
 // non-nil, it sets *reserved to true if the address space is
 // reserved, false if it has merely been checked.
 // NOTE: SysReserve returns OS-aligned memory, but the heap allocator
@@ -211,7 +208,7 @@
 // reserved, not merely checked.
 //
 // SysFault marks a (already sysAlloc'd) region to fault
-// if accessed.  Used only for debugging the runtime.
+// if accessed. Used only for debugging the runtime.
 
 func mallocinit() {
 	initSizes()
@@ -229,9 +226,9 @@
 	limit = 0
 
 	// Set up the allocation arena, a contiguous area of memory where
-	// allocated data will be found.  The arena begins with a bitmap large
-	// enough to hold 4 bits per allocated word.
-	if ptrSize == 8 && (limit == 0 || limit > 1<<30) {
+	// allocated data will be found. The arena begins with a bitmap large
+	// enough to hold 2 bits per allocated word.
+	if sys.PtrSize == 8 && (limit == 0 || limit > 1<<30) {
 		// On a 64-bit machine, allocate from a single contiguous reservation.
 		// 512 GB (MaxMem) should be big enough for now.
 		//
@@ -239,12 +236,12 @@
 		// SysReserve to use 0x0000XXc000000000 if possible (XX=00...7f).
 		// Allocating a 512 GB region takes away 39 bits, and the amd64
 		// doesn't let us choose the top 17 bits, so that leaves the 9 bits
-		// in the middle of 0x00c0 for us to choose.  Choosing 0x00c0 means
+		// in the middle of 0x00c0 for us to choose. Choosing 0x00c0 means
 		// that the valid memory addresses will begin 0x00c0, 0x00c1, ..., 0x00df.
 		// In little-endian, that's c0 00, c1 00, ..., df 00. None of those are valid
 		// UTF-8 sequences, and they are otherwise as far away from
-		// ff (likely a common byte) as possible.  If that fails, we try other 0xXXc0
-		// addresses.  An earlier attempt to use 0x11f8 caused out of memory errors
+		// ff (likely a common byte) as possible. If that fails, we try other 0xXXc0
+		// addresses. An earlier attempt to use 0x11f8 caused out of memory errors
 		// on OS X during thread allocations. 0x00c0 causes conflicts with
 		// AddressSanitizer which reserves all memory up to 0x0100.
 		// These choices are both for debuggability and to reduce the
@@ -262,8 +259,8 @@
 		// translation buffers, the user address space is limited to 39 bits
 		// On darwin/arm64, the address space is even smaller.
 		arenaSize := round(_MaxMem, _PageSize)
-		bitmapSize = arenaSize / (ptrSize * 8 / 4)
-		spansSize = arenaSize / _PageSize * ptrSize
+		bitmapSize = arenaSize / (sys.PtrSize * 8 / 2)
+		spansSize = arenaSize / _PageSize * sys.PtrSize
 		spansSize = round(spansSize, _PageSize)
 		for i := 0; i <= 0x7f; i++ {
 			switch {
@@ -287,44 +284,38 @@
 		// with a giant virtual address space reservation.
 		// Instead we map the memory information bitmap
 		// immediately after the data segment, large enough
-		// to handle another 2GB of mappings (256 MB),
+		// to handle the entire 4GB address space (256 MB),
 		// along with a reservation for an initial arena.
 		// When that gets used up, we'll start asking the kernel
-		// for any memory anywhere and hope it's in the 2GB
-		// following the bitmap (presumably the executable begins
-		// near the bottom of memory, so we'll have to use up
-		// most of memory before the kernel resorts to giving out
-		// memory before the beginning of the text segment).
-		//
-		// Alternatively we could reserve 512 MB bitmap, enough
-		// for 4GB of mappings, and then accept any memory the
-		// kernel threw at us, but normally that's a waste of 512 MB
-		// of address space, which is probably too much in a 32-bit world.
+		// for any memory anywhere.
 
 		// If we fail to allocate, try again with a smaller arena.
 		// This is necessary on Android L where we share a process
 		// with ART, which reserves virtual memory aggressively.
+		// In the worst case, fall back to a 0-sized initial arena,
+		// in the hope that subsequent reservations will succeed.
 		arenaSizes := []uintptr{
 			512 << 20,
 			256 << 20,
 			128 << 20,
+			0,
 		}
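// Illustrative sketch (not from this CL): the bitmap and spans sizing in the
// loop below, worked through with this runtime's 32-bit constants
// (sys.PtrSize = 4, _PageShift = 13 so _PageSize = 8192). Two heap-bitmap
// bits per 4-byte word means one bitmap byte per 16 heap bytes, so covering
// the full 4GB space takes 256 MB of bitmap and 2 MB of span pointers.
package main

import "fmt"

func main() {
	const (
		ptrSize    = 4
		pageSize   = 8192
		maxArena32 = 1<<32 - 1
	)
	bitmapSize := (uint64(maxArena32) + 1) / (ptrSize * 8 / 2)
	spansSize := (uint64(maxArena32) + 1) / pageSize * ptrSize
	fmt.Printf("bitmap %d MB, spans %d MB\n", bitmapSize>>20, spansSize>>20)
	// Output: bitmap 256 MB, spans 2 MB
}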
 
 		for _, arenaSize := range arenaSizes {
-			bitmapSize = _MaxArena32 / (ptrSize * 8 / 4)
-			spansSize = _MaxArena32 / _PageSize * ptrSize
+			bitmapSize = (_MaxArena32 + 1) / (sys.PtrSize * 8 / 2)
+			spansSize = (_MaxArena32 + 1) / _PageSize * sys.PtrSize
 			if limit > 0 && arenaSize+bitmapSize+spansSize > limit {
 				bitmapSize = (limit / 9) &^ ((1 << _PageShift) - 1)
 				arenaSize = bitmapSize * 8
-				spansSize = arenaSize / _PageSize * ptrSize
+				spansSize = arenaSize / _PageSize * sys.PtrSize
 			}
 			spansSize = round(spansSize, _PageSize)
 
 			// SysReserve treats the address we ask for, end, as a hint,
-			// not as an absolute requirement.  If we ask for the end
+			// not as an absolute requirement. If we ask for the end
 			// of the data segment but the operating system requires
 			// a little more space before we can start allocating, it will
-			// give out a slightly higher pointer.  Except QEMU, which
+			// give out a slightly higher pointer. Except QEMU, which
 			// is buggy, as usual: it won't adjust the pointer upward.
 			// So adjust it upward a little bit ourselves: 1/4 MB to get
 			// away from the running binary image and then round up
@@ -347,10 +338,16 @@
 	p1 := round(p, _PageSize)
 
 	mheap_.spans = (**mspan)(unsafe.Pointer(p1))
-	mheap_.bitmap = p1 + spansSize
-	mheap_.arena_start = p1 + (spansSize + bitmapSize)
-	mheap_.arena_used = mheap_.arena_start
+	mheap_.bitmap = p1 + spansSize + bitmapSize
+	if sys.PtrSize == 4 {
+		// Set arena_start such that we can accept memory
+		// reservations located anywhere in the 4GB virtual space.
+		mheap_.arena_start = 0
+	} else {
+		mheap_.arena_start = p1 + (spansSize + bitmapSize)
+	}
 	mheap_.arena_end = p + pSize
+	mheap_.arena_used = p1 + (spansSize + bitmapSize)
 	mheap_.arena_reserved = reserved
 
 	if mheap_.arena_start&(_PageSize-1) != 0 {
@@ -359,134 +356,177 @@
 	}
 
 	// Initialize the rest of the allocator.
-	mHeap_Init(&mheap_, spansSize)
+	mheap_.init(spansSize)
 	_g_ := getg()
 	_g_.m.mcache = allocmcache()
 }
 
-// sysReserveHigh reserves space somewhere high in the address space.
-// sysReserve doesn't actually reserve the full amount requested on
-// 64-bit systems, because of problems with ulimit. Instead it checks
-// that it can get the first 64 kB and assumes it can grab the rest as
-// needed. This doesn't work well with the "let the kernel pick an address"
-// mode, so don't do that. Pick a high address instead.
-func sysReserveHigh(n uintptr, reserved *bool) unsafe.Pointer {
-	if ptrSize == 4 {
-		return sysReserve(nil, n, reserved)
-	}
-
-	for i := 0; i <= 0x7f; i++ {
-		p := uintptr(i)<<40 | uintptrMask&(0x00c0<<32)
-		*reserved = false
-		p = uintptr(sysReserve(unsafe.Pointer(p), n, reserved))
-		if p != 0 {
-			return unsafe.Pointer(p)
-		}
-	}
-
-	return sysReserve(nil, n, reserved)
-}
-
-func mHeap_SysAlloc(h *mheap, n uintptr) unsafe.Pointer {
-	if n > uintptr(h.arena_end)-uintptr(h.arena_used) {
+// sysAlloc allocates the next n bytes from the heap arena. The
+// returned pointer is always _PageSize aligned and between
+// h.arena_start and h.arena_end. sysAlloc returns nil on failure.
+// There is no corresponding free function.
+func (h *mheap) sysAlloc(n uintptr) unsafe.Pointer {
+	if n > h.arena_end-h.arena_used {
 		// We are in 32-bit mode, maybe we didn't use all possible address space yet.
 		// Reserve some more space.
 		p_size := round(n+_PageSize, 256<<20)
-		new_end := h.arena_end + p_size
-		if new_end <= h.arena_start+_MaxArena32 {
+		new_end := h.arena_end + p_size // Careful: can overflow
+		if h.arena_end <= new_end && new_end-h.arena_start-1 <= _MaxArena32 {
 			// TODO: It would be bad if part of the arena
 			// is reserved and part is not.
 			var reserved bool
-			p := uintptr(sysReserve((unsafe.Pointer)(h.arena_end), p_size, &reserved))
+			p := uintptr(sysReserve(unsafe.Pointer(h.arena_end), p_size, &reserved))
+			if p == 0 {
+				return nil
+			}
 			if p == h.arena_end {
 				h.arena_end = new_end
 				h.arena_reserved = reserved
-			} else if p+p_size <= h.arena_start+_MaxArena32 {
+			} else if h.arena_start <= p && p+p_size-h.arena_start-1 <= _MaxArena32 {
 				// Keep everything page-aligned.
 				// Our pages are bigger than hardware pages.
 				h.arena_end = p + p_size
-				used := p + (-uintptr(p) & (_PageSize - 1))
-				mHeap_MapBits(h, used)
-				mHeap_MapSpans(h, used)
+				used := p + (-p & (_PageSize - 1))
+				h.mapBits(used)
+				h.mapSpans(used)
 				h.arena_used = used
 				h.arena_reserved = reserved
 			} else {
-				var stat uint64
-				sysFree((unsafe.Pointer)(p), p_size, &stat)
+				// We haven't added this allocation to
+				// the stats, so subtract it from a
+				// fake stat (but avoid underflow).
+				stat := uint64(p_size)
+				sysFree(unsafe.Pointer(p), p_size, &stat)
 			}
 		}
 	}
 
-	if n <= uintptr(h.arena_end)-uintptr(h.arena_used) {
+	if n <= h.arena_end-h.arena_used {
 		// Keep taking from our reservation.
 		p := h.arena_used
-		sysMap((unsafe.Pointer)(p), n, h.arena_reserved, &memstats.heap_sys)
-		mHeap_MapBits(h, p+n)
-		mHeap_MapSpans(h, p+n)
+		sysMap(unsafe.Pointer(p), n, h.arena_reserved, &memstats.heap_sys)
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
 		h.arena_used = p + n
 		if raceenabled {
-			racemapshadow((unsafe.Pointer)(p), n)
+			racemapshadow(unsafe.Pointer(p), n)
 		}
 
-		if uintptr(p)&(_PageSize-1) != 0 {
+		if p&(_PageSize-1) != 0 {
 			throw("misrounded allocation in MHeap_SysAlloc")
 		}
-		return (unsafe.Pointer)(p)
+		return unsafe.Pointer(p)
 	}
 
 	// If using 64-bit, our reservation is all we have.
-	if uintptr(h.arena_end)-uintptr(h.arena_start) >= _MaxArena32 {
+	if h.arena_end-h.arena_start > _MaxArena32 {
 		return nil
 	}
 
 	// On 32-bit, once the reservation is gone we can
-	// try to get memory at a location chosen by the OS
-	// and hope that it is in the range we allocated bitmap for.
+	// try to get memory at a location chosen by the OS.
 	p_size := round(n, _PageSize) + _PageSize
 	p := uintptr(sysAlloc(p_size, &memstats.heap_sys))
 	if p == 0 {
 		return nil
 	}
 
-	if p < h.arena_start || uintptr(p)+p_size-uintptr(h.arena_start) >= _MaxArena32 {
-		print("runtime: memory allocated by OS (", p, ") not in usable range [", hex(h.arena_start), ",", hex(h.arena_start+_MaxArena32), ")\n")
-		sysFree((unsafe.Pointer)(p), p_size, &memstats.heap_sys)
+	if p < h.arena_start || p+p_size-h.arena_start > _MaxArena32 {
+		top := ^uintptr(0)
+		if top-h.arena_start-1 > _MaxArena32 {
+			top = h.arena_start + _MaxArena32 + 1
+		}
+		print("runtime: memory allocated by OS (", hex(p), ") not in usable range [", hex(h.arena_start), ",", hex(top), ")\n")
+		sysFree(unsafe.Pointer(p), p_size, &memstats.heap_sys)
 		return nil
 	}
 
 	p_end := p + p_size
 	p += -p & (_PageSize - 1)
-	if uintptr(p)+n > uintptr(h.arena_used) {
-		mHeap_MapBits(h, p+n)
-		mHeap_MapSpans(h, p+n)
+	if p+n > h.arena_used {
+		h.mapBits(p + n)
+		h.mapSpans(p + n)
 		h.arena_used = p + n
 		if p_end > h.arena_end {
 			h.arena_end = p_end
 		}
 		if raceenabled {
-			racemapshadow((unsafe.Pointer)(p), n)
+			racemapshadow(unsafe.Pointer(p), n)
 		}
 	}
 
-	if uintptr(p)&(_PageSize-1) != 0 {
+	if p&(_PageSize-1) != 0 {
 		throw("misrounded allocation in MHeap_SysAlloc")
 	}
-	return (unsafe.Pointer)(p)
+	return unsafe.Pointer(p)
 }
 
 // base address for all 0-byte allocations
 var zerobase uintptr
 
-const (
-	// flags to malloc
-	_FlagNoScan = 1 << 0 // GC doesn't have to scan object
-	_FlagNoZero = 1 << 1 // don't zero memory
-)
+// nextFreeFast returns the next free object if one is quickly available.
+// Otherwise it returns 0.
+func nextFreeFast(s *mspan) gclinkptr {
+	theBit := sys.Ctz64(s.allocCache) // Is there a free object in the allocCache?
+	if theBit < 64 {
+		result := s.freeindex + uintptr(theBit)
+		if result < s.nelems {
+			freeidx := result + 1
+			if freeidx%64 == 0 && freeidx != s.nelems {
+				// The next object crosses a 64-object boundary,
+				// so allocCache would need a refill; take the slow path.
+				return 0
+			}
+			s.allocCache >>= (theBit + 1)
+			s.freeindex = freeidx
+			v := gclinkptr(result*s.elemsize + s.base())
+			s.allocCount++
+			return v
+		}
+	}
+	return 0
+}
+
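// Illustrative sketch (not from this CL): how nextFreeFast finds a free slot.
// allocBits records a 1 per *allocated* object, so allocCache caches the
// complement of the current 64-bit window; a count-trailing-zeros then yields
// the first free index (relative to s.freeindex in the real code).
// math/bits.TrailingZeros64 stands in for the runtime's sys.Ctz64; both
// return 64 when no bit is set.
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	allocBits := uint64(0x17) // binary 10111: objects 0, 1, 2, 4 allocated
	allocCache := ^allocBits  // 1s now mark free objects
	theBit := bits.TrailingZeros64(allocCache)
	fmt.Println("next free object index:", theBit) // 3
}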
+// nextFree returns the next free object from the cached span if one is available.
+// Otherwise it refills the cache with a span with an available object and
+// returns that object along with a flag indicating that this was a heavyweight
+// allocation. If it is a heavyweight allocation, the caller must determine
+// whether a new GC cycle needs to be started or, if the GC is active,
+// whether this goroutine needs to assist the GC.
+func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, s *mspan, shouldhelpgc bool) {
+	s = c.alloc[sizeclass]
+	shouldhelpgc = false
+	freeIndex := s.nextFreeIndex()
+	if freeIndex == s.nelems {
+		// The span is full.
+		if uintptr(s.allocCount) != s.nelems {
+			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
+		}
+		systemstack(func() {
+			c.refill(int32(sizeclass))
+		})
+		shouldhelpgc = true
+		s = c.alloc[sizeclass]
+
+		freeIndex = s.nextFreeIndex()
+	}
+
+	if freeIndex >= s.nelems {
+		throw("freeIndex is not valid")
+	}
+
+	v = gclinkptr(freeIndex*s.elemsize + s.base())
+	s.allocCount++
+	if uintptr(s.allocCount) > s.nelems {
+		println("s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
+		throw("s.allocCount > s.nelems")
+	}
+	return
+}
 
 // Allocate an object of size bytes.
 // Small objects are allocated from the per-P cache's free lists.
 // Large objects (> 32 kB) are allocated straight from the heap.
-func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
+func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 	if gcphase == _GCmarktermination {
 		throw("mallocgc called with gcphase == _GCmarktermination")
 	}
@@ -495,10 +535,6 @@
 		return unsafe.Pointer(&zerobase)
 	}
 
-	if flags&flagNoScan == 0 && typ == nil {
-		throw("malloc missing type")
-	}
-
 	if debug.sbrk != 0 {
 		align := uintptr(16)
 		if typ != nil {
@@ -507,6 +543,27 @@
 		return persistentalloc(size, align, &memstats.other_sys)
 	}
 
+	// assistG is the G to charge for this allocation, or nil if
+	// GC is not currently active.
+	var assistG *g
+	if gcBlackenEnabled != 0 {
+		// Charge the current user G for this allocation.
+		assistG = getg()
+		if assistG.m.curg != nil {
+			assistG = assistG.m.curg
+		}
+		// Charge the allocation against the G. We'll account
+		// for internal fragmentation at the end of mallocgc.
+		assistG.gcAssistBytes -= int64(size)
+
+		if assistG.gcAssistBytes < 0 {
+			// This G is in debt. Assist the GC to correct
+			// this before allocating. This must happen
+			// before disabling preemption.
+			gcAssistAlloc(assistG)
+		}
+	}
+
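// Illustrative sketch (not from this CL) of the assist-debt bookkeeping just
// above: every allocation is charged against the G, and once gcAssistBytes
// goes negative the G must perform (or steal credit for) scan work before it
// may allocate. The assist func here is a hypothetical stand-in for
// gcAssistAlloc.
package main

import "fmt"

type g struct{ gcAssistBytes int64 }

func mallocCharge(gp *g, size int64, assist func(debt int64) int64) {
	gp.gcAssistBytes -= size
	if gp.gcAssistBytes < 0 {
		// In debt: pay it back with scan work before proceeding.
		gp.gcAssistBytes += assist(-gp.gcAssistBytes)
	}
}

func main() {
	gp := &g{gcAssistBytes: 1024}                    // credit from earlier assists
	assist := func(debt int64) int64 { return debt } // pretend we scanned exactly enough
	mallocCharge(gp, 4096, assist)
	fmt.Println("credit after charge+assist:", gp.gcAssistBytes) // 0
}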
 	// Set mp.mallocing to keep from being preempted by GC.
 	mp := acquirem()
 	if mp.mallocing != 0 {
@@ -520,16 +577,16 @@
 	shouldhelpgc := false
 	dataSize := size
 	c := gomcache()
-	var s *mspan
 	var x unsafe.Pointer
+	noscan := typ == nil || typ.kind&kindNoPointers != 0
 	if size <= maxSmallSize {
-		if flags&flagNoScan != 0 && size < maxTinySize {
+		if noscan && size < maxTinySize {
 			// Tiny allocator.
 			//
 			// Tiny allocator combines several tiny allocation requests
 			// into a single memory block. The resulting memory block
 			// is freed when all subobjects are unreachable. The subobjects
-			// must be FlagNoScan (don't have pointers), this ensures that
+			// must be noscan (don't have pointers), this ensures that
 			// the amount of potentially wasted memory is bounded.
 			//
 			// Size of the memory block used for combining (maxTinySize) is tunable.
@@ -562,9 +619,9 @@
 			} else if size&1 == 0 {
 				off = round(off, 2)
 			}
-			if off+size <= maxTinySize && c.tiny != nil {
+			if off+size <= maxTinySize && c.tiny != 0 {
 				// The object fits into existing tiny block.
-				x = add(c.tiny, off)
+				x = unsafe.Pointer(c.tiny + off)
 				c.tinyoffset = off + size
 				c.local_tinyallocs++
 				mp.mallocing = 0
@@ -572,27 +629,18 @@
 				return x
 			}
 			// Allocate a new maxTinySize block.
-			s = c.alloc[tinySizeClass]
-			v := s.freelist
-			if v.ptr() == nil {
-				systemstack(func() {
-					mCache_Refill(c, tinySizeClass)
-				})
-				shouldhelpgc = true
-				s = c.alloc[tinySizeClass]
-				v = s.freelist
+			span := c.alloc[tinySizeClass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, _, shouldhelpgc = c.nextFree(tinySizeClass)
 			}
-			s.freelist = v.ptr().next
-			s.ref++
-			// prefetchnta offers best performance, see change list message.
-			prefetchnta(uintptr(v.ptr().next))
 			x = unsafe.Pointer(v)
 			(*[2]uint64)(x)[0] = 0
 			(*[2]uint64)(x)[1] = 0
 			// See if we need to replace the existing tiny block with the new one
 			// based on amount of remaining free space.
-			if size < c.tinyoffset {
-				c.tiny = x
+			if size < c.tinyoffset || c.tiny == 0 {
+				c.tiny = uintptr(x)
 				c.tinyoffset = size
 			}
 			size = maxTinySize
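// Illustrative sketch (not from this CL): the tiny allocator's offset
// rounding above, shown packing three requests into one 16-byte block.
// round is the same helper the runtime uses (round n up to a multiple of a,
// a power of two).
package main

import "fmt"

func round(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

func main() {
	const maxTinySize = 16
	off := uintptr(0)
	for _, size := range []uintptr{4, 1, 8} {
		switch {
		case size&7 == 0:
			off = round(off, 8)
		case size&3 == 0:
			off = round(off, 4)
		case size&1 == 0:
			off = round(off, 2)
		}
		if off+size > maxTinySize {
			fmt.Println("new block needed for size", size)
			break
		}
		fmt.Printf("size %d placed at offset %d\n", size, off)
		off += size
	}
	// Output: offsets 0, 4, and 8 -- the block is filled exactly.
}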
@@ -604,41 +652,31 @@
 				sizeclass = size_to_class128[(size-1024+127)>>7]
 			}
 			size = uintptr(class_to_size[sizeclass])
-			s = c.alloc[sizeclass]
-			v := s.freelist
-			if v.ptr() == nil {
-				systemstack(func() {
-					mCache_Refill(c, int32(sizeclass))
-				})
-				shouldhelpgc = true
-				s = c.alloc[sizeclass]
-				v = s.freelist
+			span := c.alloc[sizeclass]
+			v := nextFreeFast(span)
+			if v == 0 {
+				v, span, shouldhelpgc = c.nextFree(sizeclass)
 			}
-			s.freelist = v.ptr().next
-			s.ref++
-			// prefetchnta offers best performance, see change list message.
-			prefetchnta(uintptr(v.ptr().next))
 			x = unsafe.Pointer(v)
-			if flags&flagNoZero == 0 {
-				v.ptr().next = 0
-				if size > 2*ptrSize && ((*[2]uintptr)(x))[1] != 0 {
-					memclr(unsafe.Pointer(v), size)
-				}
+			if needzero && span.needzero != 0 {
+				memclr(unsafe.Pointer(v), size)
 			}
 		}
-		c.local_cachealloc += size
 	} else {
 		var s *mspan
 		shouldhelpgc = true
 		systemstack(func() {
-			s = largeAlloc(size, uint32(flags))
+			s = largeAlloc(size, needzero)
 		})
-		x = unsafe.Pointer(uintptr(s.start << pageShift))
-		size = uintptr(s.elemsize)
+		s.freeindex = 1
+		s.allocCount = 1
+		x = unsafe.Pointer(s.base())
+		size = s.elemsize
 	}
 
-	if flags&flagNoScan != 0 {
-		// All objects are pre-marked as noscan. Nothing to do.
+	var scanSize uintptr
+	if noscan {
+		heapBitsSetTypeNoScan(uintptr(x))
 	} else {
 		// If allocating a defer+arg block, now that we've picked a malloc size
 		// large enough to hold everything, cut the "asked for" size down to
@@ -655,35 +693,38 @@
 			// pointers, GC has to scan to the last
 			// element.
 			if typ.ptrdata != 0 {
-				c.local_scan += dataSize - typ.size + typ.ptrdata
+				scanSize = dataSize - typ.size + typ.ptrdata
 			}
 		} else {
-			c.local_scan += typ.ptrdata
+			scanSize = typ.ptrdata
 		}
-
-		// Ensure that the stores above that initialize x to
-		// type-safe memory and set the heap bits occur before
-		// the caller can make x observable to the garbage
-		// collector. Otherwise, on weakly ordered machines,
-		// the garbage collector could follow a pointer to x,
-		// but see uninitialized memory or stale heap bits.
-		publicationBarrier()
+		c.local_scan += scanSize
 	}
 
-	// GCmarkterminate allocates black
+	// Ensure that the stores above that initialize x to
+	// type-safe memory and set the heap bits occur before
+	// the caller can make x observable to the garbage
+	// collector. Otherwise, on weakly ordered machines,
+	// the garbage collector could follow a pointer to x,
+	// but see uninitialized memory or stale heap bits.
+	publicationBarrier()
+
+	// Allocate black during GC.
 	// All slots hold nil so no scanning is needed.
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
-	if gcphase == _GCmarktermination || gcBlackenPromptly {
-		systemstack(func() {
-			gcmarknewobject_m(uintptr(x), size)
-		})
+	if gcphase != _GCoff {
+		gcmarknewobject(uintptr(x), size, scanSize)
 	}
 
 	if raceenabled {
 		racemalloc(x, size)
 	}
 
+	if msanenabled {
+		msanmalloc(x, size)
+	}
+
 	mp.mallocing = 0
 	releasem(mp)
 
@@ -701,32 +742,20 @@
 		}
 	}
 
-	if shouldhelpgc && shouldtriggergc() {
-		startGC(gcBackgroundMode, false)
-	} else if gcBlackenEnabled != 0 {
-		// Assist garbage collector. We delay this until the
-		// epilogue so that it doesn't interfere with the
-		// inner working of malloc such as mcache refills that
-		// might happen while doing the gcAssistAlloc.
-		gcAssistAlloc(size, shouldhelpgc)
-	} else if shouldhelpgc && bggc.working != 0 {
-		// The GC is starting up or shutting down, so we can't
-		// assist, but we also can't allocate unabated. Slow
-		// down this G's allocation and help the GC stay
-		// scheduled by yielding.
-		//
-		// TODO: This is a workaround. Either help the GC make
-		// the transition or block.
-		gp := getg()
-		if gp != gp.m.g0 && gp.m.locks == 0 && gp.m.preemptoff == "" {
-			Gosched()
-		}
+	if assistG != nil {
+		// Account for internal fragmentation in the assist
+		// debt now that we know it.
+		assistG.gcAssistBytes -= int64(size - dataSize)
+	}
+
+	if shouldhelpgc && gcShouldStart(false) {
+		gcStart(gcBackgroundMode, false)
 	}
 
 	return x
 }
 
-func largeAlloc(size uintptr, flag uint32) *mspan {
+func largeAlloc(size uintptr, needzero bool) *mspan {
 	// print("largeAlloc size=", size, "\n")
 
 	if size+_PageSize < size {
@@ -742,22 +771,18 @@
 	// pays the debt down to npage pages.
 	deductSweepCredit(npages*_PageSize, npages)
 
-	s := mHeap_Alloc(&mheap_, npages, 0, true, flag&_FlagNoZero == 0)
+	s := mheap_.alloc(npages, 0, true, needzero)
 	if s == nil {
 		throw("out of memory")
 	}
-	s.limit = uintptr(s.start)<<_PageShift + size
-	heapBitsForSpan(s.base()).initSpan(s.layout())
+	s.limit = s.base() + size
+	heapBitsForSpan(s.base()).initSpan(s)
 	return s
 }
 
 // implementation of new builtin
 func newobject(typ *_type) unsafe.Pointer {
-	flags := uint32(0)
-	if typ.kind&kindNoPointers != 0 {
-		flags |= flagNoScan
-	}
-	return mallocgc(uintptr(typ.size), typ, flags)
+	return mallocgc(typ.size, typ, true)
 }
 
 //go:linkname reflect_unsafe_New reflect.unsafe_New
@@ -765,50 +790,78 @@
 	return newobject(typ)
 }
 
-// implementation of make builtin for slices
-func newarray(typ *_type, n uintptr) unsafe.Pointer {
-	flags := uint32(0)
-	if typ.kind&kindNoPointers != 0 {
-		flags |= flagNoScan
+// newarray allocates an array of n elements of type typ.
+func newarray(typ *_type, n int) unsafe.Pointer {
+	if n < 0 || uintptr(n) > maxSliceCap(typ.size) {
+		panic(plainError("runtime: allocation size out of range"))
 	}
-	if int(n) < 0 || (typ.size > 0 && n > _MaxMem/uintptr(typ.size)) {
-		panic("runtime: allocation size out of range")
-	}
-	return mallocgc(uintptr(typ.size)*n, typ, flags)
+	return mallocgc(typ.size*uintptr(n), typ, true)
 }
 
 //go:linkname reflect_unsafe_NewArray reflect.unsafe_NewArray
-func reflect_unsafe_NewArray(typ *_type, n uintptr) unsafe.Pointer {
+func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
 	return newarray(typ, n)
 }
 
-// rawmem returns a chunk of pointerless memory.  It is
-// not zeroed.
-func rawmem(size uintptr) unsafe.Pointer {
-	return mallocgc(size, nil, flagNoScan|flagNoZero)
+func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
+	mp.mcache.next_sample = nextSample()
+	mProf_Malloc(x, size)
 }
 
-func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
-	c := mp.mcache
-	rate := MemProfileRate
-	if size < uintptr(rate) {
-		// pick next profile time
-		// If you change this, also change allocmcache.
-		if rate > 0x3fffffff { // make 2*rate not overflow
-			rate = 0x3fffffff
+// nextSample returns the next sampling point for heap profiling.
+// It produces a random variable with a geometric distribution and
+// mean MemProfileRate. This is done by generating a uniformly
+// distributed random number and applying the cumulative distribution
+// function for an exponential.
+func nextSample() int32 {
+	if GOOS == "plan9" {
+		// Plan 9 doesn't support floating point in note handler.
+		if g := getg(); g == g.m.gsignal {
+			return nextSampleNoFP()
 		}
-		next := int32(fastrand1()) % (2 * int32(rate))
-		// Subtract the "remainder" of the current allocation.
-		// Otherwise objects that are close in size to sampling rate
-		// will be under-sampled, because we consistently discard this remainder.
-		next -= (int32(size) - c.next_sample)
-		if next < 0 {
-			next = 0
-		}
-		c.next_sample = next
 	}
 
-	mProf_Malloc(x, size)
+	period := MemProfileRate
+
+	// make nextSample not overflow. Maximum possible step is
+	// -ln(1/(1<<randomBitCount)) * period, approximately 20 * period.
+	switch {
+	case period > 0x7000000:
+		period = 0x7000000
+	case period == 0:
+		return 0
+	}
+
+	// Let m be the sample rate,
+	// the probability density function is m*exp(-mx), so the CDF is
+	// p = 1 - exp(-mx), so
+	// q = 1 - p == exp(-mx)
+	// log_e(q) = -mx
+	// -log_e(q)/m = x
+	// x = -log_e(q) * period    ; the mean 1/m is the period
+	// x = log_2(q) * (-log_e(2)) * period    ; Using log_2 for efficiency
+	const randomBitCount = 26
+	q := fastrand1()%(1<<randomBitCount) + 1
+	qlog := fastlog2(float64(q)) - randomBitCount
+	if qlog > 0 {
+		qlog = 0
+	}
+	const minusLog2 = -0.6931471805599453 // -ln(2)
+	return int32(qlog*(minusLog2*float64(period))) + 1
+}
+
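// Illustrative sketch (not from this CL): the same inverse-CDF sampling as
// nextSample above, with the standard library standing in for the
// runtime-internal fastrand1 and fastlog2. Averaging many draws should give
// back roughly the period, the mean of the exponential.
package main

import (
	"fmt"
	"math"
	"math/rand"
)

func nextSampleSketch(period int32) int32 {
	const randomBitCount = 26
	q := rand.Intn(1<<randomBitCount) + 1          // uniform in [1, 2^26]
	qlog := math.Log2(float64(q)) - randomBitCount // log2 of q/2^26, always <= 0
	const minusLog2 = -0.6931471805599453          // -ln(2)
	return int32(qlog*(minusLog2*float64(period))) + 1
}

func main() {
	const period = 512 * 1024 // the default MemProfileRate
	const n = 200000
	var sum float64
	for i := 0; i < n; i++ {
		sum += float64(nextSampleSketch(period))
	}
	fmt.Printf("mean sampling distance %.0f, want ~%d\n", sum/n, period)
}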
+// nextSampleNoFP is similar to nextSample, but uses older,
+// simpler code to avoid floating point.
+func nextSampleNoFP() int32 {
+	// Set first allocation sample size.
+	rate := MemProfileRate
+	if rate > 0x3fffffff { // make 2*rate not overflow
+		rate = 0x3fffffff
+	}
+	if rate != 0 {
+		return int32(int(fastrand1()) % (2 * rate))
+	}
+	return 0
 }
 
 type persistentAlloc struct {
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index f0e73ba..767b51f 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -17,13 +17,14 @@
 	st := new(MemStats)
 	ReadMemStats(st)
 
-	// Everything except HeapReleased and HeapIdle, because they indeed can be 0.
+	// Everything except HeapReleased, HeapIdle, and NumGC,
+	// because they indeed can be 0.
 	if st.Alloc == 0 || st.TotalAlloc == 0 || st.Sys == 0 || st.Lookups == 0 ||
 		st.Mallocs == 0 || st.Frees == 0 || st.HeapAlloc == 0 || st.HeapSys == 0 ||
 		st.HeapInuse == 0 || st.HeapObjects == 0 || st.StackInuse == 0 ||
 		st.StackSys == 0 || st.MSpanInuse == 0 || st.MSpanSys == 0 || st.MCacheInuse == 0 ||
 		st.MCacheSys == 0 || st.BuckHashSys == 0 || st.GCSys == 0 || st.OtherSys == 0 ||
-		st.NextGC == 0 || st.NumGC == 0 {
+		st.NextGC == 0 {
 		t.Fatalf("Zero value: %+v", *st)
 	}
 
@@ -58,6 +59,14 @@
 		if st.PauseTotalNs != pauseTotal {
 			t.Fatalf("PauseTotalNs(%d) != sum PauseNs(%d)", st.PauseTotalNs, pauseTotal)
 		}
+		for i := int(st.NumGC); i < len(st.PauseNs); i++ {
+			if st.PauseNs[i] != 0 {
+				t.Fatalf("Non-zero PauseNs[%d]: %+v", i, st)
+			}
+			if st.PauseEnd[i] != 0 {
+				t.Fatalf("Non-zero PauseEnd[%d]: %+v", i, st)
+			}
+		}
 	} else {
 		if st.PauseTotalNs < pauseTotal {
 			t.Fatalf("PauseTotalNs(%d) < sum PauseNs(%d)", st.PauseTotalNs, pauseTotal)
@@ -82,6 +91,23 @@
 	}
 }
 
+func TestTinyAlloc(t *testing.T) {
+	const N = 16
+	var v [N]unsafe.Pointer
+	for i := range v {
+		v[i] = unsafe.Pointer(new(byte))
+	}
+
+	chunks := make(map[uintptr]bool, N)
+	for _, p := range v {
+		chunks[uintptr(p)&^7] = true
+	}
+
+	if len(chunks) == N {
+		t.Fatal("no bytes allocated within the same 8-byte chunk")
+	}
+}
+
 var mallocSink uintptr
 
 func BenchmarkMalloc8(b *testing.B) {
diff --git a/src/runtime/map_test.go b/src/runtime/map_test.go
index 9d2894c..496f8e8 100644
--- a/src/runtime/map_test.go
+++ b/src/runtime/map_test.go
@@ -317,6 +317,22 @@
 	}
 }
 
+func TestMapHugeZero(t *testing.T) {
+	type T [4000]byte
+	m := map[int]T{}
+	x := m[0]
+	if x != (T{}) {
+		t.Errorf("map value not zero")
+	}
+	y, ok := m[0]
+	if ok {
+		t.Errorf("map value should be missing")
+	}
+	if y != (T{}) {
+		t.Errorf("map value not zero")
+	}
+}
+
 type empty struct {
 }
 
diff --git a/src/runtime/mbarrier.go b/src/runtime/mbarrier.go
index 0dbe1ff..bf75934 100644
--- a/src/runtime/mbarrier.go
+++ b/src/runtime/mbarrier.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -13,7 +13,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // markwb is the mark-phase write barrier, the only barrier we have.
 // The rest of this file exists only to make calls to this function.
@@ -31,12 +34,12 @@
 // Dealing with memory ordering:
 //
 // Dijkstra pointed out that maintaining the no black to white
-// pointers means that white to white pointers not need
+// pointers means that white to white pointers do not need
 // to be noted by the write barrier. Furthermore if either
 // white object dies before it is reached by the
 // GC then the object can be collected during this GC cycle
 // instead of waiting for the next cycle. Unfortunately the cost of
-// ensure that the object holding the slot doesn't concurrently
+// ensuring that the object holding the slot doesn't concurrently
 // change to black without the mutator noticing seems prohibitive.
 //
 // Consider the following example where the mutator writes into
@@ -84,9 +87,20 @@
 // frames that have potentially been active since the concurrent scan,
 // so it depends on write barriers to track changes to pointers in
 // stack frames that have not been active.
-//go:nowritebarrier
+//
+//
+// Global writes:
+//
+// The Go garbage collector requires write barriers when heap pointers
+// are stored in globals. Many garbage collectors ignore writes to
+// globals and instead pick up global -> heap pointers during
+// termination. This increases pause time, so we instead rely on write
+// barriers for writes to globals so that we don't have to rescan
+// global during mark termination.
+//
+//go:nowritebarrierrec
 func gcmarkwb_m(slot *uintptr, ptr uintptr) {
-	if writeBarrierEnabled {
+	if writeBarrier.needed {
 		if ptr != 0 && inheap(ptr) {
 			shade(ptr)
 		}
@@ -97,7 +111,7 @@
 // related operations. In particular there are times when the GC assumes
 // that the world is stopped but scheduler related code is still being
 // executed, dealing with syscalls, dealing with putting gs on runnable
-// queues and so forth. This code can not execute write barriers because
+// queues and so forth. This code cannot execute write barriers because
 // the GC might drop them on the floor. Stopping the world involves removing
 // the p associated with an m. We use the fact that m.p == nil to indicate
 // that we are in one these critical section and throw if the write is of
@@ -125,10 +139,13 @@
 //go:nosplit
 func writebarrierptr(dst *uintptr, src uintptr) {
 	*dst = src
-	if !writeBarrierEnabled {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
 		return
 	}
-	if src != 0 && (src < _PhysPageSize || src == poisonStack) {
+	if src != 0 && src < sys.PhysPageSize {
 		systemstack(func() {
 			print("runtime: writebarrierptr *", dst, " = ", hex(src), "\n")
 			throw("bad pointer in write barrier")
@@ -141,44 +158,25 @@
 // Do not reapply.
 //go:nosplit
 func writebarrierptr_nostore(dst *uintptr, src uintptr) {
-	if !writeBarrierEnabled {
+	if writeBarrier.cgo {
+		cgoCheckWriteBarrier(dst, src)
+	}
+	if !writeBarrier.needed {
 		return
 	}
-	if src != 0 && (src < _PhysPageSize || src == poisonStack) {
+	if src != 0 && src < sys.PhysPageSize {
 		systemstack(func() { throw("bad pointer in write barrier") })
 	}
 	writebarrierptr_nostore1(dst, src)
 }
 
-//go:nosplit
-func writebarrierstring(dst *[2]uintptr, src [2]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-}
-
-//go:nosplit
-func writebarrierslice(dst *[3]uintptr, src [3]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	dst[2] = src[2]
-}
-
-//go:nosplit
-func writebarrieriface(dst *[2]uintptr, src [2]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-}
-
-//go:generate go run wbfat_gen.go -- wbfat.go
-//
-// The above line generates multiword write barriers for
-// all the combinations of ptr+scalar up to four words.
-// The implementations are written to wbfat.go.
-
 // typedmemmove copies a value of type t to dst from src.
 //go:nosplit
 func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
 	memmove(dst, src, typ.size)
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, 0, typ.size)
+	}
 	if typ.kind&kindNoPointers != 0 {
 		return
 	}
@@ -195,15 +193,18 @@
 //go:linkname reflect_typedmemmovepartial reflect.typedmemmovepartial
 func reflect_typedmemmovepartial(typ *_type, dst, src unsafe.Pointer, off, size uintptr) {
 	memmove(dst, src, size)
-	if !writeBarrierEnabled || typ.kind&kindNoPointers != 0 || size < ptrSize || !inheap(uintptr(dst)) {
+	if writeBarrier.cgo {
+		cgoCheckMemmove(typ, dst, src, off, size)
+	}
+	if !writeBarrier.needed || typ.kind&kindNoPointers != 0 || size < sys.PtrSize {
 		return
 	}
 
-	if frag := -off & (ptrSize - 1); frag != 0 {
+	if frag := -off & (sys.PtrSize - 1); frag != 0 {
 		dst = add(dst, frag)
 		size -= frag
 	}
-	heapBitsBulkBarrier(uintptr(dst), size&^(ptrSize-1))
+	heapBitsBulkBarrier(uintptr(dst), size&^(sys.PtrSize-1))
 }
 
 // callwritebarrier is invoked at the end of reflectcall, to execute
@@ -211,11 +212,11 @@
 // values have just been copied to frame, starting at retoffset
 // and continuing to framesize. The entire frame (not just the return
 // values) is described by typ. Because the copy has already
-// happened, we call writebarrierptr_nostore, and we must be careful
-// not to be preempted before the write barriers have been run.
+// happened, we call writebarrierptr_nostore, and this is nosplit so
+// the copy and write barrier appear atomic to GC.
 //go:nosplit
 func callwritebarrier(typ *_type, frame unsafe.Pointer, framesize, retoffset uintptr) {
-	if !writeBarrierEnabled || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < ptrSize || !inheap(uintptr(frame)) {
+	if !writeBarrier.needed || typ == nil || typ.kind&kindNoPointers != 0 || framesize-retoffset < sys.PtrSize {
 		return
 	}
 	heapBitsBulkBarrier(uintptr(add(frame, retoffset)), framesize-retoffset)
@@ -232,8 +233,8 @@
 	if n == 0 {
 		return 0
 	}
-	dstp := unsafe.Pointer(dst.array)
-	srcp := unsafe.Pointer(src.array)
+	dstp := dst.array
+	srcp := src.array
 
 	if raceenabled {
 		callerpc := getcallerpc(unsafe.Pointer(&typ))
@@ -241,12 +242,20 @@
 		racewriterangepc(dstp, uintptr(n)*typ.size, callerpc, pc)
 		racereadrangepc(srcp, uintptr(n)*typ.size, callerpc, pc)
 	}
+	if msanenabled {
+		msanwrite(dstp, uintptr(n)*typ.size)
+		msanread(srcp, uintptr(n)*typ.size)
+	}
+
+	if writeBarrier.cgo {
+		cgoCheckSliceCopy(typ, dst, src, n)
+	}
 
 	// Note: No point in checking typ.kind&kindNoPointers here:
 	// compiler only emits calls to typedslicecopy for types with pointers,
 	// and growslice and reflect_typedslicecopy check for pointers
 	// before calling typedslicecopy.
-	if !writeBarrierEnabled {
+	if !writeBarrier.needed {
 		memmove(dstp, srcp, uintptr(n)*typ.size)
 		return n
 	}
@@ -281,7 +290,7 @@
 			}
 		}
 	})
-	return int(n)
+	return n
 }
 
 //go:linkname reflect_typedslicecopy reflect.typedslicecopy
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index fd6b4b1..ccefbcd 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -24,21 +24,24 @@
 // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
 // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
 // The meaning of the high bit depends on the position of the word being described
-// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
+// in its allocated object. In all words *except* the second word, the
+// high bit indicates that the object is still being described. In
+// these words, if a bit pair with a high bit 0 is encountered, the
+// low bit can also be assumed to be 0, and the object description is
+// over. This 00 is called the ``dead'' encoding: it signals that the
+// rest of the words in the object are uninteresting to the garbage
+// collector.
+//
 // In the second word, the high bit is the GC ``checkmarked'' bit (see below).
-// In the third and later words, the high bit indicates that the object is still
-// being described. In these words, if a bit pair with a high bit 0 is encountered,
-// the low bit can also be assumed to be 0, and the object description is over.
-// This 00 is called the ``dead'' encoding: it signals that the rest of the words
-// in the object are uninteresting to the garbage collector.
 //
 // The 2-bit entries are split when written into the byte, so that the top half
-// of the byte contains 4 mark bits and the bottom half contains 4 pointer bits.
+// of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
+// bits.
 // This form allows a copy from the 1-bit to the 4-bit form to keep the
 // pointer bits contiguous, instead of having to space them out.
 //
 // The code makes use of the fact that the zero value for a heap bitmap
-// has no live pointer bit set and is (depending on position), not marked,
+// has no live pointer bit set and is (depending on position), not used,
 // not checkmarked, and is the dead encoding.
 // These properties must be preserved when modifying the encoding.
 //
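// Illustrative sketch (not from this CL): decoding one heap bitmap byte under
// the split layout described above, where the low nibble holds four pointer
// bits and the high nibble holds the four corresponding high (scan) bits.
// The constants mirror bitPointer and bitMarked below.
package main

import "fmt"

func main() {
	const (
		bitPointer = 1 << 0
		bitMarked  = 1 << 4
	)
	b := uint8(0x53) // high nibble 0101, low nibble 0011
	for shift := uint(0); shift < 4; shift++ {
		bits := uint32(b) >> shift // same form as heapBits.bits()
		fmt.Printf("word %d: pointer=%v high=%v\n",
			shift, bits&bitPointer != 0, bits&bitMarked != 0)
	}
	// word 0: pointer=true high=true; word 3: pointer=false high=false (dead)
}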
@@ -63,17 +66,22 @@
 // It is still used in general, except in checkmark the type bit is repurposed
 // as the checkmark bit and then reinitialized (to 1) as the type bit when
 // finished.
+//
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
 	bitPointer = 1 << 0
-	bitMarked  = 1 << 4
+	bitMarked  = 1 << 4 // TODO: Rename bitScan.
 
-	heapBitsShift   = 1                 // shift offset between successive bitPointer or bitMarked entries
-	heapBitmapScale = ptrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
+	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitMarked entries
+	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
 
 	// all mark/pointer bits in a byte
 	bitMarkedAll  = bitMarked | bitMarked<<heapBitsShift | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
@@ -82,6 +90,7 @@
 
 // addb returns the byte pointer p+n.
 //go:nowritebarrier
+//go:nosplit
 func addb(p *byte, n uintptr) *byte {
 	// Note: wrote out full expression instead of calling add(p, n)
 	// to reduce the number of temporaries generated by the
@@ -90,7 +99,10 @@
 }
 
 // subtractb returns the byte pointer p-n.
+// subtractb is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
 //go:nowritebarrier
+//go:nosplit
 func subtractb(p *byte, n uintptr) *byte {
 	// Note: wrote out full expression instead of calling add(p, -n)
 	// to reduce the number of temporaries generated by the
@@ -100,6 +112,7 @@
 
 // add1 returns the byte pointer p+1.
 //go:nowritebarrier
+//go:nosplit
 func add1(p *byte) *byte {
 	// Note: wrote out full expression instead of calling addb(p, 1)
 	// to reduce the number of temporaries generated by the
@@ -108,6 +121,8 @@
 }
 
 // subtract1 returns the byte pointer p-1.
+// subtract1 is typically used when traversing the pointer tables referred to by hbits
+// which are arranged in reverse order.
 //go:nowritebarrier
 //
 // nosplit because it is used during write barriers and must not be preempted.
@@ -128,7 +143,7 @@
 // after observing the change to arena_used.
 //
 //go:nowritebarrier
-func mHeap_MapBits(h *mheap, arena_used uintptr) {
+func (h *mheap) mapBits(arena_used uintptr) {
 	// Caller has added extra mappings to the arena.
 	// Add extra mappings of bitmap words as needed.
 	// We allocate extra bitmap pieces in chunks of bitmapChunk.
@@ -136,12 +151,12 @@
 
 	n := (arena_used - mheap_.arena_start) / heapBitmapScale
 	n = round(n, bitmapChunk)
-	n = round(n, _PhysPageSize)
+	n = round(n, sys.PhysPageSize)
 	if h.bitmap_mapped >= n {
 		return
 	}
 
-	sysMap(unsafe.Pointer(h.arena_start-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
+	sysMap(unsafe.Pointer(h.bitmap-n), n-h.bitmap_mapped, h.arena_reserved, &memstats.gc_sys)
 	h.bitmap_mapped = n
 }
 
@@ -154,6 +169,193 @@
 	shift uint32
 }
 
+// markBits provides access to the mark bit for an object in the heap.
+// bytep points to the byte holding the mark bit.
+// mask is a byte with a single bit set that can be &ed with *bytep
+// to see if the bit has been set.
+// *m.bytep&m.mask != 0 indicates the mark bit is set.
+// index can be used along with span information to generate
+// the address of the object in the heap.
+// We maintain one set of mark bits for allocation and one for
+// marking purposes.
+type markBits struct {
+	bytep *uint8
+	mask  uint8
+	index uintptr
+}
+
+//go:nosplit
+func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
+	whichByte := allocBitIndex / 8
+	whichBit := allocBitIndex % 8
+	bytePtr := addb(s.allocBits, whichByte)
+	return markBits{bytePtr, uint8(1 << whichBit), allocBitIndex}
+}
+
+// refillAllocCache takes 8 bytes of s.allocBits starting at whichByte
+// and negates them so that ctz (count trailing zeros) instructions
+// can be used. It then places these 8 bytes into the cached 64 bit
+// s.allocCache.
+func (s *mspan) refillAllocCache(whichByte uintptr) {
+	bytes := (*[8]uint8)(unsafe.Pointer(addb(s.allocBits, whichByte)))
+	aCache := uint64(0)
+	aCache |= uint64(bytes[0])
+	aCache |= uint64(bytes[1]) << (1 * 8)
+	aCache |= uint64(bytes[2]) << (2 * 8)
+	aCache |= uint64(bytes[3]) << (3 * 8)
+	aCache |= uint64(bytes[4]) << (4 * 8)
+	aCache |= uint64(bytes[5]) << (5 * 8)
+	aCache |= uint64(bytes[6]) << (6 * 8)
+	aCache |= uint64(bytes[7]) << (7 * 8)
+	s.allocCache = ^aCache
+}
+
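// Illustrative sketch (not from this CL): the byte-by-byte assembly above is
// a manual little-endian load of 8 allocation-bitmap bytes, negated so that
// set bits mark *free* objects. encoding/binary shows the equivalence (the
// runtime avoids the import and writes the load out by hand).
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	bytes := [8]uint8{0xff, 0x01, 0, 0, 0, 0, 0, 0} // objects 0-8 allocated
	var aCache uint64
	for i, b := range bytes {
		aCache |= uint64(b) << (uint(i) * 8)
	}
	allocCache := ^aCache
	fmt.Println(aCache == binary.LittleEndian.Uint64(bytes[:])) // true
	fmt.Printf("allocCache = %#x\n", allocCache)                // 9 trailing zeros: 9 objects in use
}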
+// nextFreeIndex returns the index of the next free object in s at
+// or after s.freeindex.
+// There are hardware instructions that can be used to make this
+// faster if profiling warrants it.
+func (s *mspan) nextFreeIndex() uintptr {
+	sfreeindex := s.freeindex
+	snelems := s.nelems
+	if sfreeindex == snelems {
+		return sfreeindex
+	}
+	if sfreeindex > snelems {
+		throw("s.freeindex > s.nelems")
+	}
+
+	aCache := s.allocCache
+
+	bitIndex := sys.Ctz64(aCache)
+	for bitIndex == 64 {
+		// Move index to start of next cached bits.
+		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
+		if sfreeindex >= snelems {
+			s.freeindex = snelems
+			return snelems
+		}
+		whichByte := sfreeindex / 8
+		// Refill s.allocCache with the next 64 alloc bits.
+		s.refillAllocCache(whichByte)
+		aCache = s.allocCache
+		bitIndex = sys.Ctz64(aCache)
+		// nothing available in cached bits
+		// grab the next 8 bytes and try again.
+	}
+	result := sfreeindex + uintptr(bitIndex)
+	if result >= snelems {
+		s.freeindex = snelems
+		return snelems
+	}
+
+	s.allocCache >>= (bitIndex + 1)
+	sfreeindex = result + 1
+
+	if sfreeindex%64 == 0 && sfreeindex != snelems {
+		// We just incremented s.freeindex so it isn't 0.
+		// As each 1 in s.allocCache was encountered and used for allocation
+		// it was shifted away. At this point s.allocCache contains all 0s.
+		// Refill s.allocCache so that it corresponds
+		// to the bits at s.allocBits starting at s.freeindex.
+		whichByte := sfreeindex / 8
+		s.refillAllocCache(whichByte)
+	}
+	s.freeindex = sfreeindex
+	return result
+}
+
+func (s *mspan) isFree(index uintptr) bool {
+	whichByte := index / 8
+	whichBit := index % 8
+	byteVal := *addb(s.allocBits, whichByte)
+	return byteVal&uint8(1<<whichBit) == 0
+}
+
+func (s *mspan) objIndex(p uintptr) uintptr {
+	byteOffset := p - s.base()
+	if byteOffset == 0 {
+		return 0
+	}
+	if s.baseMask != 0 {
+		// s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift
+		return byteOffset >> s.divShift
+	}
+	return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2)
+}
+
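// Illustrative sketch (not from this CL): the divide-by-multiply used in
// objIndex above. For elemsize = m<<k with m odd, off/elemsize equals
// ((off>>k)*mul)>>s2 with mul close to 2^s2/m. The constants below are made
// up for elemsize 48 (the runtime derives its table elsewhere); the loop
// verifies the identity for every offset that can occur in an 8KB span.
package main

import "fmt"

func main() {
	const (
		elemsize  = 48 // 3 << 4
		divShift  = 4
		divShift2 = 26
		divMul    = (1<<divShift2 + 2) / 3 // ceil(2^26 / 3) = 22369622
	)
	for off := uintptr(0); off < 8192; off++ {
		// Same operator shape as the runtime: ((off>>shift)*mul)>>shift2.
		got := uintptr(uint64(off) >> divShift * uint64(divMul) >> divShift2)
		if got != off/elemsize {
			fmt.Println("mismatch at offset", off)
			return
		}
	}
	fmt.Println("((off>>4)*22369622)>>26 == off/48 for all span offsets")
}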
+func markBitsForAddr(p uintptr) markBits {
+	s := spanOf(p)
+	objIndex := s.objIndex(p)
+	return s.markBitsForIndex(objIndex)
+}
+
+func (s *mspan) markBitsForIndex(objIndex uintptr) markBits {
+	whichByte := objIndex / 8
+	bitMask := uint8(1 << (objIndex % 8)) // low 3 bits hold the bit index
+	bytePtr := addb(s.gcmarkBits, whichByte)
+	return markBits{bytePtr, bitMask, objIndex}
+}
+
+func (s *mspan) markBitsForBase() markBits {
+	return markBits{s.gcmarkBits, uint8(1), 0}
+}
+
+// isMarked reports whether mark bit m is set.
+func (m markBits) isMarked() bool {
+	return *m.bytep&m.mask != 0
+}
+
+// setMarked sets the marked bit in the markbits, atomically. Some compilers
+// are not able to inline atomic.Or8 function so if it appears as a hot spot consider
+// inlining it manually.
+func (m markBits) setMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.Or8(m.bytep, m.mask)
+}
+
+// setMarkedNonAtomic sets the marked bit in the markbits, non-atomically.
+func (m markBits) setMarkedNonAtomic() {
+	*m.bytep |= m.mask
+}
+
+// clearMarked clears the marked bit in the markbits, atomically.
+func (m markBits) clearMarked() {
+	// Might be racing with other updates, so use atomic update always.
+	// We used to be clever here and use a non-atomic update in certain
+	// cases, but it's not worth the risk.
+	atomic.And8(m.bytep, ^m.mask)
+}
+
+// clearMarkedNonAtomic clears the marked bit in the markbits, non-atomically.
+// Note that it XORs the mask, so the bit must currently be set.
+func (m markBits) clearMarkedNonAtomic() {
+	*m.bytep ^= m.mask
+}
+
+// markBitsForSpan returns the markBits for the span base address base.
+func markBitsForSpan(base uintptr) (mbits markBits) {
+	if base < mheap_.arena_start || base >= mheap_.arena_used {
+		throw("markBitsForSpan: base out of range")
+	}
+	mbits = markBitsForAddr(base)
+	if mbits.mask != 1 {
+		throw("markBitsForSpan: unaligned start")
+	}
+	return mbits
+}
+
+// advance advances the markBits to the next object in the span.
+func (m *markBits) advance() {
+	if m.mask == 1<<7 {
+		m.bytep = (*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(m.bytep)) + 1))
+		m.mask = 1
+	} else {
+		m.mask = m.mask << 1
+	}
+	m.index++
+}
+
 // heapBitsForAddr returns the heapBits for the address addr.
 // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
 //
@@ -161,8 +363,8 @@
 //go:nosplit
 func heapBitsForAddr(addr uintptr) heapBits {
 	// 2 bits per word, 4 pairs per byte, and a mask is hard coded.
-	off := (addr - mheap_.arena_start) / ptrSize
-	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(off & 3)}
+	off := (addr - mheap_.arena_start) / sys.PtrSize
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.bitmap - off/4 - 1)), uint32(off & 3)}
 }
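// Illustrative sketch (not from this CL): heap-bitmap addressing. The bitmap
// grows *downward* from h.bitmap, with four 2-bit entries per byte, so word i
// of the arena is described by byte h.bitmap - i/4 - 1 at shift i%4. Plain
// integers stand in for real addresses here.
package main

import "fmt"

func main() {
	const ptrSize = 8
	arenaStart := uintptr(0x10000)
	bitmapTop := uintptr(0x8000) // h.bitmap: first byte above the bitmap
	for _, addr := range []uintptr{0x10000, 0x10008, 0x10020} {
		off := (addr - arenaStart) / ptrSize
		bytep := bitmapTop - off/4 - 1
		fmt.Printf("addr %#x -> bitmap byte %#x, shift %d\n", addr, bytep, off&3)
	}
}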
 
 // heapBitsForSpan returns the heapBits for the span base address base.
@@ -170,19 +372,20 @@
 	if base < mheap_.arena_start || base >= mheap_.arena_used {
 		throw("heapBitsForSpan: base out of range")
 	}
-	hbits = heapBitsForAddr(base)
-	if hbits.shift != 0 {
-		throw("heapBitsForSpan: unaligned start")
-	}
-	return hbits
+	return heapBitsForAddr(base)
 }
 
 // heapBitsForObject returns the base address for the heap object
-// containing the address p, along with the heapBits for base.
+// containing the address p, the heapBits for base,
+// the object's span, and of the index of the object in s.
 // If p does not point into a heap object,
 // return base == 0
 // otherwise return the base of the object.
-func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
+//
+// refBase and refOff optionally give the base address of the object
+// in which the pointer p was found and the byte offset at which it
+// was found. These are used for error reporting.
+func heapBitsForObject(p, refBase, refOff uintptr) (base uintptr, hbits heapBits, s *mspan, objIndex uintptr) {
 	arenaStart := mheap_.arena_start
 	if p < arenaStart || p >= mheap_.arena_used {
 		return
@@ -191,9 +394,8 @@
 	idx := off >> _PageShift
 	// p points into the heap, but possibly to the middle of an object.
 	// Consult the span table to find the block beginning.
-	k := p >> _PageShift
 	s = h_spans[idx]
-	if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
+	if s == nil || p < s.base() || p >= s.limit || s.state != mSpanInUse {
 		if s == nil || s.state == _MSpanStack {
 			// If s is nil, the virtual address has never been part of the heap.
 			// This pointer may be to some mmap'd region, so we allow it.
@@ -203,18 +405,28 @@
 
 		// The following ensures that we are rigorous about what data
 		// structures hold valid pointers.
-		// TODO(rsc): Check if this still happens.
 		if debug.invalidptr != 0 {
-			// Still happens sometimes. We don't know why.
+			// Typically this indicates an incorrect use
+			// of unsafe or cgo to store a bad pointer in
+			// the Go heap. It may also indicate a runtime
+			// bug.
+			//
+			// TODO(austin): We could be more aggressive
+			// and detect pointers to unallocated objects
+			// in allocated spans.
 			printlock()
-			print("runtime:objectstart Span weird: p=", hex(p), " k=", hex(k))
-			if s == nil {
-				print(" s=nil\n")
+			print("runtime: pointer ", hex(p))
+			if s.state != mSpanInUse {
+				print(" to unallocated span")
 			} else {
-				print(" s.start=", hex(s.start<<_PageShift), " s.limit=", hex(s.limit), " s.state=", s.state, "\n")
+				print(" to unused region of span")
 			}
-			printunlock()
-			throw("objectstart: bad pointer in unexpected span")
+			print("idx=", hex(idx), " span.base()=", hex(s.base()), " span.limit=", hex(s.limit), " span.state=", s.state, "\n")
+			if refBase != 0 {
+				print("runtime: found in object at *(", hex(refBase), "+", hex(refOff), ")\n")
+				gcDumpObject("object", refBase, refOff)
+			}
+			throw("found bad pointer in Go heap (incorrect use of unsafe or cgo?)")
 		}
 		return
 	}
@@ -224,6 +436,7 @@
 		// optimize for power of 2 sized objects.
 		base = s.base()
 		base = base + (p-base)&s.baseMask
+		objIndex = (base - s.base()) >> s.divShift
 		// base = p & s.baseMask is faster for small spans,
 		// but doesn't work for large spans.
 		// Overall, it's faster to use the more general computation above.
@@ -231,8 +444,8 @@
 		base = s.base()
 		if p-base >= s.elemsize {
 			// n := (p - base) / s.elemsize, using division by multiplication
-			n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
-			base += n * s.elemsize
+			objIndex = uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
+			base += objIndex * s.elemsize
 		}
 	}
 	// Now that we know the actual base, compute heapBits to return to caller.
@@ -272,64 +485,34 @@
 // The result includes in its higher bits the bits for subsequent words
 // described by the same bitmap byte.
 func (h heapBits) bits() uint32 {
-	return uint32(*h.bitp) >> h.shift
+	// The (shift & 31) eliminates a test and conditional branch
+	// from the generated code.
+	return uint32(*h.bitp) >> (h.shift & 31)
 }
 
-// isMarked reports whether the heap bits have the marked bit set.
-// h must describe the initial word of the object.
-func (h heapBits) isMarked() bool {
-	return *h.bitp&(bitMarked<<h.shift) != 0
-}
-
-// setMarked sets the marked bit in the heap bits, atomically.
-// h must describe the initial word of the object.
-func (h heapBits) setMarked() {
-	// Each byte of GC bitmap holds info for four words.
-	// Might be racing with other updates, so use atomic update always.
-	// We used to be clever here and use a non-atomic update in certain
-	// cases, but it's not worth the risk.
-	atomicor8(h.bitp, bitMarked<<h.shift)
-}
-
-// setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
-// h must describe the initial word of the object.
-func (h heapBits) setMarkedNonAtomic() {
-	*h.bitp |= bitMarked << h.shift
+// morePointers returns true if the object may contain pointers in words
+// beyond this one; a false result means this word and all remaining words
+// in this object are scalars (the ``dead'' encoding).
+// h must not describe the second word of the object.
+func (h heapBits) morePointers() bool {
+	return h.bits()&bitMarked != 0
 }
 
 // isPointer reports whether the heap bits describe a pointer word.
-// h must describe the initial word of the object.
 //
 // nosplit because it is used during write barriers and must not be preempted.
 //go:nosplit
 func (h heapBits) isPointer() bool {
-	return (*h.bitp>>h.shift)&bitPointer != 0
+	return h.bits()&bitPointer != 0
 }
 
 // hasPointers reports whether the given object has any pointers.
-// It must be told how large the object at h is, so that it does not read too
-// far into the bitmap.
+// It must be told how large the object at h is for efficiency.
 // h must describe the initial word of the object.
 func (h heapBits) hasPointers(size uintptr) bool {
-	if size == ptrSize { // 1-word objects are always pointers
+	if size == sys.PtrSize { // 1-word objects are always pointers
 		return true
 	}
-	// Otherwise, at least a 2-word object, and at least 2-word aligned,
-	// so h.shift is either 0 or 4, so we know we can get the bits for the
-	// first two words out of *h.bitp.
-	// If either of the first two words is a pointer, not pointer free.
-	b := uint32(*h.bitp >> h.shift)
-	if b&(bitPointer|bitPointer<<heapBitsShift) != 0 {
-		return true
-	}
-	if size == 2*ptrSize {
-		return false
-	}
-	// At least a 4-word object. Check scan bit (aka marked bit) in third word.
-	if h.shift == 0 {
-		return b&(bitMarked<<(2*heapBitsShift)) != 0
-	}
-	return uint32(*subtract1(h.bitp))&bitMarked != 0
+	return (*h.bitp>>h.shift)&bitMarked != 0
 }
 
 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
@@ -337,7 +520,7 @@
 // checkmark bit varies by size.
 // h must describe the initial word of the object.
 func (h heapBits) isCheckmarked(size uintptr) bool {
-	if size == ptrSize {
+	if size == sys.PtrSize {
 		return (*h.bitp>>h.shift)&bitPointer != 0
 	}
 	// All multiword objects are 2-word aligned,
@@ -352,19 +535,19 @@
 // checkmark bit varies by size.
 // h must describe the initial word of the object.
 func (h heapBits) setCheckmarked(size uintptr) {
-	if size == ptrSize {
-		atomicor8(h.bitp, bitPointer<<h.shift)
+	if size == sys.PtrSize {
+		atomic.Or8(h.bitp, bitPointer<<h.shift)
 		return
 	}
-	atomicor8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
+	atomic.Or8(h.bitp, bitMarked<<(heapBitsShift+h.shift))
 }
 
 // heapBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
-// using the heap bitmap to locate those pointer slots.
+// using the heap, data, or BSS bitmap to locate those pointer slots.
 // This executes the write barriers necessary after a memmove.
 // Both p and size must be pointer-aligned.
-// The range [p, p+size) must lie within a single allocation.
+// The range [p, p+size) must lie within a single object.
 //
 // Callers should call heapBitsBulkBarrier immediately after
 // calling memmove(p, src, size). This function is marked nosplit
@@ -378,10 +561,10 @@
 //
 //go:nosplit
 func heapBitsBulkBarrier(p, size uintptr) {
-	if (p|size)&(ptrSize-1) != 0 {
+	if (p|size)&(sys.PtrSize-1) != 0 {
 		throw("heapBitsBulkBarrier: unaligned arguments")
 	}
-	if !writeBarrierEnabled {
+	if !writeBarrier.needed {
 		return
 	}
 	if !inheap(p) {
@@ -408,12 +591,28 @@
 			systemstack(func() {
 				gcUnwindBarriers(gp, p)
 			})
+			return
+		}
+
+		// If p is a global, use the data or BSS bitmaps to
+		// execute write barriers.
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.data <= p && p < datap.edata {
+				bulkBarrierBitmap(p, size, p-datap.data, datap.gcdatamask.bytedata)
+				return
+			}
+		}
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			if datap.bss <= p && p < datap.ebss {
+				bulkBarrierBitmap(p, size, p-datap.bss, datap.gcbssmask.bytedata)
+				return
+			}
 		}
 		return
 	}
 
 	h := heapBitsForAddr(p)
-	for i := uintptr(0); i < size; i += ptrSize {
+	for i := uintptr(0); i < size; i += sys.PtrSize {
 		if h.isPointer() {
 			x := (*uintptr)(unsafe.Pointer(p + i))
 			writebarrierptr_nostore(x, *x)
@@ -422,6 +621,36 @@
 	}
 }
 
+// bulkBarrierBitmap executes write barriers for [p, p+size) using a
+// 1-bit pointer bitmap. p is assumed to start maskOffset bytes into
+// the data covered by the bitmap in bits.
+//
+// This is used by heapBitsBulkBarrier for writes to data and BSS.
+//
+//go:nosplit
+func bulkBarrierBitmap(p, size, maskOffset uintptr, bits *uint8) {
+	word := maskOffset / sys.PtrSize
+	bits = addb(bits, word/8)
+	mask := uint8(1) << (word % 8)
+
+	for i := uintptr(0); i < size; i += sys.PtrSize {
+		if mask == 0 {
+			bits = addb(bits, 1)
+			if *bits == 0 {
+				// Skip 8 words.
+				i += 7 * sys.PtrSize
+				continue
+			}
+			mask = 1
+		}
+		if *bits&mask != 0 {
+			x := (*uintptr)(unsafe.Pointer(p + i))
+			writebarrierptr_nostore(x, *x)
+		}
+		mask <<= 1
+	}
+}
+
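// Illustrative sketch (not from this CL): the bitmap walk above, shown
// printing pointer-word indexes instead of issuing write barriers. A zero
// bitmap byte makes the loop skip eight words at once, because mask stays 0
// and the continue re-enters the byte-advance branch.
package main

import "fmt"

func main() {
	const ptrSize = 8
	bitmap := []uint8{0x05, 0x00, 0x81} // pointer bits for 24 words
	size := uintptr(len(bitmap)) * 8 * ptrSize
	bi, mask := 0, uint8(1)
	for i := uintptr(0); i < size; i += ptrSize {
		if mask == 0 {
			bi++
			if bitmap[bi] == 0 {
				i += 7 * ptrSize // skip 8 words
				continue
			}
			mask = 1
		}
		if bitmap[bi]&mask != 0 {
			fmt.Println("pointer at word", i/ptrSize) // words 0, 2, 16, 23
		}
		mask <<= 1
	}
}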
 // typeBitsBulkBarrier executes writebarrierptr_nostore
 // for every pointer slot in the memory range [p, p+size),
 // using the type bitmap to locate those pointer slots.
@@ -441,20 +670,20 @@
 		throw("runtime: typeBitsBulkBarrier without type")
 	}
 	if typ.size != size {
-		println("runtime: typeBitsBulkBarrier with type ", *typ._string, " of size ", typ.size, " but memory size", size)
+		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " of size ", typ.size, " but memory size", size)
 		throw("runtime: invalid typeBitsBulkBarrier")
 	}
 	if typ.kind&kindGCProg != 0 {
-		println("runtime: typeBitsBulkBarrier with type ", *typ._string, " with GC prog")
+		println("runtime: typeBitsBulkBarrier with type ", typ.string(), " with GC prog")
 		throw("runtime: invalid typeBitsBulkBarrier")
 	}
-	if !writeBarrierEnabled {
+	if !writeBarrier.needed {
 		return
 	}
 	ptrmask := typ.gcdata
 	var bits uint32
-	for i := uintptr(0); i < typ.ptrdata; i += ptrSize {
-		if i&(ptrSize*8-1) == 0 {
+	for i := uintptr(0); i < typ.ptrdata; i += sys.PtrSize {
+		if i&(sys.PtrSize*8-1) == 0 {
 			bits = uint32(*ptrmask)
 			ptrmask = addb(ptrmask, 1)
 		} else {
@@ -476,16 +705,32 @@
 // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
 
 // initSpan initializes the heap bitmap for a span.
-func (h heapBits) initSpan(size, n, total uintptr) {
+// It clears all checkmark bits.
+// If this is a span of pointer-sized objects, it initializes all
+// words to pointer/scan.
+// Otherwise, it initializes all words to scalar/dead.
+func (h heapBits) initSpan(s *mspan) {
+	size, n, total := s.layout()
+
+	// Init the markbit structures
+	s.freeindex = 0
+	s.allocCache = ^uint64(0) // all 1s indicating all free.
+	s.nelems = n
+	s.allocBits = nil
+	s.gcmarkBits = nil
+	s.gcmarkBits = newMarkBits(s.nelems)
+	s.allocBits = newAllocBits(s.nelems)
+
+	// Clear bits corresponding to objects.
 	if total%heapBitmapScale != 0 {
 		throw("initSpan: unaligned length")
 	}
 	nbyte := total / heapBitmapScale
-	if ptrSize == 8 && size == ptrSize {
+	if sys.PtrSize == 8 && size == sys.PtrSize {
 		end := h.bitp
 		bitp := subtractb(end, nbyte-1)
 		for {
-			*bitp = bitPointerAll
+			*bitp = bitPointerAll | bitMarkedAll
 			if bitp == end {
 				break
 			}
@@ -500,7 +745,7 @@
 // It clears the checkmark bits, which are set to 1 in normal operation.
 func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
 	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
-	if ptrSize == 8 && size == ptrSize {
+	if sys.PtrSize == 8 && size == sys.PtrSize {
 		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
 		// Must clear type bit (checkmark bit) of every word.
@@ -514,7 +759,7 @@
 	}
 	for i := uintptr(0); i < n; i++ {
 		*h.bitp &^= bitMarked << (heapBitsShift + h.shift)
-		h = h.forward(size / ptrSize)
+		h = h.forward(size / sys.PtrSize)
 	}
 }
 
@@ -524,7 +769,7 @@
 // but consulted by typedmemmove.)
 func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
-	if ptrSize == 8 && size == ptrSize {
+	if sys.PtrSize == 8 && size == sys.PtrSize {
 		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
 		// Must clear type bit (checkmark bit) of every word.
@@ -537,106 +782,60 @@
 	}
 }
 
-// heapBitsSweepSpan coordinates the sweeping of a span by reading
-// and updating the corresponding heap bitmap entries.
-// For each free object in the span, heapBitsSweepSpan sets the type
-// bits for the first two words (or one for single-word objects) to typeDead
-// and then calls f(p), where p is the object's base address.
-// f is expected to add the object to a free list.
-// For non-free objects, heapBitsSweepSpan turns off the marked bit.
-func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
-	h := heapBitsForSpan(base)
-	switch {
-	default:
-		throw("heapBitsSweepSpan")
-	case ptrSize == 8 && size == ptrSize:
-		// Consider mark bits in all four 2-bit entries of each bitmap byte.
-		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 4 {
-			x := uint32(*bitp)
-			// Note that unlike the other size cases, we leave the pointer bits set here.
-			// These are initialized during initSpan when the span is created and left
-			// in place the whole time the span is used for pointer-sized objects.
-			// That lets heapBitsSetType avoid an atomic update to set the pointer bit
-			// during allocation.
-			if x&bitMarked != 0 {
-				x &^= bitMarked
-			} else {
-				f(base + i*ptrSize)
-			}
-			if x&(bitMarked<<heapBitsShift) != 0 {
-				x &^= bitMarked << heapBitsShift
-			} else {
-				f(base + (i+1)*ptrSize)
-			}
-			if x&(bitMarked<<(2*heapBitsShift)) != 0 {
-				x &^= bitMarked << (2 * heapBitsShift)
-			} else {
-				f(base + (i+2)*ptrSize)
-			}
-			if x&(bitMarked<<(3*heapBitsShift)) != 0 {
-				x &^= bitMarked << (3 * heapBitsShift)
-			} else {
-				f(base + (i+3)*ptrSize)
-			}
-			*bitp = uint8(x)
-			bitp = subtract1(bitp)
-		}
+// oneBitCount is indexed by byte and produces the
+// number of 1 bits in that byte. For example, 128 has 1 bit set
+// and oneBitCount[128] will hold 1.
+var oneBitCount = [256]uint8{
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	5, 6, 6, 7, 6, 7, 7, 8}
 
-	case size%(4*ptrSize) == 0:
-		// Mark bit is in first word of each object.
-		// Each object starts at bit 0 of a heap bitmap byte.
-		bitp := h.bitp
-		step := size / heapBitmapScale
-		for i := uintptr(0); i < n; i++ {
-			x := uint32(*bitp)
-			if x&bitMarked != 0 {
-				x &^= bitMarked
-			} else {
-				x = 0
-				f(base + i*size)
-			}
-			*bitp = uint8(x)
-			bitp = subtractb(bitp, step)
-		}
-
-	case size%(4*ptrSize) == 2*ptrSize:
-		// Mark bit is in first word of each object,
-		// but every other object starts halfway through a heap bitmap byte.
-		// Unroll loop 2x to handle alternating shift count and step size.
-		bitp := h.bitp
-		step := size / heapBitmapScale
-		var i uintptr
-		for i = uintptr(0); i < n; i += 2 {
-			x := uint32(*bitp)
-			if x&bitMarked != 0 {
-				x &^= bitMarked
-			} else {
-				x &^= bitMarked | bitPointer | (bitMarked|bitPointer)<<heapBitsShift
-				f(base + i*size)
-				if size > 2*ptrSize {
-					x = 0
-				}
-			}
-			*bitp = uint8(x)
-			if i+1 >= n {
-				break
-			}
-			bitp = subtractb(bitp, step)
-			x = uint32(*bitp)
-			if x&(bitMarked<<(2*heapBitsShift)) != 0 {
-				x &^= bitMarked << (2 * heapBitsShift)
-			} else {
-				x &^= (bitMarked|bitPointer)<<(2*heapBitsShift) | (bitMarked|bitPointer)<<(3*heapBitsShift)
-				f(base + (i+1)*size)
-				if size > 2*ptrSize {
-					*subtract1(bitp) = 0
-				}
-			}
-			*bitp = uint8(x)
-			bitp = subtractb(bitp, step+1)
-		}
+// countFree runs through the mark bits in a span and counts the number of free objects
+// in the span.
+// TODO(rlh): Use popcount intrinsic.
+func (s *mspan) countFree() int {
+	count := 0
+	maxIndex := s.nelems / 8
+	for i := uintptr(0); i < maxIndex; i++ {
+		mrkBits := *addb(s.gcmarkBits, i)
+		count += int(oneBitCount[mrkBits])
 	}
+	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
+		mrkBits := *addb(s.gcmarkBits, maxIndex)
+		mask := uint8((1 << bitsInLastByte) - 1)
+		bits := mrkBits & mask
+		count += int(oneBitCount[bits])
+	}
+	return int(s.nelems) - count
 }
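// A quick self-check (hypothetical, not part of this CL) that the
// oneBitCount table above agrees with a software popcount; on Go 1.9 and
// later, math/bits.OnesCount8 is the intrinsic the TODO asks for:
//
//	func popcount8(x uint8) uint8 {
//		var n uint8
//		for ; x != 0; x &= x - 1 { // clear the lowest set bit
//			n++
//		}
//		return n
//	}
//
//	for i := 0; i < 256; i++ {
//		if oneBitCount[i] != popcount8(uint8(i)) {
//			panic("oneBitCount mismatch")
//		}
//	}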
 
 // heapBitsSetType records that the new allocation [x, x+size)
@@ -648,16 +847,23 @@
 // malloc does not call heapBitsSetType when there are no pointers,
 // because all free objects are marked as noscan during
 // heapBitsSweepSpan.
+//
 // There can only be one allocation from a given span active at a time,
-// so this code is not racing with other instances of itself,
-// and we don't allocate from a span until it has been swept,
-// so this code is not racing with heapBitsSweepSpan.
-// It is, however, racing with the concurrent GC mark phase,
-// which can be setting the mark bit in the leading 2-bit entry
-// of an allocated block. The block we are modifying is not quite
-// allocated yet, so the GC marker is not racing with updates to x's bits,
-// but if the start or end of x shares a bitmap byte with an adjacent
-// object, the GC marker is racing with updates to those object's mark bits.
+// and the bitmap for a span always falls on byte boundaries,
+// so there are no write-write races for access to the heap bitmap.
+// Hence, heapBitsSetType can access the bitmap without atomics.
+//
+// There can be read-write races between heapBitsSetType and things
+// that read the heap bitmap like scanobject. However, since
+// heapBitsSetType is only used for objects that have not yet been
+// made reachable, readers will ignore bits being modified by this
+// function. This does mean this function cannot transiently modify
+// bits that belong to neighboring objects. Also, on weakly-ordered
+// machines, callers must execute a store/store (publication) barrier
+// between calling this function and making the object reachable.
+//
+// TODO: This still has atomic accesses left over from when it could
+// race with GC accessing mark bits in the bitmap. Remove these.
 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	const doubleCheck = false // slow but helpful; enable to test modifications to this code
 
@@ -666,10 +872,10 @@
 	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
 	// arbitrarily larger.
 	//
-	// The checks for size == ptrSize and size == 2*ptrSize can therefore
+	// The checks for size == sys.PtrSize and size == 2*sys.PtrSize can therefore
 	// assume that dataSize == size without checking it explicitly.
 
-	if ptrSize == 8 && size == ptrSize {
+	if sys.PtrSize == 8 && size == sys.PtrSize {
 		// It's one word and it has pointers, so it must be a pointer.
 		// In general we'd need an atomic update here if the
 		// concurrent GC were marking objects in this span,
@@ -683,6 +889,9 @@
 			if !h.isPointer() {
 				throw("heapBitsSetType: pointer bit missing")
 			}
+			if !h.morePointers() {
+				throw("heapBitsSetType: scan bit missing")
+			}
 		}
 		return
 	}
@@ -695,8 +904,8 @@
 	// This is called out as a special case primarily for 32-bit systems,
 	// so that on 32-bit systems the code below can assume all objects
 	// are 4-word aligned (because they're all 16-byte aligned).
-	if size == 2*ptrSize {
-		if typ.size == ptrSize {
+	if size == 2*sys.PtrSize {
+		if typ.size == sys.PtrSize {
 			// We're allocating a block big enough to hold two pointers.
 			// On 64-bit, that means the actual object must be two pointers,
 			// or else we'd have used the one-pointer-sized block.
@@ -705,36 +914,51 @@
 			// just the smallest block available. Distinguish by checking dataSize.
 			// (In general the number of instances of typ being allocated is
 			// dataSize/typ.size.)
-			if ptrSize == 4 && dataSize == ptrSize {
-				// 1 pointer.
+			if sys.PtrSize == 4 && dataSize == sys.PtrSize {
+				// 1 pointer object. On 32-bit machines clear the bit for the
+				// unused second word.
 				if gcphase == _GCoff {
-					*h.bitp |= bitPointer << h.shift
+					*h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift
+					*h.bitp |= (bitPointer | bitMarked) << h.shift
 				} else {
-					atomicor8(h.bitp, bitPointer<<h.shift)
+					atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<<heapBitsShift))<<h.shift))
+					atomic.Or8(h.bitp, (bitPointer|bitMarked)<<h.shift)
 				}
 			} else {
 				// 2-element slice of pointer.
 				if gcphase == _GCoff {
-					*h.bitp |= (bitPointer | bitPointer<<heapBitsShift) << h.shift
+					*h.bitp |= (bitPointer | bitMarked | bitPointer<<heapBitsShift) << h.shift
 				} else {
-					atomicor8(h.bitp, (bitPointer|bitPointer<<heapBitsShift)<<h.shift)
+					atomic.Or8(h.bitp, (bitPointer|bitMarked|bitPointer<<heapBitsShift)<<h.shift)
 				}
 			}
 			return
 		}
-		// Otherwise typ.size must be 2*ptrSize, and typ.kind&kindGCProg == 0.
+		// Otherwise typ.size must be 2*sys.PtrSize,
+		// and typ.kind&kindGCProg == 0.
 		if doubleCheck {
-			if typ.size != 2*ptrSize || typ.kind&kindGCProg != 0 {
+			if typ.size != 2*sys.PtrSize || typ.kind&kindGCProg != 0 {
 				print("runtime: heapBitsSetType size=", size, " but typ.size=", typ.size, " gcprog=", typ.kind&kindGCProg != 0, "\n")
 				throw("heapBitsSetType")
 			}
 		}
 		b := uint32(*ptrmask)
-		hb := b & 3
+		hb := (b & 3) | bitMarked
 		if gcphase == _GCoff {
+			// bitPointer == 1, bitMarked is 1 << 4, heapBitsShift is 1.
+			// 110011 is shifted h.shift and complemented.
+			// This clears out the bits that are about to be
+			// ored into *h.hbitp in the next instructions.
+			*h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift
 			*h.bitp |= uint8(hb << h.shift)
 		} else {
-			atomicor8(h.bitp, uint8(hb<<h.shift))
+			// TODO(rlh): since the GC no longer sets mark bits in the
+			// heap bitmap concurrently, and malloc owns the span we
+			// are allocating from, why does this have to be atomic?
+
+			atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<<heapBitsShift))<<h.shift))
+			atomic.Or8(h.bitp, uint8(hb<<h.shift))
 		}
 		return
 	}
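// Checking the "110011" arithmetic above against the constants this file
// defines (bitPointer = 1<<0, bitMarked = 1<<4, heapBitsShift = 1):
//
//	bitPointer | bitMarked                    = 0b010001
//	(bitPointer | bitMarked) << heapBitsShift = 0b100010
//	union of the two (the mask complemented)  = 0b110011
//
// so the And8/&^ clears exactly the pointer and marked bits of the first
// two words before the Or8/| writes the new values.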
@@ -825,8 +1049,8 @@
 		// Filling in bits for an array of typ.
 		// Set up for repetition of ptrmask during main loop.
 		// Note that ptrmask describes only a prefix of
-		const maxBits = ptrSize*8 - 7
-		if typ.ptrdata/ptrSize <= maxBits {
+		const maxBits = sys.PtrSize*8 - 7
+		if typ.ptrdata/sys.PtrSize <= maxBits {
 			// Entire ptrmask fits in uintptr with room for a byte fragment.
 			// Load into pbits and never read from ptrmask again.
 			// This is especially important when the ptrmask has
@@ -837,23 +1061,23 @@
 			// Accumulate ptrmask into b.
 			// ptrmask is sized to describe only typ.ptrdata, but we record
 			// it as describing typ.size bytes, since all the high bits are zero.
-			nb = typ.ptrdata / ptrSize
+			nb = typ.ptrdata / sys.PtrSize
 			for i := uintptr(0); i < nb; i += 8 {
 				b |= uintptr(*p) << i
 				p = add1(p)
 			}
-			nb = typ.size / ptrSize
+			nb = typ.size / sys.PtrSize
 
 			// Replicate ptrmask to fill entire pbits uintptr.
 			// Doubling and truncating is fewer steps than
 			// iterating by nb each time. (nb could be 1.)
-			// Since we loaded typ.ptrdata/ptrSize bits
-			// but are pretending to have typ.size/ptrSize,
+			// Since we loaded typ.ptrdata/sys.PtrSize bits
+			// but are pretending to have typ.size/sys.PtrSize,
 			// there might be no replication necessary/possible.
 			pbits = b
 			endnb = nb
 			if nb+nb <= maxBits {
-				for endnb <= ptrSize*8 {
+				for endnb <= sys.PtrSize*8 {
 					pbits |= pbits << endnb
 					endnb += endnb
 				}
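// A worked example of the doubling above, with hypothetical values: for a
// 3-word pattern b = 0b101 (nb = 3),
//
//	pbits = 0b101          endnb = 3
//	pbits = 0b101101       endnb = 6
//	pbits = 0b101101101101 endnb = 12
//
// and so on until endnb exceeds sys.PtrSize*8, leaving pbits holding the
// pattern repeated across the entire uintptr.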
@@ -870,9 +1094,9 @@
 			endp = nil
 		} else {
 			// Ptrmask is larger. Read it multiple times.
-			n := (typ.ptrdata/ptrSize+7)/8 - 1
+			n := (typ.ptrdata/sys.PtrSize+7)/8 - 1
 			endp = addb(ptrmask, n)
-			endnb = typ.size/ptrSize - n*8
+			endnb = typ.size/sys.PtrSize - n*8
 		}
 	}
 	if p != nil {
@@ -883,16 +1107,16 @@
 
 	if typ.size == dataSize {
 		// Single entry: can stop once we reach the non-pointer data.
-		nw = typ.ptrdata / ptrSize
+		nw = typ.ptrdata / sys.PtrSize
 	} else {
 		// Repeated instances of typ in an array.
 		// Have to process first N-1 entries in full, but can stop
 		// once we reach the non-pointer data in the final entry.
-		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / ptrSize
+		nw = ((dataSize/typ.size-1)*typ.size + typ.ptrdata) / sys.PtrSize
 	}
 	if nw == 0 {
 		// No pointers! Caller was supposed to check.
-		println("runtime: invalid type ", *typ._string)
+		println("runtime: invalid type ", typ.string())
 		throw("heapBitsSetType: called with non-pointer type")
 		return
 	}
@@ -903,8 +1127,8 @@
 	}
 
 	// Phase 1: Special case for leading byte (shift==0) or half-byte (shift==4).
-	// The leading byte is special because it contains the bits for words 0 and 1,
-	// which do not have the marked bits set.
+	// The leading byte is special because it contains the bits for word 1,
+	// which does not have the marked bits set.
 	// The leading half-byte is special because it's a half a byte and must be
 	// manipulated atomically.
 	switch {
@@ -914,12 +1138,20 @@
 	case h.shift == 0:
 		// Ptrmask and heap bitmap are aligned.
 		// Handle first byte of bitmap specially.
-		// The first byte we write out contains the first two words of the object.
-		// In those words, the mark bits are mark and checkmark, respectively,
-		// and must not be set. In all following words, we want to set the mark bit
-		// as a signal that the object continues to the next 2-bit entry in the bitmap.
+		//
+		// The first byte we write out covers the first four
+		// words of the object. The scan/dead bit on the first
+		// word must be set to scan since there are pointers
+		// somewhere in the object. The scan/dead bit on the
+		// second word is the checkmark, so we don't set it.
+		// In all following words, we set the scan/dead bit
+		// appropriately to indicate that the object continues
+		// to the next 2-bit entry in the bitmap.
+		//
+		// TODO: It doesn't matter if we set the checkmark, so
+		// maybe this case isn't needed any more.
 		hb = b & bitPointerAll
-		hb |= bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
+		hb |= bitMarked | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
 		if w += 4; w >= nw {
 			goto Phase3
 		}
@@ -928,7 +1160,7 @@
 		b >>= 4
 		nb -= 4
 
-	case ptrSize == 8 && h.shift == 2:
+	case sys.PtrSize == 8 && h.shift == 2:
 		// Ptrmask and heap bitmap are misaligned.
 		// The bits for the first two words are in a byte shared with another object
 		// and must be updated atomically.
@@ -939,14 +1171,20 @@
 		// not with its mark bit. Since there is only one allocation
 		// from a given span at a time, we should be able to set
 		// these bits non-atomically. Not worth the risk right now.
-		hb = (b & 3) << (2 * heapBitsShift)
+		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
+		// This is not noscan, so set the scan bit in the
+		// first word.
+		hb |= bitMarked << (2 * heapBitsShift)
 		b >>= 2
 		nb -= 2
-		// Note: no bitMarker in hb because the first two words don't get markers from us.
+		// Note: no bitMarked for the second word because that's
+		// the checkmark.
 		if gcphase == _GCoff {
+			*hbitp &^= uint8((bitPointer | bitMarked | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
 			*hbitp |= uint8(hb)
 		} else {
-			atomicor8(hbitp, uint8(hb))
+			atomic.And8(hbitp, ^(uint8(bitPointer|bitMarked|bitPointer<<heapBitsShift) << (2 * heapBitsShift)))
+			atomic.Or8(hbitp, uint8(hb))
 		}
 		hbitp = subtract1(hbitp)
 		if w += 2; w >= nw {
@@ -1041,7 +1279,7 @@
 	}
 
 	// Change nw from counting possibly-pointer words to total words in allocation.
-	nw = size / ptrSize
+	nw = size / sys.PtrSize
 
 	// Write whole bitmap bytes.
 	// The first is hb, the rest are zero.
@@ -1066,8 +1304,8 @@
 		if gcphase == _GCoff {
 			*hbitp = *hbitp&^(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift) | uint8(hb)
 		} else {
-			atomicand8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
-			atomicor8(hbitp, uint8(hb))
+			atomic.And8(hbitp, ^uint8(bitPointer|bitMarked|(bitPointer|bitMarked)<<heapBitsShift))
+			atomic.Or8(hbitp, uint8(hb))
 		}
 	}
 
@@ -1076,7 +1314,7 @@
 	if doubleCheck {
 		end := heapBitsForAddr(x + size)
 		if typ.kind&kindGCProg == 0 && (hbitp != end.bitp || (w == nw+2) != (end.shift == 2)) {
-			println("ended at wrong bitmap byte for", *typ._string, "x", dataSize/typ.size)
+			println("ended at wrong bitmap byte for", typ.string(), "x", dataSize/typ.size)
 			print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 			print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
 			h0 := heapBitsForAddr(x)
@@ -1088,11 +1326,11 @@
 		// Double-check that bits to be written were written correctly.
 		// Does not check that other bits were not written, unfortunately.
 		h := heapBitsForAddr(x)
-		nptr := typ.ptrdata / ptrSize
-		ndata := typ.size / ptrSize
+		nptr := typ.ptrdata / sys.PtrSize
+		ndata := typ.size / sys.PtrSize
 		count := dataSize / typ.size
-		totalptr := ((count-1)*typ.size + typ.ptrdata) / ptrSize
-		for i := uintptr(0); i < size/ptrSize; i++ {
+		totalptr := ((count-1)*typ.size + typ.ptrdata) / sys.PtrSize
+		for i := uintptr(0); i < size/sys.PtrSize; i++ {
 			j := i % ndata
 			var have, want uint8
 			have = (*h.bitp >> h.shift) & (bitPointer | bitMarked)
@@ -1105,14 +1343,14 @@
 				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
 					want |= bitPointer
 				}
-				if i >= 2 {
+				if i != 1 {
 					want |= bitMarked
 				} else {
 					have &^= bitMarked
 				}
 			}
 			if have != want {
-				println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
+				println("mismatch writing bits for", typ.string(), "x", dataSize/typ.size)
 				print("typ.size=", typ.size, " typ.ptrdata=", typ.ptrdata, " dataSize=", dataSize, " size=", size, "\n")
 				print("kindGCProg=", typ.kind&kindGCProg != 0, "\n")
 				print("w=", w, " nw=", nw, " b=", hex(b), " nb=", nb, " hb=", hex(hb), "\n")
@@ -1120,7 +1358,7 @@
 				print("initial bits h0.bitp=", h0.bitp, " h0.shift=", h0.shift, "\n")
 				print("current bits h.bitp=", h.bitp, " h.shift=", h.shift, " *h.bitp=", hex(*h.bitp), "\n")
 				print("ptrmask=", ptrmask, " p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
-				println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
+				println("at word", i, "offset", i*sys.PtrSize, "have", have, "want", want)
 				if typ.kind&kindGCProg != 0 {
 					println("GC program:")
 					dumpGCProg(addb(typ.gcdata, 4))
@@ -1135,6 +1373,13 @@
 	}
 }
 
+// heapBitsSetTypeNoScan marks x as noscan by setting the first word
+// of x in the heap bitmap to scalar/dead.
+func heapBitsSetTypeNoScan(x uintptr) {
+	h := heapBitsForAddr(x)
+	*h.bitp &^= (bitPointer | bitMarked) << h.shift
+}
+
 var debugPtrmask struct {
 	lock mutex
 	data *byte
@@ -1151,14 +1396,14 @@
 // so that the relevant bitmap bytes are not shared with surrounding
 // objects and need not be accessed with atomic instructions.
 func heapBitsSetTypeGCProg(h heapBits, progSize, elemSize, dataSize, allocSize uintptr, prog *byte) {
-	if ptrSize == 8 && allocSize%(4*ptrSize) != 0 {
+	if sys.PtrSize == 8 && allocSize%(4*sys.PtrSize) != 0 {
 		// Alignment will be wrong.
 		throw("heapBitsSetTypeGCProg: small allocation")
 	}
 	var totalBits uintptr
 	if elemSize == dataSize {
 		totalBits = runGCProg(prog, nil, h.bitp, 2)
-		if totalBits*ptrSize != progSize {
+		if totalBits*sys.PtrSize != progSize {
 			println("runtime: heapBitsSetTypeGCProg: total bits", totalBits, "but progSize", progSize)
 			throw("heapBitsSetTypeGCProg: unexpected bit count")
 		}
@@ -1173,7 +1418,7 @@
 		// repeats that first element to fill the array.
 		var trailer [40]byte // 3 varints (max 10 each) + some bytes
 		i := 0
-		if n := elemSize/ptrSize - progSize/ptrSize; n > 0 {
+		if n := elemSize/sys.PtrSize - progSize/sys.PtrSize; n > 0 {
 			// literal(0)
 			trailer[i] = 0x01
 			i++
@@ -1195,7 +1440,7 @@
 		// repeat(elemSize/ptrSize, count-1)
 		trailer[i] = 0x80
 		i++
-		n := elemSize / ptrSize
+		n := elemSize / sys.PtrSize
 		for ; n >= 0x80; n >>= 7 {
 			trailer[i] = byte(n | 0x80)
 			i++
@@ -1219,7 +1464,7 @@
 		// last element. This will cause the code below to
 		// memclr the dead section of the final array element,
 		// so that scanobject can stop early in the final element.
-		totalBits = (elemSize*(count-1) + progSize) / ptrSize
+		totalBits = (elemSize*(count-1) + progSize) / sys.PtrSize
 	}
 	endProg := unsafe.Pointer(subtractb(h.bitp, (totalBits+3)/4))
 	endAlloc := unsafe.Pointer(subtractb(h.bitp, allocSize/heapBitmapScale))
@@ -1228,9 +1473,9 @@
 
 // progToPointerMask returns the 1-bit pointer mask output by the GC program prog.
 // size is the size of the region described by prog, in bytes.
-// The resulting bitvector will have no more than size/ptrSize bits.
+// The resulting bitvector will have no more than size/sys.PtrSize bits.
 func progToPointerMask(prog *byte, size uintptr) bitvector {
-	n := (size/ptrSize + 7) / 8
+	n := (size/sys.PtrSize + 7) / 8
 	x := (*[1 << 30]byte)(persistentalloc(n+1, 1, &memstats.buckhash_sys))[:n+1]
 	x[len(x)-1] = 0xa1 // overflow check sentinel
 	n = runGCProg(prog, nil, &x[0], 1)
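// Worked size arithmetic for the allocation above, with hypothetical
// values: on a 64-bit system with size = 128 bytes, size/sys.PtrSize = 16
// one-bit entries, so n = (16+7)/8 = 2 bitmap bytes, plus one byte for the
// 0xa1 overflow-check sentinel.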
@@ -1364,11 +1609,11 @@
 		// into a register and use that register for the entire loop
 		// instead of repeatedly reading from memory.
 		// Handling fewer than 8 bits here makes the general loop simpler.
-		// The cutoff is ptrSize*8 - 7 to guarantee that when we add
+		// The cutoff is sys.PtrSize*8 - 7 to guarantee that when we add
 		// the pattern to a bit buffer holding at most 7 bits (a partial byte)
 		// it will not overflow.
 		src := dst
-		const maxBits = ptrSize*8 - 7
+		const maxBits = sys.PtrSize*8 - 7
 		if n <= maxBits {
 			// Start with bits in output buffer.
 			pattern := bits
@@ -1421,7 +1666,7 @@
 				nb := npattern
 				if nb+nb <= maxBits {
 					// Double pattern until the whole uintptr is filled.
-					for nb <= ptrSize*8 {
+					for nb <= sys.PtrSize*8 {
 						b |= b << nb
 						nb += nb
 					}
@@ -1538,12 +1783,6 @@
 			dst = subtract1(dst)
 			bits >>= 4
 		}
-		// Clear the mark bits in the first two entries.
-		// They are the actual mark and checkmark bits,
-		// not non-dead markers. It simplified the code
-		// above to set the marker in every bit written and
-		// then clear these two as a special case at the end.
-		*dstStart &^= bitMarked | bitMarked<<heapBitsShift
 	}
 	return totalBits
 }
@@ -1609,8 +1848,8 @@
 //go:linkname reflect_gcbits reflect.gcbits
 func reflect_gcbits(x interface{}) []byte {
 	ret := getgcmask(x)
-	typ := (*ptrtype)(unsafe.Pointer((*eface)(unsafe.Pointer(&x))._type)).elem
-	nptr := typ.ptrdata / ptrSize
+	typ := (*ptrtype)(unsafe.Pointer(efaceOf(&x)._type)).elem
+	nptr := typ.ptrdata / sys.PtrSize
 	for uintptr(len(ret)) > nptr && ret[len(ret)-1] == 0 {
 		ret = ret[:len(ret)-1]
 	}
@@ -1619,7 +1858,7 @@
 
 // Returns GC type info for object p for testing.
 func getgcmask(ep interface{}) (mask []byte) {
-	e := *(*eface)(unsafe.Pointer(&ep))
+	e := *efaceOf(&ep)
 	p := e.data
 	t := e._type
 	// data or bss
@@ -1628,10 +1867,10 @@
 		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
 			bitmap := datap.gcdatamask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
-			mask = make([]byte, n/ptrSize)
-			for i := uintptr(0); i < n; i += ptrSize {
-				off := (uintptr(p) + i - datap.data) / ptrSize
-				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
+			mask = make([]byte, n/sys.PtrSize)
+			for i := uintptr(0); i < n; i += sys.PtrSize {
+				off := (uintptr(p) + i - datap.data) / sys.PtrSize
+				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
@@ -1640,10 +1879,10 @@
 		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
 			bitmap := datap.gcbssmask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
-			mask = make([]byte, n/ptrSize)
-			for i := uintptr(0); i < n; i += ptrSize {
-				off := (uintptr(p) + i - datap.bss) / ptrSize
-				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
+			mask = make([]byte, n/sys.PtrSize)
+			for i := uintptr(0); i < n; i += sys.PtrSize {
+				off := (uintptr(p) + i - datap.bss) / sys.PtrSize
+				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
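// Worked example of the bitmap indexing above, with hypothetical values:
// if p+i lies 19 words past datap.bss, then off = 19, so the bit lives in
// byte off/8 = 2 of the bitmap at bit position off%8 = 3, and
// (*addb(bitmap, 2) >> 3) & 1 extracts it.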
@@ -1653,14 +1892,14 @@
 	var n uintptr
 	var base uintptr
 	if mlookup(uintptr(p), &base, &n, nil) != 0 {
-		mask = make([]byte, n/ptrSize)
-		for i := uintptr(0); i < n; i += ptrSize {
+		mask = make([]byte, n/sys.PtrSize)
+		for i := uintptr(0); i < n; i += sys.PtrSize {
 			hbits := heapBitsForAddr(base + i)
 			if hbits.isPointer() {
-				mask[i/ptrSize] = 1
+				mask[i/sys.PtrSize] = 1
 			}
-			if i >= 2*ptrSize && !hbits.isMarked() {
-				mask = mask[:i/ptrSize]
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
+				mask = mask[:i/sys.PtrSize]
 				break
 			}
 		}
@@ -1682,7 +1921,7 @@
 			if targetpc != f.entry {
 				targetpc--
 			}
-			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+			pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, nil)
 			if pcdata == -1 {
 				return
 			}
@@ -1691,13 +1930,13 @@
 				return
 			}
 			bv := stackmapdata(stkmap, pcdata)
-			size := uintptr(bv.n) * ptrSize
+			size := uintptr(bv.n) * sys.PtrSize
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
-			mask = make([]byte, n/ptrSize)
-			for i := uintptr(0); i < n; i += ptrSize {
+			mask = make([]byte, n/sys.PtrSize)
+			for i := uintptr(0); i < n; i += sys.PtrSize {
 				bitmap := bv.bytedata
-				off := (uintptr(p) + i - frame.varp + size) / ptrSize
-				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
+				off := (uintptr(p) + i - frame.varp + size) / sys.PtrSize
+				mask[i/sys.PtrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 		}
 		return
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 8c2a6b0..5938e53 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -8,15 +8,25 @@
 
 // Per-thread (in Go, per-P) cache for small objects.
 // No locking needed because it is per-thread (per-P).
+//
+// mcaches are allocated from non-GC'd memory, so any heap pointers
+// must be specially handled.
 type mcache struct {
 	// The following members are accessed on every malloc,
 	// so they are grouped here for better caching.
-	next_sample      int32   // trigger heap sample after allocating this many bytes
-	local_cachealloc uintptr // bytes allocated from cache since last lock of heap
-	local_scan       uintptr // bytes of scannable heap allocated
+	next_sample int32   // trigger heap sample after allocating this many bytes
+	local_scan  uintptr // bytes of scannable heap allocated
+
 	// Allocator cache for tiny objects w/o pointers.
 	// See "Tiny allocator" comment in malloc.go.
-	tiny             unsafe.Pointer
+
+	// tiny points to the beginning of the current tiny block, or
+	// nil if there is no current tiny block.
+	//
+	// tiny is a heap pointer. Since mcache is in non-GC'd memory,
+	// we handle it by clearing it in releaseAll during mark
+	// termination.
+	tiny             uintptr
 	tinyoffset       uintptr
 	local_tinyallocs uintptr // number of tiny allocs not counted in other stats
 
@@ -63,28 +73,19 @@
 
 func allocmcache() *mcache {
 	lock(&mheap_.lock)
-	c := (*mcache)(fixAlloc_Alloc(&mheap_.cachealloc))
+	c := (*mcache)(mheap_.cachealloc.alloc())
 	unlock(&mheap_.lock)
 	memclr(unsafe.Pointer(c), unsafe.Sizeof(*c))
 	for i := 0; i < _NumSizeClasses; i++ {
 		c.alloc[i] = &emptymspan
 	}
-
-	// Set first allocation sample size.
-	rate := MemProfileRate
-	if rate > 0x3fffffff { // make 2*rate not overflow
-		rate = 0x3fffffff
-	}
-	if rate != 0 {
-		c.next_sample = int32(int(fastrand1()) % (2 * rate))
-	}
-
+	c.next_sample = nextSample()
 	return c
 }
 
 func freemcache(c *mcache) {
 	systemstack(func() {
-		mCache_ReleaseAll(c)
+		c.releaseAll()
 		stackcache_clear(c)
 
 		// NOTE(rsc,rlh): If gcworkbuffree comes back, we need to coordinate
@@ -94,46 +95,52 @@
 
 		lock(&mheap_.lock)
 		purgecachedstats(c)
-		fixAlloc_Free(&mheap_.cachealloc, unsafe.Pointer(c))
+		mheap_.cachealloc.free(unsafe.Pointer(c))
 		unlock(&mheap_.lock)
 	})
 }
 
 // Gets a span that has a free object in it and assigns it
-// to be the cached span for the given sizeclass.  Returns this span.
-func mCache_Refill(c *mcache, sizeclass int32) *mspan {
+// to be the cached span for the given sizeclass. Returns this span.
+func (c *mcache) refill(sizeclass int32) *mspan {
 	_g_ := getg()
 
 	_g_.m.locks++
 	// Return the current cached span to the central lists.
 	s := c.alloc[sizeclass]
-	if s.freelist.ptr() != nil {
-		throw("refill on a nonempty span")
+
+	if uintptr(s.allocCount) != s.nelems {
+		throw("refill of span with free space remaining")
 	}
+
 	if s != &emptymspan {
 		s.incache = false
 	}
 
 	// Get a new cached span from the central lists.
-	s = mCentral_CacheSpan(&mheap_.central[sizeclass].mcentral)
+	s = mheap_.central[sizeclass].mcentral.cacheSpan()
 	if s == nil {
 		throw("out of memory")
 	}
-	if s.freelist.ptr() == nil {
-		println(s.ref, (s.npages<<_PageShift)/s.elemsize)
-		throw("empty span")
+
+	if uintptr(s.allocCount) == s.nelems {
+		throw("span has no free space")
 	}
+
 	c.alloc[sizeclass] = s
 	_g_.m.locks--
 	return s
 }
 
-func mCache_ReleaseAll(c *mcache) {
+func (c *mcache) releaseAll() {
 	for i := 0; i < _NumSizeClasses; i++ {
 		s := c.alloc[i]
 		if s != &emptymspan {
-			mCentral_UncacheSpan(&mheap_.central[i].mcentral, s)
+			mheap_.central[i].mcentral.uncacheSpan(s)
 			c.alloc[i] = &emptymspan
 		}
 	}
+	// Clear tinyalloc pool.
+	c.tiny = 0
+	c.tinyoffset = 0
 }
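// A minimal bump-allocator sketch (hypothetical, not the runtime's tiny
// allocator in malloc.go) showing how the tiny/tinyoffset pair above is
// used; size is assumed to be a power of two no larger than blockSize:
//
//	func (c *mcache) allocTiny(size, blockSize uintptr, newBlock func() uintptr) uintptr {
//		off := (c.tinyoffset + size - 1) &^ (size - 1) // align up to size
//		if c.tiny != 0 && off+size <= blockSize {
//			c.tinyoffset = off + size
//			return c.tiny + off
//		}
//		c.tiny = newBlock() // start a fresh block
//		c.tinyoffset = size
//		return c.tiny
//	}
//
// Because tiny is a bare uintptr that the GC does not trace, releaseAll
// clears it at mark termination (as above) rather than letting a stale
// block survive a collection.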
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index 161af99..7b63110 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -12,36 +12,39 @@
 
 package runtime
 
+import "runtime/internal/atomic"
+
 // Central list of free objects of a given size.
 type mcentral struct {
 	lock      mutex
 	sizeclass int32
-	nonempty  mspan // list of spans with a free object
-	empty     mspan // list of spans with no free objects (or cached in an mcache)
+	nonempty  mSpanList // list of spans with a free object, i.e. a nonempty free list
+	empty     mSpanList // list of spans with no free objects (or cached in an mcache)
 }
 
 // Initialize a single central free list.
-func mCentral_Init(c *mcentral, sizeclass int32) {
+func (c *mcentral) init(sizeclass int32) {
 	c.sizeclass = sizeclass
-	mSpanList_Init(&c.nonempty)
-	mSpanList_Init(&c.empty)
+	c.nonempty.init()
+	c.empty.init()
 }
 
 // Allocate a span to use in an MCache.
-func mCentral_CacheSpan(c *mcentral) *mspan {
+func (c *mcentral) cacheSpan() *mspan {
 	// Deduct credit for this span allocation and sweep if necessary.
-	deductSweepCredit(uintptr(class_to_size[c.sizeclass]), 0)
+	spanBytes := uintptr(class_to_allocnpages[c.sizeclass]) * _PageSize
+	deductSweepCredit(spanBytes, 0)
 
 	lock(&c.lock)
 	sg := mheap_.sweepgen
 retry:
 	var s *mspan
-	for s = c.nonempty.next; s != &c.nonempty; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
-			mSpanList_Remove(s)
-			mSpanList_InsertBack(&c.empty, s)
+	for s = c.nonempty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			c.nonempty.remove(s)
+			c.empty.insertBack(s)
 			unlock(&c.lock)
-			mSpan_Sweep(s, true)
+			s.sweep(true)
 			goto havespan
 		}
 		if s.sweepgen == sg-1 {
@@ -49,22 +52,24 @@
 			continue
 		}
 		// we have a nonempty span that does not require sweeping, allocate from it
-		mSpanList_Remove(s)
-		mSpanList_InsertBack(&c.empty, s)
+		c.nonempty.remove(s)
+		c.empty.insertBack(s)
 		unlock(&c.lock)
 		goto havespan
 	}
 
-	for s = c.empty.next; s != &c.empty; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
+	for s = c.empty.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			// we have an empty span that requires sweeping,
 			// sweep it and see if we can free some space in it
-			mSpanList_Remove(s)
+			c.empty.remove(s)
 			// swept spans are at the end of the list
-			mSpanList_InsertBack(&c.empty, s)
+			c.empty.insertBack(s)
 			unlock(&c.lock)
-			mSpan_Sweep(s, true)
-			if s.freelist.ptr() != nil {
+			s.sweep(true)
+			freeIndex := s.nextFreeIndex()
+			if freeIndex != s.nelems {
+				s.freeindex = freeIndex
 				goto havespan
 			}
 			lock(&c.lock)
@@ -83,72 +88,91 @@
 	unlock(&c.lock)
 
 	// Replenish central list if empty.
-	s = mCentral_Grow(c)
+	s = c.grow()
 	if s == nil {
 		return nil
 	}
 	lock(&c.lock)
-	mSpanList_InsertBack(&c.empty, s)
+	c.empty.insertBack(s)
 	unlock(&c.lock)
 
 	// At this point s is a non-empty span, queued at the end of the empty list,
 	// c is unlocked.
 havespan:
 	cap := int32((s.npages << _PageShift) / s.elemsize)
-	n := cap - int32(s.ref)
-	if n == 0 {
-		throw("empty span")
+	n := cap - int32(s.allocCount)
+	if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
+		throw("span has no free objects")
 	}
-	if s.freelist.ptr() == nil {
-		throw("freelist empty")
+	usedBytes := uintptr(s.allocCount) * s.elemsize
+	if usedBytes > 0 {
+		reimburseSweepCredit(usedBytes)
+	}
+	atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
+	if trace.enabled {
+		// heap_live changed.
+		traceHeapAlloc()
+	}
+	if gcBlackenEnabled != 0 {
+		// heap_live changed.
+		gcController.revise()
 	}
 	s.incache = true
+	freeByteBase := s.freeindex &^ (64 - 1)
+	whichByte := freeByteBase / 8
+	// Init alloc bits cache.
+	s.refillAllocCache(whichByte)
+
+	// Adjust the allocCache so that s.freeindex corresponds to the low bit in
+	// s.allocCache.
+	s.allocCache >>= s.freeindex % 64
+
 	return s
 }
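// Worked example of the allocCache setup above, with a hypothetical
// s.freeindex = 70: freeByteBase = 70 &^ 63 = 64 and whichByte = 64/8 = 8,
// so refillAllocCache loads the cache word covering alloc bits 64..127,
// and allocCache >>= 70%64 = 6 discards the six already-consumed bits so
// that bit 0 of allocCache corresponds to object 70.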
 
 // Return span from an MCache.
-func mCentral_UncacheSpan(c *mcentral, s *mspan) {
+func (c *mcentral) uncacheSpan(s *mspan) {
 	lock(&c.lock)
 
 	s.incache = false
 
-	if s.ref == 0 {
-		throw("uncaching full span")
+	if s.allocCount == 0 {
+		throw("uncaching span but s.allocCount == 0")
 	}
 
 	cap := int32((s.npages << _PageShift) / s.elemsize)
-	n := cap - int32(s.ref)
+	n := cap - int32(s.allocCount)
 	if n > 0 {
-		mSpanList_Remove(s)
-		mSpanList_Insert(&c.nonempty, s)
+		c.empty.remove(s)
+		c.nonempty.insert(s)
+		// cacheSpan conservatively counted
+		// unallocated slots in heap_live. Undo this.
+		atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
 	}
 	unlock(&c.lock)
 }
 
-// Free n objects from a span s back into the central free list c.
-// Called during sweep.
-// Returns true if the span was returned to heap.  Sets sweepgen to
-// the latest generation.
-// If preserve=true, don't return the span to heap nor relink in MCentral lists;
-// caller takes care of it.
-func mCentral_FreeSpan(c *mcentral, s *mspan, n int32, start gclinkptr, end gclinkptr, preserve bool) bool {
+// freeSpan updates c and s after sweeping s.
+// It sets s's sweepgen to the latest generation,
+// and, based on the number of free objects in s,
+// moves s to the appropriate list of c or returns it
+// to the heap.
+// freeSpan returns true if s was returned to the heap.
+// If preserve=true, it does not move s (the caller
+// must take care of it).
+func (c *mcentral) freeSpan(s *mspan, preserve bool, wasempty bool) bool {
 	if s.incache {
-		throw("freespan into cached span")
+		throw("freeSpan given cached span")
 	}
-
-	// Add the objects back to s's free list.
-	wasempty := s.freelist.ptr() == nil
-	end.ptr().next = s.freelist
-	s.freelist = start
-	s.ref -= uint16(n)
+	s.needzero = 1
 
 	if preserve {
 		// preserve is set only when called from cacheSpan above,
 		// the span must be in the empty list.
-		if s.next == nil {
+		if !s.inList() {
 			throw("can't preserve unlinked span")
 		}
-		atomicstore(&s.sweepgen, mheap_.sweepgen)
+		atomic.Store(&s.sweepgen, mheap_.sweepgen)
 		return false
 	}
 
@@ -156,57 +180,41 @@
 
 	// Move to nonempty if necessary.
 	if wasempty {
-		mSpanList_Remove(s)
-		mSpanList_Insert(&c.nonempty, s)
+		c.empty.remove(s)
+		c.nonempty.insert(s)
 	}
 
-	// delay updating sweepgen until here.  This is the signal that
+	// delay updating sweepgen until here. This is the signal that
 	// the span may be used in an MCache, so it must come after the
 	// linked list operations above (actually, just after the
 	// lock of c above.)
-	atomicstore(&s.sweepgen, mheap_.sweepgen)
+	atomic.Store(&s.sweepgen, mheap_.sweepgen)
 
-	if s.ref != 0 {
+	if s.allocCount != 0 {
 		unlock(&c.lock)
 		return false
 	}
 
-	// s is completely freed, return it to the heap.
-	mSpanList_Remove(s)
-	s.needzero = 1
-	s.freelist = 0
+	c.nonempty.remove(s)
 	unlock(&c.lock)
-	heapBitsForSpan(s.base()).initSpan(s.layout())
-	mHeap_Free(&mheap_, s, 0)
+	mheap_.freeSpan(s, 0)
 	return true
 }
 
-// Fetch a new span from the heap and carve into objects for the free list.
-func mCentral_Grow(c *mcentral) *mspan {
+// grow allocates a new empty span from the heap and initializes it for c's size class.
+func (c *mcentral) grow() *mspan {
 	npages := uintptr(class_to_allocnpages[c.sizeclass])
 	size := uintptr(class_to_size[c.sizeclass])
 	n := (npages << _PageShift) / size
 
-	s := mHeap_Alloc(&mheap_, npages, c.sizeclass, false, true)
+	s := mheap_.alloc(npages, c.sizeclass, false, true)
 	if s == nil {
 		return nil
 	}
 
-	p := uintptr(s.start << _PageShift)
+	p := s.base()
 	s.limit = p + size*n
-	head := gclinkptr(p)
-	tail := gclinkptr(p)
-	// i==0 iteration already done
-	for i := uintptr(1); i < n; i++ {
-		p += size
-		tail.ptr().next = gclinkptr(p)
-		tail = gclinkptr(p)
-	}
-	if s.freelist.ptr() != nil {
-		throw("freelist not empty")
-	}
-	tail.ptr().next = 0
-	s.freelist = head
-	heapBitsForSpan(s.base()).initSpan(s.layout())
+
+	heapBitsForSpan(s.base()).initSpan(s)
 	return s
 }
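// Worked arithmetic for grow, with hypothetical class values: given 8KB
// pages (_PageShift = 13), a class with npages = 1 and size = 48 yields
// n = 8192/48 = 170 objects, and s.limit = s.base() + 48*170 ends 32 bytes
// short of the page so no partial object is ever handed out.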
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index ecab584..a65933f 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,13 +6,16 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Don't split the stack as this function may be invoked without a valid G,
 // which prevents us from allocating more stack.
 //go:nosplit
 func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
-	v := unsafe.Pointer(mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
+	v := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if uintptr(v) < 4096 {
 		return nil
 	}
@@ -41,14 +44,14 @@
 
 func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
 	// On 64-bit, people with ulimit -v set complain if we reserve too
-	// much address space.  Instead, assume that the reservation is okay
+	// much address space. Instead, assume that the reservation is okay
 	// and check the assumption in SysMap.
-	if ptrSize == 8 && uint64(n) > 1<<32 || goos_nacl != 0 {
+	if sys.PtrSize == 8 && uint64(n) > 1<<32 || sys.GoosNacl != 0 {
 		*reserved = false
 		return v
 	}
 
-	p := unsafe.Pointer(mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
+	p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if uintptr(p) < 4096 {
 		return nil
 	}
@@ -56,9 +59,9 @@
 	return p
 }
 
-func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
-	const _ENOMEM = 12
+const _ENOMEM = 12
 
+func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
 	mSysStatInc(sysStat, n)
 
 	// On 64-bit, we don't actually have v reserved, so tread carefully.
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
index 3bebd97..3f1c4d7 100644
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -10,7 +10,7 @@
 // which prevents us from allocating more stack.
 //go:nosplit
 func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
-	v := (unsafe.Pointer)(mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
+	v := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if uintptr(v) < 4096 {
 		return nil
 	}
@@ -40,7 +40,7 @@
 
 func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
 	*reserved = true
-	p := (unsafe.Pointer)(mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0))
+	p := mmap(v, n, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 	if uintptr(p) < 4096 {
 		return nil
 	}
@@ -53,7 +53,7 @@
 
 func sysMap(v unsafe.Pointer, n uintptr, reserved bool, sysStat *uint64) {
 	mSysStatInc(sysStat, n)
-	p := (unsafe.Pointer)(mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0))
+	p := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
 	if uintptr(p) == _ENOMEM {
 		throw("runtime: out of memory")
 	}
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index f988e75..61fdcee 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -1,13 +1,16 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
-	_PAGE_SIZE = _PhysPageSize
+	_PAGE_SIZE = sys.PhysPageSize
 	_EACCES    = 13
 )
 
@@ -69,29 +72,96 @@
 }
 
 func sysUnused(v unsafe.Pointer, n uintptr) {
-	var s uintptr = hugePageSize // division by constant 0 is a compile-time error :(
-	if s != 0 && (uintptr(v)%s != 0 || n%s != 0) {
-		// See issue 8832
-		// Linux kernel bug: https://bugzilla.kernel.org/show_bug.cgi?id=93111
-		// Mark the region as NOHUGEPAGE so the kernel's khugepaged
-		// doesn't undo our DONTNEED request.  khugepaged likes to migrate
-		// regions which are only partially mapped to huge pages, including
-		// regions with some DONTNEED marks.  That needlessly allocates physical
-		// memory for our DONTNEED regions.
-		madvise(v, n, _MADV_NOHUGEPAGE)
+	// By default, Linux's "transparent huge page" support will
+	// merge pages into a huge page if there's even a single
+	// present regular page, undoing the effects of the DONTNEED
+	// below. On amd64, that means khugepaged can turn a single
+	// 4KB page to 2MB, bloating the process's RSS by as much as
+	// 512X. (See issue #8832 and Linux kernel bug
+	// https://bugzilla.kernel.org/show_bug.cgi?id=93111)
+	//
+	// To work around this, we explicitly disable transparent huge
+	// pages when we release pages of the heap. However, we have
+	// to do this carefully because changing this flag tends to
+	// split the VMA (memory mapping) containing v in to three
+	// VMAs in order to track the different values of the
+	// MADV_NOHUGEPAGE flag in the different regions. There's a
+	// default limit of 65530 VMAs per address space (sysctl
+	// vm.max_map_count), so we must be careful not to create too
+	// many VMAs (see issue #12233).
+	//
+	// Since huge pages are huge, there's little use in adjusting
+	// the MADV_NOHUGEPAGE flag on a fine granularity, so we avoid
+	// exploding the number of VMAs by only adjusting the
+	// MADV_NOHUGEPAGE flag on a large granularity. This still
+	// gets most of the benefit of huge pages while keeping the
+	// number of VMAs under control. With hugePageSize = 2MB, even
+	// a pessimal heap can reach 128GB before running out of VMAs.
+	if sys.HugePageSize != 0 {
+		var s uintptr = sys.HugePageSize // division by constant 0 is a compile-time error :(
+
+		// If it's a large allocation, we want to leave huge
+		// pages enabled. Hence, we only adjust the huge page
+		// flag on the huge pages containing v and v+n-1, and
+		// only if those aren't aligned.
+		var head, tail uintptr
+		if uintptr(v)%s != 0 {
+			// Compute huge page containing v.
+			head = uintptr(v) &^ (s - 1)
+		}
+		if (uintptr(v)+n)%s != 0 {
+			// Compute huge page containing v+n-1.
+			tail = (uintptr(v) + n - 1) &^ (s - 1)
+		}
+
+		// Note that madvise will return EINVAL if the flag is
+		// already set, which is quite likely. We ignore
+		// errors.
+		if head != 0 && head+sys.HugePageSize == tail {
+			// head and tail are different but adjacent,
+			// so do this in one call.
+			madvise(unsafe.Pointer(head), 2*sys.HugePageSize, _MADV_NOHUGEPAGE)
+		} else {
+			// Advise the huge pages containing v and v+n-1.
+			if head != 0 {
+				madvise(unsafe.Pointer(head), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+			if tail != 0 && tail != head {
+				madvise(unsafe.Pointer(tail), sys.HugePageSize, _MADV_NOHUGEPAGE)
+			}
+		}
 	}
+
+	if uintptr(v)&(sys.PhysPageSize-1) != 0 || n&(sys.PhysPageSize-1) != 0 {
+		// madvise will round this to any physical page
+		// *covered* by this range, so an unaligned madvise
+		// will release more memory than intended.
+		throw("unaligned sysUnused")
+	}
+
 	madvise(v, n, _MADV_DONTNEED)
 }
 
 func sysUsed(v unsafe.Pointer, n uintptr) {
-	if hugePageSize != 0 {
-		// Undo the NOHUGEPAGE marks from sysUnused.  There is no alignment check
-		// around this call as spans may have been merged in the interim.
-		// Note that this might enable huge pages for regions which were
-		// previously disabled.  Unfortunately there is no easy way to detect
-		// what the previous state was, and in any case we probably want huge
-		// pages to back our heap if the kernel can arrange that.
-		madvise(v, n, _MADV_HUGEPAGE)
+	if sys.HugePageSize != 0 {
+		// Partially undo the NOHUGEPAGE marks from sysUnused
+		// for whole huge pages between v and v+n. This may
+		// leave huge pages off at the end points v and v+n
+		// even though allocations may cover these entire huge
+		// pages. We could detect this and undo NOHUGEPAGE on
+		// the end points as well, but it's probably not worth
+		// the cost because when neighboring allocations are
+		// freed sysUnused will just set NOHUGEPAGE again.
+		var s uintptr = sys.HugePageSize
+
+		// Round v up to a huge page boundary.
+		beg := (uintptr(v) + (s - 1)) &^ (s - 1)
+		// Round v+n down to a huge page boundary.
+		end := (uintptr(v) + n) &^ (s - 1)
+
+		if beg < end {
+			madvise(unsafe.Pointer(beg), end-beg, _MADV_HUGEPAGE)
+		}
 	}
 }
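// A standalone sketch (hypothetical names) of the huge-page rounding used
// by sysUnused and sysUsed above, assuming s is a power of two:
//
//	func roundDown(p, s uintptr) uintptr { return p &^ (s - 1) }
//	func roundUp(p, s uintptr) uintptr   { return (p + s - 1) &^ (s - 1) }
//
// sysUnused marks the (at most two) huge pages roundDown(v, s) and
// roundDown(v+n-1, s), while sysUsed re-enables only the whole huge pages
// in [roundUp(v, s), roundDown(v+n, s)).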
 
@@ -109,10 +179,10 @@
 
 func sysReserve(v unsafe.Pointer, n uintptr, reserved *bool) unsafe.Pointer {
 	// On 64-bit, people with ulimit -v set complain if we reserve too
-	// much address space.  Instead, assume that the reservation is okay
+	// much address space. Instead, assume that the reservation is okay
 	// if we can reserve at least 64K and check the assumption in SysMap.
 	// Only user-mode Linux (UML) rejects these requests.
-	if ptrSize == 8 && uint64(n) > 1<<32 {
+	if sys.PtrSize == 8 && uint64(n) > 1<<32 {
 		p := mmap_fixed(v, 64<<10, _PROT_NONE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
 		if p != v {
 			if uintptr(p) >= 4096 {
diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go
index 755887f..3d82a98 100644
--- a/src/runtime/mem_plan9.go
+++ b/src/runtime/mem_plan9.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -12,16 +12,21 @@
 var memlock mutex
 
 type memHdr struct {
-	next *memHdr
+	next memHdrPtr
 	size uintptr
 }
 
-var memFreelist *memHdr // sorted in ascending order
+var memFreelist memHdrPtr // sorted in ascending order
+
+type memHdrPtr uintptr
+
+func (p memHdrPtr) ptr() *memHdr   { return (*memHdr)(unsafe.Pointer(p)) }
+func (p *memHdrPtr) set(x *memHdr) { *p = memHdrPtr(unsafe.Pointer(x)) }
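// memHdrPtr stores the link as a uintptr rather than a *memHdr, presumably
// because the free list lives in memory the garbage collector does not
// scan; keeping the link untyped hides it from the GC and the write
// barrier. A hypothetical usage sketch with the accessors above:
//
//	hdr := (*memHdr)(ap)
//	hdr.next = memFreelist // both are memHdrPtr, a plain integer copy
//	memFreelist.set(hdr)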
 
 func memAlloc(n uintptr) unsafe.Pointer {
 	n = memRound(n)
 	var prevp *memHdr
-	for p := memFreelist; p != nil; p = p.next {
+	for p := memFreelist.ptr(); p != nil; p = p.next.ptr() {
 		if p.size >= n {
 			if p.size == n {
 				if prevp != nil {
@@ -47,31 +52,31 @@
 	bp := (*memHdr)(ap)
 	bp.size = n
 	bpn := uintptr(ap)
-	if memFreelist == nil {
-		bp.next = nil
-		memFreelist = bp
+	if memFreelist == 0 {
+		bp.next = 0
+		memFreelist.set(bp)
 		return
 	}
-	p := memFreelist
+	p := memFreelist.ptr()
 	if bpn < uintptr(unsafe.Pointer(p)) {
-		memFreelist = bp
+		memFreelist.set(bp)
 		if bpn+bp.size == uintptr(unsafe.Pointer(p)) {
 			bp.size += p.size
 			bp.next = p.next
 			memclr(unsafe.Pointer(p), unsafe.Sizeof(memHdr{}))
 		} else {
-			bp.next = p
+			bp.next.set(p)
 		}
 		return
 	}
-	for ; p.next != nil; p = p.next {
+	for ; p.next != 0; p = p.next.ptr() {
 		if bpn > uintptr(unsafe.Pointer(p)) && bpn < uintptr(unsafe.Pointer(p.next)) {
 			break
 		}
 	}
 	if bpn+bp.size == uintptr(unsafe.Pointer(p.next)) {
-		bp.size += p.next.size
-		bp.next = p.next.next
+		bp.size += p.next.ptr().size
+		bp.next = p.next.ptr().next
 		memclr(unsafe.Pointer(p.next), unsafe.Sizeof(memHdr{}))
 	} else {
 		bp.next = p.next
@@ -81,7 +86,7 @@
 		p.next = bp.next
 		memclr(unsafe.Pointer(bp), unsafe.Sizeof(memHdr{}))
 	} else {
-		p.next = bp
+		p.next.set(bp)
 	}
 }
 
@@ -89,7 +94,7 @@
 	if memDebug == false {
 		return
 	}
-	for p := memFreelist; p != nil && p.next != nil; p = p.next {
+	for p := memFreelist.ptr(); p != nil && p.next != 0; p = p.next.ptr() {
 		if uintptr(unsafe.Pointer(p)) == uintptr(unsafe.Pointer(p.next)) {
 			print("runtime: ", unsafe.Pointer(p), " == ", unsafe.Pointer(p.next), "\n")
 			throw("mem: infinite loop")
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go
index 42aa7fb..2c338c8 100644
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -48,6 +48,7 @@
 			small &^= 4096 - 1
 		}
 		if small < 4096 {
+			print("runtime: VirtualFree of ", small, " bytes failed with errno=", getlasterror(), "\n")
 			throw("runtime: failed to decommit pages")
 		}
 		v = add(v, small)
@@ -57,8 +58,8 @@
 
 func sysUsed(v unsafe.Pointer, n uintptr) {
 	r := stdcall4(_VirtualAlloc, uintptr(v), n, _MEM_COMMIT, _PAGE_READWRITE)
-	if r != uintptr(v) {
-		throw("runtime: failed to commit pages")
+	if r == uintptr(v) {
+		return
 	}
 
 	// Commit failed. See SysUnused.
@@ -69,7 +70,8 @@
 			small &^= 4096 - 1
 		}
 		if small < 4096 {
-			throw("runtime: failed to decommit pages")
+			print("runtime: VirtualAlloc of ", small, " bytes failed with errno=", getlasterror(), "\n")
+			throw("runtime: failed to commit pages")
 		}
 		v = add(v, small)
 		n -= small
@@ -83,6 +85,7 @@
 	mSysStatDec(sysStat, n)
 	r := stdcall3(_VirtualFree, uintptr(v), 0, _MEM_RELEASE)
 	if r == 0 {
+		print("runtime: VirtualFree of ", n, " bytes failed with errno=", getlasterror(), "\n")
 		throw("runtime: failed to release pages")
 	}
 }
@@ -109,6 +112,7 @@
 	mSysStatInc(sysStat, n)
 	p := stdcall4(_VirtualAlloc, uintptr(v), n, _MEM_COMMIT, _PAGE_READWRITE)
 	if p != uintptr(v) {
+		print("runtime: VirtualAlloc of ", n, " bytes failed with errno=", getlasterror(), "\n")
 		throw("runtime: cannot map pages in arena address space")
 	}
 }
diff --git a/src/runtime/memclr_386.s b/src/runtime/memclr_386.s
index 3f20b69..ce962f3 100644
--- a/src/runtime/memclr_386.s
+++ b/src/runtime/memclr_386.s
@@ -21,7 +21,8 @@
 	CMPL	BX, $2
 	JBE	_1or2
 	CMPL	BX, $4
-	JBE	_3or4
+	JB	_3
+	JE	_4
 	CMPL	BX, $8
 	JBE	_5through8
 	CMPL	BX, $16
@@ -68,9 +69,13 @@
 	RET
 _0:
 	RET
-_3or4:
+_3:
 	MOVW	AX, (DI)
-	MOVW	AX, -2(DI)(BX*1)
+	MOVB	AX, 2(DI)
+	RET
+_4:
+	// We need a separate case for 4 to make sure we clear pointers atomically.
+	MOVL	AX, (DI)
 	RET
 _5through8:
 	MOVL	AX, (DI)
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index ec24f1d..6f30eca 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -23,7 +23,8 @@
 	CMPQ	BX, $4
 	JBE	_3or4
 	CMPQ	BX, $8
-	JBE	_5through8
+	JB	_5through7
+	JE	_8
 	CMPQ	BX, $16
 	JBE	_9through16
 	PXOR	X0, X0
@@ -35,8 +36,10 @@
 	JBE	_65through128
 	CMPQ	BX, $256
 	JBE	_129through256
+	CMPB	runtime·support_avx2(SB), $1
+	JE	loop_preheader_avx2
 	// TODO: use branch table and BSR to make this just a single dispatch
-	// TODO: for really big clears, use MOVNTDQ.
+	// TODO: for really big clears, use MOVNTDQ, even without AVX2.
 
 loop:
 	MOVOU	X0, 0(DI)
@@ -61,6 +64,57 @@
 	JAE	loop
 	JMP	tail
 
+loop_preheader_avx2:
+	VPXOR Y0, Y0, Y0
+	// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
+	// For larger sizes it is always faster, even on dual Xeons with 30M cache.
+	// TODO: take into account the actual LLC size. E.g., glibc uses LLC size/2.
+	CMPQ    BX, $0x2000000
+	JAE     loop_preheader_avx2_huge
+loop_avx2:
+	VMOVDQU	Y0, 0(DI)
+	VMOVDQU	Y0, 32(DI)
+	VMOVDQU	Y0, 64(DI)
+	VMOVDQU	Y0, 96(DI)
+	SUBQ	$128, BX
+	ADDQ	$128, DI
+	CMPQ	BX, $128
+	JAE	loop_avx2
+	VMOVDQU  Y0, -32(DI)(BX*1)
+	VMOVDQU  Y0, -64(DI)(BX*1)
+	VMOVDQU  Y0, -96(DI)(BX*1)
+	VMOVDQU  Y0, -128(DI)(BX*1)
+	VZEROUPPER
+	RET
+loop_preheader_avx2_huge:
+	// Align to 32 byte boundary
+	VMOVDQU  Y0, 0(DI)
+	MOVQ	DI, SI
+	ADDQ	$32, DI
+	ANDQ	$~31, DI
+	SUBQ	DI, SI
+	ADDQ	SI, BX
+loop_avx2_huge:
+	VMOVNTDQ	Y0, 0(DI)
+	VMOVNTDQ	Y0, 32(DI)
+	VMOVNTDQ	Y0, 64(DI)
+	VMOVNTDQ	Y0, 96(DI)
+	SUBQ	$128, BX
+	ADDQ	$128, DI
+	CMPQ	BX, $128
+	JAE	loop_avx2_huge
+	// In the description of MOVNTDQ in [1]
+	// "... fencing operation implemented with the SFENCE or MFENCE instruction
+	// should be used in conjunction with MOVNTDQ instructions..."
+	// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
+	SFENCE
+	VMOVDQU  Y0, -32(DI)(BX*1)
+	VMOVDQU  Y0, -64(DI)(BX*1)
+	VMOVDQU  Y0, -96(DI)(BX*1)
+	VMOVDQU  Y0, -128(DI)(BX*1)
+	VZEROUPPER
+	RET
+
 _1or2:
 	MOVB	AX, (DI)
 	MOVB	AX, -1(DI)(BX*1)
@@ -71,10 +125,14 @@
 	MOVW	AX, (DI)
 	MOVW	AX, -2(DI)(BX*1)
 	RET
-_5through8:
+_5through7:
 	MOVL	AX, (DI)
 	MOVL	AX, -4(DI)(BX*1)
 	RET
+_8:
+	// We need a separate case for 8 to make sure we clear pointers atomically.
+	MOVQ	AX, (DI)
+	RET
 _9through16:
 	MOVQ	AX, (DI)
 	MOVQ	AX, -8(DI)(BX*1)
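
The amd64 clear now dispatches on size and CPU features. A hedged Go decision
table summarizing the paths (threshold and path names illustrative, read off
the assembly above):

	// clearPath names the strategy picked for an n-byte clear.
	func clearPath(n uintptr, hasAVX2 bool) string {
		switch {
		case n <= 256:
			return "unrolled stores, with dedicated 4- and 8-byte atomic cases"
		case !hasAVX2:
			return "16-byte SSE loop (MOVOU)"
		case n < 0x2000000: // below ~32 MiB
			return "32-byte AVX2 loop (VMOVDQU)"
		default:
			return "non-temporal 32-byte stores (VMOVNTDQ), then SFENCE"
		}
	}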
diff --git a/src/runtime/memclr_arm.s b/src/runtime/memclr_arm.s
index 8b5fe31..c9b8586 100644
--- a/src/runtime/memclr_arm.s
+++ b/src/runtime/memclr_arm.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/memset-arm.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/memset-arm.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
diff --git a/src/runtime/memclr_arm64.s b/src/runtime/memclr_arm64.s
index c44c123..47c6b73 100644
--- a/src/runtime/memclr_arm64.s
+++ b/src/runtime/memclr_arm64.s
@@ -8,11 +8,30 @@
 TEXT runtime·memclr(SB),NOSPLIT,$0-16
 	MOVD	ptr+0(FP), R3
 	MOVD	n+8(FP), R4
-	CMP	$0, R4
-	BEQ	done
-	ADD	R3, R4, R4
+	// TODO(mwhudson): this is written this way to avoid tickling
+	// warnings from addpool when written as AND $7, R4, R6 (see
+	// https://golang.org/issue/12708)
+	AND	$~7, R4, R5	// R5 is N&~7
+	SUB	R5, R4, R6	// R6 is N&7
+
+	CMP	$0, R5
+	BEQ	nowords
+
+	ADD	R3, R5, R5
+
+wordloop: // TODO: Optimize for unaligned ptr.
+	MOVD.P	$0, 8(R3)
+	CMP	R3, R5
+	BNE	wordloop
+nowords:
+	CMP	$0, R6
+	BEQ	done
+
+	ADD	R3, R6, R6
+
+byteloop:
 	MOVBU.P	$0, 1(R3)
-	CMP	R3, R4
-	BNE	-2(PC)
+	CMP	R3, R6
+	BNE	byteloop
 done:
 	RET
diff --git a/src/runtime/memclr_mips64x.s b/src/runtime/memclr_mips64x.s
new file mode 100644
index 0000000..30a4af3
--- /dev/null
+++ b/src/runtime/memclr_mips64x.s
@@ -0,0 +1,42 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+// void runtime·memclr(void*, uintptr)
+TEXT runtime·memclr(SB),NOSPLIT,$0-16
+	MOVV	ptr+0(FP), R1
+	MOVV	n+8(FP), R2
+	ADDV	R1, R2, R4
+
+	// if less than 8 bytes, do one byte at a time
+	SGTU	$8, R2, R3
+	BNE	R3, out
+
+	// do one byte at a time until 8-aligned
+	AND	$7, R1, R3
+	BEQ	R3, words
+	MOVB	R0, (R1)
+	ADDV	$1, R1
+	JMP	-4(PC)
+
+words:
+	// do 8 bytes at a time if there is room
+	ADDV	$-7, R4, R2
+
+	SGTU	R2, R1, R3
+	BEQ	R3, out
+	MOVV	R0, (R1)
+	ADDV	$8, R1
+	JMP	-4(PC)
+
+out:
+	BEQ	R1, R4, done
+	MOVB	R0, (R1)
+	ADDV	$1, R1
+	JMP	-3(PC)
+done:
+	RET
diff --git a/src/runtime/memclr_plan9_386.s b/src/runtime/memclr_plan9_386.s
index 50f327b..4707ab2 100644
--- a/src/runtime/memclr_plan9_386.s
+++ b/src/runtime/memclr_plan9_386.s
@@ -16,7 +16,8 @@
 	CMPL	BX, $2
 	JBE	_1or2
 	CMPL	BX, $4
-	JBE	_3or4
+	JB	_3
+	JE	_4
 	CMPL	BX, $8
 	JBE	_5through8
 	CMPL	BX, $16
@@ -35,9 +36,13 @@
 	RET
 _0:
 	RET
-_3or4:
+_3:
 	MOVW	AX, (DI)
-	MOVW	AX, -2(DI)(BX*1)
+	MOVB	AX, 2(DI)
+	RET
+_4:
+	// We need a separate case for 4 to make sure we clear pointers atomically.
+	MOVL	AX, (DI)
 	RET
 _5through8:
 	MOVL	AX, (DI)
diff --git a/src/runtime/memclr_ppc64x.s b/src/runtime/memclr_ppc64x.s
index cea42cb..442faa2 100644
--- a/src/runtime/memclr_ppc64x.s
+++ b/src/runtime/memclr_ppc64x.s
@@ -7,14 +7,25 @@
 #include "textflag.h"
 
 // void runtime·memclr(void*, uintptr)
-TEXT runtime·memclr(SB),NOSPLIT,$0-16
+TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
 	MOVD	ptr+0(FP), R3
 	MOVD	n+8(FP), R4
-	CMP	R4, $0
+	SRADCC	$3, R4, R6	// R6 is the number of words to zero
+	BEQ	bytes
+
+	SUB	$8, R3
+	MOVD	R6, CTR
+	MOVDU	R0, 8(R3)
+	BC	25, 0, -1(PC)	// bdnz+ $-4
+	ADD	$8, R3
+
+bytes:
+	ANDCC	$7, R4, R7	// R7 is the number of bytes to zero
 	BEQ	done
 	SUB	$1, R3
-	MOVD	R4, CTR
+	MOVD	R7, CTR
 	MOVBU	R0, 1(R3)
-	BC	25, 0, -1(PC) // bdnz+ $-4
+	BC	25, 0, -1(PC)	// bdnz+ $-4
+
 done:
 	RET
diff --git a/src/runtime/memclr_s390x.s b/src/runtime/memclr_s390x.s
new file mode 100644
index 0000000..86eafec
--- /dev/null
+++ b/src/runtime/memclr_s390x.s
@@ -0,0 +1,122 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// void runtime·memclr(void*, uintptr)
+TEXT runtime·memclr(SB),NOSPLIT|NOFRAME,$0-16
+	MOVD	ptr+0(FP), R4
+	MOVD	n+8(FP), R5
+
+start:
+	CMPBLE	R5, $3, clear0to3
+	CMPBLE	R5, $7, clear4to7
+	CMPBLE	R5, $11, clear8to11
+	CMPBLE	R5, $15, clear12to15
+	CMP	R5, $32
+	BGE	clearmt32
+	MOVD	R0, 0(R4)
+	MOVD	R0, 8(R4)
+	ADD	$16, R4
+	SUB	$16, R5
+	BR	start
+
+clear0to3:
+	CMPBEQ	R5, $0, done
+	CMPBNE	R5, $1, clear2
+	MOVB	R0, 0(R4)
+	RET
+clear2:
+	CMPBNE	R5, $2, clear3
+	MOVH	R0, 0(R4)
+	RET
+clear3:
+	MOVH	R0, 0(R4)
+	MOVB	R0, 2(R4)
+	RET
+
+clear4to7:
+	CMPBNE	R5, $4, clear5
+	MOVW	R0, 0(R4)
+	RET
+clear5:
+	CMPBNE	R5, $5, clear6
+	MOVW	R0, 0(R4)
+	MOVB	R0, 4(R4)
+	RET
+clear6:
+	CMPBNE	R5, $6, clear7
+	MOVW	R0, 0(R4)
+	MOVH	R0, 4(R4)
+	RET
+clear7:
+	MOVW	R0, 0(R4)
+	MOVH	R0, 4(R4)
+	MOVB	R0, 6(R4)
+	RET
+
+clear8to11:
+	CMPBNE	R5, $8, clear9
+	MOVD	R0, 0(R4)
+	RET
+clear9:
+	CMPBNE	R5, $9, clear10
+	MOVD	R0, 0(R4)
+	MOVB	R0, 8(R4)
+	RET
+clear10:
+	CMPBNE	R5, $10, clear11
+	MOVD	R0, 0(R4)
+	MOVH	R0, 8(R4)
+	RET
+clear11:
+	MOVD	R0, 0(R4)
+	MOVH	R0, 8(R4)
+	MOVB	R0, 10(R4)
+	RET
+
+clear12to15:
+	CMPBNE	R5, $12, clear13
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	RET
+clear13:
+	CMPBNE	R5, $13, clear14
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVB	R0, 12(R4)
+	RET
+clear14:
+	CMPBNE	R5, $14, clear15
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVH	R0, 12(R4)
+	RET
+clear15:
+	MOVD	R0, 0(R4)
+	MOVW	R0, 8(R4)
+	MOVH	R0, 12(R4)
+	MOVB	R0, 14(R4)
+	RET
+
+clearmt32:
+	CMP	R5, $256
+	BLT	clearlt256
+	XC	$256, 0(R4), 0(R4)
+	ADD	$256, R4
+	ADD	$-256, R5
+	BR	clearmt32
+clearlt256:
+	CMPBEQ	R5, $0, done
+	ADD	$-1, R5
+	EXRL	$runtime·memclr_s390x_exrl_xc(SB), R5
+done:
+	RET
+
+// DO NOT CALL - target for exrl (execute relative long) instruction.
+TEXT runtime·memclr_s390x_exrl_xc(SB),NOSPLIT|NOFRAME,$0-0
+	XC	$1, 0(R4), 0(R4)
+	MOVD	R0, 0(R0)
+	RET
+
diff --git a/src/runtime/memmove_386.s b/src/runtime/memmove_386.s
index 4c0c74c..52b35a6 100644
--- a/src/runtime/memmove_386.s
+++ b/src/runtime/memmove_386.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/memmove-386.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
@@ -33,8 +33,8 @@
 	MOVL	n+8(FP), BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code.  The REP MOVSL instruction is really fast
-	// for large sizes.  The cutover is approximately 1K.  We implement up to
+	// with some straightline code. The REP MOVSL instruction is really fast
+	// for large sizes. The cutover is approximately 1K. We implement up to
 	// 128 because that is the maximum SSE register load (loading all data
 	// into registers lets us ignore copy direction).
 tail:
@@ -43,7 +43,8 @@
 	CMPL	BX, $2
 	JBE	move_1or2
 	CMPL	BX, $4
-	JBE	move_3or4
+	JB	move_3
+	JE	move_4
 	CMPL	BX, $8
 	JBE	move_5through8
 	CMPL	BX, $16
@@ -68,13 +69,30 @@
 /*
  * forward copy loop
  */
-forward:	
+forward:
+	// If REP MOVSB isn't fast, don't use it
+	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
+	JEQ	fwdBy4
+
+	// Check alignment
+	MOVL	SI, AX
+	ORL	DI, AX
+	TESTL	$3, AX
+	JEQ	fwdBy4
+
+	// Do 1 byte at a time
+	MOVL	BX, CX
+	REP;	MOVSB
+	RET
+
+fwdBy4:
+	// Do 4 bytes at a time
 	MOVL	BX, CX
 	SHRL	$2, CX
 	ANDL	$3, BX
-
 	REP;	MOVSL
 	JMP	tail
+
 /*
  * check overlap
  */
@@ -118,11 +136,16 @@
 	RET
 move_0:
 	RET
-move_3or4:
+move_3:
 	MOVW	(SI), AX
-	MOVW	-2(SI)(BX*1), CX
+	MOVB	2(SI), CX
 	MOVW	AX, (DI)
-	MOVW	CX, -2(DI)(BX*1)
+	MOVB	CX, 2(DI)
+	RET
+move_4:
+	// We need a separate case for 4 to make sure we write pointers atomically.
+	MOVL	(SI), AX
+	MOVL	AX, (DI)
 	RET
 move_5through8:
 	MOVL	(SI), AX
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index f968435..39b4c3a 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -1,7 +1,7 @@
 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
@@ -35,8 +35,8 @@
 	MOVQ	n+16(FP), BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code.  The REP MOVSQ instruction is really fast
-	// for large sizes.  The cutover is approximately 2K.
+	// with some straightline code. The REP MOVSQ instruction is really fast
+	// for large sizes. The cutover is approximately 2K.
 tail:
 	// move_129through256 or smaller work whether or not the source and the
 	// destination memory regions overlap because they load all data into
@@ -50,7 +50,8 @@
 	CMPQ	BX, $4
 	JBE	move_3or4
 	CMPQ	BX, $8
-	JBE	move_5through8
+	JB	move_5through7
+	JE	move_8
 	CMPQ	BX, $16
 	JBE	move_9through16
 	CMPQ	BX, $32
@@ -76,6 +77,23 @@
 	CMPQ	BX, $2048
 	JLS	move_256through2048
 
+	// If REP MOVSB isn't fast, don't use it
+	TESTL	$(1<<9), runtime·cpuid_ebx7(SB) // erms, aka enhanced REP MOVSB/STOSB
+	JEQ	fwdBy8
+
+	// Check alignment
+	MOVL	SI, AX
+	ORL	DI, AX
+	TESTL	$7, AX
+	JEQ	fwdBy8
+
+	// Do 1 byte at a time
+	MOVQ	BX, CX
+	REP;	MOVSB
+	RET
+
+fwdBy8:
+	// Do 8 bytes at a time
 	MOVQ	BX, CX
 	SHRQ	$3, CX
 	ANDQ	$7, BX
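
The erms probe above tests CPUID.(EAX=7,ECX=0):EBX bit 9, which advertises
enhanced REP MOVSB/STOSB. The same test in Go, hedged: cpuid here is a
hypothetical helper, whereas the runtime reads the cached runtime·cpuid_ebx7:

	// hasERMS reports whether enhanced REP MOVSB/STOSB is available.
	func hasERMS(cpuid func(leaf, sub uint32) (eax, ebx, ecx, edx uint32)) bool {
		_, ebx, _, _ := cpuid(7, 0)
		return ebx&(1<<9) != 0 // EBX bit 9: erms
	}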
@@ -131,12 +149,17 @@
 	MOVW	AX, (DI)
 	MOVW	CX, -2(DI)(BX*1)
 	RET
-move_5through8:
+move_5through7:
 	MOVL	(SI), AX
 	MOVL	-4(SI)(BX*1), CX
 	MOVL	AX, (DI)
 	MOVL	CX, -4(DI)(BX*1)
 	RET
+move_8:
+	// We need a separate case for 8 to make sure we write pointers atomically.
+	MOVQ	(SI), AX
+	MOVQ	AX, (DI)
+	RET
 move_9through16:
 	MOVQ	(SI), AX
 	MOVQ	-8(SI)(BX*1), CX
diff --git a/src/runtime/memmove_arm.s b/src/runtime/memmove_arm.s
index 35f04a8..6b880d5 100644
--- a/src/runtime/memmove_arm.s
+++ b/src/runtime/memmove_arm.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/memmove-arm.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-arm.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
diff --git a/src/runtime/memmove_arm64.s b/src/runtime/memmove_arm64.s
index 66059a7..00813d4 100644
--- a/src/runtime/memmove_arm64.s
+++ b/src/runtime/memmove_arm64.s
@@ -14,23 +14,78 @@
 	RET
 
 check:
+	AND	$~7, R5, R7	// R7 is N&~7
+	// TODO(mwhudson): this is written this way to avoid tickling
+	// warnings from addpool when written as AND $7, R5, R6 (see
+	// https://golang.org/issue/12708)
+	SUB	R7, R5, R6	// R6 is N&7
+
 	CMP	R3, R4
 	BLT	backward
 
-	ADD	R3, R5
-loop:
-	MOVBU.P	1(R4), R6
-	MOVBU.P	R6, 1(R3)
-	CMP	R3, R5
-	BNE	loop
+	// Copying forward proceeds by copying R7/8 words then copying R6 bytes.
+	// R3 and R4 are advanced as we copy.
+
+	// (There may be implementations of armv8 where copying by bytes until
+	// at least one of source or dest is word aligned is a worthwhile
+	// optimization, but on the one tested so far (xgene) it did not
+	// make a significant difference.)
+
+	CMP	$0, R7		// Do we need to do any word-by-word copying?
+	BEQ	noforwardlarge
+
+	ADD	R3, R7, R9	// R9 points just past where we copy by word
+
+forwardlargeloop:
+	MOVD.P	8(R4), R8	// R8 is just a scratch register
+	MOVD.P	R8, 8(R3)
+	CMP	R3, R9
+	BNE	forwardlargeloop
+
+noforwardlarge:
+	CMP	$0, R6		// Do we need to do any byte-by-byte copying?
+	BNE	forwardtail
+	RET
+
+forwardtail:
+	ADD	R3, R6, R9	// R9 points just past the destination memory
+
+forwardtailloop:
+	MOVBU.P 1(R4), R8
+	MOVBU.P	R8, 1(R3)
+	CMP	R3, R9
+	BNE	forwardtailloop
 	RET
 
 backward:
-	ADD	R5, R4
-	ADD	R3, R5
-loop1:
-	MOVBU.W	-1(R4), R6
-	MOVBU.W	R6, -1(R5)
-	CMP	R3, R5
-	BNE	loop1
+	// Copying backwards proceeds by copying R6 bytes then copying R7/8 words.
+	// R3 and R4 are advanced to the end of the destination/source buffers
+	// respectively and moved back as we copy.
+
+	ADD	R4, R5, R4	// R4 points just past the last source byte
+	ADD	R3, R5, R3	// R3 points just past the last destination byte
+
+	CMP	$0, R6		// Do we need to do any byte-by-byte copying?
+	BEQ	nobackwardtail
+
+	SUB	R6, R3, R9	// R9 points at the lowest destination byte that should be copied by byte.
+backwardtailloop:
+	MOVBU.W	-1(R4), R8
+	MOVBU.W	R8, -1(R3)
+	CMP	R9, R3
+	BNE	backwardtailloop
+
+nobackwardtail:
+	CMP     $0, R7		// Do we need to do any word-by-word copying?
+	BNE	backwardlarge
+	RET
+
+backwardlarge:
+	SUB	R7, R3, R9	// R9 points at the lowest destination byte
+
+backwardlargeloop:
+	MOVD.W	-8(R4), R8
+	MOVD.W	R8, -8(R3)
+	CMP	R9, R3
+	BNE	backwardlargeloop
 	RET
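
A runnable Go sketch of the forward path this assembly implements, operating on
byte slices rather than raw pointers (illustrative only):

	// forwardMove copies src into dst (len(dst) >= len(src)), words first.
	func forwardMove(dst, src []byte) {
		n := len(src)
		w := n &^ 7 // bytes handled a word at a time (R7 above)
		for i := 0; i < w; i += 8 {
			copy(dst[i:i+8], src[i:i+8]) // stands in for the MOVD.P load/store pair
		}
		for i := w; i < n; i++ { // the n&7 tail bytes (R6 above)
			dst[i] = src[i]
		}
	}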
diff --git a/src/runtime/memmove_linux_amd64_test.go b/src/runtime/memmove_linux_amd64_test.go
index f7221f4..1dd5d49 100644
--- a/src/runtime/memmove_linux_amd64_test.go
+++ b/src/runtime/memmove_linux_amd64_test.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/memmove_mips64x.s b/src/runtime/memmove_mips64x.s
new file mode 100644
index 0000000..f0f6852
--- /dev/null
+++ b/src/runtime/memmove_mips64x.s
@@ -0,0 +1,105 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+// void runtime·memmove(void*, void*, uintptr)
+TEXT runtime·memmove(SB), NOSPLIT, $-8-24
+	MOVV	to+0(FP), R1
+	MOVV	from+8(FP), R2
+	MOVV	n+16(FP), R3
+	BNE	R3, check
+	RET
+
+check:
+	SGTU	R1, R2, R4
+	BNE	R4, backward
+
+	ADDV	R1, R3, R6 // end pointer
+
+	// if the two pointers are not of same alignments, do byte copying
+	SUBVU	R2, R1, R4
+	AND	$7, R4
+	BNE	R4, out
+
+	// if less than 8 bytes, do byte copying
+	SGTU	$8, R3, R4
+	BNE	R4, out
+
+	// do one byte at a time until 8-aligned
+	AND	$7, R1, R5
+	BEQ	R5, words
+	MOVB	(R2), R4
+	ADDV	$1, R2
+	MOVB	R4, (R1)
+	ADDV	$1, R1
+	JMP	-6(PC)
+
+words:
+	// do 8 bytes at a time if there is room
+	ADDV	$-7, R6, R3 // R3 is end pointer-7
+
+	SGTU	R3, R1, R5
+	BEQ	R5, out
+	MOVV	(R2), R4
+	ADDV	$8, R2
+	MOVV	R4, (R1)
+	ADDV	$8, R1
+	JMP	-6(PC)
+
+out:
+	BEQ	R1, R6, done
+	MOVB	(R2), R4
+	ADDV	$1, R2
+	MOVB	R4, (R1)
+	ADDV	$1, R1
+	JMP	-5(PC)
+done:
+	RET
+
+backward:
+	ADDV	R3, R2 // from-end pointer
+	ADDV	R1, R3, R6 // to-end pointer
+
+	// if the two pointers are not of same alignments, do byte copying
+	SUBVU	R6, R2, R4
+	AND	$7, R4
+	BNE	R4, out1
+
+	// if less than 8 bytes, do byte copying
+	SGTU	$8, R3, R4
+	BNE	R4, out1
+
+	// do one byte at a time until 8-aligned
+	AND	$7, R6, R5
+	BEQ	R5, words1
+	ADDV	$-1, R2
+	MOVB	(R2), R4
+	ADDV	$-1, R6
+	MOVB	R4, (R6)
+	JMP	-6(PC)
+
+words1:
+	// do 8 bytes at a time if there is room
+	ADDV	$7, R1, R3 // R3 is start pointer+7
+
+	SGTU	R6, R3, R5
+	BEQ	R5, out1
+	ADDV	$-8, R2
+	MOVV	(R2), R4
+	ADDV	$-8, R6
+	MOVV	R4, (R6)
+	JMP	-6(PC)
+
+out1:
+	BEQ	R1, R6, done1
+	ADDV	$-1, R2
+	MOVB	(R2), R4
+	ADDV	$-1, R6
+	MOVB	R4, (R6)
+	JMP	-5(PC)
+done1:
+	RET
diff --git a/src/runtime/memmove_nacl_amd64p32.s b/src/runtime/memmove_nacl_amd64p32.s
index 373607a..13907a9 100644
--- a/src/runtime/memmove_nacl_amd64p32.s
+++ b/src/runtime/memmove_nacl_amd64p32.s
@@ -1,9 +1,12 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 #include "textflag.h"
 
+// This could use MOVSQ, but we use MOVSL so that if an object ends in
+// a 4 byte pointer, we copy it as a unit instead of byte by byte.
+
 TEXT runtime·memmove(SB), NOSPLIT, $0-12
 	MOVL	to+0(FP), DI
 	MOVL	from+4(FP), SI
@@ -14,9 +17,9 @@
 
 forward:
 	MOVL	BX, CX
-	SHRL	$3, CX
-	ANDL	$7, BX
-	REP; MOVSQ
+	SHRL	$2, CX
+	ANDL	$3, BX
+	REP; MOVSL
 	MOVL	BX, CX
 	REP; MOVSB
 	RET
@@ -32,15 +35,18 @@
 	STD
 	
 	MOVL	BX, CX
-	SHRL	$3, CX
-	ANDL	$7, BX
-	SUBL	$8, DI
-	SUBL	$8, SI
-	REP; MOVSQ
-	ADDL	$7, DI
-	ADDL	$7, SI
+	SHRL	$2, CX
+	ANDL	$3, BX
+	SUBL	$4, DI
+	SUBL	$4, SI
+	REP; MOVSL
+	ADDL	$3, DI
+	ADDL	$3, SI
 	MOVL	BX, CX
 	REP; MOVSB
 	CLD
 
+	// Note: we copy only 4 bytes at a time so that the tail is at most
+	// 3 bytes. That guarantees that we aren't copying pointers with MOVSB.
+	// See issue 13160.
 	RET
diff --git a/src/runtime/memmove_plan9_386.s b/src/runtime/memmove_plan9_386.s
index 025d4ce..c4d62ec 100644
--- a/src/runtime/memmove_plan9_386.s
+++ b/src/runtime/memmove_plan9_386.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/memmove-386.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
@@ -31,15 +31,16 @@
 	MOVL	n+8(FP), BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code.  The REP MOVSL instruction is really fast
-	// for large sizes.  The cutover is approximately 1K.
+	// with some straightline code. The REP MOVSL instruction is really fast
+	// for large sizes. The cutover is approximately 1K.
 tail:
 	TESTL	BX, BX
 	JEQ	move_0
 	CMPL	BX, $2
 	JBE	move_1or2
 	CMPL	BX, $4
-	JBE	move_3or4
+	JB	move_3
+	JE	move_4
 	CMPL	BX, $8
 	JBE	move_5through8
 	CMPL	BX, $16
@@ -104,11 +105,16 @@
 	RET
 move_0:
 	RET
-move_3or4:
+move_3:
 	MOVW	(SI), AX
-	MOVW	-2(SI)(BX*1), CX
+	MOVB	2(SI), CX
 	MOVW	AX, (DI)
-	MOVW	CX, -2(DI)(BX*1)
+	MOVB	CX, 2(DI)
+	RET
+move_4:
+	// We need a separate case for 4 to make sure we write pointers atomically.
+	MOVL	(SI), AX
+	MOVL	AX, (DI)
 	RET
 move_5through8:
 	MOVL	(SI), AX
diff --git a/src/runtime/memmove_plan9_amd64.s b/src/runtime/memmove_plan9_amd64.s
index 8e96b87..9bef31d 100644
--- a/src/runtime/memmove_plan9_amd64.s
+++ b/src/runtime/memmove_plan9_amd64.s
@@ -1,7 +1,7 @@
 // Derived from Inferno's libkern/memmove-386.s (adapted for amd64)
 // http://code.google.com/p/inferno-os/source/browse/libkern/memmove-386.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
@@ -33,8 +33,8 @@
 	MOVQ	n+16(FP), BX
 
 	// REP instructions have a high startup cost, so we handle small sizes
-	// with some straightline code.  The REP MOVSQ instruction is really fast
-	// for large sizes.  The cutover is approximately 1K.
+	// with some straightline code. The REP MOVSQ instruction is really fast
+	// for large sizes. The cutover is approximately 1K.
 tail:
 	TESTQ	BX, BX
 	JEQ	move_0
@@ -43,7 +43,8 @@
 	CMPQ	BX, $4
 	JBE	move_3or4
 	CMPQ	BX, $8
-	JBE	move_5through8
+	JB	move_5through7
+	JE	move_8
 	CMPQ	BX, $16
 	JBE	move_9through16
 
@@ -113,12 +114,17 @@
 	MOVW	AX, (DI)
 	MOVW	CX, -2(DI)(BX*1)
 	RET
-move_5through8:
+move_5through7:
 	MOVL	(SI), AX
 	MOVL	-4(SI)(BX*1), CX
 	MOVL	AX, (DI)
 	MOVL	CX, -4(DI)(BX*1)
 	RET
+move_8:
+	// We need a separate case for 8 to make sure we write pointers atomically.
+	MOVQ	(SI), AX
+	MOVQ	AX, (DI)
+	RET
 move_9through16:
 	MOVQ	(SI), AX
 	MOVQ	-8(SI)(BX*1), CX
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index 3ada63e..26dabd9 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -7,34 +7,113 @@
 #include "textflag.h"
 
 // void runtime·memmove(void*, void*, uintptr)
-TEXT runtime·memmove(SB), NOSPLIT, $-8-24
+TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	MOVD	to+0(FP), R3
 	MOVD	from+8(FP), R4
 	MOVD	n+16(FP), R5
-	CMP	R5, $0
-	BNE	check
-	RET
 
+	// Determine if there are doublewords to
+	// copy so a more efficient move can be done
 check:
-	CMP	R3, R4
-	BGT	backward
+	ANDCC	$7, R5, R7	// R7: bytes to copy
+	SRAD	$3, R5, R6	// R6: double words to copy
+	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
 
-	SUB	$1, R3
-	ADD	R3, R5
-	SUB	$1, R4
-loop:
-	MOVBU	1(R4), R6
-	MOVBU	R6, 1(R3)
-	CMP	R3, R5
-	BNE	loop
+	// Determine overlap by subtracting dest - src and comparing against the
+	// length. This catches the cases where src and dest are in different types
+	// of storage, such as stack and static, to avoid doing a backward move when
+	// it is not necessary.
+
+	SUB	R4, R3, R8	// dest - src
+	CMPU	R8, R5, CR2	// < len?
+	BC	12, 8, backward // BLT CR2 backward
+
+	// Copying forward if no overlap.
+
+	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
+	MOVD	R6,CTR			// R6 = number of double words
+	SRADCC	$2,R6,R8		// 32 byte chunks?
+	BNE	forward32setup		//
+
+	// Move double words
+
+forward8:
+	MOVD    0(R4), R8		// double word
+	ADD     $8,R4
+	MOVD    R8, 0(R3)		//
+	ADD     $8,R3
+	BC      16, 0, forward8
+	BR	noforwardlarge		// handle remainder
+
+	// Prepare for moves of 32 bytes at a time.
+
+forward32setup:
+	DCBTST	(R3)			// prepare data cache
+	DCBT	(R4)
+	MOVD	R8, CTR			// 32-byte chunk count
+
+forward32:
+	MOVD	0(R4), R8		// load 4 double words
+	MOVD	8(R4), R9
+	MOVD	16(R4), R14
+	MOVD	24(R4), R15
+	ADD	$32,R4
+	MOVD	R8, 0(R3)		// store those 4
+	MOVD	R9, 8(R3)
+	MOVD	R14,16(R3)
+	MOVD	R15,24(R3)
+	ADD	$32,R3			// bump up for next set
+	BC	16, 0, forward32	// continue
+	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
+	BEQ	noforwardlarge
+	MOVD	R6,CTR			// set up the CTR
+	BR	forward8
+
+noforwardlarge:
+	CMP	R7,$0			// any remaining bytes
+	BC	4, 1, LR
+
+forwardtail:
+	MOVD	R7, CTR			// move tail bytes
+
+forwardtailloop:
+	MOVBZ	0(R4), R8		// move single bytes
+	ADD	$1,R4
+	MOVBZ	R8, 0(R3)
+	ADD	$1,R3
+	BC	16, 0, forwardtailloop
 	RET
 
 backward:
-	ADD	R5, R4
-	ADD	R3, R5
-loop1:
-	MOVBU	-1(R4), R6
-	MOVBU	R6, -1(R5)
-	CMP	R3, R5
-	BNE	loop1
+	// Copying backwards proceeds by copying R7 bytes then copying R6 double words.
+	// R3 and R4 are advanced to the end of the destination/source buffers
+	// respectively and moved back as we copy.
+
+	ADD	R5, R4, R4		// end of source
+	ADD	R3, R5, R3		// end of dest
+
+	BEQ	nobackwardtail		// CR0 still holds the ANDCC result: no tail bytes
+
+	MOVD	R7, CTR			// bytes to move
+
+backwardtailloop:
+	MOVBZ 	-1(R4), R8		// point to last byte
+	SUB	$1,R4
+	MOVBZ 	R8, -1(R3)
+	SUB	$1,R3
+	BC	16, 0, backwardtailloop
+
+nobackwardtail:
+	CMP	R6,$0
+	BC	4, 5, LR
+
+backwardlarge:
+	MOVD	R6, CTR
+
+backwardlargeloop:
+	MOVD 	-8(R4), R8
+	SUB	$8,R4
+	MOVD 	R8, -8(R3)
+	SUB	$8,R3
+	BC	16, 0, backwardlargeloop	//
 	RET
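
The single unsigned compare suffices because dest - src underflows to a huge
value whenever dest < src, and a forward copy is always safe in that case. As a
hedged Go one-liner:

	// needBackward reports whether dst lands inside [src, src+n), the only
	// case where a forward copy would clobber unread source bytes. The
	// unsigned subtraction wraps when dst < src, failing the compare.
	func needBackward(dst, src, n uintptr) bool {
		return dst-src < n
	}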
diff --git a/src/runtime/memmove_s390x.s b/src/runtime/memmove_s390x.s
new file mode 100644
index 0000000..238f308
--- /dev/null
+++ b/src/runtime/memmove_s390x.s
@@ -0,0 +1,189 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// void runtime·memmove(void*, void*, uintptr)
+TEXT runtime·memmove(SB),NOSPLIT|NOFRAME,$0-24
+	MOVD	to+0(FP), R6
+	MOVD	from+8(FP), R4
+	MOVD	n+16(FP), R5
+
+	CMPBEQ	R6, R4, done
+
+start:
+	CMPBLE	R5, $3, move0to3
+	CMPBLE	R5, $7, move4to7
+	CMPBLE	R5, $11, move8to11
+	CMPBLE	R5, $15, move12to15
+	CMPBNE	R5, $16, movemt16
+	MOVD	0(R4), R7
+	MOVD	8(R4), R8
+	MOVD	R7, 0(R6)
+	MOVD	R8, 8(R6)
+	RET
+
+movemt16:
+	CMPBGT	R4, R6, forwards
+	ADD	R5, R4, R7
+	CMPBLE	R7, R6, forwards
+	ADD	R5, R6, R8
+backwards:
+	MOVD	-8(R7), R3
+	MOVD	R3, -8(R8)
+	MOVD	-16(R7), R3
+	MOVD	R3, -16(R8)
+	ADD	$-16, R5
+	ADD	$-16, R7
+	ADD	$-16, R8
+	CMP	R5, $16
+	BGE	backwards
+	BR	start
+
+forwards:
+	CMPBGT	R5, $64, forwards_fast
+	MOVD	0(R4), R3
+	MOVD	R3, 0(R6)
+	MOVD	8(R4), R3
+	MOVD	R3, 8(R6)
+	ADD	$16, R4
+	ADD	$16, R6
+	ADD	$-16, R5
+	CMP	R5, $16
+	BGE	forwards
+	BR	start
+
+forwards_fast:
+	CMP	R5, $256
+	BLE	forwards_small
+	MVC	$256, 0(R4), 0(R6)
+	ADD	$256, R4
+	ADD	$256, R6
+	ADD	$-256, R5
+	BR	forwards_fast
+
+forwards_small:
+	CMPBEQ	R5, $0, done
+	ADD	$-1, R5
+	EXRL	$runtime·memmove_s390x_exrl_mvc(SB), R5
+	RET
+
+move0to3:
+	CMPBEQ	R5, $0, done
+move1:
+	CMPBNE	R5, $1, move2
+	MOVB	0(R4), R3
+	MOVB	R3, 0(R6)
+	RET
+move2:
+	CMPBNE	R5, $2, move3
+	MOVH	0(R4), R3
+	MOVH	R3, 0(R6)
+	RET
+move3:
+	MOVH	0(R4), R3
+	MOVB	2(R4), R7
+	MOVH	R3, 0(R6)
+	MOVB	R7, 2(R6)
+	RET
+
+move4to7:
+	CMPBNE	R5, $4, move5
+	MOVW	0(R4), R3
+	MOVW	R3, 0(R6)
+	RET
+move5:
+	CMPBNE	R5, $5, move6
+	MOVW	0(R4), R3
+	MOVB	4(R4), R7
+	MOVW	R3, 0(R6)
+	MOVB	R7, 4(R6)
+	RET
+move6:
+	CMPBNE	R5, $6, move7
+	MOVW	0(R4), R3
+	MOVH	4(R4), R7
+	MOVW	R3, 0(R6)
+	MOVH	R7, 4(R6)
+	RET
+move7:
+	MOVW	0(R4), R3
+	MOVH	4(R4), R7
+	MOVB	6(R4), R8
+	MOVW	R3, 0(R6)
+	MOVH	R7, 4(R6)
+	MOVB	R8, 6(R6)
+	RET
+
+move8to11:
+	CMPBNE	R5, $8, move9
+	MOVD	0(R4), R3
+	MOVD	R3, 0(R6)
+	RET
+move9:
+	CMPBNE	R5, $9, move10
+	MOVD	0(R4), R3
+	MOVB	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVB	R7, 8(R6)
+	RET
+move10:
+	CMPBNE	R5, $10, move11
+	MOVD	0(R4), R3
+	MOVH	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVH	R7, 8(R6)
+	RET
+move11:
+	MOVD	0(R4), R3
+	MOVH	8(R4), R7
+	MOVB	10(R4), R8
+	MOVD	R3, 0(R6)
+	MOVH	R7, 8(R6)
+	MOVB	R8, 10(R6)
+	RET
+
+move12to15:
+	CMPBNE	R5, $12, move13
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	RET
+move13:
+	CMPBNE	R5, $13, move14
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVB	12(R4), R8
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVB	R8, 12(R6)
+	RET
+move14:
+	CMPBNE	R5, $14, move15
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVH	12(R4), R8
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVH	R8, 12(R6)
+	RET
+move15:
+	MOVD	0(R4), R3
+	MOVW	8(R4), R7
+	MOVH	12(R4), R8
+	MOVB	14(R4), R10
+	MOVD	R3, 0(R6)
+	MOVW	R7, 8(R6)
+	MOVH	R8, 12(R6)
+	MOVB	R10, 14(R6)
+done:
+	RET
+
+// DO NOT CALL - target for exrl (execute relative long) instruction.
+TEXT runtime·memmove_s390x_exrl_mvc(SB),NOSPLIT|NOFRAME,$0-0
+	MVC	$1, 0(R4), 0(R6)
+	MOVD	R0, 0(R0)
+	RET
+
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 857f99b..2124cb9 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -1,10 +1,11 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime_test
 
 import (
+	"fmt"
 	. "runtime"
 	"testing"
 )
@@ -81,40 +82,49 @@
 	}
 }
 
-func bmMemmove(b *testing.B, n int) {
-	x := make([]byte, n)
-	y := make([]byte, n)
-	b.SetBytes(int64(n))
-	for i := 0; i < b.N; i++ {
-		copy(x, y)
+func benchmarkSizes(b *testing.B, sizes []int, fn func(b *testing.B, n int)) {
+	for _, n := range sizes {
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n))
+			fn(b, n)
+		})
 	}
 }
 
-func BenchmarkMemmove0(b *testing.B)    { bmMemmove(b, 0) }
-func BenchmarkMemmove1(b *testing.B)    { bmMemmove(b, 1) }
-func BenchmarkMemmove2(b *testing.B)    { bmMemmove(b, 2) }
-func BenchmarkMemmove3(b *testing.B)    { bmMemmove(b, 3) }
-func BenchmarkMemmove4(b *testing.B)    { bmMemmove(b, 4) }
-func BenchmarkMemmove5(b *testing.B)    { bmMemmove(b, 5) }
-func BenchmarkMemmove6(b *testing.B)    { bmMemmove(b, 6) }
-func BenchmarkMemmove7(b *testing.B)    { bmMemmove(b, 7) }
-func BenchmarkMemmove8(b *testing.B)    { bmMemmove(b, 8) }
-func BenchmarkMemmove9(b *testing.B)    { bmMemmove(b, 9) }
-func BenchmarkMemmove10(b *testing.B)   { bmMemmove(b, 10) }
-func BenchmarkMemmove11(b *testing.B)   { bmMemmove(b, 11) }
-func BenchmarkMemmove12(b *testing.B)   { bmMemmove(b, 12) }
-func BenchmarkMemmove13(b *testing.B)   { bmMemmove(b, 13) }
-func BenchmarkMemmove14(b *testing.B)   { bmMemmove(b, 14) }
-func BenchmarkMemmove15(b *testing.B)   { bmMemmove(b, 15) }
-func BenchmarkMemmove16(b *testing.B)   { bmMemmove(b, 16) }
-func BenchmarkMemmove32(b *testing.B)   { bmMemmove(b, 32) }
-func BenchmarkMemmove64(b *testing.B)   { bmMemmove(b, 64) }
-func BenchmarkMemmove128(b *testing.B)  { bmMemmove(b, 128) }
-func BenchmarkMemmove256(b *testing.B)  { bmMemmove(b, 256) }
-func BenchmarkMemmove512(b *testing.B)  { bmMemmove(b, 512) }
-func BenchmarkMemmove1024(b *testing.B) { bmMemmove(b, 1024) }
-func BenchmarkMemmove2048(b *testing.B) { bmMemmove(b, 2048) }
-func BenchmarkMemmove4096(b *testing.B) { bmMemmove(b, 4096) }
+var bufSizes = []int{
+	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+	32, 64, 128, 256, 512, 1024, 2048, 4096,
+}
+
+func BenchmarkMemmove(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		y := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			copy(x, y)
+		}
+	})
+}
+
+func BenchmarkMemmoveUnalignedDst(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n+1)
+		y := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			copy(x[1:], y)
+		}
+	})
+}
+
+func BenchmarkMemmoveUnalignedSrc(b *testing.B) {
+	benchmarkSizes(b, bufSizes, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		y := make([]byte, n+1)
+		for i := 0; i < b.N; i++ {
+			copy(x, y[1:])
+		}
+	})
+}
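+
+// With the table-driven form, individual sizes can be selected through Go
+// 1.7's new subbenchmark name matching, for example (flag pattern hedged):
+//
+//	go test -run=NONE -bench=BenchmarkMemmove/4096 runtime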
 
 func TestMemclr(t *testing.T) {
 	size := 512
@@ -148,33 +158,37 @@
 	}
 }
 
-func bmMemclr(b *testing.B, n int) {
-	x := make([]byte, n)
-	b.SetBytes(int64(n))
-	for i := 0; i < b.N; i++ {
-		MemclrBytes(x)
+func BenchmarkMemclr(b *testing.B) {
+	for _, n := range []int{5, 16, 64, 256, 4096, 65536} {
+		x := make([]byte, n)
+		b.Run(fmt.Sprint(n), func(b *testing.B) {
+			b.SetBytes(int64(n))
+			for i := 0; i < b.N; i++ {
+				MemclrBytes(x)
+			}
+		})
+	}
+	for _, m := range []int{1, 4, 8, 16, 64} {
+		x := make([]byte, m<<20)
+		b.Run(fmt.Sprint(m, "M"), func(b *testing.B) {
+			b.SetBytes(int64(m << 20))
+			for i := 0; i < b.N; i++ {
+				MemclrBytes(x)
+			}
+		})
 	}
 }
-func BenchmarkMemclr5(b *testing.B)     { bmMemclr(b, 5) }
-func BenchmarkMemclr16(b *testing.B)    { bmMemclr(b, 16) }
-func BenchmarkMemclr64(b *testing.B)    { bmMemclr(b, 64) }
-func BenchmarkMemclr256(b *testing.B)   { bmMemclr(b, 256) }
-func BenchmarkMemclr4096(b *testing.B)  { bmMemclr(b, 4096) }
-func BenchmarkMemclr65536(b *testing.B) { bmMemclr(b, 65536) }
 
-func bmGoMemclr(b *testing.B, n int) {
-	x := make([]byte, n)
-	b.SetBytes(int64(n))
-	for i := 0; i < b.N; i++ {
-		for j := range x {
-			x[j] = 0
+func BenchmarkGoMemclr(b *testing.B) {
+	benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
+		x := make([]byte, n)
+		for i := 0; i < b.N; i++ {
+			for j := range x {
+				x[j] = 0
+			}
 		}
-	}
+	})
 }
-func BenchmarkGoMemclr5(b *testing.B)   { bmGoMemclr(b, 5) }
-func BenchmarkGoMemclr16(b *testing.B)  { bmGoMemclr(b, 16) }
-func BenchmarkGoMemclr64(b *testing.B)  { bmGoMemclr(b, 64) }
-func BenchmarkGoMemclr256(b *testing.B) { bmGoMemclr(b, 256) }
 
 func BenchmarkClearFat8(b *testing.B) {
 	for i := 0; i < b.N; i++ {
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index 7e1773c..1a744e4 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,21 +6,25 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type finblock struct {
 	alllink *finblock
 	next    *finblock
 	cnt     int32
 	_       int32
-	fin     [(_FinBlockSize - 2*ptrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
+	fin     [(_FinBlockSize - 2*sys.PtrSize - 2*4) / unsafe.Sizeof(finalizer{})]finalizer
 }
 
 var finlock mutex  // protects the following variables
 var fing *g        // goroutine that runs finalizers
 var finq *finblock // list of finalizers that are to be executed
 var finc *finblock // cache of free blocks
-var finptrmask [_FinBlockSize / ptrSize / 8]byte
+var finptrmask [_FinBlockSize / sys.PtrSize / 8]byte
 var fingwait bool
 var fingwake bool
 var allfin *finblock // list of all blocks
@@ -73,12 +77,12 @@
 			if finptrmask[0] == 0 {
 				// Build pointer mask for Finalizer array in block.
 				// Check assumptions made in finalizer1 array above.
-				if (unsafe.Sizeof(finalizer{}) != 5*ptrSize ||
+				if (unsafe.Sizeof(finalizer{}) != 5*sys.PtrSize ||
 					unsafe.Offsetof(finalizer{}.fn) != 0 ||
-					unsafe.Offsetof(finalizer{}.arg) != ptrSize ||
-					unsafe.Offsetof(finalizer{}.nret) != 2*ptrSize ||
-					unsafe.Offsetof(finalizer{}.fint) != 3*ptrSize ||
-					unsafe.Offsetof(finalizer{}.ot) != 4*ptrSize) {
+					unsafe.Offsetof(finalizer{}.arg) != sys.PtrSize ||
+					unsafe.Offsetof(finalizer{}.nret) != 2*sys.PtrSize ||
+					unsafe.Offsetof(finalizer{}.fint) != 3*sys.PtrSize ||
+					unsafe.Offsetof(finalizer{}.ot) != 4*sys.PtrSize) {
 					throw("finalizer out of sync")
 				}
 				for i := range finptrmask {
@@ -131,7 +135,7 @@
 
 func createfing() {
 	// start the finalizer goroutine exactly once
-	if fingCreate == 0 && cas(&fingCreate, 0, 1) {
+	if fingCreate == 0 && atomic.Cas(&fingCreate, 0, 1) {
 		go runfinq()
 	}
 }
@@ -160,15 +164,15 @@
 		}
 		for fb != nil {
 			for i := fb.cnt; i > 0; i-- {
-				f := (*finalizer)(add(unsafe.Pointer(&fb.fin), uintptr(i-1)*unsafe.Sizeof(finalizer{})))
+				f := &fb.fin[i-1]
 
-				framesz := unsafe.Sizeof((interface{})(nil)) + uintptr(f.nret)
+				framesz := unsafe.Sizeof((interface{})(nil)) + f.nret
 				if framecap < framesz {
 					// The frame does not contain pointers interesting for GC,
 					// all not yet finalized objects are stored in finq.
 					// If we do not mark it as FlagNoScan,
 					// the last finalized object is not collected.
-					frame = mallocgc(framesz, nil, flagNoScan)
+					frame = mallocgc(framesz, nil, true)
 					framecap = framesz
 				}
 
@@ -187,7 +191,7 @@
 					if len(ityp.mhdr) != 0 {
 						// convert to interface with methods
 						// this conversion is guaranteed to succeed - we checked in SetFinalizer
-						assertE2I(ityp, *(*interface{})(frame), (*fInterface)(frame))
+						assertE2I(ityp, *(*eface)(frame), (*iface)(frame))
 					}
 				default:
 					throw("bad kind in runfinq")
@@ -212,20 +216,20 @@
 	}
 }
 
-// SetFinalizer sets the finalizer associated with x to f.
-// When the garbage collector finds an unreachable block
+// SetFinalizer sets the finalizer associated with obj to the provided
+// finalizer function. When the garbage collector finds an unreachable block
 // with an associated finalizer, it clears the association and runs
-// f(x) in a separate goroutine.  This makes x reachable again, but
-// now without an associated finalizer.  Assuming that SetFinalizer
+// finalizer(obj) in a separate goroutine. This makes obj reachable again,
+// but now without an associated finalizer. Assuming that SetFinalizer
 // is not called again, the next time the garbage collector sees
-// that x is unreachable, it will free x.
+// that obj is unreachable, it will free obj.
 //
-// SetFinalizer(x, nil) clears any finalizer associated with x.
+// SetFinalizer(obj, nil) clears any finalizer associated with obj.
 //
-// The argument x must be a pointer to an object allocated by
+// The argument obj must be a pointer to an object allocated by
 // calling new or by taking the address of a composite literal.
-// The argument f must be a function that takes a single argument
-// to which x's type can be assigned, and can have arbitrary ignored return
+// The argument finalizer must be a function that takes a single argument
+// to which obj's type can be assigned, and can have arbitrary ignored return
 // values. If either of these is not true, SetFinalizer aborts the
 // program.
 //
@@ -237,8 +241,8 @@
 // is not guaranteed to run, because there is no ordering that
 // respects the dependencies.
 //
-// The finalizer for x is scheduled to run at some arbitrary time after
-// x becomes unreachable.
+// The finalizer for obj is scheduled to run at some arbitrary time after
+// obj becomes unreachable.
 // There is no guarantee that finalizers will run before a program exits,
 // so typically they are useful only for releasing non-memory resources
 // associated with an object during a long-running program.
@@ -248,13 +252,31 @@
 // to depend on a finalizer to flush an in-memory I/O buffer such as a
 // bufio.Writer, because the buffer would not be flushed at program exit.
 //
-// It is not guaranteed that a finalizer will run if the size of *x is
+// It is not guaranteed that a finalizer will run if the size of *obj is
 // zero bytes.
 //
 // It is not guaranteed that a finalizer will run for objects allocated
 // in initializers for package-level variables. Such objects may be
 // linker-allocated, not heap-allocated.
 //
+// A finalizer may run as soon as an object becomes unreachable.
+// In order to use finalizers correctly, the program must ensure that
+// the object is reachable until it is no longer required.
+// Objects stored in global variables, or that can be found by tracing
+// pointers from a global variable, are reachable. For other objects,
+// pass the object to a call of the KeepAlive function to mark the
+// last point in the function where the object must be reachable.
+//
+// For example, if p points to a struct that contains a file descriptor d,
+// and p has a finalizer that closes that file descriptor, and if the last
+// use of p in a function is a call to syscall.Write(p.d, buf, size), then
+// p may be unreachable as soon as the program enters syscall.Write. The
+// finalizer may run at that moment, closing p.d, causing syscall.Write
+// to fail because it is writing to a closed file descriptor (or, worse,
+// to an entirely different file descriptor opened by a different goroutine).
+// To avoid this problem, call runtime.KeepAlive(p) after the call to
+// syscall.Write.
+//
 // A single goroutine runs all finalizers for a program, sequentially.
 // If a finalizer must run for a long time, it should do so by starting
 // a new goroutine.
@@ -264,13 +286,13 @@
 		// (and we don't have the data structures to record them).
 		return
 	}
-	e := (*eface)(unsafe.Pointer(&obj))
+	e := efaceOf(&obj)
 	etyp := e._type
 	if etyp == nil {
 		throw("runtime.SetFinalizer: first argument is nil")
 	}
 	if etyp.kind&kindMask != kindPtr {
-		throw("runtime.SetFinalizer: first argument is " + *etyp._string + ", not pointer")
+		throw("runtime.SetFinalizer: first argument is " + etyp.string() + ", not pointer")
 	}
 	ot := (*ptrtype)(unsafe.Pointer(etyp))
 	if ot.elem == nil {
@@ -313,7 +335,7 @@
 		}
 	}
 
-	f := (*eface)(unsafe.Pointer(&finalizer))
+	f := efaceOf(&finalizer)
 	ftyp := f._type
 	if ftyp == nil {
 		// switch to system stack and remove finalizer
@@ -324,20 +346,22 @@
 	}
 
 	if ftyp.kind&kindMask != kindFunc {
-		throw("runtime.SetFinalizer: second argument is " + *ftyp._string + ", not a function")
+		throw("runtime.SetFinalizer: second argument is " + ftyp.string() + ", not a function")
 	}
 	ft := (*functype)(unsafe.Pointer(ftyp))
-	ins := *(*[]*_type)(unsafe.Pointer(&ft.in))
-	if ft.dotdotdot || len(ins) != 1 {
-		throw("runtime.SetFinalizer: cannot pass " + *etyp._string + " to finalizer " + *ftyp._string)
+	if ft.dotdotdot() {
+		throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string() + " because dotdotdot")
 	}
-	fint := ins[0]
+	if ft.dotdotdot() || ft.inCount != 1 {
+		throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string())
+	}
+	fint := ft.in()[0]
 	switch {
 	case fint == etyp:
 		// ok - same type
 		goto okarg
 	case fint.kind&kindMask == kindPtr:
-		if (fint.x == nil || fint.x.name == nil || etyp.x == nil || etyp.x.name == nil) && (*ptrtype)(unsafe.Pointer(fint)).elem == ot.elem {
+		if (fint.uncommon() == nil || etyp.uncommon() == nil) && (*ptrtype)(unsafe.Pointer(fint)).elem == ot.elem {
 			// ok - not same type, but both pointers,
 			// one or the other is unnamed, and same element type, so assignable.
 			goto okarg
@@ -348,18 +372,18 @@
 			// ok - satisfies empty interface
 			goto okarg
 		}
-		if assertE2I2(ityp, obj, nil) {
+		if assertE2I2(ityp, *efaceOf(&obj), nil) {
 			goto okarg
 		}
 	}
-	throw("runtime.SetFinalizer: cannot pass " + *etyp._string + " to finalizer " + *ftyp._string)
+	throw("runtime.SetFinalizer: cannot pass " + etyp.string() + " to finalizer " + ftyp.string())
 okarg:
 	// compute size needed for return parameters
 	nret := uintptr(0)
-	for _, t := range *(*[]*_type)(unsafe.Pointer(&ft.out)) {
+	for _, t := range ft.out() {
 		nret = round(nret, uintptr(t.align)) + uintptr(t.size)
 	}
-	nret = round(nret, ptrSize)
+	nret = round(nret, sys.PtrSize)
 
 	// make sure we have a finalizer goroutine
 	createfing()
@@ -371,13 +395,13 @@
 	})
 }
 
-// Look up pointer v in heap.  Return the span containing the object,
-// the start of the object, and the size of the object.  If the object
+// Look up pointer v in heap. Return the span containing the object,
+// the start of the object, and the size of the object. If the object
 // does not exist, return nil, nil, 0.
 func findObject(v unsafe.Pointer) (s *mspan, x unsafe.Pointer, n uintptr) {
 	c := gomcache()
 	c.local_nlookup++
-	if ptrSize == 4 && c.local_nlookup >= 1<<30 {
+	if sys.PtrSize == 4 && c.local_nlookup >= 1<<30 {
 		// purge cache stats to prevent overflow
 		lock(&mheap_.lock)
 		purgecachedstats(c)
@@ -385,18 +409,18 @@
 	}
 
 	// find span
-	arena_start := uintptr(unsafe.Pointer(mheap_.arena_start))
-	arena_used := uintptr(unsafe.Pointer(mheap_.arena_used))
+	arena_start := mheap_.arena_start
+	arena_used := mheap_.arena_used
 	if uintptr(v) < arena_start || uintptr(v) >= arena_used {
 		return
 	}
 	p := uintptr(v) >> pageShift
 	q := p - arena_start>>pageShift
-	s = *(**mspan)(add(unsafe.Pointer(mheap_.spans), q*ptrSize))
+	s = *(**mspan)(add(unsafe.Pointer(mheap_.spans), q*sys.PtrSize))
 	if s == nil {
 		return
 	}
-	x = unsafe.Pointer(uintptr(s.start) << pageShift)
+	x = unsafe.Pointer(s.base())
 
 	if uintptr(v) < uintptr(x) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != mSpanInUse {
 		s = nil
@@ -404,9 +428,37 @@
 		return
 	}
 
-	n = uintptr(s.elemsize)
+	n = s.elemsize
 	if s.sizeclass != 0 {
 		x = add(x, (uintptr(v)-uintptr(x))/n*n)
 	}
 	return
 }
+
+// Mark KeepAlive as noinline so that the current compiler will ensure
+// that the argument is alive at the point of the function call.
+// If it were inlined, it would disappear, and there would be nothing
+// keeping the argument alive. Perhaps a future compiler will recognize
+// runtime.KeepAlive specially and do something more efficient.
+//go:noinline
+
+// KeepAlive marks its argument as currently reachable.
+// This ensures that the object is not freed, and its finalizer is not run,
+// before the point in the program where KeepAlive is called.
+//
+// A very simplified example showing where KeepAlive is required:
+// 	type File struct { d int }
+// 	d, err := syscall.Open("/file/path", syscall.O_RDONLY, 0)
+// 	// ... do something if err != nil ...
+// 	p := &File{d}
+// 	runtime.SetFinalizer(p, func(p *File) { syscall.Close(p.d) })
+// 	var buf [10]byte
+// 	n, err := syscall.Read(p.d, buf[:])
+// 	// Ensure p is not finalized until Read returns.
+// 	runtime.KeepAlive(p)
+// 	// No more uses of p after this point.
+//
+// Without the KeepAlive call, the finalizer could run at the start of
+// syscall.Read, closing the file descriptor before syscall.Read makes
+// the actual system call.
+func KeepAlive(interface{}) {}
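
A self-contained, Unix-flavored sketch tying the SetFinalizer and KeepAlive
documentation together (package, type, and error handling are illustrative):

	package fileexample

	import (
		"runtime"
		"syscall"
	)

	type file struct{ fd int }

	func newFile(fd int) *file {
		f := &file{fd: fd}
		runtime.SetFinalizer(f, func(f *file) { syscall.Close(f.fd) })
		return f
	}

	func read(f *file, buf []byte) (int, error) {
		n, err := syscall.Read(f.fd, buf)
		runtime.KeepAlive(f) // f, hence f.fd, stays live until Read returns
		return n, err
	}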
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index bb2f4e7..c4ab648 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Fixed-size object allocator.  Returned memory is not zeroed.
+// Fixed-size object allocator. Returned memory is not zeroed.
 //
 // See malloc.go for overview.
 
@@ -20,18 +20,18 @@
 // smashed by freeing and reallocating.
 type fixalloc struct {
 	size   uintptr
-	first  unsafe.Pointer // go func(unsafe.pointer, unsafe.pointer); f(arg, p) called first time p is returned
+	first  func(arg, p unsafe.Pointer) // called first time p is returned
 	arg    unsafe.Pointer
 	list   *mlink
-	chunk  *byte
+	chunk  unsafe.Pointer
 	nchunk uint32
 	inuse  uintptr // in-use bytes now
 	stat   *uint64
 }
 
 // A generic linked list of blocks.  (Typically the block is bigger than sizeof(MLink).)
-// Since assignments to mlink.next will result in a write barrier being preformed
-// this can not be used by some of the internal GC structures. For example when
+// Since assignments to mlink.next will result in a write barrier being performed
+// this cannot be used by some of the internal GC structures. For example when
 // the sweeper is placing an unmarked object on the free list it does not want the
 // write barrier to be called since that could result in the object being reachable.
 type mlink struct {
@@ -40,9 +40,9 @@
 
 // Initialize f to allocate objects of the given size,
 // using the allocator to obtain chunks of memory.
-func fixAlloc_Init(f *fixalloc, size uintptr, first func(unsafe.Pointer, unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
 	f.size = size
-	f.first = *(*unsafe.Pointer)(unsafe.Pointer(&first))
+	f.first = first
 	f.arg = arg
 	f.list = nil
 	f.chunk = nil
@@ -51,7 +51,7 @@
 	f.stat = stat
 }
 
-func fixAlloc_Alloc(f *fixalloc) unsafe.Pointer {
+func (f *fixalloc) alloc() unsafe.Pointer {
 	if f.size == 0 {
 		print("runtime: use of FixAlloc_Alloc before FixAlloc_Init\n")
 		throw("runtime: internal error")
@@ -64,22 +64,21 @@
 		return v
 	}
 	if uintptr(f.nchunk) < f.size {
-		f.chunk = (*uint8)(persistentalloc(_FixAllocChunk, 0, f.stat))
+		f.chunk = persistentalloc(_FixAllocChunk, 0, f.stat)
 		f.nchunk = _FixAllocChunk
 	}
 
-	v := (unsafe.Pointer)(f.chunk)
+	v := f.chunk
 	if f.first != nil {
-		fn := *(*func(unsafe.Pointer, unsafe.Pointer))(unsafe.Pointer(&f.first))
-		fn(f.arg, v)
+		f.first(f.arg, v)
 	}
-	f.chunk = (*byte)(add(unsafe.Pointer(f.chunk), f.size))
+	f.chunk = add(f.chunk, f.size)
 	f.nchunk -= uint32(f.size)
 	f.inuse += f.size
 	return v
 }
 
-func fixAlloc_Free(f *fixalloc, p unsafe.Pointer) {
+func (f *fixalloc) free(p unsafe.Pointer) {
 	f.inuse -= f.size
 	v := (*mlink)(p)
 	v.next = f.list
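
A hedged sketch of a caller of the new method-style API, mirroring (up to
details) how mheap initializes its span allocator elsewhere in the runtime:

	var spanalloc fixalloc
	spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(&mheap_), &memstats.mspan_sys)
	s := (*mspan)(spanalloc.alloc()) // memory is not zeroed; the caller initializes it
	// ... use s ...
	spanalloc.free(unsafe.Pointer(s))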
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 82b12b6..1eabf43 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -24,6 +24,10 @@
 // Hudson, R., and Moss, J.E.B. Copying Garbage Collection without stopping the world.
 // Concurrency and Computation: Practice and Experience 15(3-5), 2003.
 //
+// TODO(austin): The rest of this comment is woefully out of date and
+// needs to be rewritten. There is no distinct scan phase any more and
+// we allocate black during GC.
+//
 //  0. Set phase = GCscan from GCoff.
 //  1. Wait for all P's to acknowledge phase change.
 //         At this point all goroutines have passed through a GC safepoint and
@@ -120,20 +124,16 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
 	_DebugGC         = 0
 	_ConcurrentSweep = true
 	_FinBlockSize    = 4 * 1024
-	_RootData        = 0
-	_RootBss         = 1
-	_RootFinalizers  = 2
-	_RootSpans       = 3
-	_RootFlushCaches = 4
-	_RootCount       = 5
-
-	debugStackBarrier = false
 
 	// sweepMinHeapDistance is a lower bound on the heap distance
 	// (in bytes) reserved for concurrent sweeping between GC
@@ -141,18 +141,6 @@
 	sweepMinHeapDistance = 1024 * 1024
 )
 
-// firstStackBarrierOffset is the approximate byte offset at
-// which to place the first stack barrier from the current SP.
-// This is a lower bound on how much stack will have to be
-// re-scanned during mark termination. Subsequent barriers are
-// placed at firstStackBarrierOffset * 2^n offsets.
-//
-// For debugging, this can be set to 0, which will install a
-// stack barrier at every frame. If you do this, you may also
-// have to raise _StackMin, since the stack barrier
-// bookkeeping will use a large amount of each stack.
-var firstStackBarrierOffset = 1024
-
 // heapminimum is the minimum heap size at which to trigger GC.
 // For small heaps, this overrides the usual GOGC*live set rule.
 //
@@ -178,13 +166,14 @@
 		throw("size of Workbuf is suboptimal")
 	}
 
-	work.markfor = parforalloc(_MaxGcproc)
 	_ = setGCPercent(readgogc())
 	for datap := &firstmoduledata; datap != nil; datap = datap.next {
 		datap.gcdatamask = progToPointerMask((*byte)(unsafe.Pointer(datap.gcdata)), datap.edata-datap.data)
 		datap.gcbssmask = progToPointerMask((*byte)(unsafe.Pointer(datap.gcbss)), datap.ebss-datap.bss)
 	}
 	memstats.next_gc = heapminimum
+	work.startSema = 1
+	work.markDoneSema = 1
 }
 
 func readgogc() int32 {
@@ -208,6 +197,7 @@
 	memstats.enablegc = true // now that runtime is initialized, GC is okay
 }
 
+//go:linkname setGCPercent runtime/debug.setGCPercent
 func setGCPercent(in int32) (out int32) {
 	lock(&mheap_.lock)
 	out = gcpercent
@@ -216,6 +206,9 @@
 	}
 	gcpercent = in
 	heapminimum = defaultHeapMinimum * uint64(gcpercent) / 100
+	if gcController.triggerRatio > float64(gcpercent)/100 {
+		gcController.triggerRatio = float64(gcpercent) / 100
+	}
 	unlock(&mheap_.lock)
 	return out
 }
@@ -223,7 +216,15 @@
 // Garbage collector phase.
 // Indicates to the write barrier and synchronization code which tasks to perform.
 var gcphase uint32
-var writeBarrierEnabled bool // compiler emits references to this in write barriers
+
+// The compiler knows about this variable.
+// If you change it, you must change the compiler too.
+var writeBarrier struct {
+	enabled bool   // compiler emits a check of this before calling write barrier
+	needed  bool   // whether we need a write barrier for current GC phase
+	cgo     bool   // whether we need a write barrier for a cgo check
+	alignme uint64 // guarantee alignment so that compiler can use a 32 or 64-bit load
+}
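
The guard the compiler emits around pointer stores now effectively reads as
follows (hedged pseudocode; writebarrierptr is this release's barrier entry
point):

	// For a pointer store *dst = src, the compiled code behaves like:
	if writeBarrier.enabled {
		writebarrierptr(dst, src) // shade through the runtime barrier
	} else {
		*dst = src // plain store
	}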
 
 // gcBlackenEnabled is 1 if mutator assists and background mark
 // workers are allowed to blacken objects. This must only be set when
@@ -247,16 +248,15 @@
 
 const (
 	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
-	_GCstw                    // unused state
-	_GCscan                   // GC collecting roots into workbufs, write barrier ENABLED
-	_GCmark                   // GC marking from workbufs, write barrier ENABLED
+	_GCmark                   // GC marking roots and workbufs: allocate black, write barrier ENABLED
 	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 )
 
 //go:nosplit
 func setGCPhase(x uint32) {
-	atomicstore(&gcphase, x)
-	writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination || gcphase == _GCscan
+	atomic.Store(&gcphase, x)
+	writeBarrier.needed = gcphase == _GCmark || gcphase == _GCmarktermination
+	writeBarrier.enabled = writeBarrier.needed || writeBarrier.cgo
 }
 
 // gcMarkWorkerMode represents the mode that a concurrent mark worker
@@ -271,8 +271,7 @@
 const (
 	// gcMarkWorkerDedicatedMode indicates that the P of a mark
 	// worker is dedicated to running that mark worker. The mark
-	// worker should run without preemption until concurrent mark
-	// is done.
+	// worker should run without preemption.
 	gcMarkWorkerDedicatedMode gcMarkWorkerMode = iota
 
 	// gcMarkWorkerFractionalMode indicates that a P is currently
@@ -307,9 +306,10 @@
 
 type gcControllerState struct {
 	// scanWork is the total scan work performed this cycle. This
-	// is updated atomically during the cycle. Updates may be
-	// batched arbitrarily, since the value is only read at the
-	// end of the cycle.
+	// is updated atomically during the cycle. Updates occur in
+	// bounded batches, since it is both written and read
+	// throughout the cycle. At the end of the cycle, this is how
+	// much of the retained heap is scannable.
 	//
 	// Currently this is the bytes of heap scanned. For most uses,
 	// this is an opaque unit of work, but for estimation the
@@ -345,13 +345,9 @@
 	// the cycle.
 	idleMarkTime int64
 
-	// bgMarkStartTime is the absolute start time in nanoseconds
-	// that the background mark phase started.
-	bgMarkStartTime int64
-
-	// assistTime is the absolute start time in nanoseconds that
-	// mutator assists were enabled.
-	assistStartTime int64
+	// markStartTime is the absolute start time in nanoseconds
+	// that assists and background mark workers started.
+	markStartTime int64
 
 	// heapGoal is the goal memstats.heap_live for when this cycle
 	// ends. This is computed at the beginning of each cycle.
@@ -363,11 +359,14 @@
 	// dedicated mark workers get started.
 	dedicatedMarkWorkersNeeded int64
 
-	// assistRatio is the ratio of allocated bytes to scan work
-	// that should be performed by mutator assists. This is
+	// assistWorkPerByte is the ratio of scan work to allocated
+	// bytes that should be performed by mutator assists. This is
 	// computed at the beginning of each cycle and updated every
 	// time heap_scan is updated.
-	assistRatio float64
+	assistWorkPerByte float64
+
+	// assistBytesPerWork is 1/assistWorkPerByte.
+	assistBytesPerWork float64
 
 	// fractionalUtilizationGoal is the fraction of wall clock
 	// time that should be spent in the fractional mark worker.
@@ -385,7 +384,7 @@
 	// at the end of of each cycle.
 	triggerRatio float64
 
-	_ [_CacheLineSize]byte
+	_ [sys.CacheLineSize]byte
 
 	// fractionalMarkWorkersNeeded is the number of fractional
 	// mark workers that need to be started. This is either 0 or
@@ -393,7 +392,7 @@
 	// scheduling point (hence it gets its own cache line).
 	fractionalMarkWorkersNeeded int64
 
-	_ [_CacheLineSize]byte
+	_ [sys.CacheLineSize]byte
 }
 
 // startCycle resets the GC controller's state and computes estimates
@@ -420,6 +419,17 @@
 	// Compute the heap goal for this cycle
 	c.heapGoal = memstats.heap_reachable + memstats.heap_reachable*uint64(gcpercent)/100
 
+	// Ensure that the heap goal is at least a little larger than
+	// the current live heap size. This may not be the case if GC
+	// start is delayed or if the allocation that pushed heap_live
+	// over next_gc is large or if the trigger is really close to
+	// GOGC. Assist is proportional to this distance, so enforce a
+	// minimum distance, even if it means going over the GOGC goal
+	// by a tiny bit.
+	if c.heapGoal < memstats.heap_live+1024*1024 {
+		c.heapGoal = memstats.heap_live + 1024*1024
+	}
+
 	// Compute the total mark utilization goal and divide it among
 	// dedicated and fractional workers.
 	totalUtilizationGoal := float64(gomaxprocs) * gcGoalUtilization
@@ -444,7 +454,7 @@
 	c.revise()
 
 	if debug.gcpacertrace > 0 {
-		print("pacer: assist ratio=", c.assistRatio,
+		print("pacer: assist ratio=", c.assistWorkPerByte,
 			" (scan ", memstats.heap_scan>>20, " MB in ",
 			work.initialHeapLive>>20, "->",
 			c.heapGoal>>20, " MB)",
@@ -455,33 +465,59 @@
 
 // revise updates the assist ratio during the GC cycle to account for
 // improved estimates. This should be called either under STW or
-// whenever memstats.heap_scan is updated (with mheap_.lock held).
+// whenever memstats.heap_scan or memstats.heap_live is updated (with
+// mheap_.lock held).
+//
+// It should only be called when gcBlackenEnabled != 0 (because this
+// is when assists are enabled and the necessary statistics are
+// available).
+//
+// TODO: Consider removing the periodic controller update altogether.
+// Since we switched to allocating black, in theory we shouldn't have
+// to change the assist ratio. However, this is still a useful hook
+// that we've found many uses for when experimenting.
 func (c *gcControllerState) revise() {
-	// Compute the expected scan work. This is a strict upper
-	// bound on the possible scan work in the current heap.
+	// Compute the expected scan work remaining.
 	//
+	// Note that we currently count allocations during GC as both
+	// scannable heap (heap_scan) and scan work completed
+	// (scanWork), so this difference won't be changed by
+	// allocations during GC.
+	//
+	// This particular estimate is a strict upper bound on the
+	// possible remaining scan work for the current heap.
 	// You might consider dividing this by 2 (or by
 	// (100+GOGC)/100) to counter this over-estimation, but
 	// benchmarks show that this has almost no effect on mean
 	// mutator utilization, heap size, or assist time and it
 	// introduces the danger of under-estimating and letting the
 	// mutator outpace the garbage collector.
-	scanWorkExpected := memstats.heap_scan
+	scanWorkExpected := int64(memstats.heap_scan) - c.scanWork
+	if scanWorkExpected < 1000 {
+		// We set a somewhat arbitrary lower bound on
+		// remaining scan work since if we aim a little high,
+		// we can miss by a little.
+		//
+		// We *do* need to enforce that this is at least 1,
+		// since marking is racy and double-scanning objects
+		// may legitimately make the expected scan work
+		// negative.
+		scanWorkExpected = 1000
+	}
+
+	// Compute the heap distance remaining.
+	heapDistance := int64(c.heapGoal) - int64(memstats.heap_live)
+	if heapDistance <= 0 {
+		// This shouldn't happen, but if it does, avoid
+		// dividing by zero or setting the assist negative.
+		heapDistance = 1
+	}
 
 	// Compute the mutator assist ratio so that by the time the mutator
 	// allocates the remaining heap bytes up to next_gc, it will
-	// have done (or stolen) the estimated amount of scan work.
-	heapDistance := int64(c.heapGoal) - int64(work.initialHeapLive)
-	if heapDistance <= 1024*1024 {
-		// heapDistance can be negative if GC start is delayed
-		// or if the allocation that pushed heap_live over
-		// next_gc is large or if the trigger is really close
-		// to GOGC. We don't want to set the assist negative
-		// (or divide by zero, or set it really high), so
-		// enforce a minimum on the distance.
-		heapDistance = 1024 * 1024
-	}
-	c.assistRatio = float64(scanWorkExpected) / float64(heapDistance)
+	// have done (or stolen) the remaining amount of scan work.
+	c.assistWorkPerByte = float64(scanWorkExpected) / float64(heapDistance)
+	c.assistBytesPerWork = float64(heapDistance) / float64(scanWorkExpected)
 }
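
To make the two ratios concrete, here is a minimal sketch of how the assist path consumes them; the real accounting lives in mgcmark.go outside this hunk, and the function names here are hypothetical:

	// workOwed converts an allocation debt into mark scan work.
	func workOwed(c *gcControllerState, allocBytes int64) int64 {
		return int64(c.assistWorkPerByte * float64(allocBytes))
	}

	// bytesPaid converts scan work (for example, stolen background
	// credit) back into allocation bytes. Caching the reciprocal in
	// assistBytesPerWork avoids a floating-point divide here.
	func bytesPaid(c *gcControllerState, scanWork int64) int64 {
		return int64(c.assistBytesPerWork * float64(scanWork))
	}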
 
 // endCycle updates the GC controller state at the end of the
@@ -511,7 +547,7 @@
 	// technically isn't comparable to the trigger ratio.
 	goalGrowthRatio := float64(gcpercent) / 100
 	actualGrowthRatio := float64(memstats.heap_live)/float64(memstats.heap_marked) - 1
-	assistDuration := nanotime() - c.assistStartTime
+	assistDuration := nanotime() - c.markStartTime
 
 	// Assume background mark hit its utilization goal.
 	utilization := gcGoalUtilization
@@ -560,31 +596,67 @@
 	}
 }
 
+// enlistWorker encourages another dedicated mark worker to start on
+// another P if there are spare worker slots. It is used by putfull
+// when more work is made available.
+//
+//go:nowritebarrier
+func (c *gcControllerState) enlistWorker() {
+	if c.dedicatedMarkWorkersNeeded <= 0 {
+		return
+	}
+	// Pick a random other P to preempt.
+	if gomaxprocs <= 1 {
+		return
+	}
+	gp := getg()
+	if gp == nil || gp.m == nil || gp.m.p == 0 {
+		return
+	}
+	myID := gp.m.p.ptr().id
+	for tries := 0; tries < 5; tries++ {
+		id := int32(fastrand1() % uint32(gomaxprocs-1))
+		if id >= myID {
+			id++
+		}
+		p := allp[id]
+		if p.status != _Prunning {
+			continue
+		}
+		if preemptone(p) {
+			return
+		}
+	}
+}
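
The victim selection in enlistWorker draws from gomaxprocs-1 values and bumps any id at or above its own, which excludes the current P while keeping the choice uniform. A standalone sketch of the trick, with rnd standing in for fastrand1:

	// randomOtherP picks a P id uniformly from all ids except myID.
	func randomOtherP(myID, nprocs int32, rnd func() uint32) int32 {
		id := int32(rnd() % uint32(nprocs-1)) // uniform over 0..nprocs-2
		if id >= myID {
			id++ // shift past our own id
		}
		return id
	}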
+
 // findRunnableGCWorker returns the background mark worker for _p_ if it
 // should be run. This must only be called when gcBlackenEnabled != 0.
 func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 	if gcBlackenEnabled == 0 {
 		throw("gcControllerState.findRunnable: blackening not enabled")
 	}
-	if _p_.gcBgMarkWorker == nil {
-		throw("gcControllerState.findRunnable: no background mark worker")
+	if _p_.gcBgMarkWorker == 0 {
+		// The mark worker associated with this P is blocked
+		// performing a mark transition. We can't run it
+		// because it may be on some other run or wait queue.
+		return nil
 	}
-	if work.bgMark1.done != 0 && work.bgMark2.done != 0 {
-		// Background mark is done. Don't schedule background
-		// mark worker any more. (This is not just an
-		// optimization. Without this we can spin scheduling
-		// the background worker and having it return
-		// immediately with no work to do.)
+
+	if !gcMarkWorkAvailable(_p_) {
+		// No work to be done right now. This can happen at
+		// the end of the mark phase when there are still
+		// assists tapering off. Don't bother running a worker
+		// now because it'll just return immediately.
 		return nil
 	}
 
 	decIfPositive := func(ptr *int64) bool {
 		if *ptr > 0 {
-			if xaddint64(ptr, -1) >= 0 {
+			if atomic.Xaddint64(ptr, -1) >= 0 {
 				return true
 			}
 			// We lost a race
-			xaddint64(ptr, +1)
+			atomic.Xaddint64(ptr, +1)
 		}
 		return false
 	}
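
decIfPositive decrements optimistically with an atomic add and rolls back on a lost race, so the counter may dip below zero transiently. A CAS loop is the more familiar equivalent; a sketch using sync/atomic, since the runtime-internal atomic package is not importable:

	import "sync/atomic"

	func decIfPositive(ptr *int64) bool {
		for {
			v := atomic.LoadInt64(ptr)
			if v <= 0 {
				return false
			}
			if atomic.CompareAndSwapInt64(ptr, v, v-1) {
				return true
			}
		}
	}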
@@ -597,36 +669,6 @@
 		// else for a while, so kick everything out of its run
 		// queue.
 	} else {
-		if _p_.gcw.wbuf == 0 && work.full == 0 && work.partial == 0 {
-			// No work to be done right now. This can
-			// happen at the end of the mark phase when
-			// there are still assists tapering off. Don't
-			// bother running background mark because
-			// it'll just return immediately.
-			if work.nwait == work.nproc {
-				// There are also no workers, which
-				// means we've reached a completion point.
-				// There may not be any workers to
-				// signal it, so signal it here.
-				readied := false
-				if gcBlackenPromptly {
-					if work.bgMark1.done == 0 {
-						throw("completing mark 2, but bgMark1.done == 0")
-					}
-					readied = work.bgMark2.complete()
-				} else {
-					readied = work.bgMark1.complete()
-				}
-				if readied {
-					// complete just called ready,
-					// but we're inside the
-					// scheduler. Let it know that
-					// that's okay.
-					resetspinning()
-				}
-			}
-			return nil
-		}
 		if !decIfPositive(&c.fractionalMarkWorkersNeeded) {
 			// No more workers are needed right now.
 			return nil
@@ -663,19 +705,19 @@
 		// TODO(austin): Shorter preemption interval for mark
 		// worker to improve fairness and give this
 		// finer-grained control over schedule?
-		now := nanotime() - gcController.bgMarkStartTime
+		now := nanotime() - gcController.markStartTime
 		then := now + gcForcePreemptNS
 		timeUsed := c.fractionalMarkTime + gcForcePreemptNS
 		if then > 0 && float64(timeUsed)/float64(then) > c.fractionalUtilizationGoal {
 			// Nope, we'd overshoot the utilization goal
-			xaddint64(&c.fractionalMarkWorkersNeeded, +1)
+			atomic.Xaddint64(&c.fractionalMarkWorkersNeeded, +1)
 			return nil
 		}
 		_p_.gcMarkWorkerMode = gcMarkWorkerFractionalMode
 	}
 
 	// Run the background mark worker
-	gp := _p_.gcBgMarkWorker
+	gp := _p_.gcBgMarkWorker.ptr()
 	casgstatus(gp, _Gwaiting, _Grunnable)
 	if trace.enabled {
 		traceGoUnpark(gp, 0)
@@ -687,101 +729,74 @@
 // marking as a fraction of GOMAXPROCS.
 const gcGoalUtilization = 0.25
 
-// gcBgCreditSlack is the amount of scan work credit background
-// scanning can accumulate locally before updating
-// gcController.bgScanCredit. Lower values give mutator assists more
-// accurate accounting of background scanning. Higher values reduce
-// memory contention.
-const gcBgCreditSlack = 2000
+// gcCreditSlack is the amount of scan work credit that can
+// accumulate locally before updating gcController.scanWork and,
+// optionally, gcController.bgScanCredit. Lower values give a more
+// accurate assist ratio and make it more likely that assists will
+// successfully steal background credit. Higher values reduce memory
+// contention.
+const gcCreditSlack = 2000
 
 // gcAssistTimeSlack is the nanoseconds of mutator assist time that
 // can accumulate on a P before updating gcController.assistTime.
 const gcAssistTimeSlack = 5000
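
Both slack constants apply the same batching idea: accumulate updates locally and publish only when the batch reaches the slack, trading accuracy for less contention on the shared word. A sketch with a hypothetical helper type:

	// creditBatcher flushes locally accumulated scan work to a
	// shared counter in batches of at least gcCreditSlack.
	type creditBatcher struct {
		local  int64
		shared *int64 // e.g. &gcController.scanWork
	}

	func (b *creditBatcher) add(n int64) {
		b.local += n
		if b.local >= gcCreditSlack {
			atomic.AddInt64(b.shared, b.local) // sync/atomic form of Xaddint64
			b.local = 0
		}
	}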
 
-// Determine whether to initiate a GC.
-// If the GC is already working no need to trigger another one.
-// This should establish a feedback loop where if the GC does not
-// have sufficient time to complete then more memory will be
-// requested from the OS increasing heap size thus allow future
-// GCs more time to complete.
-// memstat.heap_live read has a benign race.
-// A false negative simple does not start a GC, a false positive
-// will start a GC needlessly. Neither have correctness issues.
-func shouldtriggergc() bool {
-	return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
-}
-
-// bgMarkSignal synchronizes the GC coordinator and background mark workers.
-type bgMarkSignal struct {
-	// Workers race to cas to 1. Winner signals coordinator.
-	done uint32
-	// Coordinator to wake up.
-	lock mutex
-	g    *g
-	wake bool
-}
-
-func (s *bgMarkSignal) wait() {
-	lock(&s.lock)
-	if s.wake {
-		// Wakeup already happened
-		unlock(&s.lock)
-	} else {
-		s.g = getg()
-		goparkunlock(&s.lock, "mark wait (idle)", traceEvGoBlock, 1)
-	}
-	s.wake = false
-	s.g = nil
-}
-
-// complete signals the completion of this phase of marking. This can
-// be called multiple times during a cycle; only the first call has
-// any effect.
-//
-// The caller should arrange to deschedule itself as soon as possible
-// after calling complete in order to let the coordinator goroutine
-// run.
-func (s *bgMarkSignal) complete() bool {
-	if cas(&s.done, 0, 1) {
-		// This is the first worker to reach this completion point.
-		// Signal the main GC goroutine.
-		lock(&s.lock)
-		if s.g == nil {
-			// It hasn't parked yet.
-			s.wake = true
-		} else {
-			ready(s.g, 0)
-		}
-		unlock(&s.lock)
-		return true
-	}
-	return false
-}
-
-func (s *bgMarkSignal) clear() {
-	s.done = 0
-}
+// gcOverAssistBytes determines how many extra allocation bytes of
+// assist credit a GC assist builds up when an assist happens. This
+// amortizes the cost of an assist by pre-paying for this many bytes
+// of future allocations.
+const gcOverAssistBytes = 1 << 20
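
In other words, an assist that has paid off its debt banks a surplus so that small subsequent allocations do not each re-enter the assist path. A sketch of the bookkeeping, assuming the gcAssistBytes credit field this patch adds to g (see gcResetMarkState below):

	// bankOverAssist pre-pays for the next gcOverAssistBytes of
	// allocation after an assist satisfies its debt.
	func bankOverAssist(gp *g) {
		gp.gcAssistBytes += gcOverAssistBytes // 1 MB of future allocations
	}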
 
 var work struct {
-	full  uint64 // lock-free list of full blocks workbuf
-	empty uint64 // lock-free list of empty blocks workbuf
-	// TODO(rlh): partial no longer used, remove. (issue #11922)
-	partial uint64                // lock-free list of partially filled blocks workbuf
-	pad0    [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+	full  uint64                   // lock-free list of full blocks workbuf
+	empty uint64                   // lock-free list of empty blocks workbuf
+	pad0  [sys.CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+
+	markrootNext uint32 // next markroot job
+	markrootJobs uint32 // number of markroot jobs
+
 	nproc   uint32
 	tstart  int64
 	nwait   uint32
 	ndone   uint32
 	alldone note
-	markfor *parfor
+
+	// Number of roots of various root types. Set by gcMarkRootPrepare.
+	nDataRoots, nBSSRoots, nSpanRoots, nStackRoots, nRescanRoots int
+
+	// markrootDone indicates that roots have been marked at least
+	// once during the current GC cycle. This is checked by root
+	// marking operations that have to happen only during the
+	// first root marking pass, whether that's during the
+	// concurrent mark phase of a concurrent GC or during mark
+	// termination in an STW GC.
+	markrootDone bool
+
+	// Each type of GC state transition is protected by a lock.
+	// Since multiple threads can simultaneously detect the state
+	// transition condition, any thread that detects a transition
+	// condition must acquire the appropriate transition lock,
+	// re-check the transition condition and return if it no
+	// longer holds or perform the transition if it does.
+	// Likewise, any transition must invalidate the transition
+	// condition before releasing the lock. This ensures that each
+	// transition is performed by exactly one thread and threads
+	// that need the transition to happen block until it has
+	// happened. (A sketch of this pattern follows this struct.)
+	//
+	// startSema protects the transition from "off" to mark or
+	// mark termination.
+	startSema uint32
+	// markDoneSema protects transitions from mark 1 to mark 2 and
+	// from mark 2 to mark termination.
+	markDoneSema uint32
 
 	bgMarkReady note   // signal background mark worker has started
 	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
 	// Background mark completion signaling
 
-	// Coordination for the 2 parts of the mark phase.
-	bgMark1 bgMarkSignal
-	bgMark2 bgMarkSignal
+	// mode is the concurrency mode of the current GC cycle.
+	mode gcMode
 
 	// Copy of mheap.allspans for marker or sweeper.
 	spans []*mspan
@@ -806,115 +821,80 @@
 	// initialHeapLive is the value of memstats.heap_live at the
 	// beginning of this GC cycle.
 	initialHeapLive uint64
+
+	// assistQueue is a queue of assists that are blocked because
+	// there was neither enough credit to steal nor enough work to
+	// do.
+	assistQueue struct {
+		lock       mutex
+		head, tail guintptr
+	}
+
+	// rescan is a list of G's that need to be rescanned during
+	// mark termination. A G adds itself to this list when it
+	// first invalidates its stack scan.
+	rescan struct {
+		lock mutex
+		list []guintptr
+	}
+
+	// Timing/utilization stats for this cycle.
+	stwprocs, maxprocs                 int32
+	tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
+
+	pauseNS    int64 // total STW time this cycle
+	pauseStart int64 // nanotime() of last STW
+
+	// debug.gctrace heap sizes for this cycle.
+	heap0, heap1, heap2, heapGoal uint64
 }
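
The transition-lock discipline described in the struct above (acquire, re-check, perform and invalidate, release) is the shape both gcStart and gcMarkDone follow below. A generic sketch:

	// transition runs perform exactly once per occurrence of the
	// condition, no matter how many threads detect it. perform must
	// invalidate the condition before the lock is released.
	func transition(sema *uint32, condition func() bool, perform func()) {
		semacquire(sema, false)
		if !condition() {
			// Another thread already performed this transition.
			semrelease(sema)
			return
		}
		perform()
		semrelease(sema)
	}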
 
 // GC runs a garbage collection and blocks the caller until the
 // garbage collection is complete. It may also block the entire
 // program.
 func GC() {
-	startGC(gcForceBlockMode, false)
+	gcStart(gcForceBlockMode, false)
 }
 
+// gcMode indicates how concurrent a GC cycle should be.
+type gcMode int
+
 const (
-	gcBackgroundMode = iota // concurrent GC
-	gcForceMode             // stop-the-world GC now
-	gcForceBlockMode        // stop-the-world GC now and wait for sweep
+	gcBackgroundMode gcMode = iota // concurrent GC and sweep
+	gcForceMode                    // stop-the-world GC now, concurrent sweep
+	gcForceBlockMode               // stop-the-world GC now and STW sweep
 )
 
-// startGC starts a GC cycle. If mode is gcBackgroundMode, this will
-// start GC in the background and return. Otherwise, this will block
-// until the new GC cycle is started and finishes. If forceTrigger is
-// true, it indicates that GC should be started regardless of the
-// current heap size.
-func startGC(mode int, forceTrigger bool) {
-	// The gc is turned off (via enablegc) until the bootstrap has completed.
-	// Also, malloc gets called in the guts of a number of libraries that might be
-	// holding locks. To avoid deadlocks during stop-the-world, don't bother
-	// trying to run gc while holding a lock. The next mallocgc without a lock
-	// will do the gc instead.
+// gcShouldStart returns true if the exit condition for the _GCoff
+// phase has been met. The exit condition should be tested when
+// allocating.
+//
+// If forceTrigger is true, it ignores the current heap size, but
+// checks all other conditions. In general this should be false.
+func gcShouldStart(forceTrigger bool) bool {
+	return gcphase == _GCoff && (forceTrigger || memstats.heap_live >= memstats.next_gc) && memstats.enablegc && panicking == 0 && gcpercent >= 0
+}
+
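
The intended caller is the allocation slow path, which is outside this hunk; the shape is assumed to be a racy test followed by gcStart's re-check under work.startSema:

	// In mallocgc's slow path (assumed):
	if gcShouldStart(false) {
		gcStart(gcBackgroundMode, false) // re-checks the condition under startSema
	}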
+// gcStart transitions the GC from _GCoff to _GCmark (if mode ==
+// gcBackgroundMode) or _GCmarktermination (if mode !=
+// gcBackgroundMode) by performing sweep termination and GC
+// initialization.
+//
+// This may return without performing this transition in some cases,
+// such as when called on a system stack or with locks held.
+func gcStart(mode gcMode, forceTrigger bool) {
+	// Since this is called from malloc and malloc is called in
+	// the guts of a number of libraries that might be holding
+	// locks, don't attempt to start GC in non-preemptible or
+	// potentially unstable situations.
 	mp := acquirem()
-	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" || !memstats.enablegc || panicking != 0 || gcpercent < 0 {
+	if gp := getg(); gp == mp.g0 || mp.locks > 1 || mp.preemptoff != "" {
 		releasem(mp)
 		return
 	}
 	releasem(mp)
 	mp = nil
 
-	if debug.gcstoptheworld == 1 {
-		mode = gcForceMode
-	} else if debug.gcstoptheworld == 2 {
-		mode = gcForceBlockMode
-	}
-
-	if mode != gcBackgroundMode {
-		// special synchronous cases
-		gc(mode)
-		return
-	}
-
-	// trigger concurrent GC
-	readied := false
-	lock(&bggc.lock)
-	// The trigger was originally checked speculatively, so
-	// recheck that this really should trigger GC. (For example,
-	// we may have gone through a whole GC cycle since the
-	// speculative check.)
-	if !(forceTrigger || shouldtriggergc()) {
-		unlock(&bggc.lock)
-		return
-	}
-	if !bggc.started {
-		bggc.working = 1
-		bggc.started = true
-		readied = true
-		go backgroundgc()
-	} else if bggc.working == 0 {
-		bggc.working = 1
-		readied = true
-		ready(bggc.g, 0)
-	}
-	unlock(&bggc.lock)
-	if readied {
-		// This G just started or ready()d the GC goroutine.
-		// Switch directly to it by yielding.
-		Gosched()
-	}
-}
-
-// State of the background concurrent GC goroutine.
-var bggc struct {
-	lock    mutex
-	g       *g
-	working uint
-	started bool
-}
-
-// backgroundgc is running in a goroutine and does the concurrent GC work.
-// bggc holds the state of the backgroundgc.
-func backgroundgc() {
-	bggc.g = getg()
-	for {
-		gc(gcBackgroundMode)
-		lock(&bggc.lock)
-		bggc.working = 0
-		goparkunlock(&bggc.lock, "Concurrent GC wait", traceEvGoBlock, 1)
-	}
-}
-
-func gc(mode int) {
-	// Timing/utilization tracking
-	var stwprocs, maxprocs int32
-	var tSweepTerm, tScan, tInstallWB, tMark, tMarkTerm int64
-
-	// debug.gctrace variables
-	var heap0, heap1, heap2, heapGoal uint64
-
-	// memstats statistics
-	var now, pauseStart, pauseNS int64
-
-	// Ok, we're doing it!  Stop everybody else
-	semacquire(&worldsema, false)
-
 	// Pick up the remaining unswept/not being swept spans concurrently
 	//
 	// This shouldn't happen if we're being invoked in background
@@ -922,10 +902,45 @@
 	// sweeping everything, but rounding errors, etc, may leave a
 	// few spans unswept. In forced mode, this is necessary since
 	// GC can be forced at any point in the sweeping cycle.
-	for gosweepone() != ^uintptr(0) {
+	//
+	// We check the transition condition continuously here in case
+	// this G gets delayed into the next GC cycle.
+	for (mode != gcBackgroundMode || gcShouldStart(forceTrigger)) && gosweepone() != ^uintptr(0) {
 		sweep.nbgsweep++
 	}
 
+	// Perform GC initialization and the sweep termination
+	// transition.
+	//
+	// If this is a forced GC, don't acquire the transition lock
+	// or re-check the transition condition because we
+	// specifically *don't* want to share the transition with
+	// another thread.
+	useStartSema := mode == gcBackgroundMode
+	if useStartSema {
+		semacquire(&work.startSema, false)
+		// Re-check transition condition under transition lock.
+		if !gcShouldStart(forceTrigger) {
+			semrelease(&work.startSema)
+			return
+		}
+	}
+
+	// In gcstoptheworld debug mode, upgrade the mode accordingly.
+	// We do this after re-checking the transition condition so
+	// that multiple goroutines that detect the heap trigger don't
+	// start multiple STW GCs.
+	if mode == gcBackgroundMode {
+		if debug.gcstoptheworld == 1 {
+			mode = gcForceMode
+		} else if debug.gcstoptheworld == 2 {
+			mode = gcForceBlockMode
+		}
+	}
+
+	// Ok, we're doing it! Stop everybody else
+	semacquire(&worldsema, false)
+
 	if trace.enabled {
 		traceGCStart()
 	}
@@ -933,143 +948,218 @@
 	if mode == gcBackgroundMode {
 		gcBgMarkStartWorkers()
 	}
-	now = nanotime()
-	stwprocs, maxprocs = gcprocs(), gomaxprocs
-	tSweepTerm = now
-	heap0 = memstats.heap_live
 
-	pauseStart = now
+	gcResetMarkState()
+
+	now := nanotime()
+	work.stwprocs, work.maxprocs = gcprocs(), gomaxprocs
+	work.tSweepTerm = now
+	work.heap0 = memstats.heap_live
+	work.pauseNS = 0
+	work.mode = mode
+
+	work.pauseStart = now
 	systemstack(stopTheWorldWithSema)
-	systemstack(finishsweep_m) // finish sweep before we start concurrent scan.
+	// Finish sweep before we start concurrent scan.
+	systemstack(func() {
+		finishsweep_m(true)
+	})
 	// clearpools before we start the GC. If we wait, the memory will not be
 	// reclaimed until the next GC cycle.
 	clearpools()
 
-	gcResetMarkState()
-
 	if mode == gcBackgroundMode { // Do as much work concurrently as possible
 		gcController.startCycle()
-		heapGoal = gcController.heapGoal
+		work.heapGoal = gcController.heapGoal
 
-		systemstack(func() {
-			// Enter scan phase. This enables write
-			// barriers to track changes to stack frames
-			// above the stack barrier.
-			//
-			// TODO: This has evolved to the point where
-			// we carefully ensure invariants we no longer
-			// depend on. Either:
-			//
-			// 1) Enable full write barriers for the scan,
-			// but eliminate the ragged barrier below
-			// (since the start the world ensures all Ps
-			// have observed the write barrier enable) and
-			// consider draining during the scan.
-			//
-			// 2) Only enable write barriers for writes to
-			// the stack at this point, and then enable
-			// write barriers for heap writes when we
-			// enter the mark phase. This means we cannot
-			// drain in the scan phase and must perform a
-			// ragged barrier to ensure all Ps have
-			// enabled heap write barriers before we drain
-			// or enable assists.
-			//
-			// 3) Don't install stack barriers over frame
-			// boundaries where there are up-pointers.
-			setGCPhase(_GCscan)
+		// Enter concurrent mark phase and enable
+		// write barriers.
+		//
+		// Because the world is stopped, all Ps will
+		// observe that write barriers are enabled by
+		// the time we start the world and begin
+		// scanning.
+		//
+		// It's necessary to enable write barriers
+		// during the scan phase for several reasons:
+		//
+		// They must be enabled for writes to higher
+		// stack frames before we scan stacks and
+		// install stack barriers because this is how
+		// we track writes to inactive stack frames.
+		// (Alternatively, we could not install stack
+		// barriers over frame boundaries with
+		// up-pointers).
+		//
+		// They must be enabled before assists are
+		// enabled because they must be enabled before
+		// any non-leaf heap objects are marked. Since
+		// allocations are blocked until assists can
+		// happen, we want to enable assists as early as
+		// possible.
+		setGCPhase(_GCmark)
 
-			gcBgMarkPrepare() // Must happen before assist enable.
+		// markrootSpans uses work.spans, so make sure
+		// it is up to date.
+		gcCopySpans()
 
-			// At this point all Ps have enabled the write
-			// barrier, thus maintaining the no white to
-			// black invariant. Enable mutator assists to
-			// put back-pressure on fast allocating
-			// mutators.
-			atomicstore(&gcBlackenEnabled, 1)
+		gcBgMarkPrepare() // Must happen before assist enable.
+		gcMarkRootPrepare()
 
-			// Concurrent scan.
-			startTheWorldWithSema()
-			now = nanotime()
-			pauseNS += now - pauseStart
-			tScan = now
-			gcController.assistStartTime = now
-			gcscan_m()
+		// At this point all Ps have enabled the write
+		// barrier, thus maintaining the no white to
+		// black invariant. Enable mutator assists to
+		// put back-pressure on fast allocating
+		// mutators.
+		atomic.Store(&gcBlackenEnabled, 1)
 
-			// Enter mark phase.
-			tInstallWB = nanotime()
-			setGCPhase(_GCmark)
-			// Ensure all Ps have observed the phase
-			// change and have write barriers enabled
-			// before any blackening occurs.
-			forEachP(func(*p) {})
-		})
+		// Assists and workers can start the moment we start
+		// the world.
+		gcController.markStartTime = now
+
 		// Concurrent mark.
-		tMark = nanotime()
+		systemstack(startTheWorldWithSema)
+		now = nanotime()
+		work.pauseNS += now - work.pauseStart
+		work.tMark = now
+	} else {
+		t := nanotime()
+		work.tMark, work.tMarkTerm = t, t
+		work.heapGoal = work.heap0
 
-		// Enable background mark workers and wait for
-		// background mark completion.
-		gcController.bgMarkStartTime = nanotime()
-		work.bgMark1.clear()
-		work.bgMark1.wait()
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
+	}
 
+	if useStartSema {
+		semrelease(&work.startSema)
+	}
+}
+
+// gcMarkDone transitions the GC from mark 1 to mark 2 and from mark 2
+// to mark termination.
+//
+// This should be called when all mark work has been drained. In mark
+// 1, this includes all root marking jobs, global work buffers, and
+// active work buffers in assists and background workers; however,
+// work may still be cached in per-P work buffers. In mark 2, per-P
+// caches are disabled.
+//
+// The calling context must be preemptible.
+//
+// Note that it is explicitly okay to have write barriers in this
+// function because completion of concurrent mark is best-effort
+// anyway. Any work created by write barriers here will be cleaned up
+// by mark termination.
+func gcMarkDone() {
+top:
+	semacquire(&work.markDoneSema, false)
+
+	// Re-check transition condition under transition lock.
+	if !(gcphase == _GCmark && work.nwait == work.nproc && !gcMarkWorkAvailable(nil)) {
+		semrelease(&work.markDoneSema)
+		return
+	}
+
+	// Disallow starting new workers so that any remaining workers
+	// in the current mark phase will drain out.
+	//
+	// TODO(austin): Should dedicated workers keep an eye on this
+	// and exit gcDrain promptly?
+	atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, -0xffffffff)
+	atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, -0xffffffff)
+
+	if !gcBlackenPromptly {
+		// Transition from mark 1 to mark 2.
+		//
 		// The global work list is empty, but there can still be work
 		// sitting in the per-P work caches and there can be more
 		// objects reachable from global roots since they don't have write
 		// barriers. Rescan some roots and flush work caches.
+
+		gcMarkRootCheck()
+
+		// Disallow caching workbufs and indicate that we're in mark 2.
+		gcBlackenPromptly = true
+
+		// Prevent completion of mark 2 until we've flushed
+		// cached workbufs.
+		atomic.Xadd(&work.nwait, -1)
+
+		// GC is set up for mark 2. Let Gs blocked on the
+		// transition lock go while we flush caches.
+		semrelease(&work.markDoneSema)
+
 		systemstack(func() {
-			// rescan global data and bss.
-			markroot(nil, _RootData)
-			markroot(nil, _RootBss)
-
-			// Disallow caching workbufs.
-			gcBlackenPromptly = true
-
-			// Flush all currently cached workbufs. This
-			// also forces any remaining background
-			// workers out of their loop.
+			// Flush all currently cached workbufs and
+			// ensure all Ps see gcBlackenPromptly. This
+			// also blocks until any remaining mark 1
+			// workers have exited their loop so we can
+			// start new mark 2 workers that will observe
+			// the new root marking jobs.
 			forEachP(func(_p_ *p) {
 				_p_.gcw.dispose()
 			})
 		})
 
-		// Wait for this more aggressive background mark to complete.
-		work.bgMark2.clear()
-		work.bgMark2.wait()
+		// Now we can start up mark 2 workers.
+		atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 0xffffffff)
+		atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 0xffffffff)
 
-		// Begin mark termination.
-		now = nanotime()
-		tMarkTerm = now
-		pauseStart = now
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// This loop will make progress because
+			// gcBlackenPromptly is now true, so it won't
+			// take this same "if" branch.
+			goto top
+		}
+	} else {
+		// Transition to mark termination.
+		now := nanotime()
+		work.tMarkTerm = now
+		work.pauseStart = now
+		getg().m.preemptoff = "gcing"
 		systemstack(stopTheWorldWithSema)
 		// The gcphase is _GCmark; it will transition to _GCmarktermination
 		// below. The important thing is that the wb remains active until
 		// all marking is complete. This includes writes made by the GC.
 
+		// Record that one root marking pass has completed.
+		work.markrootDone = true
+
+		// Disable assists and background workers. We must do
+		// this before waking blocked assists.
+		atomic.Store(&gcBlackenEnabled, 0)
+
 		// Flush the gcWork caches. This must be done before
 		// endCycle since endCycle depends on statistics kept
 		// in these caches.
 		gcFlushGCWork()
 
+		// Wake all blocked assists. These will run when we
+		// start the world again.
+		gcWakeAllAssists()
+
+		// Likewise, release the transition lock. Blocked
+		// workers and assists will run when we start the
+		// world again.
+		semrelease(&work.markDoneSema)
+
 		gcController.endCycle()
-	} else {
-		// For non-concurrent GC (mode != gcBackgroundMode)
-		// The g stacks have not been scanned so clear g state
-		// such that mark termination scans all stacks.
-		gcResetGState()
 
-		t := nanotime()
-		tScan, tInstallWB, tMark, tMarkTerm = t, t, t, t
-		heapGoal = heap0
+		// Perform mark termination. This will restart the world.
+		gcMarkTermination()
 	}
+}
 
+func gcMarkTermination() {
 	// World is stopped.
 	// Start marktermination which includes enabling the write barrier.
-	atomicstore(&gcBlackenEnabled, 0)
+	atomic.Store(&gcBlackenEnabled, 0)
 	gcBlackenPromptly = false
 	setGCPhase(_GCmarktermination)
 
-	heap1 = memstats.heap_live
+	work.heap1 = memstats.heap_live
 	startTime := nanotime()
 
 	mp := acquirem()
@@ -1080,8 +1170,8 @@
 	casgstatus(gp, _Grunning, _Gwaiting)
 	gp.waitreason = "garbage collection"
 
-	// Run gc on the g0 stack.  We do this so that the g stack
-	// we're currently running on will no longer change.  Cuts
+	// Run gc on the g0 stack. We do this so that the g stack
+	// we're currently running on will no longer change. Cuts
 	// the root set down a bit (g0 stacks are not scanned, and
 	// we don't need to scan gc's internal state). We also
 	// need to switch to g0 so we can shrink the stack.
@@ -1097,12 +1187,11 @@
 	})
 
 	systemstack(func() {
-		heap2 = work.bytesMarked
+		work.heap2 = work.bytesMarked
 		if debug.gccheckmark > 0 {
 			// Run a full stop-the-world mark using checkmark bits,
 			// to check that we didn't forget to mark anything during
 			// the concurrent mark process.
-			gcResetGState() // Rescan stacks
 			gcResetMarkState()
 			initCheckmarks()
 			gcMark(startTime)
@@ -1111,16 +1200,15 @@
 
 		// marking is complete so we can turn the write barrier off
 		setGCPhase(_GCoff)
-		gcSweep(mode)
+		gcSweep(work.mode)
 
 		if debug.gctrace > 1 {
 			startTime = nanotime()
 			// The g stacks have been scanned so
 			// they have gcscanvalid==true and gcworkdone==true.
 			// Reset these so that all stacks will be rescanned.
-			gcResetGState()
 			gcResetMarkState()
-			finishsweep_m()
+			finishsweep_m(true)
 
 			// Still in STW but gcphase is _GCoff, reset to _GCmarktermination
 			// At this point all objects will be found during the gcMark which
@@ -1128,7 +1216,7 @@
 			setGCPhase(_GCmarktermination)
 			gcMark(startTime)
 			setGCPhase(_GCoff) // marking is done, turn off wb.
-			gcSweep(mode)
+			gcSweep(work.mode)
 		}
 	})
 
@@ -1148,21 +1236,20 @@
 
 	// Update timing memstats
 	now, unixNow := nanotime(), unixnanotime()
-	pauseNS += now - pauseStart
-	atomicstore64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
-	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(pauseNS)
+	work.pauseNS += now - work.pauseStart
+	work.tEnd = now
+	atomic.Store64(&memstats.last_gc, uint64(unixNow)) // must be Unix time to make sense to user
+	memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
 	memstats.pause_end[memstats.numgc%uint32(len(memstats.pause_end))] = uint64(unixNow)
-	memstats.pause_total_ns += uint64(pauseNS)
+	memstats.pause_total_ns += uint64(work.pauseNS)
 
 	// Update work.totaltime.
-	sweepTermCpu := int64(stwprocs) * (tScan - tSweepTerm)
-	scanCpu := tInstallWB - tScan
-	installWBCpu := int64(0)
+	sweepTermCpu := int64(work.stwprocs) * (work.tMark - work.tSweepTerm)
 	// We report idle marking time below, but omit it from the
 	// overall utilization here since it's "free".
 	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
-	markTermCpu := int64(stwprocs) * (now - tMarkTerm)
-	cycleCpu := sweepTermCpu + scanCpu + installWBCpu + markCpu + markTermCpu
+	markTermCpu := int64(work.stwprocs) * (work.tEnd - work.tMarkTerm)
+	cycleCpu := sweepTermCpu + markCpu + markTermCpu
 	work.totaltime += cycleCpu
 
 	// Compute overall GC CPU utilization.
@@ -1171,23 +1258,35 @@
 
 	memstats.numgc++
 
+	// Reset sweep state.
+	sweep.nbgsweep = 0
+	sweep.npausesweep = 0
+
 	systemstack(startTheWorldWithSema)
-	semrelease(&worldsema)
 
-	releasem(mp)
-	mp = nil
+	// Free stack spans. This must be done between GC cycles.
+	systemstack(freeStackSpans)
 
+	// Best-effort remove stack barriers so they don't get in the
+	// way of things like GDB and perf.
+	lock(&allglock)
+	myallgs := allgs
+	unlock(&allglock)
+	gcTryRemoveAllStackBarriers(myallgs)
+
+	// Print gctrace before dropping worldsema. As soon as we drop
+	// worldsema another cycle could start and smash the stats
+	// we're trying to print.
 	if debug.gctrace > 0 {
-		tEnd := now
 		util := int(memstats.gc_cpu_fraction * 100)
 
 		var sbuf [24]byte
 		printlock()
 		print("gc ", memstats.numgc,
-			" @", string(itoaDiv(sbuf[:], uint64(tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
+			" @", string(itoaDiv(sbuf[:], uint64(work.tSweepTerm-runtimeInitTime)/1e6, 3)), "s ",
 			util, "%: ")
-		prev := tSweepTerm
-		for i, ns := range []int64{tScan, tInstallWB, tMark, tMarkTerm, tEnd} {
+		prev := work.tSweepTerm
+		for i, ns := range []int64{work.tMark, work.tMarkTerm, work.tEnd} {
 			if i != 0 {
 				print("+")
 			}
@@ -1195,8 +1294,8 @@
 			prev = ns
 		}
 		print(" ms clock, ")
-		for i, ns := range []int64{sweepTermCpu, scanCpu, installWBCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
-			if i == 4 || i == 5 {
+		for i, ns := range []int64{sweepTermCpu, gcController.assistTime, gcController.dedicatedMarkTime + gcController.fractionalMarkTime, gcController.idleMarkTime, markTermCpu} {
+			if i == 2 || i == 3 {
 				// Separate mark time components with /.
 				print("/")
 			} else if i != 0 {
@@ -1205,17 +1304,21 @@
 			print(string(fmtNSAsMS(sbuf[:], uint64(ns))))
 		}
 		print(" ms cpu, ",
-			heap0>>20, "->", heap1>>20, "->", heap2>>20, " MB, ",
-			heapGoal>>20, " MB goal, ",
-			maxprocs, " P")
-		if mode != gcBackgroundMode {
+			work.heap0>>20, "->", work.heap1>>20, "->", work.heap2>>20, " MB, ",
+			work.heapGoal>>20, " MB goal, ",
+			work.maxprocs, " P")
+		if work.mode != gcBackgroundMode {
 			print(" (forced)")
 		}
 		print("\n")
 		printunlock()
 	}
-	sweep.nbgsweep = 0
-	sweep.npausesweep = 0
+
+	semrelease(&worldsema)
+	// Careful: another GC cycle may start now.
+
+	releasem(mp)
+	mp = nil
 
 	// now that gc is done, kick off finalizer thread if needed
 	if !concurrentSweep {
@@ -1235,7 +1338,7 @@
 		if p == nil || p.status == _Pdead {
 			break
 		}
-		if p.gcBgMarkWorker == nil {
+		if p.gcBgMarkWorker == 0 {
 			go gcBgMarkWorker(p)
 			notetsleepg(&work.bgMarkReady, -1)
 			noteclear(&work.bgMarkReady)
@@ -1257,21 +1360,25 @@
 	// there are no workers.
 	work.nproc = ^uint32(0)
 	work.nwait = ^uint32(0)
-
-	// Reset background mark completion points.
-	work.bgMark1.done = 1
-	work.bgMark2.done = 1
 }
 
-func gcBgMarkWorker(p *p) {
-	// Register this G as the background mark worker for p.
-	if p.gcBgMarkWorker != nil {
-		throw("P already has a background mark worker")
-	}
+func gcBgMarkWorker(_p_ *p) {
 	gp := getg()
 
-	mp := acquirem()
-	p.gcBgMarkWorker = gp
+	type parkInfo struct {
+		m      muintptr // Release this m on park.
+		attach puintptr // If non-nil, attach to this p on park.
+	}
+	// We pass park to a gopark unlock function, so it can't be on
+	// the stack (see gopark). Prevent deadlock from recursively
+	// starting GC by disabling preemption.
+	gp.m.preemptoff = "GC worker init"
+	park := new(parkInfo)
+	gp.m.preemptoff = ""
+
+	park.m.set(acquirem())
+	park.attach.set(_p_)
+	// Inform gcBgMarkStartWorkers that this worker is ready.
 	// After this point, the background mark worker is scheduled
 	// cooperatively by gcController.findRunnable. Hence, it must
 	// never be preempted, as this would put it into _Grunnable
@@ -1279,26 +1386,51 @@
 	// is set, this puts itself into _Gwaiting to be woken up by
 	// gcController.findRunnable at the appropriate time.
 	notewakeup(&work.bgMarkReady)
+
 	for {
-		// Go to sleep until woken by gcContoller.findRunnable.
+		// Go to sleep until woken by gcController.findRunnable.
 		// We can't releasem yet since even the call to gopark
 		// may be preempted.
-		gopark(func(g *g, mp unsafe.Pointer) bool {
-			releasem((*m)(mp))
+		gopark(func(g *g, parkp unsafe.Pointer) bool {
+			park := (*parkInfo)(parkp)
+
+			// The worker G is no longer running, so it's
+			// now safe to allow preemption.
+			releasem(park.m.ptr())
+
+			// If the worker isn't attached to its P,
+			// attach now. During initialization and after
+			// a phase change, the worker may have been
+			// running on a different P. As soon as we
+			// attach, the owner P may schedule the
+			// worker, so this must be done after the G is
+			// stopped.
+			if park.attach != 0 {
+				p := park.attach.ptr()
+				park.attach.set(nil)
+				// cas the worker because we may be
+				// racing with a new worker starting
+				// on this P.
+				if !p.gcBgMarkWorker.cas(0, guintptr(unsafe.Pointer(g))) {
+					// The P got a new worker.
+					// Exit this worker.
+					return false
+				}
+			}
 			return true
-		}, unsafe.Pointer(mp), "mark worker (idle)", traceEvGoBlock, 0)
+		}, unsafe.Pointer(park), "GC worker (idle)", traceEvGoBlock, 0)
 
 		// Loop until the P dies and disassociates this
-		// worker. (The P may later be reused, in which case
-		// it will get a new worker.)
-		if p.gcBgMarkWorker != gp {
+		// worker (the P may later be reused, in which case
+		// it will get a new worker) or we failed to associate.
+		if _p_.gcBgMarkWorker.ptr() != gp {
 			break
 		}
 
 		// Disable preemption so we can use the gcw. If the
 		// scheduler wants to preempt us, we'll stop draining,
 		// dispose the gcw, and then preempt.
-		mp = acquirem()
+		park.m.set(acquirem())
 
 		if gcBlackenEnabled == 0 {
 			throw("gcBgMarkWorker: blackening not enabled")
@@ -1306,85 +1438,90 @@
 
 		startTime := nanotime()
 
-		decnwait := xadd(&work.nwait, -1)
+		decnwait := atomic.Xadd(&work.nwait, -1)
 		if decnwait == work.nproc {
 			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
 			throw("work.nwait was > work.nproc")
 		}
 
-		done := false
-		switch p.gcMarkWorkerMode {
+		switch _p_.gcMarkWorkerMode {
 		default:
 			throw("gcBgMarkWorker: unexpected gcMarkWorkerMode")
 		case gcMarkWorkerDedicatedMode:
-			gcDrain(&p.gcw, gcBgCreditSlack)
-			// gcDrain did the xadd(&work.nwait +1) to
-			// match the decrement above. It only returns
-			// at a mark completion point.
-			done = true
-			if !p.gcw.empty() {
-				throw("gcDrain returned with buffer")
-			}
+			gcDrain(&_p_.gcw, gcDrainNoBlock|gcDrainFlushBgCredit)
 		case gcMarkWorkerFractionalMode, gcMarkWorkerIdleMode:
-			gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
+			gcDrain(&_p_.gcw, gcDrainUntilPreempt|gcDrainFlushBgCredit)
+		}
 
-			// If we are nearing the end of mark, dispose
-			// of the cache promptly. We must do this
-			// before signaling that we're no longer
-			// working so that other workers can't observe
-			// no workers and no work while we have this
-			// cached, and before we compute done.
-			if gcBlackenPromptly {
-				p.gcw.dispose()
-			}
+		// If we are nearing the end of mark, dispose
+		// of the cache promptly. We must do this
+		// before signaling that we're no longer
+		// working so that other workers can't observe
+		// no workers and no work while we have this
+		// cached, and before we compute done.
+		if gcBlackenPromptly {
+			_p_.gcw.dispose()
+		}
 
-			// Was this the last worker and did we run out
-			// of work?
-			incnwait := xadd(&work.nwait, +1)
-			if incnwait > work.nproc {
-				println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
-					"work.nwait=", incnwait, "work.nproc=", work.nproc)
-				throw("work.nwait > work.nproc")
-			}
-			done = incnwait == work.nproc && work.full == 0 && work.partial == 0
+		// Account for time.
+		duration := nanotime() - startTime
+		switch _p_.gcMarkWorkerMode {
+		case gcMarkWorkerDedicatedMode:
+			atomic.Xaddint64(&gcController.dedicatedMarkTime, duration)
+			atomic.Xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
+		case gcMarkWorkerFractionalMode:
+			atomic.Xaddint64(&gcController.fractionalMarkTime, duration)
+			atomic.Xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
+		case gcMarkWorkerIdleMode:
+			atomic.Xaddint64(&gcController.idleMarkTime, duration)
+		}
+
+		// Was this the last worker and did we run out
+		// of work?
+		incnwait := atomic.Xadd(&work.nwait, +1)
+		if incnwait > work.nproc {
+			println("runtime: p.gcMarkWorkerMode=", _p_.gcMarkWorkerMode,
+				"work.nwait=", incnwait, "work.nproc=", work.nproc)
+			throw("work.nwait > work.nproc")
 		}
 
 		// If this worker reached a background mark completion
 		// point, signal the main GC goroutine.
-		if done {
-			if gcBlackenPromptly {
-				if work.bgMark1.done == 0 {
-					throw("completing mark 2, but bgMark1.done == 0")
-				}
-				work.bgMark2.complete()
-			} else {
-				work.bgMark1.complete()
-			}
-		}
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
+			// Make this G preemptible and disassociate it
+			// as the worker for this P so
+			// findRunnableGCWorker doesn't try to
+			// schedule it.
+			_p_.gcBgMarkWorker.set(nil)
+			releasem(park.m.ptr())
 
-		duration := nanotime() - startTime
-		switch p.gcMarkWorkerMode {
-		case gcMarkWorkerDedicatedMode:
-			xaddint64(&gcController.dedicatedMarkTime, duration)
-			xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
-		case gcMarkWorkerFractionalMode:
-			xaddint64(&gcController.fractionalMarkTime, duration)
-			xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
-		case gcMarkWorkerIdleMode:
-			xaddint64(&gcController.idleMarkTime, duration)
+			gcMarkDone()
+
+			// Disable preemption and prepare to reattach
+			// to the P.
+			//
+			// We may be running on a different P at this
+			// point, so we can't reattach until this G is
+			// parked.
+			park.m.set(acquirem())
+			park.attach.set(_p_)
 		}
 	}
 }
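
The gcDrain calls above compose behavior from flag bits. The declaration lives in mgcmark.go outside this hunk; a sketch consistent with the names used here:

	type gcDrainFlags int

	const (
		gcDrainUntilPreempt gcDrainFlags = 1 << iota
		gcDrainNoBlock
		gcDrainFlushBgCredit

		// gcDrainBlock is the absence of gcDrainUntilPreempt and
		// gcDrainNoBlock: block in getfull until work arrives.
		gcDrainBlock gcDrainFlags = 0
	)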
 
 // gcMarkWorkAvailable returns true if executing a mark worker
-// on p is potentially useful.
+// on p is potentially useful. p may be nil, in which case it only
+// checks the global sources of work.
 func gcMarkWorkAvailable(p *p) bool {
-	if !p.gcw.empty() {
+	if p != nil && !p.gcw.empty() {
 		return true
 	}
-	if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
+	if atomic.Load64(&work.full) != 0 {
 		return true // global work available
 	}
+	if work.markrootNext < work.markrootJobs {
+		return true // root scan work available
+	}
 	return false
 }
 
@@ -1419,6 +1556,9 @@
 	// but must be disposed to the global lists immediately.
 	gcFlushGCWork()
 
+	// Queue root marking jobs.
+	gcMarkRootPrepare()
+
 	work.nwait = 0
 	work.ndone = 0
 	work.nproc = uint32(gcprocs())
@@ -1427,64 +1567,51 @@
 		traceGCScanStart()
 	}
 
-	parforsetup(work.markfor, work.nproc, uint32(_RootCount+allglen), false, markroot)
 	if work.nproc > 1 {
 		noteclear(&work.alldone)
 		helpgc(int32(work.nproc))
 	}
 
 	gchelperstart()
-	parfordo(work.markfor)
 
-	var gcw gcWork
-	gcDrain(&gcw, -1)
+	gcw := &getg().m.p.ptr().gcw
+	gcDrain(gcw, gcDrainBlock)
 	gcw.dispose()
 
+	if debug.gccheckmark > 0 {
+		// This is expensive when there's a large number of
+		// Gs, so only do it if checkmark is also enabled.
+		gcMarkRootCheck()
+	}
 	if work.full != 0 {
 		throw("work.full != 0")
 	}
-	if work.partial != 0 {
-		throw("work.partial != 0")
-	}
 
 	if work.nproc > 1 {
 		notesleep(&work.alldone)
 	}
 
+	// Record that at least one root marking pass has completed.
+	work.markrootDone = true
+
 	for i := 0; i < int(gomaxprocs); i++ {
-		if allp[i].gcw.wbuf != 0 {
+		gcw := &allp[i].gcw
+		if !gcw.empty() {
 			throw("P has cached GC work at end of mark termination")
 		}
+		if gcw.scanWork != 0 || gcw.bytesMarked != 0 {
+			throw("P has unflushed stats at end of mark termination")
+		}
 	}
 
 	if trace.enabled {
 		traceGCScanDone()
 	}
 
-	// TODO(austin): This doesn't have to be done during STW, as
-	// long as we block the next GC cycle until this is done. Move
-	// it after we start the world, but before dropping worldsema.
-	// (See issue #11465.)
-	freeStackSpans()
-
 	cachestats()
 
-	// Compute the reachable heap size at the beginning of the
-	// cycle. This is approximately the marked heap size at the
-	// end (which we know) minus the amount of marked heap that
-	// was allocated after marking began (which we don't know, but
-	// is approximately the amount of heap that was allocated
-	// since marking began).
-	allocatedDuringCycle := memstats.heap_live - work.initialHeapLive
-	if work.bytesMarked >= allocatedDuringCycle {
-		memstats.heap_reachable = work.bytesMarked - allocatedDuringCycle
-	} else {
-		// This can happen if most of the allocation during
-		// the cycle never became reachable from the heap.
-		// Just set the reachable heap approximation to 0 and
-		// let the heapminimum kick in below.
-		memstats.heap_reachable = 0
-	}
+	// Update the reachable heap stat.
+	memstats.heap_reachable = work.bytesMarked
 
 	// Trigger the next GC cycle when the allocated heap has grown
 	// by triggerRatio over the reachable heap size. Assume that
@@ -1499,7 +1626,9 @@
 		throw("next_gc underflow")
 	}
 
-	// Update other GC heap size stats.
+	// Update other GC heap size stats. This must happen after
+	// cachestats (which flushes local statistics to these) and
+	// flushallmcaches (which modifies heap_live).
 	memstats.heap_live = work.bytesMarked
 	memstats.heap_marked = work.bytesMarked
 	memstats.heap_scan = uint64(gcController.scanWork)
@@ -1525,7 +1654,7 @@
 	}
 }
 
-func gcSweep(mode int) {
+func gcSweep(mode gcMode) {
 	if gcphase != _GCoff {
 		throw("gcSweep being done but phase is not GCoff")
 	}
@@ -1554,14 +1683,9 @@
 		return
 	}
 
-	// Account how much sweeping needs to be done before the next
-	// GC cycle and set up proportional sweep statistics.
-	var pagesToSweep uintptr
-	for _, s := range work.spans {
-		if s.state == mSpanInUse {
-			pagesToSweep += s.npages
-		}
-	}
+	// Concurrent sweep needs to sweep all of the in-use pages by
+	// the time the allocated heap reaches the GC trigger. Compute
+	// the ratio of in-use pages to sweep per byte allocated.
 	heapDistance := int64(memstats.next_gc) - int64(memstats.heap_live)
 	// Add a little margin so rounding errors and concurrent
 	// sweep are less likely to leave pages unswept when GC starts.
@@ -1571,7 +1695,7 @@
 		heapDistance = _PageSize
 	}
 	lock(&mheap_.lock)
-	mheap_.sweepPagesPerByte = float64(pagesToSweep) / float64(heapDistance)
+	mheap_.sweepPagesPerByte = float64(mheap_.pagesInUse) / float64(heapDistance)
 	mheap_.pagesSwept = 0
 	mheap_.spanBytesAlloc = 0
 	unlock(&mheap_.lock)
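
sweepPagesPerByte turns the remaining runway into a pacing rate: by the time the heap allocates heapDistance bytes, all of pagesInUse must be swept. A sketch of the consumer side, which lives in the span allocation path outside this hunk:

	// sweepPagesOwed is the number of pages that must already have
	// been swept once spanBytesAlloc bytes have been allocated.
	func sweepPagesOwed(sweepPagesPerByte float64, spanBytesAlloc uint64) uintptr {
		return uintptr(sweepPagesPerByte * float64(spanBytesAlloc))
	}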
@@ -1580,7 +1704,7 @@
 	lock(&sweep.lock)
 	if sweep.parked {
 		sweep.parked = false
-		ready(sweep.g, 0)
+		ready(sweep.g, 0, true)
 	}
 	unlock(&sweep.lock)
 	mProf_GC()
@@ -1605,29 +1729,33 @@
 	unlock(&mheap_.lock)
 }
 
-// gcResetGState resets the GC state of all G's and returns the length
-// of allgs.
-func gcResetGState() (numgs int) {
+// gcResetMarkState resets global state prior to marking (concurrent
+// or STW) and resets the stack scan state of all Gs.
+//
+// This is safe to do without the world stopped because any Gs created
+// during or after this will start out in the reset state.
+func gcResetMarkState() {
 	// This may be called during a concurrent phase, so make sure
 	// allgs doesn't change.
+	if !(gcphase == _GCoff || gcphase == _GCmarktermination) {
+		// Accessing gcRescan is unsafe.
+		throw("bad GC phase")
+	}
 	lock(&allglock)
 	for _, gp := range allgs {
 		gp.gcscandone = false  // set to true in gcphasework
 		gp.gcscanvalid = false // stack has not been scanned
-		gp.gcalloc = 0
-		gp.gcscanwork = 0
+		gp.gcRescan = -1
+		gp.gcAssistBytes = 0
 	}
-	numgs = len(allgs)
 	unlock(&allglock)
-	return
-}
 
-// gcResetMarkState resets state prior to marking (concurrent or STW).
-//
-// TODO(austin): Merge with gcResetGState. See issue #11427.
-func gcResetMarkState() {
+	// Clear rescan list.
+	work.rescan.list = work.rescan.list[:0]
+
 	work.bytesMarked = 0
 	work.initialHeapLive = memstats.heap_live
+	work.markrootDone = false
 }
 
 // Hooks for other packages
@@ -1672,17 +1800,6 @@
 		sched.deferpool[i] = nil
 	}
 	unlock(&sched.deferlock)
-
-	for _, p := range &allp {
-		if p == nil {
-			break
-		}
-		// clear tinyalloc pool
-		if c := p.mcache; c != nil {
-			c.tiny = nil
-			c.tinyoffset = 0
-		}
-	}
 }
 
 // Timing
@@ -1697,11 +1814,10 @@
 		traceGCScanStart()
 	}
 
-	// parallel mark for over GC roots
-	parfordo(work.markfor)
-	if gcphase != _GCscan {
-		var gcw gcWork
-		gcDrain(&gcw, -1) // blocks in getfull
+	// Parallel mark over GC roots and heap
+	if gcphase == _GCmarktermination {
+		gcw := &_g_.m.p.ptr().gcw
+		gcDrain(gcw, gcDrainBlock) // blocks in getfull
 		gcw.dispose()
 	}
 
@@ -1710,7 +1826,7 @@
 	}
 
 	nproc := work.nproc // work.nproc can change right after we increment work.ndone
-	if xadd(&work.ndone, +1) == nproc-1 {
+	if atomic.Xadd(&work.ndone, +1) == nproc-1 {
 		notewakeup(&work.alldone)
 	}
 	_g_.m.traceback = 0
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 44f9512..00b96fd 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -6,114 +6,193 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
-// Scan all of the stacks, greying (or graying if in America) the referents
-// but not blackening them since the mark write barrier isn't installed.
+const (
+	fixedRootFinalizers = iota
+	fixedRootFlushCaches
+	fixedRootFreeGStacks
+	fixedRootCount
+
+	// rootBlockBytes is the number of bytes to scan per data or
+	// BSS root.
+	rootBlockBytes = 256 << 10
+
+	// rootBlockSpans is the number of spans to scan per span
+	// root.
+	rootBlockSpans = 8 * 1024 // 64MB worth of spans
+)
+
+// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
+// some miscellany) and initializes scanning-related state.
+//
+// The caller must have called gcCopySpans().
+//
+// The world must be stopped.
+//
 //go:nowritebarrier
-func gcscan_m() {
-	_g_ := getg()
+func gcMarkRootPrepare() {
+	// Compute how many data and BSS root blocks there are.
+	nBlocks := func(bytes uintptr) int {
+		return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
+	}
 
-	// Grab the g that called us and potentially allow rescheduling.
-	// This allows it to be scanned like other goroutines.
-	mastergp := _g_.m.curg
-	casgstatus(mastergp, _Grunning, _Gwaiting)
-	mastergp.waitreason = "garbage collection scan"
+	work.nDataRoots = 0
+	work.nBSSRoots = 0
 
-	// Span sweeping has been done by finishsweep_m.
-	// Long term we will want to make this goroutine runnable
-	// by placing it onto a scanenqueue state and then calling
-	// runtime·restartg(mastergp) to make it Grunnable.
-	// At the bottom we will want to return this p back to the scheduler.
+	// Only scan globals once per cycle; preferably concurrently.
+	if !work.markrootDone {
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			nDataRoots := nBlocks(datap.edata - datap.data)
+			if nDataRoots > work.nDataRoots {
+				work.nDataRoots = nDataRoots
+			}
+		}
 
-	// Prepare flag indicating that the scan has not been completed.
-	local_allglen := gcResetGState()
+		for datap := &firstmoduledata; datap != nil; datap = datap.next {
+			nBSSRoots := nBlocks(datap.ebss - datap.bss)
+			if nBSSRoots > work.nBSSRoots {
+				work.nBSSRoots = nBSSRoots
+			}
+		}
+	}
 
-	work.ndone = 0
-	useOneP := uint32(1) // For now do not do this in parallel.
-	//	ackgcphase is not needed since we are not scanning running goroutines.
-	parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
-	parfordo(work.markfor)
+	if !work.markrootDone {
+		// On the first markroot, we need to scan span roots.
+		// In concurrent GC, this happens during concurrent
+		// mark and we depend on addfinalizer to ensure the
+		// above invariants for objects that get finalizers
+		// after concurrent mark. In STW GC, this will happen
+		// during mark termination.
+		work.nSpanRoots = (len(work.spans) + rootBlockSpans - 1) / rootBlockSpans
+
+		// On the first markroot, we need to scan all Gs. Gs
+		// may be created after this point, but it's okay that
+		// we ignore them because they begin life without any
+		// roots, so there's nothing to scan, and any roots
+		// they create during the concurrent phase will be
+		// scanned during mark termination. During mark
+		// termination, allglen isn't changing, so we'll scan
+		// all Gs.
+		work.nStackRoots = int(atomic.Loaduintptr(&allglen))
+		work.nRescanRoots = 0
+	} else {
+		// We've already scanned span roots and kept the scan
+		// up-to-date during concurrent mark.
+		work.nSpanRoots = 0
+
+		// On the second pass of markroot, we're just scanning
+		// dirty stacks. It's safe to access rescan since the
+		// world is stopped.
+		work.nStackRoots = 0
+		work.nRescanRoots = len(work.rescan.list)
+	}
+
+	work.markrootNext = 0
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots + work.nRescanRoots)
+}
+
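The counts computed above define a flat job index space that markroot (below) decodes with running base offsets. A sketch of the layout, plus the rounding the nBlocks helper performs (600 KB of data with 256 KB blocks yields 3 jobs):

	// Index layout consumed by markroot:
	//
	//   [0, fixedRootCount)       finalizers, flushCaches, freeGStacks
	//   [baseData, baseBSS)       data segment blocks
	//   [baseBSS, baseSpans)      BSS segment blocks
	//   [baseSpans, baseStacks)   span roots (first pass only)
	//   [baseStacks, baseRescan)  goroutine stacks
	//   [baseRescan, end)         rescan list (second pass only)

	func ceilDiv(bytes, blockBytes uintptr) int {
		return int((bytes + blockBytes - 1) / blockBytes) // 600<<10, 256<<10 -> 3
	}
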
+// gcMarkRootCheck checks that all roots have been scanned. It is
+// purely for debugging.
+func gcMarkRootCheck() {
+	if work.markrootNext < work.markrootJobs {
+		print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
+		throw("left over markroot jobs")
+	}
 
 	lock(&allglock)
-	// Check that gc work is done.
-	for i := 0; i < local_allglen; i++ {
-		gp := allgs[i]
-		if !gp.gcscandone {
-			throw("scan missed a g")
+	// Check that stacks have been scanned.
+	if gcphase == _GCmarktermination {
+		for i := 0; i < len(allgs); i++ {
+			gp := allgs[i]
+			if !(gp.gcscandone && gp.gcscanvalid) && readgstatus(gp) != _Gdead {
+				println("gp", gp, "goid", gp.goid,
+					"status", readgstatus(gp),
+					"gcscandone", gp.gcscandone,
+					"gcscanvalid", gp.gcscanvalid)
+				throw("scan missed a g")
+			}
+		}
+	} else {
+		for i := 0; i < work.nStackRoots; i++ {
+			gp := allgs[i]
+			if !gp.gcscandone {
+				throw("scan missed a g")
+			}
 		}
 	}
 	unlock(&allglock)
-
-	casgstatus(mastergp, _Gwaiting, _Grunning)
-	// Let the g that called us continue to run.
 }
 
 // ptrmask for an allocation containing a single pointer.
 var oneptrmask = [...]uint8{1}
 
+// markroot scans the i'th root.
+//
+// Preemption must be disabled (because this uses a gcWork).
+//
+// nowritebarrier is only advisory here.
+//
 //go:nowritebarrier
-func markroot(desc *parfor, i uint32) {
-	// TODO: Consider using getg().m.p.ptr().gcw.
-	var gcw gcWork
+func markroot(gcw *gcWork, i uint32) {
+	// TODO(austin): This is a bit ridiculous. Compute and store
+	// the bases in gcMarkRootPrepare instead of the counts.
+	baseData := uint32(fixedRootCount)
+	baseBSS := baseData + uint32(work.nDataRoots)
+	baseSpans := baseBSS + uint32(work.nBSSRoots)
+	baseStacks := baseSpans + uint32(work.nSpanRoots)
+	baseRescan := baseStacks + uint32(work.nStackRoots)
+	end := baseRescan + uint32(work.nRescanRoots)
 
 	// Note: if you add a case here, please also update heapdump.go:dumproots.
-	switch i {
-	case _RootData:
+	switch {
+	case baseData <= i && i < baseBSS:
 		for datap := &firstmoduledata; datap != nil; datap = datap.next {
-			scanblock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, &gcw)
+			markrootBlock(datap.data, datap.edata-datap.data, datap.gcdatamask.bytedata, gcw, int(i-baseData))
 		}
 
-	case _RootBss:
+	case baseBSS <= i && i < baseSpans:
 		for datap := &firstmoduledata; datap != nil; datap = datap.next {
-			scanblock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, &gcw)
+			markrootBlock(datap.bss, datap.ebss-datap.bss, datap.gcbssmask.bytedata, gcw, int(i-baseBSS))
 		}
 
-	case _RootFinalizers:
+	case i == fixedRootFinalizers:
 		for fb := allfin; fb != nil; fb = fb.alllink {
-			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], &gcw)
+			scanblock(uintptr(unsafe.Pointer(&fb.fin[0])), uintptr(fb.cnt)*unsafe.Sizeof(fb.fin[0]), &finptrmask[0], gcw)
 		}
 
-	case _RootSpans:
-		// mark MSpan.specials
-		sg := mheap_.sweepgen
-		for spanidx := uint32(0); spanidx < uint32(len(work.spans)); spanidx++ {
-			s := work.spans[spanidx]
-			if s.state != mSpanInUse {
-				continue
-			}
-			if !useCheckmark && s.sweepgen != sg {
-				// sweepgen was updated (+2) during non-checkmark GC pass
-				print("sweep ", s.sweepgen, " ", sg, "\n")
-				throw("gc: unswept span")
-			}
-			for sp := s.specials; sp != nil; sp = sp.next {
-				if sp.kind != _KindSpecialFinalizer {
-					continue
-				}
-				// don't mark finalized object, but scan it so we
-				// retain everything it points to.
-				spf := (*specialfinalizer)(unsafe.Pointer(sp))
-				// A finalizer can be set for an inner byte of an object, find object beginning.
-				p := uintptr(s.start<<_PageShift) + uintptr(spf.special.offset)/s.elemsize*s.elemsize
-				if gcphase != _GCscan {
-					scanobject(p, &gcw) // scanned during mark termination
-				}
-				scanblock(uintptr(unsafe.Pointer(&spf.fn)), ptrSize, &oneptrmask[0], &gcw)
-			}
-		}
-
-	case _RootFlushCaches:
-		if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
+	case i == fixedRootFlushCaches:
+		if gcphase == _GCmarktermination { // Do not flush mcaches during concurrent phase.
 			flushallmcaches()
 		}
 
+	case i == fixedRootFreeGStacks:
+		// Only do this once per GC cycle; preferably
+		// concurrently.
+		if !work.markrootDone {
+			// Switch to the system stack so we can call
+			// stackfree.
+			systemstack(markrootFreeGStacks)
+		}
+
+	case baseSpans <= i && i < baseStacks:
+		// mark MSpan.specials
+		markrootSpans(gcw, int(i-baseSpans))
+
 	default:
 		// the rest is scanning goroutine stacks
-		if uintptr(i-_RootCount) >= allglen {
+		var gp *g
+		if baseStacks <= i && i < baseRescan {
+			gp = allgs[i-baseStacks]
+		} else if baseRescan <= i && i < end {
+			gp = work.rescan.list[i-baseRescan].ptr()
+		} else {
 			throw("markroot: bad index")
 		}
-		gp := allgs[i-_RootCount]
 
 		// remember when we've first observed the G blocked
 		// needed only to output in traceback
@@ -122,42 +201,188 @@
 			gp.waitsince = work.tstart
 		}
 
-		// Shrink a stack if not much of it is being used but not in the scan phase.
-		if gcphase == _GCmarktermination {
-			// Shrink during STW GCmarktermination phase thus avoiding
-			// complications introduced by shrinking during
-			// non-STW phases.
-			shrinkstack(gp)
+		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC && readgstatus(gp) != _Gdead {
+			// GC background workers may be
+			// non-preemptible, so we may deadlock if we
+			// try to scan them during a concurrent phase.
+			// They also have tiny stacks, so just ignore
+			// them until mark termination.
+			gp.gcscandone = true
+			queueRescan(gp)
+			break
 		}
 
-		scang(gp)
-	}
+		// scang must be done on the system stack in case
+		// we're trying to scan our own stack.
+		systemstack(func() {
+			// If this is a self-scan, put the user G in
+			// _Gwaiting to prevent self-deadlock. It may
+			// already be in _Gwaiting if this is mark
+			// termination.
+			userG := getg().m.curg
+			selfScan := gp == userG && readgstatus(userG) == _Grunning
+			if selfScan {
+				casgstatus(userG, _Grunning, _Gwaiting)
+				userG.waitreason = "garbage collection scan"
+			}
 
-	gcw.dispose()
+			// TODO: scang blocks until gp's stack has
+			// been scanned, which may take a while for
+			// running goroutines. Consider doing this in
+			// two phases where the first is non-blocking:
+			// we scan the stacks we can and ask running
+			// goroutines to scan themselves; and the
+			// second blocks.
+			scang(gp, gcw)
+
+			if selfScan {
+				casgstatus(userG, _Gwaiting, _Grunning)
+			}
+		})
+	}
 }
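The base computation at the top of markroot carves one flat index space into contiguous per-class ranges, which is what the TODO suggests precomputing. A hedged sketch of the same dispatch, with fixedRootCount and the per-class counts taken as plain inputs rather than runtime state:

package sketch

// rootClass reports which root class a flat markroot index falls in,
// mirroring the switch above. In the runtime, fixedRootCount is a
// constant and the counts live in the work struct.
func rootClass(i, fixedRootCount, nData, nBSS, nSpans, nStacks, nRescan uint32) string {
	baseData := fixedRootCount
	baseBSS := baseData + nData
	baseSpans := baseBSS + nBSS
	baseStacks := baseSpans + nSpans
	baseRescan := baseStacks + nStacks
	switch {
	case i < baseData:
		return "fixed root" // finalizers, flush caches, free G stacks
	case i < baseBSS:
		return "data shard" // shard i - baseData of each module's data
	case i < baseSpans:
		return "bss shard"
	case i < baseStacks:
		return "span shard"
	case i < baseRescan:
		return "goroutine stack"
	case i < baseRescan+nRescan:
		return "rescan list entry"
	default:
		return "bad index" // markroot throws here
	}
}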
 
-// gcAssistAlloc records and allocation of size bytes and, if
-// allowAssist is true, may assist GC scanning in proportion to the
-// allocations performed by this mutator since the last assist.
+// markrootBlock scans the shard'th shard of the block of memory [b0,
+// b0+n0), with the given pointer mask.
 //
-// It should only be called if gcAssistAlloc != 0.
-//
-// This must be called with preemption disabled.
 //go:nowritebarrier
-func gcAssistAlloc(size uintptr, allowAssist bool) {
-	// Find the G responsible for this assist.
-	gp := getg()
-	if gp.m.curg != nil {
-		gp = gp.m.curg
+func markrootBlock(b0, n0 uintptr, ptrmask0 *uint8, gcw *gcWork, shard int) {
+	if rootBlockBytes%(8*sys.PtrSize) != 0 {
+		// This is necessary to pick byte offsets in ptrmask0.
+		throw("rootBlockBytes must be a multiple of 8*ptrSize")
 	}
 
-	// Record allocation.
-	gp.gcalloc += size
+	b := b0 + uintptr(shard)*rootBlockBytes
+	if b >= b0+n0 {
+		return
+	}
+	ptrmask := (*uint8)(add(unsafe.Pointer(ptrmask0), uintptr(shard)*(rootBlockBytes/(8*sys.PtrSize))))
+	n := uintptr(rootBlockBytes)
+	if b+n > b0+n0 {
+		n = b0 + n0 - b
+	}
 
-	if !allowAssist {
+	// Scan this shard.
+	scanblock(b, n, ptrmask, gcw)
+}
+
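The throw in markrootBlock enforces a divisibility requirement: one ptrmask byte covers 8 words, i.e. 8*PtrSize bytes of data, so rootBlockBytes must be a multiple of that for each shard's mask to begin on a whole byte. A worked sketch of the offset computation, assuming 8-byte pointers:

package sketch

const ptrSize = 8                    // assumed 64-bit target
const bytesPerMaskByte = 8 * ptrSize // one mask byte covers 8 words = 64 bytes

// maskByteOffset returns where a shard's slice of the pointer mask
// begins, mirroring the add(ptrmask0, ...) arithmetic above.
func maskByteOffset(shard, blockBytes uintptr) uintptr {
	if blockBytes%bytesPerMaskByte != 0 {
		panic("block size must be a multiple of 8*ptrSize") // same invariant as the throw
	}
	return shard * (blockBytes / bytesPerMaskByte)
}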
+// markrootFreeGStacks frees stacks of dead Gs.
+//
+// This does not free stacks of dead Gs cached on Ps, but having a few
+// cached stacks around isn't a problem.
+//
+//TODO go:nowritebarrier
+func markrootFreeGStacks() {
+	// Take list of dead Gs with stacks.
+	lock(&sched.gflock)
+	list := sched.gfreeStack
+	sched.gfreeStack = nil
+	unlock(&sched.gflock)
+	if list == nil {
 		return
 	}
 
+	// Free stacks.
+	tail := list
+	for gp := list; gp != nil; gp = gp.schedlink.ptr() {
+		shrinkstack(gp)
+		tail = gp
+	}
+
+	// Put Gs back on the free list.
+	lock(&sched.gflock)
+	tail.schedlink.set(sched.gfreeNoStack)
+	sched.gfreeNoStack = list
+	unlock(&sched.gflock)
+}
+
+// markrootSpans marks roots for one shard of work.spans.
+//
+//go:nowritebarrier
+func markrootSpans(gcw *gcWork, shard int) {
+	// Objects with finalizers have two GC-related invariants:
+	//
+	// 1) Everything reachable from the object must be marked.
+	// This ensures that when we pass the object to its finalizer,
+	// everything the finalizer can reach will be retained.
+	//
+	// 2) Finalizer specials (which are not in the garbage
+	// collected heap) are roots. In practice, this means the fn
+	// field must be scanned.
+	//
+	// TODO(austin): There are several ideas for making this more
+	// efficient in issue #11485.
+
+	if work.markrootDone {
+		throw("markrootSpans during second markroot")
+	}
+
+	sg := mheap_.sweepgen
+	startSpan := shard * rootBlockSpans
+	endSpan := (shard + 1) * rootBlockSpans
+	if endSpan > len(work.spans) {
+		endSpan = len(work.spans)
+	}
+	// Note that work.spans may not include spans that were
+	// allocated between entering the scan phase and now. This is
+	// okay because any objects with finalizers in those spans
+	// must have been allocated and given finalizers after we
+	// entered the scan phase, so addfinalizer will have ensured
+	// the above invariants for them.
+	for _, s := range work.spans[startSpan:endSpan] {
+		if s.state != mSpanInUse {
+			continue
+		}
+		if !useCheckmark && s.sweepgen != sg {
+			// sweepgen was updated (+2) during non-checkmark GC pass
+			print("sweep ", s.sweepgen, " ", sg, "\n")
+			throw("gc: unswept span")
+		}
+
+		// Speculatively check if there are any specials
+		// without acquiring the span lock. This may race with
+		// adding the first special to a span, but in that
+		// case addfinalizer will observe that the GC is
+		// active (which is globally synchronized) and ensure
+		// the above invariants. We may also ensure the
+		// invariants, but it's okay to scan an object twice.
+		if s.specials == nil {
+			continue
+		}
+
+		// Lock the specials to prevent a special from being
+		// removed from the list while we're traversing it.
+		lock(&s.speciallock)
+
+		for sp := s.specials; sp != nil; sp = sp.next {
+			if sp.kind != _KindSpecialFinalizer {
+				continue
+			}
+			// don't mark finalized object, but scan it so we
+			// retain everything it points to.
+			spf := (*specialfinalizer)(unsafe.Pointer(sp))
+			// A finalizer can be set for an inner byte of an object, find object beginning.
+			p := s.base() + uintptr(spf.special.offset)/s.elemsize*s.elemsize
+
+			// Mark everything that can be reached from
+			// the object (but *not* the object itself or
+			// we'll never collect it).
+			scanobject(p, gcw)
+
+			// The special itself is a root.
+			scanblock(uintptr(unsafe.Pointer(&spf.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+		}
+
+		unlock(&s.speciallock)
+	}
+}
+
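The shard bounds in markrootSpans above are simple clamped slicing over work.spans. A tiny sketch of that bounds arithmetic, with rootBlockSpans treated as a given constant:

package sketch

// spanRange maps a shard to its half-open range of span indices,
// clamping the final shard, as markrootSpans does.
func spanRange(shard, rootBlockSpans, nSpans int) (start, end int) {
	start = shard * rootBlockSpans
	end = start + rootBlockSpans
	if end > nSpans {
		end = nSpans // the last shard may be short
	}
	return start, end
}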
+// gcAssistAlloc performs GC work to make gp's assist debt positive.
+// gp must be the calling user goroutine.
+//
+// This must be called with preemption enabled.
+//go:nowritebarrier
+func gcAssistAlloc(gp *g) {
 	// Don't assist in non-preemptible contexts. These are
 	// generally fragile and won't allow the assist to block.
 	if getg() == gp.m.g0 {
@@ -167,13 +392,11 @@
 		return
 	}
 
-	// Compute the amount of assist scan work we need to do.
-	scanWork := int64(gcController.assistRatio*float64(gp.gcalloc)) - gp.gcscanwork
-	// scanWork can be negative if the last assist scanned a large
-	// object and we're still ahead of our assist goal.
-	if scanWork <= 0 {
-		return
-	}
+	// Compute the amount of scan work we need to do to make the
+	// balance positive. We over-assist to build up credit for
+	// future allocations and amortize the cost of assisting.
+	debtBytes := -gp.gcAssistBytes + gcOverAssistBytes
+	scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
 
 retry:
 	// Steal as much credit as we can from the background GC's
@@ -182,20 +405,23 @@
 	// will just cause steals to fail until credit is accumulated
 	// again, so in the long run it doesn't really matter, but we
 	// do have to handle the negative credit case.
-	bgScanCredit := atomicloadint64(&gcController.bgScanCredit)
+	bgScanCredit := atomic.Loadint64(&gcController.bgScanCredit)
 	stolen := int64(0)
 	if bgScanCredit > 0 {
 		if bgScanCredit < scanWork {
 			stolen = bgScanCredit
+			gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
 		} else {
 			stolen = scanWork
+			gp.gcAssistBytes += debtBytes
 		}
-		xaddint64(&gcController.bgScanCredit, -stolen)
+		atomic.Xaddint64(&gcController.bgScanCredit, -stolen)
 
 		scanWork -= stolen
-		gp.gcscanwork += stolen
 
 		if scanWork == 0 {
+			// We were able to steal all of the credit we
+			// needed.
 			return
 		}
 	}
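The steal logic above leans on two reciprocal controller ratios: assistWorkPerByte converts allocation debt into scan work, and assistBytesPerWork converts performed or stolen work back into byte credit. A standalone sketch with made-up ratios, including the "1 +" poor man's round-up used when crediting stolen work:

package main

import "fmt"

func main() {
	// Illustrative ratios; in the runtime they come from gcController
	// and are kept reciprocal.
	assistWorkPerByte := 0.5
	assistBytesPerWork := 1 / assistWorkPerByte // 2.0

	debtBytes := int64(4096)                                  // what gp owes, plus over-assist
	scanWork := int64(assistWorkPerByte * float64(debtBytes)) // 2048 units to perform

	stolen := int64(100) // background credit taken instead of scanning
	// "1 +" rounds up so even a tiny ratio yields some byte credit.
	byteCredit := 1 + int64(assistBytesPerWork*float64(stolen))
	fmt.Println(scanWork, byteCredit) // 2048 201
}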
@@ -203,7 +429,7 @@
 	// Perform assist work
 	completed := false
 	systemstack(func() {
-		if atomicload(&gcBlackenEnabled) == 0 {
+		if atomic.Load(&gcBlackenEnabled) == 0 {
 			// The gcBlackenEnabled check in malloc races with the
 			// store that clears it but an atomic check in every malloc
 			// would be a performance hit.
@@ -211,7 +437,7 @@
 			// stack to determine if we should perform an assist.
 
 			// GC is done, so ignore any remaining debt.
-			scanWork = 0
+			gp.gcAssistBytes = 0
 			return
 		}
 		// Track time spent in this assist. Since we're on the
@@ -219,7 +445,7 @@
 		// just measure start and end time.
 		startTime := nanotime()
 
-		decnwait := xadd(&work.nwait, -1)
+		decnwait := atomic.Xadd(&work.nwait, -1)
 		if decnwait == work.nproc {
 			println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
 			throw("nwait > work.nprocs")
@@ -228,20 +454,24 @@
 		// drain own cached work first in the hopes that it
 		// will be more cache friendly.
 		gcw := &getg().m.p.ptr().gcw
-		startScanWork := gcw.scanWork
-		gcDrainN(gcw, scanWork)
-		// Record that we did this much scan work.
-		workDone := gcw.scanWork - startScanWork
-		gp.gcscanwork += workDone
-		scanWork -= workDone
+		workDone := gcDrainN(gcw, scanWork)
 		// If we are near the end of the mark phase
 		// dispose of the gcw.
 		if gcBlackenPromptly {
 			gcw.dispose()
 		}
+
+		// Record that we did this much scan work.
+		//
+		// Back out the number of bytes of assist credit that
+		// this scan work counts for. The "1+" is a poor man's
+		// round-up, to ensure this adds credit even if
+		// assistBytesPerWork is very low.
+		gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
+
 		// If this is the last worker and we ran out of work,
 		// signal a completion point.
-		incnwait := xadd(&work.nwait, +1)
+		incnwait := atomic.Xadd(&work.nwait, +1)
 		if incnwait > work.nproc {
 			println("runtime: work.nwait=", incnwait,
 				"work.nproc=", work.nproc,
@@ -249,60 +479,184 @@
 			throw("work.nwait > work.nproc")
 		}
 
-		if incnwait == work.nproc && work.full == 0 && work.partial == 0 {
+		if incnwait == work.nproc && !gcMarkWorkAvailable(nil) {
 			// This has reached a background completion
 			// point.
-			if gcBlackenPromptly {
-				if work.bgMark1.done == 0 {
-					throw("completing mark 2, but bgMark1.done == 0")
-				}
-				work.bgMark2.complete()
-			} else {
-				work.bgMark1.complete()
-			}
 			completed = true
 		}
 		duration := nanotime() - startTime
 		_p_ := gp.m.p.ptr()
 		_p_.gcAssistTime += duration
 		if _p_.gcAssistTime > gcAssistTimeSlack {
-			xaddint64(&gcController.assistTime, _p_.gcAssistTime)
+			atomic.Xaddint64(&gcController.assistTime, _p_.gcAssistTime)
 			_p_.gcAssistTime = 0
 		}
 	})
 
 	if completed {
-		// We called complete() above, so we should yield to
-		// the now-runnable GC coordinator.
-		Gosched()
-
-		// It's likely that this assist wasn't able to pay off
-		// its debt, but it's also likely that the Gosched let
-		// the GC finish this cycle and there's no point in
-		// waiting. If the GC finished, skip the delay below.
-		if atomicload(&gcBlackenEnabled) == 0 {
-			scanWork = 0
-		}
+		gcMarkDone()
 	}
 
-	if scanWork > 0 {
+	if gp.gcAssistBytes < 0 {
 		// We were unable to steal enough credit or perform
 		// enough work to pay off the assist debt. We need to
 		// do one of these before letting the mutator allocate
-		// more, so go around again after performing an
-		// interruptible sleep for 100 us (the same as the
-		// getfull barrier) to let other mutators run.
-		timeSleep(100 * 1000)
-		goto retry
+		// more to prevent over-allocation.
+		//
+		// If this is because we were preempted, reschedule
+		// and try some more.
+		if gp.preempt {
+			Gosched()
+			goto retry
+		}
+
+		// Add this G to an assist queue and park. When the GC
+		// has more background credit, it will satisfy queued
+		// assists before flushing to the global credit pool.
+		//
+		// Note that this does *not* get woken up when more
+		// work is added to the work list. The theory is that
+		// there wasn't enough work to do anyway, so we might
+		// as well let background marking take care of the
+		// work that is available.
+		lock(&work.assistQueue.lock)
+
+		// If the GC cycle is over, just return. This is the
+		// likely path if we completed above. We do this
+		// under the lock to prevent a GC cycle from ending
+		// between this check and queuing the assist.
+		if atomic.Load(&gcBlackenEnabled) == 0 {
+			unlock(&work.assistQueue.lock)
+			return
+		}
+
+		oldHead, oldTail := work.assistQueue.head, work.assistQueue.tail
+		if oldHead == 0 {
+			work.assistQueue.head.set(gp)
+		} else {
+			oldTail.ptr().schedlink.set(gp)
+		}
+		work.assistQueue.tail.set(gp)
+		gp.schedlink.set(nil)
+		// Recheck for background credit now that this G is in
+		// the queue, but can still back out. This avoids a
+		// race in case background marking has flushed more
+		// credit since we checked above.
+		if atomic.Loadint64(&gcController.bgScanCredit) > 0 {
+			work.assistQueue.head = oldHead
+			work.assistQueue.tail = oldTail
+			if oldTail != 0 {
+				oldTail.ptr().schedlink.set(nil)
+			}
+			unlock(&work.assistQueue.lock)
+			goto retry
+		}
+		// Park for real.
+		goparkunlock(&work.assistQueue.lock, "GC assist wait", traceEvGoBlock, 2)
+
+		// At this point either background GC has satisfied
+		// this G's assist debt, or the GC cycle is over.
 	}
 }
 
-//go:nowritebarrier
-func scanstack(gp *g) {
-	if gp.gcscanvalid {
-		if gcphase == _GCmarktermination {
-			gcRemoveStackBarriers(gp)
+// gcWakeAllAssists wakes all currently blocked assists. This is used
+// at the end of a GC cycle. gcBlackenEnabled must be false to prevent
+// new assists from going to sleep after this point.
+func gcWakeAllAssists() {
+	lock(&work.assistQueue.lock)
+	injectglist(work.assistQueue.head.ptr())
+	work.assistQueue.head.set(nil)
+	work.assistQueue.tail.set(nil)
+	unlock(&work.assistQueue.lock)
+}
+
+// gcFlushBgCredit flushes scanWork units of background scan work
+// credit. This first satisfies blocked assists on the
+// work.assistQueue and then flushes any remaining credit to
+// gcController.bgScanCredit.
+//
+// Write barriers are disallowed because this is used by gcDrain after
+// it has ensured that all work is drained and this must preserve that
+// condition.
+//
+//go:nowritebarrierrec
+func gcFlushBgCredit(scanWork int64) {
+	if work.assistQueue.head == 0 {
+		// Fast path; there are no blocked assists. There's a
+		// small window here where an assist may add itself to
+		// the blocked queue and park. If that happens, we'll
+		// just get it on the next flush.
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+		return
+	}
+
+	scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
+
+	lock(&work.assistQueue.lock)
+	gp := work.assistQueue.head.ptr()
+	for gp != nil && scanBytes > 0 {
+		// Note that gp.gcAssistBytes is negative because gp
+		// is in debt. Think carefully about the signs below.
+		if scanBytes+gp.gcAssistBytes >= 0 {
+			// Satisfy this entire assist debt.
+			scanBytes += gp.gcAssistBytes
+			gp.gcAssistBytes = 0
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			// It's important that we *not* put xgp in
+			// runnext. Otherwise, it's possible for user
+			// code to exploit the GC worker's high
+			// scheduler priority to get itself always run
+			// before other goroutines and always in the
+			// fresh quantum started by GC.
+			ready(xgp, 0, false)
+		} else {
+			// Partially satisfy this assist.
+			gp.gcAssistBytes += scanBytes
+			scanBytes = 0
+			// As a heuristic, we move this assist to the
+			// back of the queue so that large assists
+			// can't clog up the assist queue and
+			// substantially delay small assists.
+			xgp := gp
+			gp = gp.schedlink.ptr()
+			if gp == nil {
+				// gp is the only assist in the queue.
+				gp = xgp
+			} else {
+				xgp.schedlink = 0
+				work.assistQueue.tail.ptr().schedlink.set(xgp)
+				work.assistQueue.tail.set(xgp)
+			}
+			break
 		}
+	}
+	work.assistQueue.head.set(gp)
+	if gp == nil {
+		work.assistQueue.tail.set(nil)
+	}
+
+	if scanBytes > 0 {
+		// Convert from scan bytes back to work.
+		scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
+		atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
+	}
+	unlock(&work.assistQueue.lock)
+}
+
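A note on gcFlushBgCredit's sign discipline above: a queued assist's gcAssistBytes is negative while the G is in debt, so adding positive scanBytes either clears the debt (the G is readied) or merely shrinks it. A standalone sketch of the two branches:

package main

import "fmt"

// satisfy applies scanBytes of credit to one queued assist, mirroring
// the branch in gcFlushBgCredit. gcAssistBytes < 0 means debt.
func satisfy(scanBytes, gcAssistBytes int64) (left, newDebt int64, ready bool) {
	if scanBytes+gcAssistBytes >= 0 {
		return scanBytes + gcAssistBytes, 0, true // fully satisfied; ready the G
	}
	return 0, gcAssistBytes + scanBytes, false // partially satisfied; keep queued
}

func main() {
	fmt.Println(satisfy(1<<20, -300<<10))  // 741376 0 true
	fmt.Println(satisfy(64<<10, -300<<10)) // 0 -241664 false
}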
+// scanstack scans gp's stack, greying all pointers found on the stack.
+//
+// During mark phase, it also installs stack barriers while traversing
+// gp's stack. During mark termination, it stops scanning when it
+// reaches an unhit stack barrier.
+//
+// scanstack is marked go:systemstack because it must not be preempted
+// while using a workbuf.
+//
+//go:nowritebarrier
+//go:systemstack
+func scanstack(gp *g, gcw *gcWork) {
+	if gp.gcscanvalid {
 		return
 	}
 
@@ -332,14 +686,22 @@
 		throw("can't scan gchelper stack")
 	}
 
+	// Shrink the stack if not much of it is being used. During
+	// concurrent GC, we can do this during concurrent mark.
+	if !work.markrootDone {
+		shrinkstack(gp)
+	}
+
+	// Prepare for stack barrier insertion/removal.
 	var sp, barrierOffset, nextBarrier uintptr
 	if gp.syscallsp != 0 {
 		sp = gp.syscallsp
 	} else {
 		sp = gp.sched.sp
 	}
+	gcLockStackBarriers(gp) // Not necessary during mark term, but harmless.
 	switch gcphase {
-	case _GCscan:
+	case _GCmark:
 		// Install stack barriers during stack scan.
 		barrierOffset = uintptr(firstStackBarrierOffset)
 		nextBarrier = sp + barrierOffset
@@ -348,14 +710,18 @@
 			nextBarrier = ^uintptr(0)
 		}
 
-		if gp.stkbarPos != 0 || len(gp.stkbar) != 0 {
-			// If this happens, it's probably because we
-			// scanned a stack twice in the same phase.
-			print("stkbarPos=", gp.stkbarPos, " len(stkbar)=", len(gp.stkbar), " goid=", gp.goid, " gcphase=", gcphase, "\n")
-			throw("g already has stack barriers")
-		}
+		// Remove any existing stack barriers before we
+		// install new ones.
+		gcRemoveStackBarriers(gp)
 
 	case _GCmarktermination:
+		if !work.markrootDone {
+			// This is a STW GC. There may be stale stack
+			// barriers from an earlier cycle since we
+			// never passed through mark phase.
+			gcRemoveStackBarriers(gp)
+		}
+
 		if int(gp.stkbarPos) == len(gp.stkbar) {
 			// gp hit all of the stack barriers (or there
 			// were none). Re-scan the whole stack.
@@ -363,7 +729,7 @@
 		} else {
 			// Only re-scan up to the lowest un-hit
 			// barrier. Any frames above this have not
-			// executed since the _GCscan scan of gp and
+			// executed since the concurrent scan of gp and
 			// any writes through up-pointers to above
 			// this barrier had write barriers.
 			nextBarrier = gp.stkbar[gp.stkbarPos].savedLRPtr
@@ -372,22 +738,21 @@
 			}
 		}
 
-		gcRemoveStackBarriers(gp)
-
 	default:
 		throw("scanstack in wrong phase")
 	}
 
-	gcw := &getg().m.p.ptr().gcw
+	// Scan the stack.
+	var cache pcvalueCache
 	n := 0
 	scanframe := func(frame *stkframe, unused unsafe.Pointer) bool {
-		scanframeworker(frame, unused, gcw)
+		scanframeworker(frame, &cache, gcw)
 
 		if frame.fp > nextBarrier {
 			// We skip installing a barrier on bottom-most
 			// frame because on LR machines this LR is not
 			// on the stack.
-			if gcphase == _GCscan && n != 0 {
+			if gcphase == _GCmark && n != 0 {
 				if gcInstallStackBarrier(gp, frame) {
 					barrierOffset *= 2
 					nextBarrier = sp + barrierOffset
@@ -406,15 +771,21 @@
 	}
 	gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, scanframe, nil, 0)
 	tracebackdefers(gp, scanframe, nil)
-	if gcphase == _GCmarktermination {
-		gcw.dispose()
+	gcUnlockStackBarriers(gp)
+	if gcphase == _GCmark {
+		// gp may have added itself to the rescan list between
+		// when GC started and now. It's clean now, so remove
+		// it. This isn't safe during mark termination because
+		// mark termination is consuming this list, but it's
+		// also not necessary.
+		dequeueRescan(gp)
 	}
 	gp.gcscanvalid = true
 }
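The barrierOffset *= 2 step inside scanframe spaces stack barriers exponentially, so a stack of S bytes needs only O(log S) of them. A sketch of the resulting placement (the first offset is illustrative, standing in for firstStackBarrierOffset):

package main

import "fmt"

func main() {
	const stackBytes = 1 << 20
	const firstOffset = 1024 // stands in for firstStackBarrierOffset
	n := 0
	for off := firstOffset; off < stackBytes; off *= 2 {
		fmt.Printf("barrier %d at sp+%d\n", n, off)
		n++
	}
	fmt.Println(n, "barriers for a 1MB stack") // 10
}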
 
 // Scan a stack frame: local variables and function arguments/results.
 //go:nowritebarrier
-func scanframeworker(frame *stkframe, unused unsafe.Pointer, gcw *gcWork) {
+func scanframeworker(frame *stkframe, cache *pcvalueCache, gcw *gcWork) {
 
 	f := frame.fn
 	targetpc := frame.continpc
@@ -428,10 +799,10 @@
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, cache)
 	if pcdata == -1 {
 		// We do not have a valid pcdata value but there might be a
-		// stackmap for this function.  It is likely that we are looking
+		// stackmap for this function. It is likely that we are looking
 		// at the function prologue, assume so and hope for the best.
 		pcdata = 0
 	}
@@ -439,13 +810,11 @@
 	// Scan local variables if stack frame has been allocated.
 	size := frame.varp - frame.sp
 	var minsize uintptr
-	switch thechar {
-	case '6', '8':
-		minsize = 0
-	case '7':
-		minsize = spAlign
+	switch sys.ArchFamily {
+	case sys.ARM64:
+		minsize = sys.SpAlign
 	default:
-		minsize = ptrSize
+		minsize = sys.MinFrameSize
 	}
 	if size > minsize {
 		stkmap := (*stackmap)(funcdata(f, _FUNCDATA_LocalsPointerMaps))
@@ -461,7 +830,7 @@
 			throw("scanframe: bad symbol table")
 		}
 		bv := stackmapdata(stkmap, pcdata)
-		size = uintptr(bv.n) * ptrSize
+		size = uintptr(bv.n) * sys.PtrSize
 		scanblock(frame.varp-size, size, bv.bytedata, gcw)
 	}
 
@@ -483,259 +852,137 @@
 			}
 			bv = stackmapdata(stkmap, pcdata)
 		}
-		scanblock(frame.argp, uintptr(bv.n)*ptrSize, bv.bytedata, gcw)
+		scanblock(frame.argp, uintptr(bv.n)*sys.PtrSize, bv.bytedata, gcw)
 	}
 }
 
-// gcMaxStackBarriers returns the maximum number of stack barriers
-// that can be installed in a stack of stackSize bytes.
-func gcMaxStackBarriers(stackSize int) (n int) {
-	if firstStackBarrierOffset == 0 {
-		// Special debugging case for inserting stack barriers
-		// at every frame. Steal half of the stack for the
-		// []stkbar. Technically, if the stack were to consist
-		// solely of return PCs we would need two thirds of
-		// the stack, but stealing that much breaks things and
-		// this doesn't happen in practice.
-		return stackSize / 2 / int(unsafe.Sizeof(stkbar{}))
+// queueRescan adds gp to the stack rescan list and clears
+// gp.gcscanvalid. The caller must own gp and ensure that gp isn't
+// already on the rescan list.
+func queueRescan(gp *g) {
+	if gcphase == _GCoff {
+		gp.gcscanvalid = false
+		return
+	}
+	if gp.gcRescan != -1 {
+		throw("g already on rescan list")
 	}
 
-	offset := firstStackBarrierOffset
-	for offset < stackSize {
-		n++
-		offset *= 2
+	lock(&work.rescan.lock)
+	gp.gcscanvalid = false
+
+	// Recheck gcphase under the lock in case there was a phase change.
+	if gcphase == _GCoff {
+		unlock(&work.rescan.lock)
+		return
 	}
-	return n + 1
+	if len(work.rescan.list) == cap(work.rescan.list) {
+		throw("rescan list overflow")
+	}
+	n := len(work.rescan.list)
+	gp.gcRescan = int32(n)
+	work.rescan.list = work.rescan.list[:n+1]
+	work.rescan.list[n].set(gp)
+	unlock(&work.rescan.lock)
 }
 
-// gcInstallStackBarrier installs a stack barrier over the return PC of frame.
-//go:nowritebarrier
-func gcInstallStackBarrier(gp *g, frame *stkframe) bool {
-	if frame.lr == 0 {
-		if debugStackBarrier {
-			print("not installing stack barrier with no LR, goid=", gp.goid, "\n")
-		}
-		return false
+// dequeueRescan removes gp from the stack rescan list, if gp is on
+// the rescan list. The caller must own gp.
+func dequeueRescan(gp *g) {
+	if gp.gcRescan == -1 {
+		return
+	}
+	if gcphase == _GCoff {
+		gp.gcRescan = -1
+		return
 	}
 
-	if frame.fn.entry == cgocallback_gofuncPC {
-		// cgocallback_gofunc doesn't return to its LR;
-		// instead, its return path puts LR in g.sched.pc and
-		// switches back to the system stack on which
-		// cgocallback_gofunc was originally called. We can't
-		// have a stack barrier in g.sched.pc, so don't
-		// install one in this frame.
-		if debugStackBarrier {
-			print("not installing stack barrier over LR of cgocallback_gofunc, goid=", gp.goid, "\n")
-		}
-		return false
+	lock(&work.rescan.lock)
+	if work.rescan.list[gp.gcRescan].ptr() != gp {
+		throw("bad dequeueRescan")
 	}
-
-	// Save the return PC and overwrite it with stackBarrier.
-	var lrUintptr uintptr
-	if usesLR {
-		lrUintptr = frame.sp
-	} else {
-		lrUintptr = frame.fp - regSize
-	}
-	lrPtr := (*uintreg)(unsafe.Pointer(lrUintptr))
-	if debugStackBarrier {
-		print("install stack barrier at ", hex(lrUintptr), " over ", hex(*lrPtr), ", goid=", gp.goid, "\n")
-		if uintptr(*lrPtr) != frame.lr {
-			print("frame.lr=", hex(frame.lr))
-			throw("frame.lr differs from stack LR")
-		}
-	}
-
-	gp.stkbar = gp.stkbar[:len(gp.stkbar)+1]
-	stkbar := &gp.stkbar[len(gp.stkbar)-1]
-	stkbar.savedLRPtr = lrUintptr
-	stkbar.savedLRVal = uintptr(*lrPtr)
-	*lrPtr = uintreg(stackBarrierPC)
-	return true
+	// Careful: gp may itself be the last G on the list.
+	last := work.rescan.list[len(work.rescan.list)-1]
+	work.rescan.list[gp.gcRescan] = last
+	last.ptr().gcRescan = gp.gcRescan
+	gp.gcRescan = -1
+	work.rescan.list = work.rescan.list[:len(work.rescan.list)-1]
+	unlock(&work.rescan.lock)
 }
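dequeueRescan above uses the classic unordered-slice removal: overwrite the vacated slot with the last element, fix that element's stored index, and shrink the slice, all in O(1). A generic sketch (the runtime does this with guintptrs under work.rescan.lock):

package sketch

// node stores its own index into the list so removal needs no search,
// like g.gcRescan above.
type node struct{ idx int }

// removeUnordered deletes n from list in O(1) without preserving order.
func removeUnordered(list []*node, n *node) []*node {
	last := list[len(list)-1]
	list[n.idx] = last // works even when n is itself the last element
	last.idx = n.idx
	n.idx = -1
	return list[:len(list)-1]
}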
 
-// gcRemoveStackBarriers removes all stack barriers installed in gp's stack.
-//go:nowritebarrier
-func gcRemoveStackBarriers(gp *g) {
-	if debugStackBarrier && gp.stkbarPos != 0 {
-		print("hit ", gp.stkbarPos, " stack barriers, goid=", gp.goid, "\n")
-	}
+type gcDrainFlags int
 
-	// Remove stack barriers that we didn't hit.
-	for _, stkbar := range gp.stkbar[gp.stkbarPos:] {
-		gcRemoveStackBarrier(gp, stkbar)
-	}
+const (
+	gcDrainUntilPreempt gcDrainFlags = 1 << iota
+	gcDrainNoBlock
+	gcDrainFlushBgCredit
 
-	// Clear recorded stack barriers so copystack doesn't try to
-	// adjust them.
-	gp.stkbarPos = 0
-	gp.stkbar = gp.stkbar[:0]
-}
+	// gcDrainBlock means neither gcDrainUntilPreempt nor
+	// gcDrainNoBlock. It is the default, but callers should use
+	// the constant for documentation purposes.
+	gcDrainBlock gcDrainFlags = 0
+)
 
-// gcRemoveStackBarrier removes a single stack barrier. It is the
-// inverse operation of gcInstallStackBarrier.
+// gcDrain scans roots and objects in work buffers, blackening grey
+// objects until all roots and work buffers have been drained.
 //
-// This is nosplit to ensure gp's stack does not move.
+// If flags&gcDrainUntilPreempt != 0, gcDrain returns when g.preempt
+// is set. This implies gcDrainNoBlock.
+//
+// If flags&gcDrainNoBlock != 0, gcDrain returns as soon as it is
+// unable to get more work. Otherwise, it will block until all
+// blocking calls are blocked in gcDrain.
+//
+// If flags&gcDrainFlushBgCredit != 0, gcDrain flushes scan work
+// credit to gcController.bgScanCredit every gcCreditSlack units of
+// scan work.
 //
 //go:nowritebarrier
-//go:nosplit
-func gcRemoveStackBarrier(gp *g, stkbar stkbar) {
-	if debugStackBarrier {
-		print("remove stack barrier at ", hex(stkbar.savedLRPtr), " with ", hex(stkbar.savedLRVal), ", goid=", gp.goid, "\n")
-	}
-	lrPtr := (*uintreg)(unsafe.Pointer(stkbar.savedLRPtr))
-	if val := *lrPtr; val != uintreg(stackBarrierPC) {
-		printlock()
-		print("at *", hex(stkbar.savedLRPtr), " expected stack barrier PC ", hex(stackBarrierPC), ", found ", hex(val), ", goid=", gp.goid, "\n")
-		print("gp.stkbar=")
-		gcPrintStkbars(gp.stkbar)
-		print(", gp.stkbarPos=", gp.stkbarPos, ", gp.stack=[", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n")
-		throw("stack barrier lost")
-	}
-	*lrPtr = uintreg(stkbar.savedLRVal)
-}
-
-// gcPrintStkbars prints a []stkbar for debugging.
-func gcPrintStkbars(stkbar []stkbar) {
-	print("[")
-	for i, s := range stkbar {
-		if i > 0 {
-			print(" ")
-		}
-		print("*", hex(s.savedLRPtr), "=", hex(s.savedLRVal))
-	}
-	print("]")
-}
-
-// gcUnwindBarriers marks all stack barriers up the frame containing
-// sp as hit and removes them. This is used during stack unwinding for
-// panic/recover and by heapBitsBulkBarrier to force stack re-scanning
-// when its destination is on the stack.
-//
-// This is nosplit to ensure gp's stack does not move.
-//
-//go:nosplit
-func gcUnwindBarriers(gp *g, sp uintptr) {
-	// On LR machines, if there is a stack barrier on the return
-	// from the frame containing sp, this will mark it as hit even
-	// though it isn't, but it's okay to be conservative.
-	before := gp.stkbarPos
-	for int(gp.stkbarPos) < len(gp.stkbar) && gp.stkbar[gp.stkbarPos].savedLRPtr < sp {
-		gcRemoveStackBarrier(gp, gp.stkbar[gp.stkbarPos])
-		gp.stkbarPos++
-	}
-	if debugStackBarrier && gp.stkbarPos != before {
-		print("skip barriers below ", hex(sp), " in goid=", gp.goid, ": ")
-		gcPrintStkbars(gp.stkbar[before:gp.stkbarPos])
-		print("\n")
-	}
-}
-
-// nextBarrierPC returns the original return PC of the next stack barrier.
-// Used by getcallerpc, so it must be nosplit.
-//go:nosplit
-func nextBarrierPC() uintptr {
-	gp := getg()
-	return gp.stkbar[gp.stkbarPos].savedLRVal
-}
-
-// setNextBarrierPC sets the return PC of the next stack barrier.
-// Used by setcallerpc, so it must be nosplit.
-//go:nosplit
-func setNextBarrierPC(pc uintptr) {
-	gp := getg()
-	gp.stkbar[gp.stkbarPos].savedLRVal = pc
-}
-
-// TODO(austin): Can we consolidate the gcDrain* functions?
-
-// gcDrain scans objects in work buffers, blackening grey
-// objects until all work buffers have been drained.
-// If flushScanCredit != -1, gcDrain flushes accumulated scan work
-// credit to gcController.bgScanCredit whenever gcw's local scan work
-// credit exceeds flushScanCredit.
-//go:nowritebarrier
-func gcDrain(gcw *gcWork, flushScanCredit int64) {
-	if !writeBarrierEnabled {
+func gcDrain(gcw *gcWork, flags gcDrainFlags) {
+	if !writeBarrier.needed {
 		throw("gcDrain phase incorrect")
 	}
 
-	var lastScanFlush, nextScanFlush int64
-	if flushScanCredit != -1 {
-		lastScanFlush = gcw.scanWork
-		nextScanFlush = lastScanFlush + flushScanCredit
-	} else {
-		nextScanFlush = int64(^uint64(0) >> 1)
-	}
-
-	for {
-		// If another proc wants a pointer, give it some.
-		if work.nwait > 0 && work.full == 0 {
-			gcw.balance()
-		}
-
-		b := gcw.get()
-		if b == 0 {
-			// work barrier reached
-			break
-		}
-		// If the current wbuf is filled by the scan a new wbuf might be
-		// returned that could possibly hold only a single object. This
-		// could result in each iteration draining only a single object
-		// out of the wbuf passed in + a single object placed
-		// into an empty wbuf in scanobject so there could be
-		// a performance hit as we keep fetching fresh wbufs.
-		scanobject(b, gcw)
-
-		// Flush background scan work credit to the global
-		// account if we've accumulated enough locally so
-		// mutator assists can draw on it.
-		if gcw.scanWork >= nextScanFlush {
-			credit := gcw.scanWork - lastScanFlush
-			xaddint64(&gcController.bgScanCredit, credit)
-			lastScanFlush = gcw.scanWork
-			nextScanFlush = lastScanFlush + flushScanCredit
-		}
-	}
-	if flushScanCredit != -1 {
-		credit := gcw.scanWork - lastScanFlush
-		xaddint64(&gcController.bgScanCredit, credit)
-	}
-}
-
-// gcDrainUntilPreempt blackens grey objects until g.preempt is set.
-// This is best-effort, so it will return as soon as it is unable to
-// get work, even though there may be more work in the system.
-//go:nowritebarrier
-func gcDrainUntilPreempt(gcw *gcWork, flushScanCredit int64) {
-	if !writeBarrierEnabled {
-		println("gcphase =", gcphase)
-		throw("gcDrainUntilPreempt phase incorrect")
-	}
-
-	var lastScanFlush, nextScanFlush int64
-	if flushScanCredit != -1 {
-		lastScanFlush = gcw.scanWork
-		nextScanFlush = lastScanFlush + flushScanCredit
-	} else {
-		nextScanFlush = int64(^uint64(0) >> 1)
-	}
-
 	gp := getg()
-	for !gp.preempt {
-		// If the work queue is empty, balance. During
-		// concurrent mark we don't really know if anyone else
-		// can make use of this work, but even if we're the
-		// only worker, the total cost of this per cycle is
-		// only O(_WorkbufSize) pointer copies.
-		if work.full == 0 && work.partial == 0 {
+	preemptible := flags&gcDrainUntilPreempt != 0
+	blocking := flags&(gcDrainUntilPreempt|gcDrainNoBlock) == 0
+	flushBgCredit := flags&gcDrainFlushBgCredit != 0
+
+	// Drain root marking jobs.
+	if work.markrootNext < work.markrootJobs {
+		for blocking || !gp.preempt {
+			job := atomic.Xadd(&work.markrootNext, +1) - 1
+			if job >= work.markrootJobs {
+				break
+			}
+			markroot(gcw, job)
+		}
+	}
+
+	initScanWork := gcw.scanWork
+
+	// Drain heap marking jobs.
+	for !(preemptible && gp.preempt) {
+		// Try to keep work available on the global queue. We used to
+		// check if there were waiting workers, but it's better to
+		// just keep work available than to make workers wait. In the
+		// worst case, we'll do O(log(_WorkbufSize)) unnecessary
+		// balances.
+		if work.full == 0 {
 			gcw.balance()
 		}
 
-		b := gcw.tryGet()
+		var b uintptr
+		if blocking {
+			b = gcw.get()
+		} else {
+			b = gcw.tryGetFast()
+			if b == 0 {
+				b = gcw.tryGet()
+			}
+		}
 		if b == 0 {
-			// No more work
+			// work barrier reached or tryGet failed.
 			break
 		}
 		scanobject(b, gcw)
@@ -743,41 +990,81 @@
 		// Flush background scan work credit to the global
 		// account if we've accumulated enough locally so
 		// mutator assists can draw on it.
-		if gcw.scanWork >= nextScanFlush {
-			credit := gcw.scanWork - lastScanFlush
-			xaddint64(&gcController.bgScanCredit, credit)
-			lastScanFlush = gcw.scanWork
-			nextScanFlush = lastScanFlush + flushScanCredit
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			if flushBgCredit {
+				gcFlushBgCredit(gcw.scanWork - initScanWork)
+				initScanWork = 0
+			}
+			gcw.scanWork = 0
 		}
 	}
-	if flushScanCredit != -1 {
-		credit := gcw.scanWork - lastScanFlush
-		xaddint64(&gcController.bgScanCredit, credit)
+
+	// In blocking mode, write barriers are not allowed after this
+	// point because we must preserve the condition that the work
+	// buffers are empty.
+
+	// Flush remaining scan work credit.
+	if gcw.scanWork > 0 {
+		atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+		if flushBgCredit {
+			gcFlushBgCredit(gcw.scanWork - initScanWork)
+		}
+		gcw.scanWork = 0
 	}
 }
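The root-drain loop in gcDrain hands out jobs lock-free: each worker atomically increments work.markrootNext and keeps the pre-increment value as its job, stopping once the counter passes markrootJobs. A standalone sketch using sync/atomic (the runtime uses runtime/internal/atomic):

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var next uint32
	const jobs = 5
	for {
		job := atomic.AddUint32(&next, 1) - 1 // claim one job index
		if job >= jobs {
			break // all jobs handed out; late claimers see this too
		}
		fmt.Println("scan root job", job)
	}
}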
 
 // gcDrainN blackens grey objects until it has performed roughly
-// scanWork units of scan work. This is best-effort, so it may perform
-// less work if it fails to get a work buffer. Otherwise, it will
-// perform at least n units of work, but may perform more because
-// scanning is always done in whole object increments.
+// scanWork units of scan work or the G is preempted. This is
+// best-effort, so it may perform less work if it fails to get a work
+// buffer. Otherwise, it will perform at least n units of work, but
+// may perform more because scanning is always done in whole object
+// increments. It returns the amount of scan work performed.
 //go:nowritebarrier
-func gcDrainN(gcw *gcWork, scanWork int64) {
-	if !writeBarrierEnabled {
+func gcDrainN(gcw *gcWork, scanWork int64) int64 {
+	if !writeBarrier.needed {
 		throw("gcDrainN phase incorrect")
 	}
-	targetScanWork := gcw.scanWork + scanWork
-	for gcw.scanWork < targetScanWork {
+
+	// There may already be scan work on the gcw, which we don't
+	// want to claim was done by this call.
+	workFlushed := -gcw.scanWork
+
+	gp := getg().m.curg
+	for !gp.preempt && workFlushed+gcw.scanWork < scanWork {
+		// See gcDrain comment.
+		if work.full == 0 {
+			gcw.balance()
+		}
+
 		// This might be a good place to add prefetch code...
 		// if(wbuf.nobj > 4) {
 		//         PREFETCH(wbuf->obj[wbuf.nobj - 3];
 		//  }
-		b := gcw.tryGet()
+		//
+		b := gcw.tryGetFast()
 		if b == 0 {
-			return
+			b = gcw.tryGet()
+		}
+
+		if b == 0 {
+			break
 		}
 		scanobject(b, gcw)
+
+		// Flush background scan work credit.
+		if gcw.scanWork >= gcCreditSlack {
+			atomic.Xaddint64(&gcController.scanWork, gcw.scanWork)
+			workFlushed += gcw.scanWork
+			gcw.scanWork = 0
+		}
 	}
+
+	// Unlike gcDrain, there's no need to flush remaining work
+	// here because this never flushes to bgScanCredit and
+	// gcw.dispose will flush any remaining work to scanWork.
+
+	return workFlushed + gcw.scanWork
 }
 
 // scanblock scans b as scanobject would, but using an explicit
@@ -799,9 +1086,9 @@
 
 	for i := uintptr(0); i < n; {
 		// Find bits for the next word.
-		bits := uint32(*addb(ptrmask, i/(ptrSize*8)))
+		bits := uint32(*addb(ptrmask, i/(sys.PtrSize*8)))
 		if bits == 0 {
-			i += ptrSize * 8
+			i += sys.PtrSize * 8
 			continue
 		}
 		for j := 0; j < 8 && i < n; j++ {
@@ -809,13 +1096,13 @@
 				// Same work as in scanobject; see comments there.
 				obj := *(*uintptr)(unsafe.Pointer(b + i))
 				if obj != 0 && arena_start <= obj && obj < arena_used {
-					if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
-						greyobject(obj, b, i, hbits, span, gcw)
+					if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 {
+						greyobject(obj, b, i, hbits, span, gcw, objIndex)
 					}
 				}
 			}
 			bits >>= 1
-			i += ptrSize
+			i += sys.PtrSize
 		}
 	}
 }
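scanblock's mask walk above packs one bit per word: bit j of mask byte k describes word 8*k+j, and an all-zero byte skips eight words at once. A sketch of the same walk over a plain byte slice, with a hypothetical visit callback standing in for the heap-object greying:

package sketch

const ptrSize = 8 // assumed 64-bit target

// walkPtrMask calls visit(off) for each word offset in [0, n) whose
// mask bit is set, skipping zero mask bytes wholesale as above.
func walkPtrMask(n uintptr, ptrmask []byte, visit func(off uintptr)) {
	for i := uintptr(0); i < n; {
		bits := ptrmask[i/(ptrSize*8)]
		if bits == 0 {
			i += ptrSize * 8 // no pointers in these 8 words
			continue
		}
		for j := 0; j < 8 && i < n; j++ {
			if bits&1 != 0 {
				visit(i) // this word may hold a pointer
			}
			bits >>= 1
			i += ptrSize
		}
	}
}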
@@ -823,7 +1110,7 @@
 // scanobject scans the object starting at b, adding pointers to gcw.
 // b must point to the beginning of a heap object; scanobject consults
 // the GC bitmap for the pointer mask and the spans for the size of the
-// object (it ignores n).
+// object.
 //go:nowritebarrier
 func scanobject(b uintptr, gcw *gcWork) {
 	// Note that arena_used may change concurrently during
@@ -849,7 +1136,7 @@
 	}
 
 	var i uintptr
-	for i = 0; i < n; i += ptrSize {
+	for i = 0; i < n; i += sys.PtrSize {
 		// Find bits for this word.
 		if i != 0 {
 			// Avoid needless hbits.next() on last iteration.
@@ -859,11 +1146,10 @@
 		// in the type bit for the one word. The only one-word objects
 		// are pointers, or else they'd be merged with other non-pointer
 		// data into larger allocations.
-		bits := hbits.bits()
-		if i >= 2*ptrSize && bits&bitMarked == 0 {
+		if i != 1*sys.PtrSize && !hbits.morePointers() {
 			break // no more pointers in this object
 		}
-		if bits&bitPointer == 0 {
+		if !hbits.isPointer() {
 			continue // not a pointer
 		}
 
@@ -875,8 +1161,8 @@
 		// Check if it points into heap and not back at the current object.
 		if obj != 0 && arena_start <= obj && obj < arena_used && obj-b >= n {
 			// Mark the object.
-			if obj, hbits, span := heapBitsForObject(obj); obj != 0 {
-				greyobject(obj, b, i, hbits, span, gcw)
+			if obj, hbits, span, objIndex := heapBitsForObject(obj, b, i); obj != 0 {
+				greyobject(obj, b, i, hbits, span, gcw, objIndex)
 			}
 		}
 	}
@@ -889,9 +1175,9 @@
 // Preemption must be disabled.
 //go:nowritebarrier
 func shade(b uintptr) {
-	if obj, hbits, span := heapBitsForObject(b); obj != 0 {
+	if obj, hbits, span, objIndex := heapBitsForObject(b, 0, 0); obj != 0 {
 		gcw := &getg().m.p.ptr().gcw
-		greyobject(obj, 0, 0, hbits, span, gcw)
+		greyobject(obj, 0, 0, hbits, span, gcw, objIndex)
 		if gcphase == _GCmarktermination || gcBlackenPromptly {
 			// Ps aren't allowed to cache work during mark
 			// termination.
@@ -903,15 +1189,16 @@
 // obj is the start of an object with mark mbits.
 // If it isn't already marked, mark it and enqueue into gcw.
 // base and off are for debugging only and could be removed.
-//go:nowritebarrier
-func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork) {
+//go:nowritebarrierrec
+func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork, objIndex uintptr) {
 	// obj should be start of allocation, and so must be at least pointer-aligned.
-	if obj&(ptrSize-1) != 0 {
+	if obj&(sys.PtrSize-1) != 0 {
 		throw("greyobject: obj not pointer-aligned")
 	}
+	mbits := span.markBitsForIndex(objIndex)
 
 	if useCheckmark {
-		if !hbits.isMarked() {
+		if !mbits.isMarked() {
 			printlock()
 			print("runtime:greyobject: checkmarks finds unexpected unmarked object obj=", hex(obj), "\n")
 			print("runtime: found obj at *(", hex(base), "+", hex(off), ")\n")
@@ -933,11 +1220,11 @@
 		}
 	} else {
 		// If marked we have nothing to do.
-		if hbits.isMarked() {
+		if mbits.isMarked() {
 			return
 		}
-		hbits.setMarked()
-
+		// mbits.setMarked() // Avoid extra call overhead with manual inlining.
+		atomic.Or8(mbits.bytep, mbits.mask)
 		// If this is a noscan object, fast-track it to black
 		// instead of greying it.
 		if !hbits.hasPointers(span.elemsize) {
@@ -952,15 +1239,16 @@
 	// Previously we put the obj in an 8 element buffer that is drained at a rate
 	// to give the PREFETCH time to do its work.
 	// Use of PREFETCHNTA might be more appropriate than PREFETCH
-
-	gcw.put(obj)
+	if !gcw.putFast(obj) {
+		gcw.put(obj)
+	}
 }
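greyobject's manually inlined atomic.Or8 sets a single mark bit without locking; racing markers may both try, and the atomic OR makes that harmless. A standalone sketch of the same idea over uint32 words, using a CAS loop since sync/atomic has no 8-bit OR:

package main

import (
	"fmt"
	"sync/atomic"
)

// setMarked sets bit objIndex in a packed bitmap and reports whether
// this call was the one that set it, standing in for the runtime's
// atomic.Or8 on mbits.bytep/mbits.mask.
func setMarked(bits []uint32, objIndex uint32) bool {
	word, mask := objIndex/32, uint32(1)<<(objIndex%32)
	for {
		old := atomic.LoadUint32(&bits[word])
		if old&mask != 0 {
			return false // already marked; nothing to do
		}
		if atomic.CompareAndSwapUint32(&bits[word], old, old|mask) {
			return true // we greyed this object
		}
	}
}

func main() {
	bits := make([]uint32, 2)        // mark bits for 64 object slots
	fmt.Println(setMarked(bits, 37)) // true: first marker wins
	fmt.Println(setMarked(bits, 37)) // false: already marked
}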
 
 // gcDumpObject dumps the contents of obj for debugging and marks the
 // field at byte offset off in obj.
 func gcDumpObject(label string, obj, off uintptr) {
 	if obj < mheap_.arena_start || obj >= mheap_.arena_used {
-		print(label, "=", hex(obj), " is not a heap object\n")
+		print(label, "=", hex(obj), " is not in the Go heap\n")
 		return
 	}
 	k := obj >> _PageShift
@@ -972,24 +1260,46 @@
 		print(" s=nil\n")
 		return
 	}
-	print(" s.start*_PageSize=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
-	for i := uintptr(0); i < s.elemsize; i += ptrSize {
-		print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + uintptr(i)))))
+	print(" s.base()=", hex(s.base()), " s.limit=", hex(s.limit), " s.sizeclass=", s.sizeclass, " s.elemsize=", s.elemsize, "\n")
+	skipped := false
+	for i := uintptr(0); i < s.elemsize; i += sys.PtrSize {
+		// For big objects, just print the beginning (because
+		// that usually hints at the object's type) and the
+		// fields around off.
+		if !(i < 128*sys.PtrSize || off-16*sys.PtrSize < i && i < off+16*sys.PtrSize) {
+			skipped = true
+			continue
+		}
+		if skipped {
+			print(" ...\n")
+			skipped = false
+		}
+		print(" *(", label, "+", i, ") = ", hex(*(*uintptr)(unsafe.Pointer(obj + i))))
 		if i == off {
 			print(" <==")
 		}
 		print("\n")
 	}
+	if skipped {
+		print(" ...\n")
+	}
 }
 
-// If gcBlackenPromptly is true we are in the second mark phase phase so we allocate black.
+// gcmarknewobject marks a newly allocated object black. obj must
+// not contain any non-nil pointers.
+//
+// This is nosplit so it can manipulate a gcWork without preemption.
+//
 //go:nowritebarrier
-func gcmarknewobject_m(obj, size uintptr) {
+//go:nosplit
+func gcmarknewobject(obj, size, scanSize uintptr) {
 	if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
 		throw("gcmarknewobject called while doing checkmark")
 	}
-	heapBitsForAddr(obj).setMarked()
-	xadd64(&work.bytesMarked, int64(size))
+	markBitsForAddr(obj).setMarked()
+	gcw := &getg().m.p.ptr().gcw
+	gcw.bytesMarked += uint64(size)
+	gcw.scanWork += int64(scanSize)
 }
 
 // Checkmarking
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index eaa4463..947c38e 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 var sweep sweepdata
 
@@ -24,22 +27,31 @@
 }
 
 //go:nowritebarrier
-func finishsweep_m() {
-	// The world is stopped so we should be able to complete the sweeps
-	// quickly.
+func finishsweep_m(stw bool) {
+	// Sweeping must be complete before marking commences, so
+	// sweep any unswept spans. If this is a concurrent GC, there
+	// shouldn't be any spans left to sweep, so this should finish
+	// instantly. If GC was forced before the concurrent sweep
+	// finished, there may be spans to sweep.
 	for sweepone() != ^uintptr(0) {
 		sweep.npausesweep++
 	}
 
 	// There may be some other spans being swept concurrently that
 	// we need to wait for. If finishsweep_m is done with the world stopped
-	// this code is not required.
-	sg := mheap_.sweepgen
-	for _, s := range work.spans {
-		if s.sweepgen != sg && s.state == _MSpanInUse {
-			mSpan_EnsureSwept(s)
+	// this is not required because the STW must have waited for sweeps.
+	//
+	// TODO(austin): As of this writing, we always pass true for stw.
+	// Consider removing this code.
+	if !stw {
+		sg := mheap_.sweepgen
+		for _, s := range work.spans {
+			if s.sweepgen != sg && s.state == _MSpanInUse {
+				s.ensureSwept()
+			}
 		}
 	}
+	nextMarkBitArenaEpoch()
 }
 
 func bgsweep(c chan int) {
@@ -79,10 +91,13 @@
 	_g_.m.locks++
 	sg := mheap_.sweepgen
 	for {
-		idx := xadd(&sweep.spanidx, 1) - 1
+		idx := atomic.Xadd(&sweep.spanidx, 1) - 1
 		if idx >= uint32(len(work.spans)) {
 			mheap_.sweepdone = 1
 			_g_.m.locks--
+			if debug.gcpacertrace > 0 && idx == uint32(len(work.spans)) {
+				print("pacer: sweep done at heap size ", memstats.heap_live>>20, "MB; allocated ", mheap_.spanBytesAlloc>>20, "MB of spans; swept ", mheap_.pagesSwept, " pages at ", mheap_.sweepPagesPerByte, " pages/byte\n")
+			}
 			return ^uintptr(0)
 		}
 		s := work.spans[idx]
@@ -90,11 +105,11 @@
 			s.sweepgen = sg
 			continue
 		}
-		if s.sweepgen != sg-2 || !cas(&s.sweepgen, sg-2, sg-1) {
+		if s.sweepgen != sg-2 || !atomic.Cas(&s.sweepgen, sg-2, sg-1) {
 			continue
 		}
 		npages := s.npages
-		if !mSpan_Sweep(s, false) {
+		if !s.sweep(false) {
 			npages = 0
 		}
 		_g_.m.locks--
@@ -118,7 +133,7 @@
 
 // Returns only when span s has been swept.
 //go:nowritebarrier
-func mSpan_EnsureSwept(s *mspan) {
+func (s *mspan) ensureSwept() {
 	// Caller must disable preemption.
 	// Otherwise when this function returns the span can become unswept again
 	// (if GC is triggered on another goroutine).
@@ -128,16 +143,16 @@
 	}
 
 	sg := mheap_.sweepgen
-	if atomicload(&s.sweepgen) == sg {
+	if atomic.Load(&s.sweepgen) == sg {
 		return
 	}
 	// The caller must be sure that the span is a MSpanInUse span.
-	if cas(&s.sweepgen, sg-2, sg-1) {
-		mSpan_Sweep(s, false)
+	if atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+		s.sweep(false)
 		return
 	}
 	// unfortunate condition, and we don't have efficient means to wait
-	for atomicload(&s.sweepgen) != sg {
+	for atomic.Load(&s.sweepgen) != sg {
 		osyield()
 	}
 }
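ensureSwept's CAS rides the sweepgen protocol used throughout this file: relative to mheap_.sweepgen, a span at sg-2 is unswept, sg-1 is being swept, and sg is swept, so claiming a span means CASing it from sg-2 to sg-1. A standalone sketch of the claim:

package main

import (
	"fmt"
	"sync/atomic"
)

func main() {
	var heapSweepgen uint32 = 6
	spanSweepgen := heapSweepgen - 2 // span not yet swept this cycle

	// Claim the span for sweeping; only one claimer can win the CAS.
	if atomic.CompareAndSwapUint32(&spanSweepgen, heapSweepgen-2, heapSweepgen-1) {
		// ... sweep the span ...
		atomic.StoreUint32(&spanSweepgen, heapSweepgen) // publish: swept
	}
	fmt.Println("span sweepgen:", spanSweepgen) // 6: swept
}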
@@ -148,7 +163,7 @@
 // If preserve=true, don't return it to heap nor relink in MCentral lists;
 // caller takes care of it.
 //TODO go:nowritebarrier
-func mSpan_Sweep(s *mspan, preserve bool) bool {
+func (s *mspan) sweep(preserve bool) bool {
 	// It's critical that we enter this function with preemption disabled,
 	// GC must not start while we are in the middle of this function.
 	_g_ := getg()
@@ -165,47 +180,69 @@
 		traceGCSweepStart()
 	}
 
-	xadd64(&mheap_.pagesSwept, int64(s.npages))
+	atomic.Xadd64(&mheap_.pagesSwept, int64(s.npages))
 
 	cl := s.sizeclass
 	size := s.elemsize
 	res := false
 	nfree := 0
 
-	var head, end gclinkptr
-
 	c := _g_.m.mcache
 	freeToHeap := false
 
-	// Mark any free objects in this span so we don't collect them.
-	sstart := uintptr(s.start << _PageShift)
-	for link := s.freelist; link.ptr() != nil; link = link.ptr().next {
-		if uintptr(link) < sstart || s.limit <= uintptr(link) {
-			// Free list is corrupted.
-			dumpFreeList(s)
-			throw("free list corrupted")
-		}
-		heapBitsForAddr(uintptr(link)).setMarkedNonAtomic()
-	}
+	// The allocBits indicate which unmarked objects don't need to be
+	// processed since they were free at the end of the last GC cycle
+	// and were not allocated since then.
+	// If the allocBits index is >= s.freeindex and the bit
+	// is not marked then the object remains unallocated
+	// since the last GC.
+	// This situation is analogous to being on a freelist.
 
 	// Unlink & free special records for any objects we're about to free.
+	// Two complications here:
+	// 1. An object can have both finalizer and profile special records.
+	//    In that case we need to queue the finalizer for execution,
+	//    mark the object as live, and preserve the profile special.
+	// 2. A tiny object can have several finalizers set up for different offsets.
+	//    If such an object is not marked, we need to queue all of its finalizers at once.
+	// Both 1 and 2 are possible at the same time.
 	specialp := &s.specials
 	special := *specialp
 	for special != nil {
 		// A finalizer can be set for an inner byte of an object, find object beginning.
-		p := uintptr(s.start<<_PageShift) + uintptr(special.offset)/size*size
-		hbits := heapBitsForAddr(p)
-		if !hbits.isMarked() {
-			// Find the exact byte for which the special was setup
-			// (as opposed to object beginning).
-			p := uintptr(s.start<<_PageShift) + uintptr(special.offset)
-			// about to free object: splice out special record
-			y := special
-			special = special.next
-			*specialp = special
-			if !freespecial(y, unsafe.Pointer(p), size, false) {
-				// stop freeing of object if it has a finalizer
-				hbits.setMarkedNonAtomic()
+		objIndex := uintptr(special.offset) / size
+		p := s.base() + objIndex*size
+		mbits := s.markBitsForIndex(objIndex)
+		if !mbits.isMarked() {
+			// This object is not marked and has at least one special record.
+			// Pass 1: see if it has at least one finalizer.
+			hasFin := false
+			endOffset := p - s.base() + size
+			for tmp := special; tmp != nil && uintptr(tmp.offset) < endOffset; tmp = tmp.next {
+				if tmp.kind == _KindSpecialFinalizer {
+					// Stop freeing of object if it has a finalizer.
+					mbits.setMarkedNonAtomic()
+					hasFin = true
+					break
+				}
+			}
+			// Pass 2: queue all finalizers _or_ handle profile record.
+			for special != nil && uintptr(special.offset) < endOffset {
+				// Find the exact byte for which the special was setup
+				// (as opposed to object beginning).
+				p := s.base() + uintptr(special.offset)
+				if special.kind == _KindSpecialFinalizer || !hasFin {
+					// Splice out special record.
+					y := special
+					special = special.next
+					*specialp = special
+					freespecial(y, unsafe.Pointer(p), size)
+				} else {
+					// This is a profile record, but the object has finalizers (so it is kept alive).
+					// Keep special record.
+					specialp = &special.next
+					special = *specialp
+				}
 			}
 		} else {
 			// object is still live: keep special record
@@ -214,71 +251,75 @@
 		}
 	}
 
-	// Sweep through n objects of given size starting at p.
-	// This thread owns the span now, so it can manipulate
-	// the block bitmap without atomic operations.
-
-	size, n, _ := s.layout()
-	heapBitsSweepSpan(s.base(), size, n, func(p uintptr) {
-		// At this point we know that we are looking at garbage object
-		// that needs to be collected.
-		if debug.allocfreetrace != 0 {
-			tracefree(unsafe.Pointer(p), size)
+	if debug.allocfreetrace != 0 || raceenabled || msanenabled {
+		// Find all newly freed objects. This doesn't have to be
+		// efficient; allocfreetrace has massive overhead.
+		mbits := s.markBitsForBase()
+		abits := s.allocBitsForIndex(0)
+		for i := uintptr(0); i < s.nelems; i++ {
+			if !mbits.isMarked() && (abits.index < s.freeindex || abits.isMarked()) {
+				x := s.base() + i*s.elemsize
+				if debug.allocfreetrace != 0 {
+					tracefree(unsafe.Pointer(x), size)
+				}
+				if raceenabled {
+					racefree(unsafe.Pointer(x), size)
+				}
+				if msanenabled {
+					msanfree(unsafe.Pointer(x), size)
+				}
+			}
+			mbits.advance()
+			abits.advance()
 		}
+	}
 
-		// Reset to allocated+noscan.
-		if cl == 0 {
-			// Free large span.
-			if preserve {
-				throw("can't preserve large span")
-			}
-			heapBitsForSpan(p).initSpan(s.layout())
-			s.needzero = 1
+	// Count the number of free objects in this span.
+	nfree = s.countFree()
+	if cl == 0 && nfree != 0 {
+		s.needzero = 1
+		freeToHeap = true
+	}
+	nalloc := uint16(s.nelems) - uint16(nfree)
+	nfreed := s.allocCount - nalloc
+	if nalloc > s.allocCount {
+		print("runtime: nelems=", s.nelems, " nfree=", nfree, " nalloc=", nalloc, " previous allocCount=", s.allocCount, " nfreed=", nfreed, "\n")
+		throw("sweep increased allocation count")
+	}
 
-			// important to set sweepgen before returning it to heap
-			atomicstore(&s.sweepgen, sweepgen)
+	s.allocCount = nalloc
+	wasempty := s.nextFreeIndex() == s.nelems
+	s.freeindex = 0 // reset allocation index to start of span.
 
-			// Free the span after heapBitsSweepSpan
-			// returns, since it's not done with the span.
-			freeToHeap = true
-		} else {
-			// Free small object.
-			if size > 2*ptrSize {
-				*(*uintptr)(unsafe.Pointer(p + ptrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
-			} else if size > ptrSize {
-				*(*uintptr)(unsafe.Pointer(p + ptrSize)) = 0
-			}
-			if head.ptr() == nil {
-				head = gclinkptr(p)
-			} else {
-				end.ptr().next = gclinkptr(p)
-			}
-			end = gclinkptr(p)
-			end.ptr().next = gclinkptr(0x0bade5)
-			nfree++
-		}
-	})
+	// gcmarkBits becomes the allocBits.
+	// get a fresh cleared gcmarkBits in preparation for next GC
+	s.allocBits = s.gcmarkBits
+	s.gcmarkBits = newMarkBits(s.nelems)
+
+	// Initialize alloc bits cache.
+	s.refillAllocCache(0)
 
 	// We need to set s.sweepgen = h.sweepgen only when all blocks are swept,
 	// because of the potential for a concurrent free/SetFinalizer.
 	// But we need to set it before we make the span available for allocation
 	// (return it to heap or mcentral), because allocation code assumes that a
 	// span is already swept if available for allocation.
-	//
-	// TODO(austin): Clean this up by consolidating atomicstore in
-	// large span path above with this.
-	if !freeToHeap && nfree == 0 {
+	if freeToHeap || nfreed == 0 {
 		// The span must be in our exclusive ownership until we update sweepgen,
 		// check for potential races.
 		if s.state != mSpanInUse || s.sweepgen != sweepgen-1 {
 			print("MSpan_Sweep: state=", s.state, " sweepgen=", s.sweepgen, " mheap.sweepgen=", sweepgen, "\n")
 			throw("MSpan_Sweep: bad span state after sweep")
 		}
-		atomicstore(&s.sweepgen, sweepgen)
+		// Serialization point.
+		// At this point the mark bits are cleared and allocation is
+		// ready to go, so release the span.
+		atomic.Store(&s.sweepgen, sweepgen)
 	}
-	if nfree > 0 {
-		c.local_nsmallfree[cl] += uintptr(nfree)
-		res = mCentral_FreeSpan(&mheap_.central[cl].mcentral, s, int32(nfree), head, end, preserve)
+
+	if nfreed > 0 && cl != 0 {
+		c.local_nsmallfree[cl] += uintptr(nfreed)
+		res = mheap_.central[cl].mcentral.freeSpan(s, preserve, wasempty)
 		// MCentral_FreeSpan updates sweepgen
 	} else if freeToHeap {
 		// Free large span to heap
@@ -299,9 +340,9 @@
 		// implement and then call some kind of MHeap_DeleteSpan.
 		if debug.efence > 0 {
 			s.limit = 0 // prevent mlookup from finding this span
-			sysFault(unsafe.Pointer(uintptr(s.start<<_PageShift)), size)
+			sysFault(unsafe.Pointer(s.base()), size)
 		} else {
-			mHeap_Free(&mheap_, s, 1)
+			mheap_.freeSpan(s, 1)
 		}
 		c.local_nlargefree++
 		c.local_largefree += size
@@ -320,6 +361,11 @@
 // also sweep pages (e.g., for a large allocation), it can pass a
 // non-zero callerSweepPages to leave that many pages unswept.
 //
+// deductSweepCredit makes a worst-case assumption that all spanBytes
+// bytes of the ultimately allocated span will be available for object
+// allocation. The caller should call reimburseSweepCredit if that
+// turns out not to be the case once the span is allocated.
+//
 // deductSweepCredit is the core of the "proportional sweep" system.
 // It uses statistics gathered by the garbage collector to perform
 // enough sweeping so that all pages are swept during the concurrent
@@ -333,11 +379,11 @@
 	}
 
 	// Account for this span allocation.
-	spanBytesAlloc := xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
+	spanBytesAlloc := atomic.Xadd64(&mheap_.spanBytesAlloc, int64(spanBytes))
 
 	// Fix debt if necessary.
 	pagesOwed := int64(mheap_.sweepPagesPerByte * float64(spanBytesAlloc))
-	for pagesOwed-int64(atomicload64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
+	for pagesOwed-int64(atomic.Load64(&mheap_.pagesSwept)) > int64(callerSweepPages) {
 		if gosweepone() == ^uintptr(0) {
 			mheap_.sweepPagesPerByte = 0
 			break
@@ -345,26 +391,15 @@
 	}
 }
 
-func dumpFreeList(s *mspan) {
-	printlock()
-	print("runtime: free list of span ", s, ":\n")
-	sstart := uintptr(s.start << _PageShift)
-	link := s.freelist
-	for i := 0; i < int(s.npages*_PageSize/s.elemsize); i++ {
-		if i != 0 {
-			print(" -> ")
-		}
-		print(hex(link))
-		if link.ptr() == nil {
-			break
-		}
-		if uintptr(link) < sstart || s.limit <= uintptr(link) {
-			// Bad link. Stop walking before we crash.
-			print(" (BAD)")
-			break
-		}
-		link = link.ptr().next
+// reimburseSweepCredit records that unusableBytes bytes of a
+// just-allocated span are not available for object allocation. This
+// offsets the worst-case charge performed by deductSweepCredit.
+func reimburseSweepCredit(unusableBytes uintptr) {
+	if mheap_.sweepPagesPerByte == 0 {
+		// Nobody cares about the credit. Avoid the atomic.
+		return
 	}
-	print("\n")
-	printunlock()
+	if int64(atomic.Xadd64(&mheap_.spanBytesAlloc, -int64(unusableBytes))) < 0 {
+		throw("spanBytesAlloc underflow")
+	}
 }
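Taken together, deductSweepCredit and reimburseSweepCredit form a charge-then-refund pattern on one atomic counter: charge the worst case up front, refund what turns out to be unusable. A self-contained sketch under toy names (this spanBytesAlloc is a plain package variable, not the mheap field):

package main

import (
	"fmt"
	"sync/atomic"
)

var spanBytesAlloc int64 // bytes charged against sweep credit this cycle

// deduct charges the worst case up front: the whole span is assumed
// to be usable for object allocation.
func deduct(spanBytes int64) {
	atomic.AddInt64(&spanBytesAlloc, spanBytes)
}

// reimburse refunds the bytes that turned out to be unusable,
// offsetting deduct's worst-case charge.
func reimburse(unusableBytes int64) {
	if atomic.AddInt64(&spanBytesAlloc, -unusableBytes) < 0 {
		panic("spanBytesAlloc underflow")
	}
}

func main() {
	deduct(8192)    // charge a full span
	reimburse(1024) // refund the tail that can't hold objects
	fmt.Println(atomic.LoadInt64(&spanBytesAlloc)) // 7168
}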
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index b18eaaf..d04840b 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -4,21 +4,24 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
-	_Debugwbufs  = false   // if true check wbufs consistency
-	_WorkbufSize = 1 * 256 // in bytes - if small wbufs are passed to GC in a timely fashion.
+	_WorkbufSize = 2048 // in bytes; larger values result in less contention
 )
 
 // Garbage collector work pool abstraction.
 //
 // This implements a producer/consumer model for pointers to grey
-// objects.  A grey object is one that is marked and on a work
-// queue.  A black object is marked and not on a work queue.
+// objects. A grey object is one that is marked and on a work
+// queue. A black object is marked and not on a work queue.
 //
 // Write barriers, root discovery, stack scanning, and object scanning
-// produce pointers to grey objects.  Scanning consumes pointers to
+// produce pointers to grey objects. Scanning consumes pointers to
 // grey objects, thus blackening them, and then scans them,
 // potentially producing new pointers to grey objects.
 
@@ -40,18 +43,10 @@
 //
 // A gcWork can be used on the stack as follows:
 //
-//     var gcw gcWork
-//     disable preemption
-//     .. call gcw.put() to produce and gcw.get() to consume ..
-//     gcw.dispose()
-//     enable preemption
-//
-// Or from the per-P gcWork cache:
-//
 //     (preemption must be disabled)
 //     gcw := &getg().m.p.ptr().gcw
 //     .. call gcw.put() to produce and gcw.get() to consume ..
-//     if gcphase == _GCmarktermination {
+//     if gcBlackenPromptly {
 //         gcw.dispose()
 //     }
 //
@@ -60,37 +55,81 @@
 // gcWork may locally hold GC work buffers. This can be done by
 // disabling preemption (systemstack or acquirem).
 type gcWork struct {
-	// Invariant: wbuf is never full or empty
-	wbuf wbufptr
+	// wbuf1 and wbuf2 are the primary and secondary work buffers.
+	//
+	// This can be thought of as a stack of both work buffers'
+	// pointers concatenated. When we pop the last pointer, we
+	// shift the stack up by one work buffer by bringing in a new
+	// full buffer and discarding an empty one. When we fill both
+	// buffers, we shift the stack down by one work buffer by
+	// bringing in a new empty buffer and discarding a full one.
+	// This way we have one buffer's worth of hysteresis, which
+	// amortizes the cost of getting or putting a work buffer over
+	// at least one buffer of work and reduces contention on the
+	// global work lists.
+	//
+	// wbuf1 is always the buffer we're currently pushing to and
+	// popping from and wbuf2 is the buffer that will be discarded
+	// next.
+	//
+	// Invariant: Both wbuf1 and wbuf2 are nil or neither are.
+	wbuf1, wbuf2 wbufptr
 
 	// Bytes marked (blackened) on this gcWork. This is aggregated
 	// into work.bytesMarked by dispose.
 	bytesMarked uint64
 
 	// Scan work performed on this gcWork. This is aggregated into
-	// gcController by dispose.
+	// gcController by dispose and may also be flushed by callers.
 	scanWork int64
 }
 
+func (w *gcWork) init() {
+	w.wbuf1 = wbufptrOf(getempty())
+	wbuf2 := trygetfull()
+	if wbuf2 == nil {
+		wbuf2 = getempty()
+	}
+	w.wbuf2 = wbufptrOf(wbuf2)
+}
+
 // put enqueues a pointer for the garbage collector to trace.
 // obj must point to the beginning of a heap object.
 //go:nowritebarrier
-func (ww *gcWork) put(obj uintptr) {
-	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
-
-	wbuf := w.wbuf.ptr()
+func (w *gcWork) put(obj uintptr) {
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = getpartialorempty(42)
-		w.wbuf = wbufptrOf(wbuf)
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	} else if wbuf.nobj == len(wbuf.obj) {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == len(wbuf.obj) {
+			putfull(wbuf)
+			wbuf = getempty()
+			w.wbuf1 = wbufptrOf(wbuf)
+		}
 	}
 
 	wbuf.obj[wbuf.nobj] = obj
 	wbuf.nobj++
+}
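The wbuf1/wbuf2 scheme in put generalizes to any producer that wants to amortize trips to a shared pool: swap in the spare first, and spill to the pool only when both buffers are full. A toy sketch of the same swap-then-spill logic (buf, worker, getEmpty, putFull are illustrative, not runtime types; the worker is assumed initialized):

type buf struct {
	n   int
	obj [256]int
}

type worker struct {
	b1, b2 *buf // b1 is pushed to; b2 is the spare
}

// push mirrors put: swap in the spare when b1 fills, and only when
// both buffers are full does one trip to the global pool happen.
func (w *worker) push(x int, getEmpty func() *buf, putFull func(*buf)) {
	if w.b1.n == len(w.b1.obj) {
		w.b1, w.b2 = w.b2, w.b1
		if w.b1.n == len(w.b1.obj) {
			putFull(w.b1) // spill one full buffer
			w.b1 = getEmpty()
		}
	}
	w.b1.obj[w.b1.n] = x
	w.b1.n++
}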
 
-	if wbuf.nobj == len(wbuf.obj) {
-		putfull(wbuf, 50)
-		w.wbuf = 0
+// putFast does a put and returns true if it can be done quickly;
+// otherwise it returns false and the caller needs to call put.
+//go:nowritebarrier
+func (w *gcWork) putFast(obj uintptr) bool {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return false
+	} else if wbuf.nobj == len(wbuf.obj) {
+		return false
 	}
+
+	wbuf.obj[wbuf.nobj] = obj
+	wbuf.nobj++
+	return true
 }
 
 // tryGet dequeues a pointer for the garbage collector to trace.
@@ -99,57 +138,77 @@
 // queue, tryGet returns 0.  Note that there may still be pointers in
 // other gcWork instances or other caches.
 //go:nowritebarrier
-func (ww *gcWork) tryGet() uintptr {
-	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
-
-	wbuf := w.wbuf.ptr()
+func (w *gcWork) tryGet() uintptr {
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = trygetfull(74)
-		if wbuf == nil {
-			return 0
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = trygetfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
 		}
-		w.wbuf = wbufptrOf(wbuf)
 	}
 
 	wbuf.nobj--
-	obj := wbuf.obj[wbuf.nobj]
+	return wbuf.obj[wbuf.nobj]
+}
 
+// tryGetFast dequeues a pointer for the garbage collector to trace
+// if one is readily available. Otherwise it returns 0 and
+// the caller is expected to call tryGet().
+//go:nowritebarrier
+func (w *gcWork) tryGetFast() uintptr {
+	wbuf := w.wbuf1.ptr()
+	if wbuf == nil {
+		return 0
+	}
 	if wbuf.nobj == 0 {
-		putempty(wbuf, 86)
-		w.wbuf = 0
+		return 0
 	}
 
-	return obj
+	wbuf.nobj--
+	return wbuf.obj[wbuf.nobj]
 }
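putFast and tryGetFast exist so the mark loop's hot path never touches the global lists; callers fall back to the full put/tryGet when they return false or 0. A hedged sketch of the expected pairing (drain and scan are illustrative names; the real loop lives in the mark code):

func drain(gcw *gcWork, scan func(uintptr)) {
	for {
		obj := gcw.tryGetFast() // hot path: local buffer only
		if obj == 0 {
			obj = gcw.tryGet() // slow path: may refill from work.full
		}
		if obj == 0 {
			return // no mark work available anywhere
		}
		scan(obj)
	}
}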
 
 // get dequeues a pointer for the garbage collector to trace, blocking
 // if necessary to ensure all pointers from all queues and caches have
 // been retrieved.  get returns 0 if there are no pointers remaining.
 //go:nowritebarrier
-func (ww *gcWork) get() uintptr {
-	w := (*gcWork)(noescape(unsafe.Pointer(ww))) // TODO: remove when escape analysis is fixed
-
-	wbuf := w.wbuf.ptr()
+func (w *gcWork) get() uintptr {
+	wbuf := w.wbuf1.ptr()
 	if wbuf == nil {
-		wbuf = getfull(103)
-		if wbuf == nil {
-			return 0
+		w.init()
+		wbuf = w.wbuf1.ptr()
+		// wbuf is empty at this point.
+	}
+	if wbuf.nobj == 0 {
+		w.wbuf1, w.wbuf2 = w.wbuf2, w.wbuf1
+		wbuf = w.wbuf1.ptr()
+		if wbuf.nobj == 0 {
+			owbuf := wbuf
+			wbuf = getfull()
+			if wbuf == nil {
+				return 0
+			}
+			putempty(owbuf)
+			w.wbuf1 = wbufptrOf(wbuf)
 		}
-		wbuf.checknonempty()
-		w.wbuf = wbufptrOf(wbuf)
 	}
 
 	// TODO: This might be a good place to add prefetch code
 
 	wbuf.nobj--
-	obj := wbuf.obj[wbuf.nobj]
-
-	if wbuf.nobj == 0 {
-		putempty(wbuf, 115)
-		w.wbuf = 0
-	}
-
-	return obj
+	return wbuf.obj[wbuf.nobj]
 }
 
 // dispose returns any cached pointers to the global queue.
@@ -160,23 +219,32 @@
 //
 //go:nowritebarrier
 func (w *gcWork) dispose() {
-	if wbuf := w.wbuf; wbuf != 0 {
-		if wbuf.ptr().nobj == 0 {
-			throw("dispose: workbuf is empty")
+	if wbuf := w.wbuf1.ptr(); wbuf != nil {
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
 		}
-		putfull(wbuf.ptr(), 166)
-		w.wbuf = 0
+		w.wbuf1 = 0
+
+		wbuf = w.wbuf2.ptr()
+		if wbuf.nobj == 0 {
+			putempty(wbuf)
+		} else {
+			putfull(wbuf)
+		}
+		w.wbuf2 = 0
 	}
 	if w.bytesMarked != 0 {
 		// dispose happens relatively infrequently. If this
 		// atomic becomes a problem, we should first try to
 		// dispose less and if necessary aggregate in a per-P
 		// counter.
-		xadd64(&work.bytesMarked, int64(w.bytesMarked))
+		atomic.Xadd64(&work.bytesMarked, int64(w.bytesMarked))
 		w.bytesMarked = 0
 	}
 	if w.scanWork != 0 {
-		xaddint64(&gcController.scanWork, w.scanWork)
+		atomic.Xaddint64(&gcController.scanWork, w.scanWork)
 		w.scanWork = 0
 	}
 }
@@ -185,16 +253,21 @@
 // global queue.
 //go:nowritebarrier
 func (w *gcWork) balance() {
-	if wbuf := w.wbuf; wbuf != 0 && wbuf.ptr().nobj > 4 {
-		w.wbuf = wbufptrOf(handoff(wbuf.ptr()))
+	if w.wbuf1 == 0 {
+		return
+	}
+	if wbuf := w.wbuf2.ptr(); wbuf.nobj != 0 {
+		putfull(wbuf)
+		w.wbuf2 = wbufptrOf(getempty())
+	} else if wbuf := w.wbuf1.ptr(); wbuf.nobj > 4 {
+		w.wbuf1 = wbufptrOf(handoff(wbuf))
 	}
 }
 
 // empty returns true if w has no mark work available.
 //go:nowritebarrier
 func (w *gcWork) empty() bool {
-	wbuf := w.wbuf
-	return wbuf == 0 || wbuf.ptr().nobj == 0
+	return w.wbuf1 == 0 || (w.wbuf1.ptr().nobj == 0 && w.wbuf2.ptr().nobj == 0)
 }
 
 // Internally, the GC work pool is kept in arrays in work buffers.
@@ -202,85 +275,37 @@
 // avoid contending on the global work buffer lists.
 
 type workbufhdr struct {
-	node  lfnode // must be first
-	nobj  int
-	inuse bool   // This workbuf is in use by some gorotuine and is not on the work.empty/partial/full queues.
-	log   [4]int // line numbers forming a history of ownership changes to workbuf
+	node lfnode // must be first
+	nobj int
 }
 
 type workbuf struct {
 	workbufhdr
 	// account for the above fields
-	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / ptrSize]uintptr
+	obj [(_WorkbufSize - unsafe.Sizeof(workbufhdr{})) / sys.PtrSize]uintptr
 }
 
 // workbuf factory routines. These funcs are used to manage the
 // workbufs.
 // If the GC asks for some work these are the only routines that
-// make partially full wbufs available to the GC.
-// Each of the gets and puts also take an distinct integer that is used
-// to record a brief history of changes to ownership of the workbuf.
-// The convention is to use a unique line number but any encoding
-// is permissible. For example if you want to pass in 2 bits of information
-// you could simple add lineno1*100000+lineno2.
-
-// logget records the past few values of entry to aid in debugging.
-// logget checks the buffer b is not currently in use.
-func (b *workbuf) logget(entry int) {
-	if !_Debugwbufs {
-		return
-	}
-	if b.inuse {
-		println("runtime: logget fails log entry=", entry,
-			"b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
-			"b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
-		throw("logget: get not legal")
-	}
-	b.inuse = true
-	copy(b.log[1:], b.log[:])
-	b.log[0] = entry
-}
-
-// logput records the past few values of entry to aid in debugging.
-// logput checks the buffer b is currently in use.
-func (b *workbuf) logput(entry int) {
-	if !_Debugwbufs {
-		return
-	}
-	if !b.inuse {
-		println("runtime: logput fails log entry=", entry,
-			"b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
-			"b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
-		throw("logput: put not legal")
-	}
-	b.inuse = false
-	copy(b.log[1:], b.log[:])
-	b.log[0] = entry
-}
+// make wbufs available to the GC.
 
 func (b *workbuf) checknonempty() {
 	if b.nobj == 0 {
-		println("runtime: nonempty check fails",
-			"b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
-			"b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
 		throw("workbuf is empty")
 	}
 }
 
 func (b *workbuf) checkempty() {
 	if b.nobj != 0 {
-		println("runtime: empty check fails",
-			"b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
-			"b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
 		throw("workbuf is not empty")
 	}
 }
 
 // getempty pops an empty work buffer off the work.empty list,
 // allocating new buffers if none are available.
-// entry is used to record a brief history of ownership.
 //go:nowritebarrier
-func getempty(entry int) *workbuf {
+func getempty() *workbuf {
 	var b *workbuf
 	if work.empty != 0 {
 		b = (*workbuf)(lfstackpop(&work.empty))
@@ -289,18 +314,16 @@
 		}
 	}
 	if b == nil {
-		b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), _CacheLineSize, &memstats.gc_sys))
+		b = (*workbuf)(persistentalloc(unsafe.Sizeof(*b), sys.CacheLineSize, &memstats.gc_sys))
 	}
-	b.logget(entry)
 	return b
 }
 
 // putempty puts a workbuf onto the work.empty list.
 // Upon entry this go routine owns b. The lfstackpush relinquishes ownership.
 //go:nowritebarrier
-func putempty(b *workbuf, entry int) {
+func putempty(b *workbuf) {
 	b.checkempty()
-	b.logput(entry)
 	lfstackpush(&work.empty, &b.node)
 }
 
@@ -308,72 +331,32 @@
 // putfull accepts partially full buffers so the GC can avoid competing
 // with the mutators for ownership of partially full buffers.
 //go:nowritebarrier
-func putfull(b *workbuf, entry int) {
+func putfull(b *workbuf) {
 	b.checknonempty()
-	b.logput(entry)
 	lfstackpush(&work.full, &b.node)
-}
 
-// getpartialorempty tries to return a partially empty
-// and if none are available returns an empty one.
-// entry is used to provide a brief history of ownership
-// using entry + xxx00000 to
-// indicating that two line numbers in the call chain.
-//go:nowritebarrier
-func getpartialorempty(entry int) *workbuf {
-	b := (*workbuf)(lfstackpop(&work.partial))
-	if b != nil {
-		b.logget(entry)
-		return b
-	}
-	// Let getempty do the logget check but
-	// use the entry to encode that it passed
-	// through this routine.
-	b = getempty(entry + 80700000)
-	return b
-}
-
-// putpartial puts empty buffers on the work.empty queue,
-// full buffers on the work.full queue and
-// others on the work.partial queue.
-// entry is used to provide a brief history of ownership
-// using entry + xxx00000 to
-// indicating that two call chain line numbers.
-//go:nowritebarrier
-func putpartial(b *workbuf, entry int) {
-	if b.nobj == 0 {
-		putempty(b, entry+81500000)
-	} else if b.nobj < len(b.obj) {
-		b.logput(entry)
-		lfstackpush(&work.partial, &b.node)
-	} else if b.nobj == len(b.obj) {
-		b.logput(entry)
-		lfstackpush(&work.full, &b.node)
-	} else {
-		throw("putpartial: bad Workbuf b.nobj")
+	// We just made more work available. Let the GC controller
+	// know so it can encourage more workers to run.
+	if gcphase == _GCmark {
+		gcController.enlistWorker()
 	}
 }
 
 // trygetfull tries to get a full or partially empty workbuffer.
 // If one is not immediately available return nil
 //go:nowritebarrier
-func trygetfull(entry int) *workbuf {
+func trygetfull() *workbuf {
 	b := (*workbuf)(lfstackpop(&work.full))
-	if b == nil {
-		b = (*workbuf)(lfstackpop(&work.partial))
-	}
 	if b != nil {
-		b.logget(entry)
 		b.checknonempty()
 		return b
 	}
 	return b
 }
 
-// Get a full work buffer off the work.full or a partially
-// filled one off the work.partial list. If nothing is available
-// wait until all the other gc helpers have finished and then
-// return nil.
+// Get a full work buffer off the work.full list.
+// If nothing is available wait until all the other gc helpers have
+// finished and then return nil.
 // getfull acts as a barrier for work.nproc helpers. As long as one
 // gchelper is actively marking objects it
 // may create a workbuffer that the other helpers can work on.
@@ -383,47 +366,37 @@
 // This is in fact the termination condition for the STW mark
 // phase.
 //go:nowritebarrier
-func getfull(entry int) *workbuf {
+func getfull() *workbuf {
 	b := (*workbuf)(lfstackpop(&work.full))
 	if b != nil {
-		b.logget(entry)
 		b.checknonempty()
 		return b
 	}
-	b = (*workbuf)(lfstackpop(&work.partial))
-	if b != nil {
-		b.logget(entry)
-		return b
-	}
 
-	incnwait := xadd(&work.nwait, +1)
+	incnwait := atomic.Xadd(&work.nwait, +1)
 	if incnwait > work.nproc {
 		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
 		throw("work.nwait > work.nproc")
 	}
 	for i := 0; ; i++ {
-		if work.full != 0 || work.partial != 0 {
-			decnwait := xadd(&work.nwait, -1)
+		if work.full != 0 {
+			decnwait := atomic.Xadd(&work.nwait, -1)
 			if decnwait == work.nproc {
 				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
 				throw("work.nwait > work.nproc")
 			}
 			b = (*workbuf)(lfstackpop(&work.full))
-			if b == nil {
-				b = (*workbuf)(lfstackpop(&work.partial))
-			}
 			if b != nil {
-				b.logget(entry)
 				b.checknonempty()
 				return b
 			}
-			incnwait := xadd(&work.nwait, +1)
+			incnwait := atomic.Xadd(&work.nwait, +1)
 			if incnwait > work.nproc {
 				println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
 				throw("work.nwait > work.nproc")
 			}
 		}
-		if work.nwait == work.nproc {
+		if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
 			return nil
 		}
 		_g_ := getg()
@@ -443,7 +416,7 @@
 //go:nowritebarrier
 func handoff(b *workbuf) *workbuf {
 	// Make new buffer with half of b's pointers.
-	b1 := getempty(915)
+	b1 := getempty()
 	n := b.nobj / 2
 	b.nobj -= n
 	b1.nobj = n
@@ -453,6 +426,6 @@
 	_g_.m.gcstats.nhandoffcnt += uint64(n)
 
 	// Put b on full list - let first half of b get stolen.
-	putfull(b, 942)
+	putfull(b)
 	return b1
 }
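handoff splits a buffer so the donor keeps the most recently pushed (and likely cache-hot) half while the older half is published for stealing. A slice-based sketch of the same split (handoffDemo and publish are illustrative names):

// handoffDemo keeps the top half locally and publishes the older
// bottom half for other workers to take.
func handoffDemo(b []int, publish func([]int)) (keep []int) {
	n := len(b) / 2
	keep = append([]int(nil), b[len(b)-n:]...) // newest entries stay local
	publish(b[:len(b)-n])                      // oldest entries go global
	return keep
}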
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index bc4e7c1..4093288 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -8,19 +8,23 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Main malloc heap.
 // The heap itself is the "free[]" and "large" arrays,
 // but all the other global data is here too.
 type mheap struct {
 	lock      mutex
-	free      [_MaxMHeapList]mspan // free lists of given length
-	freelarge mspan                // free lists length >= _MaxMHeapList
-	busy      [_MaxMHeapList]mspan // busy lists of large objects of given length
-	busylarge mspan                // busy lists of large objects length >= _MaxMHeapList
-	allspans  **mspan              // all spans out there
-	gcspans   **mspan              // copy of allspans referenced by gc marker or sweeper
+	free      [_MaxMHeapList]mSpanList // free lists of given length
+	freelarge mSpanList                // free lists length >= _MaxMHeapList
+	busy      [_MaxMHeapList]mSpanList // busy lists of large objects of given length
+	busylarge mSpanList                // busy lists of large objects length >= _MaxMHeapList
+	allspans  **mspan                  // all spans out there
+	gcspans   **mspan                  // copy of allspans referenced by gc marker or sweeper
 	nspan     uint32
 	sweepgen  uint32 // sweep generation, see comment in mspan
 	sweepdone uint32 // all spans are swept
@@ -29,9 +33,12 @@
 	spans_mapped uintptr
 
 	// Proportional sweep
+	pagesInUse        uint64  // pages of spans in state _MSpanInUse; R/W with mheap.lock
 	spanBytesAlloc    uint64  // bytes of spans allocated this cycle; updated atomically
 	pagesSwept        uint64  // pages swept this cycle; updated atomically
 	sweepPagesPerByte float64 // proportional sweep ratio; written with lock, read without
+	// TODO(austin): pagesInUse should be a uintptr, but the 386
+	// compiler can't 8-byte align fields.
 
 	// Malloc stats.
 	largefree  uint64                  // bytes freed for large objects (>maxsmallsize)
@@ -39,7 +46,7 @@
 	nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
 
 	// range of addresses we might see in the heap
-	bitmap         uintptr
+	bitmap         uintptr // Points to one byte past the end of the bitmap
 	bitmap_mapped  uintptr
 	arena_start    uintptr
 	arena_used     uintptr // always mHeap_Map{Bits,Spans} before updating
@@ -52,7 +59,7 @@
 	// gets its own cache line.
 	central [_NumSizeClasses]struct {
 		mcentral mcentral
-		pad      [_CacheLineSize]byte
+		pad      [sys.CacheLineSize]byte
 	}
 
 	spanalloc             fixalloc // allocator for span*
@@ -74,7 +81,7 @@
 
 // Every MSpan is in one doubly-linked list,
 // either one of the MHeap's free lists or one of the
-// MCentral's span lists.  We use empty MSpan structures as list heads.
+// MCentral's span lists.
 
 // An MSpan representing actual memory has state _MSpanInUse,
 // _MSpanStack, or _MSpanFree. Transitions between these states are
@@ -94,16 +101,79 @@
 	_MSpanInUse = iota // allocated for garbage collected heap
 	_MSpanStack        // allocated for use by stack allocator
 	_MSpanFree
-	_MSpanListHead
 	_MSpanDead
 )
 
+// mSpanList heads a linked list of spans.
+//
+// Linked list structure is based on BSD's "tail queue" data structure.
+type mSpanList struct {
+	first *mspan  // first span in list, or nil if none
+	last  **mspan // last span's next field, or first if none
+}
+
 type mspan struct {
-	next     *mspan    // in a span linked list
-	prev     *mspan    // in a span linked list
-	start    pageID    // starting page number
-	npages   uintptr   // number of pages in span
-	freelist gclinkptr // list of free objects
+	next *mspan     // next span in list, or nil if none
+	prev **mspan    // previous span's next field, or list head's first field if none
+	list *mSpanList // For debugging. TODO: Remove.
+
+	startAddr     uintptr   // address of first byte of span aka s.base()
+	npages        uintptr   // number of pages in span
+	stackfreelist gclinkptr // list of free stacks, avoids overloading freelist
+
+	// freeindex is the slot index between 0 and nelems at which to begin scanning
+	// for the next free object in this span.
+	// Each allocation scans allocBits starting at freeindex until it encounters a 0
+	// indicating a free object. freeindex is then adjusted so that subsequent scans begin
+	// just past the newly discovered free object.
+	//
+	// If freeindex == nelems, this span has no free objects.
+	//
+	// allocBits is a bitmap of objects in this span.
+	// If n >= freeindex and allocBits[n/8] & (1<<(n%8)) is 0
+	// then object n is free;
+	// otherwise, object n is allocated. Bits starting at nelems are
+	// undefined and should never be referenced.
+	//
+	// Object n starts at address n*elemsize + (start << pageShift).
+	freeindex uintptr
+	// TODO: Look up nelems from sizeclass and remove this field if it
+	// helps performance.
+	nelems uintptr // number of objects in the span.
+
+	// Cache of the allocBits at freeindex. allocCache is shifted
+	// such that the lowest bit corresponds to the bit freeindex.
+	// allocCache holds the complement of allocBits, thus allowing
+	// ctz (count trailing zero) to use it directly.
+	// allocCache may contain bits beyond s.nelems; the caller must ignore
+	// these.
+	allocCache uint64
+
+	// allocBits and gcmarkBits hold pointers to a span's mark and
+	// allocation bits. The pointers are 8 byte aligned.
+	// There are three arenas where this data is held.
+	// There are four arenas where this data is held.
+	//       and can be reused.
+	// next: Holds information to be used in the next GC cycle.
+	// current: Information being used during this GC cycle.
+	// previous: Information being used during the last GC cycle.
+	// A new GC cycle starts with the call to finishsweep_m.
+	// finishsweep_m moves the previous arena to the free arena,
+	// the current arena to the previous arena, and
+	// the next arena to the current arena.
+	// The next arena is populated as the spans request
+	// memory to hold gcmarkBits for the next GC cycle as well
+	// as allocBits for newly allocated spans.
+	//
+	// The pointer arithmetic is done "by hand" instead of using
+	// arrays to avoid bounds checks along critical performance
+	// paths.
+	// The sweep will free the old allocBits and set allocBits to the
+	// gcmarkBits. The gcmarkBits are replaced with a fresh zeroed
+	// out memory.
+	allocBits  *uint8
+	gcmarkBits *uint8
+
 	// sweep generation:
 	// if sweepgen == h->sweepgen - 2, the span needs sweeping
 	// if sweepgen == h->sweepgen - 1, the span is currently being swept
@@ -112,7 +182,7 @@
 
 	sweepgen    uint32
 	divMul      uint32   // for divide by elemsize - divMagic.mul
-	ref         uint16   // capacity - number of objects in freelist
+	allocCount  uint16   // number of allocated objects in the span
 	sizeclass   uint8    // size class
 	incache     bool     // being used by an mcache
 	state       uint8    // mspaninuse etc
@@ -129,7 +199,7 @@
 }
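Because allocCache stores the complement of allocBits, a set bit means a free slot and a count-trailing-zeros instruction finds the next one directly. A sketch using math/bits in place of the runtime's internal ctz (nextFree is an illustrative name):

import "math/bits"

// nextFree returns the next free slot at or after freeindex, given a
// cache word holding the complement of the alloc bits from freeindex.
func nextFree(cache uint64, freeindex uintptr) (slot uintptr, ok bool) {
	tz := bits.TrailingZeros64(cache)
	if tz == 64 {
		return 0, false // cache exhausted; refill from allocBits
	}
	return freeindex + uintptr(tz), true
}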
 
 func (s *mspan) base() uintptr {
-	return uintptr(s.start << _PageShift)
+	return s.startAddr
 }
 
 func (s *mspan) layout() (size, n, total uintptr) {
@@ -145,7 +215,7 @@
 
 // h_spans is a lookup table to map virtual address page IDs to *mspan.
 // For allocated spans, their pages map to the span itself.
-// For free spans, only the lowest and highest pages map to the span itself.  Internal
+// For free spans, only the lowest and highest pages map to the span itself. Internal
 // pages map to an arbitrary span.
 // For pages that have never been allocated, h_spans entries are nil.
 var h_spans []*mspan // TODO: make this h.spans once mheap can be defined in Go
@@ -154,13 +224,13 @@
 	h := (*mheap)(vh)
 	s := (*mspan)(p)
 	if len(h_allspans) >= cap(h_allspans) {
-		n := 64 * 1024 / ptrSize
+		n := 64 * 1024 / sys.PtrSize
 		if n < cap(h_allspans)*3/2 {
 			n = cap(h_allspans) * 3 / 2
 		}
 		var new []*mspan
 		sp := (*slice)(unsafe.Pointer(&new))
-		sp.array = sysAlloc(uintptr(n)*ptrSize, &memstats.other_sys)
+		sp.array = sysAlloc(uintptr(n)*sys.PtrSize, &memstats.other_sys)
 		if sp.array == nil {
 			throw("runtime: cannot allocate memory")
 		}
@@ -171,11 +241,11 @@
 			// Don't free the old array if it's referenced by sweep.
 			// See the comment in mgc.go.
 			if h.allspans != mheap_.gcspans {
-				sysFree(unsafe.Pointer(h.allspans), uintptr(cap(h_allspans))*ptrSize, &memstats.other_sys)
+				sysFree(unsafe.Pointer(h.allspans), uintptr(cap(h_allspans))*sys.PtrSize, &memstats.other_sys)
 			}
 		}
 		h_allspans = new
-		h.allspans = (**mspan)(unsafe.Pointer(sp.array))
+		h.allspans = (**mspan)(sp.array)
 	}
 	h_allspans = append(h_allspans, s)
 	h.nspan = uint32(len(h_allspans))
@@ -191,16 +261,35 @@
 		return false
 	}
 	// Not a beginning of a block, consult span table to find the block beginning.
-	k := b >> _PageShift
-	x := k
-	x -= mheap_.arena_start >> _PageShift
-	s := h_spans[x]
-	if s == nil || pageID(k) < s.start || b >= s.limit || s.state != mSpanInUse {
+	s := h_spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() || b >= s.limit || s.state != mSpanInUse {
 		return false
 	}
 	return true
 }
 
+// inHeapOrStack is a variant of inheap that returns true for pointers into stack spans.
+//go:nowritebarrier
+//go:nosplit
+func inHeapOrStack(b uintptr) bool {
+	if b == 0 || b < mheap_.arena_start || b >= mheap_.arena_used {
+		return false
+	}
+	// Not a beginning of a block, consult span table to find the block beginning.
+	s := h_spans[(b-mheap_.arena_start)>>_PageShift]
+	if s == nil || b < s.base() {
+		return false
+	}
+	switch s.state {
+	case mSpanInUse:
+		return b < s.limit
+	case _MSpanStack:
+		return b < s.base()+s.npages<<_PageShift
+	default:
+		return false
+	}
+}
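inheap and inHeapOrStack rely on the same constant-time lookup: index a flat table by the address's page number relative to the arena start. A standalone sketch of that pattern (spanInfo and spanFor are illustrative names):

type spanInfo struct{ base, limit uintptr }

// spanFor maps an address to the span covering its page, or nil.
func spanFor(table []*spanInfo, arenaStart uintptr, pageShift uint, addr uintptr) *spanInfo {
	i := (addr - arenaStart) >> pageShift
	if i >= uintptr(len(table)) {
		return nil
	}
	return table[i]
}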
+
 // TODO: spanOf and spanOfUnchecked are open-coded in a lot of places.
 // Use the functions instead.
 
@@ -224,14 +313,14 @@
 	_g_ := getg()
 
 	_g_.m.mcache.local_nlookup++
-	if ptrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
+	if sys.PtrSize == 4 && _g_.m.mcache.local_nlookup >= 1<<30 {
 		// purge cache stats to prevent overflow
 		lock(&mheap_.lock)
 		purgecachedstats(_g_.m.mcache)
 		unlock(&mheap_.lock)
 	}
 
-	s := mHeap_LookupMaybe(&mheap_, unsafe.Pointer(v))
+	s := mheap_.lookupMaybe(unsafe.Pointer(v))
 	if sp != nil {
 		*sp = s
 	}
@@ -245,7 +334,7 @@
 		return 0
 	}
 
-	p := uintptr(s.start) << _PageShift
+	p := s.base()
 	if s.sizeclass == 0 {
 		// Large object.
 		if base != nil {
@@ -259,7 +348,7 @@
 
 	n := s.elemsize
 	if base != nil {
-		i := (uintptr(v) - uintptr(p)) / n
+		i := (v - p) / n
 		*base = p + i*n
 	}
 	if size != nil {
@@ -270,28 +359,28 @@
 }
 
 // Initialize the heap.
-func mHeap_Init(h *mheap, spans_size uintptr) {
-	fixAlloc_Init(&h.spanalloc, unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
-	fixAlloc_Init(&h.cachealloc, unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
-	fixAlloc_Init(&h.specialfinalizeralloc, unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
-	fixAlloc_Init(&h.specialprofilealloc, unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
+func (h *mheap) init(spans_size uintptr) {
+	h.spanalloc.init(unsafe.Sizeof(mspan{}), recordspan, unsafe.Pointer(h), &memstats.mspan_sys)
+	h.cachealloc.init(unsafe.Sizeof(mcache{}), nil, nil, &memstats.mcache_sys)
+	h.specialfinalizeralloc.init(unsafe.Sizeof(specialfinalizer{}), nil, nil, &memstats.other_sys)
+	h.specialprofilealloc.init(unsafe.Sizeof(specialprofile{}), nil, nil, &memstats.other_sys)
 
 	// h->mapcache needs no init
 	for i := range h.free {
-		mSpanList_Init(&h.free[i])
-		mSpanList_Init(&h.busy[i])
+		h.free[i].init()
+		h.busy[i].init()
 	}
 
-	mSpanList_Init(&h.freelarge)
-	mSpanList_Init(&h.busylarge)
+	h.freelarge.init()
+	h.busylarge.init()
 	for i := range h.central {
-		mCentral_Init(&h.central[i].mcentral, int32(i))
+		h.central[i].mcentral.init(int32(i))
 	}
 
 	sp := (*slice)(unsafe.Pointer(&h_spans))
 	sp.array = unsafe.Pointer(h.spans)
-	sp.len = int(spans_size / ptrSize)
-	sp.cap = int(spans_size / ptrSize)
+	sp.len = int(spans_size / sys.PtrSize)
+	sp.cap = int(spans_size / sys.PtrSize)
 }
 
 // mHeap_MapSpans makes sure that the spans are mapped
@@ -302,12 +391,12 @@
 // Waiting to update arena_used until after the memory has been mapped
 // avoids faults when other threads try to access the bitmap immediately
 // after observing the change to arena_used.
-func mHeap_MapSpans(h *mheap, arena_used uintptr) {
+func (h *mheap) mapSpans(arena_used uintptr) {
 	// Map spans array, PageSize at a time.
 	n := arena_used
 	n -= h.arena_start
-	n = n / _PageSize * ptrSize
-	n = round(n, _PhysPageSize)
+	n = n / _PageSize * sys.PtrSize
+	n = round(n, sys.PhysPageSize)
 	if h.spans_mapped >= n {
 		return
 	}
@@ -317,18 +406,18 @@
 
 // Sweeps spans in list until it reclaims at least npages into heap.
 // Returns the actual number of pages reclaimed.
-func mHeap_ReclaimList(h *mheap, list *mspan, npages uintptr) uintptr {
+func (h *mheap) reclaimList(list *mSpanList, npages uintptr) uintptr {
 	n := uintptr(0)
 	sg := mheap_.sweepgen
 retry:
-	for s := list.next; s != list; s = s.next {
-		if s.sweepgen == sg-2 && cas(&s.sweepgen, sg-2, sg-1) {
-			mSpanList_Remove(s)
+	for s := list.first; s != nil; s = s.next {
+		if s.sweepgen == sg-2 && atomic.Cas(&s.sweepgen, sg-2, sg-1) {
+			list.remove(s)
 			// swept spans are at the end of the list
-			mSpanList_InsertBack(list, s)
+			list.insertBack(s)
 			unlock(&h.lock)
 			snpages := s.npages
-			if mSpan_Sweep(s, false) {
+			if s.sweep(false) {
 				n += snpages
 			}
 			lock(&h.lock)
@@ -351,17 +440,17 @@
 
 // Sweeps and reclaims at least npage pages into heap.
 // Called before allocating npage pages.
-func mHeap_Reclaim(h *mheap, npage uintptr) {
+func (h *mheap) reclaim(npage uintptr) {
 	// First try to sweep busy spans with large objects of size >= npage,
 	// this has good chances of reclaiming the necessary space.
 	for i := int(npage); i < len(h.busy); i++ {
-		if mHeap_ReclaimList(h, &h.busy[i], npage) != 0 {
+		if h.reclaimList(&h.busy[i], npage) != 0 {
 			return // Bingo!
 		}
 	}
 
 	// Then -- even larger objects.
-	if mHeap_ReclaimList(h, &h.busylarge, npage) != 0 {
+	if h.reclaimList(&h.busylarge, npage) != 0 {
 		return // Bingo!
 	}
 
@@ -369,7 +458,7 @@
 	// One such object is not enough, so we need to reclaim several of them.
 	reclaimed := uintptr(0)
 	for i := 0; i < int(npage) && i < len(h.busy); i++ {
-		reclaimed += mHeap_ReclaimList(h, &h.busy[i], npage-reclaimed)
+		reclaimed += h.reclaimList(&h.busy[i], npage-reclaimed)
 		if reclaimed >= npage {
 			return
 		}
@@ -392,7 +481,7 @@
 
 // Allocate a new span of npage pages from the heap for GC'd memory
 // and record its size class in the HeapMap and HeapMapCache.
-func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan {
+func (h *mheap) alloc_m(npage uintptr, sizeclass int32, large bool) *mspan {
 	_g_ := getg()
 	if _g_ != _g_.m.g0 {
 		throw("_mheap_alloc not on g0 stack")
@@ -409,27 +498,22 @@
 		// If GC kept a bit for whether there were any marks
 		// in a span, we could release these free spans
 		// at the end of GC and eliminate this entirely.
-		mHeap_Reclaim(h, npage)
+		h.reclaim(npage)
 	}
 
 	// transfer stats from cache to global
-	memstats.heap_live += uint64(_g_.m.mcache.local_cachealloc)
-	_g_.m.mcache.local_cachealloc = 0
 	memstats.heap_scan += uint64(_g_.m.mcache.local_scan)
 	_g_.m.mcache.local_scan = 0
 	memstats.tinyallocs += uint64(_g_.m.mcache.local_tinyallocs)
 	_g_.m.mcache.local_tinyallocs = 0
 
-	gcController.revise()
-
-	s := mHeap_AllocSpanLocked(h, npage)
+	s := h.allocSpanLocked(npage)
 	if s != nil {
 		// Record span info, because gc needs to be
 		// able to map interior pointer to containing span.
-		atomicstore(&s.sweepgen, h.sweepgen)
+		atomic.Store(&s.sweepgen, h.sweepgen)
 		s.state = _MSpanInUse
-		s.freelist = 0
-		s.ref = 0
+		s.allocCount = 0
 		s.sizeclass = uint8(sizeclass)
 		if sizeclass == 0 {
 			s.elemsize = s.npages << _PageShift
@@ -447,17 +531,23 @@
 		}
 
 		// update stats, sweep lists
+		h.pagesInUse += uint64(npage)
 		if large {
 			memstats.heap_objects++
-			memstats.heap_live += uint64(npage << _PageShift)
+			atomic.Xadd64(&memstats.heap_live, int64(npage<<_PageShift))
 			// Swept spans are at the end of lists.
 			if s.npages < uintptr(len(h.free)) {
-				mSpanList_InsertBack(&h.busy[s.npages], s)
+				h.busy[s.npages].insertBack(s)
 			} else {
-				mSpanList_InsertBack(&h.busylarge, s)
+				h.busylarge.insertBack(s)
 			}
 		}
 	}
+	// heap_scan and heap_live were updated.
+	if gcBlackenEnabled != 0 {
+		gcController.revise()
+	}
+
 	if trace.enabled {
 		traceHeapAlloc()
 	}
@@ -475,35 +565,35 @@
 	return s
 }
 
-func mHeap_Alloc(h *mheap, npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
+func (h *mheap) alloc(npage uintptr, sizeclass int32, large bool, needzero bool) *mspan {
 	// Don't do any operations that lock the heap on the G stack.
 	// It might trigger stack growth, and the stack growth code needs
 	// to be able to allocate heap.
 	var s *mspan
 	systemstack(func() {
-		s = mHeap_Alloc_m(h, npage, sizeclass, large)
+		s = h.alloc_m(npage, sizeclass, large)
 	})
 
 	if s != nil {
 		if needzero && s.needzero != 0 {
-			memclr(unsafe.Pointer(s.start<<_PageShift), s.npages<<_PageShift)
+			memclr(unsafe.Pointer(s.base()), s.npages<<_PageShift)
 		}
 		s.needzero = 0
 	}
 	return s
 }
 
-func mHeap_AllocStack(h *mheap, npage uintptr) *mspan {
+func (h *mheap) allocStack(npage uintptr) *mspan {
 	_g_ := getg()
 	if _g_ != _g_.m.g0 {
 		throw("mheap_allocstack not on g0 stack")
 	}
 	lock(&h.lock)
-	s := mHeap_AllocSpanLocked(h, npage)
+	s := h.allocSpanLocked(npage)
 	if s != nil {
 		s.state = _MSpanStack
-		s.freelist = 0
-		s.ref = 0
+		s.stackfreelist = 0
+		s.allocCount = 0
 		memstats.stacks_inuse += uint64(s.npages << _PageShift)
 	}
 
@@ -515,24 +605,27 @@
 // Allocates a span of the given size.  h must be locked.
 // The returned span has been removed from the
 // free list, but its state is still MSpanFree.
-func mHeap_AllocSpanLocked(h *mheap, npage uintptr) *mspan {
+func (h *mheap) allocSpanLocked(npage uintptr) *mspan {
+	var list *mSpanList
 	var s *mspan
 
 	// Try in fixed-size lists up to max.
 	for i := int(npage); i < len(h.free); i++ {
-		if !mSpanList_IsEmpty(&h.free[i]) {
-			s = h.free[i].next
+		list = &h.free[i]
+		if !list.isEmpty() {
+			s = list.first
 			goto HaveSpan
 		}
 	}
 
 	// Best fit in list of large spans.
-	s = mHeap_AllocLarge(h, npage)
+	list = &h.freelarge
+	s = h.allocLarge(npage)
 	if s == nil {
-		if !mHeap_Grow(h, npage) {
+		if !h.grow(npage) {
 			return nil
 		}
-		s = mHeap_AllocLarge(h, npage)
+		s = h.allocLarge(npage)
 		if s == nil {
 			return nil
 		}
@@ -546,23 +639,22 @@
 	if s.npages < npage {
 		throw("MHeap_AllocLocked - bad npages")
 	}
-	mSpanList_Remove(s)
-	if s.next != nil || s.prev != nil {
+	list.remove(s)
+	if s.inList() {
 		throw("still in list")
 	}
 	if s.npreleased > 0 {
-		sysUsed((unsafe.Pointer)(s.start<<_PageShift), s.npages<<_PageShift)
+		sysUsed(unsafe.Pointer(s.base()), s.npages<<_PageShift)
 		memstats.heap_released -= uint64(s.npreleased << _PageShift)
 		s.npreleased = 0
 	}
 
 	if s.npages > npage {
 		// Trim extra and put it back in the heap.
-		t := (*mspan)(fixAlloc_Alloc(&h.spanalloc))
-		mSpan_Init(t, s.start+pageID(npage), s.npages-npage)
+		t := (*mspan)(h.spanalloc.alloc())
+		t.init(s.base()+npage<<_PageShift, s.npages-npage)
 		s.npages = npage
-		p := uintptr(t.start)
-		p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+		p := (t.base() - h.arena_start) >> _PageShift
 		if p > 0 {
 			h_spans[p-1] = s
 		}
@@ -571,13 +663,12 @@
 		t.needzero = s.needzero
 		s.state = _MSpanStack // prevent coalescing with s
 		t.state = _MSpanStack
-		mHeap_FreeSpanLocked(h, t, false, false, s.unusedsince)
+		h.freeSpanLocked(t, false, false, s.unusedsince)
 		s.state = _MSpanFree
 	}
 	s.unusedsince = 0
 
-	p := uintptr(s.start)
-	p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+	p := (s.base() - h.arena_start) >> _PageShift
 	for n := uintptr(0); n < npage; n++ {
 		h_spans[p+n] = s
 	}
@@ -586,26 +677,26 @@
 	memstats.heap_idle -= uint64(npage << _PageShift)
 
 	//println("spanalloc", hex(s.start<<_PageShift))
-	if s.next != nil || s.prev != nil {
+	if s.inList() {
 		throw("still in list")
 	}
 	return s
 }
 
 // Allocate a span of exactly npage pages from the list of large spans.
-func mHeap_AllocLarge(h *mheap, npage uintptr) *mspan {
+func (h *mheap) allocLarge(npage uintptr) *mspan {
 	return bestFit(&h.freelarge, npage, nil)
 }
 
 // Search list for smallest span with >= npage pages.
 // If there are multiple smallest spans, take the one
 // with the earliest starting address.
-func bestFit(list *mspan, npage uintptr, best *mspan) *mspan {
-	for s := list.next; s != list; s = s.next {
+func bestFit(list *mSpanList, npage uintptr, best *mspan) *mspan {
+	for s := list.first; s != nil; s = s.next {
 		if s.npages < npage {
 			continue
 		}
-		if best == nil || s.npages < best.npages || (s.npages == best.npages && s.start < best.start) {
+		if best == nil || s.npages < best.npages || (s.npages == best.npages && s.base() < best.base()) {
 			best = s
 		}
 	}
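bestFit's policy is: the smallest span that is still large enough wins, and among equals the lowest base address wins, biasing the heap toward reusing low addresses. The same comparison in a standalone sketch (candidate and bestFitDemo are illustrative):

type candidate struct{ npages, base uintptr }

func bestFitDemo(cands []candidate, npages uintptr) *candidate {
	var best *candidate
	for i := range cands {
		c := &cands[i]
		if c.npages < npages {
			continue // too small
		}
		if best == nil || c.npages < best.npages ||
			(c.npages == best.npages && c.base < best.base) {
			best = c
		}
	}
	return best
}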
@@ -614,7 +705,9 @@
 
 // Try to add at least npage pages of memory to the heap,
 // returning whether it worked.
-func mHeap_Grow(h *mheap, npage uintptr) bool {
+//
+// h must be locked.
+func (h *mheap) grow(npage uintptr) bool {
 	// Ask for a big chunk, to reduce the number of mappings
 	// the operating system needs to track; also amortizes
 	// the overhead of an operating system mapping.
@@ -625,11 +718,11 @@
 		ask = _HeapAllocChunk
 	}
 
-	v := mHeap_SysAlloc(h, ask)
+	v := h.sysAlloc(ask)
 	if v == nil {
 		if ask > npage<<_PageShift {
 			ask = npage << _PageShift
-			v = mHeap_SysAlloc(h, ask)
+			v = h.sysAlloc(ask)
 		}
 		if v == nil {
 			print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n")
@@ -639,25 +732,25 @@
 
 	// Create a fake "in use" span and free it, so that the
 	// right coalescing happens.
-	s := (*mspan)(fixAlloc_Alloc(&h.spanalloc))
-	mSpan_Init(s, pageID(uintptr(v)>>_PageShift), ask>>_PageShift)
-	p := uintptr(s.start)
-	p -= (uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift)
+	s := (*mspan)(h.spanalloc.alloc())
+	s.init(uintptr(v), ask>>_PageShift)
+	p := (s.base() - h.arena_start) >> _PageShift
 	for i := p; i < p+s.npages; i++ {
 		h_spans[i] = s
 	}
-	atomicstore(&s.sweepgen, h.sweepgen)
+	atomic.Store(&s.sweepgen, h.sweepgen)
 	s.state = _MSpanInUse
-	mHeap_FreeSpanLocked(h, s, false, true, 0)
+	h.pagesInUse += uint64(s.npages)
+	h.freeSpanLocked(s, false, true, 0)
 	return true
 }
 
 // Look up the span at the given address.
 // Address is guaranteed to be in map
 // and is guaranteed to be start or end of span.
-func mHeap_Lookup(h *mheap, v unsafe.Pointer) *mspan {
+func (h *mheap) lookup(v unsafe.Pointer) *mspan {
 	p := uintptr(v)
-	p -= uintptr(unsafe.Pointer(h.arena_start))
+	p -= h.arena_start
 	return h_spans[p>>_PageShift]
 }
 
@@ -665,47 +758,48 @@
 // Address is *not* guaranteed to be in map
 // and may be anywhere in the span.
 // Map entries for the middle of a span are only
-// valid for allocated spans.  Free spans may have
+// valid for allocated spans. Free spans may have
 // other garbage in their middles, so we have to
 // check for that.
-func mHeap_LookupMaybe(h *mheap, v unsafe.Pointer) *mspan {
-	if uintptr(v) < uintptr(unsafe.Pointer(h.arena_start)) || uintptr(v) >= uintptr(unsafe.Pointer(h.arena_used)) {
+func (h *mheap) lookupMaybe(v unsafe.Pointer) *mspan {
+	if uintptr(v) < h.arena_start || uintptr(v) >= h.arena_used {
 		return nil
 	}
-	p := uintptr(v) >> _PageShift
-	q := p
-	q -= uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift
-	s := h_spans[q]
-	if s == nil || p < uintptr(s.start) || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse {
+	s := h_spans[(uintptr(v)-h.arena_start)>>_PageShift]
+	if s == nil || uintptr(v) < s.base() || uintptr(v) >= uintptr(unsafe.Pointer(s.limit)) || s.state != _MSpanInUse {
 		return nil
 	}
 	return s
 }
 
 // Free the span back into the heap.
-func mHeap_Free(h *mheap, s *mspan, acct int32) {
+func (h *mheap) freeSpan(s *mspan, acct int32) {
 	systemstack(func() {
 		mp := getg().m
 		lock(&h.lock)
-		memstats.heap_live += uint64(mp.mcache.local_cachealloc)
-		mp.mcache.local_cachealloc = 0
 		memstats.heap_scan += uint64(mp.mcache.local_scan)
 		mp.mcache.local_scan = 0
 		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
 		mp.mcache.local_tinyallocs = 0
+		if msanenabled {
+			// Tell msan that this entire span is no longer in use.
+			base := unsafe.Pointer(s.base())
+			bytes := s.npages << _PageShift
+			msanfree(base, bytes)
+		}
 		if acct != 0 {
 			memstats.heap_objects--
 		}
-		gcController.revise()
-		mHeap_FreeSpanLocked(h, s, true, true, 0)
-		if trace.enabled {
-			traceHeapAlloc()
+		if gcBlackenEnabled != 0 {
+			// heap_scan changed.
+			gcController.revise()
 		}
+		h.freeSpanLocked(s, true, true, 0)
 		unlock(&h.lock)
 	})
 }
 
-func mHeap_FreeStack(h *mheap, s *mspan) {
+func (h *mheap) freeStack(s *mspan) {
 	_g_ := getg()
 	if _g_ != _g_.m.g0 {
 		throw("mheap_freestack not on g0 stack")
@@ -713,21 +807,23 @@
 	s.needzero = 1
 	lock(&h.lock)
 	memstats.stacks_inuse -= uint64(s.npages << _PageShift)
-	mHeap_FreeSpanLocked(h, s, true, true, 0)
+	h.freeSpanLocked(s, true, true, 0)
 	unlock(&h.lock)
 }
 
-func mHeap_FreeSpanLocked(h *mheap, s *mspan, acctinuse, acctidle bool, unusedsince int64) {
+// s must be on a busy list (h.busy or h.busylarge) or unlinked.
+func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool, unusedsince int64) {
 	switch s.state {
 	case _MSpanStack:
-		if s.ref != 0 {
+		if s.allocCount != 0 {
 			throw("MHeap_FreeSpanLocked - invalid stack free")
 		}
 	case _MSpanInUse:
-		if s.ref != 0 || s.sweepgen != h.sweepgen {
-			print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.start<<_PageShift), " ref ", s.ref, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
+		if s.allocCount != 0 || s.sweepgen != h.sweepgen {
+			print("MHeap_FreeSpanLocked - span ", s, " ptr ", hex(s.base()), " allocCount ", s.allocCount, " sweepgen ", s.sweepgen, "/", h.sweepgen, "\n")
 			throw("MHeap_FreeSpanLocked - invalid free")
 		}
+		h.pagesInUse -= uint64(s.npages)
 	default:
 		throw("MHeap_FreeSpanLocked - invalid span state")
 	}
@@ -739,7 +835,9 @@
 		memstats.heap_idle += uint64(s.npages << _PageShift)
 	}
 	s.state = _MSpanFree
-	mSpanList_Remove(s)
+	if s.inList() {
+		h.busyList(s.npages).remove(s)
+	}
 
 	// Stamp newly unused spans. The scavenger will use that
 	// info to potentially give back some pages to the OS.
@@ -750,72 +848,90 @@
 	s.npreleased = 0
 
 	// Coalesce with earlier, later spans.
-	p := uintptr(s.start)
-	p -= uintptr(unsafe.Pointer(h.arena_start)) >> _PageShift
+	p := (s.base() - h.arena_start) >> _PageShift
 	if p > 0 {
 		t := h_spans[p-1]
-		if t != nil && t.state != _MSpanInUse && t.state != _MSpanStack {
-			s.start = t.start
+		if t != nil && t.state == _MSpanFree {
+			s.startAddr = t.startAddr
 			s.npages += t.npages
 			s.npreleased = t.npreleased // absorb released pages
 			s.needzero |= t.needzero
 			p -= t.npages
 			h_spans[p] = s
-			mSpanList_Remove(t)
+			h.freeList(t.npages).remove(t)
 			t.state = _MSpanDead
-			fixAlloc_Free(&h.spanalloc, (unsafe.Pointer)(t))
+			h.spanalloc.free(unsafe.Pointer(t))
 		}
 	}
-	if (p+s.npages)*ptrSize < h.spans_mapped {
+	if (p+s.npages)*sys.PtrSize < h.spans_mapped {
 		t := h_spans[p+s.npages]
-		if t != nil && t.state != _MSpanInUse && t.state != _MSpanStack {
+		if t != nil && t.state == _MSpanFree {
 			s.npages += t.npages
 			s.npreleased += t.npreleased
 			s.needzero |= t.needzero
 			h_spans[p+s.npages-1] = s
-			mSpanList_Remove(t)
+			h.freeList(t.npages).remove(t)
 			t.state = _MSpanDead
-			fixAlloc_Free(&h.spanalloc, (unsafe.Pointer)(t))
+			h.spanalloc.free(unsafe.Pointer(t))
 		}
 	}
 
 	// Insert s into appropriate list.
-	if s.npages < uintptr(len(h.free)) {
-		mSpanList_Insert(&h.free[s.npages], s)
-	} else {
-		mSpanList_Insert(&h.freelarge, s)
-	}
+	h.freeList(s.npages).insert(s)
 }
 
-func scavengelist(list *mspan, now, limit uint64) uintptr {
-	if _PhysPageSize > _PageSize {
-		// golang.org/issue/9993
-		// If the physical page size of the machine is larger than
-		// our logical heap page size the kernel may round up the
-		// amount to be freed to its page size and corrupt the heap
-		// pages surrounding the unused block.
-		return 0
+func (h *mheap) freeList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.free[npages]
 	}
+	return &h.freelarge
+}
 
-	if mSpanList_IsEmpty(list) {
+func (h *mheap) busyList(npages uintptr) *mSpanList {
+	if npages < uintptr(len(h.free)) {
+		return &h.busy[npages]
+	}
+	return &h.busylarge
+}
+
+func scavengelist(list *mSpanList, now, limit uint64) uintptr {
+	if list.isEmpty() {
 		return 0
 	}
 
 	var sumreleased uintptr
-	for s := list.next; s != list; s = s.next {
+	for s := list.first; s != nil; s = s.next {
 		if (now-uint64(s.unusedsince)) > limit && s.npreleased != s.npages {
-			released := (s.npages - s.npreleased) << _PageShift
+			start := s.base()
+			end := start + s.npages<<_PageShift
+			if sys.PhysPageSize > _PageSize {
+				// We can only release pages in
+				// PhysPageSize blocks, so round start
+				// and end in. (Otherwise, madvise
+				// will round them *out* and release
+				// more memory than we want.)
+				start = (start + sys.PhysPageSize - 1) &^ (sys.PhysPageSize - 1)
+				end &^= sys.PhysPageSize - 1
+				if start == end {
+					continue
+				}
+			}
+			len := end - start
+
+			released := len - (s.npreleased << _PageShift)
+			if sys.PhysPageSize > _PageSize && released == 0 {
+				continue
+			}
 			memstats.heap_released += uint64(released)
 			sumreleased += released
-			s.npreleased = s.npages
-			sysUnused((unsafe.Pointer)(s.start<<_PageShift), s.npages<<_PageShift)
+			s.npreleased = len >> _PageShift
+			sysUnused(unsafe.Pointer(start), len)
 		}
 	}
 	return sumreleased
 }
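The rounding in scavengelist is defensive: madvise operates on physical pages, so start is rounded up and end rounded down to keep the release strictly inside the span. The arithmetic in isolation (roundToPhysPages is an illustrative name; physPageSize is assumed to be a power of two):

// roundToPhysPages shrinks [start, end) to physical-page boundaries;
// ok is false if nothing whole remains to release.
func roundToPhysPages(start, end, physPageSize uintptr) (s, e uintptr, ok bool) {
	s = (start + physPageSize - 1) &^ (physPageSize - 1) // round start up
	e = end &^ (physPageSize - 1)                        // round end down
	if s >= e {
		return 0, 0, false
	}
	return s, e, true
}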
 
-func mHeap_Scavenge(k int32, now, limit uint64) {
-	h := &mheap_
+func (h *mheap) scavenge(k int32, now, limit uint64) {
 	lock(&h.lock)
 	var sumreleased uintptr
 	for i := 0; i < len(h.free); i++ {
@@ -836,18 +952,18 @@
 
 //go:linkname runtime_debug_freeOSMemory runtime/debug.freeOSMemory
 func runtime_debug_freeOSMemory() {
-	startGC(gcForceBlockMode, false)
-	systemstack(func() { mHeap_Scavenge(-1, ^uint64(0), 0) })
+	gcStart(gcForceBlockMode, false)
+	systemstack(func() { mheap_.scavenge(-1, ^uint64(0), 0) })
 }
 
 // Initialize a new span with the given start and npages.
-func mSpan_Init(span *mspan, start pageID, npages uintptr) {
+func (span *mspan) init(base uintptr, npages uintptr) {
 	span.next = nil
 	span.prev = nil
-	span.start = start
+	span.list = nil
+	span.startAddr = base
 	span.npages = npages
-	span.freelist = 0
-	span.ref = 0
+	span.allocCount = 0
 	span.sizeclass = 0
 	span.incache = false
 	span.elemsize = 0
@@ -857,49 +973,69 @@
 	span.speciallock.key = 0
 	span.specials = nil
 	span.needzero = 0
+	span.freeindex = 0
+	span.allocBits = nil
+	span.gcmarkBits = nil
+}
+
+func (span *mspan) inList() bool {
+	return span.prev != nil
 }
 
 // Initialize an empty doubly-linked list.
-func mSpanList_Init(list *mspan) {
-	list.state = _MSpanListHead
-	list.next = list
-	list.prev = list
+func (list *mSpanList) init() {
+	list.first = nil
+	list.last = &list.first
 }
 
-func mSpanList_Remove(span *mspan) {
-	if span.prev == nil && span.next == nil {
-		return
+func (list *mSpanList) remove(span *mspan) {
+	if span.prev == nil || span.list != list {
+		println("runtime: failed MSpanList_Remove", span, span.prev, span.list, list)
+		throw("MSpanList_Remove")
 	}
-	span.prev.next = span.next
-	span.next.prev = span.prev
-	span.prev = nil
+	if span.next != nil {
+		span.next.prev = span.prev
+	} else {
+		// TODO: After we remove the span.list != list check above,
+		// we could at least still check list.last == &span.next here.
+		list.last = span.prev
+	}
+	*span.prev = span.next
 	span.next = nil
+	span.prev = nil
+	span.list = nil
 }
 
-func mSpanList_IsEmpty(list *mspan) bool {
-	return list.next == list
+func (list *mSpanList) isEmpty() bool {
+	return list.first == nil
 }
 
-func mSpanList_Insert(list *mspan, span *mspan) {
-	if span.next != nil || span.prev != nil {
-		println("failed MSpanList_Insert", span, span.next, span.prev)
+func (list *mSpanList) insert(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("runtime: failed MSpanList_Insert", span, span.next, span.prev, span.list)
 		throw("MSpanList_Insert")
 	}
-	span.next = list.next
-	span.prev = list
-	span.next.prev = span
-	span.prev.next = span
+	span.next = list.first
+	if list.first != nil {
+		list.first.prev = &span.next
+	} else {
+		list.last = &span.next
+	}
+	list.first = span
+	span.prev = &list.first
+	span.list = list
 }
 
-func mSpanList_InsertBack(list *mspan, span *mspan) {
-	if span.next != nil || span.prev != nil {
-		println("failed MSpanList_InsertBack", span, span.next, span.prev)
+func (list *mSpanList) insertBack(span *mspan) {
+	if span.next != nil || span.prev != nil || span.list != nil {
+		println("runtime: failed MSpanList_InsertBack", span, span.next, span.prev, span.list)
 		throw("MSpanList_InsertBack")
 	}
-	span.next = list
-	span.prev = list.prev
-	span.next.prev = span
-	span.prev.next = span
+	span.next = nil
+	span.prev = list.last
+	*list.last = span
+	list.last = &span.next
+	span.list = list
 }
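The mSpanList operations above all hinge on prev being a pointer to the previous node's next field (or the list's first field), so remove needs no head special case and insertBack is O(1). A self-contained sketch with toy types (node and list are illustrative, not runtime types):

type node struct {
	next *node
	prev **node // previous node's next field, or the list's first field
}

type list struct {
	first *node
	last  **node // last node's next field, or &first when empty
}

func (l *list) init() { l.first, l.last = nil, &l.first }

func (l *list) insertBack(n *node) {
	n.next, n.prev = nil, l.last
	*l.last = n
	l.last = &n.next
}

func (l *list) remove(n *node) {
	if n.next != nil {
		n.next.prev = n.prev
	} else {
		l.last = n.prev // n was the tail
	}
	*n.prev = n.next
	n.next, n.prev = nil, nil
}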
 
 const (
@@ -918,23 +1054,24 @@
 }
 
 // Adds the special record s to the list of special records for
-// the object p.  All fields of s should be filled in except for
+// the object p. All fields of s should be filled in except for
 // offset & next, which this routine will fill in.
 // Returns true if the special was successfully added, false otherwise.
 // (The add will fail only if a record with the same p and s->kind
 //  already exists.)
 func addspecial(p unsafe.Pointer, s *special) bool {
-	span := mHeap_LookupMaybe(&mheap_, p)
+	span := mheap_.lookupMaybe(p)
 	if span == nil {
 		throw("addspecial on invalid pointer")
 	}
 
 	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
 	mp := acquirem()
-	mSpan_EnsureSwept(span)
+	span.ensureSwept()
 
-	offset := uintptr(p) - uintptr(span.start<<_PageShift)
+	offset := uintptr(p) - span.base()
 	kind := s.kind
 
 	lock(&span.speciallock)
@@ -971,17 +1108,18 @@
 // Returns the record if the record existed, nil otherwise.
 // The caller must FixAlloc_Free the result.
 func removespecial(p unsafe.Pointer, kind uint8) *special {
-	span := mHeap_LookupMaybe(&mheap_, p)
+	span := mheap_.lookupMaybe(p)
 	if span == nil {
 		throw("removespecial on invalid pointer")
 	}
 
 	// Ensure that the span is swept.
-	// GC accesses specials list w/o locks. And it's just much safer.
+	// Sweeping accesses the specials list w/o locks, so we have
+	// to synchronize with it. And it's just much safer.
 	mp := acquirem()
-	mSpan_EnsureSwept(span)
+	span.ensureSwept()
 
-	offset := uintptr(p) - uintptr(span.start<<_PageShift)
+	offset := uintptr(p) - span.base()
 
 	lock(&span.speciallock)
 	t := &span.specials
@@ -1014,10 +1152,10 @@
 	ot      *ptrtype
 }
 
-// Adds a finalizer to the object p.  Returns true if it succeeded.
+// Adds a finalizer to the object p. Returns true if it succeeded.
 func addfinalizer(p unsafe.Pointer, f *funcval, nret uintptr, fint *_type, ot *ptrtype) bool {
 	lock(&mheap_.speciallock)
-	s := (*specialfinalizer)(fixAlloc_Alloc(&mheap_.specialfinalizeralloc))
+	s := (*specialfinalizer)(mheap_.specialfinalizeralloc.alloc())
 	unlock(&mheap_.speciallock)
 	s.special.kind = _KindSpecialFinalizer
 	s.fn = f
@@ -1025,12 +1163,31 @@
 	s.fint = fint
 	s.ot = ot
 	if addspecial(p, &s.special) {
+		// This is responsible for maintaining the same
+		// GC-related invariants as markrootSpans in any
+		// situation where it's possible that markrootSpans
+		// has already run but mark termination hasn't yet.
+		if gcphase != _GCoff {
+			_, base, _ := findObject(p)
+			mp := acquirem()
+			gcw := &mp.p.ptr().gcw
+			// Mark everything reachable from the object
+			// so it's retained for the finalizer.
+			scanobject(uintptr(base), gcw)
+			// Mark the finalizer itself, since the
+			// special isn't part of the GC'd heap.
+			scanblock(uintptr(unsafe.Pointer(&s.fn)), sys.PtrSize, &oneptrmask[0], gcw)
+			if gcBlackenPromptly {
+				gcw.dispose()
+			}
+			releasem(mp)
+		}
 		return true
 	}
 
 	// There was an old finalizer
 	lock(&mheap_.speciallock)
-	fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(s))
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
 	unlock(&mheap_.speciallock)
 	return false
 }
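
// The special-record plumbing above is what ultimately backs the public
// runtime.SetFinalizer API. A minimal usage sketch (the handle type is
// illustrative, and finalizers are not guaranteed to run before the
// program exits):

package main

import (
	"fmt"
	"runtime"
	"time"
)

type handle struct{ fd int }

func main() {
	h := &handle{fd: 3}
	runtime.SetFinalizer(h, func(h *handle) {
		fmt.Println("closing fd", h.fd)
	})
	h = nil
	runtime.GC()                      // h is unreachable; queue its finalizer
	time.Sleep(10 * time.Millisecond) // give the finalizer goroutine a chance to run
}
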
@@ -1042,7 +1199,7 @@
 		return // there wasn't a finalizer to remove
 	}
 	lock(&mheap_.speciallock)
-	fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(s))
+	mheap_.specialfinalizeralloc.free(unsafe.Pointer(s))
 	unlock(&mheap_.speciallock)
 }
 
@@ -1055,7 +1212,7 @@
 // Set the heap profile bucket associated with addr to b.
 func setprofilebucket(p unsafe.Pointer, b *bucket) {
 	lock(&mheap_.speciallock)
-	s := (*specialprofile)(fixAlloc_Alloc(&mheap_.specialprofilealloc))
+	s := (*specialprofile)(mheap_.specialprofilealloc.alloc())
 	unlock(&mheap_.speciallock)
 	s.special.kind = _KindSpecialProfile
 	s.b = b
@@ -1064,27 +1221,138 @@
 	}
 }
 
-// Do whatever cleanup needs to be done to deallocate s.  It has
+// Do whatever cleanup needs to be done to deallocate s. It has
 // already been unlinked from the MSpan specials list.
-// Returns true if we should keep working on deallocating p.
-func freespecial(s *special, p unsafe.Pointer, size uintptr, freed bool) bool {
+func freespecial(s *special, p unsafe.Pointer, size uintptr) {
 	switch s.kind {
 	case _KindSpecialFinalizer:
 		sf := (*specialfinalizer)(unsafe.Pointer(s))
 		queuefinalizer(p, sf.fn, sf.nret, sf.fint, sf.ot)
 		lock(&mheap_.speciallock)
-		fixAlloc_Free(&mheap_.specialfinalizeralloc, (unsafe.Pointer)(sf))
+		mheap_.specialfinalizeralloc.free(unsafe.Pointer(sf))
 		unlock(&mheap_.speciallock)
-		return false // don't free p until finalizer is done
 	case _KindSpecialProfile:
 		sp := (*specialprofile)(unsafe.Pointer(s))
-		mProf_Free(sp.b, size, freed)
+		mProf_Free(sp.b, size)
 		lock(&mheap_.speciallock)
-		fixAlloc_Free(&mheap_.specialprofilealloc, (unsafe.Pointer)(sp))
+		mheap_.specialprofilealloc.free(unsafe.Pointer(sp))
 		unlock(&mheap_.speciallock)
-		return true
 	default:
 		throw("bad special kind")
 		panic("not reached")
 	}
 }
+
+const gcBitsChunkBytes = uintptr(64 << 10)
+const gcBitsHeaderBytes = unsafe.Sizeof(gcBitsHeader{})
+
+type gcBitsHeader struct {
+	free uintptr // free is the index into bits of the next free byte.
+	next uintptr // *gcBits triggers recursive type bug. (issue 14620)
+}
+
+type gcBits struct {
+	// gcBitsHeader // side step recursive type bug (issue 14620) by including fields by hand.
+	free uintptr // free is the index into bits of the next free byte.
+	next *gcBits
+	bits [gcBitsChunkBytes - gcBitsHeaderBytes]uint8
+}
+
+var gcBitsArenas struct {
+	lock     mutex
+	free     *gcBits
+	next     *gcBits
+	current  *gcBits
+	previous *gcBits
+}
+
+// newMarkBits returns a pointer to 8 byte aligned bytes
+// to be used for a span's mark bits.
+func newMarkBits(nelems uintptr) *uint8 {
+	lock(&gcBitsArenas.lock)
+	blocksNeeded := (nelems + 63) / 64
+	bytesNeeded := blocksNeeded * 8
+	if gcBitsArenas.next == nil ||
+		gcBitsArenas.next.free+bytesNeeded > uintptr(len(gcBits{}.bits)) {
+		// Allocate a new arena.
+		fresh := newArena()
+		fresh.next = gcBitsArenas.next
+		gcBitsArenas.next = fresh
+	}
+	if gcBitsArenas.next.free >= gcBitsChunkBytes {
+		println("runtime: gcBitsArenas.next.free=", gcBitsArenas.next.free, gcBitsChunkBytes)
+		throw("markBits overflow")
+	}
+	result := &gcBitsArenas.next.bits[gcBitsArenas.next.free]
+	gcBitsArenas.next.free += bytesNeeded
+	unlock(&gcBitsArenas.lock)
+	return result
+}
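
// The arithmetic above allocates one mark bit per object, rounded up to
// whole 64-bit blocks so the bits can be processed a word at a time.
// For example:
//
//	nelems = 1  -> (1+63)/64 = 1 block -> 8 bytes
//	nelems = 64 -> 1 block             -> 8 bytes
//	nelems = 65 -> 2 blocks            -> 16 bytes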
+
+// newAllocBits returns a pointer to 8 byte aligned bytes
+// to be used for this span's alloc bits.
+// newAllocBits is used to provide newly initialized spans
+// allocation bits. For spans that are not being initialized, the
+// mark bits are repurposed as allocation bits when
+// the span is swept.
+func newAllocBits(nelems uintptr) *uint8 {
+	return newMarkBits(nelems)
+}
+
+// nextMarkBitArenaEpoch establishes a new epoch for the arenas
+// holding the mark bits. The arenas are named relative to the
+// current GC cycle which is demarcated by the call to finishweep_m.
+//
+// All current spans have been swept.
+// During that sweep each span allocated room for its gcmarkBits in
+// the gcBitsArenas.next block. gcBitsArenas.next becomes gcBitsArenas.current,
+// where the GC will mark objects; after each span is swept, these mark
+// bits are repurposed as the span's allocation bits.
+// gcBitsArenas.current becomes gcBitsArenas.previous, where each span's
+// gcAllocBits live until all the spans have been swept during this GC cycle.
+// Sweeping a span extinguishes its reference to gcBitsArenas.previous
+// by pointing its gcAllocBits into gcBitsArenas.current; once every span
+// has been swept, gcBitsArenas.previous is released to the gcBitsArenas.free list.
+func nextMarkBitArenaEpoch() {
+	lock(&gcBitsArenas.lock)
+	if gcBitsArenas.previous != nil {
+		if gcBitsArenas.free == nil {
+			gcBitsArenas.free = gcBitsArenas.previous
+		} else {
+			// Find end of previous arenas.
+			last := gcBitsArenas.previous
+			for ; last.next != nil; last = last.next {
+			}
+			last.next = gcBitsArenas.free
+			gcBitsArenas.free = gcBitsArenas.previous
+		}
+	}
+	gcBitsArenas.previous = gcBitsArenas.current
+	gcBitsArenas.current = gcBitsArenas.next
+	gcBitsArenas.next = nil // newMarkBits calls newArena when needed
+	unlock(&gcBitsArenas.lock)
+}
+
+// newArena allocates and zeroes a gcBits arena.
+func newArena() *gcBits {
+	var result *gcBits
+	if gcBitsArenas.free == nil {
+		result = (*gcBits)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+		if result == nil {
+			throw("runtime: cannot allocate memory")
+		}
+	} else {
+		result = gcBitsArenas.free
+		gcBitsArenas.free = gcBitsArenas.free.next
+		memclr(unsafe.Pointer(result), gcBitsChunkBytes)
+	}
+	result.next = nil
+	// If result.bits is not 8 byte aligned adjust index so
+	// that &result.bits[result.free] is 8 byte aligned.
+	if uintptr(unsafe.Offsetof(gcBits{}.bits))&7 == 0 {
+		result.free = 0
+	} else {
+		result.free = 8 - (uintptr(unsafe.Pointer(&result.bits[0])) & 7)
+	}
+	return result
+}
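
// The free-index fix-up above is the standard "round an address up to
// the next multiple of 8" trick. A standalone sketch:

package main

import "fmt"

func alignUp8(addr uintptr) uintptr {
	if addr&7 == 0 {
		return addr
	}
	return addr + 8 - addr&7
}

func main() {
	fmt.Printf("%#x\n", alignUp8(0x1001)) // 0x1008
	fmt.Printf("%#x\n", alignUp8(0x1008)) // 0x1008
}
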
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index dc94cee..0e7cc66 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -37,6 +37,7 @@
 	gen("arm", notags, zeroARM, copyARM)
 	gen("arm64", notags, zeroARM64, copyARM64)
 	gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
+	gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
 }
 
 func gen(arch string, tags, zero, copy func(io.Writer)) {
@@ -60,21 +61,18 @@
 func notags(w io.Writer) { fmt.Fprintln(w) }
 
 func zeroAMD64(w io.Writer) {
-	// AX: zero
+	// X0: zero
 	// DI: ptr to memory to be zeroed
 	// DI is updated as a side effect.
 	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
-	for i := 0; i < 31; i++ {
-		fmt.Fprintln(w, "\tMOVQ\tAX,(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)")
-		fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)")
-		fmt.Fprintln(w, "\tADDQ\t$32,DI")
+	for i := 0; i < 16; i++ {
+		fmt.Fprintln(w, "\tMOVUPS\tX0,(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,16(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,32(DI)")
+		fmt.Fprintln(w, "\tMOVUPS\tX0,48(DI)")
+		fmt.Fprintln(w, "\tADDQ\t$64,DI")
 		fmt.Fprintln(w)
 	}
-	for i := 0; i < 4; i++ {
-		fmt.Fprintln(w, "\tSTOSQ")
-	}
 	fmt.Fprintln(w, "\tRET")
 }
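
// A quick size check on the change above: the old generator emitted 31
// iterations of four 8-byte MOVQ stores (31*32 = 992 bytes) plus four
// 8-byte STOSQs (32 bytes), and the new one emits 16 iterations of four
// 16-byte MOVUPS stores (16*64 = 1024 bytes), so both versions of
// duffzero cover the same 1024 bytes.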
 
@@ -87,11 +85,11 @@
 	// for some reason that is 3.5x slower than this code.
 	// The STOSQ in the old duffzero seemed fine, though.
 	fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT, $0-0")
-	for i := 0; i < 128; i++ {
-		fmt.Fprintln(w, "\tMOVQ\t(SI), CX")
-		fmt.Fprintln(w, "\tADDQ\t$8, SI")
-		fmt.Fprintln(w, "\tMOVQ\tCX, (DI)")
-		fmt.Fprintln(w, "\tADDQ\t$8, DI")
+	for i := 0; i < 64; i++ {
+		fmt.Fprintln(w, "\tMOVUPS\t(SI), X0")
+		fmt.Fprintln(w, "\tADDQ\t$16, SI")
+		fmt.Fprintln(w, "\tMOVUPS\tX0, (DI)")
+		fmt.Fprintln(w, "\tADDQ\t$16, DI")
 		fmt.Fprintln(w)
 	}
 	fmt.Fprintln(w, "\tRET")
@@ -176,13 +174,35 @@
 	// R0: always zero
 	// R3 (aka REGRT1): ptr to memory to be zeroed - 8
 	// On return, R3 points to the last zeroed dword.
-	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
+	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0")
 	for i := 0; i < 128; i++ {
 		fmt.Fprintln(w, "\tMOVDU\tR0, 8(R3)")
 	}
-	fmt.Fprintln(w, "\tRETURN")
+	fmt.Fprintln(w, "\tRET")
 }
 
 func copyPPC64x(w io.Writer) {
 	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
 }
+
+func tagsMIPS64x(w io.Writer) {
+	fmt.Fprintln(w)
+	fmt.Fprintln(w, "// +build mips64 mips64le")
+	fmt.Fprintln(w)
+}
+
+func zeroMIPS64x(w io.Writer) {
+	// R0: always zero
+	// R1 (aka REGRT1): ptr to memory to be zeroed - 8
+	// On return, R1 points to the last zeroed dword.
+	fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $-8-0")
+	for i := 0; i < 128; i++ {
+		fmt.Fprintln(w, "\tMOVV\tR0, 8(R1)")
+		fmt.Fprintln(w, "\tADDV\t$8, R1")
+	}
+	fmt.Fprintln(w, "\tRET")
+}
+
+func copyMIPS64x(w io.Writer) {
+	fmt.Fprintln(w, "// TODO: Implement runtime·duffcopy.")
+}
diff --git a/src/runtime/mkfastlog2table.go b/src/runtime/mkfastlog2table.go
new file mode 100644
index 0000000..587ebf4
--- /dev/null
+++ b/src/runtime/mkfastlog2table.go
@@ -0,0 +1,52 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// fastlog2Table contains log2 approximations for 5 binary digits.
+// This is used to implement fastlog2, which is used for heap sampling.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"math"
+)
+
+func main() {
+	var buf bytes.Buffer
+
+	fmt.Fprintln(&buf, "// AUTO-GENERATED by mkfastlog2table.go")
+	fmt.Fprintln(&buf, "// Run go generate from src/runtime to update.")
+	fmt.Fprintln(&buf, "// See mkfastlog2table.go for comments.")
+	fmt.Fprintln(&buf)
+	fmt.Fprintln(&buf, "package runtime")
+	fmt.Fprintln(&buf)
+	fmt.Fprintln(&buf, "const fastlogNumBits =", fastlogNumBits)
+	fmt.Fprintln(&buf)
+
+	fmt.Fprintln(&buf, "var fastlog2Table = [1<<fastlogNumBits + 1]float64{")
+	table := computeTable()
+	for _, t := range table {
+		fmt.Fprintf(&buf, "\t%v,\n", t)
+	}
+	fmt.Fprintln(&buf, "}")
+
+	if err := ioutil.WriteFile("fastlog2table.go", buf.Bytes(), 0644); err != nil {
+		log.Fatalln(err)
+	}
+}
+
+const fastlogNumBits = 5
+
+func computeTable() []float64 {
+	fastlog2Table := make([]float64, 1<<fastlogNumBits+1)
+	for i := 0; i <= (1 << fastlogNumBits); i++ {
+		fastlog2Table[i] = math.Log2(1.0 + float64(i)/(1<<fastlogNumBits))
+	}
+	return fastlog2Table
+}
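
// A sketch of how a table like this is consumed: split the float64 into
// its exponent and the top fastlogNumBits of mantissa, look up the
// mantissa bits, and linearly interpolate between adjacent table
// entries. This standalone version is modeled on the runtime's
// fastlog2 and is illustrative only:

package main

import (
	"fmt"
	"math"
)

const fastlogNumBits = 5

var fastlog2Table [1<<fastlogNumBits + 1]float64

func init() {
	for i := range fastlog2Table {
		fastlog2Table[i] = math.Log2(1 + float64(i)/(1<<fastlogNumBits))
	}
}

func fastlog2(x float64) float64 {
	const scaleBits = 20
	const scaleRatio = 1.0 / (1 << scaleBits)

	bits := math.Float64bits(x)
	exp := int64(bits>>52&0x7FF) - 1023
	idx := bits >> (52 - fastlogNumBits) & (1<<fastlogNumBits - 1)
	frac := bits >> (52 - fastlogNumBits - scaleBits) & (1<<scaleBits - 1)

	low, high := fastlog2Table[idx], fastlog2Table[idx+1]
	return float64(exp) + low + (high-low)*float64(frac)*scaleRatio
}

func main() {
	fmt.Println(fastlog2(1536), math.Log2(1536)) // both ~10.585
}
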
diff --git a/src/runtime/mknacl.sh b/src/runtime/mknacl.sh
index 47fb7bd..0a74db1 100644
--- a/src/runtime/mknacl.sh
+++ b/src/runtime/mknacl.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2013 The Go Authors.  All rights reserved.
+# Copyright 2013 The Go Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style
 # license that can be found in the LICENSE file.
 
diff --git a/src/runtime/mmap.go b/src/runtime/mmap.go
new file mode 100644
index 0000000..53617e4
--- /dev/null
+++ b/src/runtime/mmap.go
@@ -0,0 +1,19 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9
+// +build !solaris
+// +build !windows
+// +build !nacl
+// +build !linux !amd64
+
+package runtime
+
+import "unsafe"
+
+// mmap calls the mmap system call. It is implemented in assembly.
+// We only pass the lower 32 bits of file offset to the
+// assembly routine; the higher bits (if required), should be provided
+// by the assembly routine as 0.
+func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index a618bd5..c3e4e2c 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -8,6 +8,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -250,23 +251,18 @@
 }
 
 // Called when freeing a profiled block.
-func mProf_Free(b *bucket, size uintptr, freed bool) {
+func mProf_Free(b *bucket, size uintptr) {
 	lock(&proflock)
 	mp := b.mp()
-	if freed {
-		mp.recent_frees++
-		mp.recent_free_bytes += size
-	} else {
-		mp.prev_frees++
-		mp.prev_free_bytes += size
-	}
+	mp.prev_frees++
+	mp.prev_free_bytes += size
 	unlock(&proflock)
 }
 
 var blockprofilerate uint64 // in CPU ticks
 
 // SetBlockProfileRate controls the fraction of goroutine blocking events
-// that are reported in the blocking profile.  The profiler aims to sample
+// that are reported in the blocking profile. The profiler aims to sample
 // an average of one blocking event per rate nanoseconds spent blocked.
 //
 // To include every blocking event in the profile, pass rate = 1.
@@ -285,14 +281,14 @@
 		}
 	}
 
-	atomicstore64(&blockprofilerate, uint64(r))
+	atomic.Store64(&blockprofilerate, uint64(r))
 }
 
 func blockevent(cycles int64, skip int) {
 	if cycles <= 0 {
 		cycles = 1
 	}
-	rate := int64(atomicload64(&blockprofilerate))
+	rate := int64(atomic.Load64(&blockprofilerate))
 	if rate <= 0 || (rate > cycles && int64(fastrand1())%rate > cycles) {
 		return
 	}
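
// The condition above samples an event of length cycles with
// probability roughly cycles/rate (and always when cycles >= rate), so
// over T total blocked cycles the expected number of samples is T/rate:
// one sample per rate cycles spent blocked, as the SetBlockProfileRate
// comment promises. A standalone simulation sketch of the same rule,
// using math/rand in place of the runtime's fastrand1:

package main

import (
	"fmt"
	"math/rand"
)

func sampled(cycles, rate int64) bool {
	return rate > 0 && !(rate > cycles && rand.Int63()%rate > cycles)
}

func main() {
	const rate, cycles, events = 1000, 100, 100000
	n := 0
	for i := 0; i < events; i++ {
		if sampled(cycles, rate) {
			n++
		}
	}
	fmt.Println(n) // expect roughly events*cycles/rate = 10000
}
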
@@ -339,7 +335,7 @@
 //
 // The tools that process the memory profiles assume that the
 // profile rate is constant across the lifetime of the program
-// and equal to the current value.  Programs that change the
+// and equal to the current value. Programs that change the
 // memory profiling rate should do so just once, as early as
 // possible in the execution of the program (for example,
 // at the beginning of main).
@@ -372,6 +368,9 @@
 	return r.Stack0[0:]
 }
 
+// MemProfile returns a profile of memory allocated and freed per allocation
+// site.
+//
 // MemProfile returns n, the number of records in the current memory profile.
 // If len(p) >= n, MemProfile copies the profile into p and returns n, true.
 // If len(p) < n, MemProfile does not change p and returns n, false.
@@ -381,6 +380,12 @@
 // These are sites where memory was allocated, but it has all
 // been released back to the runtime.
 //
+// The returned profile may be up to two garbage collection cycles old.
+// This is to avoid skewing the profile toward allocations; because
+// allocations happen in real time but frees are delayed until the garbage
+// collector performs sweeping, the profile only accounts for allocations
+// that have had a chance to be freed by the garbage collector.
+//
 // Most clients should use the runtime/pprof package or
 // the testing package's -test.memprofile flag instead
 // of calling MemProfile directly.
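
// The usual calling pattern for the raw API, growing the slice until
// the profile fits (most programs should use runtime/pprof instead, as
// noted above):

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var records []runtime.MemProfileRecord
	n, ok := runtime.MemProfile(nil, true)
	for !ok {
		// Allocate some headroom in case more allocation sites
		// appear between the two calls.
		records = make([]runtime.MemProfileRecord, n+50)
		n, ok = runtime.MemProfile(records, true)
	}
	fmt.Println("allocation sites:", n)
}
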
@@ -443,7 +448,7 @@
 	lock(&proflock)
 	for b := mbuckets; b != nil; b = b.allnext {
 		mp := b.mp()
-		fn(b, uintptr(b.nstk), &b.stk()[0], b.size, mp.allocs, mp.frees)
+		fn(b, b.nstk, &b.stk()[0], b.size, mp.allocs, mp.frees)
 	}
 	unlock(&proflock)
 }
@@ -473,8 +478,8 @@
 		for b := bbuckets; b != nil; b = b.allnext {
 			bp := b.bp()
 			r := &p[0]
-			r.Count = int64(bp.count)
-			r.Cycles = int64(bp.cycles)
+			r.Count = bp.count
+			r.Cycles = bp.cycles
 			i := copy(r.Stack0[:], b.stk())
 			for ; i < len(r.Stack0); i++ {
 				r.Stack0[i] = 0
@@ -493,7 +498,7 @@
 // Most clients should use the runtime/pprof package instead
 // of calling ThreadCreateProfile directly.
 func ThreadCreateProfile(p []StackRecord) (n int, ok bool) {
-	first := (*m)(atomicloadp(unsafe.Pointer(&allm)))
+	first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
 	for mp := first; mp != nil; mp = mp.alllink {
 		n++
 	}
@@ -501,9 +506,7 @@
 		ok = true
 		i := 0
 		for mp := first; mp != nil; mp = mp.alllink {
-			for s := range mp.createstack {
-				p[i].Stack0[s] = uintptr(mp.createstack[s])
-			}
+			p[i].Stack0 = mp.createstack
 			i++
 		}
 	}
@@ -517,34 +520,51 @@
 // Most clients should use the runtime/pprof package instead
 // of calling GoroutineProfile directly.
 func GoroutineProfile(p []StackRecord) (n int, ok bool) {
+	gp := getg()
 
-	n = NumGoroutine()
+	isOK := func(gp1 *g) bool {
+		// Checking isSystemGoroutine here makes GoroutineProfile
+		// consistent with both NumGoroutine and Stack.
+		return gp1 != gp && readgstatus(gp1) != _Gdead && !isSystemGoroutine(gp1)
+	}
+
+	stopTheWorld("profile")
+
+	n = 1
+	for _, gp1 := range allgs {
+		if isOK(gp1) {
+			n++
+		}
+	}
+
 	if n <= len(p) {
-		gp := getg()
-		stopTheWorld("profile")
+		ok = true
+		r := p
 
-		n = NumGoroutine()
-		if n <= len(p) {
-			ok = true
-			r := p
-			sp := getcallersp(unsafe.Pointer(&p))
-			pc := getcallerpc(unsafe.Pointer(&p))
-			systemstack(func() {
-				saveg(pc, sp, gp, &r[0])
-			})
-			r = r[1:]
-			for _, gp1 := range allgs {
-				if gp1 == gp || readgstatus(gp1) == _Gdead {
-					continue
+		// Save current goroutine.
+		sp := getcallersp(unsafe.Pointer(&p))
+		pc := getcallerpc(unsafe.Pointer(&p))
+		systemstack(func() {
+			saveg(pc, sp, gp, &r[0])
+		})
+		r = r[1:]
+
+		// Save other goroutines.
+		for _, gp1 := range allgs {
+			if isOK(gp1) {
+				if len(r) == 0 {
+					// Should be impossible, but better to return a
+					// truncated profile than to crash the entire process.
+					break
 				}
 				saveg(^uintptr(0), ^uintptr(0), gp1, &r[0])
 				r = r[1:]
 			}
 		}
-
-		startTheWorld()
 	}
 
+	startTheWorld()
+
 	return n, ok
 }
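
// The same grow-until-it-fits pattern applies to GoroutineProfile,
// since goroutines can be created between the sizing call and the
// snapshot:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	var stacks []runtime.StackRecord
	n, ok := runtime.GoroutineProfile(nil)
	for !ok {
		stacks = make([]runtime.StackRecord, n+10)
		n, ok = runtime.GoroutineProfile(stacks)
	}
	fmt.Println("goroutines:", n)
}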
 
@@ -571,12 +591,17 @@
 		pc := getcallerpc(unsafe.Pointer(&buf))
 		systemstack(func() {
 			g0 := getg()
+			// Force traceback=1 to override GOTRACEBACK setting,
+			// so that Stack's results are consistent.
+			// GOTRACEBACK is only about crash dumps.
+			g0.m.traceback = 1
 			g0.writebuf = buf[0:0:len(buf)]
 			goroutineheader(gp)
 			traceback(pc, sp, 0, gp)
 			if all {
 				tracebackothers(gp)
 			}
+			g0.m.traceback = 0
 			n = len(g0.writebuf)
 			g0.writebuf = nil
 		})
@@ -599,7 +624,7 @@
 	if typ == nil {
 		print("tracealloc(", p, ", ", hex(size), ")\n")
 	} else {
-		print("tracealloc(", p, ", ", hex(size), ", ", *typ._string, ")\n")
+		print("tracealloc(", p, ", ", hex(size), ", ", typ.string(), ")\n")
 	}
 	if gp.m.curg == nil || gp == gp.m.curg {
 		goroutineheader(gp)
diff --git a/src/runtime/msan.go b/src/runtime/msan.go
new file mode 100644
index 0000000..7177c8e
--- /dev/null
+++ b/src/runtime/msan.go
@@ -0,0 +1,55 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build msan
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+// Public memory sanitizer API.
+
+func MSanRead(addr unsafe.Pointer, len int) {
+	msanread(addr, uintptr(len))
+}
+
+func MSanWrite(addr unsafe.Pointer, len int) {
+	msanwrite(addr, uintptr(len))
+}
+
+// Private interface for the runtime.
+const msanenabled = true
+
+// If we are running on the system stack, the C program may have
+// marked part of that stack as uninitialized. We don't instrument
+// the runtime, but operations like a slice copy can call msanread
+// anyhow for values on the stack. Just ignore msanread when running
+// on the system stack. The other msan functions are fine.
+func msanread(addr unsafe.Pointer, sz uintptr) {
+	g := getg()
+	if g == g.m.g0 || g == g.m.gsignal {
+		return
+	}
+	domsanread(addr, sz)
+}
+
+//go:noescape
+func domsanread(addr unsafe.Pointer, sz uintptr)
+
+//go:noescape
+func msanwrite(addr unsafe.Pointer, sz uintptr)
+
+//go:noescape
+func msanmalloc(addr unsafe.Pointer, sz uintptr)
+
+//go:noescape
+func msanfree(addr unsafe.Pointer, sz uintptr)
+
+// These are called from msan_amd64.s
+//go:cgo_import_static __msan_read_go
+//go:cgo_import_static __msan_write_go
+//go:cgo_import_static __msan_malloc_go
+//go:cgo_import_static __msan_free_go
diff --git a/src/runtime/msan/msan.go b/src/runtime/msan/msan.go
new file mode 100644
index 0000000..b6ea3f0
--- /dev/null
+++ b/src/runtime/msan/msan.go
@@ -0,0 +1,32 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build msan,linux,amd64
+
+package msan
+
+/*
+#cgo CFLAGS: -fsanitize=memory
+#cgo LDFLAGS: -fsanitize=memory
+
+#include <stdint.h>
+#include <sanitizer/msan_interface.h>
+
+void __msan_read_go(void *addr, uintptr_t sz) {
+	__msan_check_mem_is_initialized(addr, sz);
+}
+
+void __msan_write_go(void *addr, uintptr_t sz) {
+	__msan_unpoison(addr, sz);
+}
+
+void __msan_malloc_go(void *addr, uintptr_t sz) {
+	__msan_unpoison(addr, sz);
+}
+
+void __msan_free_go(void *addr, uintptr_t sz) {
+	__msan_poison(addr, sz);
+}
+*/
+import "C"
diff --git a/src/runtime/msan0.go b/src/runtime/msan0.go
new file mode 100644
index 0000000..117c5e5
--- /dev/null
+++ b/src/runtime/msan0.go
@@ -0,0 +1,22 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !msan
+
+// Dummy MSan support API, used when not built with -msan.
+
+package runtime
+
+import (
+	"unsafe"
+)
+
+const msanenabled = false
+
+// Because msanenabled is false, none of these functions should be called.
+
+func msanread(addr unsafe.Pointer, sz uintptr)   { throw("msan") }
+func msanwrite(addr unsafe.Pointer, sz uintptr)  { throw("msan") }
+func msanmalloc(addr unsafe.Pointer, sz uintptr) { throw("msan") }
+func msanfree(addr unsafe.Pointer, sz uintptr)   { throw("msan") }
diff --git a/src/runtime/msan_amd64.s b/src/runtime/msan_amd64.s
new file mode 100644
index 0000000..9c59eec
--- /dev/null
+++ b/src/runtime/msan_amd64.s
@@ -0,0 +1,76 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build msan
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// This is like race_amd64.s, but for the msan calls.
+// See race_amd64.s for detailed comments.
+
+#ifdef GOOS_windows
+#define RARG0 CX
+#define RARG1 DX
+#define RARG2 R8
+#define RARG3 R9
+#else
+#define RARG0 DI
+#define RARG1 SI
+#define RARG2 DX
+#define RARG3 CX
+#endif
+
+// func runtime·domsanread(addr unsafe.Pointer, sz uintptr)
+// Called from msanread.
+TEXT	runtime·domsanread(SB), NOSPLIT, $0-16
+	MOVQ	addr+0(FP), RARG0
+	MOVQ	size+8(FP), RARG1
+	// void __msan_read_go(void *addr, uintptr_t sz);
+	MOVQ	$__msan_read_go(SB), AX
+	JMP	msancall<>(SB)
+
+// func runtime·msanwrite(addr unsafe.Pointer, sz uintptr)
+// Called from instrumented code.
+TEXT	runtime·msanwrite(SB), NOSPLIT, $0-16
+	MOVQ	addr+0(FP), RARG0
+	MOVQ	size+8(FP), RARG1
+	// void __msan_write_go(void *addr, uintptr_t sz);
+	MOVQ	$__msan_write_go(SB), AX
+	JMP	msancall<>(SB)
+
+// func runtime·msanmalloc(addr unsafe.Pointer, sz uintptr)
+TEXT	runtime·msanmalloc(SB), NOSPLIT, $0-16
+	MOVQ	addr+0(FP), RARG0
+	MOVQ	size+8(FP), RARG1
+	// void __msan_malloc_go(void *addr, uintptr_t sz);
+	MOVQ	$__msan_malloc_go(SB), AX
+	JMP	msancall<>(SB)
+
+// func runtime·msanfree(addr unsafe.Pointer, sz uintptr)
+TEXT	runtime·msanfree(SB), NOSPLIT, $0-16
+	MOVQ	addr+0(FP), RARG0
+	MOVQ	size+8(FP), RARG1
+	// void __msan_free_go(void *addr, uintptr_t sz);
+	MOVQ	$__msan_free_go(SB), AX
+	JMP	msancall<>(SB)
+
+// Switches SP to g0 stack and calls (AX). Arguments already set.
+TEXT	msancall<>(SB), NOSPLIT, $0-0
+	get_tls(R12)
+	MOVQ	g(R12), R14
+	MOVQ	g_m(R14), R13
+	// Switch to g0 stack.
+	MOVQ	SP, R12		// callee-saved, preserved across the CALL
+	MOVQ	m_g0(R13), R10
+	CMPQ	R10, R14
+	JE	call	// already on g0
+	MOVQ	(g_sched+gobuf_sp)(R10), SP
+call:
+	ANDQ	$~15, SP	// alignment for gcc ABI
+	CALL	AX
+	MOVQ	R12, SP
+	RET
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index bc735be..18577b3 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -13,7 +13,7 @@
 // and chopped up when new objects of the size class are needed.
 // That page count is chosen so that chopping up the run of
 // pages into objects of the given size wastes at most 12.5% (1.125x)
-// of the memory.  It is not necessary that the cutoff here be
+// of the memory. It is not necessary that the cutoff here be
 // the same as above.
 //
 // The two sources of waste multiply, so the worst possible case
@@ -27,7 +27,7 @@
 
 package runtime
 
-// Size classes.  Computed and initialized by InitSizes.
+// Size classes. Computed and initialized by InitSizes.
 //
 // SizeToClass(0 <= n <= MaxSmallSize) returns the size class,
 //	1 <= sizeclass < NumSizeClasses, for n.
@@ -55,7 +55,7 @@
 
 func sizeToClass(size int32) int32 {
 	if size > _MaxSmallSize {
-		throw("SizeToClass - invalid size")
+		throw("invalid size")
 	}
 	if size > 1024-8 {
 		return int32(size_to_class128[(size-1024+127)>>7])
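
// Worked example of the index arithmetic above: for size = 2000,
// (2000-1024+127)>>7 = 8, selecting the size_to_class128 entry whose
// class covers sizes up to 1024 + 8*128 = 2048 bytes.
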
@@ -79,7 +79,7 @@
 			}
 		}
 		if align&(align-1) != 0 {
-			throw("InitSizes - bug")
+			throw("incorrect alignment")
 		}
 
 		// Make the allocnpages big enough that
@@ -106,10 +106,18 @@
 		sizeclass++
 	}
 	if sizeclass != _NumSizeClasses {
-		print("sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
-		throw("InitSizes - bad NumSizeClasses")
+		print("runtime: sizeclass=", sizeclass, " NumSizeClasses=", _NumSizeClasses, "\n")
+		throw("bad NumSizeClasses")
 	}
-
+	// Check that no size class's objects-per-span count exceeds
+	// maxObjsPerSpan. The tightest case is the smallest class:
+	// 8-byte objects in a one-page (8 KB) span give 8192/8 = 1024 objects.
+	for i, size := range class_to_size {
+		if size != 0 && class_to_allocnpages[i]*pageSize/size > maxObjsPerSpan {
+			throw("span contains too many objects")
+		}
+		if size == 0 && i != 0 {
+			throw("size is 0 but class is not 0")
+		}
+	}
 	// Initialize the size_to_class tables.
 	nextsize := 0
 	for sizeclass = 1; sizeclass < _NumSizeClasses; sizeclass++ {
@@ -128,12 +136,12 @@
 		for n := int32(0); n < _MaxSmallSize; n++ {
 			sizeclass := sizeToClass(n)
 			if sizeclass < 1 || sizeclass >= _NumSizeClasses || class_to_size[sizeclass] < n {
-				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+				print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
 				print("incorrect SizeToClass\n")
 				goto dump
 			}
 			if sizeclass > 1 && class_to_size[sizeclass-1] >= n {
-				print("size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
+				print("runtime: size=", n, " sizeclass=", sizeclass, " runtime·class_to_size=", class_to_size[sizeclass], "\n")
 				print("SizeToClass too big\n")
 				goto dump
 			}
@@ -155,18 +163,18 @@
 
 dump:
 	if true {
-		print("NumSizeClasses=", _NumSizeClasses, "\n")
+		print("runtime: NumSizeClasses=", _NumSizeClasses, "\n")
 		print("runtime·class_to_size:")
 		for sizeclass = 0; sizeclass < _NumSizeClasses; sizeclass++ {
 			print(" ", class_to_size[sizeclass], "")
 		}
 		print("\n\n")
-		print("size_to_class8:")
+		print("runtime: size_to_class8:")
 		for i := 0; i < len(size_to_class8); i++ {
 			print(" ", i*8, "=>", size_to_class8[i], "(", class_to_size[size_to_class8[i]], ")\n")
 		}
 		print("\n")
-		print("size_to_class128:")
+		print("runtime: size_to_class128:")
 		for i := 0; i < len(size_to_class128); i++ {
 			print(" ", i*128, "=>", size_to_class128[i], "(", class_to_size[size_to_class128[i]], ")\n")
 		}
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index 08b82e0..2d75d2f 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -1,4 +1,4 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,7 +6,11 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Statistics.
 // If you edit this structure, also edit type MemStats below.
@@ -42,7 +46,7 @@
 
 	// Statistics about garbage collector.
 	// Protected by mheap or stopping the world during GC.
-	next_gc         uint64 // next gc (in heap_alloc time)
+	next_gc         uint64 // next gc (in heap_live time)
 	last_gc         uint64 // last gc (in absolute time)
 	pause_total_ns  uint64
 	pause_ns        [256]uint64 // circular buffer of recent gc pause lengths
@@ -66,13 +70,33 @@
 
 	// heap_live is the number of bytes considered live by the GC.
 	// That is: retained by the most recent GC plus allocated
-	// since then. heap_live <= heap_alloc, since heap_live
-	// excludes unmarked objects that have not yet been swept.
+	// since then. heap_live <= heap_alloc, since heap_alloc
+	// includes unmarked objects that have not yet been swept (and
+	// hence goes up as we allocate and down as we sweep) while
+	// heap_live excludes these objects (and hence only goes up
+	// between GCs).
+	//
+	// This is updated atomically without locking. To reduce
+	// contention, this is updated only when obtaining a span from
+	// an mcentral and at this point it counts all of the
+	// unallocated slots in that span (which will be allocated
+	// before that mcache obtains another span from that
+	// mcentral). Hence, it slightly overestimates the "true" live
+	// heap size. It's better to overestimate than to
+	// underestimate because 1) this triggers the GC earlier than
+	// necessary rather than potentially too late and 2) this
+	// leads to a conservative GC rate rather than a GC rate that
+	// is potentially too low.
+	//
+	// Whenever this is updated, call traceHeapAlloc() and
+	// gcController.revise().
 	heap_live uint64
 
 	// heap_scan is the number of bytes of "scannable" heap. This
 	// is the live heap (as counted by heap_live), but omitting
 	// no-scan objects and no-scan tails of objects.
+	//
+	// Whenever this is updated, call gcController.revise().
 	heap_scan uint64
 
 	// heap_marked is the number of bytes marked by the previous
@@ -141,7 +165,7 @@
 
 // Size of the trailing by_size array differs between Go and C,
 // and all data after by_size is local to runtime, not exported.
-// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
+// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
 // sizeof_C_MStats is what C thinks about size of Go struct.
 var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0])
 
@@ -168,7 +192,7 @@
 	updatememstats(nil)
 
 	// Size of the trailing by_size array differs between Go and C,
-	// NumSizeClasses was changed, but we can not change Go struct because of backward compatibility.
+	// NumSizeClasses was changed, but we cannot change the Go struct because of backward compatibility.
 	memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
 
 	// Stack numbers are part of the heap numbers, separate those out for user consumption
@@ -271,9 +295,9 @@
 			memstats.nmalloc++
 			memstats.alloc += uint64(s.elemsize)
 		} else {
-			memstats.nmalloc += uint64(s.ref)
-			memstats.by_size[s.sizeclass].nmalloc += uint64(s.ref)
-			memstats.alloc += uint64(s.ref) * uint64(s.elemsize)
+			memstats.nmalloc += uint64(s.allocCount)
+			memstats.by_size[s.sizeclass].nmalloc += uint64(s.allocCount)
+			memstats.alloc += uint64(s.allocCount) * uint64(s.elemsize)
 		}
 	}
 	unlock(&mheap_.lock)
@@ -285,13 +309,13 @@
 		memstats.nfree += mheap_.nsmallfree[i]
 		memstats.by_size[i].nfree = mheap_.nsmallfree[i]
 		memstats.by_size[i].nmalloc += mheap_.nsmallfree[i]
-		smallfree += uint64(mheap_.nsmallfree[i]) * uint64(class_to_size[i])
+		smallfree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
 	}
 	memstats.nfree += memstats.tinyallocs
 	memstats.nmalloc += memstats.nfree
 
 	// Calculate derived stats.
-	memstats.total_alloc = uint64(memstats.alloc) + uint64(mheap_.largefree) + smallfree
+	memstats.total_alloc = memstats.alloc + mheap_.largefree + smallfree
 	memstats.heap_alloc = memstats.alloc
 	memstats.heap_objects = memstats.nmalloc - memstats.nfree
 }
@@ -322,7 +346,7 @@
 		if c == nil {
 			continue
 		}
-		mCache_ReleaseAll(c)
+		c.releaseAll()
 		stackcache_clear(c)
 	}
 }
@@ -331,11 +355,6 @@
 func purgecachedstats(c *mcache) {
 	// Protected by either heap or GC lock.
 	h := &mheap_
-	memstats.heap_live += uint64(c.local_cachealloc)
-	c.local_cachealloc = 0
-	if trace.enabled {
-		traceHeapAlloc()
-	}
 	memstats.heap_scan += uint64(c.local_scan)
 	c.local_scan = 0
 	memstats.tinyallocs += uint64(c.local_tinyallocs)
@@ -352,7 +371,7 @@
 	}
 }
 
-// Atomically increases a given *system* memory stat.  We are counting on this
+// Atomically increases a given *system* memory stat. We are counting on this
 // stat never overflowing a uintptr, so this function must only be used for
 // system memory stats.
 //
@@ -366,25 +385,25 @@
 // overflow errors.
 //go:nosplit
 func mSysStatInc(sysStat *uint64, n uintptr) {
-	if _BigEndian != 0 {
-		xadd64(sysStat, int64(n))
+	if sys.BigEndian != 0 {
+		atomic.Xadd64(sysStat, int64(n))
 		return
 	}
-	if val := xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
+	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
 		print("runtime: stat overflow: val ", val, ", n ", n, "\n")
 		exit(2)
 	}
 }
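
// Why the endianness branch matters: on a 32-bit little-endian machine
// the *uintptr cast aliases the low-order half of the uint64 stat, so a
// 32-bit atomic add hits the right bytes (system stats are assumed to
// fit in 32 bits there). On a big-endian machine the same cast would
// alias the high-order half and corrupt the stat, hence the 64-bit
// fallback. Schematically, for the value 0x0000000012345678:
//
//	little-endian bytes: 78 56 34 12 00 00 00 00   <- cast sees 0x12345678
//	big-endian bytes:    00 00 00 00 12 34 56 78   <- cast sees 0x00000000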
 
-// Atomically decreases a given *system* memory stat.  Same comments as
+// Atomically decreases a given *system* memory stat. Same comments as
 // mSysStatInc apply.
 //go:nosplit
 func mSysStatDec(sysStat *uint64, n uintptr) {
-	if _BigEndian != 0 {
-		xadd64(sysStat, -int64(n))
+	if sys.BigEndian != 0 {
+		atomic.Xadd64(sysStat, -int64(n))
 		return
 	}
-	if val := xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
+	if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
 		print("runtime: stat underflow: val ", val, ", n ", n, "\n")
 		exit(2)
 	}
diff --git a/src/runtime/mstkbar.go b/src/runtime/mstkbar.go
new file mode 100644
index 0000000..1bf9d57
--- /dev/null
+++ b/src/runtime/mstkbar.go
@@ -0,0 +1,389 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Garbage collector: stack barriers
+//
+// Stack barriers enable the garbage collector to determine how much
+// of a goroutine stack has changed between when a stack is scanned
+// during the concurrent scan phase and when it is re-scanned during
+// the stop-the-world mark termination phase. Mark termination only
+// needs to re-scan the changed part, so for deep stacks this can
+// significantly reduce GC pause time compared to the alternative of
+// re-scanning whole stacks. The deeper the stacks, the more stack
+// barriers help.
+//
+// When stacks are scanned during the concurrent scan phase, the stack
+// scan installs stack barriers by selecting stack frames and
+// overwriting the saved return PCs (or link registers) of these
+// frames with the PC of a "stack barrier trampoline". Later, when a
+// selected frame returns, it "returns" to this trampoline instead of
+// returning to its actual caller. The trampoline records that the
+// stack has unwound past this frame and jumps to the original return
+// PC recorded when the stack barrier was installed. Mark termination
+// re-scans only as far as the first frame that hasn't hit a stack
+// barrier and then removes any un-hit stack barriers.
+//
+// This scheme is very lightweight. No special code is required in the
+// mutator to record stack unwinding and the trampoline is only a few
+// assembly instructions.
+//
+// Book-keeping
+// ------------
+//
+// The primary cost of stack barriers is book-keeping: the runtime has
+// to record the locations of all stack barriers and the original
+// return PCs in order to return to the correct caller when a stack
+// barrier is hit and so it can remove un-hit stack barriers. In order
+// to minimize this cost, the Go runtime places stack barriers in
+// exponentially-spaced frames, starting 1K past the current frame.
+// The book-keeping structure hence grows logarithmically with the
+// size of the stack and mark termination re-scans at most twice as
+// much stack as necessary.
+//
+// The runtime reserves space for this book-keeping structure at the
+// top of the stack allocation itself (just above the outermost
+// frame). This is necessary because the regular memory allocator can
+// itself grow the stack, and hence can't be used when allocating
+// stack-related structures.
+//
+// For debugging, the runtime also supports installing stack barriers
+// at every frame. However, this requires significantly more
+// book-keeping space.
+//
+// Correctness
+// -----------
+//
+// The runtime and the compiler cooperate to ensure that all objects
+// reachable from the stack as of mark termination are marked.
+// Anything unchanged since the concurrent scan phase will be marked
+// because it is marked by the concurrent scan. After the concurrent
+// scan, there are three possible classes of stack modifications that
+// must be tracked:
+//
+// 1) Mutator writes below the lowest un-hit stack barrier. This
+// includes all writes performed by an executing function to its own
+// stack frame. This part of the stack will be re-scanned by mark
+// termination, which will mark any objects made reachable from
+// modifications to this part of the stack.
+//
+// 2) Mutator writes above the lowest un-hit stack barrier. It's
+// possible for a mutator to modify the stack above the lowest un-hit
+// stack barrier if a higher frame has passed down a pointer to a
+// stack variable in its frame. This is called an "up-pointer". The
+// compiler ensures that writes through up-pointers have an
+// accompanying write barrier (it simply doesn't distinguish between
+// writes through up-pointers and writes through heap pointers). This
+// write barrier marks any object made reachable from modifications to
+// this part of the stack.
+//
+// 3) Runtime writes to the stack. Various runtime operations such as
+// sends to unbuffered channels can write to arbitrary parts of the
+// stack, including above the lowest un-hit stack barrier. We solve
+// this in two ways. In many cases, the runtime can perform an
+// explicit write barrier operation like in case 2. However, in the
+// case of bulk memory move (typedmemmove), the runtime doesn't
+// necessarily have ready access to a pointer bitmap for the memory
+// being copied, so it simply unwinds any stack barriers below the
+// destination.
+//
+// Gotchas
+// -------
+//
+// Anything that inspects or manipulates the stack potentially needs
+// to understand stack barriers. The most obvious case is that
+// gentraceback needs to use the original return PC when it encounters
+// the stack barrier trampoline. Anything that unwinds the stack such
+// as panic/recover must unwind stack barriers in tandem with
+// unwinding the stack.
+//
+// Stack barriers require that any goroutine whose stack has been
+// scanned must execute write barriers. Go solves this by simply
+// enabling write barriers globally during the concurrent scan phase.
+// Traditional collectors, by contrast, do not enable write barriers
+// during this phase.
+//
+// Synchronization
+// ---------------
+//
+// For the most part, accessing and modifying stack barriers is
+// synchronized around GC safe points. Installing stack barriers
+// forces the G to a safe point, while all other operations that
+// modify stack barriers run on the G and prevent it from reaching a
+// safe point.
+//
+// Subtlety arises when a G may be tracebacked when *not* at a safe
+// point. This happens during sigprof. For this, each G has a "stack
+// barrier lock" (see gcLockStackBarriers, gcUnlockStackBarriers).
+// Operations that manipulate stack barriers acquire this lock, while
+// sigprof tries to acquire it and simply skips the traceback if it
+// can't acquire it. There is one exception for performance and
+// complexity reasons: hitting a stack barrier manipulates the stack
+// barrier list without acquiring the stack barrier lock. For this,
+// gentraceback performs a special fix up if the traceback starts in
+// the stack barrier function.
+
+package runtime
+
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+const debugStackBarrier = false
+
+// firstStackBarrierOffset is the approximate byte offset at
+// which to place the first stack barrier from the current SP.
+// This is a lower bound on how much stack will have to be
+// re-scanned during mark termination. Subsequent barriers are
+// placed at firstStackBarrierOffset * 2^n offsets.
+//
+// For debugging, this can be set to 0, which will install a
+// stack barrier at every frame. If you do this, you may also
+// have to raise _StackMin, since the stack barrier
+// bookkeeping will use a large amount of each stack.
+var firstStackBarrierOffset = 1024
+
+// gcMaxStackBarriers returns the maximum number of stack barriers
+// that can be installed in a stack of stackSize bytes.
+func gcMaxStackBarriers(stackSize int) (n int) {
+	if firstStackBarrierOffset == 0 {
+		// Special debugging case for inserting stack barriers
+		// at every frame. Steal half of the stack for the
+		// []stkbar. Technically, if the stack were to consist
+		// solely of return PCs we would need two thirds of
+		// the stack, but stealing that much breaks things and
+		// this doesn't happen in practice.
+		return stackSize / 2 / int(unsafe.Sizeof(stkbar{}))
+	}
+
+	offset := firstStackBarrierOffset
+	for offset < stackSize {
+		n++
+		offset *= 2
+	}
+	return n + 1
+}
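
// A worked example of the count above, assuming the default
// firstStackBarrierOffset of 1024: for stackSize = 8192 the loop visits
// offsets 1024, 2048, and 4096 (n = 3), and the +1 leaves room for one
// more barrier, so gcMaxStackBarriers(8192) == 4.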
+
+// gcInstallStackBarrier installs a stack barrier over the return PC of frame.
+//go:nowritebarrier
+func gcInstallStackBarrier(gp *g, frame *stkframe) bool {
+	if frame.lr == 0 {
+		if debugStackBarrier {
+			print("not installing stack barrier with no LR, goid=", gp.goid, "\n")
+		}
+		return false
+	}
+
+	if frame.fn.entry == cgocallback_gofuncPC {
+		// cgocallback_gofunc doesn't return to its LR;
+		// instead, its return path puts LR in g.sched.pc and
+		// switches back to the system stack on which
+		// cgocallback_gofunc was originally called. We can't
+		// have a stack barrier in g.sched.pc, so don't
+		// install one in this frame.
+		if debugStackBarrier {
+			print("not installing stack barrier over LR of cgocallback_gofunc, goid=", gp.goid, "\n")
+		}
+		return false
+	}
+
+	// Save the return PC and overwrite it with stackBarrier.
+	var lrUintptr uintptr
+	if usesLR {
+		lrUintptr = frame.sp
+	} else {
+		lrUintptr = frame.fp - sys.RegSize
+	}
+	lrPtr := (*sys.Uintreg)(unsafe.Pointer(lrUintptr))
+	if debugStackBarrier {
+		print("install stack barrier at ", hex(lrUintptr), " over ", hex(*lrPtr), ", goid=", gp.goid, "\n")
+		if uintptr(*lrPtr) != frame.lr {
+			print("frame.lr=", hex(frame.lr))
+			throw("frame.lr differs from stack LR")
+		}
+	}
+
+	gp.stkbar = gp.stkbar[:len(gp.stkbar)+1]
+	stkbar := &gp.stkbar[len(gp.stkbar)-1]
+	stkbar.savedLRPtr = lrUintptr
+	stkbar.savedLRVal = uintptr(*lrPtr)
+	*lrPtr = sys.Uintreg(stackBarrierPC)
+	return true
+}
+
+// gcRemoveStackBarriers removes all stack barriers installed in gp's stack.
+//
+// gp's stack barriers must be locked.
+//
+//go:nowritebarrier
+func gcRemoveStackBarriers(gp *g) {
+	if debugStackBarrier && gp.stkbarPos != 0 {
+		print("hit ", gp.stkbarPos, " stack barriers, goid=", gp.goid, "\n")
+	}
+
+	// Remove stack barriers that we didn't hit.
+	for _, stkbar := range gp.stkbar[gp.stkbarPos:] {
+		gcRemoveStackBarrier(gp, stkbar)
+	}
+
+	// Clear recorded stack barriers so copystack doesn't try to
+	// adjust them.
+	gp.stkbarPos = 0
+	gp.stkbar = gp.stkbar[:0]
+}
+
+// gcRemoveStackBarrier removes a single stack barrier. It is the
+// inverse operation of gcInstallStackBarrier.
+//
+// This is nosplit to ensure gp's stack does not move.
+//
+//go:nowritebarrier
+//go:nosplit
+func gcRemoveStackBarrier(gp *g, stkbar stkbar) {
+	if debugStackBarrier {
+		print("remove stack barrier at ", hex(stkbar.savedLRPtr), " with ", hex(stkbar.savedLRVal), ", goid=", gp.goid, "\n")
+	}
+	lrPtr := (*sys.Uintreg)(unsafe.Pointer(stkbar.savedLRPtr))
+	if val := *lrPtr; val != sys.Uintreg(stackBarrierPC) {
+		printlock()
+		print("at *", hex(stkbar.savedLRPtr), " expected stack barrier PC ", hex(stackBarrierPC), ", found ", hex(val), ", goid=", gp.goid, "\n")
+		print("gp.stkbar=")
+		gcPrintStkbars(gp, -1)
+		print(", gp.stack=[", hex(gp.stack.lo), ",", hex(gp.stack.hi), ")\n")
+		throw("stack barrier lost")
+	}
+	*lrPtr = sys.Uintreg(stkbar.savedLRVal)
+}
+
+// gcTryRemoveAllStackBarriers tries to remove stack barriers from all
+// Gs in gps. It is best-effort and efficient. If it can't remove
+// barriers from a G immediately, it will simply skip it.
+func gcTryRemoveAllStackBarriers(gps []*g) {
+	for _, gp := range gps {
+	retry:
+		for {
+			switch s := readgstatus(gp); s {
+			default:
+				break retry
+
+			case _Grunnable, _Gsyscall, _Gwaiting:
+				if !castogscanstatus(gp, s, s|_Gscan) {
+					continue
+				}
+				gcLockStackBarriers(gp)
+				gcRemoveStackBarriers(gp)
+				gcUnlockStackBarriers(gp)
+				restartg(gp)
+				break retry
+			}
+		}
+	}
+}
+
+// gcPrintStkbars prints the stack barriers of gp for debugging. It
+// places a "@@@" marker at gp.stkbarPos. If marker >= 0, it will also
+// place a "==>" marker before the marker'th entry.
+func gcPrintStkbars(gp *g, marker int) {
+	print("[")
+	for i, s := range gp.stkbar {
+		if i > 0 {
+			print(" ")
+		}
+		if i == int(gp.stkbarPos) {
+			print("@@@ ")
+		}
+		if i == marker {
+			print("==> ")
+		}
+		print("*", hex(s.savedLRPtr), "=", hex(s.savedLRVal))
+	}
+	if int(gp.stkbarPos) == len(gp.stkbar) {
+		print(" @@@")
+	}
+	if marker == len(gp.stkbar) {
+		print(" ==>")
+	}
+	print("]")
+}
+
+// gcUnwindBarriers marks all stack barriers up the frame containing
+// sp as hit and removes them. This is used during stack unwinding for
+// panic/recover and by heapBitsBulkBarrier to force stack re-scanning
+// when its destination is on the stack.
+//
+// This is nosplit to ensure gp's stack does not move.
+//
+//go:nosplit
+func gcUnwindBarriers(gp *g, sp uintptr) {
+	gcLockStackBarriers(gp)
+	// On LR machines, if there is a stack barrier on the return
+	// from the frame containing sp, this will mark it as hit even
+	// though it isn't, but it's okay to be conservative.
+	before := gp.stkbarPos
+	for int(gp.stkbarPos) < len(gp.stkbar) && gp.stkbar[gp.stkbarPos].savedLRPtr < sp {
+		gcRemoveStackBarrier(gp, gp.stkbar[gp.stkbarPos])
+		gp.stkbarPos++
+	}
+	gcUnlockStackBarriers(gp)
+	if debugStackBarrier && gp.stkbarPos != before {
+		print("skip barriers below ", hex(sp), " in goid=", gp.goid, ": ")
+		// We skipped barriers between the "==>" marker
+		// (before) and the "@@@" marker (gp.stkbarPos).
+		gcPrintStkbars(gp, int(before))
+		print("\n")
+	}
+}
+
+// nextBarrierPC returns the original return PC of the next stack barrier.
+// Used by getcallerpc, so it must be nosplit.
+//go:nosplit
+func nextBarrierPC() uintptr {
+	gp := getg()
+	return gp.stkbar[gp.stkbarPos].savedLRVal
+}
+
+// setNextBarrierPC sets the return PC of the next stack barrier.
+// Used by setcallerpc, so it must be nosplit.
+//go:nosplit
+func setNextBarrierPC(pc uintptr) {
+	gp := getg()
+	gcLockStackBarriers(gp)
+	gp.stkbar[gp.stkbarPos].savedLRVal = pc
+	gcUnlockStackBarriers(gp)
+}
+
+// gcLockStackBarriers synchronizes with tracebacks of gp's stack
+// during sigprof for installation or removal of stack barriers. It
+// blocks until any current sigprof is done tracebacking gp's stack
+// and then disallows profiling tracebacks of gp's stack.
+//
+// This is necessary because a sigprof during barrier installation or
+// removal could observe inconsistencies between the stkbar array and
+// the stack itself and crash.
+//
+//go:nosplit
+func gcLockStackBarriers(gp *g) {
+	// Disable preemption so scanstack cannot run while the caller
+	// is manipulating the stack barriers.
+	acquirem()
+	for !atomic.Cas(&gp.stackLock, 0, 1) {
+		osyield()
+	}
+}
+
+//go:nosplit
+func gcTryLockStackBarriers(gp *g) bool {
+	mp := acquirem()
+	result := atomic.Cas(&gp.stackLock, 0, 1)
+	if !result {
+		releasem(mp)
+	}
+	return result
+}
+
+func gcUnlockStackBarriers(gp *g) {
+	atomic.Store(&gp.stackLock, 0)
+	releasem(getg().m)
+}
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 7c6e3fa..2ef248d 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
 // Integrated network poller (platform-independent part).
 // A particular implementation (epoll/kqueue) must define the following functions:
@@ -77,11 +80,11 @@
 //go:linkname net_runtime_pollServerInit net.runtime_pollServerInit
 func net_runtime_pollServerInit() {
 	netpollinit()
-	atomicstore(&netpollInited, 1)
+	atomic.Store(&netpollInited, 1)
 }
 
 func netpollinited() bool {
-	return atomicload(&netpollInited) != 0
+	return atomic.Load(&netpollInited) != 0
 }
 
 //go:linkname net_runtime_pollOpen net.runtime_pollOpen
@@ -119,7 +122,7 @@
 	if pd.rg != 0 && pd.rg != pdReady {
 		throw("netpollClose: blocked read on closing descriptor")
 	}
-	netpollclose(uintptr(pd.fd))
+	netpollclose(pd.fd)
 	pollcache.free(pd)
 }
 
@@ -305,7 +308,7 @@
 }
 
 func netpollblockcommit(gp *g, gpp unsafe.Pointer) bool {
-	return casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
+	return atomic.Casuintptr((*uintptr)(gpp), pdWait, uintptr(unsafe.Pointer(gp)))
 }
 
 // returns true if IO is ready, or false if timedout or closed
@@ -326,7 +329,7 @@
 		if old != 0 {
 			throw("netpollblock: double wait")
 		}
-		if casuintptr(gpp, 0, pdWait) {
+		if atomic.Casuintptr(gpp, 0, pdWait) {
 			break
 		}
 	}
@@ -338,7 +341,7 @@
 		gopark(netpollblockcommit, unsafe.Pointer(gpp), "IO wait", traceEvGoBlockNet, 5)
 	}
 	// be careful to not lose concurrent READY notification
-	old := xchguintptr(gpp, 0)
+	old := atomic.Xchguintptr(gpp, 0)
 	if old > pdWait {
 		throw("netpollblock: corrupted state")
 	}
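
// The rg/wg word that gpp points at is a small lock-free state machine
// holding one of: 0 (idle), pdReady, pdWait, or a parked goroutine
// pointer G. A simplified sketch of its transitions:
//
//	0      --CAS-->  pdWait    netpollblock: about to park
//	pdWait --CAS-->  G         netpollblockcommit: goroutine parked
//	0      --CAS-->  pdReady   netpollunblock: I/O ready, no waiter yet
//	G      --CAS-->  pdReady   netpollunblock: I/O ready, wake G
//	any    --Xchg->  0         netpollblock: consume the state on wakeup
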
@@ -365,7 +368,7 @@
 		if ioready {
 			new = pdReady
 		}
-		if casuintptr(gpp, old, new) {
+		if atomic.Casuintptr(gpp, old, new) {
 			if old == pdReady || old == pdWait {
 				old = 0
 			}
diff --git a/src/runtime/netpoll_epoll.go b/src/runtime/netpoll_epoll.go
index 7b4052a..e06eff8 100644
--- a/src/runtime/netpoll_epoll.go
+++ b/src/runtime/netpoll_epoll.go
@@ -19,8 +19,7 @@
 func closeonexec(fd int32)
 
 var (
-	epfd           int32 = -1 // epoll descriptor
-	netpolllasterr int32
+	epfd int32 = -1 // epoll descriptor
 )
 
 func netpollinit() {
@@ -67,9 +66,9 @@
 retry:
 	n := epollwait(epfd, &events[0], int32(len(events)), waitms)
 	if n < 0 {
-		if n != -_EINTR && n != netpolllasterr {
-			netpolllasterr = n
+		if n != -_EINTR {
 			println("runtime: epollwait on fd", epfd, "failed with", -n)
+			throw("epollwait failed")
 		}
 		goto retry
 	}
diff --git a/src/runtime/netpoll_kqueue.go b/src/runtime/netpoll_kqueue.go
index 01445dc..337377a 100644
--- a/src/runtime/netpoll_kqueue.go
+++ b/src/runtime/netpoll_kqueue.go
@@ -17,8 +17,7 @@
 func closeonexec(fd int32)
 
 var (
-	kq             int32 = -1
-	netpolllasterr int32
+	kq int32 = -1
 )
 
 func netpollinit() {
@@ -32,7 +31,7 @@
 
 func netpollopen(fd uintptr, pd *pollDesc) int32 {
 	// Arm both EVFILT_READ and EVFILT_WRITE in edge-triggered mode (EV_CLEAR)
-	// for the whole fd lifetime.  The notifications are automatically unregistered
+	// for the whole fd lifetime. The notifications are automatically unregistered
 	// when fd is closed.
 	var ev [2]keventt
 	*(*uintptr)(unsafe.Pointer(&ev[0].ident)) = fd
@@ -75,9 +74,9 @@
 retry:
 	n := kevent(kq, nil, 0, &events[0], int32(len(events)), tp)
 	if n < 0 {
-		if n != -_EINTR && n != netpolllasterr {
-			netpolllasterr = n
+		if n != -_EINTR {
 			println("runtime: kevent on fd", kq, "failed with", -n)
+			throw("kevent failed")
 		}
 		goto retry
 	}
diff --git a/src/runtime/netpoll_solaris.go b/src/runtime/netpoll_solaris.go
index e4652d8..53b2aac 100644
--- a/src/runtime/netpoll_solaris.go
+++ b/src/runtime/netpoll_solaris.go
@@ -58,7 +58,7 @@
 //
 // The open and arming mechanisms are serialized using the lock
 // inside PollDesc. This is required because the netpoll loop runs
-// asynchonously in respect to other Go code and by the time we get
+// asynchronously in respect to other Go code and by the time we get
 // to call port_associate to update the association in the loop, the
 // file descriptor might have been closed and reopened already. The
 // lock allows runtime·netpollupdate to be called synchronously from
@@ -125,7 +125,7 @@
 	lock(&pd.lock)
 	// We don't register for any specific type of events yet, that's
 	// netpollarm's job. We merely ensure we call port_associate before
-	// asynchonous connect/accept completes, so when we actually want
+	// asynchronous connect/accept completes, so when we actually want
 	// to do any I/O, the call to port_associate (from netpollarm,
 	// with the interested event set) will unblock port_getn right away
 	// because of the I/O readiness notification.
@@ -174,9 +174,6 @@
 	unlock(&pd.lock)
 }
 
-// netpolllasterr holds the last error code returned by port_getn to prevent log spamming
-var netpolllasterr int32
-
 // polls for ready network connections
 // returns list of goroutines that become runnable
 func netpoll(block bool) *g {
@@ -194,9 +191,9 @@
 retry:
 	var n uint32 = 1
 	if port_getn(portfd, &events[0], uint32(len(events)), &n, wait) < 0 {
-		if e := errno(); e != _EINTR && e != netpolllasterr {
-			netpolllasterr = e
+		if e := errno(); e != _EINTR {
 			print("runtime: port_getn on fd ", portfd, " failed with ", e, "\n")
+			throw("port_getn failed")
 		}
 		goto retry
 	}
diff --git a/src/runtime/netpoll_windows.go b/src/runtime/netpoll_windows.go
index 7e15cd2..7ad1158 100644
--- a/src/runtime/netpoll_windows.go
+++ b/src/runtime/netpoll_windows.go
@@ -33,7 +33,7 @@
 var iocphandle uintptr = _INVALID_HANDLE_VALUE // completion port io handle
 
 func netpollinit() {
-	iocphandle = uintptr(stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX))
+	iocphandle = stdcall4(_CreateIoCompletionPort, _INVALID_HANDLE_VALUE, 0, 0, _DWORD_MAX)
 	if iocphandle == 0 {
 		println("netpoll: failed to create iocp handle (errno=", getlasterror(), ")")
 		throw("netpoll: failed to create iocp handle")
diff --git a/src/runtime/noasm.go b/src/runtime/noasm.go
index 9a6dbee..0a8f9e6 100644
--- a/src/runtime/noasm.go
+++ b/src/runtime/noasm.go
@@ -2,9 +2,9 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Routines that are implemented in assembly in asm_{amd64,386,arm,arm64}.s
+// Routines that are implemented in assembly in asm_{amd64,386,arm,arm64,ppc64x,s390x}.s
 
-// +build ppc64 ppc64le
+// +build mips64 mips64le
 
 package runtime
 
@@ -39,6 +39,9 @@
 	if len(s2) < l {
 		l = len(s2)
 	}
+	if l == 0 || &s1[0] == &s2[0] {
+		goto samebytes
+	}
 	for i := 0; i < l; i++ {
 		c1, c2 := s1[i], s2[i]
 		if c1 < c2 {
@@ -48,6 +51,7 @@
 			return +1
 		}
 	}
+samebytes:
 	if len(s1) < len(s2) {
 		return -1
 	}
diff --git a/src/runtime/norace_linux_test.go b/src/runtime/norace_linux_test.go
index bbf9d0b..3517a5d 100644
--- a/src/runtime/norace_linux_test.go
+++ b/src/runtime/norace_linux_test.go
@@ -1,8 +1,8 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// The file contains tests that can not run under race detector for some reason.
+// The file contains tests that cannot run under race detector for some reason.
 // +build !race
 
 package runtime_test
diff --git a/src/runtime/norace_test.go b/src/runtime/norace_test.go
index 3681bf1..e9b39b2 100644
--- a/src/runtime/norace_test.go
+++ b/src/runtime/norace_test.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// The file contains tests that can not run under race detector for some reason.
+// The file contains tests that cannot run under race detector for some reason.
 // +build !race
 
 package runtime_test
diff --git a/src/runtime/os1_darwin.go b/src/runtime/os1_darwin.go
deleted file mode 100644
index e070229..0000000
--- a/src/runtime/os1_darwin.go
+++ /dev/null
@@ -1,481 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-//extern SigTabTT runtime·sigtab[];
-
-var sigset_all = ^uint32(0)
-
-func unimplemented(name string) {
-	println(name, "not implemented")
-	*(*int)(unsafe.Pointer(uintptr(1231))) = 1231
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	mach_semrelease(uint32(mp.waitsema))
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	var x uintptr
-	systemstack(func() {
-		x = uintptr(mach_semcreate())
-	})
-	return x
-}
-
-// BSD interface for threading.
-func osinit() {
-	// bsdthread_register delayed until end of goenvs so that we
-	// can look at the environment first.
-
-	ncpu = getncpu()
-}
-
-func getncpu() int32 {
-	// Use sysctl to fetch hw.ncpu.
-	mib := [2]uint32{6, 3}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 && int32(out) > 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-
-	// Register our thread-creation callback (see sys_darwin_{amd64,386}.s)
-	// but only if we're not using cgo.  If we are using cgo we need
-	// to let the C pthread library install its own thread-creation callback.
-	if !iscgo {
-		if bsdthread_register() != 0 {
-			if gogetenv("DYLD_INSERT_LIBRARIES") != "" {
-				throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)")
-			}
-			throw("runtime: bsdthread_register error")
-		}
-	}
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, "/", int(mp.tls[0]), " ostk=", &mp, "\n")
-	}
-
-	var oset uint32
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	errno := bsdthread_create(stk, unsafe.Pointer(mp), funcPC(mstart))
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-
-	if errno < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -errno, ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-// newosproc0 is a version of newosproc that can be called before the runtime
-// is initialized.
-//
-// As Go uses bsdthread_register when running without cgo, this function is
-// not safe to use after initialization as it does not pass an M as fnarg.
-//
-//go:nosplit
-func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) {
-	stack := sysAlloc(stacksize, &memstats.stacks_sys)
-	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
-		exit(1)
-	}
-	stk := unsafe.Pointer(uintptr(stack) + stacksize)
-
-	var oset uint32
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	errno := bsdthread_create(stk, fn, fnarg)
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-
-	if errno < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
-		exit(1)
-	}
-}
-
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024) // OS X wants >= 8K
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*uint32)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	sigprocmask(_SIG_SETMASK, nil, smask)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	// Initialize signal handling.
-	_g_ := getg()
-	signalstack(&_g_.m.gsignal.stack)
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask &^= 1 << (uint32(i) - 1)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := (*uint32)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask, nil)
-	signalstack(nil)
-}
-
-// Mach IPC, to get at semaphores
-// Definitions are in /usr/include/mach on a Mac.
-
-func macherror(r int32, fn string) {
-	print("mach error ", fn, ": ", r, "\n")
-	throw("mach error")
-}
-
-const _DebugMach = false
-
-var zerondr machndr
-
-func mach_msgh_bits(a, b uint32) uint32 {
-	return a | b<<8
-}
-
-func mach_msg(h *machheader, op int32, send_size, rcv_size, rcv_name, timeout, notify uint32) int32 {
-	// TODO: Loop on interrupt.
-	return mach_msg_trap(unsafe.Pointer(h), op, send_size, rcv_size, rcv_name, timeout, notify)
-}
-
-// Mach RPC (MIG)
-const (
-	_MinMachMsg = 48
-	_MachReply  = 100
-)
-
-type codemsg struct {
-	h    machheader
-	ndr  machndr
-	code int32
-}
-
-func machcall(h *machheader, maxsize int32, rxsize int32) int32 {
-	_g_ := getg()
-	port := _g_.m.machport
-	if port == 0 {
-		port = mach_reply_port()
-		_g_.m.machport = port
-	}
-
-	h.msgh_bits |= mach_msgh_bits(_MACH_MSG_TYPE_COPY_SEND, _MACH_MSG_TYPE_MAKE_SEND_ONCE)
-	h.msgh_local_port = port
-	h.msgh_reserved = 0
-	id := h.msgh_id
-
-	if _DebugMach {
-		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
-		print("send:\t")
-		var i uint32
-		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
-			print(" ", p[i])
-			if i%8 == 7 {
-				print("\n\t")
-			}
-		}
-		if i%8 != 0 {
-			print("\n")
-		}
-	}
-	ret := mach_msg(h, _MACH_SEND_MSG|_MACH_RCV_MSG, h.msgh_size, uint32(maxsize), port, 0, 0)
-	if ret != 0 {
-		if _DebugMach {
-			print("mach_msg error ", ret, "\n")
-		}
-		return ret
-	}
-	if _DebugMach {
-		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
-		var i uint32
-		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
-			print(" ", p[i])
-			if i%8 == 7 {
-				print("\n\t")
-			}
-		}
-		if i%8 != 0 {
-			print("\n")
-		}
-	}
-	if h.msgh_id != id+_MachReply {
-		if _DebugMach {
-			print("mach_msg _MachReply id mismatch ", h.msgh_id, " != ", id+_MachReply, "\n")
-		}
-		return -303 // MIG_REPLY_MISMATCH
-	}
-	// Look for a response giving the return value.
-	// Any call can send this back with an error,
-	// and some calls only have return values so they
-	// send it back on success too.  I don't quite see how
-	// you know it's one of these and not the full response
-	// format, so just look if the message is right.
-	c := (*codemsg)(unsafe.Pointer(h))
-	if uintptr(h.msgh_size) == unsafe.Sizeof(*c) && h.msgh_bits&_MACH_MSGH_BITS_COMPLEX == 0 {
-		if _DebugMach {
-			print("mig result ", c.code, "\n")
-		}
-		return c.code
-	}
-	if h.msgh_size != uint32(rxsize) {
-		if _DebugMach {
-			print("mach_msg _MachReply size mismatch ", h.msgh_size, " != ", rxsize, "\n")
-		}
-		return -307 // MIG_ARRAY_TOO_LARGE
-	}
-	return 0
-}
-
-// Semaphores!
-
-const (
-	tmach_semcreate = 3418
-	rmach_semcreate = tmach_semcreate + _MachReply
-
-	tmach_semdestroy = 3419
-	rmach_semdestroy = tmach_semdestroy + _MachReply
-
-	_KERN_ABORTED             = 14
-	_KERN_OPERATION_TIMED_OUT = 49
-)
-
-type tmach_semcreatemsg struct {
-	h      machheader
-	ndr    machndr
-	policy int32
-	value  int32
-}
-
-type rmach_semcreatemsg struct {
-	h         machheader
-	body      machbody
-	semaphore machport
-}
-
-type tmach_semdestroymsg struct {
-	h         machheader
-	body      machbody
-	semaphore machport
-}
-
-func mach_semcreate() uint32 {
-	var m [256]uint8
-	tx := (*tmach_semcreatemsg)(unsafe.Pointer(&m))
-	rx := (*rmach_semcreatemsg)(unsafe.Pointer(&m))
-
-	tx.h.msgh_bits = 0
-	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
-	tx.h.msgh_remote_port = mach_task_self()
-	tx.h.msgh_id = tmach_semcreate
-	tx.ndr = zerondr
-
-	tx.policy = 0 // 0 = SYNC_POLICY_FIFO
-	tx.value = 0
-
-	for {
-		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), int32(unsafe.Sizeof(*rx)))
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_create")
-	}
-	if rx.body.msgh_descriptor_count != 1 {
-		unimplemented("mach_semcreate desc count")
-	}
-	return rx.semaphore.name
-}
-
-func mach_semdestroy(sem uint32) {
-	var m [256]uint8
-	tx := (*tmach_semdestroymsg)(unsafe.Pointer(&m))
-
-	tx.h.msgh_bits = _MACH_MSGH_BITS_COMPLEX
-	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
-	tx.h.msgh_remote_port = mach_task_self()
-	tx.h.msgh_id = tmach_semdestroy
-	tx.body.msgh_descriptor_count = 1
-	tx.semaphore.name = sem
-	tx.semaphore.disposition = _MACH_MSG_TYPE_MOVE_SEND
-	tx.semaphore._type = 0
-
-	for {
-		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), 0)
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_destroy")
-	}
-}
-
-// The other calls have simple system call traps in sys_darwin_{amd64,386}.s
-
-func mach_semaphore_wait(sema uint32) int32
-func mach_semaphore_timedwait(sema, sec, nsec uint32) int32
-func mach_semaphore_signal(sema uint32) int32
-func mach_semaphore_signal_all(sema uint32) int32
-
-func semasleep1(ns int64) int32 {
-	_g_ := getg()
-
-	if ns >= 0 {
-		var nsecs int32
-		secs := timediv(ns, 1000000000, &nsecs)
-		r := mach_semaphore_timedwait(uint32(_g_.m.waitsema), uint32(secs), uint32(nsecs))
-		if r == _KERN_ABORTED || r == _KERN_OPERATION_TIMED_OUT {
-			return -1
-		}
-		if r != 0 {
-			macherror(r, "semaphore_wait")
-		}
-		return 0
-	}
-
-	for {
-		r := mach_semaphore_wait(uint32(_g_.m.waitsema))
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-		macherror(r, "semaphore_wait")
-	}
-	return 0
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	var r int32
-	systemstack(func() {
-		r = semasleep1(ns)
-	})
-	return r
-}
-
-//go:nosplit
-func mach_semrelease(sem uint32) {
-	for {
-		r := mach_semaphore_signal(sem)
-		if r == 0 {
-			break
-		}
-		if r == _KERN_ABORTED { // interrupted
-			continue
-		}
-
-		// mach_semrelease must be completely nosplit,
-		// because it is called from Go code.
-		// If we're going to die, start that process on the system stack
-		// to avoid a Go stack split.
-		systemstack(func() { macherror(r, "semaphore_signal") })
-	}
-}
-
-//go:nosplit
-func osyield() {
-	usleep(1)
-}
-
-func memlimit() uintptr {
-	// NOTE(rsc): Could use getrlimit here,
-	// like on FreeBSD or Linux, but Darwin doesn't enforce
-	// ulimit -v, so it's unclear why we'd try to stay within
-	// the limit.
-	return 0
-}
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = ^uint32(0)
-	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp)) // runtime·sigtramp's job is to call into real handler
-	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn
-	sigaction(uint32(i), &sa, nil)
-}
-
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	sigaction(uint32(i), nil, &sa)
-	return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u))
-}
-
-func signalstack(s *stack) {
-	var st stackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m sigmask) {
-	sigprocmask(_SIG_SETMASK, &m[0], nil)
-}
-
-func unblocksig(sig int32) {
-	mask := uint32(1) << (uint32(sig) - 1)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
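The deleted Darwin semasleep1 splits its nanosecond deadline with timediv before calling mach_semaphore_timedwait; timediv exists because a 64-bit divide is too expensive inside nosplit code on 32-bit targets. A standalone model of the split using ordinary division:

	// splitNs models what timediv computes for semasleep: whole seconds
	// plus the leftover nanoseconds.
	func splitNs(ns int64) (sec int64, nsec int32) {
		return ns / 1e9, int32(ns % 1e9)
	}

For example, splitNs(2500000000) yields (2, 500000000).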
diff --git a/src/runtime/os1_dragonfly.go b/src/runtime/os1_dragonfly.go
deleted file mode 100644
index f96c78c..0000000
--- a/src/runtime/os1_dragonfly.go
+++ /dev/null
@@ -1,247 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// From DragonFly's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-//go:nosplit
-func futexsleep(addr *uint32, val uint32, ns int64) {
-	systemstack(func() {
-		futexsleep1(addr, val, ns)
-	})
-}
-
-func futexsleep1(addr *uint32, val uint32, ns int64) {
-	var timeout int32
-	if ns >= 0 {
-		// The timeout is specified in microseconds - ensure that we
-		// do not end up dividing to zero, which would put us to sleep
-		// indefinitely...
-		timeout = timediv(ns, 1000, nil)
-		if timeout == 0 {
-			timeout = 1
-		}
-	}
-
-	// sys_umtx_sleep will return EWOULDBLOCK (EAGAIN) when the timeout
-	// expires or EBUSY if the mutex value does not match.
-	ret := sys_umtx_sleep(addr, int32(val), timeout)
-	if ret >= 0 || ret == -_EINTR || ret == -_EAGAIN || ret == -_EBUSY {
-		return
-	}
-
-	print("umtx_sleep addr=", addr, " val=", val, " ret=", ret, "\n")
-	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
-}
-
-//go:nosplit
-func futexwakeup(addr *uint32, cnt uint32) {
-	ret := sys_umtx_wakeup(addr, int32(cnt))
-	if ret >= 0 {
-		return
-	}
-
-	systemstack(func() {
-		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
-		*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
-	})
-}
-
-func lwp_start(uintptr)
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " lwp_start=", funcPC(lwp_start), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n")
-	}
-
-	var oset sigset
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-
-	params := lwpparams{
-		start_func: funcPC(lwp_start),
-		arg:        unsafe.Pointer(mp),
-		stack:      uintptr(stk),
-		tid1:       unsafe.Pointer(&mp.procid),
-		tid2:       nil,
-	}
-
-	mp.tls[0] = uintptr(mp.id) // XXX so 386 asm can find it
-
-	lwp_create(&params)
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	sigprocmask(_SIG_SETMASK, nil, smask)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// m.procid is a uint64, but lwp_start writes an int32. Fix it up.
-	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
-
-	// Initialize signal handling
-	signalstack(&_g_.m.gsignal.stack)
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask, nil)
-	signalstack(nil)
-}
-
-func memlimit() uintptr {
-	/*
-		                TODO: Convert to Go when something actually uses the result.
-
-				Rlimit rl;
-				extern byte runtime·text[], runtime·end[];
-				uintptr used;
-
-				if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
-					return 0;
-				if(rl.rlim_cur >= 0x7fffffff)
-					return 0;
-
-				// Estimate our VM footprint excluding the heap.
-				// Not an exact science: use size of binary plus
-				// some room for thread stacks.
-				used = runtime·end - runtime·text + (64<<20);
-				if(used >= rl.rlim_cur)
-					return 0;
-
-				// If there's not at least 16 MB left, we're probably
-				// not going to be able to do much.  Treat as no limit.
-				rl.rlim_cur -= used;
-				if(rl.rlim_cur < (16<<20))
-					return 0;
-
-				return rl.rlim_cur - used;
-	*/
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_sigaction uintptr
-	sa_flags     int32
-	sa_mask      sigset
-}
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_sigaction = fn
-	sigaction(i, &sa, nil)
-}
-
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_sigaction == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_sigaction
-}
-
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m sigmask) {
-	var mask sigset
-	copy(mask.__bits[:], m[:])
-	sigprocmask(_SIG_SETMASK, &mask, nil)
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
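The deleted DragonFly futexsleep1 is careful that a short but positive deadline never rounds down to zero, since a zero timeout tells sys_umtx_sleep to wait indefinitely. A standalone sketch of that clamp:

	// umtxTimeoutMicros converts a nanosecond deadline to the microsecond
	// timeout sys_umtx_sleep expects: 0 means wait forever, so positive
	// deadlines are clamped up to at least 1 microsecond.
	func umtxTimeoutMicros(ns int64) int32 {
		if ns < 0 {
			return 0 // negative means no deadline
		}
		timeout := int32(ns / 1000)
		if timeout == 0 {
			timeout = 1
		}
		return timeout
	}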
diff --git a/src/runtime/os1_freebsd.go b/src/runtime/os1_freebsd.go
deleted file mode 100644
index f3519f3..0000000
--- a/src/runtime/os1_freebsd.go
+++ /dev/null
@@ -1,249 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// From FreeBSD's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-// FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and
-// thus the code is largely similar. See Linux implementation
-// and lock_futex.go for comments.
-
-//go:nosplit
-func futexsleep(addr *uint32, val uint32, ns int64) {
-	systemstack(func() {
-		futexsleep1(addr, val, ns)
-	})
-}
-
-func futexsleep1(addr *uint32, val uint32, ns int64) {
-	var tsp *timespec
-	if ns >= 0 {
-		var ts timespec
-		ts.tv_nsec = 0
-		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
-		tsp = &ts
-	}
-	ret := sys_umtx_op(addr, _UMTX_OP_WAIT_UINT_PRIVATE, val, nil, tsp)
-	if ret >= 0 || ret == -_EINTR {
-		return
-	}
-	print("umtx_wait addr=", addr, " val=", val, " ret=", ret, "\n")
-	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
-}
-
-//go:nosplit
-func futexwakeup(addr *uint32, cnt uint32) {
-	ret := sys_umtx_op(addr, _UMTX_OP_WAKE_PRIVATE, cnt, nil, nil)
-	if ret >= 0 {
-		return
-	}
-
-	systemstack(func() {
-		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
-	})
-}
-
-func thr_start()
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " thr_start=", funcPC(thr_start), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n")
-	}
-
-	// NOTE(rsc): This code is confused. stackbase is the top of the stack
-	// and is equal to stk. However, it's working, so I'm not changing it.
-	param := thrparam{
-		start_func: funcPC(thr_start),
-		arg:        unsafe.Pointer(mp),
-		stack_base: mp.g0.stack.hi,
-		stack_size: uintptr(stk) - mp.g0.stack.hi,
-		child_tid:  unsafe.Pointer(&mp.procid),
-		parent_tid: nil,
-		tls_base:   unsafe.Pointer(&mp.tls[0]),
-		tls_size:   unsafe.Sizeof(mp.tls),
-	}
-	mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
-
-	var oset sigset
-	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
-	thr_new(&param, int32(unsafe.Sizeof(param)))
-	sigprocmask(_SIG_SETMASK, &oset, nil)
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	sigprocmask(_SIG_SETMASK, nil, smask)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// m.procid is a uint64, but thr_new writes a uint32 on 32-bit systems.
-	// Fix it up. (Only matters on big-endian, but be clean anyway.)
-	if ptrSize == 4 {
-		_g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid)))
-	}
-
-	// Initialize signal handling.
-	signalstack(&_g_.m.gsignal.stack)
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask, nil)
-	signalstack(nil)
-}
-
-func memlimit() uintptr {
-	/*
-		TODO: Convert to Go when something actually uses the result.
-		Rlimit rl;
-		extern byte runtime·text[], runtime·end[];
-		uintptr used;
-
-		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
-			return 0;
-		if(rl.rlim_cur >= 0x7fffffff)
-			return 0;
-
-		// Estimate our VM footprint excluding the heap.
-		// Not an exact science: use size of binary plus
-		// some room for thread stacks.
-		used = runtime·end - runtime·text + (64<<20);
-		if(used >= rl.rlim_cur)
-			return 0;
-
-		// If there's not at least 16 MB left, we're probably
-		// not going to be able to do much.  Treat as no limit.
-		rl.rlim_cur -= used;
-		if(rl.rlim_cur < (16<<20))
-			return 0;
-
-		return rl.rlim_cur - used;
-	*/
-
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_handler uintptr
-	sa_flags   int32
-	sa_mask    sigset
-}
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_handler = fn
-	sigaction(i, &sa, nil)
-}
-
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_handler == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_handler
-}
-
-func signalstack(s *stack) {
-	var st stackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m [(_NSIG + 31) / 32]uint32) {
-	var mask sigset
-	copy(mask.__bits[:], m[:])
-	sigprocmask(_SIG_SETMASK, &mask, nil)
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
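The minit fixup in the deleted FreeBSD file ("thr_new writes a uint32 on 32-bit systems") works because reading the same four bytes back through a *uint32 and widening is byte-order independent, whereas loading all eight bytes as a uint64 is only right on little-endian. A hedged illustration:

	// Simulate the kernel storing a 32-bit tid at the address of a
	// uint64 field, then recover it portably.
	var procid uint64
	*(*uint32)(unsafe.Pointer(&procid)) = 12345
	procid = uint64(*(*uint32)(unsafe.Pointer(&procid)))
	// procid == 12345 on both little- and big-endian machines

(Import assumed: unsafe.)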
diff --git a/src/runtime/os1_linux.go b/src/runtime/os1_linux.go
deleted file mode 100644
index c23dc30..0000000
--- a/src/runtime/os1_linux.go
+++ /dev/null
@@ -1,350 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-var sigset_all sigset = sigset{^uint32(0), ^uint32(0)}
-
-// Linux futex.
-//
-//	futexsleep(uint32 *addr, uint32 val)
-//	futexwakeup(uint32 *addr)
-//
-// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
-// Futexwakeup wakes up threads sleeping on addr.
-// Futexsleep is allowed to wake up spuriously.
-
-const (
-	_FUTEX_WAIT = 0
-	_FUTEX_WAKE = 1
-)
-
-// Atomically,
-//	if(*addr == val) sleep
-// Might be woken up spuriously; that's allowed.
-// Don't sleep longer than ns; ns < 0 means forever.
-//go:nosplit
-func futexsleep(addr *uint32, val uint32, ns int64) {
-	var ts timespec
-
-	// Some Linux kernels have a bug where futex of
-	// FUTEX_WAIT returns an internal error code
-	// as an errno.  Libpthread ignores the return value
-	// here, and so can we: as it says a few lines up,
-	// spurious wakeups are allowed.
-	if ns < 0 {
-		futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0)
-		return
-	}
-
-	// It's difficult to live within the no-split stack limits here.
-	// On ARM and 386, a 64-bit divide invokes a general software routine
-	// that needs more stack than we can afford. So we use timediv instead.
-	// But on real 64-bit systems, where words are larger but the stack limit
-	// is not, even timediv is too heavy, and we really need to use just an
-	// ordinary machine instruction.
-	if ptrSize == 8 {
-		ts.set_sec(ns / 1000000000)
-		ts.set_nsec(int32(ns % 1000000000))
-	} else {
-		ts.tv_nsec = 0
-		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
-	}
-	futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0)
-}
-
-// If any procs are sleeping on addr, wake up at most cnt.
-//go:nosplit
-func futexwakeup(addr *uint32, cnt uint32) {
-	ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0)
-	if ret >= 0 {
-		return
-	}
-
-	// I don't know that futex wakeup can return
-	// EAGAIN or EINTR, but if it does, it would be
-	// safe to loop and call futex again.
-	systemstack(func() {
-		print("futexwakeup addr=", addr, " returned ", ret, "\n")
-	})
-
-	*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
-}
-
-func getproccount() int32 {
-	// This buffer is huge (8 kB) but we are on the system stack
-	// and there should be plenty of space (64 kB) -- except on ARM where
-	// the system stack itself is only 8kb (see golang.org/issue/11873).
-	// Also this is a leaf, so we're not holding up the memory for long.
-	// See golang.org/issue/11823.
-	// The suggested behavior here is to keep trying with ever-larger
-	// buffers, but we don't have a dynamic memory allocator at the
-	// moment, so that's a bit tricky and seems like overkill.
-	const maxCPUs = 64*1024*(1-goarch_arm) + 1024*goarch_arm
-	var buf [maxCPUs / (ptrSize * 8)]uintptr
-	r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
-	n := int32(0)
-	for _, v := range buf[:r/ptrSize] {
-		for v != 0 {
-			n += int32(v & 1)
-			v >>= 1
-		}
-	}
-	if n == 0 {
-		n = 1
-	}
-	return n
-}
-
-// Clone, the Linux rfork.
-const (
-	_CLONE_VM             = 0x100
-	_CLONE_FS             = 0x200
-	_CLONE_FILES          = 0x400
-	_CLONE_SIGHAND        = 0x800
-	_CLONE_PTRACE         = 0x2000
-	_CLONE_VFORK          = 0x4000
-	_CLONE_PARENT         = 0x8000
-	_CLONE_THREAD         = 0x10000
-	_CLONE_NEWNS          = 0x20000
-	_CLONE_SYSVSEM        = 0x40000
-	_CLONE_SETTLS         = 0x80000
-	_CLONE_PARENT_SETTID  = 0x100000
-	_CLONE_CHILD_CLEARTID = 0x200000
-	_CLONE_UNTRACED       = 0x800000
-	_CLONE_CHILD_SETTID   = 0x1000000
-	_CLONE_STOPPED        = 0x2000000
-	_CLONE_NEWUTS         = 0x4000000
-	_CLONE_NEWIPC         = 0x8000000
-
-	cloneFlags = _CLONE_VM | /* share memory */
-		_CLONE_FS | /* share cwd, etc */
-		_CLONE_FILES | /* share fd table */
-		_CLONE_SIGHAND | /* share sig handler table */
-		_CLONE_THREAD /* revisit - okay for now */
-)
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	/*
-	 * note: strace gets confused if we use CLONE_PTRACE here.
-	 */
-	mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, "/", mp.tls[0], " ostk=", &mp, "\n")
-	}
-
-	// Disable signals during clone, so that the new thread starts
-	// with signals disabled.  It will enable them in minit.
-	var oset sigset
-	rtsigprocmask(_SIG_SETMASK, &sigset_all, &oset, int32(unsafe.Sizeof(oset)))
-	ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
-	rtsigprocmask(_SIG_SETMASK, &oset, nil, int32(unsafe.Sizeof(oset)))
-
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
-		throw("newosproc")
-	}
-}
-
-// Version of newosproc that doesn't require a valid G.
-//go:nosplit
-func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
-	stack := sysAlloc(stacksize, &memstats.stacks_sys)
-	if stack == nil {
-		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
-		exit(1)
-	}
-	ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
-	if ret < 0 {
-		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
-		exit(1)
-	}
-}
-
-var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
-var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
-
-func osinit() {
-	ncpu = getproccount()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-func getRandomData(r []byte) {
-	if startupRandomData != nil {
-		n := copy(r, startupRandomData)
-		extendRandom(r, n)
-		return
-	}
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	rtsigprocmask(_SIG_SETMASK, nil, smask, int32(unsafe.Sizeof(*smask)))
-}
-
-func gettid() uint32
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	// Initialize signal handling.
-	_g_ := getg()
-	signalstack(&_g_.m.gsignal.stack)
-
-	// for debuggers, in case cgo created the thread
-	_g_.m.procid = uint64(gettid())
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	rtsigprocmask(_SIG_SETMASK, &nmask, nil, int32(unsafe.Sizeof(nmask)))
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	rtsigprocmask(_SIG_SETMASK, smask, nil, int32(unsafe.Sizeof(*smask)))
-	signalstack(nil)
-}
-
-func memlimit() uintptr {
-	/*
-		TODO: Convert to Go when something actually uses the result.
-
-		Rlimit rl;
-		extern byte runtime·text[], runtime·end[];
-		uintptr used;
-
-		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
-			return 0;
-		if(rl.rlim_cur >= 0x7fffffff)
-			return 0;
-
-		// Estimate our VM footprint excluding the heap.
-		// Not an exact science: use size of binary plus
-		// some room for thread stacks.
-		used = runtime·end - runtime·text + (64<<20);
-		if(used >= rl.rlim_cur)
-			return 0;
-
-		// If there's not at least 16 MB left, we're probably
-		// not going to be able to do much.  Treat as no limit.
-		rl.rlim_cur -= used;
-		if(rl.rlim_cur < (16<<20))
-			return 0;
-
-		return rl.rlim_cur - used;
-	*/
-
-	return 0
-}
-
-//#ifdef GOARCH_386
-//#define sa_handler k_sa_handler
-//#endif
-
-func sigreturn()
-func sigtramp()
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = ^uint64(0)
-	// Although Linux manpage says "sa_restorer element is obsolete and
-	// should not be used". x86_64 kernel requires it. Only use it on
-	// x86.
-	if GOARCH == "386" || GOARCH == "amd64" {
-		sa.sa_restorer = funcPC(sigreturn)
-	}
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_handler = fn
-	if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction failure")
-	}
-}
-
-func setsigstack(i int32) {
-	var sa sigactiont
-	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction failure")
-	}
-	if sa.sa_handler == 0 || sa.sa_handler == _SIG_DFL || sa.sa_handler == _SIG_IGN || sa.sa_flags&_SA_ONSTACK != 0 {
-		return
-	}
-	sa.sa_flags |= _SA_ONSTACK
-	if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction failure")
-	}
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-
-	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
-	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
-		throw("rt_sigaction read failure")
-	}
-	if sa.sa_handler == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_handler
-}
-
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m sigmask) {
-	var mask sigset
-	copy(mask[:], m[:])
-	rtsigprocmask(_SIG_SETMASK, &mask, nil, int32(unsafe.Sizeof(mask)))
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	rtsigprocmask(_SIG_UNBLOCK, &mask, nil, int32(unsafe.Sizeof(mask)))
-}
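The deleted Linux getproccount counts CPUs by popcounting the sched_getaffinity bitmask one bit at a time. A standalone sketch of that loop (today math/bits.OnesCount would replace the inner loop):

	// countBits mirrors getproccount's inner loop over the words of an
	// affinity bitmask, never reporting zero CPUs.
	func countBits(words []uintptr) int32 {
		n := int32(0)
		for _, v := range words {
			for v != 0 {
				n += int32(v & 1)
				v >>= 1
			}
		}
		if n == 0 {
			n = 1
		}
		return n
	}

For example, countBits([]uintptr{0x0b, 0x01}) returns 4: three set bits in the first word, one in the second.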
diff --git a/src/runtime/os1_nacl.go b/src/runtime/os1_nacl.go
deleted file mode 100644
index 143752a..0000000
--- a/src/runtime/os1_nacl.go
+++ /dev/null
@@ -1,203 +0,0 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func sigtramp()
-
-func msigsave(mp *m) {
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// Initialize signal handling
-	ret := nacl_exception_stack(_g_.m.gsignal.stack.lo, 32*1024)
-	if ret < 0 {
-		print("runtime: nacl_exception_stack: error ", -ret, "\n")
-	}
-
-	ret = nacl_exception_handler(funcPC(sigtramp), nil)
-	if ret < 0 {
-		print("runtime: nacl_exception_handler: error ", -ret, "\n")
-	}
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-}
-
-func osinit() {
-	ncpu = 1
-	getg().m.procid = 2
-	//nacl_exception_handler(funcPC(sigtramp), nil);
-}
-
-func crash() {
-	*(*int32)(nil) = 0
-}
-
-//go:noescape
-func getRandomData([]byte)
-
-func goenvs() {
-	goenvs_unix()
-}
-
-func initsig() {
-}
-
-//go:nosplit
-func usleep(us uint32) {
-	var ts timespec
-
-	ts.tv_sec = int64(us / 1e6)
-	ts.tv_nsec = int32(us%1e6) * 1e3
-	nacl_nanosleep(&ts, nil)
-}
-
-func mstart_nacl()
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	mp.tls[0] = uintptr(unsafe.Pointer(mp.g0))
-	mp.tls[1] = uintptr(unsafe.Pointer(mp))
-	ret := nacl_thread_create(funcPC(mstart_nacl), stk, unsafe.Pointer(&mp.tls[2]), nil)
-	if ret < 0 {
-		print("nacl_thread_create: error ", -ret, "\n")
-		throw("newosproc")
-	}
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	var cond uintptr
-	systemstack(func() {
-		mu := nacl_mutex_create(0)
-		if mu < 0 {
-			print("nacl_mutex_create: error ", -mu, "\n")
-			throw("semacreate")
-		}
-		c := nacl_cond_create(0)
-		if c < 0 {
-			print("nacl_cond_create: error ", -cond, "\n")
-			throw("semacreate")
-		}
-		cond = uintptr(c)
-		_g_ := getg()
-		_g_.m.waitsemalock = uint32(mu)
-	})
-	return cond
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	var ret int32
-
-	systemstack(func() {
-		_g_ := getg()
-		if nacl_mutex_lock(int32(_g_.m.waitsemalock)) < 0 {
-			throw("semasleep")
-		}
-
-		for _g_.m.waitsemacount == 0 {
-			if ns < 0 {
-				if nacl_cond_wait(int32(_g_.m.waitsema), int32(_g_.m.waitsemalock)) < 0 {
-					throw("semasleep")
-				}
-			} else {
-				var ts timespec
-				end := ns + nanotime()
-				ts.tv_sec = end / 1e9
-				ts.tv_nsec = int32(end % 1e9)
-				r := nacl_cond_timed_wait_abs(int32(_g_.m.waitsema), int32(_g_.m.waitsemalock), &ts)
-				if r == -_ETIMEDOUT {
-					nacl_mutex_unlock(int32(_g_.m.waitsemalock))
-					ret = -1
-					return
-				}
-				if r < 0 {
-					throw("semasleep")
-				}
-			}
-		}
-
-		_g_.m.waitsemacount = 0
-		nacl_mutex_unlock(int32(_g_.m.waitsemalock))
-		ret = 0
-	})
-	return ret
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	systemstack(func() {
-		if nacl_mutex_lock(int32(mp.waitsemalock)) < 0 {
-			throw("semawakeup")
-		}
-		if mp.waitsemacount != 0 {
-			throw("semawakeup")
-		}
-		mp.waitsemacount = 1
-		nacl_cond_signal(int32(mp.waitsema))
-		nacl_mutex_unlock(int32(mp.waitsemalock))
-	})
-}
-
-func memlimit() uintptr {
-	return 0
-}
-
-// This runs on a foreign stack, without an m or a g.  No stack split.
-//go:nosplit
-func badsignal2() {
-	write(2, unsafe.Pointer(&badsignal1[0]), int32(len(badsignal1)))
-	exit(2)
-}
-
-var badsignal1 = []byte("runtime: signal received on thread not created by Go.\n")
-
-func raisebadsignal(sig int32) {
-	badsignal2()
-}
-
-func madvise(addr unsafe.Pointer, n uintptr, flags int32) {}
-func munmap(addr unsafe.Pointer, n uintptr)               {}
-func resetcpuprofiler(hz int32)                           {}
-func sigdisable(uint32)                                   {}
-func sigenable(uint32)                                    {}
-func sigignore(uint32)                                    {}
-func closeonexec(int32)                                   {}
-
-var writelock uint32 // test-and-set spin lock for write
-
-/*
-An attempt at IRT. Doesn't work. See end of sys_nacl_amd64.s.
-
-void (*nacl_irt_query)(void);
-
-int8 nacl_irt_basic_v0_1_str[] = "nacl-irt-basic-0.1";
-void *nacl_irt_basic_v0_1[6]; // exit, gettod, clock, nanosleep, sched_yield, sysconf
-int32 nacl_irt_basic_v0_1_size = sizeof(nacl_irt_basic_v0_1);
-
-int8 nacl_irt_memory_v0_3_str[] = "nacl-irt-memory-0.3";
-void *nacl_irt_memory_v0_3[3]; // mmap, munmap, mprotect
-int32 nacl_irt_memory_v0_3_size = sizeof(nacl_irt_memory_v0_3);
-
-int8 nacl_irt_thread_v0_1_str[] = "nacl-irt-thread-0.1";
-void *nacl_irt_thread_v0_1[3]; // thread_create, thread_exit, thread_nice
-int32 nacl_irt_thread_v0_1_size = sizeof(nacl_irt_thread_v0_1);
-*/
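The deleted NaCl semasleep turns a relative timeout into the absolute deadline nacl_cond_timed_wait_abs expects by adding nanotime(). A sketch of that conversion, with now standing in for the runtime's nanotime:

	// absDeadline converts a relative nanosecond timeout into the
	// absolute sec/nsec pair used for the timed condition wait.
	func absDeadline(now, ns int64) (sec int64, nsec int32) {
		end := now + ns
		return end / 1e9, int32(end % 1e9)
	}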
diff --git a/src/runtime/os1_netbsd.go b/src/runtime/os1_netbsd.go
deleted file mode 100644
index cacd606..0000000
--- a/src/runtime/os1_netbsd.go
+++ /dev/null
@@ -1,238 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-const (
-	_ESRCH     = 3
-	_ETIMEDOUT = 60
-
-	// From NetBSD's <sys/time.h>
-	_CLOCK_REALTIME  = 0
-	_CLOCK_VIRTUAL   = 1
-	_CLOCK_PROF      = 2
-	_CLOCK_MONOTONIC = 3
-)
-
-var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
-
-// From NetBSD's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	return 1
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	_g_ := getg()
-
-	// Compute sleep deadline.
-	var tsp *timespec
-	if ns >= 0 {
-		var ts timespec
-		var nsec int32
-		ns += nanotime()
-		ts.set_sec(timediv(ns, 1000000000, &nsec))
-		ts.set_nsec(nsec)
-		tsp = &ts
-	}
-
-	for {
-		v := atomicload(&_g_.m.waitsemacount)
-		if v > 0 {
-			if cas(&_g_.m.waitsemacount, v, v-1) {
-				return 0 // semaphore acquired
-			}
-			continue
-		}
-
-		// Sleep until unparked by semawakeup or timeout.
-		ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil)
-		if ret == _ETIMEDOUT {
-			return -1
-		}
-	}
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	xadd(&mp.waitsemacount, 1)
-	// From NetBSD's _lwp_unpark(2) manual:
-	// "If the target LWP is not currently waiting, it will return
-	// immediately upon the next call to _lwp_park()."
-	ret := lwp_unpark(int32(mp.procid), unsafe.Pointer(&mp.waitsemacount))
-	if ret != 0 && ret != _ESRCH {
-		// semawakeup can be called on signal stack.
-		systemstack(func() {
-			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
-		})
-	}
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, "/", int32(mp.tls[0]), " ostk=", &mp, "\n")
-	}
-
-	mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
-
-	var uc ucontextt
-	getcontext(unsafe.Pointer(&uc))
-
-	uc.uc_flags = _UC_SIGMASK | _UC_CPU
-	uc.uc_link = nil
-	uc.uc_sigmask = sigset_all
-
-	lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, funcPC(mstart))
-
-	ret := lwp_create(unsafe.Pointer(&uc), 0, unsafe.Pointer(&mp.procid))
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	sigprocmask(_SIG_SETMASK, nil, smask)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	_g_ := getg()
-	_g_.m.procid = uint64(lwp_self())
-
-	// Initialize signal handling
-	signalstack(&_g_.m.gsignal.stack)
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, &nmask, nil)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask, nil)
-
-	signalstack(nil)
-}
-
-func memlimit() uintptr {
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_sigaction uintptr
-	sa_mask      sigset
-	sa_flags     int32
-}
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_sigaction = fn
-	sigaction(i, &sa, nil)
-}
-
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_sigaction == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_sigaction
-}
-
-func signalstack(s *stack) {
-	var st sigaltstackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m sigmask) {
-	var mask sigset
-	copy(mask.__bits[:], m[:])
-	sigprocmask(_SIG_SETMASK, &mask, nil)
-}
-
-func unblocksig(sig int32) {
-	var mask sigset
-	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
-	sigprocmask(_SIG_UNBLOCK, &mask, nil)
-}
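The deleted NetBSD semasleep layers a counting semaphore over lwp_park with a load/compare-and-swap fast path. The same fast path in portable form, using sync/atomic in place of the runtime's atomicload and cas:

	// tryAcquire atomically decrements *sem if it is positive; when it
	// returns false, the runtime version falls through to lwp_park.
	func tryAcquire(sem *uint32) bool {
		for {
			v := atomic.LoadUint32(sem)
			if v == 0 {
				return false // nothing to take: caller must park
			}
			if atomic.CompareAndSwapUint32(sem, v, v-1) {
				return true // semaphore acquired
			}
		}
	}

(Import assumed: sync/atomic.)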
diff --git a/src/runtime/os1_openbsd.go b/src/runtime/os1_openbsd.go
deleted file mode 100644
index 24a095b..0000000
--- a/src/runtime/os1_openbsd.go
+++ /dev/null
@@ -1,246 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-const (
-	_ESRCH       = 3
-	_EAGAIN      = 35
-	_EWOULDBLOCK = _EAGAIN
-	_ENOTSUP     = 91
-
-	// From OpenBSD's sys/time.h
-	_CLOCK_REALTIME  = 0
-	_CLOCK_VIRTUAL   = 1
-	_CLOCK_PROF      = 2
-	_CLOCK_MONOTONIC = 3
-)
-
-const (
-	sigset_none = uint32(0)
-	sigset_all  = ^uint32(0)
-)
-
-// From OpenBSD's <sys/sysctl.h>
-const (
-	_CTL_HW  = 6
-	_HW_NCPU = 3
-)
-
-func getncpu() int32 {
-	mib := [2]uint32{_CTL_HW, _HW_NCPU}
-	out := uint32(0)
-	nout := unsafe.Sizeof(out)
-
-	// Fetch hw.ncpu via sysctl.
-	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
-	if ret >= 0 {
-		return int32(out)
-	}
-	return 1
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	return 1
-}
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	_g_ := getg()
-
-	// Compute sleep deadline.
-	var tsp *timespec
-	if ns >= 0 {
-		var ts timespec
-		var nsec int32
-		ns += nanotime()
-		ts.set_sec(int64(timediv(ns, 1000000000, &nsec)))
-		ts.set_nsec(nsec)
-		tsp = &ts
-	}
-
-	for {
-		v := atomicload(&_g_.m.waitsemacount)
-		if v > 0 {
-			if cas(&_g_.m.waitsemacount, v, v-1) {
-				return 0 // semaphore acquired
-			}
-			continue
-		}
-
-		// Sleep until woken by semawakeup or timeout; or abort if waitsemacount != 0.
-		//
-		// From OpenBSD's __thrsleep(2) manual:
-		// "The abort argument, if not NULL, points to an int that will
-		// be examined [...] immediately before blocking.  If that int
-		// is non-zero then __thrsleep() will immediately return EINTR
-		// without blocking."
-		ret := thrsleep(uintptr(unsafe.Pointer(&_g_.m.waitsemacount)), _CLOCK_MONOTONIC, tsp, 0, &_g_.m.waitsemacount)
-		if ret == _EWOULDBLOCK {
-			return -1
-		}
-	}
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	xadd(&mp.waitsemacount, 1)
-	ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1)
-	if ret != 0 && ret != _ESRCH {
-		// semawakeup can be called on signal stack.
-		systemstack(func() {
-			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
-		})
-	}
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, "/", int32(mp.tls[0]), " ostk=", &mp, "\n")
-	}
-
-	mp.tls[0] = uintptr(mp.id) // so 386 asm can find it
-
-	param := tforkt{
-		tf_tcb:   unsafe.Pointer(&mp.tls[0]),
-		tf_tid:   (*int32)(unsafe.Pointer(&mp.procid)),
-		tf_stack: uintptr(stk),
-	}
-
-	oset := sigprocmask(_SIG_SETMASK, sigset_all)
-	ret := tfork(&param, unsafe.Sizeof(param), mp, mp.g0, funcPC(mstart))
-	sigprocmask(_SIG_SETMASK, oset)
-
-	if ret < 0 {
-		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-func osinit() {
-	ncpu = getncpu()
-}
-
-var urandom_dev = []byte("/dev/urandom\x00")
-
-//go:nosplit
-func getRandomData(r []byte) {
-	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
-	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
-	closefd(fd)
-	extendRandom(r, int(n))
-}
-
-func goenvs() {
-	goenvs_unix()
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-}
-
-func msigsave(mp *m) {
-	smask := (*uint32)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	*smask = sigprocmask(_SIG_BLOCK, 0)
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	_g_ := getg()
-
-	// m.procid is a uint64, but tfork writes an int32. Fix it up.
-	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
-
-	// Initialize signal handling
-	signalstack(&_g_.m.gsignal.stack)
-
-	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
-	for i := range sigtable {
-		if sigtable[i].flags&_SigUnblock != 0 {
-			nmask &^= 1 << (uint32(i) - 1)
-		}
-	}
-	sigprocmask(_SIG_SETMASK, nmask)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	_g_ := getg()
-	smask := *(*uint32)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask)
-	signalstack(nil)
-}
-
-func memlimit() uintptr {
-	return 0
-}
-
-func sigtramp()
-
-type sigactiont struct {
-	sa_sigaction uintptr
-	sa_mask      uint32
-	sa_flags     int32
-}
-
-func setsig(i int32, fn uintptr, restart bool) {
-	var sa sigactiont
-	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
-	if restart {
-		sa.sa_flags |= _SA_RESTART
-	}
-	sa.sa_mask = sigset_all
-	if fn == funcPC(sighandler) {
-		fn = funcPC(sigtramp)
-	}
-	sa.sa_sigaction = fn
-	sigaction(i, &sa, nil)
-}
-
-func setsigstack(i int32) {
-	throw("setsigstack")
-}
-
-func getsig(i int32) uintptr {
-	var sa sigactiont
-	sigaction(i, nil, &sa)
-	if sa.sa_sigaction == funcPC(sigtramp) {
-		return funcPC(sighandler)
-	}
-	return sa.sa_sigaction
-}
-
-func signalstack(s *stack) {
-	var st stackt
-	if s == nil {
-		st.ss_flags = _SS_DISABLE
-	} else {
-		st.ss_sp = s.lo
-		st.ss_size = s.hi - s.lo
-		st.ss_flags = 0
-	}
-	sigaltstack(&st, nil)
-}
-
-func updatesigmask(m sigmask) {
-	sigprocmask(_SIG_SETMASK, m[0])
-}
-
-func unblocksig(sig int32) {
-	mask := uint32(1) << (uint32(sig) - 1)
-	sigprocmask(_SIG_UNBLOCK, mask)
-}
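Several of the deleted files index a multi-word signal mask with the same arithmetic (mask.__bits[(sig-1)/32] |= 1 << ((sig-1)&31)); OpenBSD gets away with a single uint32 because it has at most 32 signals. A sketch of the general word/bit addressing:

	// sigbit returns the word index and bit mask for 1-based signal sig
	// in a sigset laid out as 32-bit words.
	func sigbit(sig int32) (word int, mask uint32) {
		return int(sig-1) / 32, 1 << (uint32(sig-1) & 31)
	}

For instance, sigbit(35) is (1, 0x4): signal 35 lives at bit 2 of the second word.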
diff --git a/src/runtime/os1_plan9.go b/src/runtime/os1_plan9.go
deleted file mode 100644
index 9615b6d..0000000
--- a/src/runtime/os1_plan9.go
+++ /dev/null
@@ -1,268 +0,0 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-	// Initialize stack and goroutine for note handling.
-	mp.gsignal = malg(32 * 1024)
-	mp.gsignal.m = mp
-	mp.notesig = (*int8)(mallocgc(_ERRMAX, nil, _FlagNoScan))
-	// Initialize stack for handling strings from the
-	// errstr system call, as used in package syscall.
-	mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, _FlagNoScan))
-}
-
-func msigsave(mp *m) {
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	// Mask all SSE floating-point exceptions
-	// when running on the 64-bit kernel.
-	setfpmasks()
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-}
-
-var sysstat = []byte("/dev/sysstat\x00")
-
-func getproccount() int32 {
-	var buf [2048]byte
-	fd := open(&sysstat[0], _OREAD, 0)
-	if fd < 0 {
-		return 1
-	}
-	ncpu := int32(0)
-	for {
-		n := read(fd, unsafe.Pointer(&buf), int32(len(buf)))
-		if n <= 0 {
-			break
-		}
-		for i := int32(0); i < n; i++ {
-			if buf[i] == '\n' {
-				ncpu++
-			}
-		}
-	}
-	closefd(fd)
-	if ncpu == 0 {
-		ncpu = 1
-	}
-	return ncpu
-}
-
-var pid = []byte("#c/pid\x00")
-
-func getpid() uint64 {
-	var b [20]byte
-	fd := open(&pid[0], 0, 0)
-	if fd >= 0 {
-		read(fd, unsafe.Pointer(&b), int32(len(b)))
-		closefd(fd)
-	}
-	c := b[:]
-	for c[0] == ' ' || c[0] == '\t' {
-		c = c[1:]
-	}
-	return uint64(_atoi(c))
-}
-
-func osinit() {
-	initBloc()
-	ncpu = getproccount()
-	getg().m.procid = getpid()
-	notify(unsafe.Pointer(funcPC(sigtramp)))
-}
-
-func crash() {
-	notify(nil)
-	*(*int)(nil) = 0
-}
-
-//go:nosplit
-func getRandomData(r []byte) {
-	extendRandom(r, 0)
-}
-
-func goenvs() {
-}
-
-func initsig() {
-}
-
-//go:nosplit
-func osyield() {
-	sleep(0)
-}
-
-//go:nosplit
-func usleep(µs uint32) {
-	ms := int32(µs / 1000)
-	if ms == 0 {
-		ms = 1
-	}
-	sleep(ms)
-}
-
-//go:nosplit
-func nanotime() int64 {
-	var scratch int64
-	ns := nsec(&scratch)
-	// TODO(aram): remove hack after I fix _nsec in the pc64 kernel.
-	if ns == 0 {
-		return scratch
-	}
-	return ns
-}
-
-//go:nosplit
-func itoa(buf []byte, val uint64) []byte {
-	i := len(buf) - 1
-	for val >= 10 {
-		buf[i] = byte(val%10 + '0')
-		i--
-		val /= 10
-	}
-	buf[i] = byte(val + '0')
-	return buf[i:]
-}
-
-var goexits = []byte("go: exit ")
-
-func goexitsall(status *byte) {
-	var buf [_ERRMAX]byte
-	n := copy(buf[:], goexits)
-	n = copy(buf[n:], gostringnocopy(status))
-	pid := getpid()
-	for mp := (*m)(atomicloadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
-		if mp.procid != pid {
-			postnote(mp.procid, buf[:])
-		}
-	}
-}
-
-var procdir = []byte("/proc/")
-var notefile = []byte("/note\x00")
-
-func postnote(pid uint64, msg []byte) int {
-	var buf [128]byte
-	var tmp [32]byte
-	n := copy(buf[:], procdir)
-	n += copy(buf[n:], itoa(tmp[:], pid))
-	copy(buf[n:], notefile)
-	fd := open(&buf[0], _OWRITE, 0)
-	if fd < 0 {
-		return -1
-	}
-	len := findnull(&msg[0])
-	if write(uintptr(fd), (unsafe.Pointer)(&msg[0]), int32(len)) != int64(len) {
-		closefd(fd)
-		return -1
-	}
-	closefd(fd)
-	return 0
-}
-
-//go:nosplit
-func exit(e int) {
-	var status []byte
-	if e == 0 {
-		status = []byte("\x00")
-	} else {
-		// build error string
-		var tmp [32]byte
-		status = append(itoa(tmp[:len(tmp)-1], uint64(e)), 0)
-	}
-	goexitsall(&status[0])
-	exits(&status[0])
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	if false {
-		print("newosproc mp=", mp, " ostk=", &mp, "\n")
-	}
-	pid := rfork(_RFPROC | _RFMEM | _RFNOWAIT)
-	if pid < 0 {
-		throw("newosproc: rfork failed")
-	}
-	if pid == 0 {
-		tstart_plan9(mp)
-	}
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	return 1
-}
-
-//go:nosplit
-func semasleep(ns int64) int {
-	_g_ := getg()
-	if ns >= 0 {
-		ms := timediv(ns, 1000000, nil)
-		if ms == 0 {
-			ms = 1
-		}
-		ret := plan9_tsemacquire(&_g_.m.waitsemacount, ms)
-		if ret == 1 {
-			return 0 // success
-		}
-		return -1 // timeout or interrupted
-	}
-	for plan9_semacquire(&_g_.m.waitsemacount, 1) < 0 {
-		// interrupted; try again (c.f. lock_sema.go)
-	}
-	return 0 // success
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	plan9_semrelease(&mp.waitsemacount, 1)
-}
-
-//go:nosplit
-func read(fd int32, buf unsafe.Pointer, n int32) int32 {
-	return pread(fd, buf, n, -1)
-}
-
-//go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, n int32) int64 {
-	return int64(pwrite(int32(fd), buf, n, -1))
-}
-
-func memlimit() uint64 {
-	return 0
-}
-
-var _badsignal = []byte("runtime: signal received on thread not created by Go.\n")
-
-// This runs on a foreign stack, without an m or a g.  No stack split.
-//go:nosplit
-func badsignal2() {
-	pwrite(2, unsafe.Pointer(&_badsignal[0]), int32(len(_badsignal)), -1)
-	exits(&_badsignal[0])
-}
-
-func raisebadsignal(sig int32) {
-	badsignal2()
-}
-
-func _atoi(b []byte) int {
-	n := 0
-	for len(b) > 0 && '0' <= b[0] && b[0] <= '9' {
-		n = n*10 + int(b[0]) - '0'
-		b = b[1:]
-	}
-	return n
-}
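
The deleted Plan 9 file above carries its own allocation-free numeric helpers: itoa fills a buffer from the right, and _atoi parses a leading decimal run; exit and getpid lean on the pair. A standalone sketch of that round-trip, with hypothetical names, assuming the value fits the buffer:

package main

import "fmt"

// itoaSketch formats val into buf from the right and returns the used
// suffix, mirroring the allocation-free itoa deleted above.
func itoaSketch(buf []byte, val uint64) []byte {
	i := len(buf) - 1
	for val >= 10 {
		buf[i] = byte(val%10 + '0')
		i--
		val /= 10
	}
	buf[i] = byte(val + '0')
	return buf[i:]
}

// atoiSketch parses a leading run of decimal digits, as _atoi does.
func atoiSketch(b []byte) int {
	n := 0
	for len(b) > 0 && '0' <= b[0] && b[0] <= '9' {
		n = n*10 + int(b[0]) - '0'
		b = b[1:]
	}
	return n
}

func main() {
	var tmp [32]byte
	s := itoaSketch(tmp[:], 8472)
	fmt.Println(string(s), atoiSketch(s)) // 8472 8472
}
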
diff --git a/src/runtime/os1_windows.go b/src/runtime/os1_windows.go
deleted file mode 100644
index f608b4a..0000000
--- a/src/runtime/os1_windows.go
+++ /dev/null
@@ -1,540 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import (
-	"unsafe"
-)
-
-//go:cgo_import_dynamic runtime._AddVectoredExceptionHandler AddVectoredExceptionHandler%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CloseHandle CloseHandle%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateEventA CreateEventA%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._CryptAcquireContextW CryptAcquireContextW%5 "advapi32.dll"
-//go:cgo_import_dynamic runtime._CryptGenRandom CryptGenRandom%3 "advapi32.dll"
-//go:cgo_import_dynamic runtime._CryptReleaseContext CryptReleaseContext%2 "advapi32.dll"
-//go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll"
-//go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetEnvironmentStringsW GetEnvironmentStringsW%0 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetProcAddress GetProcAddress%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetQueuedCompletionStatus GetQueuedCompletionStatus%5 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetStdHandle GetStdHandle%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._NtWaitForSingleObject NtWaitForSingleObject%3 "ntdll.dll"
-//go:cgo_import_dynamic runtime._ResumeThread ResumeThread%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetConsoleCtrlHandler SetConsoleCtrlHandler%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetErrorMode SetErrorMode%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetEvent SetEvent%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetProcessPriorityBoost SetProcessPriorityBoost%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetThreadPriority SetThreadPriority%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetUnhandledExceptionFilter SetUnhandledExceptionFilter%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SetWaitableTimer SetWaitableTimer%6 "kernel32.dll"
-//go:cgo_import_dynamic runtime._SuspendThread SuspendThread%1 "kernel32.dll"
-//go:cgo_import_dynamic runtime._VirtualAlloc VirtualAlloc%4 "kernel32.dll"
-//go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll"
-//go:cgo_import_dynamic runtime._WSAGetOverlappedResult WSAGetOverlappedResult%5 "ws2_32.dll"
-//go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll"
-//go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
-//go:cgo_import_dynamic runtime._timeBeginPeriod timeBeginPeriod%1 "winmm.dll"
-
-var (
-	// Following syscalls are available on every Windows PC.
-	// All these variables are set by the Windows executable
-	// loader before the Go program starts.
-	_AddVectoredExceptionHandler,
-	_CloseHandle,
-	_CreateEventA,
-	_CreateIoCompletionPort,
-	_CreateThread,
-	_CreateWaitableTimerA,
-	_CryptAcquireContextW,
-	_CryptGenRandom,
-	_CryptReleaseContext,
-	_DuplicateHandle,
-	_ExitProcess,
-	_FreeEnvironmentStringsW,
-	_GetEnvironmentStringsW,
-	_GetProcAddress,
-	_GetQueuedCompletionStatus,
-	_GetStdHandle,
-	_GetSystemInfo,
-	_GetThreadContext,
-	_LoadLibraryW,
-	_LoadLibraryA,
-	_NtWaitForSingleObject,
-	_ResumeThread,
-	_SetConsoleCtrlHandler,
-	_SetErrorMode,
-	_SetEvent,
-	_SetProcessPriorityBoost,
-	_SetThreadPriority,
-	_SetUnhandledExceptionFilter,
-	_SetWaitableTimer,
-	_SuspendThread,
-	_VirtualAlloc,
-	_VirtualFree,
-	_WSAGetOverlappedResult,
-	_WaitForSingleObject,
-	_WriteFile,
-	_timeBeginPeriod stdFunction
-
-	// Following syscalls are only available on some Windows PCs.
-	// We will load syscalls, if available, before using them.
-	_AddVectoredContinueHandler,
-	_GetQueuedCompletionStatusEx stdFunction
-)
-
-// Call a Windows function with stdcall conventions,
-// and switch to os stack during the call.
-func asmstdcall(fn unsafe.Pointer)
-
-var asmstdcallAddr unsafe.Pointer
-
-func loadOptionalSyscalls() {
-	var buf [50]byte // large enough for longest string
-	strtoptr := func(s string) uintptr {
-		buf[copy(buf[:], s)] = 0 // nil-terminated for OS
-		return uintptr(noescape(unsafe.Pointer(&buf[0])))
-	}
-	l := stdcall1(_LoadLibraryA, strtoptr("kernel32.dll"))
-	findfunc := func(name string) stdFunction {
-		f := stdcall2(_GetProcAddress, l, strtoptr(name))
-		return stdFunction(unsafe.Pointer(f))
-	}
-	if l != 0 {
-		_AddVectoredContinueHandler = findfunc("AddVectoredContinueHandler")
-		_GetQueuedCompletionStatusEx = findfunc("GetQueuedCompletionStatusEx")
-	}
-}
-
-//go:nosplit
-func getLoadLibrary() uintptr {
-	return uintptr(unsafe.Pointer(_LoadLibraryW))
-}
-
-//go:nosplit
-func getGetProcAddress() uintptr {
-	return uintptr(unsafe.Pointer(_GetProcAddress))
-}
-
-func getproccount() int32 {
-	var info systeminfo
-	stdcall1(_GetSystemInfo, uintptr(unsafe.Pointer(&info)))
-	return int32(info.dwnumberofprocessors)
-}
-
-const (
-	currentProcess = ^uintptr(0) // -1 = current process
-	currentThread  = ^uintptr(1) // -2 = current thread
-)
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func externalthreadhandler()
-
-func osinit() {
-	asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
-
-	setBadSignalMsg()
-
-	loadOptionalSyscalls()
-
-	disableWER()
-
-	externalthreadhandlerp = funcPC(externalthreadhandler)
-
-	initExceptionHandler()
-
-	stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
-
-	stdcall1(_timeBeginPeriod, 1)
-
-	ncpu = getproccount()
-
-	// Windows dynamic priority boosting assumes that a process has different types
-	// of dedicated threads -- GUI, IO, computational, etc. Go processes use
-	// equivalent threads that all do a mix of GUI, IO, computations, etc.
-	// In such context dynamic priority boosting does nothing but harm, so we turn it off.
-	stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
-}
-
-//go:nosplit
-func getRandomData(r []byte) {
-	const (
-		prov_rsa_full       = 1
-		crypt_verifycontext = 0xF0000000
-	)
-	var handle uintptr
-	n := 0
-	if stdcall5(_CryptAcquireContextW, uintptr(unsafe.Pointer(&handle)), 0, 0, prov_rsa_full, crypt_verifycontext) != 0 {
-		if stdcall3(_CryptGenRandom, handle, uintptr(len(r)), uintptr(unsafe.Pointer(&r[0]))) != 0 {
-			n = len(r)
-		}
-		stdcall2(_CryptReleaseContext, handle, 0)
-	}
-	extendRandom(r, n)
-}
-
-func goenvs() {
-	// strings is a pointer to environment variable pairs in the form:
-	//     "envA=valA\x00envB=valB\x00\x00" (in UTF-16)
-	// Two consecutive zero bytes end the list.
-	strings := unsafe.Pointer(stdcall0(_GetEnvironmentStringsW))
-	p := (*[1 << 24]uint16)(strings)[:]
-
-	n := 0
-	for from, i := 0, 0; true; i++ {
-		if p[i] == 0 {
-			// empty string marks the end
-			if i == from {
-				break
-			}
-			from = i + 1
-			n++
-		}
-	}
-	envs = make([]string, n)
-
-	for i := range envs {
-		envs[i] = gostringw(&p[0])
-		for p[0] != 0 {
-			p = p[1:]
-		}
-		p = p[1:] // skip nil byte
-	}
-
-	stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
-}
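
goenvs above walks a UTF-16 block of "name=value" pairs in which a lone NUL ends each string and a doubled NUL ends the list. A self-contained sketch of the same parse using unicode/utf16 instead of raw pointer arithmetic:

package main

import (
	"fmt"
	"unicode/utf16"
)

// parseEnvBlock splits a UTF-16 environment block of the form
// "envA=valA\x00envB=valB\x00\x00" into Go strings, the layout the
// deleted goenvs walks with pointers.
func parseEnvBlock(p []uint16) []string {
	var envs []string
	for from, i := 0, 0; ; i++ {
		if p[i] == 0 {
			if i == from { // empty string marks the end
				break
			}
			envs = append(envs, string(utf16.Decode(p[from:i])))
			from = i + 1
		}
	}
	return envs
}

func main() {
	block := utf16.Encode([]rune("A=1\x00B=2\x00\x00"))
	fmt.Println(parseEnvBlock(block)) // [A=1 B=2]
}
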
-
-//go:nosplit
-func exit(code int32) {
-	stdcall1(_ExitProcess, uintptr(code))
-}
-
-//go:nosplit
-func write(fd uintptr, buf unsafe.Pointer, n int32) int32 {
-	const (
-		_STD_OUTPUT_HANDLE = ^uintptr(10) // -11
-		_STD_ERROR_HANDLE  = ^uintptr(11) // -12
-	)
-	var handle uintptr
-	switch fd {
-	case 1:
-		handle = stdcall1(_GetStdHandle, _STD_OUTPUT_HANDLE)
-	case 2:
-		handle = stdcall1(_GetStdHandle, _STD_ERROR_HANDLE)
-	default:
-		// assume fd is real windows handle.
-		handle = fd
-	}
-	var written uint32
-	stdcall5(_WriteFile, handle, uintptr(buf), uintptr(n), uintptr(unsafe.Pointer(&written)), 0)
-	return int32(written)
-}
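
The pseudo-handle constants above work because bitwise complement reproduces the two's-complement bit pattern of a negative value: ^x equals -x-1, so ^uintptr(10) is -11 (STD_OUTPUT_HANDLE) reinterpreted as unsigned. A one-line check of the identity:

package main

import "fmt"

func main() {
	// ^x == -x-1 under two's complement, so these match the
	// _STD_OUTPUT_HANDLE and _STD_ERROR_HANDLE constants above.
	fmt.Println(int64(^uint64(10)), int64(^uint64(11))) // -11 -12
}
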
-
-//go:nosplit
-func semasleep(ns int64) int32 {
-	// store ms in ns to save stack space
-	if ns < 0 {
-		ns = _INFINITE
-	} else {
-		ns = int64(timediv(ns, 1000000, nil))
-		if ns == 0 {
-			ns = 1
-		}
-	}
-	if stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(ns)) != 0 {
-		return -1 // timeout
-	}
-	return 0
-}
-
-//go:nosplit
-func semawakeup(mp *m) {
-	stdcall1(_SetEvent, mp.waitsema)
-}
-
-//go:nosplit
-func semacreate() uintptr {
-	return stdcall4(_CreateEventA, 0, 0, 0, 0)
-}
-
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newosproc(mp *m, stk unsafe.Pointer) {
-	const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000
-	thandle := stdcall6(_CreateThread, 0, 0x20000,
-		funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)),
-		_STACK_SIZE_PARAM_IS_A_RESERVATION, 0)
-	if thandle == 0 {
-		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", getlasterror(), ")\n")
-		throw("runtime.newosproc")
-	}
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
-func mpreinit(mp *m) {
-}
-
-func msigsave(mp *m) {
-}
-
-// Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
-func minit() {
-	var thandle uintptr
-	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
-	atomicstoreuintptr(&getg().m.thread, thandle)
-}
-
-// Called from dropm to undo the effect of an minit.
-func unminit() {
-	tp := &getg().m.thread
-	stdcall1(_CloseHandle, *tp)
-	*tp = 0
-}
-
-// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
-type _KSYSTEM_TIME struct {
-	LowPart   uint32
-	High1Time int32
-	High2Time int32
-}
-
-const (
-	_INTERRUPT_TIME = 0x7ffe0008
-	_SYSTEM_TIME    = 0x7ffe0014
-)
-
-//go:nosplit
-func systime(addr uintptr) int64 {
-	timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
-
-	var t _KSYSTEM_TIME
-	for i := 1; i < 10000; i++ {
-		// these fields must be read in that order (see URL above)
-		t.High1Time = timeaddr.High1Time
-		t.LowPart = timeaddr.LowPart
-		t.High2Time = timeaddr.High2Time
-		if t.High1Time == t.High2Time {
-			return int64(t.High1Time)<<32 | int64(t.LowPart)
-		}
-		if (i % 100) == 0 {
-			osyield()
-		}
-	}
-	systemstack(func() {
-		throw("interrupt/system time is changing too fast")
-	})
-	return 0
-}
-
-//go:nosplit
-func unixnano() int64 {
-	return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
-}
-
-//go:nosplit
-func nanotime() int64 {
-	return systime(_INTERRUPT_TIME) * 100
-}
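
systime above is a lock-free read of a 64-bit value the kernel publishes in three 32-bit parts; the reader retries until the two high words agree, which guarantees the low word read between them came from the same update. A minimal in-process model of that protocol, assuming a toy struct in place of the shared KUSER_SHARED_DATA page:

package main

import (
	"fmt"
	"sync/atomic"
)

type ktime struct {
	lowPart   uint32
	high1Time int32
	high2Time int32
}

// load mirrors the systime loop: when both high words match, the low
// word read between them belongs to the same 64-bit value.
func (t *ktime) load() int64 {
	for {
		h1 := atomic.LoadInt32(&t.high1Time)
		lo := atomic.LoadUint32(&t.lowPart)
		h2 := atomic.LoadInt32(&t.high2Time)
		if h1 == h2 {
			return int64(h1)<<32 | int64(lo)
		}
	}
}

// store is the single writer: high2 first, high1 last, so any reader
// that observes equal high words also observed a coherent low word.
func (t *ktime) store(v int64) {
	atomic.StoreInt32(&t.high2Time, int32(v>>32))
	atomic.StoreUint32(&t.lowPart, uint32(v))
	atomic.StoreInt32(&t.high1Time, int32(v>>32))
}

func main() {
	var t ktime
	t.store(1<<32 | 3)
	fmt.Println(t.load()) // 4294967299
}

The 116444736000000000 offset in unixnano is the number of 100ns intervals between the Windows epoch (1601) and the Unix epoch (1970).
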
-
-// Calling stdcall on os stack.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-//go:nosplit
-func stdcall(fn stdFunction) uintptr {
-	gp := getg()
-	mp := gp.m
-	mp.libcall.fn = uintptr(unsafe.Pointer(fn))
-
-	if mp.profilehz != 0 {
-		// leave pc/sp for cpu profiler
-		mp.libcallg.set(gp)
-		mp.libcallpc = getcallerpc(unsafe.Pointer(&fn))
-		// sp must be the last, because once async cpu profiler finds
-		// all three values to be non-zero, it will use them
-		mp.libcallsp = getcallersp(unsafe.Pointer(&fn))
-	}
-	asmcgocall(asmstdcallAddr, unsafe.Pointer(&mp.libcall))
-	mp.libcallsp = 0
-	return mp.libcall.r1
-}
-
-//go:nosplit
-func stdcall0(fn stdFunction) uintptr {
-	mp := getg().m
-	mp.libcall.n = 0
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&fn))) // it's unused but must be non-nil, otherwise crashes
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall1(fn stdFunction, a0 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 1
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 2
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 3
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 4
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 5
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 6
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-//go:nosplit
-func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
-	mp := getg().m
-	mp.libcall.n = 7
-	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
-	return stdcall(fn)
-}
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func usleep1(usec uint32)
-
-//go:nosplit
-func osyield() {
-	usleep1(1)
-}
-
-//go:nosplit
-func usleep(us uint32) {
-	// Have 1us units; want 100ns units.
-	usleep1(10 * us)
-}
-
-func ctrlhandler1(_type uint32) uint32 {
-	var s uint32
-
-	switch _type {
-	case _CTRL_C_EVENT, _CTRL_BREAK_EVENT:
-		s = _SIGINT
-	default:
-		return 0
-	}
-
-	if sigsend(s) {
-		return 1
-	}
-	exit(2) // SIGINT, SIGTERM, etc
-	return 0
-}
-
-// in sys_windows_386.s and sys_windows_amd64.s
-func profileloop()
-
-var profiletimer uintptr
-
-func profilem(mp *m) {
-	var r *context
-	rbuf := make([]byte, unsafe.Sizeof(*r)+15)
-
-	tls := &mp.tls[0]
-	if mp == &m0 {
-		tls = &tls0[0]
-	}
-	gp := *((**g)(unsafe.Pointer(tls)))
-
-	// align Context to 16 bytes
-	r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
-	r.contextflags = _CONTEXT_CONTROL
-	stdcall2(_GetThreadContext, mp.thread, uintptr(unsafe.Pointer(r)))
-	sigprof(r.ip(), r.sp(), 0, gp, mp)
-}
-
-func profileloop1(param uintptr) uint32 {
-	stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
-
-	for {
-		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
-		first := (*m)(atomicloadp(unsafe.Pointer(&allm)))
-		for mp := first; mp != nil; mp = mp.alllink {
-			thread := atomicloaduintptr(&mp.thread)
-			// Do not profile threads blocked on Notes,
-			// this includes idle worker threads,
-			// idle timer thread, idle heap scavenger, etc.
-			if thread == 0 || mp.profilehz == 0 || mp.blocked {
-				continue
-			}
-			stdcall1(_SuspendThread, thread)
-			if mp.profilehz != 0 && !mp.blocked {
-				profilem(mp)
-			}
-			stdcall1(_ResumeThread, thread)
-		}
-	}
-}
-
-var cpuprofilerlock mutex
-
-func resetcpuprofiler(hz int32) {
-	lock(&cpuprofilerlock)
-	if profiletimer == 0 {
-		timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
-		atomicstoreuintptr(&profiletimer, timer)
-		thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
-		stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
-		stdcall1(_CloseHandle, thread)
-	}
-	unlock(&cpuprofilerlock)
-
-	ms := int32(0)
-	due := ^int64(^uint64(1 << 63))
-	if hz > 0 {
-		ms = 1000 / hz
-		if ms == 0 {
-			ms = 1
-		}
-		due = int64(ms) * -10000
-	}
-	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
-	atomicstore((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
-}
-
-func memlimit() uintptr {
-	return 0
-}
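
resetcpuprofiler above encodes the sampling period twice for SetWaitableTimer: ms as the repeat interval in milliseconds, and due as the first expiry, where a negative count of 100ns units means a relative time. A sketch of that arithmetic, assuming hz > 0:

package main

import "fmt"

// profileTimerArgs mirrors the due/ms computation in resetcpuprofiler:
// ms is the repeat period in milliseconds (at least 1), and due is the
// first expiry as a negative (relative) time in 100ns units.
func profileTimerArgs(hz int32) (ms int32, due int64) {
	ms = 1000 / hz
	if ms == 0 {
		ms = 1
	}
	due = int64(ms) * -10000 // 1ms = 10000 * 100ns; negative means relative
	return ms, due
}

func main() {
	fmt.Println(profileTimerArgs(100))  // 10 -100000
	fmt.Println(profileTimerArgs(4000)) // 1 -10000
}
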
diff --git a/src/runtime/os2_darwin.go b/src/runtime/os2_darwin.go
deleted file mode 100644
index 542bd74..0000000
--- a/src/runtime/os2_darwin.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_NSIG        = 32
-	_SI_USER     = 0 /* empirically true, but not what headers say */
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-	_SS_DISABLE  = 4
-)
diff --git a/src/runtime/os2_dragonfly.go b/src/runtime/os2_dragonfly.go
deleted file mode 100644
index ccad82f..0000000
--- a/src/runtime/os2_dragonfly.go
+++ /dev/null
@@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_NSIG        = 33
-	_SI_USER     = 0x10001
-	_SS_DISABLE  = 4
-	_RLIMIT_AS   = 10
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-)
diff --git a/src/runtime/os2_linux.go b/src/runtime/os2_linux.go
deleted file mode 100644
index 71f36eb..0000000
--- a/src/runtime/os2_linux.go
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_SS_DISABLE  = 2
-	_NSIG        = 65
-	_SI_USER     = 0
-	_SIG_BLOCK   = 0
-	_SIG_UNBLOCK = 1
-	_SIG_SETMASK = 2
-	_RLIMIT_AS   = 9
-)
-
-// It's hard to tease out exactly how big a Sigset is, but
-// rt_sigprocmask crashes if we get it wrong, so if binaries
-// are running, this is right.
-type sigset [2]uint32
-
-type rlimit struct {
-	rlim_cur uintptr
-	rlim_max uintptr
-}
diff --git a/src/runtime/os2_nacl.go b/src/runtime/os2_nacl.go
index 0c91e0f..b84cb18 100644
--- a/src/runtime/os2_nacl.go
+++ b/src/runtime/os2_nacl.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -10,18 +10,19 @@
 
 	// native_client/src/trusted/service_runtime/include/sys/errno.h
 	// The errors are mainly copied from Linux.
-	_EPERM           = 1        /* Operation not permitted */
-	_ENOENT          = 2        /* No such file or directory */
-	_ESRCH           = 3        /* No such process */
-	_EINTR           = 4        /* Interrupted system call */
-	_EIO             = 5        /* I/O error */
-	_ENXIO           = 6        /* No such device or address */
-	_E2BIG           = 7        /* Argument list too long */
-	_ENOEXEC         = 8        /* Exec format error */
-	_EBADF           = 9        /* Bad file number */
-	_ECHILD          = 10       /* No child processes */
-	_EAGAIN          = 11       /* Try again */
-	_ENOMEM          = 12       /* Out of memory */
+	_EPERM   = 1  /* Operation not permitted */
+	_ENOENT  = 2  /* No such file or directory */
+	_ESRCH   = 3  /* No such process */
+	_EINTR   = 4  /* Interrupted system call */
+	_EIO     = 5  /* I/O error */
+	_ENXIO   = 6  /* No such device or address */
+	_E2BIG   = 7  /* Argument list too long */
+	_ENOEXEC = 8  /* Exec format error */
+	_EBADF   = 9  /* Bad file number */
+	_ECHILD  = 10 /* No child processes */
+	_EAGAIN  = 11 /* Try again */
+	// _ENOMEM is defined in mem_bsd.go for nacl.
+	// _ENOMEM          = 12       /* Out of memory */
 	_EACCES          = 13       /* Permission denied */
 	_EFAULT          = 14       /* Bad address */
 	_EBUSY           = 16       /* Device or resource busy */
diff --git a/src/runtime/os2_netbsd.go b/src/runtime/os2_netbsd.go
deleted file mode 100644
index 46576b9..0000000
--- a/src/runtime/os2_netbsd.go
+++ /dev/null
@@ -1,18 +0,0 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_SS_DISABLE  = 4
-	_SIG_BLOCK   = 1
-	_SIG_UNBLOCK = 2
-	_SIG_SETMASK = 3
-	_NSIG        = 33
-	_SI_USER     = 0
-
-	// From NetBSD's <sys/ucontext.h>
-	_UC_SIGMASK = 0x01
-	_UC_CPU     = 0x04
-)
diff --git a/src/runtime/os2_openbsd.go b/src/runtime/os2_openbsd.go
index 1e785ad..8656a91 100644
--- a/src/runtime/os2_openbsd.go
+++ b/src/runtime/os2_openbsd.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/os2_windows.go b/src/runtime/os2_windows.go
deleted file mode 100644
index a867dfe..0000000
--- a/src/runtime/os2_windows.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-func getlasterror() uint32
-func setlasterror(err uint32)
-
-// Function to be called by windows CreateThread
-// to start new os thread.
-func tstart_stdcall(newm *m) uint32
-
-func ctrlhandler(_type uint32) uint32
-
-// TODO(brainman): should not need those
-const (
-	_NSIG = 65
-)
diff --git a/src/runtime/os3_plan9.go b/src/runtime/os3_plan9.go
index 03e9410..aff1d05 100644
--- a/src/runtime/os3_plan9.go
+++ b/src/runtime/os3_plan9.go
@@ -1,19 +1,24 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(_ureg *ureg, note *byte, gp *g) int {
 	_g_ := getg()
 	var t sigTabT
 	var docrash bool
 	var sig int
 	var flags int
+	var level int32
 
 	c := &sigctxt{_ureg}
 	notestr := gostringnocopy(note)
@@ -50,8 +55,8 @@
 		gp.sig = uint32(sig)
 		gp.sigpc = c.pc()
 
-		pc := uintptr(c.pc())
-		sp := uintptr(c.sp())
+		pc := c.pc()
+		sp := c.sp()
 
 		// If we don't recognize the PC as code
 		// but we do recognize the top pointer on the stack as code,
@@ -61,22 +66,37 @@
 			pc = 0
 		}
 
-		// Only push sigpanic if PC != 0.
-		//
+		// If LR exists, sigpanictramp must save it to the stack
+		// before entry to sigpanic so that panics in leaf
+		// functions are correctly handled. This will smash
+		// the stack frame but we're not going back there
+		// anyway.
+		if usesLR {
+			c.savelr(c.lr())
+		}
+
 		// If PC == 0, probably panicked because of a call to a nil func.
-		// Not pushing that onto SP will make the trace look like a call
+		// Not faking that as the return address will make the trace look like a call
 		// to sigpanic instead. (Otherwise the trace will end at
 		// sigpanic and we won't get to see who faulted).
 		if pc != 0 {
-			if regSize > ptrSize {
-				sp -= ptrSize
-				*(*uintptr)(unsafe.Pointer(sp)) = 0
+			if usesLR {
+				c.setlr(pc)
+			} else {
+				if sys.RegSize > sys.PtrSize {
+					sp -= sys.PtrSize
+					*(*uintptr)(unsafe.Pointer(sp)) = 0
+				}
+				sp -= sys.PtrSize
+				*(*uintptr)(unsafe.Pointer(sp)) = pc
+				c.setsp(sp)
 			}
-			sp -= ptrSize
-			*(*uintptr)(unsafe.Pointer(sp)) = pc
-			c.setsp(sp)
 		}
-		c.setpc(funcPC(sigpanic))
+		if usesLR {
+			c.setpc(funcPC(sigpanictramp))
+		} else {
+			c.setpc(funcPC(sigpanic))
+		}
 		return _NCONT
 	}
 	if flags&_SigNotify != 0 {
@@ -97,9 +117,10 @@
 	print(notestr, "\n")
 	print("PC=", hex(c.pc()), "\n")
 	print("\n")
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash = gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
-		tracebacktrap(c.pc(), c.sp(), 0, gp)
+		tracebacktrap(c.pc(), c.sp(), c.lr(), gp)
 		tracebackothers(gp)
 		print("\n")
 		dumpregs(_ureg)
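
The rewritten hunk above fakes a call to sigpanic: on LR architectures the faulting pc is planted in the link register, otherwise it is pushed onto the goroutine stack (with a padding word when the register size exceeds the pointer size), so the unwinder sees the faulting pc as sigpanic's return address. A toy model of the non-LR push over a slice standing in for the stack, with hypothetical names:

package main

import "fmt"

// pushFakeFrame mimics the non-LR branch: decrement sp and store the
// faulting pc, so a traceback through sigpanic returns into the code
// that faulted. In the real handler, c.setsp(sp) and
// c.setpc(funcPC(sigpanic)) follow.
func pushFakeFrame(stack []uintptr, sp int, pc uintptr) int {
	sp--           // sp -= sys.PtrSize in the real code
	stack[sp] = pc // *(*uintptr)(unsafe.Pointer(sp)) = pc
	return sp
}

func main() {
	stack := make([]uintptr, 8)
	sp := len(stack)
	sp = pushFakeFrame(stack, sp, 0x4a3b2c)
	fmt.Printf("sp=%d return-pc=%#x\n", sp, stack[sp]) // sp=7 return-pc=0x4a3b2c
}
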
diff --git a/src/runtime/os3_solaris.go b/src/runtime/os3_solaris.go
index 792188f..9368e0d 100644
--- a/src/runtime/os3_solaris.go
+++ b/src/runtime/os3_solaris.go
@@ -159,12 +159,15 @@
 	}
 
 	// Disable signals during create, so that the new thread starts
-	// with signals disabled.  It will enable them in minit.
+	// with signals disabled. It will enable them in minit.
 	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
 	ret = pthread_create(&tid, &attr, funcPC(tstart_sysvicall), unsafe.Pointer(mp))
 	sigprocmask(_SIG_SETMASK, &oset, nil)
 	if ret != 0 {
 		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", ret, ")\n")
+		if ret == -_EAGAIN {
+			println("runtime: may need to increase max user processes (ulimit -u)")
+		}
 		throw("newosproc")
 	}
 }
@@ -192,24 +195,45 @@
 
 func miniterrno()
 
+//go:nosplit
 func msigsave(mp *m) {
-	smask := (*sigset)(unsafe.Pointer(&mp.sigmask))
-	if unsafe.Sizeof(*smask) > unsafe.Sizeof(mp.sigmask) {
-		throw("insufficient storage for signal mask")
-	}
-	sigprocmask(_SIG_SETMASK, nil, smask)
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
 }
 
 // Called to initialize a new m (including the bootstrap m).
-// Called on the new thread, can not allocate memory.
+// Called on the new thread, cannot allocate memory.
 func minit() {
 	_g_ := getg()
 	asmcgocall(unsafe.Pointer(funcPC(miniterrno)), unsafe.Pointer(&libc____errno))
 	// Initialize signal handling
-	signalstack(&_g_.m.gsignal.stack)
+	var st sigaltstackt
+	sigaltstack(nil, &st)
+	if st.ss_flags&_SS_DISABLE != 0 {
+		signalstack(&_g_.m.gsignal.stack)
+		_g_.m.newSigstack = true
+	} else {
+		// Use existing signal stack.
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		_g_.m.gsignal.stack.lo = stsp
+		_g_.m.gsignal.stack.hi = stsp + uintptr(st.ss_size)
+		_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+		_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+		_g_.m.gsignal.stackAlloc = uintptr(st.ss_size)
+		_g_.m.newSigstack = false
+	}
 
 	// restore signal mask from m.sigmask and unblock essential signals
-	nmask := *(*sigset)(unsafe.Pointer(&_g_.m.sigmask))
+	nmask := _g_.m.sigmask
 	for i := range sigtable {
 		if sigtable[i].flags&_SigUnblock != 0 {
 			nmask.__sigbits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
@@ -220,11 +244,9 @@
 
 // Called from dropm to undo the effect of an minit.
 func unminit() {
-	_g_ := getg()
-	smask := (*sigset)(unsafe.Pointer(&_g_.m.sigmask))
-	sigprocmask(_SIG_SETMASK, smask, nil)
-
-	signalstack(nil)
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
 }
 
 func memlimit() uintptr {
@@ -247,7 +269,7 @@
 			return 0;
 
 		// If there's not at least 16 MB left, we're probably
-		// not going to be able to do much.  Treat as no limit.
+		// not going to be able to do much. Treat as no limit.
 		rl.rlim_cur -= used;
 		if(rl.rlim_cur < (16<<20))
 			return 0;
@@ -260,6 +282,8 @@
 
 func sigtramp()
 
+//go:nosplit
+//go:nowritebarrierrec
 func setsig(i int32, fn uintptr, restart bool) {
 	var sa sigactiont
 
@@ -276,10 +300,21 @@
 	sigaction(i, &sa, nil)
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func setsigstack(i int32) {
-	throw("setsigstack")
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	handler := *((*uintptr)(unsafe.Pointer(&sa._funcptr)))
+	if handler == 0 || handler == _SIG_DFL || handler == _SIG_IGN || sa.sa_flags&_SA_ONSTACK != 0 {
+		return
+	}
+	sa.sa_flags |= _SA_ONSTACK
+	sigaction(i, &sa, nil)
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func getsig(i int32) uintptr {
 	var sa sigactiont
 	sigaction(i, nil, &sa)
@@ -289,6 +324,7 @@
 	return *((*uintptr)(unsafe.Pointer(&sa._funcptr)))
 }
 
+//go:nosplit
 func signalstack(s *stack) {
 	var st sigaltstackt
 	if s == nil {
@@ -301,6 +337,8 @@
 	sigaltstack(&st, nil)
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func updatesigmask(m sigmask) {
 	var mask sigset
 	copy(mask.__sigbits[:], m[:])
@@ -314,12 +352,16 @@
 }
 
 //go:nosplit
-func semacreate() uintptr {
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+
 	var sem *semt
 	_g_ := getg()
 
-	// Call libc's malloc rather than malloc.  This will
-	// allocate space on the C heap.  We can't call malloc
+	// Call libc's malloc rather than malloc. This will
+	// allocate space on the C heap. We can't call malloc
 	// here because it could cause a deadlock.
 	_g_.m.libcall.fn = uintptr(unsafe.Pointer(&libc_malloc))
 	_g_.m.libcall.n = 1
@@ -331,7 +373,7 @@
 	if sem_init(sem, 0, 0) != 0 {
 		throw("sem_init")
 	}
-	return uintptr(unsafe.Pointer(sem))
+	mp.waitsema = uintptr(unsafe.Pointer(sem))
 }
 
 //go:nosplit
@@ -403,7 +445,21 @@
 
 //go:nosplit
 func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
-	return unsafe.Pointer(sysvicall6(&libc_mmap, uintptr(addr), uintptr(n), uintptr(prot), uintptr(flags), uintptr(fd), uintptr(off)))
+	p, err := doMmap(uintptr(addr), n, uintptr(prot), uintptr(flags), uintptr(fd), uintptr(off))
+	if p == ^uintptr(0) {
+		return unsafe.Pointer(err)
+	}
+	return unsafe.Pointer(p)
+}
+
+//go:nosplit
+func doMmap(addr, n, prot, flags, fd, off uintptr) (uintptr, uintptr) {
+	var libcall libcall
+	libcall.fn = uintptr(unsafe.Pointer(&libc_mmap))
+	libcall.n = 6
+	libcall.args = uintptr(noescape(unsafe.Pointer(&addr)))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
+	return libcall.r1, libcall.err
 }
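
The new mmap wrapper above folds the two-value result back into the pointer-or-errno convention its callers expect: on failure doMmap yields ^uintptr(0) and the errno travels back disguised as a small fake pointer. A hedged sketch of how a caller can tell the cases apart, with hypothetical names:

package main

import (
	"fmt"
	"unsafe"
)

// decodeMmapResult mirrors the convention above: a result of ^uintptr(0)
// means failure, and the accompanying value is the errno that mmap then
// hands to its caller as a pointer-shaped value.
func decodeMmapResult(p, err uintptr) (addr unsafe.Pointer, errno uintptr, ok bool) {
	if p == ^uintptr(0) {
		return nil, err, false
	}
	return unsafe.Pointer(p), 0, true
}

func main() {
	// 12 is illustrative (ENOMEM on Solaris).
	if _, errno, ok := decodeMmapResult(^uintptr(0), 12); !ok {
		fmt.Println("mmap failed, errno =", errno) // mmap failed, errno = 12
	}
}
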
 
 //go:nosplit
@@ -447,6 +503,8 @@
 	return int32(sysvicall4(&libc_pthread_create, uintptr(unsafe.Pointer(thread)), uintptr(unsafe.Pointer(attr)), uintptr(fn), uintptr(arg)))
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func raise(sig int32) /* int32 */ {
 	sysvicall1(&libc_raise, uintptr(sig))
 }
@@ -485,14 +543,20 @@
 	sysvicall3(&libc_setitimer, uintptr(which), uintptr(unsafe.Pointer(value)), uintptr(unsafe.Pointer(ovalue)))
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func sigaction(sig int32, act *sigactiont, oact *sigactiont) /* int32 */ {
 	sysvicall3(&libc_sigaction, uintptr(sig), uintptr(unsafe.Pointer(act)), uintptr(unsafe.Pointer(oact)))
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func sigaltstack(ss *sigaltstackt, oss *sigaltstackt) /* int32 */ {
 	sysvicall2(&libc_sigaltstack, uintptr(unsafe.Pointer(ss)), uintptr(unsafe.Pointer(oss)))
 }
 
+//go:nosplit
+//go:nowritebarrierrec
 func sigprocmask(how int32, set *sigset, oset *sigset) /* int32 */ {
 	sysvicall3(&libc_sigprocmask, uintptr(how), uintptr(unsafe.Pointer(set)), uintptr(unsafe.Pointer(oset)))
 }
diff --git a/src/runtime/os_android_arm.go b/src/runtime/os_android.go
similarity index 100%
rename from src/runtime/os_android_arm.go
rename to src/runtime/os_android.go
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 3deafd5..a0e3d8e 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,6 +6,11 @@
 
 import "unsafe"
 
+type mOS struct {
+	machport uint32 // return address for mach ipc
+	waitsema uint32 // semaphore for parking on locks
+}
+
 func bsdthread_create(stk, arg unsafe.Pointer, fn uintptr) int32
 func bsdthread_register() int32
 
@@ -19,11 +24,481 @@
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
 
-//go:noescape
-func sigprocmask(how uint32, new, old *uint32)
+func unimplemented(name string) {
+	println(name, "not implemented")
+	*(*int)(unsafe.Pointer(uintptr(1231))) = 1231
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	mach_semrelease(mp.waitsema)
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+	systemstack(func() {
+		mp.waitsema = mach_semcreate()
+	})
+}
+
+// BSD interface for threading.
+func osinit() {
+	// bsdthread_register delayed until end of goenvs so that we
+	// can look at the environment first.
+
+	ncpu = getncpu()
+}
+
+func getncpu() int32 {
+	// Use sysctl to fetch hw.ncpu.
+	mib := [2]uint32{6, 3}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 && int32(out) > 0 {
+		return int32(out)
+	}
+	return 1
+}
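
getncpu above issues the raw sysctl with MIB {_CTL_HW, _HW_NCPU} = {6, 3}. Outside the runtime the same query is reachable by name; a sketch for darwin using the stdlib syscall package:

//go:build darwin

package main

import (
	"fmt"
	"syscall"
)

func main() {
	// "hw.ncpu" resolves to the same {6, 3} MIB the runtime
	// hardcodes above.
	n, err := syscall.SysctlUint32("hw.ncpu")
	if err != nil {
		n = 1 // same fallback as getncpu
	}
	fmt.Println("ncpu =", n)
}
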
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+
+	// Register our thread-creation callback (see sys_darwin_{amd64,386}.s)
+	// but only if we're not using cgo. If we are using cgo we need
+	// to let the C pthread library install its own thread-creation callback.
+	if !iscgo {
+		if bsdthread_register() != 0 {
+			if gogetenv("DYLD_INSERT_LIBRARIES") != "" {
+				throw("runtime: bsdthread_register error (unset DYLD_INSERT_LIBRARIES)")
+			}
+			throw("runtime: bsdthread_register error")
+		}
+	}
+}
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+	errno := bsdthread_create(stk, unsafe.Pointer(mp), funcPC(mstart))
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+
+	if errno < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -errno, ")\n")
+		throw("runtime.newosproc")
+	}
+}
+
+// newosproc0 is a version of newosproc that can be called before the runtime
+// is initialized.
+//
+// As Go uses bsdthread_register when running without cgo, this function is
+// not safe to use after initialization as it does not pass an M as fnarg.
+//
+//go:nosplit
+func newosproc0(stacksize uintptr, fn unsafe.Pointer, fnarg uintptr) {
+	stack := sysAlloc(stacksize, &memstats.stacks_sys)
+	if stack == nil {
+		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		exit(1)
+	}
+	stk := unsafe.Pointer(uintptr(stack) + stacksize)
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+	errno := bsdthread_create(stk, fn, fnarg)
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+
+	if errno < 0 {
+		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		exit(1)
+	}
+}
+
+var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
+var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
+
+// Called to do synchronous initialization of Go code built with
+// -buildmode=c-archive or -buildmode=c-shared.
+// None of the Go runtime is initialized.
+//go:nosplit
+//go:nowritebarrierrec
+func libpreinit() {
+	initsig(true)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024) // OS X wants >= 8K
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	// Initialize signal handling.
+	_g_ := getg()
+
+	// The alternate signal stack is buggy on arm and arm64.
+	// The signal handler handles it directly.
+	// The sigaltstack assembly function does nothing.
+	if GOARCH != "arm" && GOARCH != "arm64" {
+		var st stackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			signalstack(&_g_.m.gsignal.stack)
+			_g_.m.newSigstack = true
+		} else {
+			// Use existing signal stack.
+			stsp := uintptr(unsafe.Pointer(st.ss_sp))
+			_g_.m.gsignal.stack.lo = stsp
+			_g_.m.gsignal.stack.hi = stsp + st.ss_size
+			_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+			_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+			_g_.m.gsignal.stackAlloc = st.ss_size
+			_g_.m.newSigstack = false
+		}
+	}
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask &^= 1 << (uint32(i) - 1)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+// Mach IPC, to get at semaphores
+// Definitions are in /usr/include/mach on a Mac.
+
+func macherror(r int32, fn string) {
+	print("mach error ", fn, ": ", r, "\n")
+	throw("mach error")
+}
+
+const _DebugMach = false
+
+var zerondr machndr
+
+func mach_msgh_bits(a, b uint32) uint32 {
+	return a | b<<8
+}
+
+func mach_msg(h *machheader, op int32, send_size, rcv_size, rcv_name, timeout, notify uint32) int32 {
+	// TODO: Loop on interrupt.
+	return mach_msg_trap(unsafe.Pointer(h), op, send_size, rcv_size, rcv_name, timeout, notify)
+}
+
+// Mach RPC (MIG)
+const (
+	_MinMachMsg = 48
+	_MachReply  = 100
+)
+
+type codemsg struct {
+	h    machheader
+	ndr  machndr
+	code int32
+}
+
+func machcall(h *machheader, maxsize int32, rxsize int32) int32 {
+	_g_ := getg()
+	port := _g_.m.machport
+	if port == 0 {
+		port = mach_reply_port()
+		_g_.m.machport = port
+	}
+
+	h.msgh_bits |= mach_msgh_bits(_MACH_MSG_TYPE_COPY_SEND, _MACH_MSG_TYPE_MAKE_SEND_ONCE)
+	h.msgh_local_port = port
+	h.msgh_reserved = 0
+	id := h.msgh_id
+
+	if _DebugMach {
+		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
+		print("send:\t")
+		var i uint32
+		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
+			print(" ", p[i])
+			if i%8 == 7 {
+				print("\n\t")
+			}
+		}
+		if i%8 != 0 {
+			print("\n")
+		}
+	}
+	ret := mach_msg(h, _MACH_SEND_MSG|_MACH_RCV_MSG, h.msgh_size, uint32(maxsize), port, 0, 0)
+	if ret != 0 {
+		if _DebugMach {
+			print("mach_msg error ", ret, "\n")
+		}
+		return ret
+	}
+	if _DebugMach {
+		p := (*[10000]unsafe.Pointer)(unsafe.Pointer(h))
+		var i uint32
+		for i = 0; i < h.msgh_size/uint32(unsafe.Sizeof(p[0])); i++ {
+			print(" ", p[i])
+			if i%8 == 7 {
+				print("\n\t")
+			}
+		}
+		if i%8 != 0 {
+			print("\n")
+		}
+	}
+	if h.msgh_id != id+_MachReply {
+		if _DebugMach {
+			print("mach_msg _MachReply id mismatch ", h.msgh_id, " != ", id+_MachReply, "\n")
+		}
+		return -303 // MIG_REPLY_MISMATCH
+	}
+	// Look for a response giving the return value.
+	// Any call can send this back with an error,
+	// and some calls only have return values so they
+	// send it back on success too. I don't quite see how
+	// you know it's one of these and not the full response
+	// format, so just look if the message is right.
+	c := (*codemsg)(unsafe.Pointer(h))
+	if uintptr(h.msgh_size) == unsafe.Sizeof(*c) && h.msgh_bits&_MACH_MSGH_BITS_COMPLEX == 0 {
+		if _DebugMach {
+			print("mig result ", c.code, "\n")
+		}
+		return c.code
+	}
+	if h.msgh_size != uint32(rxsize) {
+		if _DebugMach {
+			print("mach_msg _MachReply size mismatch ", h.msgh_size, " != ", rxsize, "\n")
+		}
+		return -307 // MIG_ARRAY_TOO_LARGE
+	}
+	return 0
+}
+
+// Semaphores!
+
+const (
+	tmach_semcreate = 3418
+	rmach_semcreate = tmach_semcreate + _MachReply
+
+	tmach_semdestroy = 3419
+	rmach_semdestroy = tmach_semdestroy + _MachReply
+
+	_KERN_ABORTED             = 14
+	_KERN_OPERATION_TIMED_OUT = 49
+)
+
+type tmach_semcreatemsg struct {
+	h      machheader
+	ndr    machndr
+	policy int32
+	value  int32
+}
+
+type rmach_semcreatemsg struct {
+	h         machheader
+	body      machbody
+	semaphore machport
+}
+
+type tmach_semdestroymsg struct {
+	h         machheader
+	body      machbody
+	semaphore machport
+}
+
+func mach_semcreate() uint32 {
+	var m [256]uint8
+	tx := (*tmach_semcreatemsg)(unsafe.Pointer(&m))
+	rx := (*rmach_semcreatemsg)(unsafe.Pointer(&m))
+
+	tx.h.msgh_bits = 0
+	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
+	tx.h.msgh_remote_port = mach_task_self()
+	tx.h.msgh_id = tmach_semcreate
+	tx.ndr = zerondr
+
+	tx.policy = 0 // 0 = SYNC_POLICY_FIFO
+	tx.value = 0
+
+	for {
+		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), int32(unsafe.Sizeof(*rx)))
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_create")
+	}
+	if rx.body.msgh_descriptor_count != 1 {
+		unimplemented("mach_semcreate desc count")
+	}
+	return rx.semaphore.name
+}
+
+func mach_semdestroy(sem uint32) {
+	var m [256]uint8
+	tx := (*tmach_semdestroymsg)(unsafe.Pointer(&m))
+
+	tx.h.msgh_bits = _MACH_MSGH_BITS_COMPLEX
+	tx.h.msgh_size = uint32(unsafe.Sizeof(*tx))
+	tx.h.msgh_remote_port = mach_task_self()
+	tx.h.msgh_id = tmach_semdestroy
+	tx.body.msgh_descriptor_count = 1
+	tx.semaphore.name = sem
+	tx.semaphore.disposition = _MACH_MSG_TYPE_MOVE_SEND
+	tx.semaphore._type = 0
+
+	for {
+		r := machcall(&tx.h, int32(unsafe.Sizeof(m)), 0)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_destroy")
+	}
+}
+
+// The other calls have simple system call traps in sys_darwin_{amd64,386}.s
+
+func mach_semaphore_wait(sema uint32) int32
+func mach_semaphore_timedwait(sema, sec, nsec uint32) int32
+func mach_semaphore_signal(sema uint32) int32
+func mach_semaphore_signal_all(sema uint32) int32
+
+func semasleep1(ns int64) int32 {
+	_g_ := getg()
+
+	if ns >= 0 {
+		var nsecs int32
+		secs := timediv(ns, 1000000000, &nsecs)
+		r := mach_semaphore_timedwait(_g_.m.waitsema, uint32(secs), uint32(nsecs))
+		if r == _KERN_ABORTED || r == _KERN_OPERATION_TIMED_OUT {
+			return -1
+		}
+		if r != 0 {
+			macherror(r, "semaphore_wait")
+		}
+		return 0
+	}
+
+	for {
+		r := mach_semaphore_wait(_g_.m.waitsema)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+		macherror(r, "semaphore_wait")
+	}
+	return 0
+}
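
semasleep1 above splits the nanosecond timeout into the (sec, nsec) pair mach_semaphore_timedwait expects via timediv, which returns the quotient and passes the remainder back through a pointer. A simplified sketch of that split (the real timediv also clamps the quotient to fit in an int32):

package main

import "fmt"

// timedivSketch divides ns by div, returning the quotient and storing
// the remainder through rem: the shape of the runtime's timediv used
// above to turn a timeout into seconds plus leftover nanoseconds.
func timedivSketch(ns int64, div int32, rem *int32) int32 {
	q := ns / int64(div)
	if rem != nil {
		*rem = int32(ns % int64(div))
	}
	return int32(q)
}

func main() {
	var nsecs int32
	secs := timedivSketch(2500000000, 1000000000, &nsecs)
	fmt.Println(secs, nsecs) // 2 500000000
}
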
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	var r int32
+	systemstack(func() {
+		r = semasleep1(ns)
+	})
+	return r
+}
+
+//go:nosplit
+func mach_semrelease(sem uint32) {
+	for {
+		r := mach_semaphore_signal(sem)
+		if r == 0 {
+			break
+		}
+		if r == _KERN_ABORTED { // interrupted
+			continue
+		}
+
+		// mach_semrelease must be completely nosplit,
+		// because it is called from Go code.
+		// If we're going to die, start that process on the system stack
+		// to avoid a Go stack split.
+		systemstack(func() { macherror(r, "semaphore_signal") })
+	}
+}
+
+//go:nosplit
+func osyield() {
+	usleep(1)
+}
+
+func memlimit() uintptr {
+	// NOTE(rsc): Could use getrlimit here,
+	// like on FreeBSD or Linux, but Darwin doesn't enforce
+	// ulimit -v, so it's unclear why we'd try to stay within
+	// the limit.
+	return 0
+}
+
+const (
+	_NSIG        = 32
+	_SI_USER     = 0 /* empirically true, but not what headers say */
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_SS_DISABLE  = 4
+)
 
 //go:noescape
-func sigaction(mode uint32, new, old *sigactiont)
+func sigprocmask(how uint32, new, old *sigset)
+
+//go:noescape
+func sigaction(mode uint32, new *sigactiont, old *usigactiont)
 
 //go:noescape
 func sigaltstack(new, old *stackt)
@@ -35,3 +510,73 @@
 
 func raise(sig int32)
 func raiseproc(int32)
+
+//extern SigTabTT runtime·sigtab[];
+
+type sigset uint32
+
+var sigset_all = ^sigset(0)
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = ^uint32(0)
+	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp)) // runtime·sigtramp's job is to call into real handler
+	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = fn
+	sigaction(uint32(i), &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	var osa usigactiont
+	sigaction(uint32(i), nil, &osa)
+	handler := *(*uintptr)(unsafe.Pointer(&osa.__sigaction_u))
+	if handler == 0 || handler == _SIG_DFL || handler == _SIG_IGN || osa.sa_flags&_SA_ONSTACK != 0 {
+		return
+	}
+	var sa sigactiont
+	*(*uintptr)(unsafe.Pointer(&sa.__sigaction_u)) = handler
+	sa.sa_tramp = unsafe.Pointer(funcPC(sigtramp))
+	sa.sa_mask = osa.sa_mask
+	sa.sa_flags = osa.sa_flags | _SA_ONSTACK
+	sigaction(uint32(i), &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa usigactiont
+	sigaction(uint32(i), nil, &sa)
+	return *(*uintptr)(unsafe.Pointer(&sa.__sigaction_u))
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st stackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	s := sigset(m[0])
+	sigprocmask(_SIG_SETMASK, &s, nil)
+}
+
+func unblocksig(sig int32) {
+	mask := sigset(1) << (uint32(sig) - 1)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os_dragonfly.go b/src/runtime/os_dragonfly.go
index b19270a..85d4aad 100644
--- a/src/runtime/os_dragonfly.go
+++ b/src/runtime/os_dragonfly.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,6 +6,18 @@
 
 import "unsafe"
 
+const (
+	_NSIG        = 33
+	_SI_USER     = 0
+	_SS_DISABLE  = 4
+	_RLIMIT_AS   = 10
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+)
+
+type mOS struct{}
+
 //go:noescape
 func lwp_create(param *lwpparams) int32
 
@@ -13,9 +25,6 @@
 func sigaltstack(new, old *sigaltstackt)
 
 //go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
-//go:noescape
 func sigaction(sig int32, new, old *sigactiont)
 
 //go:noescape
@@ -42,3 +51,267 @@
 func osyield()
 
 const stackSystem = 0
+
+// From DragonFly's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+func getncpu() int32 {
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 {
+		return int32(out)
+	}
+	return 1
+}
+
+//go:nosplit
+func futexsleep(addr *uint32, val uint32, ns int64) {
+	systemstack(func() {
+		futexsleep1(addr, val, ns)
+	})
+}
+
+func futexsleep1(addr *uint32, val uint32, ns int64) {
+	var timeout int32
+	if ns >= 0 {
+		// The timeout is specified in microseconds. Ensure the division
+		// below does not round down to zero, which would put us to sleep
+		// indefinitely.
+		timeout = timediv(ns, 1000, nil)
+		if timeout == 0 {
+			timeout = 1
+		}
+	}
+
+	// sys_umtx_sleep will return EWOULDBLOCK (EAGAIN) when the timeout
+	// expires or EBUSY if the mutex value does not match.
+	ret := sys_umtx_sleep(addr, int32(val), timeout)
+	if ret >= 0 || ret == -_EINTR || ret == -_EAGAIN || ret == -_EBUSY {
+		return
+	}
+
+	print("umtx_sleep addr=", addr, " val=", val, " ret=", ret, "\n")
+	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
+}
+
+//go:nosplit
+func futexwakeup(addr *uint32, cnt uint32) {
+	ret := sys_umtx_wakeup(addr, int32(cnt))
+	if ret >= 0 {
+		return
+	}
+
+	systemstack(func() {
+		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
+		*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
+	})
+}
+
+func lwp_start(uintptr)
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " lwp_start=", funcPC(lwp_start), " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+
+	params := lwpparams{
+		start_func: funcPC(lwp_start),
+		arg:        unsafe.Pointer(mp),
+		stack:      uintptr(stk),
+		tid1:       unsafe.Pointer(&mp.procid),
+		tid2:       nil,
+	}
+
+	// TODO: Check for error.
+	lwp_create(&params)
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+
+	// m.procid is a uint64, but lwp_start writes an int32. Fix it up.
+	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
+
+	// Initialize signal handling.
+
+	// On DragonFly a thread created by pthread_create inherits
+	// the signal stack of the creating thread. We always create
+	// a new signal stack here, to avoid having two Go threads
+	// using the same signal stack. This breaks the case of a
+	// thread created in C that calls sigaltstack and then calls a
+	// Go function, because we will lose track of the C code's
+	// sigaltstack, but it's the best we can do.
+	signalstack(&_g_.m.gsignal.stack)
+	_g_.m.newSigstack = true
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	/*
+		TODO: Convert to Go when something actually uses the result.
+
+		Rlimit rl;
+		extern byte runtime·text[], runtime·end[];
+		uintptr used;
+
+		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+			return 0;
+		if(rl.rlim_cur >= 0x7fffffff)
+			return 0;
+
+		// Estimate our VM footprint excluding the heap.
+		// Not an exact science: use size of binary plus
+		// some room for thread stacks.
+		used = runtime·end - runtime·text + (64<<20);
+		if(used >= rl.rlim_cur)
+			return 0;
+
+		// If there's not at least 16 MB left, we're probably
+		// not going to be able to do much. Treat as no limit.
+		rl.rlim_cur -= used;
+		if(rl.rlim_cur < (16<<20))
+			return 0;
+
+		return rl.rlim_cur - used;
+	*/
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_sigaction uintptr
+	sa_flags     int32
+	sa_mask      sigset
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = sigset_all
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+	}
+	sa.sa_sigaction = fn
+	sigaction(i, &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	if sa.sa_sigaction == funcPC(sigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_sigaction
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st sigaltstackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = s.lo
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var mask sigset
+	copy(mask.__bits[:], m[:])
+	sigprocmask(_SIG_SETMASK, &mask, nil)
+}
+
+func unblocksig(sig int32) {
+	var mask sigset
+	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os_freebsd.go b/src/runtime/os_freebsd.go
index 8c8a106..c187ee8 100644
--- a/src/runtime/os_freebsd.go
+++ b/src/runtime/os_freebsd.go
@@ -1,10 +1,15 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type mOS struct{}
 
 //go:noescape
 func thr_new(param *thrparam, size int32)
@@ -13,9 +18,6 @@
 func sigaltstack(new, old *stackt)
 
 //go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
-//go:noescape
 func sigaction(sig int32, new, old *sigactiont)
 
 //go:noescape
@@ -36,3 +38,275 @@
 func sys_umtx_op(addr *uint32, mode int32, val uint32, ptr2, ts *timespec) int32
 
 func osyield()
+
+// From FreeBSD's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+func getncpu() int32 {
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 {
+		return int32(out)
+	}
+	return 1
+}
+
+// FreeBSD's umtx_op syscall is effectively the same as Linux's futex, and
+// thus the code is largely similar. See Linux implementation
+// and lock_futex.go for comments.
+
+//go:nosplit
+func futexsleep(addr *uint32, val uint32, ns int64) {
+	systemstack(func() {
+		futexsleep1(addr, val, ns)
+	})
+}
+
+func futexsleep1(addr *uint32, val uint32, ns int64) {
+	var tsp *timespec
+	if ns >= 0 {
+		var ts timespec
+		ts.tv_nsec = 0
+		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
+		tsp = &ts
+	}
+	ret := sys_umtx_op(addr, _UMTX_OP_WAIT_UINT_PRIVATE, val, nil, tsp)
+	if ret >= 0 || ret == -_EINTR {
+		return
+	}
+	print("umtx_wait addr=", addr, " val=", val, " ret=", ret, "\n")
+	*(*int32)(unsafe.Pointer(uintptr(0x1005))) = 0x1005
+}
+
+//go:nosplit
+func futexwakeup(addr *uint32, cnt uint32) {
+	ret := sys_umtx_op(addr, _UMTX_OP_WAKE_PRIVATE, cnt, nil, nil)
+	if ret >= 0 {
+		return
+	}
+
+	systemstack(func() {
+		print("umtx_wake_addr=", addr, " ret=", ret, "\n")
+	})
+}
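
As the comment above notes, sys_umtx_op's WAIT_UINT/WAKE pair gives FreeBSD the same sleep-while-equal / wake-one primitives as Linux's futex, and lock_futex.go builds the runtime mutex on top of them. A toy sketch of that shape, with a condition variable standing in for the kernel and hypothetical names:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// A toy futex: futexSleep blocks while *addr == val, futexWakeup
// releases sleepers. sys_umtx_op's wait/wake above has this semantics;
// here a condvar stands in for the kernel queue.
var (
	mu   sync.Mutex
	cond = sync.NewCond(&mu)
)

func futexSleep(addr *uint32, val uint32) {
	mu.Lock()
	for atomic.LoadUint32(addr) == val {
		cond.Wait()
	}
	mu.Unlock()
}

func futexWakeup(addr *uint32) {
	mu.Lock()
	cond.Broadcast()
	mu.Unlock()
}

// lock/unlock follow the lock_futex.go shape: an atomic CAS on the
// fast path, futexSleep/futexWakeup when contended.
func lock(l *uint32) {
	for !atomic.CompareAndSwapUint32(l, 0, 1) {
		futexSleep(l, 1)
	}
}

func unlock(l *uint32) {
	atomic.StoreUint32(l, 0)
	futexWakeup(l)
}

func main() {
	var l uint32
	var wg sync.WaitGroup
	n := 0
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 1000; j++ {
				lock(&l)
				n++
				unlock(&l)
			}
		}()
	}
	wg.Wait()
	fmt.Println(n) // 4000
}
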
+
+func thr_start()
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " thr_start=", funcPC(thr_start), " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	// NOTE(rsc): This code is confused. stackbase is the top of the stack
+	// and is equal to stk. However, it's working, so I'm not changing it.
+	param := thrparam{
+		start_func: funcPC(thr_start),
+		arg:        unsafe.Pointer(mp),
+		stack_base: mp.g0.stack.hi,
+		stack_size: uintptr(stk) - mp.g0.stack.hi,
+		child_tid:  unsafe.Pointer(&mp.procid),
+		parent_tid: nil,
+		tls_base:   unsafe.Pointer(&mp.tls[0]),
+		tls_size:   unsafe.Sizeof(mp.tls),
+	}
+
+	var oset sigset
+	sigprocmask(_SIG_SETMASK, &sigset_all, &oset)
+	// TODO: Check for error.
+	thr_new(&param, int32(unsafe.Sizeof(param)))
+	sigprocmask(_SIG_SETMASK, &oset, nil)
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+
+	// m.procid is a uint64, but thr_new writes a uint32 on 32-bit systems.
+	// Fix it up. (Only matters on big-endian, but be clean anyway.)
+	if sys.PtrSize == 4 {
+		_g_.m.procid = uint64(*(*uint32)(unsafe.Pointer(&_g_.m.procid)))
+	}
+
+	// Initialize signal handling.
+	var st stackt
+	sigaltstack(nil, &st)
+	if st.ss_flags&_SS_DISABLE != 0 {
+		signalstack(&_g_.m.gsignal.stack)
+		_g_.m.newSigstack = true
+	} else {
+		// Use existing signal stack.
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		_g_.m.gsignal.stack.lo = stsp
+		_g_.m.gsignal.stack.hi = stsp + st.ss_size
+		_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+		_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+		_g_.m.gsignal.stackAlloc = st.ss_size
+		_g_.m.newSigstack = false
+	}
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	/*
+		TODO: Convert to Go when something actually uses the result.
+		Rlimit rl;
+		extern byte runtime·text[], runtime·end[];
+		uintptr used;
+
+		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+			return 0;
+		if(rl.rlim_cur >= 0x7fffffff)
+			return 0;
+
+		// Estimate our VM footprint excluding the heap.
+		// Not an exact science: use size of binary plus
+		// some room for thread stacks.
+		used = runtime·end - runtime·text + (64<<20);
+		if(used >= rl.rlim_cur)
+			return 0;
+
+		// If there's not at least 16 MB left, we're probably
+		// not going to be able to do much. Treat as no limit.
+		rl.rlim_cur -= used;
+		if(rl.rlim_cur < (16<<20))
+			return 0;
+
+		return rl.rlim_cur - used;
+	*/
+
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_handler uintptr
+	sa_flags   int32
+	sa_mask    sigset
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = sigset_all
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+	}
+	sa.sa_handler = fn
+	sigaction(i, &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	if sa.sa_handler == funcPC(sigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_handler
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st stackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = s.lo
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m [(_NSIG + 31) / 32]uint32) {
+	var mask sigset
+	copy(mask.__bits[:], m[:])
+	sigprocmask(_SIG_SETMASK, &mask, nil)
+}
+
+func unblocksig(sig int32) {
+	var mask sigset
+	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os_linux.go b/src/runtime/os_linux.go
index bd492f5..542f214 100644
--- a/src/runtime/os_linux.go
+++ b/src/runtime/os_linux.go
@@ -1,17 +1,368 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type mOS struct{}
 
 //go:noescape
 func futex(addr unsafe.Pointer, op int32, val uint32, ts, addr2 unsafe.Pointer, val3 uint32) int32
 
+// Linux futex.
+//
+//	futexsleep(uint32 *addr, uint32 val)
+//	futexwakeup(uint32 *addr)
+//
+// Futexsleep atomically checks if *addr == val and if so, sleeps on addr.
+// Futexwakeup wakes up threads sleeping on addr.
+// Futexsleep is allowed to wake up spuriously.
+
+const (
+	_FUTEX_WAIT = 0
+	_FUTEX_WAKE = 1
+)
+
+// Atomically,
+//	if(*addr == val) sleep
+// Might be woken up spuriously; that's allowed.
+// Don't sleep longer than ns; ns < 0 means forever.
+//go:nosplit
+func futexsleep(addr *uint32, val uint32, ns int64) {
+	var ts timespec
+
+	// Some Linux kernels have a bug where futex of
+	// FUTEX_WAIT returns an internal error code
+	// as an errno. Libpthread ignores the return value
+	// here, and so can we: as it says a few lines up,
+	// spurious wakeups are allowed.
+	if ns < 0 {
+		futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, nil, nil, 0)
+		return
+	}
+
+	// It's difficult to live within the no-split stack limits here.
+	// On ARM and 386, a 64-bit divide invokes a general software routine
+	// that needs more stack than we can afford. So we use timediv instead.
+	// But on real 64-bit systems, where words are larger but the stack limit
+	// is not, even timediv is too heavy, and we really need to use just an
+	// ordinary machine instruction.
+	if sys.PtrSize == 8 {
+		ts.set_sec(ns / 1000000000)
+		ts.set_nsec(int32(ns % 1000000000))
+	} else {
+		ts.tv_nsec = 0
+		ts.set_sec(int64(timediv(ns, 1000000000, (*int32)(unsafe.Pointer(&ts.tv_nsec)))))
+	}
+	futex(unsafe.Pointer(addr), _FUTEX_WAIT, val, unsafe.Pointer(&ts), nil, 0)
+}
+
+// If any procs are sleeping on addr, wake up at most cnt.
+//go:nosplit
+func futexwakeup(addr *uint32, cnt uint32) {
+	ret := futex(unsafe.Pointer(addr), _FUTEX_WAKE, cnt, nil, nil, 0)
+	if ret >= 0 {
+		return
+	}
+
+	// I don't know that futex wakeup can return
+	// EAGAIN or EINTR, but if it does, it would be
+	// safe to loop and call futex again.
+	systemstack(func() {
+		print("futexwakeup addr=", addr, " returned ", ret, "\n")
+	})
+
+	*(*int32)(unsafe.Pointer(uintptr(0x1006))) = 0x1006
+}
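
The FUTEX_WAIT/FUTEX_WAKE pairing above can be demonstrated from ordinary user code with raw syscalls. A Linux-only sketch (error handling elided); the kernel rechecks *addr == val atomically before sleeping, and the loop tolerates the spurious wakeups the comments above allow:

// +build linux

package main

import (
	"fmt"
	"sync/atomic"
	"syscall"
	"time"
	"unsafe"
)

const (
	futexWait = 0 // _FUTEX_WAIT above
	futexWake = 1 // _FUTEX_WAKE above
)

func futex(addr *uint32, op, val int) {
	// ts, addr2, val3 are unused for an untimed WAIT and for WAKE.
	syscall.Syscall6(syscall.SYS_FUTEX,
		uintptr(unsafe.Pointer(addr)), uintptr(op), uintptr(val), 0, 0, 0)
}

func main() {
	var word uint32
	go func() {
		time.Sleep(10 * time.Millisecond)
		atomic.StoreUint32(&word, 1)
		futex(&word, futexWake, 1) // wake at most one sleeper
	}()
	for atomic.LoadUint32(&word) == 0 {
		futex(&word, futexWait, 0) // sleep only while *addr still == 0
	}
	fmt.Println("woken")
}
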
+
+func getproccount() int32 {
+	// This buffer is huge (8 kB) but we are on the system stack
+	// and there should be plenty of space (64 kB).
+	// Also this is a leaf, so we're not holding up the memory for long.
+	// See golang.org/issue/11823.
+	// The suggested behavior here is to keep trying with ever-larger
+	// buffers, but we don't have a dynamic memory allocator at the
+	// moment, so that's a bit tricky and seems like overkill.
+	const maxCPUs = 64 * 1024
+	var buf [maxCPUs / (sys.PtrSize * 8)]uintptr
+	r := sched_getaffinity(0, unsafe.Sizeof(buf), &buf[0])
+	n := int32(0)
+	for _, v := range buf[:r/sys.PtrSize] {
+		for v != 0 {
+			n += int32(v & 1)
+			v >>= 1
+		}
+	}
+	if n == 0 {
+		n = 1
+	}
+	return n
+}
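
The inner loop of getproccount is a population count over the affinity mask. The runtime cannot import the standard library, but outside it the same count is one call per word; a sketch assuming math/bits (Go 1.9+):

package main

import (
	"fmt"
	"math/bits"
)

// countCPUs mirrors getproccount's loop: every set bit in the
// sched_getaffinity mask is one schedulable CPU.
func countCPUs(mask []uintptr) int {
	n := 0
	for _, v := range mask {
		n += bits.OnesCount(uint(v))
	}
	return n
}

func main() {
	fmt.Println(countCPUs([]uintptr{0xb, 0x1})) // 3 + 1 set bits -> 4
}
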
+
+// Clone, the Linux rfork.
+const (
+	_CLONE_VM             = 0x100
+	_CLONE_FS             = 0x200
+	_CLONE_FILES          = 0x400
+	_CLONE_SIGHAND        = 0x800
+	_CLONE_PTRACE         = 0x2000
+	_CLONE_VFORK          = 0x4000
+	_CLONE_PARENT         = 0x8000
+	_CLONE_THREAD         = 0x10000
+	_CLONE_NEWNS          = 0x20000
+	_CLONE_SYSVSEM        = 0x40000
+	_CLONE_SETTLS         = 0x80000
+	_CLONE_PARENT_SETTID  = 0x100000
+	_CLONE_CHILD_CLEARTID = 0x200000
+	_CLONE_UNTRACED       = 0x800000
+	_CLONE_CHILD_SETTID   = 0x1000000
+	_CLONE_STOPPED        = 0x2000000
+	_CLONE_NEWUTS         = 0x4000000
+	_CLONE_NEWIPC         = 0x8000000
+
+	cloneFlags = _CLONE_VM | /* share memory */
+		_CLONE_FS | /* share cwd, etc */
+		_CLONE_FILES | /* share fd table */
+		_CLONE_SIGHAND | /* share sig handler table */
+		_CLONE_THREAD /* revisit - okay for now */
+)
+
 //go:noescape
 func clone(flags int32, stk, mm, gg, fn unsafe.Pointer) int32
 
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	/*
+	 * note: strace gets confused if we use CLONE_PTRACE here.
+	 */
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " clone=", funcPC(clone), " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	// Disable signals during clone, so that the new thread starts
+	// with signals disabled. It will enable them in minit.
+	var oset sigset
+	rtsigprocmask(_SIG_SETMASK, &sigset_all, &oset, int32(unsafe.Sizeof(oset)))
+	ret := clone(cloneFlags, stk, unsafe.Pointer(mp), unsafe.Pointer(mp.g0), unsafe.Pointer(funcPC(mstart)))
+	rtsigprocmask(_SIG_SETMASK, &oset, nil, int32(unsafe.Sizeof(oset)))
+
+	if ret < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", -ret, ")\n")
+		if ret == -_EAGAIN {
+			println("runtime: may need to increase max user processes (ulimit -u)")
+		}
+		throw("newosproc")
+	}
+}
+
+// Version of newosproc that doesn't require a valid G.
+//go:nosplit
+func newosproc0(stacksize uintptr, fn unsafe.Pointer) {
+	stack := sysAlloc(stacksize, &memstats.stacks_sys)
+	if stack == nil {
+		write(2, unsafe.Pointer(&failallocatestack[0]), int32(len(failallocatestack)))
+		exit(1)
+	}
+	ret := clone(cloneFlags, unsafe.Pointer(uintptr(stack)+stacksize), nil, nil, fn)
+	if ret < 0 {
+		write(2, unsafe.Pointer(&failthreadcreate[0]), int32(len(failthreadcreate)))
+		exit(1)
+	}
+}
+
+var failallocatestack = []byte("runtime: failed to allocate stack for the new OS thread\n")
+var failthreadcreate = []byte("runtime: failed to create new OS thread\n")
+
+const (
+	_AT_NULL   = 0  // End of vector
+	_AT_PAGESZ = 6  // System physical page size
+	_AT_RANDOM = 25 // introduced in 2.6.29
+)
+
+func sysargs(argc int32, argv **byte) {
+	n := argc + 1
+
+	// skip over argv, envp to get to auxv
+	for argv_index(argv, n) != nil {
+		n++
+	}
+
+	// skip NULL separator
+	n++
+
+	// now argv+n is auxv
+	auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
+	for i := 0; auxv[i] != _AT_NULL; i += 2 {
+		tag, val := auxv[i], auxv[i+1]
+		switch tag {
+		case _AT_RANDOM:
+			// The kernel provides a pointer to 16 bytes'
+			// worth of random data.
+			startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:]
+
+		case _AT_PAGESZ:
+			// Check that the true physical page size is
+			// compatible with the runtime's assumed
+			// physical page size.
+			if sys.PhysPageSize < val {
+				print("runtime: kernel page size (", val, ") is larger than runtime page size (", sys.PhysPageSize, ")\n")
+				exit(1)
+			}
+			if sys.PhysPageSize%val != 0 {
+				print("runtime: runtime page size (", sys.PhysPageSize, ") is not a multiple of kernel page size (", val, ")\n")
+				exit(1)
+			}
+		}
+
+		archauxv(tag, val)
+	}
+}
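
The same tag/value pairs that sysargs walks past envp are exposed to ordinary processes as /proc/self/auxv. A sketch of reading them there, assuming a 64-bit little-endian Linux machine (os.ReadFile is Go 1.16+):

package main

import (
	"encoding/binary"
	"fmt"
	"os"
)

const _AT_PAGESZ = 6 // same tag checked above

func main() {
	data, err := os.ReadFile("/proc/self/auxv")
	if err != nil {
		fmt.Println("read:", err)
		return
	}
	// The file is a sequence of (tag, value) machine-word pairs,
	// terminated by an AT_NULL tag, exactly like the vector above.
	for i := 0; i+16 <= len(data); i += 16 {
		tag := binary.LittleEndian.Uint64(data[i:])
		val := binary.LittleEndian.Uint64(data[i+8:])
		if tag == 0 { // _AT_NULL: end of vector
			break
		}
		if tag == _AT_PAGESZ {
			fmt.Println("kernel page size:", val)
		}
	}
}
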
+
+func osinit() {
+	ncpu = getproccount()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+func getRandomData(r []byte) {
+	if startupRandomData != nil {
+		n := copy(r, startupRandomData)
+		extendRandom(r, n)
+		return
+	}
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to do synchronous initialization of Go code built with
+// -buildmode=c-archive or -buildmode=c-shared.
+// None of the Go runtime is initialized.
+//go:nosplit
+//go:nowritebarrierrec
+func libpreinit() {
+	initsig(true)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024) // Linux wants >= 2K
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	smask := &mp.sigmask
+	rtsigprocmask(_SIG_SETMASK, nil, smask, int32(unsafe.Sizeof(*smask)))
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	rtsigprocmask(_SIG_SETMASK, &sigmask, nil, int32(unsafe.Sizeof(sigmask)))
+}
+
+//go:nosplit
+func sigblock() {
+	rtsigprocmask(_SIG_SETMASK, &sigset_all, nil, int32(unsafe.Sizeof(sigset_all)))
+}
+
+func gettid() uint32
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	// Initialize signal handling.
+	_g_ := getg()
+
+	var st sigaltstackt
+	sigaltstack(nil, &st)
+	if st.ss_flags&_SS_DISABLE != 0 {
+		signalstack(&_g_.m.gsignal.stack)
+		_g_.m.newSigstack = true
+	} else {
+		// Use existing signal stack.
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		_g_.m.gsignal.stack.lo = stsp
+		_g_.m.gsignal.stack.hi = stsp + st.ss_size
+		_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+		_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+		_g_.m.gsignal.stackAlloc = st.ss_size
+		_g_.m.newSigstack = false
+	}
+
+	// for debuggers, in case cgo created the thread
+	_g_.m.procid = uint64(gettid())
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			sigdelset(&nmask, i)
+		}
+	}
+	rtsigprocmask(_SIG_SETMASK, &nmask, nil, int32(unsafe.Sizeof(nmask)))
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	/*
+		TODO: Convert to Go when something actually uses the result.
+
+		Rlimit rl;
+		extern byte runtime·text[], runtime·end[];
+		uintptr used;
+
+		if(runtime·getrlimit(RLIMIT_AS, &rl) != 0)
+			return 0;
+		if(rl.rlim_cur >= 0x7fffffff)
+			return 0;
+
+		// Estimate our VM footprint excluding the heap.
+		// Not an exact science: use size of binary plus
+		// some room for thread stacks.
+		used = runtime·end - runtime·text + (64<<20);
+		if(used >= rl.rlim_cur)
+			return 0;
+
+		// If there's not at least 16 MB left, we're probably
+		// not going to be able to do much. Treat as no limit.
+		rl.rlim_cur -= used;
+		if(rl.rlim_cur < (16<<20))
+			return 0;
+
+		return rl.rlim_cur - used;
+	*/
+
+	return 0
+}
+
+//#ifdef GOARCH_386
+//#define sa_handler k_sa_handler
+//#endif
+
+func sigreturn()
+func sigtramp()
+func cgoSigtramp()
+
 //go:noescape
 func rt_sigaction(sig uintptr, new, old *sigactiont, size uintptr) int32
 
@@ -19,9 +370,6 @@
 func sigaltstack(new, old *sigaltstackt)
 
 //go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
-//go:noescape
 func setitimer(mode int32, new, old *itimerval)
 
 //go:noescape
@@ -35,3 +383,88 @@
 //go:noescape
 func sched_getaffinity(pid, len uintptr, buf *uintptr) int32
 func osyield()
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK | _SA_RESTORER
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sigfillset(&sa.sa_mask)
+	// Although the Linux manpage says the sa_restorer element is
+	// obsolete and should not be used, the x86_64 kernel requires it.
+	// Only use it on x86.
+	if GOARCH == "386" || GOARCH == "amd64" {
+		sa.sa_restorer = funcPC(sigreturn)
+	}
+	if fn == funcPC(sighandler) {
+		if iscgo {
+			fn = funcPC(cgoSigtramp)
+		} else {
+			fn = funcPC(sigtramp)
+		}
+	}
+	sa.sa_handler = fn
+	rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask))
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	var sa sigactiont
+	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
+		throw("rt_sigaction failure")
+	}
+	if sa.sa_handler == 0 || sa.sa_handler == _SIG_DFL || sa.sa_handler == _SIG_IGN || sa.sa_flags&_SA_ONSTACK != 0 {
+		return
+	}
+	sa.sa_flags |= _SA_ONSTACK
+	if rt_sigaction(uintptr(i), &sa, nil, unsafe.Sizeof(sa.sa_mask)) != 0 {
+		throw("rt_sigaction failure")
+	}
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+
+	memclr(unsafe.Pointer(&sa), unsafe.Sizeof(sa))
+	if rt_sigaction(uintptr(i), nil, &sa, unsafe.Sizeof(sa.sa_mask)) != 0 {
+		throw("rt_sigaction read failure")
+	}
+	if sa.sa_handler == funcPC(sigtramp) || sa.sa_handler == funcPC(cgoSigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_handler
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st sigaltstackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = (*byte)(unsafe.Pointer(s.lo))
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var mask sigset
+	sigcopyset(&mask, m)
+	rtsigprocmask(_SIG_SETMASK, &mask, nil, int32(unsafe.Sizeof(mask)))
+}
+
+func unblocksig(sig int32) {
+	var mask sigset
+	sigaddset(&mask, int(sig))
+	rtsigprocmask(_SIG_UNBLOCK, &mask, nil, int32(unsafe.Sizeof(mask)))
+}
diff --git a/src/runtime/os_linux_386.go b/src/runtime/os_linux_386.go
deleted file mode 100644
index e2120da..0000000
--- a/src/runtime/os_linux_386.go
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-const (
-	_AT_NULL    = 0
-	_AT_RANDOM  = 25
-	_AT_SYSINFO = 32
-)
-
-var _vdso uint32
-
-func sysargs(argc int32, argv **byte) {
-	// skip over argv, envv to get to auxv
-	n := argc + 1
-	for argv_index(argv, n) != nil {
-		n++
-	}
-	n++
-	auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*ptrSize))
-
-	for i := 0; auxv[i] != _AT_NULL; i += 2 {
-		switch auxv[i] {
-		case _AT_SYSINFO:
-			_vdso = auxv[i+1]
-
-		case _AT_RANDOM:
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(auxv[i+1])))[:]
-		}
-	}
-}
diff --git a/src/runtime/os_linux_arm.go b/src/runtime/os_linux_arm.go
index 3749640..8e2765a 100644
--- a/src/runtime/os_linux_arm.go
+++ b/src/runtime/os_linux_arm.go
@@ -7,10 +7,8 @@
 import "unsafe"
 
 const (
-	_AT_NULL     = 0
 	_AT_PLATFORM = 15 //  introduced in at least 2.6.11
 	_AT_HWCAP    = 16 // introduced in at least 2.6.11
-	_AT_RANDOM   = 25 // introduced in 2.6.29
 
 	_HWCAP_VFP   = 1 << 6  // introduced in at least 2.6.11
 	_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
@@ -33,33 +31,23 @@
 	}
 }
 
-func sysargs(argc int32, argv **byte) {
-	// skip over argv, envv to get to auxv
-	n := argc + 1
-	for argv_index(argv, n) != nil {
-		n++
-	}
-	n++
-	auxv := (*[1 << 28]uint32)(add(unsafe.Pointer(argv), uintptr(n)*ptrSize))
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_RANDOM:
+		// sysargs filled in startupRandomData, but that
+		// pointer may not be word aligned, so we must treat
+		// it as a byte array.
+		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
+			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
 
-	for i := 0; auxv[i] != _AT_NULL; i += 2 {
-		switch auxv[i] {
-		case _AT_RANDOM: // kernel provides a pointer to 16-bytes worth of random data
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(auxv[i+1])))[:]
-			// the pointer provided may not be word aligned, so we must treat it
-			// as a byte array.
-			randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
-				uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
-
-		case _AT_PLATFORM: // v5l, v6l, v7l
-			t := *(*uint8)(unsafe.Pointer(uintptr(auxv[i+1] + 1)))
-			if '5' <= t && t <= '7' {
-				armArch = t - '0'
-			}
-
-		case _AT_HWCAP: // CPU capability bit flags
-			hwcap = auxv[i+1]
+	case _AT_PLATFORM: // v5l, v6l, v7l
+		t := *(*uint8)(unsafe.Pointer(val + 1))
+		if '5' <= t && t <= '7' {
+			armArch = t - '0'
 		}
+
+	case _AT_HWCAP: // CPU capability bit flags
+		hwcap = uint32(val)
 	}
 }
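
The byte-wise assembly of randomNumber above is a manual little-endian load, written that way because the AT_RANDOM pointer may be unaligned. For illustration, with stand-in bytes:

package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	b := []byte{0, 0, 0, 0, 0x78, 0x56, 0x34, 0x12} // stand-in for startupRandomData
	// Same shift-and-or as archauxv above; safe for any alignment.
	manual := uint32(b[4]) | uint32(b[5])<<8 | uint32(b[6])<<16 | uint32(b[7])<<24
	fmt.Printf("%#x %#x\n", manual, binary.LittleEndian.Uint32(b[4:8])) // both 0x12345678
}
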
 
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index 3f994f1..43262ae 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -4,13 +4,19 @@
 
 package runtime
 
-const (
-	_AT_NULL   = 0
-	_AT_RANDOM = 25 // introduced in 2.6.29
-)
-
 var randomNumber uint32
 
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_RANDOM:
+		// sysargs filled in startupRandomData, but that
+		// pointer may not be word aligned, so we must treat
+		// it as a byte array.
+		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
+			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+	}
+}
+
 //go:nosplit
 func cputicks() int64 {
+	// Currently cputicks() is used in the blocking profiler and to seed fastrand1().
diff --git a/src/runtime/os_linux_generic.go b/src/runtime/os_linux_generic.go
new file mode 100644
index 0000000..a16d140
--- /dev/null
+++ b/src/runtime/os_linux_generic.go
@@ -0,0 +1,48 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !mips64
+// +build !mips64le
+// +build !s390x
+// +build linux
+
+package runtime
+
+const (
+	_SS_DISABLE  = 2
+	_NSIG        = 65
+	_SI_USER     = 0
+	_SIG_BLOCK   = 0
+	_SIG_UNBLOCK = 1
+	_SIG_SETMASK = 2
+	_RLIMIT_AS   = 9
+)
+
+// It's hard to tease out exactly how big a Sigset is, but
+// rt_sigprocmask crashes if we get it wrong, so if binaries
+// are running, this is right.
+type sigset [2]uint32
+
+type rlimit struct {
+	rlim_cur uintptr
+	rlim_max uintptr
+}
+
+var sigset_all = sigset{^uint32(0), ^uint32(0)}
+
+func sigaddset(mask *sigset, i int) {
+	(*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31)
+}
+
+func sigdelset(mask *sigset, i int) {
+	(*mask)[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+}
+
+func sigfillset(mask *uint64) {
+	*mask = ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	copy((*mask)[:], m[:])
+}
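
sigaddset and sigdelset map signal i to bit (i-1)%32 of word (i-1)/32, so signal 1 lands in bit 0 of word 0 and signal 33 in bit 0 of word 1. A usage sketch of the same arithmetic:

package main

import "fmt"

type sigset [2]uint32

// Same mapping as sigaddset above: signal i -> word (i-1)/32, bit (i-1)%32.
func sigaddset(mask *sigset, i int) {
	(*mask)[(i-1)/32] |= 1 << ((uint32(i) - 1) & 31)
}

func main() {
	var m sigset
	sigaddset(&m, 1)  // SIGHUP: word 0, bit 0
	sigaddset(&m, 33) // word 1, bit 0
	fmt.Printf("%#x %#x\n", m[0], m[1]) // 0x1 0x1
}
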
diff --git a/src/runtime/os_linux_mips64x.go b/src/runtime/os_linux_mips64x.go
new file mode 100644
index 0000000..8039b2f
--- /dev/null
+++ b/src/runtime/os_linux_mips64x.go
@@ -0,0 +1,64 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+package runtime
+
+var randomNumber uint32
+
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_RANDOM:
+		// sysargs filled in startupRandomData, but that
+		// pointer may not be word aligned, so we must treat
+		// it as a byte array.
+		randomNumber = uint32(startupRandomData[4]) | uint32(startupRandomData[5])<<8 |
+			uint32(startupRandomData[6])<<16 | uint32(startupRandomData[7])<<24
+	}
+}
+
+//go:nosplit
+func cputicks() int64 {
+	// Currently cputicks() is used in the blocking profiler and to seed fastrand1().
+	// nanotime() is a poor approximation of CPU ticks that is good enough for the profiler.
+	// randomNumber provides better seeding of fastrand1.
+	return nanotime() + int64(randomNumber)
+}
+
+const (
+	_SS_DISABLE  = 2
+	_NSIG        = 65
+	_SI_USER     = 0
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_RLIMIT_AS   = 6
+)
+
+type sigset [2]uint64
+
+type rlimit struct {
+	rlim_cur uintptr
+	rlim_max uintptr
+}
+
+var sigset_all = sigset{^uint64(0), ^uint64(0)}
+
+func sigaddset(mask *sigset, i int) {
+	(*mask)[(i-1)/64] |= 1 << ((uint32(i) - 1) & 63)
+}
+
+func sigdelset(mask *sigset, i int) {
+	(*mask)[(i-1)/64] &^= 1 << ((uint32(i) - 1) & 63)
+}
+
+func sigfillset(mask *[2]uint64) {
+	(*mask)[0], (*mask)[1] = ^uint64(0), ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	(*mask)[0] = uint64(m[0]) | uint64(m[1])<<32
+}
diff --git a/src/runtime/arch_amd64p32.go b/src/runtime/os_linux_noauxv.go
similarity index 67%
rename from src/runtime/arch_amd64p32.go
rename to src/runtime/os_linux_noauxv.go
index 5c636ae..22522dd 100644
--- a/src/runtime/arch_amd64p32.go
+++ b/src/runtime/os_linux_noauxv.go
@@ -2,7 +2,9 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+// +build !amd64,!arm,!arm64,!mips64,!mips64le
+
 package runtime
 
-type uintreg uint64
-type intptr int32 // TODO(rsc): remove
+func archauxv(tag, val uintptr) {
+}
diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go
new file mode 100644
index 0000000..e659dff
--- /dev/null
+++ b/src/runtime/os_linux_s390x.go
@@ -0,0 +1,46 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+const (
+	_SS_DISABLE  = 2
+	_NSIG        = 65
+	_SI_USER     = 0
+	_SIG_BLOCK   = 0
+	_SIG_UNBLOCK = 1
+	_SIG_SETMASK = 2
+	_RLIMIT_AS   = 9
+)
+
+type sigset uint64
+
+type rlimit struct {
+	rlim_cur uintptr
+	rlim_max uintptr
+}
+
+var sigset_all = sigset(^uint64(0))
+
+func sigaddset(mask *sigset, i int) {
+	if i > 64 {
+		throw("unexpected signal greater than 64")
+	}
+	*mask |= 1 << (uint(i) - 1)
+}
+
+func sigdelset(mask *sigset, i int) {
+	if i > 64 {
+		throw("unexpected signal greater than 64")
+	}
+	*mask &^= 1 << (uint(i) - 1)
+}
+
+func sigfillset(mask *uint64) {
+	*mask = ^uint64(0)
+}
+
+func sigcopyset(mask *sigset, m sigmask) {
+	*mask = sigset(uint64(m[0]) | uint64(m[1])<<32)
+}
diff --git a/src/runtime/os_nacl.go b/src/runtime/os_nacl.go
index 3b4c136..6cbd16d 100644
--- a/src/runtime/os_nacl.go
+++ b/src/runtime/os_nacl.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,6 +6,12 @@
 
 import "unsafe"
 
+type mOS struct {
+	waitsema      int32 // semaphore for parking on locks
+	waitsemacount int32
+	waitsemalock  int32
+}
+
 func nacl_exception_stack(p uintptr, size int32) int32
 func nacl_exception_handler(fn uintptr, arg unsafe.Pointer) int32
 func nacl_sem_create(flag int32) int32
@@ -39,6 +45,10 @@
 	throw("too many writes on closed pipe")
 }
 
+func dieFromSignal(sig int32) {
+	exit(2)
+}
+
 func sigpanic() {
 	g := getg()
 	if !canpanic(g) {
@@ -50,14 +60,240 @@
 	panicmem()
 }
 
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer) {
-	throw("sigfwd not implemented")
-}
-
 func raiseproc(sig int32) {
 }
 
-// Stubs so tests can link correctly.  These should never be called.
+// Stubs so tests can link correctly. These should never be called.
 func open(name *byte, mode, perm int32) int32
 func closefd(fd int32) int32
 func read(fd int32, p unsafe.Pointer, n int32) int32
+
+type sigset struct{}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+func sigtramp()
+
+//go:nosplit
+func msigsave(mp *m) {
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+}
+
+//go:nosplit
+func sigblock() {
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+
+	// Initialize signal handling
+	ret := nacl_exception_stack(_g_.m.gsignal.stack.lo, 32*1024)
+	if ret < 0 {
+		print("runtime: nacl_exception_stack: error ", -ret, "\n")
+	}
+
+	ret = nacl_exception_handler(funcPC(sigtramp), nil)
+	if ret < 0 {
+		print("runtime: nacl_exception_handler: error ", -ret, "\n")
+	}
+}
+
+// Called from dropm to undo the effect of an minit.
+func unminit() {
+}
+
+func osinit() {
+	ncpu = 1
+	getg().m.procid = 2
+	//nacl_exception_handler(funcPC(sigtramp), nil);
+}
+
+func signame(sig uint32) string {
+	if sig >= uint32(len(sigtable)) {
+		return ""
+	}
+	return sigtable[sig].name
+}
+
+func crash() {
+	*(*int32)(nil) = 0
+}
+
+//go:noescape
+func getRandomData([]byte)
+
+func goenvs() {
+	goenvs_unix()
+}
+
+func initsig(preinit bool) {
+}
+
+//go:nosplit
+func usleep(us uint32) {
+	var ts timespec
+
+	ts.tv_sec = int64(us / 1e6)
+	ts.tv_nsec = int32(us%1e6) * 1e3
+	nacl_nanosleep(&ts, nil)
+}
+
+func mstart_nacl()
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	mp.tls[0] = uintptr(unsafe.Pointer(mp.g0))
+	mp.tls[1] = uintptr(unsafe.Pointer(mp))
+	ret := nacl_thread_create(funcPC(mstart_nacl), stk, unsafe.Pointer(&mp.tls[2]), nil)
+	if ret < 0 {
+		print("nacl_thread_create: error ", -ret, "\n")
+		throw("newosproc")
+	}
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+	systemstack(func() {
+		mu := nacl_mutex_create(0)
+		if mu < 0 {
+			print("nacl_mutex_create: error ", -mu, "\n")
+			throw("semacreate")
+		}
+		c := nacl_cond_create(0)
+		if c < 0 {
+			print("nacl_cond_create: error ", -c, "\n")
+			throw("semacreate")
+		}
+		mp.waitsema = c
+		mp.waitsemalock = mu
+	})
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	var ret int32
+
+	systemstack(func() {
+		_g_ := getg()
+		if nacl_mutex_lock(_g_.m.waitsemalock) < 0 {
+			throw("semasleep")
+		}
+
+		for _g_.m.waitsemacount == 0 {
+			if ns < 0 {
+				if nacl_cond_wait(_g_.m.waitsema, _g_.m.waitsemalock) < 0 {
+					throw("semasleep")
+				}
+			} else {
+				var ts timespec
+				end := ns + nanotime()
+				ts.tv_sec = end / 1e9
+				ts.tv_nsec = int32(end % 1e9)
+				r := nacl_cond_timed_wait_abs(_g_.m.waitsema, _g_.m.waitsemalock, &ts)
+				if r == -_ETIMEDOUT {
+					nacl_mutex_unlock(_g_.m.waitsemalock)
+					ret = -1
+					return
+				}
+				if r < 0 {
+					throw("semasleep")
+				}
+			}
+		}
+
+		_g_.m.waitsemacount = 0
+		nacl_mutex_unlock(_g_.m.waitsemalock)
+		ret = 0
+	})
+	return ret
+}
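
The NaCl semaphore above is the classic mutex-plus-condition-variable pattern with a binary count. The same shape with the standard library, for illustration:

package main

import (
	"fmt"
	"sync"
)

// binSema mirrors the structure above: a mutex guards a binary count,
// and a condition variable parks waiters until release sets it.
type binSema struct {
	mu    sync.Mutex
	cond  *sync.Cond
	count int
}

func newBinSema() *binSema {
	s := &binSema{}
	s.cond = sync.NewCond(&s.mu)
	return s
}

func (s *binSema) acquire() {
	s.mu.Lock()
	for s.count == 0 {
		s.cond.Wait() // releases mu while parked, like nacl_cond_wait
	}
	s.count = 0
	s.mu.Unlock()
}

func (s *binSema) release() {
	s.mu.Lock()
	s.count = 1
	s.cond.Signal()
	s.mu.Unlock()
}

func main() {
	s := newBinSema()
	done := make(chan struct{})
	go func() { s.acquire(); close(done) }()
	s.release()
	<-done
	fmt.Println("acquired")
}
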
+
+//go:nosplit
+func semawakeup(mp *m) {
+	systemstack(func() {
+		if nacl_mutex_lock(mp.waitsemalock) < 0 {
+			throw("semawakeup")
+		}
+		if mp.waitsemacount != 0 {
+			throw("semawakeup")
+		}
+		mp.waitsemacount = 1
+		nacl_cond_signal(mp.waitsema)
+		nacl_mutex_unlock(mp.waitsemalock)
+	})
+}
+
+func memlimit() uintptr {
+	return 0
+}
+
+// This runs on a foreign stack, without an m or a g. No stack split.
+//go:nosplit
+//go:norace
+//go:nowritebarrierrec
+func badsignal(sig uintptr) {
+	cgocallback(unsafe.Pointer(funcPC(badsignalgo)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+}
+
+func badsignalgo(sig uintptr) {
+	if !sigsend(uint32(sig)) {
+		// A foreign thread received the signal sig, and the
+		// Go code does not want to handle it.
+		raisebadsignal(int32(sig))
+	}
+}
+
+// This runs on a foreign stack, without an m or a g. No stack split.
+//go:nosplit
+func badsignal2() {
+	write(2, unsafe.Pointer(&badsignal1[0]), int32(len(badsignal1)))
+	exit(2)
+}
+
+var badsignal1 = []byte("runtime: signal received on thread not created by Go.\n")
+
+func raisebadsignal(sig int32) {
+	badsignal2()
+}
+
+func madvise(addr unsafe.Pointer, n uintptr, flags int32) {}
+func munmap(addr unsafe.Pointer, n uintptr)               {}
+func resetcpuprofiler(hz int32)                           {}
+func sigdisable(uint32)                                   {}
+func sigenable(uint32)                                    {}
+func sigignore(uint32)                                    {}
+func closeonexec(int32)                                   {}
+
+var writelock uint32 // test-and-set spin lock for write
+
+/*
+An attempt at IRT. Doesn't work. See end of sys_nacl_amd64.s.
+
+void (*nacl_irt_query)(void);
+
+int8 nacl_irt_basic_v0_1_str[] = "nacl-irt-basic-0.1";
+void *nacl_irt_basic_v0_1[6]; // exit, gettod, clock, nanosleep, sched_yield, sysconf
+int32 nacl_irt_basic_v0_1_size = sizeof(nacl_irt_basic_v0_1);
+
+int8 nacl_irt_memory_v0_3_str[] = "nacl-irt-memory-0.3";
+void *nacl_irt_memory_v0_3[3]; // mmap, munmap, mprotect
+int32 nacl_irt_memory_v0_3_size = sizeof(nacl_irt_memory_v0_3);
+
+int8 nacl_irt_thread_v0_1_str[] = "nacl-irt-thread-0.1";
+void *nacl_irt_thread_v0_1[3]; // thread_create, thread_exit, thread_nice
+int32 nacl_irt_thread_v0_1_size = sizeof(nacl_irt_thread_v0_1);
+*/
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index af52099..4c44b2b 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -1,10 +1,32 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+const (
+	_SS_DISABLE  = 4
+	_SIG_BLOCK   = 1
+	_SIG_UNBLOCK = 2
+	_SIG_SETMASK = 3
+	_NSIG        = 33
+	_SI_USER     = 0
+
+	// From NetBSD's <sys/ucontext.h>
+	_UC_SIGMASK = 0x01
+	_UC_CPU     = 0x04
+
+	_EAGAIN = 35
+)
+
+type mOS struct {
+	waitsemacount uint32
+}
 
 //go:noescape
 func setitimer(mode int32, new, old *itimerval)
@@ -15,10 +37,6 @@
 //go:noescape
 func sigaltstack(new, old *sigaltstackt)
 
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer) {
-	throw("sigfwd not implemented")
-}
-
 //go:noescape
 func sigprocmask(mode int32, new, old *sigset)
 
@@ -45,3 +63,271 @@
 func lwp_self() int32
 
 func osyield()
+
+const (
+	_ESRCH     = 3
+	_ETIMEDOUT = 60
+
+	// From NetBSD's <sys/time.h>
+	_CLOCK_REALTIME  = 0
+	_CLOCK_VIRTUAL   = 1
+	_CLOCK_PROF      = 2
+	_CLOCK_MONOTONIC = 3
+)
+
+var sigset_all = sigset{[4]uint32{^uint32(0), ^uint32(0), ^uint32(0), ^uint32(0)}}
+
+// From NetBSD's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+func getncpu() int32 {
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 {
+		return int32(out)
+	}
+	return 1
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	_g_ := getg()
+
+	// Compute sleep deadline.
+	var tsp *timespec
+	if ns >= 0 {
+		var ts timespec
+		var nsec int32
+		ns += nanotime()
+		ts.set_sec(timediv(ns, 1000000000, &nsec))
+		ts.set_nsec(nsec)
+		tsp = &ts
+	}
+
+	for {
+		v := atomic.Load(&_g_.m.waitsemacount)
+		if v > 0 {
+			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
+				return 0 // semaphore acquired
+			}
+			continue
+		}
+
+		// Sleep until unparked by semawakeup or timeout.
+		ret := lwp_park(tsp, 0, unsafe.Pointer(&_g_.m.waitsemacount), nil)
+		if ret == _ETIMEDOUT {
+			return -1
+		}
+	}
+}
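
The fast path of semasleep is a lock-free decrement: load the count and CAS it down, retrying on contention and parking only when it reads zero. A user-space sketch of just that loop:

package main

import (
	"fmt"
	"sync/atomic"
)

// tryAcquire is the CAS loop above without the park: returns false
// when the count is zero (the caller would then sleep in lwp_park).
func tryAcquire(count *uint32) bool {
	for {
		v := atomic.LoadUint32(count)
		if v == 0 {
			return false
		}
		if atomic.CompareAndSwapUint32(count, v, v-1) {
			return true
		}
	}
}

func main() {
	n := uint32(1)
	fmt.Println(tryAcquire(&n), tryAcquire(&n)) // true false
}
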
+
+//go:nosplit
+func semawakeup(mp *m) {
+	atomic.Xadd(&mp.waitsemacount, 1)
+	// From NetBSD's _lwp_unpark(2) manual:
+	// "If the target LWP is not currently waiting, it will return
+	// immediately upon the next call to _lwp_park()."
+	ret := lwp_unpark(int32(mp.procid), unsafe.Pointer(&mp.waitsemacount))
+	if ret != 0 && ret != _ESRCH {
+		// semawakeup can be called on signal stack.
+		systemstack(func() {
+			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
+		})
+	}
+}
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	var uc ucontextt
+	getcontext(unsafe.Pointer(&uc))
+
+	uc.uc_flags = _UC_SIGMASK | _UC_CPU
+	uc.uc_link = nil
+	uc.uc_sigmask = sigset_all
+
+	lwp_mcontext_init(&uc.uc_mcontext, stk, mp, mp.g0, funcPC(netbsdMstart))
+
+	ret := lwp_create(unsafe.Pointer(&uc), 0, unsafe.Pointer(&mp.procid))
+	if ret < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
+		if ret == -_EAGAIN {
+			println("runtime: may need to increase max user processes (ulimit -p)")
+		}
+		throw("runtime.newosproc")
+	}
+}
+
+// netbsdMstart is the function that begins execution of a newly
+// created thread. On NetBSD, a new thread inherits the signal stack
+// of the creating thread. That confuses minit, so we remove that
+// signal stack here before calling the regular mstart. It's a bit
+// baroque to remove a signal stack here only to add one in minit, but
+// it's a simple change that keeps NetBSD working like other OSes.
+// At this point all signals are blocked, so there is no race.
+//go:nosplit
+func netbsdMstart() {
+	signalstack(nil)
+	mstart()
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, &sigmask, nil)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, &sigset_all, nil)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+	_g_.m.procid = uint64(lwp_self())
+
+	// Initialize signal handling.
+
+	// On NetBSD a thread created by pthread_create inherits the
+	// signal stack of the creating thread. We always create a
+	// new signal stack here, to avoid having two Go threads using
+	// the same signal stack. This breaks the case of a thread
+	// created in C that calls sigaltstack and then calls a Go
+	// function, because we will lose track of the C code's
+	// sigaltstack, but it's the best we can do.
+	signalstack(&_g_.m.gsignal.stack)
+	_g_.m.newSigstack = true
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask.__bits[(i-1)/32] &^= 1 << ((uint32(i) - 1) & 31)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, &nmask, nil)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_sigaction uintptr
+	sa_mask      sigset
+	sa_flags     int32
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = sigset_all
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+	}
+	sa.sa_sigaction = fn
+	sigaction(i, &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	if sa.sa_sigaction == funcPC(sigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_sigaction
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st sigaltstackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = s.lo
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	var mask sigset
+	copy(mask.__bits[:], m[:])
+	sigprocmask(_SIG_SETMASK, &mask, nil)
+}
+
+func unblocksig(sig int32) {
+	var mask sigset
+	mask.__bits[(sig-1)/32] |= 1 << ((uint32(sig) - 1) & 31)
+	sigprocmask(_SIG_UNBLOCK, &mask, nil)
+}
diff --git a/src/runtime/os1_netbsd_386.go b/src/runtime/os_netbsd_386.go
similarity index 100%
rename from src/runtime/os1_netbsd_386.go
rename to src/runtime/os_netbsd_386.go
diff --git a/src/runtime/os1_netbsd_amd64.go b/src/runtime/os_netbsd_amd64.go
similarity index 100%
rename from src/runtime/os1_netbsd_amd64.go
rename to src/runtime/os_netbsd_amd64.go
diff --git a/src/runtime/os_openbsd.go b/src/runtime/os_openbsd.go
index f94b490..9a5c53e 100644
--- a/src/runtime/os_openbsd.go
+++ b/src/runtime/os_openbsd.go
@@ -1,10 +1,17 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+type mOS struct {
+	waitsemacount uint32
+}
 
 //go:noescape
 func setitimer(mode int32, new, old *itimerval)
@@ -16,10 +23,7 @@
 func sigaltstack(new, old *stackt)
 
 //go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
-//go:noescape
-func sigprocmask(mode int32, new uint32) uint32
+func sigprocmask(mode int32, new sigset) sigset
 
 //go:noescape
 func sysctl(mib *uint32, miblen uint32, out *byte, size *uintptr, dst *byte, ndst uintptr) int32
@@ -37,3 +41,274 @@
 func thrwakeup(ident uintptr, n int32) int32
 
 func osyield()
+
+const (
+	_ESRCH       = 3
+	_EAGAIN      = 35
+	_EWOULDBLOCK = _EAGAIN
+	_ENOTSUP     = 91
+
+	// From OpenBSD's sys/time.h
+	_CLOCK_REALTIME  = 0
+	_CLOCK_VIRTUAL   = 1
+	_CLOCK_PROF      = 2
+	_CLOCK_MONOTONIC = 3
+)
+
+type sigset uint32
+
+const (
+	sigset_none = sigset(0)
+	sigset_all  = ^sigset(0)
+)
+
+// From OpenBSD's <sys/sysctl.h>
+const (
+	_CTL_HW  = 6
+	_HW_NCPU = 3
+)
+
+func getncpu() int32 {
+	mib := [2]uint32{_CTL_HW, _HW_NCPU}
+	out := uint32(0)
+	nout := unsafe.Sizeof(out)
+
+	// Fetch hw.ncpu via sysctl.
+	ret := sysctl(&mib[0], 2, (*byte)(unsafe.Pointer(&out)), &nout, nil, 0)
+	if ret >= 0 {
+		return int32(out)
+	}
+	return 1
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+}
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	_g_ := getg()
+
+	// Compute sleep deadline.
+	var tsp *timespec
+	if ns >= 0 {
+		var ts timespec
+		var nsec int32
+		ns += nanotime()
+		ts.set_sec(int64(timediv(ns, 1000000000, &nsec)))
+		ts.set_nsec(nsec)
+		tsp = &ts
+	}
+
+	for {
+		v := atomic.Load(&_g_.m.waitsemacount)
+		if v > 0 {
+			if atomic.Cas(&_g_.m.waitsemacount, v, v-1) {
+				return 0 // semaphore acquired
+			}
+			continue
+		}
+
+		// Sleep until woken by semawakeup or timeout; or abort if waitsemacount != 0.
+		//
+		// From OpenBSD's __thrsleep(2) manual:
+		// "The abort argument, if not NULL, points to an int that will
+		// be examined [...] immediately before blocking. If that int
+		// is non-zero then __thrsleep() will immediately return EINTR
+		// without blocking."
+		ret := thrsleep(uintptr(unsafe.Pointer(&_g_.m.waitsemacount)), _CLOCK_MONOTONIC, tsp, 0, &_g_.m.waitsemacount)
+		if ret == _EWOULDBLOCK {
+			return -1
+		}
+	}
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	atomic.Xadd(&mp.waitsemacount, 1)
+	ret := thrwakeup(uintptr(unsafe.Pointer(&mp.waitsemacount)), 1)
+	if ret != 0 && ret != _ESRCH {
+		// semawakeup can be called on signal stack.
+		systemstack(func() {
+			print("thrwakeup addr=", &mp.waitsemacount, " sem=", mp.waitsemacount, " ret=", ret, "\n")
+		})
+	}
+}
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc stk=", stk, " m=", mp, " g=", mp.g0, " id=", mp.id, " ostk=", &mp, "\n")
+	}
+
+	param := tforkt{
+		tf_tcb:   unsafe.Pointer(&mp.tls[0]),
+		tf_tid:   (*int32)(unsafe.Pointer(&mp.procid)),
+		tf_stack: uintptr(stk),
+	}
+
+	oset := sigprocmask(_SIG_SETMASK, sigset_all)
+	ret := tfork(&param, unsafe.Sizeof(param), mp, mp.g0, funcPC(mstart))
+	sigprocmask(_SIG_SETMASK, oset)
+
+	if ret < 0 {
+		print("runtime: failed to create new OS thread (have ", mcount()-1, " already; errno=", -ret, ")\n")
+		if ret == -_EAGAIN {
+			println("runtime: may need to increase max user processes (ulimit -p)")
+		}
+		throw("runtime.newosproc")
+	}
+}
+
+func osinit() {
+	ncpu = getncpu()
+}
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+//go:nosplit
+func getRandomData(r []byte) {
+	fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+	n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+	closefd(fd)
+	extendRandom(r, int(n))
+}
+
+func goenvs() {
+	goenvs_unix()
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+	mp.sigmask = sigprocmask(_SIG_BLOCK, 0)
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+	sigprocmask(_SIG_SETMASK, sigmask)
+}
+
+//go:nosplit
+func sigblock() {
+	sigprocmask(_SIG_SETMASK, sigset_all)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	_g_ := getg()
+
+	// m.procid is a uint64, but tfork writes an int32. Fix it up.
+	_g_.m.procid = uint64(*(*int32)(unsafe.Pointer(&_g_.m.procid)))
+
+	// Initialize signal handling
+	var st stackt
+	sigaltstack(nil, &st)
+	if st.ss_flags&_SS_DISABLE != 0 {
+		signalstack(&_g_.m.gsignal.stack)
+		_g_.m.newSigstack = true
+	} else {
+		// Use existing signal stack.
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		_g_.m.gsignal.stack.lo = stsp
+		_g_.m.gsignal.stack.hi = stsp + st.ss_size
+		_g_.m.gsignal.stackguard0 = stsp + _StackGuard
+		_g_.m.gsignal.stackguard1 = stsp + _StackGuard
+		_g_.m.gsignal.stackAlloc = st.ss_size
+		_g_.m.newSigstack = false
+	}
+
+	// restore signal mask from m.sigmask and unblock essential signals
+	nmask := _g_.m.sigmask
+	for i := range sigtable {
+		if sigtable[i].flags&_SigUnblock != 0 {
+			nmask &^= 1 << (uint32(i) - 1)
+		}
+	}
+	sigprocmask(_SIG_SETMASK, nmask)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	if getg().m.newSigstack {
+		signalstack(nil)
+	}
+}
+
+func memlimit() uintptr {
+	return 0
+}
+
+func sigtramp()
+
+type sigactiont struct {
+	sa_sigaction uintptr
+	sa_mask      uint32
+	sa_flags     int32
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsig(i int32, fn uintptr, restart bool) {
+	var sa sigactiont
+	sa.sa_flags = _SA_SIGINFO | _SA_ONSTACK
+	if restart {
+		sa.sa_flags |= _SA_RESTART
+	}
+	sa.sa_mask = uint32(sigset_all)
+	if fn == funcPC(sighandler) {
+		fn = funcPC(sigtramp)
+	}
+	sa.sa_sigaction = fn
+	sigaction(i, &sa, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func setsigstack(i int32) {
+	throw("setsigstack")
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func getsig(i int32) uintptr {
+	var sa sigactiont
+	sigaction(i, nil, &sa)
+	if sa.sa_sigaction == funcPC(sigtramp) {
+		return funcPC(sighandler)
+	}
+	return sa.sa_sigaction
+}
+
+//go:nosplit
+func signalstack(s *stack) {
+	var st stackt
+	if s == nil {
+		st.ss_flags = _SS_DISABLE
+	} else {
+		st.ss_sp = s.lo
+		st.ss_size = s.hi - s.lo
+		st.ss_flags = 0
+	}
+	sigaltstack(&st, nil)
+}
+
+//go:nosplit
+//go:nowritebarrierrec
+func updatesigmask(m sigmask) {
+	sigprocmask(_SIG_SETMASK, sigset(m[0]))
+}
+
+func unblocksig(sig int32) {
+	mask := sigset(1) << (uint32(sig) - 1)
+	sigprocmask(_SIG_UNBLOCK, mask)
+}
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index 6def35c..2f3a0d1 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -1,10 +1,19 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+type mOS struct {
+	waitsemacount uint32
+	notesig       *int8
+	errstr        *byte
+}
 
 func closefd(fd int32) int32
 
@@ -72,7 +81,15 @@
 	note := gostringnocopy((*byte)(unsafe.Pointer(g.m.notesig)))
 	switch g.sig {
 	case _SIGRFAULT, _SIGWFAULT:
-		addr := note[index(note, "addr=")+5:]
+		i := index(note, "addr=")
+		if i >= 0 {
+			i += 5
+		} else if i = index(note, "va="); i >= 0 {
+			i += 3
+		} else {
+			panicmem()
+		}
+		addr := note[i:]
 		g.sigcode1 = uintptr(atolwhex(addr))
 		if g.sigcode1 < 0x1000 || g.paniconfault {
 			panicmem()
@@ -134,3 +151,288 @@
 	}
 	return n
 }
+
+type sigset struct{}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+	// Initialize stack and goroutine for note handling.
+	mp.gsignal = malg(32 * 1024)
+	mp.gsignal.m = mp
+	mp.notesig = (*int8)(mallocgc(_ERRMAX, nil, true))
+	// Initialize stack for handling strings from the
+	// errstr system call, as used in package syscall.
+	mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, true))
+}
+
+func msigsave(mp *m) {
+}
+
+func msigrestore(sigmask sigset) {
+}
+
+func sigblock() {
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	if atomic.Load(&exiting) != 0 {
+		exits(&emptystatus[0])
+	}
+	// Mask all SSE floating-point exceptions
+	// when running on the 64-bit kernel.
+	setfpmasks()
+}
+
+// Called from dropm to undo the effect of an minit.
+func unminit() {
+}
+
+var sysstat = []byte("/dev/sysstat\x00")
+
+func getproccount() int32 {
+	var buf [2048]byte
+	fd := open(&sysstat[0], _OREAD, 0)
+	if fd < 0 {
+		return 1
+	}
+	ncpu := int32(0)
+	for {
+		n := read(fd, unsafe.Pointer(&buf), int32(len(buf)))
+		if n <= 0 {
+			break
+		}
+		for i := int32(0); i < n; i++ {
+			if buf[i] == '\n' {
+				ncpu++
+			}
+		}
+	}
+	closefd(fd)
+	if ncpu == 0 {
+		ncpu = 1
+	}
+	return ncpu
+}
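
getproccount simply counts newlines, since /dev/sysstat emits one line per CPU. The same idea in ordinary code, with hypothetical sample data standing in for the device file:

package main

import (
	"bytes"
	"fmt"
)

func main() {
	// One line per CPU, as in /dev/sysstat; made-up field values.
	sample := []byte("0 0 0 0 0 0 0 0 0 0 0\n1 0 0 0 0 0 0 0 0 0 0\n")
	fmt.Println("ncpu:", bytes.Count(sample, []byte("\n"))) // 2
}
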
+
+var pid = []byte("#c/pid\x00")
+
+func getpid() uint64 {
+	var b [20]byte
+	fd := open(&pid[0], 0, 0)
+	if fd >= 0 {
+		read(fd, unsafe.Pointer(&b), int32(len(b)))
+		closefd(fd)
+	}
+	c := b[:]
+	for c[0] == ' ' || c[0] == '\t' {
+		c = c[1:]
+	}
+	return uint64(_atoi(c))
+}
+
+func osinit() {
+	initBloc()
+	ncpu = getproccount()
+	getg().m.procid = getpid()
+	notify(unsafe.Pointer(funcPC(sigtramp)))
+}
+
+func crash() {
+	notify(nil)
+	*(*int)(nil) = 0
+}
+
+//go:nosplit
+func getRandomData(r []byte) {
+	extendRandom(r, 0)
+}
+
+func goenvs() {
+}
+
+func initsig(preinit bool) {
+}
+
+//go:nosplit
+func osyield() {
+	sleep(0)
+}
+
+//go:nosplit
+func usleep(µs uint32) {
+	ms := int32(µs / 1000)
+	if ms == 0 {
+		ms = 1
+	}
+	sleep(ms)
+}
+
+//go:nosplit
+func nanotime() int64 {
+	var scratch int64
+	ns := nsec(&scratch)
+	// TODO(aram): remove hack after I fix _nsec in the pc64 kernel.
+	if ns == 0 {
+		return scratch
+	}
+	return ns
+}
+
+//go:nosplit
+func itoa(buf []byte, val uint64) []byte {
+	i := len(buf) - 1
+	for val >= 10 {
+		buf[i] = byte(val%10 + '0')
+		i--
+		val /= 10
+	}
+	buf[i] = byte(val + '0')
+	return buf[i:]
+}
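
itoa fills its scratch buffer from the tail and returns only the used suffix, so callers pass a fixed array and keep the returned slice. A usage sketch with the same function body:

package main

import "fmt"

// Same body as the runtime's itoa above: digits are written from the
// end of buf, and the returned slice starts at the first digit.
func itoa(buf []byte, val uint64) []byte {
	i := len(buf) - 1
	for val >= 10 {
		buf[i] = byte(val%10 + '0')
		i--
		val /= 10
	}
	buf[i] = byte(val + '0')
	return buf[i:]
}

func main() {
	var tmp [32]byte
	fmt.Println(string(itoa(tmp[:], 905))) // "905"
}
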
+
+var goexits = []byte("go: exit ")
+var emptystatus = []byte("\x00")
+var exiting uint32
+
+func goexitsall(status *byte) {
+	var buf [_ERRMAX]byte
+	if !atomic.Cas(&exiting, 0, 1) {
+		return
+	}
+	getg().m.locks++
+	n := copy(buf[:], goexits)
+	n = copy(buf[n:], gostringnocopy(status))
+	pid := getpid()
+	for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+		if mp.procid != 0 && mp.procid != pid {
+			postnote(mp.procid, buf[:])
+		}
+	}
+	getg().m.locks--
+}
+
+var procdir = []byte("/proc/")
+var notefile = []byte("/note\x00")
+
+func postnote(pid uint64, msg []byte) int {
+	var buf [128]byte
+	var tmp [32]byte
+	n := copy(buf[:], procdir)
+	n += copy(buf[n:], itoa(tmp[:], pid))
+	copy(buf[n:], notefile)
+	fd := open(&buf[0], _OWRITE, 0)
+	if fd < 0 {
+		return -1
+	}
+	len := findnull(&msg[0])
+	if write(uintptr(fd), unsafe.Pointer(&msg[0]), int32(len)) != int64(len) {
+		closefd(fd)
+		return -1
+	}
+	closefd(fd)
+	return 0
+}
+
+//go:nosplit
+func exit(e int) {
+	var status []byte
+	if e == 0 {
+		status = emptystatus
+	} else {
+		// build error string
+		var tmp [32]byte
+		status = append(itoa(tmp[:len(tmp)-1], uint64(e)), 0)
+	}
+	goexitsall(&status[0])
+	exits(&status[0])
+}
+
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newosproc(mp *m, stk unsafe.Pointer) {
+	if false {
+		print("newosproc mp=", mp, " ostk=", &mp, "\n")
+	}
+	pid := rfork(_RFPROC | _RFMEM | _RFNOWAIT)
+	if pid < 0 {
+		throw("newosproc: rfork failed")
+	}
+	if pid == 0 {
+		tstart_plan9(mp)
+	}
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+}
+
+//go:nosplit
+func semasleep(ns int64) int {
+	_g_ := getg()
+	if ns >= 0 {
+		ms := timediv(ns, 1000000, nil)
+		if ms == 0 {
+			ms = 1
+		}
+		ret := plan9_tsemacquire(&_g_.m.waitsemacount, ms)
+		if ret == 1 {
+			return 0 // success
+		}
+		return -1 // timeout or interrupted
+	}
+	for plan9_semacquire(&_g_.m.waitsemacount, 1) < 0 {
+		// interrupted; try again (c.f. lock_sema.go)
+	}
+	return 0 // success
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	plan9_semrelease(&mp.waitsemacount, 1)
+}
+
+//go:nosplit
+func read(fd int32, buf unsafe.Pointer, n int32) int32 {
+	return pread(fd, buf, n, -1)
+}
+
+//go:nosplit
+func write(fd uintptr, buf unsafe.Pointer, n int32) int64 {
+	return int64(pwrite(int32(fd), buf, n, -1))
+}
+
+func memlimit() uint64 {
+	return 0
+}
+
+var _badsignal = []byte("runtime: signal received on thread not created by Go.\n")
+
+// This runs on a foreign stack, without an m or a g. No stack split.
+//go:nosplit
+func badsignal2() {
+	pwrite(2, unsafe.Pointer(&_badsignal[0]), int32(len(_badsignal)), -1)
+	exits(&_badsignal[0])
+}
+
+func raisebadsignal(sig int32) {
+	badsignal2()
+}
+
+func _atoi(b []byte) int {
+	n := 0
+	for len(b) > 0 && '0' <= b[0] && b[0] <= '9' {
+		n = n*10 + int(b[0]) - '0'
+		b = b[1:]
+	}
+	return n
+}
+
+func signame(sig uint32) string {
+	if sig >= uint32(len(sigtable)) {
+		return ""
+	}
+	return sigtable[sig].name
+}
diff --git a/src/runtime/os_plan9_arm.go b/src/runtime/os_plan9_arm.go
new file mode 100644
index 0000000..30cde8f
--- /dev/null
+++ b/src/runtime/os_plan9_arm.go
@@ -0,0 +1,17 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+func checkgoarm() {
+	return // TODO(minux)
+}
+
+//go:nosplit
+func cputicks() int64 {
+	// Currently cputicks() is used in the blocking profiler and to seed runtime·fastrand1().
+	// runtime·nanotime() is a poor approximation of CPU ticks that is good enough for the profiler.
+	// TODO: need more entropy to better seed fastrand1.
+	return nanotime()
+}
diff --git a/src/runtime/os_solaris.go b/src/runtime/os_solaris.go
index fd20a5c..0c39715 100644
--- a/src/runtime/os_solaris.go
+++ b/src/runtime/os_solaris.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,80 +6,96 @@
 
 import "unsafe"
 
+type mts struct {
+	tv_sec  int64
+	tv_nsec int64
+}
+
+type mscratch struct {
+	v [6]uintptr
+}
+
+type mOS struct {
+	waitsema uintptr // semaphore for parking on locks
+	perrno   *int32  // pointer to tls errno
+	// these are here because they are too large to be on the stack
+	// of low-level NOSPLIT functions.
+	//LibCall       libcall;
+	ts      mts
+	scratch mscratch
+}
+
 type libcFunc uintptr
 
 var asmsysvicall6 libcFunc
 
-//go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
 //go:nosplit
 func sysvicall0(fn *libcFunc) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 0
 	libcall.args = uintptr(unsafe.Pointer(fn)) // it's unused but must be non-nil, otherwise crashes
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall1(fn *libcFunc, a1 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 1
 	// TODO(rsc): Why is noescape necessary here and below?
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall2(fn *libcFunc, a1, a2 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 2
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall3(fn *libcFunc, a1, a2, a3 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 3
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall4(fn *libcFunc, a1, a2, a3, a4 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 4
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall5(fn *libcFunc, a1, a2, a3, a4, a5 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 5
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
 
 //go:nosplit
 func sysvicall6(fn *libcFunc, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
-	libcall := &getg().m.libcall
+	var libcall libcall
 	libcall.fn = uintptr(unsafe.Pointer(fn))
 	libcall.n = 6
 	libcall.args = uintptr(noescape(unsafe.Pointer(&a1)))
-	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(libcall))
+	asmcgocall(unsafe.Pointer(&asmsysvicall6), unsafe.Pointer(&libcall))
 	return libcall.r1
 }
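
The substantive change to these wrappers is that each call now builds its libcall on the wrapper's own stack frame instead of reusing the shared getg().m.libcall scratch area. The likely motivation is reentrancy: with shared scratch, a nested call (say, from a signal handler) would clobber an outer call still in flight. A toy illustration of that hazard, with hypothetical names rather than runtime code:

package main

import "fmt"

// Why a stack-local libcall matters: with a shared scratch area, a
// reentrant call overwrites the outer call's arguments before they
// are consumed; a stack local is immune.
type call struct{ fn uintptr }

var shared call // analogous to getg().m.libcall

func invokeShared(fn uintptr, nested bool) uintptr {
	shared.fn = fn
	if nested {
		invokeShared(fn+100, false) // reentrant call clobbers shared.fn
	}
	return shared.fn // outer call now sees the nested call's fn
}

func invokeLocal(fn uintptr, nested bool) uintptr {
	var local call // analogous to `var libcall libcall`
	local.fn = fn
	if nested {
		invokeLocal(fn+100, false)
	}
	return local.fn // unaffected by the nested call
}

func main() {
	fmt.Println("shared:", invokeShared(1, true)) // 101: clobbered
	fmt.Println("local: ", invokeLocal(1, true))  // 1: intact
}
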
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index 545b416..9147091 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -1,19 +1,133 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
 
-type stdFunction *byte
+// TODO(brainman): should not need those
+const (
+	_NSIG = 65
+)
+
+//go:cgo_import_dynamic runtime._AddVectoredExceptionHandler AddVectoredExceptionHandler%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CloseHandle CloseHandle%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateEventA CreateEventA%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CryptAcquireContextW CryptAcquireContextW%5 "advapi32.dll"
+//go:cgo_import_dynamic runtime._CryptGenRandom CryptGenRandom%3 "advapi32.dll"
+//go:cgo_import_dynamic runtime._CryptReleaseContext CryptReleaseContext%2 "advapi32.dll"
+//go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll"
+//go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetConsoleMode GetConsoleMode%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetEnvironmentStringsW GetEnvironmentStringsW%0 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetProcAddress GetProcAddress%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetProcessAffinityMask GetProcessAffinityMask%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetQueuedCompletionStatus GetQueuedCompletionStatus%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetStdHandle GetStdHandle%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetSystemInfo GetSystemInfo%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._GetThreadContext GetThreadContext%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._LoadLibraryW LoadLibraryW%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._LoadLibraryA LoadLibraryA%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._NtWaitForSingleObject NtWaitForSingleObject%3 "ntdll.dll"
+//go:cgo_import_dynamic runtime._ResumeThread ResumeThread%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetConsoleCtrlHandler SetConsoleCtrlHandler%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetErrorMode SetErrorMode%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetEvent SetEvent%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetProcessPriorityBoost SetProcessPriorityBoost%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetThreadPriority SetThreadPriority%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetUnhandledExceptionFilter SetUnhandledExceptionFilter%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SetWaitableTimer SetWaitableTimer%6 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SuspendThread SuspendThread%1 "kernel32.dll"
+//go:cgo_import_dynamic runtime._SwitchToThread SwitchToThread%0 "kernel32.dll"
+//go:cgo_import_dynamic runtime._VirtualAlloc VirtualAlloc%4 "kernel32.dll"
+//go:cgo_import_dynamic runtime._VirtualFree VirtualFree%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WSAGetOverlappedResult WSAGetOverlappedResult%5 "ws2_32.dll"
+//go:cgo_import_dynamic runtime._WaitForSingleObject WaitForSingleObject%2 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WriteConsoleW WriteConsoleW%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._WriteFile WriteFile%5 "kernel32.dll"
+//go:cgo_import_dynamic runtime._timeBeginPeriod timeBeginPeriod%1 "winmm.dll"
+
+type stdFunction unsafe.Pointer
+
+var (
+	// The following syscalls are available on every Windows PC.
+	// All these variables are set by the Windows executable
+	// loader before the Go program starts.
+	_AddVectoredExceptionHandler,
+	_CloseHandle,
+	_CreateEventA,
+	_CreateIoCompletionPort,
+	_CreateThread,
+	_CreateWaitableTimerA,
+	_CryptAcquireContextW,
+	_CryptGenRandom,
+	_CryptReleaseContext,
+	_DuplicateHandle,
+	_ExitProcess,
+	_FreeEnvironmentStringsW,
+	_GetConsoleMode,
+	_GetEnvironmentStringsW,
+	_GetProcAddress,
+	_GetProcessAffinityMask,
+	_GetQueuedCompletionStatus,
+	_GetStdHandle,
+	_GetSystemInfo,
+	_GetThreadContext,
+	_LoadLibraryW,
+	_LoadLibraryA,
+	_NtWaitForSingleObject,
+	_ResumeThread,
+	_SetConsoleCtrlHandler,
+	_SetErrorMode,
+	_SetEvent,
+	_SetProcessPriorityBoost,
+	_SetThreadPriority,
+	_SetUnhandledExceptionFilter,
+	_SetWaitableTimer,
+	_SuspendThread,
+	_SwitchToThread,
+	_VirtualAlloc,
+	_VirtualFree,
+	_WSAGetOverlappedResult,
+	_WaitForSingleObject,
+	_WriteConsoleW,
+	_WriteFile,
+	_timeBeginPeriod,
+	_ stdFunction
+
+	// The following syscalls are only available on some Windows PCs.
+	// We load these syscalls, if available, before using them.
+	_AddDllDirectory,
+	_AddVectoredContinueHandler,
+	_GetQueuedCompletionStatusEx,
+	_LoadLibraryExW,
+	_ stdFunction
+)
+
+// Function to be called by Windows CreateThread
+// to start a new OS thread.
+func tstart_stdcall(newm *m) uint32
+
+func ctrlhandler(_type uint32) uint32
+
+type mOS struct {
+	waitsema uintptr // semaphore for parking on locks
+}
 
 //go:linkname os_sigpipe os.sigpipe
 func os_sigpipe() {
 	throw("too many writes on closed pipe")
 }
 
-// Stubs so tests can link correctly.  These should never be called.
+// Stubs so tests can link correctly. These should never be called.
 func open(name *byte, mode, perm int32) int32 {
 	throw("unimplemented")
 	return -1
@@ -26,3 +140,605 @@
 	throw("unimplemented")
 	return -1
 }
+
+type sigset struct{}
+
+// Call a Windows function with stdcall conventions,
+// and switch to os stack during the call.
+func asmstdcall(fn unsafe.Pointer)
+
+var asmstdcallAddr unsafe.Pointer
+
+func windowsFindfunc(lib uintptr, name []byte) stdFunction {
+	if name[len(name)-1] != 0 {
+		throw("usage")
+	}
+	f := stdcall2(_GetProcAddress, lib, uintptr(unsafe.Pointer(&name[0])))
+	return stdFunction(unsafe.Pointer(f))
+}
+
+func loadOptionalSyscalls() {
+	var kernel32dll = []byte("kernel32.dll\000")
+	k32 := stdcall1(_LoadLibraryA, uintptr(unsafe.Pointer(&kernel32dll[0])))
+	if k32 == 0 {
+		throw("kernel32.dll not found")
+	}
+	_AddDllDirectory = windowsFindfunc(k32, []byte("AddDllDirectory\000"))
+	_AddVectoredContinueHandler = windowsFindfunc(k32, []byte("AddVectoredContinueHandler\000"))
+	_GetQueuedCompletionStatusEx = windowsFindfunc(k32, []byte("GetQueuedCompletionStatusEx\000"))
+	_LoadLibraryExW = windowsFindfunc(k32, []byte("LoadLibraryExW\000"))
+}
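
loadOptionalSyscalls probes kernel32 once at startup and leaves the pointer nil when an entry point is missing; osinit below folds that into the useLoadLibraryEx flag. User code can follow the same probe-before-use pattern with syscall.NewLazyDLL, as in this Windows-only sketch:

package main

import (
	"fmt"
	"syscall"
)

// Probe for an optional Windows API before relying on it, in the
// spirit of loadOptionalSyscalls above. Builds only on windows;
// NewLazyDLL/NewProc defer all loading until Find or Call.
func main() {
	k32 := syscall.NewLazyDLL("kernel32.dll")
	proc := k32.NewProc("AddDllDirectory")
	if err := proc.Find(); err != nil {
		fmt.Println("AddDllDirectory unavailable:", err)
		return
	}
	fmt.Printf("AddDllDirectory at %#x\n", proc.Addr())
}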
+
+//go:nosplit
+func getLoadLibrary() uintptr {
+	return uintptr(unsafe.Pointer(_LoadLibraryW))
+}
+
+//go:nosplit
+func getLoadLibraryEx() uintptr {
+	return uintptr(unsafe.Pointer(_LoadLibraryExW))
+}
+
+//go:nosplit
+func getGetProcAddress() uintptr {
+	return uintptr(unsafe.Pointer(_GetProcAddress))
+}
+
+func getproccount() int32 {
+	var mask, sysmask uintptr
+	ret := stdcall3(_GetProcessAffinityMask, currentProcess, uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
+	if ret != 0 {
+		n := 0
+		maskbits := int(unsafe.Sizeof(mask) * 8)
+		for i := 0; i < maskbits; i++ {
+			if mask&(1<<uint(i)) != 0 {
+				n++
+			}
+		}
+		if n != 0 {
+			return int32(n)
+		}
+	}
+	// use GetSystemInfo if GetProcessAffinityMask fails
+	var info systeminfo
+	stdcall1(_GetSystemInfo, uintptr(unsafe.Pointer(&info)))
+	return int32(info.dwnumberofprocessors)
+}
+
+const (
+	currentProcess = ^uintptr(0) // -1 = current process
+	currentThread  = ^uintptr(1) // -2 = current thread
+)
+
+// in sys_windows_386.s and sys_windows_amd64.s:
+func externalthreadhandler()
+func getlasterror() uint32
+func setlasterror(err uint32)
+
+// When loading DLLs, we prefer to use LoadLibraryEx with
+// LOAD_LIBRARY_SEARCH_* flags, if available. LoadLibraryEx is not
+// available on old Windows, though, and the LOAD_LIBRARY_SEARCH_*
+// flags are not available on some versions of Windows without a
+// security patch.
+//
+// https://msdn.microsoft.com/en-us/library/ms684179(v=vs.85).aspx says:
+// "Windows 7, Windows Server 2008 R2, Windows Vista, and Windows
+// Server 2008: The LOAD_LIBRARY_SEARCH_* flags are available on
+// systems that have KB2533623 installed. To determine whether the
+// flags are available, use GetProcAddress to get the address of the
+// AddDllDirectory, RemoveDllDirectory, or SetDefaultDllDirectories
+// function. If GetProcAddress succeeds, the LOAD_LIBRARY_SEARCH_*
+// flags can be used with LoadLibraryEx."
+var useLoadLibraryEx bool
+
+var timeBeginPeriodRetValue uint32
+
+func osinit() {
+	asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
+	usleep2Addr = unsafe.Pointer(funcPC(usleep2))
+	switchtothreadAddr = unsafe.Pointer(funcPC(switchtothread))
+
+	setBadSignalMsg()
+
+	loadOptionalSyscalls()
+
+	useLoadLibraryEx = (_LoadLibraryExW != nil && _AddDllDirectory != nil)
+
+	disableWER()
+
+	externalthreadhandlerp = funcPC(externalthreadhandler)
+
+	initExceptionHandler()
+
+	stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
+
+	timeBeginPeriodRetValue = uint32(stdcall1(_timeBeginPeriod, 1))
+
+	ncpu = getproccount()
+
+	// Windows dynamic priority boosting assumes that a process has different types
+	// of dedicated threads -- GUI, IO, computational, etc. Go processes use
+	// equivalent threads that all do a mix of GUI, IO, computations, etc.
+	// In such context dynamic priority boosting does nothing but harm, so we turn it off.
+	stdcall2(_SetProcessPriorityBoost, currentProcess, 1)
+}
+
+//go:nosplit
+func getRandomData(r []byte) {
+	const (
+		prov_rsa_full       = 1
+		crypt_verifycontext = 0xF0000000
+	)
+	var handle uintptr
+	n := 0
+	if stdcall5(_CryptAcquireContextW, uintptr(unsafe.Pointer(&handle)), 0, 0, prov_rsa_full, crypt_verifycontext) != 0 {
+		if stdcall3(_CryptGenRandom, handle, uintptr(len(r)), uintptr(unsafe.Pointer(&r[0]))) != 0 {
+			n = len(r)
+		}
+		stdcall2(_CryptReleaseContext, handle, 0)
+	}
+	extendRandom(r, n)
+}
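
getRandomData is the runtime's bootstrap entropy source, feeding hash seeding via extendRandom. Ordinary programs reach a Windows CSPRNG through crypto/rand instead:

package main

import (
	"crypto/rand"
	"fmt"
)

// User-level counterpart of getRandomData above: crypto/rand draws
// from the platform CSPRNG rather than calling CryptGenRandom by hand.
func main() {
	buf := make([]byte, 16)
	if _, err := rand.Read(buf); err != nil {
		panic(err)
	}
	fmt.Printf("%x\n", buf)
}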
+
+func goenvs() {
+	// strings is a pointer to environment variable pairs in the form:
+	//     "envA=valA\x00envB=valB\x00\x00" (in UTF-16)
+	// Two consecutive zero bytes end the list.
+	strings := unsafe.Pointer(stdcall0(_GetEnvironmentStringsW))
+	p := (*[1 << 24]uint16)(strings)[:]
+
+	n := 0
+	for from, i := 0, 0; true; i++ {
+		if p[i] == 0 {
+			// empty string marks the end
+			if i == from {
+				break
+			}
+			from = i + 1
+			n++
+		}
+	}
+	envs = make([]string, n)
+
+	for i := range envs {
+		envs[i] = gostringw(&p[0])
+		for p[0] != 0 {
+			p = p[1:]
+		}
+		p = p[1:] // skip NUL byte
+	}
+
+	stdcall1(_FreeEnvironmentStringsW, uintptr(strings))
+}
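
goenvs makes two passes over the double-NUL-terminated UTF-16 block that GetEnvironmentStringsW returns: one to count the strings, one to decode them. The same parse in portable form, with unicode/utf16 standing in for the runtime's gostringw:

package main

import (
	"fmt"
	"unicode/utf16"
)

// parseEnvBlock sketches the parsing in goenvs above: the block is
// UTF-16 "k=v\x00k=v\x00\x00"; an empty string terminates it.
func parseEnvBlock(p []uint16) []string {
	var envs []string
	for from, i := 0, 0; i < len(p); i++ {
		if p[i] == 0 {
			if i == from { // empty string: end of block
				break
			}
			envs = append(envs, string(utf16.Decode(p[from:i])))
			from = i + 1
		}
	}
	return envs
}

func main() {
	block := utf16.Encode([]rune("A=1\x00B=2\x00\x00"))
	fmt.Println(parseEnvBlock(block)) // [A=1 B=2]
}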
+
+//go:nosplit
+func exit(code int32) {
+	stdcall1(_ExitProcess, uintptr(code))
+}
+
+//go:nosplit
+func write(fd uintptr, buf unsafe.Pointer, n int32) int32 {
+	const (
+		_STD_OUTPUT_HANDLE = ^uintptr(10) // -11
+		_STD_ERROR_HANDLE  = ^uintptr(11) // -12
+	)
+	var handle uintptr
+	switch fd {
+	case 1:
+		handle = stdcall1(_GetStdHandle, _STD_OUTPUT_HANDLE)
+	case 2:
+		handle = stdcall1(_GetStdHandle, _STD_ERROR_HANDLE)
+	default:
+		// assume fd is a real Windows handle.
+		handle = fd
+	}
+	isASCII := true
+	b := (*[1 << 30]byte)(buf)[:n]
+	for _, x := range b {
+		if x >= 0x80 {
+			isASCII = false
+			break
+		}
+	}
+
+	if !isASCII {
+		var m uint32
+		isConsole := stdcall2(_GetConsoleMode, handle, uintptr(unsafe.Pointer(&m))) != 0
+		// If this is a console output, various non-unicode code pages can be in use.
+		// Use the dedicated WriteConsole call to ensure unicode is printed correctly.
+		if isConsole {
+			return int32(writeConsole(handle, buf, n))
+		}
+	}
+	var written uint32
+	stdcall5(_WriteFile, handle, uintptr(buf), uintptr(n), uintptr(unsafe.Pointer(&written)), 0)
+	return int32(written)
+}
+
+var (
+	utf16ConsoleBack     [1000]uint16
+	utf16ConsoleBackLock mutex
+)
+
+// writeConsole writes bufLen bytes from buf to the console.
+// It returns the number of bytes written.
+func writeConsole(handle uintptr, buf unsafe.Pointer, bufLen int32) int {
+	const surr2 = (surrogateMin + surrogateMax + 1) / 2
+
+	// Do not use defer for unlock. May cause issues when printing a panic.
+	lock(&utf16ConsoleBackLock)
+
+	b := (*[1 << 30]byte)(buf)[:bufLen]
+	s := *(*string)(unsafe.Pointer(&b))
+
+	utf16tmp := utf16ConsoleBack[:]
+
+	total := len(s)
+	w := 0
+	for len(s) > 0 {
+		if w >= len(utf16tmp)-2 {
+			writeConsoleUTF16(handle, utf16tmp[:w])
+			w = 0
+		}
+		r, n := charntorune(s)
+		s = s[n:]
+		if r < 0x10000 {
+			utf16tmp[w] = uint16(r)
+			w++
+		} else {
+			r -= 0x10000
+			utf16tmp[w] = surrogateMin + uint16(r>>10)&0x3ff
+			utf16tmp[w+1] = surr2 + uint16(r)&0x3ff
+			w += 2
+		}
+	}
+	writeConsoleUTF16(handle, utf16tmp[:w])
+	unlock(&utf16ConsoleBackLock)
+	return total
+}
+
+// writeConsoleUTF16 is the dedicated Windows call that correctly prints
+// to the console regardless of the current code page. Input is UTF-16 code units.
+// The handle must be a console handle.
+func writeConsoleUTF16(handle uintptr, b []uint16) {
+	l := uint32(len(b))
+	if l == 0 {
+		return
+	}
+	var written uint32
+	stdcall5(_WriteConsoleW,
+		handle,
+		uintptr(unsafe.Pointer(&b[0])),
+		uintptr(l),
+		uintptr(unsafe.Pointer(&written)),
+		0,
+	)
+	return
+}
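
The arithmetic in writeConsole is the standard UTF-16 surrogate encoding; assuming the usual runtime values surrogateMin = 0xD800 and surrogateMax = 0xDFFF, surr2 works out to 0xDC00, the start of the low-surrogate range. A quick standalone check against unicode/utf16:

package main

import (
	"fmt"
	"unicode/utf16"
)

// For r >= 0x10000 writeConsole emits
//   hi = 0xD800 + (r-0x10000)>>10, lo = 0xDC00 + (r-0x10000)&0x3FF,
// which is exactly what unicode/utf16.EncodeRune produces.
func main() {
	r := rune(0x1F600) // U+1F600
	v := r - 0x10000
	hi := rune(0xD800 + (v>>10)&0x3FF)
	lo := rune(0xDC00 + v&0x3FF)
	h2, l2 := utf16.EncodeRune(r)
	fmt.Printf("manual: %#x %#x  utf16: %#x %#x\n", hi, lo, h2, l2)
}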
+
+//go:nosplit
+func semasleep(ns int64) int32 {
+	// store ms in ns to save stack space
+	if ns < 0 {
+		ns = _INFINITE
+	} else {
+		ns = int64(timediv(ns, 1000000, nil))
+		if ns == 0 {
+			ns = 1
+		}
+	}
+	if stdcall2(_WaitForSingleObject, getg().m.waitsema, uintptr(ns)) != 0 {
+		return -1 // timeout
+	}
+	return 0
+}
+
+//go:nosplit
+func semawakeup(mp *m) {
+	stdcall1(_SetEvent, mp.waitsema)
+}
+
+//go:nosplit
+func semacreate(mp *m) {
+	if mp.waitsema != 0 {
+		return
+	}
+	mp.waitsema = stdcall4(_CreateEventA, 0, 0, 0, 0)
+}
+
+// May run with m.p==nil, so write barriers are not allowed. This
+// function is called by newosproc0, so it is also required to
+// operate without stack guards.
+//go:nowritebarrierrec
+//go:nosplit
+func newosproc(mp *m, stk unsafe.Pointer) {
+	const _STACK_SIZE_PARAM_IS_A_RESERVATION = 0x00010000
+	thandle := stdcall6(_CreateThread, 0, 0x20000,
+		funcPC(tstart_stdcall), uintptr(unsafe.Pointer(mp)),
+		_STACK_SIZE_PARAM_IS_A_RESERVATION, 0)
+	if thandle == 0 {
+		print("runtime: failed to create new OS thread (have ", mcount(), " already; errno=", getlasterror(), ")\n")
+		throw("runtime.newosproc")
+	}
+}
+
+// Used by the C library build mode. On Linux this function would allocate a
+// stack, but that's not necessary for Windows. No stack guards are present
+// and the GC has not been initialized, so write barriers will fail.
+//go:nowritebarrierrec
+//go:nosplit
+func newosproc0(mp *m, stk unsafe.Pointer) {
+	newosproc(mp, stk)
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the parent thread (main thread in case of bootstrap), can allocate memory.
+func mpreinit(mp *m) {
+}
+
+//go:nosplit
+func msigsave(mp *m) {
+}
+
+//go:nosplit
+func msigrestore(sigmask sigset) {
+}
+
+//go:nosplit
+func sigblock() {
+}
+
+// Called to initialize a new m (including the bootstrap m).
+// Called on the new thread, cannot allocate memory.
+func minit() {
+	var thandle uintptr
+	stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
+	atomic.Storeuintptr(&getg().m.thread, thandle)
+}
+
+// Called from dropm to undo the effect of an minit.
+//go:nosplit
+func unminit() {
+	tp := &getg().m.thread
+	stdcall1(_CloseHandle, *tp)
+	*tp = 0
+}
+
+// Described in http://www.dcl.hpi.uni-potsdam.de/research/WRK/2007/08/getting-os-information-the-kuser_shared_data-structure/
+type _KSYSTEM_TIME struct {
+	LowPart   uint32
+	High1Time int32
+	High2Time int32
+}
+
+const (
+	_INTERRUPT_TIME = 0x7ffe0008
+	_SYSTEM_TIME    = 0x7ffe0014
+)
+
+//go:nosplit
+func systime(addr uintptr) int64 {
+	timeaddr := (*_KSYSTEM_TIME)(unsafe.Pointer(addr))
+
+	var t _KSYSTEM_TIME
+	for i := 1; i < 10000; i++ {
+		// these fields must be read in that order (see URL above)
+		t.High1Time = timeaddr.High1Time
+		t.LowPart = timeaddr.LowPart
+		t.High2Time = timeaddr.High2Time
+		if t.High1Time == t.High2Time {
+			return int64(t.High1Time)<<32 | int64(t.LowPart)
+		}
+		if (i % 100) == 0 {
+			osyield()
+		}
+	}
+	systemstack(func() {
+		throw("interrupt/system time is changing too fast")
+	})
+	return 0
+}
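
systime is a lock-free read of a 64-bit clock that the kernel updates in place: load High1, then Low, then High2, and accept the value only when the two high halves match. That check suffices because, per the article referenced above, the writer updates the fields in the reverse of the read order. A toy model of the protocol (not runtime code):

package main

import (
	"fmt"
	"sync/atomic"
)

// The writer publishes High2 first, Low, then High1 last; the reader
// loads High1, Low, High2 and retries until High1 == High2, which
// guarantees Low belongs to the same epoch as the high word.
type ksystemTime struct {
	low   uint32
	high1 int32
	high2 int32
}

func (t *ksystemTime) write(v int64) {
	atomic.StoreInt32(&t.high2, int32(v>>32))
	atomic.StoreUint32(&t.low, uint32(v))
	atomic.StoreInt32(&t.high1, int32(v>>32))
}

func (t *ksystemTime) read() int64 {
	for {
		h1 := atomic.LoadInt32(&t.high1)
		lo := atomic.LoadUint32(&t.low)
		h2 := atomic.LoadInt32(&t.high2)
		if h1 == h2 {
			return int64(h1)<<32 | int64(lo)
		}
		// torn read: writer was mid-update, try again
	}
}

func main() {
	var t ksystemTime
	t.write(0x123456789)
	fmt.Printf("%#x\n", t.read())
}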
+
+//go:nosplit
+func unixnano() int64 {
+	return (systime(_SYSTEM_TIME) - 116444736000000000) * 100
+}
+
+//go:nosplit
+func nanotime() int64 {
+	return systime(_INTERRUPT_TIME) * 100
+}
+
+// Calling stdcall on os stack.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+//go:nosplit
+func stdcall(fn stdFunction) uintptr {
+	gp := getg()
+	mp := gp.m
+	mp.libcall.fn = uintptr(unsafe.Pointer(fn))
+
+	if mp.profilehz != 0 {
+		// leave pc/sp for cpu profiler
+		mp.libcallg.set(gp)
+		mp.libcallpc = getcallerpc(unsafe.Pointer(&fn))
+		// sp must be set last: once the async cpu profiler finds
+		// all three values non-zero, it will use them
+		mp.libcallsp = getcallersp(unsafe.Pointer(&fn))
+	}
+	asmcgocall(asmstdcallAddr, unsafe.Pointer(&mp.libcall))
+	mp.libcallsp = 0
+	return mp.libcall.r1
+}
+
+//go:nosplit
+func stdcall0(fn stdFunction) uintptr {
+	mp := getg().m
+	mp.libcall.n = 0
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&fn))) // it's unused but must be non-nil, otherwise crashes
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall1(fn stdFunction, a0 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 1
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall2(fn stdFunction, a0, a1 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 2
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall3(fn stdFunction, a0, a1, a2 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 3
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall4(fn stdFunction, a0, a1, a2, a3 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 4
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall5(fn stdFunction, a0, a1, a2, a3, a4 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 5
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall6(fn stdFunction, a0, a1, a2, a3, a4, a5 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 6
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+//go:nosplit
+func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
+	mp := getg().m
+	mp.libcall.n = 7
+	mp.libcall.args = uintptr(noescape(unsafe.Pointer(&a0)))
+	return stdcall(fn)
+}
+
+// in sys_windows_386.s and sys_windows_amd64.s
+func onosstack(fn unsafe.Pointer, arg uint32)
+func usleep2(usec uint32)
+func switchtothread()
+
+var usleep2Addr unsafe.Pointer
+var switchtothreadAddr unsafe.Pointer
+
+//go:nosplit
+func osyield() {
+	onosstack(switchtothreadAddr, 0)
+}
+
+//go:nosplit
+func usleep(us uint32) {
+	// Have 1us units; want 100ns units.
+	onosstack(usleep2Addr, 10*us)
+}
+
+func ctrlhandler1(_type uint32) uint32 {
+	var s uint32
+
+	switch _type {
+	case _CTRL_C_EVENT, _CTRL_BREAK_EVENT:
+		s = _SIGINT
+	default:
+		return 0
+	}
+
+	if sigsend(s) {
+		return 1
+	}
+	exit(2) // SIGINT, SIGTERM, etc
+	return 0
+}
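
ctrlhandler1 maps console control events onto the Unix-style signal machinery: CTRL_C and CTRL_BREAK become SIGINT, and the exit(2) fallback preserves the conventional die-on-interrupt default when nothing is listening. From user code this surfaces through os/signal exactly as on Unix:

package main

import (
	"fmt"
	"os"
	"os/signal"
)

// Pressing Ctrl+C in a Windows console reaches this channel via
// ctrlhandler1/sigsend above, just like SIGINT on Unix.
func main() {
	c := make(chan os.Signal, 1)
	signal.Notify(c, os.Interrupt)
	fmt.Println("press Ctrl+C...")
	fmt.Println("got", <-c)
}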
+
+// in sys_windows_386.s and sys_windows_amd64.s
+func profileloop()
+
+var profiletimer uintptr
+
+func profilem(mp *m) {
+	var r *context
+	rbuf := make([]byte, unsafe.Sizeof(*r)+15)
+
+	tls := &mp.tls[0]
+	gp := *((**g)(unsafe.Pointer(tls)))
+
+	// align Context to 16 bytes
+	r = (*context)(unsafe.Pointer((uintptr(unsafe.Pointer(&rbuf[15]))) &^ 15))
+	r.contextflags = _CONTEXT_CONTROL
+	stdcall2(_GetThreadContext, mp.thread, uintptr(unsafe.Pointer(r)))
+	sigprof(r.ip(), r.sp(), 0, gp, mp)
+}
+
+func profileloop1(param uintptr) uint32 {
+	stdcall2(_SetThreadPriority, currentThread, _THREAD_PRIORITY_HIGHEST)
+
+	for {
+		stdcall2(_WaitForSingleObject, profiletimer, _INFINITE)
+		first := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))
+		for mp := first; mp != nil; mp = mp.alllink {
+			thread := atomic.Loaduintptr(&mp.thread)
+			// Do not profile threads blocked on Notes;
+			// this includes idle worker threads,
+			// idle timer thread, idle heap scavenger, etc.
+			if thread == 0 || mp.profilehz == 0 || mp.blocked {
+				continue
+			}
+			stdcall1(_SuspendThread, thread)
+			if mp.profilehz != 0 && !mp.blocked {
+				profilem(mp)
+			}
+			stdcall1(_ResumeThread, thread)
+		}
+	}
+}
+
+var cpuprofilerlock mutex
+
+func resetcpuprofiler(hz int32) {
+	lock(&cpuprofilerlock)
+	if profiletimer == 0 {
+		timer := stdcall3(_CreateWaitableTimerA, 0, 0, 0)
+		atomic.Storeuintptr(&profiletimer, timer)
+		thread := stdcall6(_CreateThread, 0, 0, funcPC(profileloop), 0, 0, 0)
+		stdcall2(_SetThreadPriority, thread, _THREAD_PRIORITY_HIGHEST)
+		stdcall1(_CloseHandle, thread)
+	}
+	unlock(&cpuprofilerlock)
+
+	ms := int32(0)
+	due := ^int64(^uint64(1 << 63))
+	if hz > 0 {
+		ms = 1000 / hz
+		if ms == 0 {
+			ms = 1
+		}
+		due = int64(ms) * -10000
+	}
+	stdcall6(_SetWaitableTimer, profiletimer, uintptr(unsafe.Pointer(&due)), uintptr(ms), 0, 0, 0)
+	atomic.Store((*uint32)(unsafe.Pointer(&getg().m.profilehz)), uint32(hz))
+}
+
+func memlimit() uintptr {
+	return 0
+}
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index a166281..60b277d 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -4,41 +4,61 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"unsafe"
+)
+
+// Calling panic with one of the errors below will call errorString.Error
+// which will call mallocgc to concatenate strings. That will fail if
+// malloc is locked, causing a confusing error message. Throw a better
+// error message instead.
+func panicCheckMalloc(err error) {
+	gp := getg()
+	if gp != nil && gp.m != nil && gp.m.mallocing != 0 {
+		throw(string(err.(errorString)))
+	}
+}
 
 var indexError = error(errorString("index out of range"))
 
 func panicindex() {
+	panicCheckMalloc(indexError)
 	panic(indexError)
 }
 
 var sliceError = error(errorString("slice bounds out of range"))
 
 func panicslice() {
+	panicCheckMalloc(sliceError)
 	panic(sliceError)
 }
 
 var divideError = error(errorString("integer divide by zero"))
 
 func panicdivide() {
+	panicCheckMalloc(divideError)
 	panic(divideError)
 }
 
 var overflowError = error(errorString("integer overflow"))
 
 func panicoverflow() {
+	panicCheckMalloc(overflowError)
 	panic(overflowError)
 }
 
 var floatError = error(errorString("floating point error"))
 
 func panicfloat() {
+	panicCheckMalloc(floatError)
 	panic(floatError)
 }
 
 var memoryError = error(errorString("invalid memory address or nil pointer dereference"))
 
 func panicmem() {
+	panicCheckMalloc(memoryError)
 	panic(memoryError)
 }
 
@@ -59,10 +79,10 @@
 		throw("defer on system stack")
 	}
 
-	// the arguments of fn are in a perilous state.  The stack map
-	// for deferproc does not describe them.  So we can't let garbage
+	// the arguments of fn are in a perilous state. The stack map
+	// for deferproc does not describe them. So we can't let garbage
 	// collection or stack copying trigger until we've copied them out
-	// to somewhere safe.  The memmove below does that.
+	// to somewhere safe. The memmove below does that.
 	// Until the copy completes, we can only call nosplit routines.
 	sp := getcallersp(unsafe.Pointer(&siz))
 	argp := uintptr(unsafe.Pointer(&fn)) + unsafe.Sizeof(fn)
@@ -185,7 +205,7 @@
 	if d == nil {
 		// Allocate new defer+args.
 		total := roundupsize(totaldefersize(uintptr(siz)))
-		d = (*_defer)(mallocgc(total, deferType, 0))
+		d = (*_defer)(mallocgc(total, deferType, true))
 	}
 	d.siz = siz
 	gp := mp.curg
@@ -252,7 +272,7 @@
 // If there is a deferred function, this will call runtime·jmpdefer,
 // which will jump to the deferred function such that it appears
 // to have been called by the caller of deferreturn at the point
-// just before deferreturn was called.  The effect is that deferreturn
+// just before deferreturn was called. The effect is that deferreturn
 // is called again and again until there are no more deferred functions.
 // Cannot split the stack because we reuse the caller's frame to
 // call the deferred function.
@@ -288,8 +308,8 @@
 	jmpdefer(fn, uintptr(unsafe.Pointer(&arg0)))
 }
 
-// Goexit terminates the goroutine that calls it.  No other goroutine is affected.
-// Goexit runs all deferred calls before terminating the goroutine.  Because Goexit
+// Goexit terminates the goroutine that calls it. No other goroutine is affected.
+// Goexit runs all deferred calls before terminating the goroutine. Because Goexit
 // is not panic, however, any recover calls in those deferred functions will return nil.
 //
 // Calling Goexit from the main goroutine terminates that goroutine
@@ -330,7 +350,22 @@
 	goexit1()
 }
 
-// Print all currently active panics.  Used when crashing.
+// Call all Error and String methods before freezing the world.
+// Used when crashing while panicking.
+// This must match types handled by printany.
+func preprintpanics(p *_panic) {
+	for p != nil {
+		switch v := p.arg.(type) {
+		case error:
+			p.arg = v.Error()
+		case stringer:
+			p.arg = v.String()
+		}
+		p = p.link
+	}
+}
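
preprintpanics exists because Error and String are arbitrary user code that may allocate or take locks, neither of which is safe once the world is frozen; evaluating them eagerly reduces each panic argument to a plain string while the world is still running. Any panic value like the following takes this path:

package main

import "fmt"

// A panic value whose Error method is ordinary user code; the runtime
// calls it to produce the printed message, so that call must happen
// before freezetheworld.
type myErr struct{ code int }

func (e myErr) Error() string { return fmt.Sprintf("myErr(code=%d)", e.code) }

func main() {
	defer func() {
		fmt.Println("recovered:", recover())
	}()
	panic(myErr{42}) // printed via its Error method if not recovered
}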
+
+// Print all currently active panics. Used when crashing.
 func printpanics(p *_panic) {
 	if p.link != nil {
 		printpanics(p.link)
@@ -417,7 +452,7 @@
 		// Record the panic that is running the defer.
 		// If there is a new panic during the deferred call, that panic
 		// will find d in the list and will mark d._panic (this panic) aborted.
-		d._panic = (*_panic)(noescape((unsafe.Pointer)(&p)))
+		d._panic = (*_panic)(noescape(unsafe.Pointer(&p)))
 
 		p.argp = unsafe.Pointer(getargp(0))
 		reflectcall(nil, unsafe.Pointer(d.fn), deferArgs(d), uint32(d.siz), uint32(d.siz))
@@ -431,7 +466,7 @@
 		d.fn = nil
 		gp._defer = d.link
 
-		// trigger shrinkage to test stack copy.  See stack_test.go:TestStackPanic
+		// trigger shrinkage to test stack copy. See stack_test.go:TestStackPanic
 		//GC()
 
 		pc := d.pc
@@ -456,6 +491,10 @@
 	}
 
 	// ran out of deferred calls - old-school panic now
+	// Because it is unsafe to call arbitrary user code after freezing
+	// the world, we call preprintpanics to invoke all necessary Error
+	// and String methods to prepare the panic strings before startpanic.
+	preprintpanics(gp._panic)
 	startpanic()
 	printpanics(gp._panic)
 	dopanic(0)       // should not return
@@ -527,3 +566,150 @@
 	dopanic(0)
 	*(*int)(nil) = 0 // not reached
 }
+
+//uint32 runtime·panicking;
+var paniclk mutex
+
+// Unwind the stack after a deferred function calls recover
+// after a panic. Then arrange to continue running as though
+// the caller of the deferred function returned normally.
+func recovery(gp *g) {
+	// Info about defer passed in G struct.
+	sp := gp.sigcode0
+	pc := gp.sigcode1
+
+	// d's arguments need to be in the stack.
+	if sp != 0 && (sp < gp.stack.lo || gp.stack.hi < sp) {
+		print("recover: ", hex(sp), " not in [", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n")
+		throw("bad recovery")
+	}
+
+	// Make the deferproc for this d return again,
+	// this time returning 1. The calling function will
+	// jump to the standard return epilogue.
+	gcUnwindBarriers(gp, sp)
+	gp.sched.sp = sp
+	gp.sched.pc = pc
+	gp.sched.lr = 0
+	gp.sched.ret = 1
+	gogo(&gp.sched)
+}
+
+func startpanic_m() {
+	_g_ := getg()
+	if mheap_.cachealloc.size == 0 { // very early
+		print("runtime: panic before malloc heap initialized\n")
+		_g_.m.mallocing = 1 // tell rest of panic not to try to malloc
+	} else if _g_.m.mcache == nil { // can happen if called from signal handler or throw
+		_g_.m.mcache = allocmcache()
+	}
+
+	switch _g_.m.dying {
+	case 0:
+		_g_.m.dying = 1
+		_g_.writebuf = nil
+		atomic.Xadd(&panicking, 1)
+		lock(&paniclk)
+		if debug.schedtrace > 0 || debug.scheddetail > 0 {
+			schedtrace(true)
+		}
+		freezetheworld()
+		return
+	case 1:
+		// Something failed while panicking, probably the print of the
+		// argument to panic(). Just print a stack trace and exit.
+		_g_.m.dying = 2
+		print("panic during panic\n")
+		dopanic(0)
+		exit(3)
+		fallthrough
+	case 2:
+		// This is a genuine bug in the runtime, we couldn't even
+		// print the stack trace successfully.
+		_g_.m.dying = 3
+		print("stack trace unavailable\n")
+		exit(4)
+		fallthrough
+	default:
+		// Can't even print! Just exit.
+		exit(5)
+	}
+}
+
+var didothers bool
+var deadlock mutex
+
+func dopanic_m(gp *g, pc, sp uintptr) {
+	if gp.sig != 0 {
+		signame := signame(gp.sig)
+		if signame != "" {
+			print("[signal ", signame)
+		} else {
+			print("[signal ", hex(gp.sig))
+		}
+		print(" code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n")
+	}
+
+	level, all, docrash := gotraceback()
+	_g_ := getg()
+	if level > 0 {
+		if gp != gp.m.curg {
+			all = true
+		}
+		if gp != gp.m.g0 {
+			print("\n")
+			goroutineheader(gp)
+			traceback(pc, sp, 0, gp)
+		} else if level >= 2 || _g_.m.throwing > 0 {
+			print("\nruntime stack:\n")
+			traceback(pc, sp, 0, gp)
+		}
+		if !didothers && all {
+			didothers = true
+			tracebackothers(gp)
+		}
+	}
+	unlock(&paniclk)
+
+	if atomic.Xadd(&panicking, -1) != 0 {
+		// Some other m is panicking too.
+		// Let it print what it needs to print.
+		// Wait forever without chewing up cpu.
+		// It will exit when it's done.
+		lock(&deadlock)
+		lock(&deadlock)
+	}
+
+	if docrash {
+		crash()
+	}
+
+	exit(2)
+}
+
+//go:nosplit
+func canpanic(gp *g) bool {
+	// Note that g is m->gsignal, different from gp.
+	// Note also that g->m can change at preemption, so m can go stale
+	// if this function ever makes a function call.
+	_g_ := getg()
+	_m_ := _g_.m
+
+	// Is it okay for gp to panic instead of crashing the program?
+	// Yes, as long as it is running Go code, not runtime code,
+	// and not stuck in a system call.
+	if gp == nil || gp != _m_.curg {
+		return false
+	}
+	if _m_.locks-_m_.softfloat != 0 || _m_.mallocing != 0 || _m_.throwing != 0 || _m_.preemptoff != "" || _m_.dying != 0 {
+		return false
+	}
+	status := readgstatus(gp)
+	if status&^_Gscan != _Grunning || gp.syscallsp != 0 {
+		return false
+	}
+	if GOOS == "windows" && _m_.libcallsp != 0 {
+		return false
+	}
+	return true
+}
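
How much of this machinery shows up in a crash is governed by gotraceback(): level selects between the current goroutine, all goroutines, and runtime frames, while docrash makes dopanic_m call crash() for a core dump. Both are settable via GOTRACEBACK or, equivalently, at run time:

package main

import "runtime/debug"

// Equivalent to running with GOTRACEBACK=all: on an unrecovered
// panic, dump every goroutine's stack, not just the current one.
func main() {
	debug.SetTraceback("all")
	var p *int
	_ = *p // nil dereference: runtime panic with full goroutine dump
}
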
diff --git a/src/runtime/panic1.go b/src/runtime/panic1.go
deleted file mode 100644
index 1a71d09..0000000
--- a/src/runtime/panic1.go
+++ /dev/null
@@ -1,150 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-// Code related to defer, panic and recover.
-// TODO: Merge into panic.go.
-
-//uint32 runtime·panicking;
-var paniclk mutex
-
-const hasLinkRegister = GOARCH == "arm" || GOARCH == "arm64" || GOARCH == "ppc64" || GOARCH == "ppc64le"
-
-// Unwind the stack after a deferred function calls recover
-// after a panic.  Then arrange to continue running as though
-// the caller of the deferred function returned normally.
-func recovery(gp *g) {
-	// Info about defer passed in G struct.
-	sp := gp.sigcode0
-	pc := gp.sigcode1
-
-	// d's arguments need to be in the stack.
-	if sp != 0 && (sp < gp.stack.lo || gp.stack.hi < sp) {
-		print("recover: ", hex(sp), " not in [", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n")
-		throw("bad recovery")
-	}
-
-	// Make the deferproc for this d return again,
-	// this time returning 1.  The calling function will
-	// jump to the standard return epilogue.
-	gcUnwindBarriers(gp, sp)
-	gp.sched.sp = sp
-	gp.sched.pc = pc
-	gp.sched.lr = 0
-	gp.sched.ret = 1
-	gogo(&gp.sched)
-}
-
-func startpanic_m() {
-	_g_ := getg()
-	if mheap_.cachealloc.size == 0 { // very early
-		print("runtime: panic before malloc heap initialized\n")
-		_g_.m.mallocing = 1 // tell rest of panic not to try to malloc
-	} else if _g_.m.mcache == nil { // can happen if called from signal handler or throw
-		_g_.m.mcache = allocmcache()
-	}
-
-	switch _g_.m.dying {
-	case 0:
-		_g_.m.dying = 1
-		if _g_ != nil {
-			_g_.writebuf = nil
-		}
-		xadd(&panicking, 1)
-		lock(&paniclk)
-		if debug.schedtrace > 0 || debug.scheddetail > 0 {
-			schedtrace(true)
-		}
-		freezetheworld()
-		return
-	case 1:
-		// Something failed while panicing, probably the print of the
-		// argument to panic().  Just print a stack trace and exit.
-		_g_.m.dying = 2
-		print("panic during panic\n")
-		dopanic(0)
-		exit(3)
-		fallthrough
-	case 2:
-		// This is a genuine bug in the runtime, we couldn't even
-		// print the stack trace successfully.
-		_g_.m.dying = 3
-		print("stack trace unavailable\n")
-		exit(4)
-		fallthrough
-	default:
-		// Can't even print!  Just exit.
-		exit(5)
-	}
-}
-
-var didothers bool
-var deadlock mutex
-
-func dopanic_m(gp *g, pc, sp uintptr) {
-	if gp.sig != 0 {
-		print("[signal ", hex(gp.sig), " code=", hex(gp.sigcode0), " addr=", hex(gp.sigcode1), " pc=", hex(gp.sigpc), "]\n")
-	}
-
-	var docrash bool
-	_g_ := getg()
-	if t := gotraceback(&docrash); t > 0 {
-		if gp != gp.m.g0 {
-			print("\n")
-			goroutineheader(gp)
-			traceback(pc, sp, 0, gp)
-		} else if t >= 2 || _g_.m.throwing > 0 {
-			print("\nruntime stack:\n")
-			traceback(pc, sp, 0, gp)
-		}
-		if !didothers {
-			didothers = true
-			tracebackothers(gp)
-		}
-	}
-	unlock(&paniclk)
-
-	if xadd(&panicking, -1) != 0 {
-		// Some other m is panicking too.
-		// Let it print what it needs to print.
-		// Wait forever without chewing up cpu.
-		// It will exit when it's done.
-		lock(&deadlock)
-		lock(&deadlock)
-	}
-
-	if docrash {
-		crash()
-	}
-
-	exit(2)
-}
-
-//go:nosplit
-func canpanic(gp *g) bool {
-	// Note that g is m->gsignal, different from gp.
-	// Note also that g->m can change at preemption, so m can go stale
-	// if this function ever makes a function call.
-	_g_ := getg()
-	_m_ := _g_.m
-
-	// Is it okay for gp to panic instead of crashing the program?
-	// Yes, as long as it is running Go code, not runtime code,
-	// and not stuck in a system call.
-	if gp == nil || gp != _m_.curg {
-		return false
-	}
-	if _m_.locks-_m_.softfloat != 0 || _m_.mallocing != 0 || _m_.throwing != 0 || _m_.preemptoff != "" || _m_.dying != 0 {
-		return false
-	}
-	status := readgstatus(gp)
-	if status&^_Gscan != _Grunning || gp.syscallsp != 0 {
-		return false
-	}
-	if GOOS == "windows" && _m_.libcallsp != 0 {
-		return false
-	}
-	return true
-}
diff --git a/src/runtime/parfor.go b/src/runtime/parfor.go
deleted file mode 100644
index c82beee..0000000
--- a/src/runtime/parfor.go
+++ /dev/null
@@ -1,212 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Parallel for algorithm.
-
-package runtime
-
-// A parfor holds state for the parallel for operation.
-type parfor struct {
-	body   func(*parfor, uint32) // executed for each element
-	done   uint32                // number of idle threads
-	nthr   uint32                // total number of threads
-	thrseq uint32                // thread id sequencer
-	cnt    uint32                // iteration space [0, cnt)
-	wait   bool                  // if true, wait while all threads finish processing,
-	// otherwise parfor may return while other threads are still working
-
-	thr []parforthread // thread descriptors
-
-	// stats
-	nsteal     uint64
-	nstealcnt  uint64
-	nprocyield uint64
-	nosyield   uint64
-	nsleep     uint64
-}
-
-// A parforthread holds state for a single thread in the parallel for.
-type parforthread struct {
-	// the thread's iteration space [32lsb, 32msb)
-	pos uint64
-	// stats
-	nsteal     uint64
-	nstealcnt  uint64
-	nprocyield uint64
-	nosyield   uint64
-	nsleep     uint64
-	pad        [_CacheLineSize]byte
-}
-
-func parforalloc(nthrmax uint32) *parfor {
-	return &parfor{
-		thr: make([]parforthread, nthrmax),
-	}
-}
-
-// Parforsetup initializes desc for a parallel for operation with nthr
-// threads executing n jobs.
-//
-// On return the nthr threads are each expected to call parfordo(desc)
-// to run the operation. During those calls, for each i in [0, n), one
-// thread will be used invoke body(desc, i).
-// If wait is true, no parfordo will return until all work has been completed.
-// If wait is false, parfordo may return when there is a small amount
-// of work left, under the assumption that another thread has that
-// work well in hand.
-func parforsetup(desc *parfor, nthr, n uint32, wait bool, body func(*parfor, uint32)) {
-	if desc == nil || nthr == 0 || nthr > uint32(len(desc.thr)) || body == nil {
-		print("desc=", desc, " nthr=", nthr, " count=", n, " body=", body, "\n")
-		throw("parfor: invalid args")
-	}
-
-	desc.body = body
-	desc.done = 0
-	desc.nthr = nthr
-	desc.thrseq = 0
-	desc.cnt = n
-	desc.wait = wait
-	desc.nsteal = 0
-	desc.nstealcnt = 0
-	desc.nprocyield = 0
-	desc.nosyield = 0
-	desc.nsleep = 0
-
-	for i := range desc.thr {
-		begin := uint32(uint64(n) * uint64(i) / uint64(nthr))
-		end := uint32(uint64(n) * uint64(i+1) / uint64(nthr))
-		desc.thr[i].pos = uint64(begin) | uint64(end)<<32
-	}
-}
-
-func parfordo(desc *parfor) {
-	// Obtain 0-based thread index.
-	tid := xadd(&desc.thrseq, 1) - 1
-	if tid >= desc.nthr {
-		print("tid=", tid, " nthr=", desc.nthr, "\n")
-		throw("parfor: invalid tid")
-	}
-
-	// If single-threaded, just execute the for serially.
-	body := desc.body
-	if desc.nthr == 1 {
-		for i := uint32(0); i < desc.cnt; i++ {
-			body(desc, i)
-		}
-		return
-	}
-
-	me := &desc.thr[tid]
-	mypos := &me.pos
-	for {
-		for {
-			// While there is local work,
-			// bump low index and execute the iteration.
-			pos := xadd64(mypos, 1)
-			begin := uint32(pos) - 1
-			end := uint32(pos >> 32)
-			if begin < end {
-				body(desc, begin)
-				continue
-			}
-			break
-		}
-
-		// Out of work, need to steal something.
-		idle := false
-		for try := uint32(0); ; try++ {
-			// If we don't see any work for long enough,
-			// increment the done counter...
-			if try > desc.nthr*4 && !idle {
-				idle = true
-				xadd(&desc.done, 1)
-			}
-
-			// ...if all threads have incremented the counter,
-			// we are done.
-			extra := uint32(0)
-			if !idle {
-				extra = 1
-			}
-			if desc.done+extra == desc.nthr {
-				if !idle {
-					xadd(&desc.done, 1)
-				}
-				goto exit
-			}
-
-			// Choose a random victim for stealing.
-			var begin, end uint32
-			victim := fastrand1() % (desc.nthr - 1)
-			if victim >= tid {
-				victim++
-			}
-			victimpos := &desc.thr[victim].pos
-			for {
-				// See if it has any work.
-				pos := atomicload64(victimpos)
-				begin = uint32(pos)
-				end = uint32(pos >> 32)
-				if begin+1 >= end {
-					end = 0
-					begin = end
-					break
-				}
-				if idle {
-					xadd(&desc.done, -1)
-					idle = false
-				}
-				begin2 := begin + (end-begin)/2
-				newpos := uint64(begin) | uint64(begin2)<<32
-				if cas64(victimpos, pos, newpos) {
-					begin = begin2
-					break
-				}
-			}
-			if begin < end {
-				// Has successfully stolen some work.
-				if idle {
-					throw("parfor: should not be idle")
-				}
-				atomicstore64(mypos, uint64(begin)|uint64(end)<<32)
-				me.nsteal++
-				me.nstealcnt += uint64(end) - uint64(begin)
-				break
-			}
-
-			// Backoff.
-			if try < desc.nthr {
-				// nothing
-			} else if try < 4*desc.nthr {
-				me.nprocyield++
-				procyield(20)
-			} else if !desc.wait {
-				// If a caller asked not to wait for the others, exit now
-				// (assume that most work is already done at this point).
-				if !idle {
-					xadd(&desc.done, 1)
-				}
-				goto exit
-			} else if try < 6*desc.nthr {
-				me.nosyield++
-				osyield()
-			} else {
-				me.nsleep++
-				usleep(1)
-			}
-		}
-	}
-
-exit:
-	xadd64(&desc.nsteal, int64(me.nsteal))
-	xadd64(&desc.nstealcnt, int64(me.nstealcnt))
-	xadd64(&desc.nprocyield, int64(me.nprocyield))
-	xadd64(&desc.nosyield, int64(me.nosyield))
-	xadd64(&desc.nsleep, int64(me.nsleep))
-	me.nsteal = 0
-	me.nstealcnt = 0
-	me.nprocyield = 0
-	me.nosyield = 0
-	me.nsleep = 0
-}
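
parfor was the runtime's internal work-stealing parallel for; its deletion here, alongside its tests below, suggests the garbage collector, its last user, no longer needs it. An equivalent fan-out over [0, n) is short in ordinary Go, as in this sketch that hands out indices with an atomic counter instead of parfor's range splitting and stealing:

package main

import (
	"fmt"
	"runtime"
	"sync"
	"sync/atomic"
)

// parallelFor runs body(i) for each i in [0, n) across GOMAXPROCS
// workers. Indices are claimed with an atomic counter; no stealing
// or per-thread range bookkeeping is needed at this scale.
func parallelFor(n int, body func(i int)) {
	var next int64
	var wg sync.WaitGroup
	for w := 0; w < runtime.GOMAXPROCS(0); w++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for {
				i := int(atomic.AddInt64(&next, 1)) - 1
				if i >= n {
					return
				}
				body(i)
			}
		}()
	}
	wg.Wait()
}

func main() {
	data := make([]uint64, 20)
	for i := range data {
		data[i] = uint64(i)
	}
	parallelFor(len(data), func(i int) { data[i] = data[i]*data[i] + 1 })
	fmt.Println(data)
}
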
diff --git a/src/runtime/parfor_test.go b/src/runtime/parfor_test.go
deleted file mode 100644
index 5d22aec..0000000
--- a/src/runtime/parfor_test.go
+++ /dev/null
@@ -1,128 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// The race detector does not understand ParFor synchronization.
-// +build !race
-
-package runtime_test
-
-import (
-	. "runtime"
-	"testing"
-)
-
-// Simple serial sanity test for parallelfor.
-func TestParFor(t *testing.T) {
-	const P = 1
-	const N = 20
-	data := make([]uint64, N)
-	for i := uint64(0); i < N; i++ {
-		data[i] = i
-	}
-	desc := NewParFor(P)
-	ParForSetup(desc, P, N, true, func(desc *ParFor, i uint32) {
-		data[i] = data[i]*data[i] + 1
-	})
-	ParForDo(desc)
-	for i := uint64(0); i < N; i++ {
-		if data[i] != i*i+1 {
-			t.Fatalf("Wrong element %d: %d", i, data[i])
-		}
-	}
-}
-
-// Test that nonblocking parallelfor does not block.
-func TestParFor2(t *testing.T) {
-	const P = 7
-	const N = 1003
-	data := make([]uint64, N)
-	for i := uint64(0); i < N; i++ {
-		data[i] = i
-	}
-	desc := NewParFor(P)
-	ParForSetup(desc, P, N, false, func(desc *ParFor, i uint32) {
-		data[i] = data[i]*data[i] + 1
-	})
-	for p := 0; p < P; p++ {
-		ParForDo(desc)
-	}
-	for i := uint64(0); i < N; i++ {
-		if data[i] != i*i+1 {
-			t.Fatalf("Wrong element %d: %d", i, data[i])
-		}
-	}
-}
-
-// Test that iterations are properly distributed.
-func TestParForSetup(t *testing.T) {
-	const P = 11
-	const N = 101
-	desc := NewParFor(P)
-	for n := uint32(0); n < N; n++ {
-		for p := uint32(1); p <= P; p++ {
-			ParForSetup(desc, p, n, true, func(desc *ParFor, i uint32) {})
-			sum := uint32(0)
-			size0 := uint32(0)
-			end0 := uint32(0)
-			for i := uint32(0); i < p; i++ {
-				begin, end := ParForIters(desc, i)
-				size := end - begin
-				sum += size
-				if i == 0 {
-					size0 = size
-					if begin != 0 {
-						t.Fatalf("incorrect begin: %d (n=%d, p=%d)", begin, n, p)
-					}
-				} else {
-					if size != size0 && size != size0+1 {
-						t.Fatalf("incorrect size: %d/%d (n=%d, p=%d)", size, size0, n, p)
-					}
-					if begin != end0 {
-						t.Fatalf("incorrect begin/end: %d/%d (n=%d, p=%d)", begin, end0, n, p)
-					}
-				}
-				end0 = end
-			}
-			if sum != n {
-				t.Fatalf("incorrect sum: %d/%d (p=%d)", sum, n, p)
-			}
-		}
-	}
-}
-
-// Test parallel parallelfor.
-func TestParForParallel(t *testing.T) {
-	N := uint64(1e7)
-	if testing.Short() {
-		N /= 10
-	}
-	data := make([]uint64, N)
-	for i := uint64(0); i < N; i++ {
-		data[i] = i
-	}
-	P := GOMAXPROCS(-1)
-	c := make(chan bool, P)
-	desc := NewParFor(uint32(P))
-	ParForSetup(desc, uint32(P), uint32(N), false, func(desc *ParFor, i uint32) {
-		data[i] = data[i]*data[i] + 1
-	})
-	for p := 1; p < P; p++ {
-		go func() {
-			ParForDo(desc)
-			c <- true
-		}()
-	}
-	ParForDo(desc)
-	for p := 1; p < P; p++ {
-		<-c
-	}
-	for i := uint64(0); i < N; i++ {
-		if data[i] != i*i+1 {
-			t.Fatalf("Wrong element %d: %d", i, data[i])
-		}
-	}
-
-	data, desc = nil, nil
-	GC()
-}
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index ebf53dd..0fff9d4 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -22,11 +22,8 @@
 	}
 }
 
+//go:noinline
 func allocateTransient2M() {
-	// prevent inlining
-	if memSink == nil {
-		panic("bad")
-	}
 	memSink = make([]byte, 2<<20)
 }
 
@@ -76,18 +73,18 @@
 
 	tests := []string{
 		fmt.Sprintf(`%v: %v \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:43
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:66
+#	0x[0-9,a-f]+	runtime/pprof_test\.allocatePersistent1K\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:40
+#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test\.go:63
 `, 32*memoryProfilerRun, 1024*memoryProfilerRun, 32*memoryProfilerRun, 1024*memoryProfilerRun),
 
 		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
 #	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient1M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:21
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:64
+#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:61
 `, (1<<10)*memoryProfilerRun, (1<<20)*memoryProfilerRun),
 
-		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
-#	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:30
-#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:65
+		fmt.Sprintf(`0: 0 \[%v: %v\] @ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+ 0x[0-9,a-f]+
+#	0x[0-9,a-f]+	runtime/pprof_test\.allocateTransient2M\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:27
+#	0x[0-9,a-f]+	runtime/pprof_test\.TestMemoryProfiler\+0x[0-9,a-f]+	.*/runtime/pprof/mprof_test.go:62
 `, memoryProfilerRun, (2<<20)*memoryProfilerRun),
 	}
 
diff --git a/src/runtime/pprof/pprof.go b/src/runtime/pprof/pprof.go
index 23fc850..f2cd81a 100644
--- a/src/runtime/pprof/pprof.go
+++ b/src/runtime/pprof/pprof.go
@@ -1,11 +1,11 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // Package pprof writes runtime profiling data in the format expected
 // by the pprof visualization tool.
 // For more information about pprof, see
-// http://code.google.com/p/google-perftools/.
+// http://github.com/google/pprof/.
 package pprof
 
 import (
@@ -13,6 +13,7 @@
 	"bytes"
 	"fmt"
 	"io"
+	"os"
 	"runtime"
 	"sort"
 	"strings"
@@ -20,8 +21,8 @@
 	"text/tabwriter"
 )
 
-// BUG(rsc): Profiles are incomplete and inaccurate on NetBSD and OS X.
-// See https://golang.org/issue/6047 for details.
+// BUG(rsc): Profiles are only as good as the kernel support used to generate them.
+// See https://golang.org/issue/13841 for details about known problems.
 
 // A Profile is a collection of stack traces showing the call sequences
 // that led to instances of a particular event, such as allocation.
@@ -31,7 +32,7 @@
 //
 // A Profile's methods can be called from multiple goroutines simultaneously.
 //
-// Each Profile has a unique name.  A few profiles are predefined:
+// Each Profile has a unique name. A few profiles are predefined:
 //
 //	goroutine    - stack traces of all current goroutines
 //	heap         - a sampling of all heap allocations
@@ -48,7 +49,7 @@
 // all known allocations. This exception helps mainly in programs running
 // without garbage collection enabled, usually for debugging purposes.
 //
-// The CPU profile is not available as a Profile.  It has a special API,
+// The CPU profile is not available as a Profile. It has a special API,
 // the StartCPUProfile and StopCPUProfile functions, because it streams
 // output to a writer during profiling.
 //
@@ -173,11 +174,11 @@
 // Add adds the current execution stack to the profile, associated with value.
 // Add stores value in an internal map, so value must be suitable for use as
 // a map key and will not be garbage collected until the corresponding
-// call to Remove.  Add panics if the profile already contains a stack for value.
+// call to Remove. Add panics if the profile already contains a stack for value.
 //
 // The skip parameter has the same meaning as runtime.Caller's skip
-// and controls where the stack trace begins.  Passing skip=0 begins the
-// trace in the function calling Add.  For example, given this
+// and controls where the stack trace begins. Passing skip=0 begins the
+// trace in the function calling Add. For example, given this
 // execution stack:
 //
 //	Add
@@ -266,7 +267,7 @@
 }
 
 // A countProfile is a set of stack traces to be printed as counts
-// grouped by stack trace.  There are multiple implementations:
+// grouped by stack trace. There are multiple implementations:
 // all that matters is that we can find out how many traces there are
 // and obtain each trace in turn.
 type countProfile interface {
@@ -296,22 +297,25 @@
 		}
 		return buf.String()
 	}
-	m := map[string]int{}
+	count := map[string]int{}
+	index := map[string]int{}
+	var keys []string
 	n := p.Len()
 	for i := 0; i < n; i++ {
-		m[key(p.Stack(i))]++
+		k := key(p.Stack(i))
+		if count[k] == 0 {
+			index[k] = i
+			keys = append(keys, k)
+		}
+		count[k]++
 	}
 
-	// Print stacks, listing count on first occurrence of a unique stack.
-	for i := 0; i < n; i++ {
-		stk := p.Stack(i)
-		s := key(stk)
-		if count := m[s]; count != 0 {
-			fmt.Fprintf(w, "%d %s\n", count, s)
-			if debug > 0 {
-				printStackRecord(w, stk, false)
-			}
-			delete(m, s)
+	sort.Sort(&keysByCount{keys, count})
+
+	for _, k := range keys {
+		fmt.Fprintf(w, "%d %s\n", count[k], k)
+		if debug > 0 {
+			printStackRecord(w, p.Stack(index[k]), false)
 		}
 	}
 
@@ -321,37 +325,45 @@
 	return b.Flush()
 }
 
+// keysByCount sorts keys with higher counts first, breaking ties by key string order.
+type keysByCount struct {
+	keys  []string
+	count map[string]int
+}
+
+func (x *keysByCount) Len() int      { return len(x.keys) }
+func (x *keysByCount) Swap(i, j int) { x.keys[i], x.keys[j] = x.keys[j], x.keys[i] }
+func (x *keysByCount) Less(i, j int) bool {
+	ki, kj := x.keys[i], x.keys[j]
+	ci, cj := x.count[ki], x.count[kj]
+	if ci != cj {
+		return ci > cj
+	}
+	return ki < kj
+}
+
 // printStackRecord prints the function + source line information
 // for a single stack trace.
 func printStackRecord(w io.Writer, stk []uintptr, allFrames bool) {
 	show := allFrames
-	wasPanic := false
-	for i, pc := range stk {
-		f := runtime.FuncForPC(pc)
-		if f == nil {
+	frames := runtime.CallersFrames(stk)
+	for {
+		frame, more := frames.Next()
+		name := frame.Function
+		if name == "" {
 			show = true
-			fmt.Fprintf(w, "#\t%#x\n", pc)
-			wasPanic = false
+			fmt.Fprintf(w, "#\t%#x\n", frame.PC)
 		} else {
-			tracepc := pc
-			// Back up to call instruction.
-			if i > 0 && pc > f.Entry() && !wasPanic {
-				if runtime.GOARCH == "386" || runtime.GOARCH == "amd64" {
-					tracepc--
-				} else {
-					tracepc -= 4 // arm, etc
-				}
-			}
-			file, line := f.FileLine(tracepc)
-			name := f.Name()
 			// Hide runtime.goexit and any runtime functions at the beginning.
 			// This is useful mainly for allocation traces.
-			wasPanic = name == "runtime.panic"
 			if name == "runtime.goexit" || !show && strings.HasPrefix(name, "runtime.") {
 				continue
 			}
 			show = true
-			fmt.Fprintf(w, "#\t%#x\t%s+%#x\t%s:%d\n", pc, name, pc-f.Entry(), file, line)
+			fmt.Fprintf(w, "#\t%#x\t%s+%#x\t%s:%d\n", frame.PC, name, frame.PC-frame.Entry, frame.File, frame.Line)
+		}
+		if !more {
+			break
 		}
 	}
 	if !show {
@@ -474,7 +486,6 @@
 	fmt.Fprintf(w, "# NextGC = %d\n", s.NextGC)
 	fmt.Fprintf(w, "# PauseNs = %d\n", s.PauseNs)
 	fmt.Fprintf(w, "# NumGC = %d\n", s.NumGC)
-	fmt.Fprintf(w, "# EnableGC = %v\n", s.EnableGC)
 	fmt.Fprintf(w, "# DebugGC = %v\n", s.DebugGC)
 
 	if tw != nil {
@@ -509,7 +520,7 @@
 
 func writeGoroutineStacks(w io.Writer) error {
 	// We don't know how big the buffer needs to be to collect
-	// all the goroutines.  Start with 1 MB and try a few times, doubling each time.
+	// all the goroutines. Start with 1 MB and try a few times, doubling each time.
 	// Give up and use a truncated trace if 64 MB is not enough.
 	buf := make([]byte, 1<<20)
 	for i := 0; ; i++ {
@@ -567,6 +578,14 @@
 // StartCPUProfile enables CPU profiling for the current process.
 // While profiling, the profile will be buffered and written to w.
 // StartCPUProfile returns an error if profiling is already enabled.
+//
+// On Unix-like systems, StartCPUProfile does not work by default for
+// Go code built with -buildmode=c-archive or -buildmode=c-shared.
+// StartCPUProfile relies on the SIGPROF signal, but that signal will
+// be delivered to the main program's SIGPROF signal handler (if any)
+// not to the one used by Go. To make it work, call os/signal.Notify
+// for syscall.SIGPROF, but note that doing so may break any profiling
+// being done by the main program.
 func StartCPUProfile(w io.Writer) error {
 	// The runtime routines allow a variable profiling rate,
 	// but in practice operating systems cannot trigger signals
@@ -575,7 +594,7 @@
 	// 100 Hz is a reasonable choice: it is frequent enough to
 	// produce useful data, rare enough not to bog down the
 	// system, and a nice round number to make it easy to
-	// convert sample counts to seconds.  Instead of requiring
+	// convert sample counts to seconds. Instead of requiring
 	// each client to specify the frequency, we hard code it.
 	const hz = 100
 
@@ -602,6 +621,42 @@
 		}
 		w.Write(data)
 	}
+
+	// We are emitting the legacy profiling format, which permits
+	// a memory map following the CPU samples. The memory map is
+	// simply a copy of the GNU/Linux /proc/self/maps file. The
+	// profiler uses the memory map to map PC values in shared
+	// libraries to a shared library in the filesystem, in order
+	// to report the correct function and, if the shared library
+	// has debug info, file/line. This is particularly useful for
+	// PIE (position independent executables) as on ELF systems a
+	// PIE is simply an executable shared library.
+	//
+	// Because the profiling format expects the memory map in
+	// GNU/Linux format, we only do this on GNU/Linux for now. To
+	// add support for profiling PIE on other ELF-based systems,
+	// it may be necessary to map the system-specific mapping
+	// information to the GNU/Linux format. For a reasonably
+	// portable C++ version, see the FillProcSelfMaps function in
+	// https://github.com/gperftools/gperftools/blob/master/src/base/sysinfo.cc
+	//
+	// The code that parses this mapping for the pprof tool is
+	// ParseMemoryMap in cmd/internal/pprof/legacy_profile.go, but
+	// don't change that code, as similar code exists in other
+	// (non-Go) pprof readers. Change this code so that that code works.
+	//
+	// We ignore errors reading or copying the memory map; the
+	// profile is likely usable without it, and we have no good way
+	// to report errors.
+	if runtime.GOOS == "linux" {
+		f, err := os.Open("/proc/self/maps")
+		if err == nil {
+			io.WriteString(w, "\nMAPPED_LIBRARIES:\n")
+			io.Copy(w, f)
+			f.Close()
+		}
+	}
+
 	cpu.done <- true
 }
 
diff --git a/src/runtime/pprof/pprof_test.go b/src/runtime/pprof/pprof_test.go
index 785d75a..a6f5eda 100644
--- a/src/runtime/pprof/pprof_test.go
+++ b/src/runtime/pprof/pprof_test.go
@@ -1,4 +1,4 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
+// Copyright 2011 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -23,14 +23,14 @@
 	"unsafe"
 )
 
-func cpuHogger(f func()) {
+func cpuHogger(f func(), dur time.Duration) {
 	// We only need to get one 100 Hz clock tick, so we've got
-	// a 25x safety buffer.
+	// a large safety buffer.
 	// But do at least 500 iterations (which should take about 100ms),
 	// otherwise TestCPUProfileMultithreaded can fail if only one
-	// thread is scheduled during the 250ms period.
+	// thread is scheduled during the testing period.
 	t0 := time.Now()
-	for i := 0; i < 500 || time.Since(t0) < 250*time.Millisecond; i++ {
+	for i := 0; i < 500 || time.Since(t0) < dur; i++ {
 		f()
 	}
 }
@@ -68,35 +68,39 @@
 }
 
 func TestCPUProfile(t *testing.T) {
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func() {
-		cpuHogger(cpuHog1)
+	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1"}, func(dur time.Duration) {
+		cpuHogger(cpuHog1, dur)
 	})
 }
 
 func TestCPUProfileMultithreaded(t *testing.T) {
 	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(2))
-	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func() {
+	testCPUProfile(t, []string{"runtime/pprof_test.cpuHog1", "runtime/pprof_test.cpuHog2"}, func(dur time.Duration) {
 		c := make(chan int)
 		go func() {
-			cpuHogger(cpuHog1)
+			cpuHogger(cpuHog1, dur)
 			c <- 1
 		}()
-		cpuHogger(cpuHog2)
+		cpuHogger(cpuHog2, dur)
 		<-c
 	})
 }
 
-func parseProfile(t *testing.T, bytes []byte, f func(uintptr, []uintptr)) {
+func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []uintptr)) {
 	// Convert []byte to []uintptr.
-	l := len(bytes) / int(unsafe.Sizeof(uintptr(0)))
-	val := *(*[]uintptr)(unsafe.Pointer(&bytes))
+	l := len(valBytes)
+	if i := bytes.Index(valBytes, []byte("\nMAPPED_LIBRARIES:\n")); i >= 0 {
+		l = i
+	}
+	l /= int(unsafe.Sizeof(uintptr(0)))
+	val := *(*[]uintptr)(unsafe.Pointer(&valBytes))
 	val = val[:l]
 
-	// 5 for the header, 2 for the per-sample header on at least one sample, 3 for the trailer.
-	if l < 5+2+3 {
+	// 5 for the header, 3 for the trailer.
+	if l < 5+3 {
 		t.Logf("profile too short: %#x", val)
 		if badOS[runtime.GOOS] {
-			t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
+			t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS)
 			return
 		}
 		t.FailNow()
@@ -120,7 +124,7 @@
 	}
 }
 
-func testCPUProfile(t *testing.T, need []string, f func()) {
+func testCPUProfile(t *testing.T, need []string, f func(dur time.Duration)) {
 	switch runtime.GOOS {
 	case "darwin":
 		switch runtime.GOARCH {
@@ -138,12 +142,55 @@
 		t.Skip("skipping on plan9")
 	}
 
-	var prof bytes.Buffer
-	if err := StartCPUProfile(&prof); err != nil {
-		t.Fatal(err)
+	const maxDuration = 5 * time.Second
+	// If we're running a long test, start with a long duration
+	// because some of the tests (e.g., TestStackBarrierProfiling)
+	// are trying to make sure something *doesn't* happen.
+	duration := 5 * time.Second
+	if testing.Short() {
+		duration = 200 * time.Millisecond
 	}
-	f()
-	StopCPUProfile()
+
+	// Profiling tests are inherently flaky, especially on a
+	// loaded system, such as when this test is running with
+	// several others under go test std. If a test fails in a way
+	// that could mean it just didn't run long enough, try with a
+	// longer duration.
+	for duration <= maxDuration {
+		var prof bytes.Buffer
+		if err := StartCPUProfile(&prof); err != nil {
+			t.Fatal(err)
+		}
+		f(duration)
+		StopCPUProfile()
+
+		if profileOk(t, need, prof, duration) {
+			return
+		}
+
+		duration *= 2
+		if duration <= maxDuration {
+			t.Logf("retrying with %s duration", duration)
+		}
+	}
+
+	if badOS[runtime.GOOS] {
+		t.Skipf("ignoring failure on %s; see golang.org/issue/13841", runtime.GOOS)
+		return
+	}
+	// Ignore the failure if the tests are running in a QEMU-based emulator;
+	// QEMU is not perfect at emulating everything.
+	// The IN_QEMU environment variable is set by some of the Go builders.
+	// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
+	if os.Getenv("IN_QEMU") == "1" {
+		t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
+		return
+	}
+	t.FailNow()
+}
+
+func profileOk(t *testing.T, need []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
+	ok = true
 
 	// Check that profile is well formed and contains need.
 	have := make([]uintptr, len(need))
@@ -160,6 +207,10 @@
 					have[i] += count
 				}
 			}
+			if strings.Contains(f.Name(), "stackBarrier") {
+				// The runtime should have unwound this.
+				t.Fatalf("profile includes stackBarrier")
+			}
 		}
 	})
 	t.Logf("total %d CPU profile samples collected", samples)
@@ -168,11 +219,18 @@
 		// On some windows machines we end up with
 		// not enough samples due to coarse timer
 		// resolution. Let it go.
-		t.Skip("too few samples on Windows (golang.org/issue/10842)")
+		t.Log("too few samples on Windows (golang.org/issue/10842)")
+		return false
+	}
+
+	// Check that we got a reasonable number of samples.
+	if ideal := uintptr(duration * 100 / time.Second); samples == 0 || samples < ideal/4 {
+		t.Logf("too few samples; got %d, want at least %d, ideally %d", samples, ideal/4, ideal)
+		ok = false
 	}
 
 	if len(need) == 0 {
-		return
+		return ok
 	}
 
 	var total uintptr
@@ -180,9 +238,8 @@
 		total += have[i]
 		t.Logf("%s: %d\n", name, have[i])
 	}
-	ok := true
 	if total == 0 {
-		t.Logf("no CPU profile samples collected")
+		t.Logf("no samples in expected functions")
 		ok = false
 	}
 	// We'd like to check a reasonable minimum, like
@@ -196,22 +253,7 @@
 			ok = false
 		}
 	}
-
-	if !ok {
-		if badOS[runtime.GOOS] {
-			t.Skipf("ignoring failure on %s; see golang.org/issue/6047", runtime.GOOS)
-			return
-		}
-		// Ignore the failure if the tests are running in a QEMU-based emulator,
-		// QEMU is not perfect at emulating everything.
-		// IN_QEMU environmental variable is set by some of the Go builders.
-		// IN_QEMU=1 indicates that the tests are running in QEMU. See issue 9605.
-		if os.Getenv("IN_QEMU") == "1" {
-			t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
-			return
-		}
-		t.FailNow()
-	}
+	return ok
 }
 
 // Fork can hang if preempted with signals frequently enough (see issue 5517).
@@ -306,8 +348,8 @@
 
 // Test that profiling of division operations is okay, especially on ARM. See issue 6681.
 func TestMathBigDivide(t *testing.T) {
-	testCPUProfile(t, nil, func() {
-		t := time.After(5 * time.Second)
+	testCPUProfile(t, nil, func(duration time.Duration) {
+		t := time.After(duration)
 		pi := new(big.Int)
 		for {
 			for i := 0; i < 100; i++ {
@@ -324,11 +366,71 @@
 	})
 }
 
-// Operating systems that are expected to fail the tests. See issue 6047.
+func TestStackBarrierProfiling(t *testing.T) {
+	if (runtime.GOOS == "linux" && runtime.GOARCH == "arm") || runtime.GOOS == "openbsd" || runtime.GOOS == "solaris" || runtime.GOOS == "dragonfly" || runtime.GOOS == "freebsd" {
+		// This test currently triggers a large number of
+		// usleep(100)s. These kernels/arches have poor
+		// resolution timers, so this gives up a whole
+		// scheduling quantum. On Linux and the BSDs (and
+		// probably Solaris), profiling signals are only
+		// generated when a process completes a whole
+		// scheduling quantum, so this test often gets zero
+		// profiling signals and fails.
+		t.Skipf("low resolution timers inhibit profiling signals (golang.org/issue/13405)")
+		return
+	}
+
+	if !strings.Contains(os.Getenv("GODEBUG"), "gcstackbarrierall=1") {
+		// Re-execute this test with constant GC and stack
+		// barriers at every frame.
+		testenv.MustHaveExec(t)
+		if runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" {
+			t.Skip("gcstackbarrierall doesn't work on ppc64")
+		}
+		args := []string{"-test.run=TestStackBarrierProfiling"}
+		if testing.Short() {
+			args = append(args, "-test.short")
+		}
+		cmd := exec.Command(os.Args[0], args...)
+		cmd.Env = append([]string{"GODEBUG=gcstackbarrierall=1", "GOGC=1", "GOTRACEBACK=system"}, os.Environ()...)
+		if out, err := cmd.CombinedOutput(); err != nil {
+			t.Fatalf("subprocess failed with %v:\n%s", err, out)
+		}
+		return
+	}
+
+	testCPUProfile(t, nil, func(duration time.Duration) {
+		// In long mode, we're likely to get one or two
+		// samples in stackBarrier.
+		t := time.After(duration)
+		for {
+			deepStack(1000)
+			select {
+			case <-t:
+				return
+			default:
+			}
+		}
+	})
+}
+
+var x []byte
+
+func deepStack(depth int) int {
+	if depth == 0 {
+		return 0
+	}
+	x = make([]byte, 1024)
+	return deepStack(depth-1) + 1
+}
+
+// Operating systems that are expected to fail the tests. See issue 13841.
 var badOS = map[string]bool{
-	"darwin": true,
-	"netbsd": true,
-	"plan9":  true,
+	"darwin":    true,
+	"netbsd":    true,
+	"plan9":     true,
+	"dragonfly": true,
+	"solaris":   true,
 }
 
 func TestBlockProfile(t *testing.T) {
@@ -432,15 +534,20 @@
 }
 
 func blockSelectRecvAsync() {
+	const numTries = 3
 	c := make(chan bool, 1)
 	c2 := make(chan bool, 1)
 	go func() {
-		time.Sleep(blockDelay)
-		c <- true
+		for i := 0; i < numTries; i++ {
+			time.Sleep(blockDelay)
+			c <- true
+		}
 	}()
-	select {
-	case <-c:
-	case <-c2:
+	for i := 0; i < numTries; i++ {
+		select {
+		case <-c:
+		case <-c2:
+		}
 	}
 }
 
@@ -480,3 +587,50 @@
 	c.Wait()
 	mu.Unlock()
 }
+
+func func1(c chan int) { <-c }
+func func2(c chan int) { <-c }
+func func3(c chan int) { <-c }
+func func4(c chan int) { <-c }
+
+func TestGoroutineCounts(t *testing.T) {
+	if runtime.GOOS == "openbsd" {
+		testenv.SkipFlaky(t, 15156)
+	}
+	c := make(chan int)
+	for i := 0; i < 100; i++ {
+		if i%10 == 0 {
+			go func1(c)
+			continue
+		}
+		if i%2 == 0 {
+			go func2(c)
+			continue
+		}
+		go func3(c)
+	}
+	time.Sleep(10 * time.Millisecond) // let goroutines block on channel
+
+	var w bytes.Buffer
+	Lookup("goroutine").WriteTo(&w, 1)
+	prof := w.String()
+
+	if !containsInOrder(prof, "\n50 @ ", "\n40 @", "\n10 @", "\n1 @") {
+		t.Errorf("expected sorted goroutine counts:\n%s", prof)
+	}
+
+	close(c)
+
+	time.Sleep(10 * time.Millisecond) // let goroutines exit
+}
+
+func containsInOrder(s string, all ...string) bool {
+	for _, t := range all {
+		i := strings.Index(s, t)
+		if i < 0 {
+			return false
+		}
+		s = s[i+len(t):]
+	}
+	return true
+}
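
TestGoroutineCounts above expects 50 func3 goroutines (odd i), 40 func2 (even i not divisible by 10), 10 func1 (i%10 == 0), plus the test goroutine itself; containsInOrder then requires each count header to start after the previous match ends. A self-contained usage sketch (the helper is copied verbatim; the profile string is fabricated for illustration):

	package main

	import (
		"fmt"
		"strings"
	)

	func containsInOrder(s string, all ...string) bool {
		for _, t := range all {
			i := strings.Index(s, t)
			if i < 0 {
				return false
			}
			s = s[i+len(t):] // continue searching after this match
		}
		return true
	}

	func main() {
		prof := "\n50 @ 0x1\n\n40 @ 0x2\n\n10 @ 0x3\n\n1 @ 0x4\n"
		fmt.Println(containsInOrder(prof, "\n50 @ ", "\n40 @", "\n10 @", "\n1 @")) // true
		fmt.Println(containsInOrder(prof, "\n40 @", "\n50 @ "))                    // false: out of order
	}
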
diff --git a/src/runtime/print1.go b/src/runtime/print.go
similarity index 87%
rename from src/runtime/print1.go
rename to src/runtime/print.go
index 6eff381..32626c1 100644
--- a/src/runtime/print1.go
+++ b/src/runtime/print.go
@@ -12,8 +12,8 @@
 
 func bytes(s string) (ret []byte) {
 	rp := (*slice)(unsafe.Pointer(&ret))
-	sp := (*_string)(noescape(unsafe.Pointer(&s)))
-	rp.array = unsafe.Pointer(sp.str)
+	sp := stringStructOf(&s)
+	rp.array = sp.str
 	rp.len = sp.len
 	rp.cap = sp.len
 	return
@@ -71,10 +71,6 @@
 	print("\n")
 }
 
-func printpc(p unsafe.Pointer) {
-	print("PC=", hex(uintptr(p)))
-}
-
 func printbool(v bool) {
 	if v {
 		print("true")
@@ -83,10 +79,6 @@
 	}
 }
 
-func printbyte(c byte) {
-	gwrite((*[1]byte)(unsafe.Pointer(&c))[:])
-}
-
 func printfloat(v float64) {
 	switch {
 	case v != v:
@@ -217,15 +209,13 @@
 func printslice(s []byte) {
 	sp := (*slice)(unsafe.Pointer(&s))
 	print("[", len(s), "/", cap(s), "]")
-	printpointer(unsafe.Pointer(sp.array))
+	printpointer(sp.array)
 }
 
-func printeface(e interface{}) {
-	ep := (*eface)(unsafe.Pointer(&e))
-	print("(", ep._type, ",", ep.data, ")")
+func printeface(e eface) {
+	print("(", e._type, ",", e.data, ")")
 }
 
-func printiface(i fInterface) {
-	ip := (*iface)(unsafe.Pointer(&i))
-	print("(", ip.tab, ",", ip.data, ")")
+func printiface(i iface) {
+	print("(", i.tab, ",", i.data, ")")
 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index c5b4a8c..2c0b3df 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4,7 +4,82 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+var buildVersion = sys.TheVersion
+
+// Goroutine scheduler
+// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
+//
+// The main concepts are:
+// G - goroutine.
+// M - worker thread, or machine.
+// P - processor, a resource that is required to execute Go code.
+//     M must have an associated P to execute Go code; however, it can be
+//     blocked or in a syscall without an associated P.
+//
+// Design doc at https://golang.org/s/go11sched.
+
+// Worker thread parking/unparking.
+// We need to balance between keeping enough running worker threads to utilize
+// available hardware parallelism and parking excessive running worker threads
+// to conserve CPU resources and power. This is not simple for two reasons:
+// (1) scheduler state is intentionally distributed (in particular, per-P work
+// queues), so it is not possible to compute global predicates on fast paths;
+// (2) for optimal thread management we would need to know the future (don't park
+// a worker thread when a new goroutine will be readied in the near future).
+//
+// Three rejected approaches that would work badly:
+// 1. Centralize all scheduler state (would inhibit scalability).
+// 2. Direct goroutine handoff. That is, when we ready a new goroutine and there
+//    is a spare P, unpark a thread and hand it the P and the goroutine.
+//    This would lead to thread state thrashing, as the thread that readied the
+//    goroutine can be out of work the very next moment, and we would then need
+//    to park it. Also, it would destroy locality of computation, as we want to
+//    keep dependent goroutines on the same thread, and it would introduce
+//    additional latency.
+// 3. Unpark an additional thread whenever we ready a goroutine and there is an
+//    idle P, but don't do handoff. This would lead to excessive thread parking/
+//    unparking as the additional threads will instantly park without discovering
+//    any work to do.
+//
+// The current approach:
+// We unpark an additional thread when we ready a goroutine if there is an
+// idle P and there are no "spinning" worker threads. A worker thread is considered
+// spinning if it is out of local work and did not find work in global run queue/
+// netpoller; the spinning state is denoted in m.spinning and in sched.nmspinning.
+// Threads unparked this way are also considered spinning; we don't do goroutine
+// handoff so such threads are out of work initially. Spinning threads do some
+// spinning looking for work in per-P run queues before parking. If a spinning
+// thread finds work, it takes itself out of the spinning state and proceeds to
+// execution. If it does not find work, it takes itself out of the spinning state
+// and then parks.
+// If there is at least one spinning thread (sched.nmspinning>0), we don't unpark
+// new threads when readying goroutines. To compensate for that, if the last spinning
+// thread finds work and stops spinning, it must unpark a new spinning thread.
+// This approach smooths out unjustified spikes of thread unparking,
+// but at the same time guarantees eventual maximal CPU parallelism utilization.
+//
+// The main implementation complication is that we need to be very careful during
+// spinning->non-spinning thread transition. This transition can race with submission
+// of a new goroutine, and either one part or another needs to unpark another worker
+// thread. If they both fail to do that, we can end up with semi-persistent CPU
+// underutilization. The general pattern for goroutine readying is: submit a goroutine
+// to local work queue, #StoreLoad-style memory barrier, check sched.nmspinning.
+// The general pattern for spinning->non-spinning transition is: decrement nmspinning,
+// #StoreLoad-style memory barrier, check all per-P work queues for new work.
+// Note that all this complexity does not apply to global run queue as we are not
+// sloppy about thread unparking when submitting to global queue. Also see comments
+// for nmspinning manipulation.
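
To make the unparking rule concrete, here is a toy model of the single 0->1 transition that wakep (added further below) performs on sched.nmspinning; the names and the elided worker are illustrative only, not the runtime's code:

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	var nmspinning int32 // count of spinning worker threads

	// wakeWorker mirrors wakep: readying work unparks at most one
	// worker, because only one caller can win the 0->1 CAS. The
	// unparked worker starts in the spinning state.
	func wakeWorker() bool {
		if !atomic.CompareAndSwapInt32(&nmspinning, 0, 1) {
			return false // a spinning worker already exists; be conservative
		}
		// ... unpark a worker thread here ...
		return true
	}

	func main() {
		fmt.Println(wakeWorker()) // true: first waker unparks one spinning worker
		fmt.Println(wakeWorker()) // false: a spinner is already looking for work
	}
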
+
+var (
+	m0           m
+	g0           g
+	raceprocctx0 uintptr
+)
 
 //go:linkname runtime_init runtime.init
 func runtime_init()
@@ -24,6 +99,9 @@
 // runtimeInitTime is the nanotime() at which the runtime started.
 var runtimeInitTime int64
 
+// Value to use for signal mask for newly created M's.
+var initSigmask sigset
+
 // The main goroutine.
 func main() {
 	g := getg()
@@ -35,7 +113,7 @@
 	// Max stack size is 1 GB on 64-bit, 250 MB on 32-bit.
 	// Using decimal instead of binary GB and MB because
 	// they look nicer in the stack overflow failure message.
-	if ptrSize == 8 {
+	if sys.PtrSize == 8 {
 		maxstacksize = 1000000000
 	} else {
 		maxstacksize = 250000000
@@ -49,7 +127,7 @@
 	})
 
 	// Lock the main goroutine onto this, the main OS thread,
-	// during initialization.  Most programs won't care, but a few
+	// during initialization. Most programs won't care, but a few
 	// do require certain calls to be made by the main thread.
 	// Those can arrange for main.main to run in the main thread
 	// by calling runtime.LockOSThread during initialization
@@ -77,12 +155,6 @@
 		if _cgo_thread_start == nil {
 			throw("_cgo_thread_start missing")
 		}
-		if _cgo_malloc == nil {
-			throw("_cgo_malloc missing")
-		}
-		if _cgo_free == nil {
-			throw("_cgo_free missing")
-		}
 		if GOOS != "windows" {
 			if _cgo_setenv == nil {
 				throw("_cgo_setenv missing")
@@ -148,19 +220,19 @@
 		if forcegc.idle != 0 {
 			throw("forcegc: phase error")
 		}
-		atomicstore(&forcegc.idle, 1)
+		atomic.Store(&forcegc.idle, 1)
 		goparkunlock(&forcegc.lock, "force gc (idle)", traceEvGoBlock, 1)
 		// this goroutine is explicitly resumed by sysmon
 		if debug.gctrace > 0 {
 			println("GC forced")
 		}
-		startGC(gcBackgroundMode, true)
+		gcStart(gcBackgroundMode, true)
 	}
 }
 
 //go:nosplit
 
-// Gosched yields the processor, allowing other goroutines to run.  It does not
+// Gosched yields the processor, allowing other goroutines to run. It does not
 // suspend the current goroutine, so execution resumes automatically.
 func Gosched() {
 	mcall(gosched_m)
@@ -168,6 +240,8 @@
 
 // Puts the current goroutine into a waiting state and calls unlockf.
 // If unlockf returns false, the goroutine is resumed.
+// unlockf must not access this G's stack, as it may be moved between
+// the call to gopark and the call to unlockf.
 func gopark(unlockf func(*g, unsafe.Pointer) bool, lock unsafe.Pointer, reason string, traceEv byte, traceskip int) {
 	mp := acquirem()
 	gp := mp.curg
@@ -193,7 +267,7 @@
 
 func goready(gp *g, traceskip int) {
 	systemstack(func() {
-		ready(gp, traceskip)
+		ready(gp, traceskip, true)
 	})
 }
 
@@ -252,6 +326,9 @@
 	if s.waitlink != nil {
 		throw("runtime: sudog with non-nil waitlink")
 	}
+	if s.c != nil {
+		throw("runtime: sudog with non-nil c")
+	}
 	gp := getg()
 	if gp.param != nil {
 		throw("runtime: releaseSudog with non-nil gp.param")
@@ -286,7 +363,7 @@
 // It assumes that f is a func value. Otherwise the behavior is undefined.
 //go:nosplit
 func funcPC(f interface{}) uintptr {
-	return **(**uintptr)(add(unsafe.Pointer(&f), ptrSize))
+	return **(**uintptr)(add(unsafe.Pointer(&f), sys.PtrSize))
 }
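
funcPC relies on an interface value being two words (type, data): for a func value, the data word points at a funcval whose first word is the entry PC. A user-space sketch of the same double dereference (unsafe and ABI-dependent; an illustration, not a supported API):

	package main

	import (
		"fmt"
		"unsafe"
	)

	func hello() {}

	// pcOf skips the interface's type word, follows the data word to
	// the func value, and reads its first word: the entry PC.
	func pcOf(f interface{}) uintptr {
		p := unsafe.Pointer(uintptr(unsafe.Pointer(&f)) + unsafe.Sizeof(uintptr(0)))
		return **(**uintptr)(p)
	}

	func main() {
		fmt.Printf("entry PC of hello: %#x\n", pcOf(hello))
	}
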
 
 // called from assembly
@@ -299,7 +376,7 @@
 }
 
 func badreflectcall() {
-	panic("runtime: arg size to reflect.call more than 1GB")
+	panic(plainError("arg size to reflect.call more than 1GB"))
 }
 
 func lockedOSThread() bool {
@@ -319,7 +396,3939 @@
 
 	lock(&allglock)
 	allgs = append(allgs, gp)
-	allg = &allgs[0]
 	allglen = uintptr(len(allgs))
+
+	// Grow GC rescan list if necessary.
+	if len(allgs) > cap(work.rescan.list) {
+		lock(&work.rescan.lock)
+		l := work.rescan.list
+		// Let append do the heavy lifting, but keep the
+		// length the same.
+		work.rescan.list = append(l[:cap(l)], 0)[:len(l)]
+		unlock(&work.rescan.lock)
+	}
 	unlock(&allglock)
 }
+
+const (
+	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
+	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
+	_GoidCacheBatch = 16
+)
+
+// The bootstrap sequence is:
+//
+//	call osinit
+//	call schedinit
+//	make & queue new G
+//	call runtime·mstart
+//
+// The new G calls runtime·main.
+func schedinit() {
+	// raceinit must be the first call to race detector.
+	// In particular, it must be done before mallocinit below calls racemapshadow.
+	_g_ := getg()
+	if raceenabled {
+		_g_.racectx, raceprocctx0 = raceinit()
+	}
+
+	sched.maxmcount = 10000
+
+	tracebackinit()
+	moduledataverify()
+	stackinit()
+	mallocinit()
+	mcommoninit(_g_.m)
+	typelinksinit()
+	itabsinit()
+
+	msigsave(_g_.m)
+	initSigmask = _g_.m.sigmask
+
+	goargs()
+	goenvs()
+	parsedebugvars()
+	gcinit()
+
+	sched.lastpoll = uint64(nanotime())
+	procs := int(ncpu)
+	if procs > _MaxGomaxprocs {
+		procs = _MaxGomaxprocs
+	}
+	if n := atoi(gogetenv("GOMAXPROCS")); n > 0 {
+		if n > _MaxGomaxprocs {
+			n = _MaxGomaxprocs
+		}
+		procs = n
+	}
+	if procresize(int32(procs)) != nil {
+		throw("unknown runnable goroutine during bootstrap")
+	}
+
+	if buildVersion == "" {
+		// Condition should never trigger. This code just serves
+		// to ensure runtime·buildVersion is kept in the resulting binary.
+		buildVersion = "unknown"
+	}
+}
+
+func dumpgstatus(gp *g) {
+	_g_ := getg()
+	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
+	print("runtime:  g:  g=", _g_, ", goid=", _g_.goid, ",  g->atomicstatus=", readgstatus(_g_), "\n")
+}
+
+func checkmcount() {
+	// sched lock is held
+	if sched.mcount > sched.maxmcount {
+		print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
+		throw("thread exhaustion")
+	}
+}
+
+func mcommoninit(mp *m) {
+	_g_ := getg()
+
+	// g0 stack won't make sense for user (and is not necessarily unwindable).
+	if _g_ != _g_.m.g0 {
+		callers(1, mp.createstack[:])
+	}
+
+	mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks())
+	if mp.fastrand == 0 {
+		mp.fastrand = 0x49f6428a
+	}
+
+	lock(&sched.lock)
+	mp.id = sched.mcount
+	sched.mcount++
+	checkmcount()
+	mpreinit(mp)
+	if mp.gsignal != nil {
+		mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
+	}
+
+	// Add to allm so garbage collector doesn't free g->m
+	// when it is just in a register or thread-local storage.
+	mp.alllink = allm
+
+	// NumCgoCall() iterates over allm without schedlock,
+	// so we need to publish it safely.
+	atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
+	unlock(&sched.lock)
+
+	// Allocate memory to hold a cgo traceback if the cgo call crashes.
+	if iscgo || GOOS == "solaris" || GOOS == "windows" {
+		mp.cgoCallers = new(cgoCallers)
+	}
+}
+
+// Mark gp ready to run.
+func ready(gp *g, traceskip int, next bool) {
+	if trace.enabled {
+		traceGoUnpark(gp, traceskip)
+	}
+
+	status := readgstatus(gp)
+
+	// Mark runnable.
+	_g_ := getg()
+	_g_.m.locks++ // disable preemption because it can be holding p in a local var
+	if status&^_Gscan != _Gwaiting {
+		dumpgstatus(gp)
+		throw("bad g->status in ready")
+	}
+
+	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
+	casgstatus(gp, _Gwaiting, _Grunnable)
+	runqput(_g_.m.p.ptr(), gp, next)
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 { // TODO: fast atomic
+		wakep()
+	}
+	_g_.m.locks--
+	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+		_g_.stackguard0 = stackPreempt
+	}
+}
+
+func gcprocs() int32 {
+	// Figure out how many CPUs to use during GC.
+	// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
+	lock(&sched.lock)
+	n := gomaxprocs
+	if n > ncpu {
+		n = ncpu
+	}
+	if n > _MaxGcproc {
+		n = _MaxGcproc
+	}
+	if n > sched.nmidle+1 { // one M is currently running
+		n = sched.nmidle + 1
+	}
+	unlock(&sched.lock)
+	return n
+}
+
+func needaddgcproc() bool {
+	lock(&sched.lock)
+	n := gomaxprocs
+	if n > ncpu {
+		n = ncpu
+	}
+	if n > _MaxGcproc {
+		n = _MaxGcproc
+	}
+	n -= sched.nmidle + 1 // one M is currently running
+	unlock(&sched.lock)
+	return n > 0
+}
+
+func helpgc(nproc int32) {
+	_g_ := getg()
+	lock(&sched.lock)
+	pos := 0
+	for n := int32(1); n < nproc; n++ { // one M is currently running
+		if allp[pos].mcache == _g_.m.mcache {
+			pos++
+		}
+		mp := mget()
+		if mp == nil {
+			throw("gcprocs inconsistency")
+		}
+		mp.helpgc = n
+		mp.p.set(allp[pos])
+		mp.mcache = allp[pos].mcache
+		pos++
+		notewakeup(&mp.park)
+	}
+	unlock(&sched.lock)
+}
+
+// freezeStopWait is a large value that freezetheworld sets
+// sched.stopwait to in order to request that all Gs permanently stop.
+const freezeStopWait = 0x7fffffff
+
+// Similar to stopTheWorld but best-effort and can be called several times.
+// There is no reverse operation, used during crashing.
+// This function must not lock any mutexes.
+func freezetheworld() {
+	// stopwait and preemption requests can be lost
+	// due to races with concurrently executing threads,
+	// so try several times
+	for i := 0; i < 5; i++ {
+		// this should tell the scheduler to not start any new goroutines
+		sched.stopwait = freezeStopWait
+		atomic.Store(&sched.gcwaiting, 1)
+		// this should stop running goroutines
+		if !preemptall() {
+			break // no running goroutines
+		}
+		usleep(1000)
+	}
+	// to be sure
+	usleep(1000)
+	preemptall()
+	usleep(1000)
+}
+
+func isscanstatus(status uint32) bool {
+	if status == _Gscan {
+		throw("isscanstatus: Bad status Gscan")
+	}
+	return status&_Gscan == _Gscan
+}
+
+// All reads and writes of g's status go through readgstatus, casgstatus,
+// castogscanstatus, and casfrom_Gscanstatus.
+//go:nosplit
+func readgstatus(gp *g) uint32 {
+	return atomic.Load(&gp.atomicstatus)
+}
+
+// Ownership of gcscanvalid:
+//
+// If gp is running (meaning status == _Grunning or _Grunning|_Gscan),
+// then gp owns gp.gcscanvalid, and other goroutines must not modify it.
+//
+// Otherwise, a second goroutine can lock the scan state by setting _Gscan
+// in the status bit and then modify gcscanvalid, and then unlock the scan state.
+//
+// Note that the first condition implies an exception to the second:
+// if a second goroutine changes gp's status to _Grunning|_Gscan,
+// that second goroutine still does not have the right to modify gcscanvalid.
+
+// The Gscanstatuses are acting like locks and this releases them.
+// If it proves to be a performance hit we should be able to make these
+// simple atomic stores but for now we are going to throw if
+// we see an inconsistent state.
+func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
+	success := false
+
+	// Check that transition is valid.
+	switch oldval {
+	default:
+		print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
+		dumpgstatus(gp)
+		throw("casfrom_Gscanstatus:top gp->status is not in scan state")
+	case _Gscanrunnable,
+		_Gscanwaiting,
+		_Gscanrunning,
+		_Gscansyscall:
+		if newval == oldval&^_Gscan {
+			success = atomic.Cas(&gp.atomicstatus, oldval, newval)
+		}
+	}
+	if !success {
+		print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
+		dumpgstatus(gp)
+		throw("casfrom_Gscanstatus: gp->status is not in scan state")
+	}
+}
+
+// This will return false if the gp is not in the expected status and the cas fails.
+// This acts like a lock acquire while the casfromgstatus acts like a lock release.
+func castogscanstatus(gp *g, oldval, newval uint32) bool {
+	switch oldval {
+	case _Grunnable,
+		_Grunning,
+		_Gwaiting,
+		_Gsyscall:
+		if newval == oldval|_Gscan {
+			return atomic.Cas(&gp.atomicstatus, oldval, newval)
+		}
+	}
+	print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
+	throw("castogscanstatus")
+	panic("not reached")
+}
+
+// If asked to move to or from a Gscanstatus, this will throw. Use castogscanstatus
+// and casfrom_Gscanstatus instead.
+// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
+// put it in the Gscan state is finished.
+//go:nosplit
+func casgstatus(gp *g, oldval, newval uint32) {
+	if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval {
+		systemstack(func() {
+			print("runtime: casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n")
+			throw("casgstatus: bad incoming values")
+		})
+	}
+
+	if oldval == _Grunning && gp.gcscanvalid {
+		// If oldval == _Grunning, then the actual status must be
+		// _Grunning or _Grunning|_Gscan; either way,
+		// we own gp.gcscanvalid, so it's safe to read.
+		// gp.gcscanvalid must not be true when we are running.
+		print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n")
+		throw("casgstatus")
+	}
+
+	// See http://golang.org/cl/21503 for justification of the yield delay.
+	const yieldDelay = 5 * 1000
+	var nextYield int64
+
+	// loop if gp->atomicstatus is in a scan state giving
+	// GC time to finish and change the state to oldval.
+	for i := 0; !atomic.Cas(&gp.atomicstatus, oldval, newval); i++ {
+		if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
+			systemstack(func() {
+				throw("casgstatus: waiting for Gwaiting but is Grunnable")
+			})
+		}
+		// Help GC if needed.
+		// if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
+		// 	gp.preemptscan = false
+		// 	systemstack(func() {
+		// 		gcphasework(gp)
+		// 	})
+		// }
+		// But meanwhile just yield.
+		if i == 0 {
+			nextYield = nanotime() + yieldDelay
+		}
+		if nanotime() < nextYield {
+			for x := 0; x < 10 && gp.atomicstatus != oldval; x++ {
+				procyield(1)
+			}
+		} else {
+			osyield()
+			nextYield = nanotime() + yieldDelay/2
+		}
+	}
+	if newval == _Grunning && gp.gcscanvalid {
+		// Run queueRescan on the system stack so it has more space.
+		systemstack(func() { queueRescan(gp) })
+	}
+}
+
+// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
+// Returns old status. Cannot call casgstatus directly, because we are racing with an
+// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
+// it might have become Grunnable by the time we get to the cas. If we called casgstatus,
+// it would loop waiting for the status to go back to Gwaiting, which it never will.
+//go:nosplit
+func casgcopystack(gp *g) uint32 {
+	for {
+		oldstatus := readgstatus(gp) &^ _Gscan
+		if oldstatus != _Gwaiting && oldstatus != _Grunnable {
+			throw("copystack: bad status, not Gwaiting or Grunnable")
+		}
+		if atomic.Cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
+			return oldstatus
+		}
+	}
+}
+
+// scang blocks until gp's stack has been scanned.
+// It might be scanned by scang or it might be scanned by the goroutine itself.
+// Either way, the stack scan has completed when scang returns.
+func scang(gp *g, gcw *gcWork) {
+	// Invariant: we (the caller, markroot for a specific goroutine) own gp.gcscandone.
+	// Nothing is racing with us now, but gcscandone might be set to true left over
+	// from an earlier round of stack scanning (we scan twice per GC).
+	// We use gcscandone to record whether the scan has been done during this round.
+	// It is important that the scan happens exactly once: if called twice,
+	// the installation of stack barriers will detect the double scan and die.
+
+	gp.gcscandone = false
+
+	// See http://golang.org/cl/21503 for justification of the yield delay.
+	const yieldDelay = 10 * 1000
+	var nextYield int64
+
+	// Endeavor to get gcscandone set to true,
+	// either by doing the stack scan ourselves or by coercing gp to scan itself.
+	// gp.gcscandone can transition from false to true when we're not looking
+	// (if we asked for preemption), so any time we lock the status using
+	// castogscanstatus we have to double-check that the scan is still not done.
+loop:
+	for i := 0; !gp.gcscandone; i++ {
+		switch s := readgstatus(gp); s {
+		default:
+			dumpgstatus(gp)
+			throw("stopg: invalid status")
+
+		case _Gdead:
+			// No stack.
+			gp.gcscandone = true
+			break loop
+
+		case _Gcopystack:
+		// Stack being switched. Go around again.
+
+		case _Grunnable, _Gsyscall, _Gwaiting:
+			// Claim goroutine by setting scan bit.
+			// Racing with execution or readying of gp.
+			// The scan bit keeps them from running
+			// the goroutine until we're done.
+			if castogscanstatus(gp, s, s|_Gscan) {
+				if !gp.gcscandone {
+					scanstack(gp, gcw)
+					gp.gcscandone = true
+				}
+				restartg(gp)
+				break loop
+			}
+
+		case _Gscanwaiting:
+		// newstack is doing a scan for us right now. Wait.
+
+		case _Grunning:
+			// Goroutine running. Try to preempt execution so it can scan itself.
+			// The preemption handler (in newstack) does the actual scan.
+
+			// Optimization: if there is already a pending preemption request
+			// (from the previous loop iteration), don't bother with the atomics.
+			if gp.preemptscan && gp.preempt && gp.stackguard0 == stackPreempt {
+				break
+			}
+
+			// Ask for preemption and self scan.
+			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
+				if !gp.gcscandone {
+					gp.preemptscan = true
+					gp.preempt = true
+					gp.stackguard0 = stackPreempt
+				}
+				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
+			}
+		}
+
+		if i == 0 {
+			nextYield = nanotime() + yieldDelay
+		}
+		if nanotime() < nextYield {
+			procyield(10)
+		} else {
+			osyield()
+			nextYield = nanotime() + yieldDelay/2
+		}
+	}
+
+	gp.preemptscan = false // cancel scan request if no longer needed
+}
+
+// The GC requests that this routine be moved from a scanmumble state to a mumble state.
+func restartg(gp *g) {
+	s := readgstatus(gp)
+	switch s {
+	default:
+		dumpgstatus(gp)
+		throw("restartg: unexpected status")
+
+	case _Gdead:
+	// ok
+
+	case _Gscanrunnable,
+		_Gscanwaiting,
+		_Gscansyscall:
+		casfrom_Gscanstatus(gp, s, s&^_Gscan)
+	}
+}
+
+// stopTheWorld stops all P's from executing goroutines, interrupting
+// all goroutines at GC safe points, and records reason as the reason
+// for the stop. On return, only the current goroutine's P is running.
+// stopTheWorld must not be called from a system stack and the caller
+// must not hold worldsema. The caller must call startTheWorld when
+// other P's should resume execution.
+//
+// stopTheWorld is safe for multiple goroutines to call at the
+// same time. Each will execute its own stop, and the stops will
+// be serialized.
+//
+// This is also used by routines that do stack dumps. If the system is
+// in panic or being exited, this may not reliably stop all
+// goroutines.
+func stopTheWorld(reason string) {
+	semacquire(&worldsema, false)
+	getg().m.preemptoff = reason
+	systemstack(stopTheWorldWithSema)
+}
+
+// startTheWorld undoes the effects of stopTheWorld.
+func startTheWorld() {
+	systemstack(startTheWorldWithSema)
+	// worldsema must be held over startTheWorldWithSema to ensure
+	// gomaxprocs cannot change while worldsema is held.
+	semrelease(&worldsema)
+	getg().m.preemptoff = ""
+}
+
+// Holding worldsema grants an M the right to try to stop the world
+// and prevents gomaxprocs from changing concurrently.
+var worldsema uint32 = 1
+
+// stopTheWorldWithSema is the core implementation of stopTheWorld.
+// The caller is responsible for acquiring worldsema and disabling
+// preemption first, and then should call stopTheWorldWithSema on the system
+// stack:
+//
+//	semacquire(&worldsema, false)
+//	m.preemptoff = "reason"
+//	systemstack(stopTheWorldWithSema)
+//
+// When finished, the caller must either call startTheWorld or undo
+// these three operations separately:
+//
+//	m.preemptoff = ""
+//	systemstack(startTheWorldWithSema)
+//	semrelease(&worldsema)
+//
+// It is allowed to acquire worldsema once and then execute multiple
+// startTheWorldWithSema/stopTheWorldWithSema pairs.
+// Other P's are able to execute between successive calls to
+// startTheWorldWithSema and stopTheWorldWithSema.
+// Holding worldsema causes any other goroutines invoking
+// stopTheWorld to block.
+func stopTheWorldWithSema() {
+	_g_ := getg()
+
+	// If we hold a lock, then we won't be able to stop another M
+	// that is blocked trying to acquire the lock.
+	if _g_.m.locks > 0 {
+		throw("stopTheWorld: holding locks")
+	}
+
+	lock(&sched.lock)
+	sched.stopwait = gomaxprocs
+	atomic.Store(&sched.gcwaiting, 1)
+	preemptall()
+	// stop current P
+	_g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
+	sched.stopwait--
+	// try to retake all P's in Psyscall status
+	for i := 0; i < int(gomaxprocs); i++ {
+		p := allp[i]
+		s := p.status
+		if s == _Psyscall && atomic.Cas(&p.status, s, _Pgcstop) {
+			if trace.enabled {
+				traceGoSysBlock(p)
+				traceProcStop(p)
+			}
+			p.syscalltick++
+			sched.stopwait--
+		}
+	}
+	// stop idle P's
+	for {
+		p := pidleget()
+		if p == nil {
+			break
+		}
+		p.status = _Pgcstop
+		sched.stopwait--
+	}
+	wait := sched.stopwait > 0
+	unlock(&sched.lock)
+
+	// wait for remaining P's to stop voluntarily
+	if wait {
+		for {
+			// wait for 100us, then try to re-preempt in case of any races
+			if notetsleep(&sched.stopnote, 100*1000) {
+				noteclear(&sched.stopnote)
+				break
+			}
+			preemptall()
+		}
+	}
+	if sched.stopwait != 0 {
+		throw("stopTheWorld: not stopped")
+	}
+	for i := 0; i < int(gomaxprocs); i++ {
+		p := allp[i]
+		if p.status != _Pgcstop {
+			throw("stopTheWorld: not stopped")
+		}
+	}
+}
+
+func mhelpgc() {
+	_g_ := getg()
+	_g_.m.helpgc = -1
+}
+
+func startTheWorldWithSema() {
+	_g_ := getg()
+
+	_g_.m.locks++        // disable preemption because it can be holding p in a local var
+	gp := netpoll(false) // non-blocking
+	injectglist(gp)
+	add := needaddgcproc()
+	lock(&sched.lock)
+
+	procs := gomaxprocs
+	if newprocs != 0 {
+		procs = newprocs
+		newprocs = 0
+	}
+	p1 := procresize(procs)
+	sched.gcwaiting = 0
+	if sched.sysmonwait != 0 {
+		sched.sysmonwait = 0
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+
+	for p1 != nil {
+		p := p1
+		p1 = p1.link.ptr()
+		if p.m != 0 {
+			mp := p.m.ptr()
+			p.m = 0
+			if mp.nextp != 0 {
+				throw("startTheWorld: inconsistent mp->nextp")
+			}
+			mp.nextp.set(p)
+			notewakeup(&mp.park)
+		} else {
+			// Start M to run P. Do not start another M below.
+			newm(nil, p)
+			add = false
+		}
+	}
+
+	// Wake up an additional proc in case we have excessive runnable goroutines
+	// in local queues or in the global queue. If we don't, the proc will park itself.
+	// If we have lots of excessive work, resetspinning will unpark additional procs as necessary.
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 {
+		wakep()
+	}
+
+	if add {
+		// If GC could have used another helper proc, start one now,
+		// in the hope that it will be available next time.
+		// It would have been even better to start it before the collection,
+		// but doing so requires allocating memory, so it's tricky to
+		// coordinate. This lazy approach works out in practice:
+		// we don't mind if the first couple of GC rounds don't have quite
+		// the maximum number of procs.
+		newm(mhelpgc, nil)
+	}
+	_g_.m.locks--
+	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+		_g_.stackguard0 = stackPreempt
+	}
+}
+
+// Called to start an M.
+//go:nosplit
+func mstart() {
+	_g_ := getg()
+
+	if _g_.stack.lo == 0 {
+		// Initialize stack bounds from system stack.
+		// Cgo may have left stack size in stack.hi.
+		size := _g_.stack.hi
+		if size == 0 {
+			size = 8192 * sys.StackGuardMultiplier
+		}
+		_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
+		_g_.stack.lo = _g_.stack.hi - size + 1024
+	}
+	// Initialize stack guards so that we can start calling
+	// both Go and C functions with stack growth prologues.
+	_g_.stackguard0 = _g_.stack.lo + _StackGuard
+	_g_.stackguard1 = _g_.stackguard0
+	mstart1()
+}
+
+func mstart1() {
+	_g_ := getg()
+
+	if _g_ != _g_.m.g0 {
+		throw("bad runtime·mstart")
+	}
+
+	// Record top of stack for use by mcall.
+	// Once we call schedule we're never coming back,
+	// so other calls can reuse this stack space.
+	gosave(&_g_.m.g0.sched)
+	_g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used
+	asminit()
+	minit()
+
+	// Install signal handlers; after minit so that minit can
+	// prepare the thread to be able to handle the signals.
+	if _g_.m == &m0 {
+		// Create an extra M for callbacks on threads not created by Go.
+		if iscgo && !cgoHasExtraM {
+			cgoHasExtraM = true
+			newextram()
+		}
+		initsig(false)
+	}
+
+	if fn := _g_.m.mstartfn; fn != nil {
+		fn()
+	}
+
+	if _g_.m.helpgc != 0 {
+		_g_.m.helpgc = 0
+		stopm()
+	} else if _g_.m != &m0 {
+		acquirep(_g_.m.nextp.ptr())
+		_g_.m.nextp = 0
+	}
+	schedule()
+}
+
+// forEachP calls fn(p) for every P p when p reaches a GC safe point.
+// If a P is currently executing code, this will bring the P to a GC
+// safe point and execute fn on that P. If the P is not executing code
+// (it is idle or in a syscall), this will call fn(p) directly while
+// preventing the P from exiting its state. This does not ensure that
+// fn will run on every CPU executing Go code, but it acts as a global
+// memory barrier. GC uses this as a "ragged barrier."
+//
+// The caller must hold worldsema.
+//
+//go:systemstack
+func forEachP(fn func(*p)) {
+	mp := acquirem()
+	_p_ := getg().m.p.ptr()
+
+	lock(&sched.lock)
+	if sched.safePointWait != 0 {
+		throw("forEachP: sched.safePointWait != 0")
+	}
+	sched.safePointWait = gomaxprocs - 1
+	sched.safePointFn = fn
+
+	// Ask all Ps to run the safe point function.
+	for _, p := range allp[:gomaxprocs] {
+		if p != _p_ {
+			atomic.Store(&p.runSafePointFn, 1)
+		}
+	}
+	preemptall()
+
+	// Any P entering _Pidle or _Psyscall from now on will observe
+	// p.runSafePointFn == 1 and will call runSafePointFn when
+	// changing its status to _Pidle/_Psyscall.
+
+	// Run safe point function for all idle Ps. sched.pidle will
+	// not change because we hold sched.lock.
+	for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() {
+		if atomic.Cas(&p.runSafePointFn, 1, 0) {
+			fn(p)
+			sched.safePointWait--
+		}
+	}
+
+	wait := sched.safePointWait > 0
+	unlock(&sched.lock)
+
+	// Run fn for the current P.
+	fn(_p_)
+
+	// Force Ps currently in _Psyscall into _Pidle and hand them
+	// off to induce safe point function execution.
+	for i := 0; i < int(gomaxprocs); i++ {
+		p := allp[i]
+		s := p.status
+		if s == _Psyscall && p.runSafePointFn == 1 && atomic.Cas(&p.status, s, _Pidle) {
+			if trace.enabled {
+				traceGoSysBlock(p)
+				traceProcStop(p)
+			}
+			p.syscalltick++
+			handoffp(p)
+		}
+	}
+
+	// Wait for remaining Ps to run fn.
+	if wait {
+		for {
+			// Wait for 100us, then try to re-preempt in
+			// case of any races.
+			//
+			// Requires system stack.
+			if notetsleep(&sched.safePointNote, 100*1000) {
+				noteclear(&sched.safePointNote)
+				break
+			}
+			preemptall()
+		}
+	}
+	if sched.safePointWait != 0 {
+		throw("forEachP: not done")
+	}
+	for i := 0; i < int(gomaxprocs); i++ {
+		p := allp[i]
+		if p.runSafePointFn != 0 {
+			throw("forEachP: P did not run fn")
+		}
+	}
+
+	lock(&sched.lock)
+	sched.safePointFn = nil
+	unlock(&sched.lock)
+	releasem(mp)
+}
+
+// runSafePointFn runs the safe point function, if any, for this P.
+// This should be called like
+//
+//     if getg().m.p.runSafePointFn != 0 {
+//         runSafePointFn()
+//     }
+//
+// runSafePointFn must be checked on any transition in to _Pidle or
+// _Psyscall to avoid a race where forEachP sees that the P is running
+// just before the P goes into _Pidle/_Psyscall and neither forEachP
+// nor the P run the safe-point function.
+func runSafePointFn() {
+	p := getg().m.p.ptr()
+	// Resolve the race between forEachP running the safe-point
+	// function on this P's behalf and this P running the
+	// safe-point function directly.
+	if !atomic.Cas(&p.runSafePointFn, 1, 0) {
+		return
+	}
+	sched.safePointFn(p)
+	lock(&sched.lock)
+	sched.safePointWait--
+	if sched.safePointWait == 0 {
+		notewakeup(&sched.safePointNote)
+	}
+	unlock(&sched.lock)
+}
+
+// When running with cgo, we call _cgo_thread_start
+// to start threads for us so that we can play nicely with
+// foreign code.
+var cgoThreadStart unsafe.Pointer
+
+type cgothreadstart struct {
+	g   guintptr
+	tls *uint64
+	fn  unsafe.Pointer
+}
+
+// Allocate a new m unassociated with any thread.
+// Can use p for allocation context if needed.
+// fn is recorded as the new m's m.mstartfn.
+//
+// This function is known to the compiler to inhibit the
+// go:nowritebarrierrec annotation because it uses P for allocation.
+func allocm(_p_ *p, fn func()) *m {
+	_g_ := getg()
+	_g_.m.locks++ // disable GC because it can be called from sysmon
+	if _g_.m.p == 0 {
+		acquirep(_p_) // temporarily borrow p for mallocs in this function
+	}
+	mp := new(m)
+	mp.mstartfn = fn
+	mcommoninit(mp)
+
+	// In case of cgo or Solaris, pthread_create will make us a stack.
+	// Windows and Plan 9 will lay out the sched stack on the OS stack.
+	if iscgo || GOOS == "solaris" || GOOS == "windows" || GOOS == "plan9" {
+		mp.g0 = malg(-1)
+	} else {
+		mp.g0 = malg(8192 * sys.StackGuardMultiplier)
+	}
+	mp.g0.m = mp
+
+	if _p_ == _g_.m.p.ptr() {
+		releasep()
+	}
+	_g_.m.locks--
+	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+		_g_.stackguard0 = stackPreempt
+	}
+
+	return mp
+}
+
+// needm is called when a cgo callback happens on a
+// thread without an m (a thread not created by Go).
+// In this case, needm is expected to find an m to use
+// and return with m, g initialized correctly.
+// Since m and g are not set now (likely nil, but see below)
+// needm is limited in what routines it can call. In particular
+// it can only call nosplit functions (textflag 7) and cannot
+// do any scheduling that requires an m.
+//
+// In order to avoid needing heavy lifting here, we adopt
+// the following strategy: there is a stack of available m's
+// that can be stolen. Using compare-and-swap
+// to pop from the stack has ABA races, so we simulate
+// a lock by doing an exchange (via casp) to steal the stack
+// head and replace the top pointer with MLOCKED (1).
+// This serves as a simple spin lock that we can use even
+// without an m. The thread that locks the stack in this way
+// unlocks the stack by storing a valid stack head pointer.
+//
+// In order to make sure that there is always an m structure
+// available to be stolen, we maintain the invariant that there
+// is always one more than needed. At the beginning of the
+// program (if cgo is in use) the list is seeded with a single m.
+// If needm finds that it has taken the last m off the list, its job
+// is - once it has installed its own m so that it can do things like
+// allocate memory - to create a spare m and put it on the list.
+//
+// Each of these extra m's also has a g0 and a curg that are
+// pressed into service as the scheduling stack and current
+// goroutine for the duration of the cgo callback.
+//
+// When the callback is done with the m, it calls dropm to
+// put the m back on the list.
+//go:nosplit
+func needm(x byte) {
+	if iscgo && !cgoHasExtraM {
+		// Can happen if C/C++ code calls Go from a global ctor.
+		// Cannot throw, because the scheduler is not initialized yet.
+		write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback)))
+		exit(1)
+	}
+
+	// Lock extra list, take head, unlock popped list.
+	// nilokay=false is safe here because of the invariant above,
+	// that the extra list always contains or will soon contain
+	// at least one m.
+	mp := lockextra(false)
+
+	// Set needextram when we've just emptied the list,
+	// so that the eventual call into cgocallbackg will
+	// allocate a new m for the extra list. We delay the
+	// allocation until then so that it can be done
+	// after exitsyscall makes sure it is okay to be
+	// running at all (that is, there's no garbage collection
+	// running right now).
+	mp.needextram = mp.schedlink == 0
+	unlockextra(mp.schedlink.ptr())
+
+	// Save and block signals before installing g.
+	// Once g is installed, any incoming signals will try to execute,
+	// but we won't have the sigaltstack settings and other data
+	// set up appropriately until the end of minit, which will
+	// unblock the signals. This is the same dance as when
+	// starting a new m to run Go code via newosproc.
+	msigsave(mp)
+	sigblock()
+
+	// Install g (= m->g0) and set the stack bounds
+	// to match the current stack. We don't actually know
+	// how big the stack is, like we don't know how big any
+	// scheduling stack is, but we assume there's at least 32 kB,
+	// which is more than enough for us.
+	setg(mp.g0)
+	_g_ := getg()
+	_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024
+	_g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
+	_g_.stackguard0 = _g_.stack.lo + _StackGuard
+
+	// Initialize this thread to use the m.
+	asminit()
+	minit()
+}
+
+var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
+
+// newextram allocates an m and puts it on the extra list.
+// It is called with a working local m, so that it can do things
+// like call schedlock and allocate.
+func newextram() {
+	// Create extra goroutine locked to extra m.
+	// The goroutine is the context in which the cgo callback will run.
+	// The sched.pc will never be returned to, but setting it to
+	// goexit makes clear to the traceback routines where
+	// the goroutine stack ends.
+	mp := allocm(nil, nil)
+	gp := malg(4096)
+	gp.sched.pc = funcPC(goexit) + sys.PCQuantum
+	gp.sched.sp = gp.stack.hi
+	gp.sched.sp -= 4 * sys.RegSize // extra space in case of reads slightly beyond frame
+	gp.sched.lr = 0
+	gp.sched.g = guintptr(unsafe.Pointer(gp))
+	gp.syscallpc = gp.sched.pc
+	gp.syscallsp = gp.sched.sp
+	gp.stktopsp = gp.sched.sp
+	gp.gcscanvalid = true // fresh G, so no dequeueRescan necessary
+	gp.gcRescan = -1
+	// malg returns status as Gidle, change to Gsyscall before adding to allg
+	// where GC will see it.
+	casgstatus(gp, _Gidle, _Gsyscall)
+	gp.m = mp
+	mp.curg = gp
+	mp.locked = _LockInternal
+	mp.lockedg = gp
+	gp.lockedm = mp
+	gp.goid = int64(atomic.Xadd64(&sched.goidgen, 1))
+	if raceenabled {
+		gp.racectx = racegostart(funcPC(newextram))
+	}
+	// put on allg for garbage collector
+	allgadd(gp)
+
+	// Add m to the extra list.
+	mnext := lockextra(true)
+	mp.schedlink.set(mnext)
+	unlockextra(mp)
+}
+
+// dropm is called when a cgo callback has called needm but is now
+// done with the callback and returning back into the non-Go thread.
+// It puts the current m back onto the extra list.
+//
+// The main expense here is the call to signalstack to release the
+// m's signal stack, and then the call to needm on the next callback
+// from this thread. It is tempting to try to save the m for next time,
+// which would eliminate both these costs, but there might not be
+// a next time: the current thread (which Go does not control) might exit.
+// If we saved the m for that thread, there would be an m leak each time
+// such a thread exited. Instead, we acquire and release an m on each
+// call. These should typically not be scheduling operations, just a few
+// atomics, so the cost should be small.
+//
+// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
+// variable using pthread_key_create. Unlike the pthread keys we already use
+// on OS X, this dummy key would never be read by Go code. It would exist
+// only so that we could register a thread-exit-time destructor.
+// That destructor would put the m back onto the extra list.
+// This is purely a performance optimization. The current version,
+// in which dropm happens on each cgo call, is still correct too.
+// We may have to keep the current version on systems with cgo
+// but without pthreads, like Windows.
+func dropm() {
+	// Clear m and g, and return m to the extra list.
+	// After the call to setg we can only call nosplit functions
+	// with no pointer manipulation.
+	mp := getg().m
+
+	// Block signals before unminit.
+	// Unminit unregisters the signal handling stack (but needs g on some systems).
+	// Setg(nil) clears g, which is the signal handler's cue not to run Go handlers.
+	// It's important not to try to handle a signal between those two steps.
+	sigmask := mp.sigmask
+	sigblock()
+	unminit()
+
+	mnext := lockextra(true)
+	mp.schedlink.set(mnext)
+
+	setg(nil)
+
+	// Commit the release of mp.
+	unlockextra(mp)
+
+	msigrestore(sigmask)
+}
+
+// A helper function for EnsureDropM.
+func getm() uintptr {
+	return uintptr(unsafe.Pointer(getg().m))
+}
+
+var extram uintptr
+
+// lockextra locks the extra list and returns the list head.
+// The caller must unlock the list by storing a new list head
+// to extram. If nilokay is true, then lockextra will
+// return a nil list head if that's what it finds. If nilokay is false,
+// lockextra will keep waiting until the list head is no longer nil.
+//go:nosplit
+func lockextra(nilokay bool) *m {
+	const locked = 1
+
+	for {
+		old := atomic.Loaduintptr(&extram)
+		if old == locked {
+			yield := osyield
+			yield()
+			continue
+		}
+		if old == 0 && !nilokay {
+			usleep(1)
+			continue
+		}
+		if atomic.Casuintptr(&extram, old, locked) {
+			return (*m)(unsafe.Pointer(old))
+		}
+		yield := osyield
+		yield()
+		continue
+	}
+}
+
+//go:nosplit
+func unlockextra(mp *m) {
+	atomic.Storeuintptr(&extram, uintptr(unsafe.Pointer(mp)))
+}
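
The exchange-lock idiom above, in a self-contained user-space sketch (toy only: user code must not hide Go pointers in a uintptr like this, since the GC cannot see them; inside the runtime the extra M's are kept alive by other means):

	package main

	import (
		"fmt"
		"sync/atomic"
		"unsafe"
	)

	type node struct {
		id   int
		next *node
	}

	const locked = 1 // sentinel head value meaning "list is held"

	var head uintptr // 0 = empty and unlocked, locked = held, else *node

	// lockList mirrors lockextra: swap the head for the sentinel;
	// the winner owns the whole list until it publishes a new head.
	func lockList() *node {
		for {
			old := atomic.LoadUintptr(&head)
			if old == locked {
				continue // spin: another locker holds the list
			}
			if atomic.CompareAndSwapUintptr(&head, old, locked) {
				return (*node)(unsafe.Pointer(old))
			}
		}
	}

	// unlockList mirrors unlockextra: storing a valid head releases the lock.
	func unlockList(n *node) {
		atomic.StoreUintptr(&head, uintptr(unsafe.Pointer(n)))
	}

	func main() {
		unlockList(&node{id: 1}) // seed the list with one node
		n := lockList()          // take the whole list, leaving it locked
		unlockList(n.next)       // push the remainder back, releasing the lock
		fmt.Println("took node", n.id)
	}
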
+
+// Create a new m. It will start off with a call to fn, or else the scheduler.
+// fn needs to be static and not a heap allocated closure.
+// May run with m.p==nil, so write barriers are not allowed.
+//go:nowritebarrier
+func newm(fn func(), _p_ *p) {
+	mp := allocm(_p_, fn)
+	mp.nextp.set(_p_)
+	mp.sigmask = initSigmask
+	if iscgo {
+		var ts cgothreadstart
+		if _cgo_thread_start == nil {
+			throw("_cgo_thread_start missing")
+		}
+		ts.g.set(mp.g0)
+		ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0]))
+		ts.fn = unsafe.Pointer(funcPC(mstart))
+		if msanenabled {
+			msanwrite(unsafe.Pointer(&ts), unsafe.Sizeof(ts))
+		}
+		asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
+		return
+	}
+	newosproc(mp, unsafe.Pointer(mp.g0.stack.hi))
+}
+
+// Stops execution of the current m until new work is available.
+// Returns with acquired P.
+func stopm() {
+	_g_ := getg()
+
+	if _g_.m.locks != 0 {
+		throw("stopm holding locks")
+	}
+	if _g_.m.p != 0 {
+		throw("stopm holding p")
+	}
+	if _g_.m.spinning {
+		throw("stopm spinning")
+	}
+
+retry:
+	lock(&sched.lock)
+	mput(_g_.m)
+	unlock(&sched.lock)
+	notesleep(&_g_.m.park)
+	noteclear(&_g_.m.park)
+	if _g_.m.helpgc != 0 {
+		gchelper()
+		_g_.m.helpgc = 0
+		_g_.m.mcache = nil
+		_g_.m.p = 0
+		goto retry
+	}
+	acquirep(_g_.m.nextp.ptr())
+	_g_.m.nextp = 0
+}
+
+func mspinning() {
+	// startm's caller incremented nmspinning. Set the new M's spinning.
+	getg().m.spinning = true
+}
+
+// Schedules some M to run the p (creates an M if necessary).
+// If p==nil, tries to get an idle P; if there are no idle P's, does nothing.
+// May run with m.p==nil, so write barriers are not allowed.
+// If spinning is set, the caller has incremented nmspinning and startm will
+// either decrement nmspinning or set m.spinning in the newly started M.
+//go:nowritebarrier
+func startm(_p_ *p, spinning bool) {
+	lock(&sched.lock)
+	if _p_ == nil {
+		_p_ = pidleget()
+		if _p_ == nil {
+			unlock(&sched.lock)
+			if spinning {
+				// The caller incremented nmspinning, but there are no idle Ps,
+				// so it's okay to just undo the increment and give up.
+				if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+					throw("startm: negative nmspinning")
+				}
+			}
+			return
+		}
+	}
+	mp := mget()
+	unlock(&sched.lock)
+	if mp == nil {
+		var fn func()
+		if spinning {
+			// The caller incremented nmspinning, so set m.spinning in the new M.
+			fn = mspinning
+		}
+		newm(fn, _p_)
+		return
+	}
+	if mp.spinning {
+		throw("startm: m is spinning")
+	}
+	if mp.nextp != 0 {
+		throw("startm: m has p")
+	}
+	if spinning && !runqempty(_p_) {
+		throw("startm: p has runnable gs")
+	}
+	// The caller incremented nmspinning, so set m.spinning in the new M.
+	mp.spinning = spinning
+	mp.nextp.set(_p_)
+	notewakeup(&mp.park)
+}
+
+// Hands off P from syscall or locked M.
+// Always runs without a P, so write barriers are not allowed.
+//go:nowritebarrier
+func handoffp(_p_ *p) {
+	// handoffp must start an M in any situation where
+	// findrunnable would return a G to run on _p_.
+
+	// if it has local work, start it straight away
+	if !runqempty(_p_) || sched.runqsize != 0 {
+		startm(_p_, false)
+		return
+	}
+	// if it has GC work, start it straight away
+	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(_p_) {
+		startm(_p_, false)
+		return
+	}
+	// no local work, check that there are no spinning/idle M's,
+	// otherwise our help is not required
+	if atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) == 0 && atomic.Cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
+		startm(_p_, true)
+		return
+	}
+	lock(&sched.lock)
+	if sched.gcwaiting != 0 {
+		_p_.status = _Pgcstop
+		sched.stopwait--
+		if sched.stopwait == 0 {
+			notewakeup(&sched.stopnote)
+		}
+		unlock(&sched.lock)
+		return
+	}
+	if _p_.runSafePointFn != 0 && atomic.Cas(&_p_.runSafePointFn, 1, 0) {
+		sched.safePointFn(_p_)
+		sched.safePointWait--
+		if sched.safePointWait == 0 {
+			notewakeup(&sched.safePointNote)
+		}
+	}
+	if sched.runqsize != 0 {
+		unlock(&sched.lock)
+		startm(_p_, false)
+		return
+	}
+	// If this is the last running P and nobody is polling the network,
+	// we need to wake up another M to poll the network.
+	if sched.npidle == uint32(gomaxprocs-1) && atomic.Load64(&sched.lastpoll) != 0 {
+		unlock(&sched.lock)
+		startm(_p_, false)
+		return
+	}
+	pidleput(_p_)
+	unlock(&sched.lock)
+}
+
+// Tries to add one more P to execute G's.
+// Called when a G is made runnable (newproc, ready).
+func wakep() {
+	// be conservative about spinning threads
+	if !atomic.Cas(&sched.nmspinning, 0, 1) {
+		return
+	}
+	startm(nil, true)
+}
+
+// Stops execution of the current m that is locked to a g until the g is runnable again.
+// Returns with acquired P.
+func stoplockedm() {
+	_g_ := getg()
+
+	if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m {
+		throw("stoplockedm: inconsistent locking")
+	}
+	if _g_.m.p != 0 {
+		// Schedule another M to run this p.
+		_p_ := releasep()
+		handoffp(_p_)
+	}
+	incidlelocked(1)
+	// Wait until another thread schedules lockedg again.
+	notesleep(&_g_.m.park)
+	noteclear(&_g_.m.park)
+	status := readgstatus(_g_.m.lockedg)
+	if status&^_Gscan != _Grunnable {
+		print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
+		dumpgstatus(_g_)
+		throw("stoplockedm: not runnable")
+	}
+	acquirep(_g_.m.nextp.ptr())
+	_g_.m.nextp = 0
+}
+
+// Schedules the locked m to run the locked gp.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func startlockedm(gp *g) {
+	_g_ := getg()
+
+	mp := gp.lockedm
+	if mp == _g_.m {
+		throw("startlockedm: locked to me")
+	}
+	if mp.nextp != 0 {
+		throw("startlockedm: m has p")
+	}
+	// directly handoff current P to the locked m
+	incidlelocked(-1)
+	_p_ := releasep()
+	mp.nextp.set(_p_)
+	notewakeup(&mp.park)
+	stopm()
+}
+
+// Stops the current m for stopTheWorld.
+// Returns when the world is restarted.
+func gcstopm() {
+	_g_ := getg()
+
+	if sched.gcwaiting == 0 {
+		throw("gcstopm: not waiting for gc")
+	}
+	if _g_.m.spinning {
+		_g_.m.spinning = false
+		// OK to just drop nmspinning here,
+		// startTheWorld will unpark threads as necessary.
+		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+			throw("gcstopm: negative nmspinning")
+		}
+	}
+	_p_ := releasep()
+	lock(&sched.lock)
+	_p_.status = _Pgcstop
+	sched.stopwait--
+	if sched.stopwait == 0 {
+		notewakeup(&sched.stopnote)
+	}
+	unlock(&sched.lock)
+	stopm()
+}
+
+// Schedules gp to run on the current M.
+// If inheritTime is true, gp inherits the remaining time in the
+// current time slice. Otherwise, it starts a new time slice.
+// Never returns.
+func execute(gp *g, inheritTime bool) {
+	_g_ := getg()
+
+	casgstatus(gp, _Grunnable, _Grunning)
+	gp.waitsince = 0
+	gp.preempt = false
+	gp.stackguard0 = gp.stack.lo + _StackGuard
+	if !inheritTime {
+		_g_.m.p.ptr().schedtick++
+	}
+	_g_.m.curg = gp
+	gp.m = _g_.m
+
+	// Check whether the profiler needs to be turned on or off.
+	hz := sched.profilehz
+	if _g_.m.profilehz != hz {
+		resetcpuprofiler(hz)
+	}
+
+	if trace.enabled {
+		// GoSysExit has to happen when we have a P, but before GoStart.
+		// So we emit it here.
+		if gp.syscallsp != 0 && gp.sysblocktraced {
+			traceGoSysExit(gp.sysexitticks)
+		}
+		traceGoStart()
+	}
+
+	gogo(&gp.sched)
+}
+
+// Finds a runnable goroutine to execute.
+// Tries to steal from other P's, get g from global queue, poll network.
+func findrunnable() (gp *g, inheritTime bool) {
+	_g_ := getg()
+
+	// The conditions here and in handoffp must agree: if
+	// findrunnable would return a G to run, handoffp must start
+	// an M.
+
+top:
+	_p_ := _g_.m.p.ptr()
+	if sched.gcwaiting != 0 {
+		gcstopm()
+		goto top
+	}
+	if _p_.runSafePointFn != 0 {
+		runSafePointFn()
+	}
+	if fingwait && fingwake {
+		if gp := wakefing(); gp != nil {
+			ready(gp, 0, true)
+		}
+	}
+
+	// local runq
+	if gp, inheritTime := runqget(_p_); gp != nil {
+		return gp, inheritTime
+	}
+
+	// global runq
+	if sched.runqsize != 0 {
+		lock(&sched.lock)
+		gp := globrunqget(_p_, 0)
+		unlock(&sched.lock)
+		if gp != nil {
+			return gp, false
+		}
+	}
+
+	// Poll network.
+	// This netpoll is only an optimization before we resort to stealing.
+	// We can safely skip it if there is a thread blocked in netpoll already.
+	// If there is any kind of logical race with that blocked thread
+	// (e.g. it has already returned from netpoll, but does not set lastpoll yet),
+	// this thread will do blocking netpoll below anyway.
+	if netpollinited() && sched.lastpoll != 0 {
+		if gp := netpoll(false); gp != nil { // non-blocking
+			// netpoll returns list of goroutines linked by schedlink.
+			injectglist(gp.schedlink.ptr())
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			if trace.enabled {
+				traceGoUnpark(gp, 0)
+			}
+			return gp, false
+		}
+	}
+
+	// Steal work from other P's.
+	procs := uint32(gomaxprocs)
+	if atomic.Load(&sched.npidle) == procs-1 {
+		// Either GOMAXPROCS=1 or everybody, except for us, is idle already.
+		// New work can appear from a returning syscall/cgocall, the network, or timers.
+		// None of those submit to local run queues, so there is no point in stealing.
+		goto stop
+	}
+	// If number of spinning M's >= number of busy P's, block.
+	// This is necessary to prevent excessive CPU consumption
+	// when GOMAXPROCS>>1 but the program parallelism is low.
+	if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) { // TODO: fast atomic
+		goto stop
+	}
+	if !_g_.m.spinning {
+		_g_.m.spinning = true
+		atomic.Xadd(&sched.nmspinning, 1)
+	}
+	for i := 0; i < 4; i++ {
+		for enum := stealOrder.start(fastrand1()); !enum.done(); enum.next() {
+			if sched.gcwaiting != 0 {
+				goto top
+			}
+			stealRunNextG := i > 2 // first look for ready queues with more than 1 g
+			if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil {
+				return gp, false
+			}
+		}
+	}
+
+stop:
+
+	// We have nothing to do. If we're in the GC mark phase, can
+	// safely scan and blacken objects, and have work to do, run
+	// idle-time marking rather than give up the P.
+	if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
+		_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+		gp := _p_.gcBgMarkWorker.ptr()
+		casgstatus(gp, _Gwaiting, _Grunnable)
+		if trace.enabled {
+			traceGoUnpark(gp, 0)
+		}
+		return gp, false
+	}
+
+	// return P and block
+	lock(&sched.lock)
+	if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 {
+		unlock(&sched.lock)
+		goto top
+	}
+	if sched.runqsize != 0 {
+		gp := globrunqget(_p_, 0)
+		unlock(&sched.lock)
+		return gp, false
+	}
+	if releasep() != _p_ {
+		throw("findrunnable: wrong p")
+	}
+	pidleput(_p_)
+	unlock(&sched.lock)
+
+	// Delicate dance: thread transitions from spinning to non-spinning state,
+	// potentially concurrently with submission of new goroutines. We must
+	// drop nmspinning first and then check all per-P queues again (with
+	// #StoreLoad memory barrier in between). If we do it the other way around,
+	// another thread can submit a goroutine after we've checked all run queues
+	// but before we drop nmspinning; as a result, nobody will unpark a thread
+	// to run the goroutine.
+	// If we discover new work below, we need to restore m.spinning as a signal
+	// for resetspinning to unpark a new worker thread (because there can be more
+	// than one starving goroutine). However, if after discovering new work
+	// we also observe no idle Ps, it is OK to just park the current thread:
+	// the system is fully loaded so no spinning threads are required.
+	// Also see "Worker thread parking/unparking" comment at the top of the file.
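+	//
+	// A hedged sketch of the interleaving being prevented:
+	//
+	//	parking thread               submitting thread
+	//	check run queues (empty)
+	//	                             enqueue G
+	//	                             nmspinning != 0, so skip wakep
+	//	decrement nmspinning
+	//	park                         (G is stranded)
+	//
+	// Dropping nmspinning first and re-checking all queues closes this window.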
+	wasSpinning := _g_.m.spinning
+	if _g_.m.spinning {
+		_g_.m.spinning = false
+		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
+			throw("findrunnable: negative nmspinning")
+		}
+	}
+
+	// check all runqueues once again
+	for i := 0; i < int(gomaxprocs); i++ {
+		_p_ := allp[i]
+		if _p_ != nil && !runqempty(_p_) {
+			lock(&sched.lock)
+			_p_ = pidleget()
+			unlock(&sched.lock)
+			if _p_ != nil {
+				acquirep(_p_)
+				if wasSpinning {
+					_g_.m.spinning = true
+					atomic.Xadd(&sched.nmspinning, 1)
+				}
+				goto top
+			}
+			break
+		}
+	}
+
+	// poll network
+	if netpollinited() && atomic.Xchg64(&sched.lastpoll, 0) != 0 {
+		if _g_.m.p != 0 {
+			throw("findrunnable: netpoll with p")
+		}
+		if _g_.m.spinning {
+			throw("findrunnable: netpoll with spinning")
+		}
+		gp := netpoll(true) // block until new work is available
+		atomic.Store64(&sched.lastpoll, uint64(nanotime()))
+		if gp != nil {
+			lock(&sched.lock)
+			_p_ = pidleget()
+			unlock(&sched.lock)
+			if _p_ != nil {
+				acquirep(_p_)
+				injectglist(gp.schedlink.ptr())
+				casgstatus(gp, _Gwaiting, _Grunnable)
+				if trace.enabled {
+					traceGoUnpark(gp, 0)
+				}
+				return gp, false
+			}
+			injectglist(gp)
+		}
+	}
+	stopm()
+	goto top
+}
+
+func resetspinning() {
+	_g_ := getg()
+	if !_g_.m.spinning {
+		throw("resetspinning: not a spinning m")
+	}
+	_g_.m.spinning = false
+	nmspinning := atomic.Xadd(&sched.nmspinning, -1)
+	if int32(nmspinning) < 0 {
+		throw("findrunnable: negative nmspinning")
+	}
+	// M wakeup policy is deliberately somewhat conservative, so check if we
+	// need to wake up another P here. See "Worker thread parking/unparking"
+	// comment at the top of the file for details.
+	if nmspinning == 0 && atomic.Load(&sched.npidle) > 0 {
+		wakep()
+	}
+}
+
+// Injects the list of runnable G's into the scheduler.
+// Can run concurrently with GC.
+func injectglist(glist *g) {
+	if glist == nil {
+		return
+	}
+	if trace.enabled {
+		for gp := glist; gp != nil; gp = gp.schedlink.ptr() {
+			traceGoUnpark(gp, 0)
+		}
+	}
+	lock(&sched.lock)
+	var n int
+	for n = 0; glist != nil; n++ {
+		gp := glist
+		glist = gp.schedlink.ptr()
+		casgstatus(gp, _Gwaiting, _Grunnable)
+		globrunqput(gp)
+	}
+	unlock(&sched.lock)
+	for ; n != 0 && sched.npidle != 0; n-- {
+		startm(nil, false)
+	}
+}
+
+// One round of scheduler: find a runnable goroutine and execute it.
+// Never returns.
+func schedule() {
+	_g_ := getg()
+
+	if _g_.m.locks != 0 {
+		throw("schedule: holding locks")
+	}
+
+	if _g_.m.lockedg != nil {
+		stoplockedm()
+		execute(_g_.m.lockedg, false) // Never returns.
+	}
+
+top:
+	if sched.gcwaiting != 0 {
+		gcstopm()
+		goto top
+	}
+	if _g_.m.p.ptr().runSafePointFn != 0 {
+		runSafePointFn()
+	}
+
+	var gp *g
+	var inheritTime bool
+	if trace.enabled || trace.shutdown {
+		gp = traceReader()
+		if gp != nil {
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			traceGoUnpark(gp, 0)
+		}
+	}
+	if gp == nil && gcBlackenEnabled != 0 {
+		gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
+	}
+	if gp == nil {
+		// Check the global runnable queue once in a while to ensure fairness.
+		// Otherwise two goroutines can completely occupy the local runqueue
+		// by constantly respawning each other.
+		if _g_.m.p.ptr().schedtick%61 == 0 && sched.runqsize > 0 {
+			lock(&sched.lock)
+			gp = globrunqget(_g_.m.p.ptr(), 1)
+			unlock(&sched.lock)
+		}
+	}
+	if gp == nil {
+		gp, inheritTime = runqget(_g_.m.p.ptr())
+		if gp != nil && _g_.m.spinning {
+			throw("schedule: spinning with local work")
+		}
+	}
+	if gp == nil {
+		gp, inheritTime = findrunnable() // blocks until work is available
+	}
+
+	// This thread is going to run a goroutine and is not spinning anymore,
+	// so if it was marked as spinning we need to reset it now and potentially
+	// start a new spinning M.
+	if _g_.m.spinning {
+		resetspinning()
+	}
+
+	if gp.lockedm != nil {
+		// Hands off own p to the locked m,
+		// then blocks waiting for a new p.
+		startlockedm(gp)
+		goto top
+	}
+
+	execute(gp, inheritTime)
+}
+
+// dropg removes the association between m and the current goroutine m->curg (gp for short).
+// Typically a caller sets gp's status away from Grunning and then
+// immediately calls dropg to finish the job. The caller is also responsible
+// for arranging that gp will be restarted using ready at an
+// appropriate time. After calling dropg and arranging for gp to be
+// readied later, the caller can do other work but eventually should
+// call schedule to restart the scheduling of goroutines on this m.
+func dropg() {
+	_g_ := getg()
+
+	_g_.m.curg.m = nil
+	_g_.m.curg = nil
+}
+
+func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
+	unlock((*mutex)(lock))
+	return true
+}
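+
+// Schematically, a blocking primitive reaches parkunlock_c via goparkunlock
+// (a hedged sketch of the calling convention, not literal runtime code):
+//
+//	lock(&l)
+//	... decide that the g must block ...
+//	goparkunlock(&l, "example wait", traceEvGoBlock, 1)
+//
+// park_m below then runs on g0, releases l through parkunlock_c, and calls
+// schedule.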
+
+// park continuation on g0.
+func park_m(gp *g) {
+	_g_ := getg()
+
+	if trace.enabled {
+		traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
+	}
+
+	casgstatus(gp, _Grunning, _Gwaiting)
+	dropg()
+
+	if _g_.m.waitunlockf != nil {
+		fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf))
+		ok := fn(gp, _g_.m.waitlock)
+		_g_.m.waitunlockf = nil
+		_g_.m.waitlock = nil
+		if !ok {
+			if trace.enabled {
+				traceGoUnpark(gp, 2)
+			}
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			execute(gp, true) // Schedule it back, never returns.
+		}
+	}
+	schedule()
+}
+
+func goschedImpl(gp *g) {
+	status := readgstatus(gp)
+	if status&^_Gscan != _Grunning {
+		dumpgstatus(gp)
+		throw("bad g status")
+	}
+	casgstatus(gp, _Grunning, _Grunnable)
+	dropg()
+	lock(&sched.lock)
+	globrunqput(gp)
+	unlock(&sched.lock)
+
+	schedule()
+}
+
+// Gosched continuation on g0.
+func gosched_m(gp *g) {
+	if trace.enabled {
+		traceGoSched()
+	}
+	goschedImpl(gp)
+}
+
+func gopreempt_m(gp *g) {
+	if trace.enabled {
+		traceGoPreempt()
+	}
+	goschedImpl(gp)
+}
+
+// Finishes execution of the current goroutine.
+func goexit1() {
+	if raceenabled {
+		racegoend()
+	}
+	if trace.enabled {
+		traceGoEnd()
+	}
+	mcall(goexit0)
+}
+
+// goexit continuation on g0.
+func goexit0(gp *g) {
+	_g_ := getg()
+
+	casgstatus(gp, _Grunning, _Gdead)
+	if isSystemGoroutine(gp) {
+		atomic.Xadd(&sched.ngsys, -1)
+	}
+	gp.m = nil
+	gp.lockedm = nil
+	_g_.m.lockedg = nil
+	gp.paniconfault = false
+	gp._defer = nil // should be nil already, but just in case.
+	gp._panic = nil // non-nil for Goexit during panic; points at stack-allocated data.
+	gp.writebuf = nil
+	gp.waitreason = ""
+	gp.param = nil
+
+	// Note that gp's stack scan is now "valid" because it has no
+	// stack. We could dequeueRescan, but that takes a lock and
+	// isn't really necessary.
+	gp.gcscanvalid = true
+	dropg()
+
+	if _g_.m.locked&^_LockExternal != 0 {
+		print("invalid m->locked = ", _g_.m.locked, "\n")
+		throw("internal lockOSThread error")
+	}
+	_g_.m.locked = 0
+	gfput(_g_.m.p.ptr(), gp)
+	schedule()
+}
+
+//go:nosplit
+//go:nowritebarrier
+func save(pc, sp uintptr) {
+	_g_ := getg()
+
+	_g_.sched.pc = pc
+	_g_.sched.sp = sp
+	_g_.sched.lr = 0
+	_g_.sched.ret = 0
+	_g_.sched.ctxt = nil
+	_g_.sched.g = guintptr(unsafe.Pointer(_g_))
+}
+
+// The goroutine g is about to enter a system call.
+// Record that it's not using the cpu anymore.
+// This is called only from the go syscall library and cgocall,
+// not from the low-level system calls used by the runtime.
+//
+// Entersyscall cannot split the stack: the gosave must
+// make g->sched refer to the caller's stack segment, because
+// entersyscall is going to return immediately after.
+//
+// Nothing entersyscall calls can split the stack either.
+// We cannot safely move the stack during an active call to syscall,
+// because we do not know which of the uintptr arguments are
+// really pointers (back into the stack).
+// In practice, this means that we make the fast path run through
+// entersyscall doing no-split things, and the slow path has to use systemstack
+// to run bigger things on the system stack.
+//
+// reentersyscall is the entry point used by cgo callbacks, where explicitly
+// saved SP and PC are restored. This is needed when exitsyscall will be called
+// from a function further up in the call stack than the parent, as g->syscallsp
+// must always point to a valid stack frame. entersyscall below is the normal
+// entry point for syscalls, which obtains the SP and PC from the caller.
+//
+// Syscall tracing:
+// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
+// If the syscall does not block, that is it; we do not emit any other events.
+// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock;
+// when syscall returns we emit traceGoSysExit and when the goroutine starts running
+// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
+// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
+// we remember the current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick);
+// whoever emits traceGoSysBlock increments p.syscalltick afterwards,
+// and we wait for that increment before emitting traceGoSysExit.
+// Note that the increment is done even if tracing is not enabled,
+// because tracing can be enabled in the middle of a syscall. We don't want the wait to hang.
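+//
+// Schematically, for a syscall that blocks, the event order is
+// (an illustrative summary of the rules above, not additional API):
+//
+//	entersyscall:     traceGoSysCall
+//	retake steals P:  traceGoSysBlock, then p.syscalltick++
+//	exitsyscall:      wait for the syscalltick increment, then traceGoSysExit
+//	execute:          traceGoStart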
+//
+//go:nosplit
+func reentersyscall(pc, sp uintptr) {
+	_g_ := getg()
+
+	// Disable preemption because during this function g is in Gsyscall status
+	// but can have an inconsistent g->sched; do not let the GC observe it.
+	_g_.m.locks++
+
+	// Entersyscall must not call any function that might split/grow the stack.
+	// (See details in comment above.)
+	// Catch calls that might, by replacing the stack guard with something that
+	// will trip any stack check and leaving a flag to tell newstack to die.
+	_g_.stackguard0 = stackPreempt
+	_g_.throwsplit = true
+
+	// Leave SP around for GC and traceback.
+	save(pc, sp)
+	_g_.syscallsp = sp
+	_g_.syscallpc = pc
+	casgstatus(_g_, _Grunning, _Gsyscall)
+	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+		systemstack(func() {
+			print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			throw("entersyscall")
+		})
+	}
+
+	if trace.enabled {
+		systemstack(traceGoSysCall)
+		// systemstack itself clobbers g.sched.{pc,sp} and we might
+		// need them later when the G is genuinely blocked in a
+		// syscall
+		save(pc, sp)
+	}
+
+	if atomic.Load(&sched.sysmonwait) != 0 { // TODO: fast atomic
+		systemstack(entersyscall_sysmon)
+		save(pc, sp)
+	}
+
+	if _g_.m.p.ptr().runSafePointFn != 0 {
+		// runSafePointFn may stack split if run on this stack
+		systemstack(runSafePointFn)
+		save(pc, sp)
+	}
+
+	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
+	_g_.sysblocktraced = true
+	_g_.m.mcache = nil
+	_g_.m.p.ptr().m = 0
+	atomic.Store(&_g_.m.p.ptr().status, _Psyscall)
+	if sched.gcwaiting != 0 {
+		systemstack(entersyscall_gcwait)
+		save(pc, sp)
+	}
+
+	// Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched).
+	// We set stackguard0 to stackPreempt so that the first split-stack check calls morestack.
+	// Morestack detects this case and throws.
+	_g_.stackguard0 = stackPreempt
+	_g_.m.locks--
+}
+
+// Standard syscall entry used by the go syscall library and normal cgo calls.
+//go:nosplit
+func entersyscall(dummy int32) {
+	reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
+}
+
+func entersyscall_sysmon() {
+	lock(&sched.lock)
+	if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+}
+
+func entersyscall_gcwait() {
+	_g_ := getg()
+	_p_ := _g_.m.p.ptr()
+
+	lock(&sched.lock)
+	if sched.stopwait > 0 && atomic.Cas(&_p_.status, _Psyscall, _Pgcstop) {
+		if trace.enabled {
+			traceGoSysBlock(_p_)
+			traceProcStop(_p_)
+		}
+		_p_.syscalltick++
+		if sched.stopwait--; sched.stopwait == 0 {
+			notewakeup(&sched.stopnote)
+		}
+	}
+	unlock(&sched.lock)
+}
+
+// The same as entersyscall(), but with a hint that the syscall is blocking.
+//go:nosplit
+func entersyscallblock(dummy int32) {
+	_g_ := getg()
+
+	_g_.m.locks++ // see comment in entersyscall
+	_g_.throwsplit = true
+	_g_.stackguard0 = stackPreempt // see comment in entersyscall
+	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
+	_g_.sysblocktraced = true
+	_g_.m.p.ptr().syscalltick++
+
+	// Leave SP around for GC and traceback.
+	pc := getcallerpc(unsafe.Pointer(&dummy))
+	sp := getcallersp(unsafe.Pointer(&dummy))
+	save(pc, sp)
+	_g_.syscallsp = _g_.sched.sp
+	_g_.syscallpc = _g_.sched.pc
+	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+		sp1 := sp
+		sp2 := _g_.sched.sp
+		sp3 := _g_.syscallsp
+		systemstack(func() {
+			print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			throw("entersyscallblock")
+		})
+	}
+	casgstatus(_g_, _Grunning, _Gsyscall)
+	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
+		systemstack(func() {
+			print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
+			throw("entersyscallblock")
+		})
+	}
+
+	systemstack(entersyscallblock_handoff)
+
+	// Resave for traceback during blocked call.
+	save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
+
+	_g_.m.locks--
+}
+
+func entersyscallblock_handoff() {
+	if trace.enabled {
+		traceGoSysCall()
+		traceGoSysBlock(getg().m.p.ptr())
+	}
+	handoffp(releasep())
+}
+
+// The goroutine g exited its system call.
+// Arrange for it to run on a cpu again.
+// This is called only from the go syscall library, not
+// from the low-level system calls used by the runtime.
+//go:nosplit
+func exitsyscall(dummy int32) {
+	_g_ := getg()
+
+	_g_.m.locks++ // see comment in entersyscall
+	if getcallersp(unsafe.Pointer(&dummy)) > _g_.syscallsp {
+		// throw calls print, which may try to grow the stack,
+		// but throwsplit == true, so the stack cannot be grown;
+		// use systemstack to avoid that possible problem.
+		systemstack(func() {
+			throw("exitsyscall: syscall frame is no longer valid")
+		})
+	}
+
+	_g_.waitsince = 0
+	oldp := _g_.m.p.ptr()
+	if exitsyscallfast() {
+		if _g_.m.mcache == nil {
+			throw("lost mcache")
+		}
+		if trace.enabled {
+			if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+				systemstack(traceGoStart)
+			}
+		}
+		// There's a cpu for us, so we can run.
+		_g_.m.p.ptr().syscalltick++
+		// We need to cas the status and scan before resuming...
+		casgstatus(_g_, _Gsyscall, _Grunning)
+
+		// Garbage collector isn't running (since we are),
+		// so okay to clear syscallsp.
+		_g_.syscallsp = 0
+		_g_.m.locks--
+		if _g_.preempt {
+			// restore the preemption request in case we've cleared it in newstack
+			_g_.stackguard0 = stackPreempt
+		} else {
+			// otherwise restore the real _StackGuard; we've spoiled it in entersyscall/entersyscallblock
+			_g_.stackguard0 = _g_.stack.lo + _StackGuard
+		}
+		_g_.throwsplit = false
+		return
+	}
+
+	_g_.sysexitticks = 0
+	if trace.enabled {
+		// Wait till traceGoSysBlock event is emitted.
+		// This ensures consistency of the trace (the goroutine is started after it is blocked).
+		for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
+			osyield()
+		}
+		// We can't trace syscall exit right now because we don't have a P.
+		// Tracing code can invoke write barriers that cannot run without a P.
+		// So instead we remember the syscall exit time and emit the event
+		// in execute when we have a P.
+		_g_.sysexitticks = cputicks()
+	}
+
+	_g_.m.locks--
+
+	// Call the scheduler.
+	mcall(exitsyscall0)
+
+	if _g_.m.mcache == nil {
+		throw("lost mcache")
+	}
+
+	// Scheduler returned, so we're allowed to run now.
+	// Delete the syscallsp information that we left for
+	// the garbage collector during the system call.
+	// Must wait until now because until gosched returns
+	// we don't know for sure that the garbage collector
+	// is not running.
+	_g_.syscallsp = 0
+	_g_.m.p.ptr().syscalltick++
+	_g_.throwsplit = false
+}
+
+//go:nosplit
+func exitsyscallfast() bool {
+	_g_ := getg()
+
+	// Freezetheworld sets stopwait but does not retake P's.
+	if sched.stopwait == freezeStopWait {
+		_g_.m.mcache = nil
+		_g_.m.p = 0
+		return false
+	}
+
+	// Try to re-acquire the last P.
+	if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && atomic.Cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
+		// There's a cpu for us, so we can run.
+		_g_.m.mcache = _g_.m.p.ptr().mcache
+		_g_.m.p.ptr().m.set(_g_.m)
+		if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
+			if trace.enabled {
+				// The p was retaken and then entered a syscall again (since _g_.m.syscalltick has changed).
+				// traceGoSysBlock for this syscall was already emitted,
+				// but here we effectively retake the p from the new syscall running on the same p.
+				systemstack(func() {
+					// Denote blocking of the new syscall.
+					traceGoSysBlock(_g_.m.p.ptr())
+					// Denote completion of the current syscall.
+					traceGoSysExit(0)
+				})
+			}
+			_g_.m.p.ptr().syscalltick++
+		}
+		return true
+	}
+
+	// Try to get any other idle P.
+	oldp := _g_.m.p.ptr()
+	_g_.m.mcache = nil
+	_g_.m.p = 0
+	if sched.pidle != 0 {
+		var ok bool
+		systemstack(func() {
+			ok = exitsyscallfast_pidle()
+			if ok && trace.enabled {
+				if oldp != nil {
+					// Wait till traceGoSysBlock event is emitted.
+					// This ensures consistency of the trace (the goroutine is started after it is blocked).
+					for oldp.syscalltick == _g_.m.syscalltick {
+						osyield()
+					}
+				}
+				traceGoSysExit(0)
+			}
+		})
+		if ok {
+			return true
+		}
+	}
+	return false
+}
+
+func exitsyscallfast_pidle() bool {
+	lock(&sched.lock)
+	_p_ := pidleget()
+	if _p_ != nil && atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+	if _p_ != nil {
+		acquirep(_p_)
+		return true
+	}
+	return false
+}
+
+// exitsyscall slow path on g0.
+// Failed to acquire P, enqueue gp as runnable.
+func exitsyscall0(gp *g) {
+	_g_ := getg()
+
+	casgstatus(gp, _Gsyscall, _Grunnable)
+	dropg()
+	lock(&sched.lock)
+	_p_ := pidleget()
+	if _p_ == nil {
+		globrunqput(gp)
+	} else if atomic.Load(&sched.sysmonwait) != 0 {
+		atomic.Store(&sched.sysmonwait, 0)
+		notewakeup(&sched.sysmonnote)
+	}
+	unlock(&sched.lock)
+	if _p_ != nil {
+		acquirep(_p_)
+		execute(gp, false) // Never returns.
+	}
+	if _g_.m.lockedg != nil {
+		// Wait until another thread schedules gp and so m again.
+		stoplockedm()
+		execute(gp, false) // Never returns.
+	}
+	stopm()
+	schedule() // Never returns.
+}
+
+func beforefork() {
+	gp := getg().m.curg
+
+	// Fork can hang if preempted with signals frequently enough (see issue 5517).
+	// Ensure that we stay on the same M where we disable profiling.
+	gp.m.locks++
+	if gp.m.profilehz != 0 {
+		resetcpuprofiler(0)
+	}
+
+	// This function is called before fork in syscall package.
+	// Code between fork and exec must not allocate memory nor even try to grow stack.
+	// Here we spoil g->stackguard0 to reliably detect any attempts to grow the stack.
+	// runtime_AfterFork will undo this in parent process, but not in child.
+	gp.stackguard0 = stackFork
+}
+
+// Called from syscall package before fork.
+//go:linkname syscall_runtime_BeforeFork syscall.runtime_BeforeFork
+//go:nosplit
+func syscall_runtime_BeforeFork() {
+	systemstack(beforefork)
+}
+
+func afterfork() {
+	gp := getg().m.curg
+
+	// See the comment in beforefork.
+	gp.stackguard0 = gp.stack.lo + _StackGuard
+
+	hz := sched.profilehz
+	if hz != 0 {
+		resetcpuprofiler(hz)
+	}
+	gp.m.locks--
+}
+
+// Called from syscall package after fork in parent.
+//go:linkname syscall_runtime_AfterFork syscall.runtime_AfterFork
+//go:nosplit
+func syscall_runtime_AfterFork() {
+	systemstack(afterfork)
+}
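+
+// Schematically, the syscall package brackets fork with these hooks
+// (a hedged sketch; the hypothetical rawFork stands in for the real fork
+// sequence in syscall):
+//
+//	syscall.runtime_BeforeFork()
+//	pid := rawFork()
+//	if pid != 0 {
+//		syscall.runtime_AfterFork() // parent only; the child execs
+//	}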
+
+// Allocate a new g, with a stack big enough for stacksize bytes.
+func malg(stacksize int32) *g {
+	newg := new(g)
+	if stacksize >= 0 {
+		stacksize = round2(_StackSystem + stacksize)
+		systemstack(func() {
+			newg.stack, newg.stkbar = stackalloc(uint32(stacksize))
+		})
+		newg.stackguard0 = newg.stack.lo + _StackGuard
+		newg.stackguard1 = ^uintptr(0)
+		newg.stackAlloc = uintptr(stacksize)
+	}
+	return newg
+}
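+
+// For example, assuming _StackSystem is 0 and _StackMin is 2048 (true on most
+// platforms), malg(_StackMin) allocates a 2KB stack and places stackguard0
+// _StackGuard bytes above stack.lo.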
+
+// Create a new g running fn with siz bytes of arguments.
+// Put it on the queue of g's waiting to run.
+// The compiler turns a go statement into a call to this.
+// Cannot split the stack because it assumes that the arguments
+// are available sequentially after &fn; they would not be
+// copied if a stack split occurred.
+//go:nosplit
+func newproc(siz int32, fn *funcval) {
+	argp := add(unsafe.Pointer(&fn), sys.PtrSize)
+	pc := getcallerpc(unsafe.Pointer(&siz))
+	systemstack(func() {
+		newproc1(fn, (*uint8)(argp), siz, 0, pc)
+	})
+}
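+
+// For intuition, the compiler lowers a go statement into a newproc call.
+// A hedged sketch of the lowering (not actual generated code):
+//
+//	go f(1, 2)
+//
+// becomes, roughly,
+//
+//	newproc(siz, f) // with the arguments 1 and 2 laid out just after f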
+
+// Create a new g running fn with narg bytes of arguments starting
+// at argp and returning nret bytes of results. callerpc is the
+// address of the go statement that created this. The new g is put
+// on the queue of g's waiting to run.
+func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g {
+	_g_ := getg()
+
+	if fn == nil {
+		_g_.m.throwing = -1 // do not dump full stacks
+		throw("go of nil func value")
+	}
+	_g_.m.locks++ // disable preemption because it can be holding p in a local var
+	siz := narg + nret
+	siz = (siz + 7) &^ 7
+
+	// We could allocate a larger initial stack if necessary.
+	// Not worth it: this is almost always an error.
+	// 4*sizeof(uintreg): extra space added below
+	// sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
+	if siz >= _StackMin-4*sys.RegSize-sys.RegSize {
+		throw("newproc: function arguments too large for new goroutine")
+	}
+
+	_p_ := _g_.m.p.ptr()
+	newg := gfget(_p_)
+	if newg == nil {
+		newg = malg(_StackMin)
+		casgstatus(newg, _Gidle, _Gdead)
+		newg.gcRescan = -1
+		allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
+	}
+	if newg.stack.hi == 0 {
+		throw("newproc1: newg missing stack")
+	}
+
+	if readgstatus(newg) != _Gdead {
+		throw("newproc1: new g is not Gdead")
+	}
+
+	totalSize := 4*sys.RegSize + uintptr(siz) + sys.MinFrameSize // extra space in case of reads slightly beyond frame
+	totalSize += -totalSize & (sys.SpAlign - 1)                  // align to spAlign
+	sp := newg.stack.hi - totalSize
+	spArg := sp
+	if usesLR {
+		// caller's LR
+		*(*unsafe.Pointer)(unsafe.Pointer(sp)) = nil
+		prepGoExitFrame(sp)
+		spArg += sys.MinFrameSize
+	}
+	memmove(unsafe.Pointer(spArg), unsafe.Pointer(argp), uintptr(narg))
+
+	memclr(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
+	newg.sched.sp = sp
+	newg.stktopsp = sp
+	newg.sched.pc = funcPC(goexit) + sys.PCQuantum // +PCQuantum so that previous instruction is in same function
+	newg.sched.g = guintptr(unsafe.Pointer(newg))
+	gostartcallfn(&newg.sched, fn)
+	newg.gopc = callerpc
+	newg.startpc = fn.fn
+	if isSystemGoroutine(newg) {
+		atomic.Xadd(&sched.ngsys, +1)
+	}
+	// The stack is dirty from the argument frame, so queue it for
+	// scanning. Do this before setting it to runnable so we still
+	// own the G. If we're recycling a G, it may already be on the
+	// rescan list.
+	if newg.gcRescan == -1 {
+		queueRescan(newg)
+	} else {
+		// The recycled G is already on the rescan list. Just
+		// mark the stack dirty.
+		newg.gcscanvalid = false
+	}
+	casgstatus(newg, _Gdead, _Grunnable)
+
+	if _p_.goidcache == _p_.goidcacheend {
+		// Sched.goidgen is the last allocated id,
+		// this batch must be [sched.goidgen+1, sched.goidgen+GoidCacheBatch].
+		// At startup sched.goidgen=0, so main goroutine receives goid=1.
+		_p_.goidcache = atomic.Xadd64(&sched.goidgen, _GoidCacheBatch)
+		_p_.goidcache -= _GoidCacheBatch - 1
+		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
+	}
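+	// For example, assuming _GoidCacheBatch is 16: the first refill moves
+	// sched.goidgen from 0 to 16 and hands this P goids 1 through 16; the
+	// next refill, on whichever P takes it, hands out 17 through 32, and so on.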
+	newg.goid = int64(_p_.goidcache)
+	_p_.goidcache++
+	if raceenabled {
+		newg.racectx = racegostart(callerpc)
+	}
+	if trace.enabled {
+		traceGoCreate(newg, newg.startpc)
+	}
+	runqput(_p_, newg, true)
+
+	if atomic.Load(&sched.npidle) != 0 && atomic.Load(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic
+		wakep()
+	}
+	_g_.m.locks--
+	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
+		_g_.stackguard0 = stackPreempt
+	}
+	return newg
+}
+
+// Put on gfree list.
+// If local list is too long, transfer a batch to the global list.
+func gfput(_p_ *p, gp *g) {
+	if readgstatus(gp) != _Gdead {
+		throw("gfput: bad status (not Gdead)")
+	}
+
+	stksize := gp.stackAlloc
+
+	if stksize != _FixedStack {
+		// non-standard stack size - free it.
+		stackfree(gp.stack, gp.stackAlloc)
+		gp.stack.lo = 0
+		gp.stack.hi = 0
+		gp.stackguard0 = 0
+		gp.stkbar = nil
+		gp.stkbarPos = 0
+	} else {
+		// Reset stack barriers.
+		gp.stkbar = gp.stkbar[:0]
+		gp.stkbarPos = 0
+	}
+
+	gp.schedlink.set(_p_.gfree)
+	_p_.gfree = gp
+	_p_.gfreecnt++
+	if _p_.gfreecnt >= 64 {
+		lock(&sched.gflock)
+		for _p_.gfreecnt >= 32 {
+			_p_.gfreecnt--
+			gp = _p_.gfree
+			_p_.gfree = gp.schedlink.ptr()
+			if gp.stack.lo == 0 {
+				gp.schedlink.set(sched.gfreeNoStack)
+				sched.gfreeNoStack = gp
+			} else {
+				gp.schedlink.set(sched.gfreeStack)
+				sched.gfreeStack = gp
+			}
+			sched.ngfree++
+		}
+		unlock(&sched.gflock)
+	}
+}
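+
+// The 64/32 thresholds in gfput give the local gfree cache hysteresis:
+// a P spills to the global list only after accumulating 64 free Gs and then
+// keeps 32, so goroutine churn on one P neither hoards every free G nor pays
+// for the global lock on each put.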
+
+// Get from gfree list.
+// If local list is empty, grab a batch from global list.
+func gfget(_p_ *p) *g {
+retry:
+	gp := _p_.gfree
+	if gp == nil && (sched.gfreeStack != nil || sched.gfreeNoStack != nil) {
+		lock(&sched.gflock)
+		for _p_.gfreecnt < 32 {
+			if sched.gfreeStack != nil {
+				// Prefer Gs with stacks.
+				gp = sched.gfreeStack
+				sched.gfreeStack = gp.schedlink.ptr()
+			} else if sched.gfreeNoStack != nil {
+				gp = sched.gfreeNoStack
+				sched.gfreeNoStack = gp.schedlink.ptr()
+			} else {
+				break
+			}
+			_p_.gfreecnt++
+			sched.ngfree--
+			gp.schedlink.set(_p_.gfree)
+			_p_.gfree = gp
+		}
+		unlock(&sched.gflock)
+		goto retry
+	}
+	if gp != nil {
+		_p_.gfree = gp.schedlink.ptr()
+		_p_.gfreecnt--
+		if gp.stack.lo == 0 {
+			// Stack was deallocated in gfput. Allocate a new one.
+			systemstack(func() {
+				gp.stack, gp.stkbar = stackalloc(_FixedStack)
+			})
+			gp.stackguard0 = gp.stack.lo + _StackGuard
+			gp.stackAlloc = _FixedStack
+		} else {
+			if raceenabled {
+				racemalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
+			}
+			if msanenabled {
+				msanmalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
+			}
+		}
+	}
+	return gp
+}
+
+// Purge all cached G's from gfree list to the global list.
+func gfpurge(_p_ *p) {
+	lock(&sched.gflock)
+	for _p_.gfreecnt != 0 {
+		_p_.gfreecnt--
+		gp := _p_.gfree
+		_p_.gfree = gp.schedlink.ptr()
+		if gp.stack.lo == 0 {
+			gp.schedlink.set(sched.gfreeNoStack)
+			sched.gfreeNoStack = gp
+		} else {
+			gp.schedlink.set(sched.gfreeStack)
+			sched.gfreeStack = gp
+		}
+		sched.ngfree++
+	}
+	unlock(&sched.gflock)
+}
+
+// Breakpoint executes a breakpoint trap.
+func Breakpoint() {
+	breakpoint()
+}
+
+// dolockOSThread is called by LockOSThread and lockOSThread below
+// after they modify m.locked. Do not allow preemption during this call,
+// or else the m might be different in this function than in the caller.
+//go:nosplit
+func dolockOSThread() {
+	_g_ := getg()
+	_g_.m.lockedg = _g_
+	_g_.lockedm = _g_.m
+}
+
+//go:nosplit
+
+// LockOSThread wires the calling goroutine to its current operating system thread.
+// Until the calling goroutine exits or calls UnlockOSThread, it will always
+// execute in that thread, and no other goroutine can.
+func LockOSThread() {
+	getg().m.locked |= _LockExternal
+	dolockOSThread()
+}
+
+//go:nosplit
+func lockOSThread() {
+	getg().m.locked += _LockInternal
+	dolockOSThread()
+}
+
+// dounlockOSThread is called by UnlockOSThread and unlockOSThread below
+// after they update m->locked. Do not allow preemption during this call,
+// or else the m might be different in this function than in the caller.
+//go:nosplit
+func dounlockOSThread() {
+	_g_ := getg()
+	if _g_.m.locked != 0 {
+		return
+	}
+	_g_.m.lockedg = nil
+	_g_.lockedm = nil
+}
+
+//go:nosplit
+
+// UnlockOSThread unwires the calling goroutine from its fixed operating system thread.
+// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op.
+func UnlockOSThread() {
+	getg().m.locked &^= _LockExternal
+	dounlockOSThread()
+}
+
+//go:nosplit
+func unlockOSThread() {
+	_g_ := getg()
+	if _g_.m.locked < _LockInternal {
+		systemstack(badunlockosthread)
+	}
+	_g_.m.locked -= _LockInternal
+	dounlockOSThread()
+}
+
+func badunlockosthread() {
+	throw("runtime: internal error: misuse of lockOSThread/unlockOSThread")
+}
+
+func gcount() int32 {
+	n := int32(allglen) - sched.ngfree - int32(atomic.Load(&sched.ngsys))
+	for i := 0; ; i++ {
+		_p_ := allp[i]
+		if _p_ == nil {
+			break
+		}
+		n -= _p_.gfreecnt
+	}
+
+	// All these variables can be changed concurrently, so the result can be inconsistent.
+	// But at least the current goroutine is running.
+	if n < 1 {
+		n = 1
+	}
+	return n
+}
+
+func mcount() int32 {
+	return sched.mcount
+}
+
+var prof struct {
+	lock uint32
+	hz   int32
+}
+
+func _System()       { _System() }
+func _ExternalCode() { _ExternalCode() }
+func _GC()           { _GC() }
+
+// Called if we receive a SIGPROF signal.
+// Called by the signal handler, may run during STW.
+//go:nowritebarrierrec
+func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
+	if prof.hz == 0 {
+		return
+	}
+
+	// Profiling runs concurrently with GC, so it must not allocate.
+	mp.mallocing++
+
+	// Define that a "user g" is a user-created goroutine, and a "system g"
+	// is one that is m->g0 or m->gsignal.
+	//
+	// We might be interrupted for profiling halfway through a
+	// goroutine switch. The switch involves updating three (or four) values:
+	// g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
+	// because once it gets updated the new g is running.
+	//
+	// When switching from a user g to a system g, LR is not considered live,
+	// so the update only affects g, SP, and PC. Since PC must be last, there
+	// the possible partial transitions in ordinary execution are (1) g alone is updated,
+	// (2) both g and SP are updated, and (3) SP alone is updated.
+	// If SP or g alone is updated, we can detect the partial transition by checking
+	// whether the SP is within g's stack bounds. (We could also require that SP
+	// be changed only after g, but the stack bounds check is needed by other
+	// cases, so there is no need to impose an additional requirement.)
+	//
+	// There is one exceptional transition to a system g, not in ordinary execution.
+	// When a signal arrives, the operating system starts the signal handler running
+	// with an updated PC and SP. The g is updated last, at the beginning of the
+	// handler. There are two reasons this is okay. First, until g is updated the
+	// g and SP do not match, so the stack bounds check detects the partial transition.
+	// Second, signal handlers currently run with signals disabled, so a profiling
+	// signal cannot arrive during the handler.
+	//
+	// When switching from a system g to a user g, there are three possibilities.
+	//
+	// First, it may be that the g switch has no PC update, because the SP
+	// either corresponds to a user g throughout (as in asmcgocall)
+	// or because it has been arranged to look like a user g frame
+	// (as in cgocallback_gofunc). In this case, since the entire
+	// transition is a g+SP update, a partial transition updating just one of
+	// those will be detected by the stack bounds check.
+	//
+	// Second, when returning from a signal handler, the PC and SP updates
+	// are performed by the operating system in an atomic update, so the g
+	// update must be done before them. The stack bounds check detects
+	// the partial transition here, and (again) signal handlers run with signals
+	// disabled, so a profiling signal cannot arrive then anyway.
+	//
+	// Third, the common case: it may be that the switch updates g, SP, and PC
+	// separately. If the PC is within any of the functions that do this,
+	// we don't ask for a traceback. See the function setsSP for more about this.
+	//
+	// There is another apparently viable approach, recorded here in case
+	// the "PC within setsSP function" check turns out not to be usable.
+	// It would be possible to delay the update of either g or SP until immediately
+	// before the PC update instruction. Then, because of the stack bounds check,
+	// the only problematic interrupt point is just before that PC update instruction,
+	// and the sigprof handler can detect that instruction and simulate stepping past
+	// it in order to reach a consistent state. On ARM, the update of g must be made
+	// in two places (in R10 and also in a TLS slot), so the delayed update would
+	// need to be the SP update. The sigprof handler must read the instruction at
+	// the current PC and if it was the known instruction (for example, JMP BX or
+	// MOV R2, PC), use that other register in place of the PC value.
+	// The biggest drawback to this solution is that it requires that we can tell
+	// whether it's safe to read from the memory pointed at by PC.
+	// In a correct program, we can test PC == nil and otherwise read,
+	// but if a profiling signal happens at the instant that a program executes
+	// a bad jump (before the program manages to handle the resulting fault)
+	// the profiling handler could fault trying to read nonexistent memory.
+	//
+	// To recap, there are no constraints on the assembly being used for the
+	// transition. We simply require that g and SP match and that the PC is not
+	// in gogo.
+	traceback := true
+	if gp == nil || sp < gp.stack.lo || gp.stack.hi < sp || setsSP(pc) {
+		traceback = false
+	}
+	var stk [maxCPUProfStack]uintptr
+	var haveStackLock *g
+	n := 0
+	if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
+		cgoOff := 0
+		// Check cgoCallersUse to make sure that we are not
+		// interrupting other code that is fiddling with
+		// cgoCallers. We are running in a signal handler
+		// with all signals blocked, so we don't have to worry
+		// about any other code interrupting us.
+		if atomic.Load(&mp.cgoCallersUse) == 0 && mp.cgoCallers != nil && mp.cgoCallers[0] != 0 {
+			for cgoOff < len(mp.cgoCallers) && mp.cgoCallers[cgoOff] != 0 {
+				cgoOff++
+			}
+			copy(stk[:], mp.cgoCallers[:cgoOff])
+			mp.cgoCallers[0] = 0
+		}
+
+		// Collect Go stack that leads to the cgo call.
+		if gcTryLockStackBarriers(mp.curg) {
+			haveStackLock = mp.curg
+			n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[cgoOff], len(stk)-cgoOff, nil, nil, 0)
+		}
+	} else if traceback {
+		var flags uint = _TraceTrap
+		if gp.m.curg != nil && gcTryLockStackBarriers(gp.m.curg) {
+			// It's safe to traceback the user stack.
+			haveStackLock = gp.m.curg
+			flags |= _TraceJumpStack
+		}
+		// Traceback is safe if we're on the system stack (if
+		// necessary, flags will stop it before switching to
+		// the user stack), or if we locked the user stack.
+		if gp != gp.m.curg || haveStackLock != nil {
+			n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, flags)
+		}
+	}
+	if haveStackLock != nil {
+		gcUnlockStackBarriers(haveStackLock)
+	}
+
+	if n <= 0 {
+		// Normal traceback is impossible or has failed.
+		// See if it falls into several common cases.
+		n = 0
+		if GOOS == "windows" && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
+			// Libcall, i.e. runtime syscall on windows.
+			// Collect Go stack that leads to the call.
+			if gcTryLockStackBarriers(mp.libcallg.ptr()) {
+				n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
+				gcUnlockStackBarriers(mp.libcallg.ptr())
+			}
+		}
+		if n == 0 {
+			// If all of the above has failed, account it against abstract "System" or "GC".
+			n = 2
+			// "ExternalCode" is better than "etext".
+			if pc > firstmoduledata.etext {
+				pc = funcPC(_ExternalCode) + sys.PCQuantum
+			}
+			stk[0] = pc
+			if mp.preemptoff != "" || mp.helpgc != 0 {
+				stk[1] = funcPC(_GC) + sys.PCQuantum
+			} else {
+				stk[1] = funcPC(_System) + sys.PCQuantum
+			}
+		}
+	}
+
+	if prof.hz != 0 {
+		// Simple cas-lock to coordinate with setcpuprofilerate.
+		for !atomic.Cas(&prof.lock, 0, 1) {
+			osyield()
+		}
+		if prof.hz != 0 {
+			cpuprof.add(stk[:n])
+		}
+		atomic.Store(&prof.lock, 0)
+	}
+	mp.mallocing--
+}
+
+// If the signal handler receives a SIGPROF signal on a non-Go thread,
+// it tries to collect a traceback into sigprofCallers.
+// sigprofCallersUse is set to non-zero while sigprofCallers holds a traceback.
+var sigprofCallers cgoCallers
+var sigprofCallersUse uint32
+
+// Called if we receive a SIGPROF signal on a non-Go thread.
+// When this is called, sigprofCallersUse will be non-zero.
+// g is nil, and what we can do is very limited.
+//go:nosplit
+//go:nowritebarrierrec
+func sigprofNonGo() {
+	if prof.hz != 0 {
+		n := 0
+		for n < len(sigprofCallers) && sigprofCallers[n] != 0 {
+			n++
+		}
+
+		// Simple cas-lock to coordinate with setcpuprofilerate.
+		if atomic.Cas(&prof.lock, 0, 1) {
+			if prof.hz != 0 {
+				cpuprof.addNonGo(sigprofCallers[:n])
+			}
+			atomic.Store(&prof.lock, 0)
+		}
+	}
+
+	atomic.Store(&sigprofCallersUse, 0)
+}
+
+// Reports whether a function will set the SP
+// to an absolute value. It is important that
+// we don't traceback when these are at the bottom
+// of the stack, since we can't be sure that we will
+// find the caller.
+//
+// If the function is not on the bottom of the stack
+// we assume that it will have set it up so that traceback will be consistent,
+// either by being a traceback terminating function
+// or putting one on the stack at the right offset.
+func setsSP(pc uintptr) bool {
+	f := findfunc(pc)
+	if f == nil {
+		// couldn't find the function for this PC,
+		// so assume the worst and stop traceback
+		return true
+	}
+	switch f.entry {
+	case gogoPC, systemstackPC, mcallPC, morestackPC:
+		return true
+	}
+	return false
+}
+
+// Arrange to call fn with a traceback hz times a second.
+func setcpuprofilerate_m(hz int32) {
+	// Force sane arguments.
+	if hz < 0 {
+		hz = 0
+	}
+
+	// Disable preemption, otherwise we can be rescheduled to another thread
+	// that has profiling enabled.
+	_g_ := getg()
+	_g_.m.locks++
+
+	// Stop profiler on this thread so that it is safe to lock prof.
+	// If a profiling signal came in while we had prof locked,
+	// it would deadlock.
+	resetcpuprofiler(0)
+
+	for !atomic.Cas(&prof.lock, 0, 1) {
+		osyield()
+	}
+	prof.hz = hz
+	atomic.Store(&prof.lock, 0)
+
+	lock(&sched.lock)
+	sched.profilehz = hz
+	unlock(&sched.lock)
+
+	if hz != 0 {
+		resetcpuprofiler(hz)
+	}
+
+	_g_.m.locks--
+}
+
+// Change number of processors. The world is stopped, sched is locked.
+// gcworkbufs are not being modified by either the GC or
+// the write barrier code.
+// Returns the list of Ps with local work; they need to be scheduled by the caller.
+func procresize(nprocs int32) *p {
+	old := gomaxprocs
+	if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs {
+		throw("procresize: invalid arg")
+	}
+	if trace.enabled {
+		traceGomaxprocs(nprocs)
+	}
+
+	// update statistics
+	now := nanotime()
+	if sched.procresizetime != 0 {
+		sched.totaltime += int64(old) * (now - sched.procresizetime)
+	}
+	sched.procresizetime = now
+
+	// initialize new P's
+	for i := int32(0); i < nprocs; i++ {
+		pp := allp[i]
+		if pp == nil {
+			pp = new(p)
+			pp.id = i
+			pp.status = _Pgcstop
+			pp.sudogcache = pp.sudogbuf[:0]
+			for i := range pp.deferpool {
+				pp.deferpool[i] = pp.deferpoolbuf[i][:0]
+			}
+			atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
+		}
+		if pp.mcache == nil {
+			if old == 0 && i == 0 {
+				if getg().m.mcache == nil {
+					throw("missing mcache?")
+				}
+				pp.mcache = getg().m.mcache // bootstrap
+			} else {
+				pp.mcache = allocmcache()
+			}
+		}
+		if raceenabled && pp.racectx == 0 {
+			if old == 0 && i == 0 {
+				pp.racectx = raceprocctx0
+				raceprocctx0 = 0 // bootstrap
+			} else {
+				pp.racectx = raceproccreate()
+			}
+		}
+	}
+
+	// free unused P's
+	for i := nprocs; i < old; i++ {
+		p := allp[i]
+		if trace.enabled {
+			if p == getg().m.p.ptr() {
+				// moving to p[0], pretend that we were descheduled
+				// and then scheduled again to keep the trace sane.
+				traceGoSched()
+				traceProcStop(p)
+			}
+		}
+		// move all runnable goroutines to the global queue
+		for p.runqhead != p.runqtail {
+			// pop from tail of local queue
+			p.runqtail--
+			gp := p.runq[p.runqtail%uint32(len(p.runq))].ptr()
+			// push onto head of global queue
+			globrunqputhead(gp)
+		}
+		if p.runnext != 0 {
+			globrunqputhead(p.runnext.ptr())
+			p.runnext = 0
+		}
+		// if there's a background worker, make it runnable and put
+		// it on the global queue so it can clean itself up
+		if gp := p.gcBgMarkWorker.ptr(); gp != nil {
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			if trace.enabled {
+				traceGoUnpark(gp, 0)
+			}
+			globrunqput(gp)
+			// This assignment doesn't race because the
+			// world is stopped.
+			p.gcBgMarkWorker.set(nil)
+		}
+		for i := range p.sudogbuf {
+			p.sudogbuf[i] = nil
+		}
+		p.sudogcache = p.sudogbuf[:0]
+		for i := range p.deferpool {
+			for j := range p.deferpoolbuf[i] {
+				p.deferpoolbuf[i][j] = nil
+			}
+			p.deferpool[i] = p.deferpoolbuf[i][:0]
+		}
+		freemcache(p.mcache)
+		p.mcache = nil
+		gfpurge(p)
+		traceProcFree(p)
+		if raceenabled {
+			raceprocdestroy(p.racectx)
+			p.racectx = 0
+		}
+		p.status = _Pdead
+		// can't free P itself because it can be referenced by an M in syscall
+	}
+
+	_g_ := getg()
+	if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs {
+		// continue to use the current P
+		_g_.m.p.ptr().status = _Prunning
+	} else {
+		// release the current P and acquire allp[0]
+		if _g_.m.p != 0 {
+			_g_.m.p.ptr().m = 0
+		}
+		_g_.m.p = 0
+		_g_.m.mcache = nil
+		p := allp[0]
+		p.m = 0
+		p.status = _Pidle
+		acquirep(p)
+		if trace.enabled {
+			traceGoStart()
+		}
+	}
+	var runnablePs *p
+	for i := nprocs - 1; i >= 0; i-- {
+		p := allp[i]
+		if _g_.m.p.ptr() == p {
+			continue
+		}
+		p.status = _Pidle
+		if runqempty(p) {
+			pidleput(p)
+		} else {
+			p.m.set(mget())
+			p.link.set(runnablePs)
+			runnablePs = p
+		}
+	}
+	stealOrder.reset(uint32(nprocs))
+	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
+	atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
+	return runnablePs
+}
+
+// Associate p and the current m.
+func acquirep(_p_ *p) {
+	acquirep1(_p_)
+
+	// have p; write barriers now allowed
+	_g_ := getg()
+	_g_.m.mcache = _p_.mcache
+
+	if trace.enabled {
+		traceProcStart()
+	}
+}
+
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func acquirep1(_p_ *p) {
+	_g_ := getg()
+
+	if _g_.m.p != 0 || _g_.m.mcache != nil {
+		throw("acquirep: already in go")
+	}
+	if _p_.m != 0 || _p_.status != _Pidle {
+		id := int32(0)
+		if _p_.m != 0 {
+			id = _p_.m.ptr().id
+		}
+		print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
+		throw("acquirep: invalid p state")
+	}
+	_g_.m.p.set(_p_)
+	_p_.m.set(_g_.m)
+	_p_.status = _Prunning
+}
+
+// Disassociate p and the current m.
+func releasep() *p {
+	_g_ := getg()
+
+	if _g_.m.p == 0 || _g_.m.mcache == nil {
+		throw("releasep: invalid arg")
+	}
+	_p_ := _g_.m.p.ptr()
+	if _p_.m.ptr() != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning {
+		print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", _p_.m, " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n")
+		throw("releasep: invalid p state")
+	}
+	if trace.enabled {
+		traceProcStop(_g_.m.p.ptr())
+	}
+	_g_.m.p = 0
+	_g_.m.mcache = nil
+	_p_.m = 0
+	_p_.status = _Pidle
+	return _p_
+}
+
+func incidlelocked(v int32) {
+	lock(&sched.lock)
+	sched.nmidlelocked += v
+	if v > 0 {
+		checkdead()
+	}
+	unlock(&sched.lock)
+}
+
+// Check for deadlock situation.
+// The check is based on the number of running M's; if 0 -> deadlock.
+func checkdead() {
+	// For -buildmode=c-shared or -buildmode=c-archive it's OK if
+	// there are no running goroutines. The calling program is
+	// assumed to be running.
+	if islibrary || isarchive {
+		return
+	}
+
+	// If we are dying because of a signal caught on an already idle thread,
+	// freezetheworld will cause all running threads to block.
+	// The runtime will then essentially be in a deadlock state,
+	// except that there is a thread that will call exit soon.
+	if panicking > 0 {
+		return
+	}
+
+	// -1 for sysmon
+	run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1
+	if run > 0 {
+		return
+	}
+	if run < 0 {
+		print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n")
+		throw("checkdead: inconsistent counts")
+	}
+
+	grunning := 0
+	lock(&allglock)
+	for i := 0; i < len(allgs); i++ {
+		gp := allgs[i]
+		if isSystemGoroutine(gp) {
+			continue
+		}
+		s := readgstatus(gp)
+		switch s &^ _Gscan {
+		case _Gwaiting:
+			grunning++
+		case _Grunnable,
+			_Grunning,
+			_Gsyscall:
+			unlock(&allglock)
+			print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
+			throw("checkdead: runnable g")
+		}
+	}
+	unlock(&allglock)
+	if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
+		throw("no goroutines (main called runtime.Goexit) - deadlock!")
+	}
+
+	// Maybe jump time forward for playground.
+	gp := timejump()
+	if gp != nil {
+		casgstatus(gp, _Gwaiting, _Grunnable)
+		globrunqput(gp)
+		_p_ := pidleget()
+		if _p_ == nil {
+			throw("checkdead: no p for timer")
+		}
+		mp := mget()
+		if mp == nil {
+			// There should always be a free M since
+			// nothing is running.
+			throw("checkdead: no m for timer")
+		}
+		mp.nextp.set(_p_)
+		notewakeup(&mp.park)
+		return
+	}
+
+	getg().m.throwing = -1 // do not dump full stacks
+	throw("all goroutines are asleep - deadlock!")
+}
+
+// forcegcperiod is the maximum time in nanoseconds between garbage
+// collections. If we go this long without a garbage collection, one
+// is forced to run.
+//
+// This is a variable for testing purposes. It normally doesn't change.
+var forcegcperiod int64 = 2 * 60 * 1e9
+
+// Always runs without a P, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func sysmon() {
+	// If a heap span goes unused for 5 minutes after a garbage collection,
+	// we hand it back to the operating system.
+	scavengelimit := int64(5 * 60 * 1e9)
+
+	if debug.scavenge > 0 {
+		// Scavenge-a-lot for testing.
+		forcegcperiod = 10 * 1e6
+		scavengelimit = 20 * 1e6
+	}
+
+	lastscavenge := nanotime()
+	nscavenge := 0
+
+	lasttrace := int64(0)
+	idle := 0 // how many cycles in succession we had not woken anybody up
+	delay := uint32(0)
+	for {
+		if idle == 0 { // start with 20us sleep...
+			delay = 20
+		} else if idle > 50 { // start doubling the sleep after 1ms...
+			delay *= 2
+		}
+		if delay > 10*1000 { // up to 10ms
+			delay = 10 * 1000
+		}
+		usleep(delay)
+		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic
+			lock(&sched.lock)
+			if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+				atomic.Store(&sched.sysmonwait, 1)
+				unlock(&sched.lock)
+				// Make wake-up period small enough
+				// for the sampling to be correct.
+				maxsleep := forcegcperiod / 2
+				if scavengelimit < forcegcperiod {
+					maxsleep = scavengelimit / 2
+				}
+				notetsleep(&sched.sysmonnote, maxsleep)
+				lock(&sched.lock)
+				atomic.Store(&sched.sysmonwait, 0)
+				noteclear(&sched.sysmonnote)
+				idle = 0
+				delay = 20
+			}
+			unlock(&sched.lock)
+		}
+		// poll network if not polled for more than 10ms
+		lastpoll := int64(atomic.Load64(&sched.lastpoll))
+		now := nanotime()
+		unixnow := unixnanotime()
+		if lastpoll != 0 && lastpoll+10*1000*1000 < now {
+			atomic.Cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
+			gp := netpoll(false) // non-blocking - returns list of goroutines
+			if gp != nil {
+				// Need to decrement number of idle locked M's
+				// (pretending that one more is running) before injectglist.
+				// Otherwise it can lead to the following situation:
+				// injectglist grabs all P's but before it starts M's to run the P's,
+				// another M returns from syscall, finishes running its G,
+				// observes that there is no work to do and no other running M's
+				// and reports deadlock.
+				incidlelocked(-1)
+				injectglist(gp)
+				incidlelocked(1)
+			}
+		}
+		// retake P's blocked in syscalls
+		// and preempt long-running G's
+		if retake(now) != 0 {
+			idle = 0
+		} else {
+			idle++
+		}
+		// check if we need to force a GC
+		lastgc := int64(atomic.Load64(&memstats.last_gc))
+		if gcphase == _GCoff && lastgc != 0 && unixnow-lastgc > forcegcperiod && atomic.Load(&forcegc.idle) != 0 {
+			lock(&forcegc.lock)
+			forcegc.idle = 0
+			forcegc.g.schedlink = 0
+			injectglist(forcegc.g)
+			unlock(&forcegc.lock)
+		}
+		// scavenge heap once in a while
+		if lastscavenge+scavengelimit/2 < now {
+			mheap_.scavenge(int32(nscavenge), uint64(now), uint64(scavengelimit))
+			lastscavenge = now
+			nscavenge++
+		}
+		if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace)*1000000 <= now {
+			lasttrace = now
+			schedtrace(debug.scheddetail > 0)
+		}
+	}
+}
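+
+// For intuition, the backoff above works out as follows (computed from the
+// constants in the loop, illustrative): delay holds at 20us for the first 50
+// idle cycles, then doubles each cycle (40us at idle=51, 80us, 160us, ...)
+// until it hits the 10ms cap at idle=59. A fully idle sysmon therefore
+// settles at one wakeup every 10ms.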
+
+var pdesc [_MaxGomaxprocs]struct {
+	schedtick   uint32
+	schedwhen   int64
+	syscalltick uint32
+	syscallwhen int64
+}
+
+// forcePreemptNS is the time slice given to a G before it is
+// preempted.
+const forcePreemptNS = 10 * 1000 * 1000 // 10ms
+
+func retake(now int64) uint32 {
+	n := 0
+	for i := int32(0); i < gomaxprocs; i++ {
+		_p_ := allp[i]
+		if _p_ == nil {
+			continue
+		}
+		pd := &pdesc[i]
+		s := _p_.status
+		if s == _Psyscall {
+			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
+			t := int64(_p_.syscalltick)
+			if int64(pd.syscalltick) != t {
+				pd.syscalltick = uint32(t)
+				pd.syscallwhen = now
+				continue
+			}
+			// On the one hand we don't want to retake Ps if there is no other work to do,
+			// but on the other hand we want to retake them eventually
+			// because they can prevent the sysmon thread from deep sleep.
+			if runqempty(_p_) && atomic.Load(&sched.nmspinning)+atomic.Load(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
+				continue
+			}
+			// Need to decrement number of idle locked M's
+			// (pretending that one more is running) before the CAS.
+			// Otherwise the M from which we retake can exit the syscall,
+			// increment nmidle and report deadlock.
+			incidlelocked(-1)
+			if atomic.Cas(&_p_.status, s, _Pidle) {
+				if trace.enabled {
+					traceGoSysBlock(_p_)
+					traceProcStop(_p_)
+				}
+				n++
+				_p_.syscalltick++
+				handoffp(_p_)
+			}
+			incidlelocked(1)
+		} else if s == _Prunning {
+			// Preempt G if it's running for too long.
+			t := int64(_p_.schedtick)
+			if int64(pd.schedtick) != t {
+				pd.schedtick = uint32(t)
+				pd.schedwhen = now
+				continue
+			}
+			if pd.schedwhen+forcePreemptNS > now {
+				continue
+			}
+			preemptone(_p_)
+		}
+	}
+	return uint32(n)
+}
+
+// Tell all goroutines that they have been preempted and they should stop.
+// This function is purely best-effort. It can fail to inform a goroutine if a
+// processor just started running it.
+// No locks need to be held.
+// Returns true if a preemption request was issued to at least one goroutine.
+func preemptall() bool {
+	res := false
+	for i := int32(0); i < gomaxprocs; i++ {
+		_p_ := allp[i]
+		if _p_ == nil || _p_.status != _Prunning {
+			continue
+		}
+		if preemptone(_p_) {
+			res = true
+		}
+	}
+	return res
+}
+
+// Tell the goroutine running on processor P to stop.
+// This function is purely best-effort. It can incorrectly fail to inform the
+// goroutine. It can inform the wrong goroutine. Even if it informs the
+// correct goroutine, that goroutine might ignore the request if it is
+// simultaneously executing newstack.
+// No lock needs to be held.
+// Returns true if a preemption request was issued.
+// The actual preemption will happen at some point in the future
+// and will be indicated by gp->status no longer being
+// Grunning.
+func preemptone(_p_ *p) bool {
+	mp := _p_.m.ptr()
+	if mp == nil || mp == getg().m {
+		return false
+	}
+	gp := mp.curg
+	if gp == nil || gp == mp.g0 {
+		return false
+	}
+
+	gp.preempt = true
+
+	// Every call in a goroutine checks for stack overflow by
+	// comparing the current stack pointer to gp->stackguard0.
+	// Setting gp->stackguard0 to StackPreempt folds
+	// preemption into the normal stack overflow check.
+	gp.stackguard0 = stackPreempt
+	return true
+}
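+
+// Why setting stackguard0 works, as Go-like pseudocode (the real check is
+// the compiler-emitted assembly prologue, not code in this file):
+//
+//	// prologue of every stack-checked function:
+//	if SP < getg().stackguard0 { // stackPreempt is a huge value, so this fires
+//		morestack() // newstack notices gp.preempt and yields instead of growing
+//	}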
+
+var starttime int64
+
+func schedtrace(detailed bool) {
+	now := nanotime()
+	if starttime == 0 {
+		starttime = now
+	}
+
+	lock(&sched.lock)
+	print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
+	if detailed {
+		print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
+	}
+	// We must be careful while reading data from P's, M's and G's.
+	// Even if we hold schedlock, most data can be changed concurrently.
+	// E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
+	for i := int32(0); i < gomaxprocs; i++ {
+		_p_ := allp[i]
+		if _p_ == nil {
+			continue
+		}
+		mp := _p_.m.ptr()
+		h := atomic.Load(&_p_.runqhead)
+		t := atomic.Load(&_p_.runqtail)
+		if detailed {
+			id := int32(-1)
+			if mp != nil {
+				id = mp.id
+			}
+			print("  P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n")
+		} else {
+			// In non-detailed mode, format the lengths of per-P run queues as:
+			// [len1 len2 len3 len4]
+			print(" ")
+			if i == 0 {
+				print("[")
+			}
+			print(t - h)
+			if i == gomaxprocs-1 {
+				print("]\n")
+			}
+		}
+	}
+
+	if !detailed {
+		unlock(&sched.lock)
+		return
+	}
+
+	for mp := allm; mp != nil; mp = mp.alllink {
+		_p_ := mp.p.ptr()
+		gp := mp.curg
+		lockedg := mp.lockedg
+		id1 := int32(-1)
+		if _p_ != nil {
+			id1 = _p_.id
+		}
+		id2 := int64(-1)
+		if gp != nil {
+			id2 = gp.goid
+		}
+		id3 := int64(-1)
+		if lockedg != nil {
+			id3 = lockedg.goid
+		}
+		print("  M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", mp.blocked, " lockedg=", id3, "\n")
+	}
+
+	lock(&allglock)
+	for gi := 0; gi < len(allgs); gi++ {
+		gp := allgs[gi]
+		mp := gp.m
+		lockedm := gp.lockedm
+		id1 := int32(-1)
+		if mp != nil {
+			id1 = mp.id
+		}
+		id2 := int32(-1)
+		if lockedm != nil {
+			id2 = lockedm.id
+		}
+		print("  G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason, ") m=", id1, " lockedm=", id2, "\n")
+	}
+	unlock(&allglock)
+	unlock(&sched.lock)
+}
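+
+// For reference, setting GODEBUG=schedtrace=1000 makes sysmon call this once
+// per second; the non-detailed output looks roughly like this (values
+// invented for illustration):
+//
+//	SCHED 6016ms: gomaxprocs=4 idleprocs=2 threads=7 spinningthreads=0 idlethreads=3 runqueue=1 [2 0 1 0]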
+
+// Put mp on the midle list.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func mput(mp *m) {
+	mp.schedlink = sched.midle
+	sched.midle.set(mp)
+	sched.nmidle++
+	checkdead()
+}
+
+// Try to get an m from the midle list.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func mget() *m {
+	mp := sched.midle.ptr()
+	if mp != nil {
+		sched.midle = mp.schedlink
+		sched.nmidle--
+	}
+	return mp
+}
+
+// Put gp on the global runnable queue.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func globrunqput(gp *g) {
+	gp.schedlink = 0
+	if sched.runqtail != 0 {
+		sched.runqtail.ptr().schedlink.set(gp)
+	} else {
+		sched.runqhead.set(gp)
+	}
+	sched.runqtail.set(gp)
+	sched.runqsize++
+}
+
+// Put gp at the head of the global runnable queue.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func globrunqputhead(gp *g) {
+	gp.schedlink = sched.runqhead
+	sched.runqhead.set(gp)
+	if sched.runqtail == 0 {
+		sched.runqtail.set(gp)
+	}
+	sched.runqsize++
+}
+
+// Put a batch of runnable goroutines on the global runnable queue.
+// Sched must be locked.
+func globrunqputbatch(ghead *g, gtail *g, n int32) {
+	gtail.schedlink = 0
+	if sched.runqtail != 0 {
+		sched.runqtail.ptr().schedlink.set(ghead)
+	} else {
+		sched.runqhead.set(ghead)
+	}
+	sched.runqtail.set(gtail)
+	sched.runqsize += n
+}
+
+// Try to get a batch of G's from the global runnable queue.
+// Sched must be locked.
+func globrunqget(_p_ *p, max int32) *g {
+	if sched.runqsize == 0 {
+		return nil
+	}
+
+	n := sched.runqsize/gomaxprocs + 1
+	if n > sched.runqsize {
+		n = sched.runqsize
+	}
+	if max > 0 && n > max {
+		n = max
+	}
+	if n > int32(len(_p_.runq))/2 {
+		n = int32(len(_p_.runq)) / 2
+	}
+
+	sched.runqsize -= n
+	if sched.runqsize == 0 {
+		sched.runqtail = 0
+	}
+
+	gp := sched.runqhead.ptr()
+	sched.runqhead = gp.schedlink
+	n--
+	for ; n > 0; n-- {
+		gp1 := sched.runqhead.ptr()
+		sched.runqhead = gp1.schedlink
+		runqput(_p_, gp1, false)
+	}
+	return gp
+}
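+
+// Worked example (illustrative numbers): with sched.runqsize=17, gomaxprocs=4,
+// max=0, and len(_p_.runq)=256, n = 17/4+1 = 5, which is within both runqsize
+// and len(runq)/2 = 128. One G is returned to run immediately and the other
+// four are moved onto _p_'s local queue.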
+
+// Put p on the _Pidle list.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func pidleput(_p_ *p) {
+	if !runqempty(_p_) {
+		throw("pidleput: P has non-empty run queue")
+	}
+	_p_.link = sched.pidle
+	sched.pidle.set(_p_)
+	atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic
+}
+
+// Try to get a p from the _Pidle list.
+// Sched must be locked.
+// May run during STW, so write barriers are not allowed.
+//go:nowritebarrier
+func pidleget() *p {
+	_p_ := sched.pidle.ptr()
+	if _p_ != nil {
+		sched.pidle = _p_.link
+		atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic
+	}
+	return _p_
+}
+
+// runqempty returns true if _p_ has no Gs on its local run queue.
+// It never returns true spuriously.
+func runqempty(_p_ *p) bool {
+	// Defend against a race where 1) _p_ has G1 in runqnext but runqhead == runqtail,
+	// 2) runqput on _p_ kicks G1 to the runq, 3) runqget on _p_ empties runqnext.
+	// Simply observing that runqhead == runqtail and then observing that runqnext == nil
+	// does not mean the queue is empty.
+	for {
+		head := atomic.Load(&_p_.runqhead)
+		tail := atomic.Load(&_p_.runqtail)
+		runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&_p_.runnext)))
+		if tail == atomic.Load(&_p_.runqtail) {
+			return head == tail && runnext == 0
+		}
+	}
+}
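+
+// Concretely (illustrative interleaving): say runqhead == runqtail == 7 and
+// G1 sits in runnext. A naive check could load head and tail ("empty"), then
+// a runqput on _p_ kicks G1 into runq[7] and a runqget clears runnext, and
+// the naive check finally loads runnext ("nil") - reporting an empty queue
+// even though G1 was runnable the whole time. Re-reading runqtail above
+// detects the movement and retries.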
+
+// To shake out latent assumptions about scheduling order,
+// we introduce some randomness into scheduling decisions
+// when running with the race detector.
+// The need for this was made obvious by changing the
+// (deterministic) scheduling order in Go 1.5 and breaking
+// many poorly-written tests.
+// With the randomness here, as long as the tests pass
+// consistently with -race, they shouldn't have latent scheduling
+// assumptions.
+const randomizeScheduler = raceenabled
+
+// runqput tries to put g on the local runnable queue.
+// If next is false, runqput adds g to the tail of the runnable queue.
+// If next is true, runqput puts g in the _p_.runnext slot.
+// If the run queue is full, runqput puts g on the global queue.
+// Executed only by the owner P.
+func runqput(_p_ *p, gp *g, next bool) {
+	if randomizeScheduler && next && fastrand1()%2 == 0 {
+		next = false
+	}
+
+	if next {
+	retryNext:
+		oldnext := _p_.runnext
+		if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
+			goto retryNext
+		}
+		if oldnext == 0 {
+			return
+		}
+		// Kick the old runnext out to the regular run queue.
+		gp = oldnext.ptr()
+	}
+
+retry:
+	h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
+	t := _p_.runqtail
+	if t-h < uint32(len(_p_.runq)) {
+		_p_.runq[t%uint32(len(_p_.runq))].set(gp)
+		atomic.Store(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
+		return
+	}
+	if runqputslow(_p_, gp, h, t) {
+		return
+	}
+	// the queue is not full, so the put above must succeed on retry
+	goto retry
+}
+
+// Put g and a batch of work from local runnable queue on global queue.
+// Executed only by the owner P.
+func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
+	var batch [len(_p_.runq)/2 + 1]*g
+
+	// First, grab a batch from local queue.
+	n := t - h
+	n = n / 2
+	if n != uint32(len(_p_.runq)/2) {
+		throw("runqputslow: queue is not full")
+	}
+	for i := uint32(0); i < n; i++ {
+		batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))].ptr()
+	}
+	if !atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+		return false
+	}
+	batch[n] = gp
+
+	if randomizeScheduler {
+		for i := uint32(1); i <= n; i++ {
+			j := fastrand1() % (i + 1)
+			batch[i], batch[j] = batch[j], batch[i]
+		}
+	}
+
+	// Link the goroutines.
+	for i := uint32(0); i < n; i++ {
+		batch[i].schedlink.set(batch[i+1])
+	}
+
+	// Now put the batch on global queue.
+	lock(&sched.lock)
+	globrunqputbatch(batch[0], batch[n], int32(n+1))
+	unlock(&sched.lock)
+	return true
+}
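+
+// Worked example (illustrative): with len(_p_.runq)=256 and a full local
+// queue (t-h = 256), n = 128, so 128 Gs plus gp itself (129 in total) are
+// linked together and moved to the global queue under one sched.lock
+// acquisition, leaving the local queue half full.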
+
+// Get g from local runnable queue.
+// If inheritTime is true, gp should inherit the remaining time in the
+// current time slice. Otherwise, it should start a new time slice.
+// Executed only by the owner P.
+func runqget(_p_ *p) (gp *g, inheritTime bool) {
+	// If there's a runnext, it's the next G to run.
+	for {
+		next := _p_.runnext
+		if next == 0 {
+			break
+		}
+		if _p_.runnext.cas(next, 0) {
+			return next.ptr(), true
+		}
+	}
+
+	for {
+		h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
+		t := _p_.runqtail
+		if t == h {
+			return nil, false
+		}
+		gp := _p_.runq[h%uint32(len(_p_.runq))].ptr()
+		if atomic.Cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
+			return gp, false
+		}
+	}
+}
+
+// Grabs a batch of goroutines from _p_'s runnable queue into batch.
+// Batch is a ring buffer starting at batchHead.
+// Returns number of grabbed goroutines.
+// Can be executed by any P.
+func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
+	for {
+		h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with other consumers
+		t := atomic.Load(&_p_.runqtail) // load-acquire, synchronize with the producer
+		n := t - h
+		n = n - n/2
+		if n == 0 {
+			if stealRunNextG {
+				// Try to steal from _p_.runnext.
+				if next := _p_.runnext; next != 0 {
+					// Sleep to ensure that _p_ isn't about to run the g we
+					// are about to steal.
+					// The important use case here is when the g running on _p_
+					// ready()s another g and then almost immediately blocks.
+					// Instead of stealing runnext in this window, back off
+					// to give _p_ a chance to schedule runnext. This will avoid
+					// thrashing gs between different Ps.
+					// A sync chan send/recv takes ~50ns as of the time of writing,
+					// so 3us gives ~50x overshoot.
+					if GOOS != "windows" {
+						usleep(3)
+					} else {
+						// On Windows, system timer granularity is 1-15ms,
+						// which is way too much for this optimization.
+						// So just yield.
+						osyield()
+					}
+					if !_p_.runnext.cas(next, 0) {
+						continue
+					}
+					batch[batchHead%uint32(len(batch))] = next
+					return 1
+				}
+			}
+			return 0
+		}
+		if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
+			continue
+		}
+		for i := uint32(0); i < n; i++ {
+			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
+			batch[(batchHead+i)%uint32(len(batch))] = g
+		}
+		if atomic.Cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
+			return n
+		}
+	}
+}
+
+// Steal half of the elements from p2's local runnable queue
+// and put them onto _p_'s local runnable queue.
+// Returns one of the stolen elements (or nil if the steal failed).
+func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
+	t := _p_.runqtail
+	n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
+	if n == 0 {
+		return nil
+	}
+	n--
+	gp := _p_.runq[(t+n)%uint32(len(_p_.runq))].ptr()
+	if n == 0 {
+		return gp
+	}
+	h := atomic.Load(&_p_.runqhead) // load-acquire, synchronize with consumers
+	if t-h+n >= uint32(len(_p_.runq)) {
+		throw("runqsteal: runq overflow")
+	}
+	atomic.Store(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
+	return gp
+}
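+
+// Worked example (illustrative): if p2 has runqhead=10 and runqtail=18 (8
+// queued Gs), runqgrab takes n = 8 - 8/2 = 4 of them. runqsteal returns the
+// last grabbed G directly and publishes the remaining three on _p_'s queue
+// by advancing _p_.runqtail by 3.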
+
+//go:linkname setMaxThreads runtime/debug.setMaxThreads
+func setMaxThreads(in int) (out int) {
+	lock(&sched.lock)
+	out = int(sched.maxmcount)
+	sched.maxmcount = int32(in)
+	checkmcount()
+	unlock(&sched.lock)
+	return
+}
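+
+// User code reaches this through the runtime/debug wrapper; for example
+// (illustrative):
+//
+//	old := debug.SetMaxThreads(20000) // raise the default 10000-thread limit
+//	defer debug.SetMaxThreads(old)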
+
+func haveexperiment(name string) bool {
+	if name == "framepointer" {
+		return framepointer_enabled // set by linker
+	}
+	x := sys.Goexperiment
+	for x != "" {
+		xname := ""
+		i := index(x, ",")
+		if i < 0 {
+			xname, x = x, ""
+		} else {
+			xname, x = x[:i], x[i+1:]
+		}
+		if xname == name {
+			return true
+		}
+		if len(xname) > 2 && xname[:2] == "no" && xname[2:] == name {
+			return false
+		}
+	}
+	return false
+}
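+
+// Illustrative behavior, using a hypothetical experiment name "foo"
+// (GOEXPERIMENT is baked into sys.Goexperiment when the toolchain is built):
+//
+//	GOEXPERIMENT=foo,bar -> haveexperiment("foo") == true
+//	GOEXPERIMENT=nofoo   -> haveexperiment("foo") == false
+//	GOEXPERIMENT=bar     -> haveexperiment("foo") == false (not mentioned)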
+
+//go:nosplit
+func procPin() int {
+	_g_ := getg()
+	mp := _g_.m
+
+	mp.locks++
+	return int(mp.p.ptr().id)
+}
+
+//go:nosplit
+func procUnpin() {
+	_g_ := getg()
+	_g_.m.locks--
+}
+
+//go:linkname sync_runtime_procPin sync.runtime_procPin
+//go:nosplit
+func sync_runtime_procPin() int {
+	return procPin()
+}
+
+//go:linkname sync_runtime_procUnpin sync.runtime_procUnpin
+//go:nosplit
+func sync_runtime_procUnpin() {
+	procUnpin()
+}
+
+//go:linkname sync_atomic_runtime_procPin sync/atomic.runtime_procPin
+//go:nosplit
+func sync_atomic_runtime_procPin() int {
+	return procPin()
+}
+
+//go:linkname sync_atomic_runtime_procUnpin sync/atomic.runtime_procUnpin
+//go:nosplit
+func sync_atomic_runtime_procUnpin() {
+	procUnpin()
+}
+
+// Active spinning for sync.Mutex.
+//go:linkname sync_runtime_canSpin sync.runtime_canSpin
+//go:nosplit
+func sync_runtime_canSpin(i int) bool {
+	// sync.Mutex is cooperative, so we are conservative with spinning.
+	// Spin only a few times and only if running on a multicore machine,
+	// GOMAXPROCS>1, there is at least one other running P, and the local runq is empty.
+	// As opposed to runtime mutex we don't do passive spinning here,
+	// because there can be work on the global runq or on other Ps.
+	if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 {
+		return false
+	}
+	if p := getg().m.p.ptr(); !runqempty(p) {
+		return false
+	}
+	return true
+}
+
+//go:linkname sync_runtime_doSpin sync.runtime_doSpin
+//go:nosplit
+func sync_runtime_doSpin() {
+	procyield(active_spin_cnt)
+}
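+
+// A minimal sketch of how a sync.Mutex-style slow path could consume this
+// pair (not sync's actual source; m.state, locked/unlocked, and the semaphore
+// fallback are assumed names):
+//
+//	iter := 0
+//	for !atomic.CompareAndSwapInt32(&m.state, unlocked, locked) {
+//		if runtime_canSpin(iter) { // resolved by the linkname above
+//			runtime_doSpin() // procyield: a short burst of PAUSE instructions
+//			iter++
+//			continue
+//		}
+//		iter = 0
+//		// give up spinning and block on a semaphore instead
+//	}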
+
+var stealOrder randomOrder
+
+// randomOrder/randomEnum are helper types for randomized work stealing.
+// They allow enumerating all Ps in different pseudo-random orders without repetition.
+// The algorithm is based on the fact that if we have an X such that X and GOMAXPROCS
+// are coprime, then the sequence (i + X) % GOMAXPROCS gives the required enumeration.
+type randomOrder struct {
+	count    uint32
+	coprimes []uint32
+}
+
+type randomEnum struct {
+	i     uint32
+	count uint32
+	pos   uint32
+	inc   uint32
+}
+
+func (ord *randomOrder) reset(count uint32) {
+	ord.count = count
+	ord.coprimes = ord.coprimes[:0]
+	for i := uint32(1); i <= count; i++ {
+		if gcd(i, count) == 1 {
+			ord.coprimes = append(ord.coprimes, i)
+		}
+	}
+}
+
+func (ord *randomOrder) start(i uint32) randomEnum {
+	return randomEnum{
+		count: ord.count,
+		pos:   i % ord.count,
+		inc:   ord.coprimes[i%uint32(len(ord.coprimes))],
+	}
+}
+
+func (enum *randomEnum) done() bool {
+	return enum.i == enum.count
+}
+
+func (enum *randomEnum) next() {
+	enum.i++
+	enum.pos = (enum.pos + enum.inc) % enum.count
+}
+
+func (enum *randomEnum) position() uint32 {
+	return enum.pos
+}
+
+func gcd(a, b uint32) uint32 {
+	for b != 0 {
+		a, b = b, a%b
+	}
+	return a
+}
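+
+// Worked example (illustrative): for count=6, reset computes coprimes {1, 5}.
+// start(3) yields pos=3, inc=coprimes[3%2]=5, and successive next calls step
+// through
+//
+//	3, 2, 1, 0, 5, 4
+//
+// visiting all six Ps exactly once before done reports true.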
diff --git a/src/runtime/proc1.go b/src/runtime/proc1.go
deleted file mode 100644
index 09cb775..0000000
--- a/src/runtime/proc1.go
+++ /dev/null
@@ -1,3725 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-var (
-	m0 m
-	g0 g
-)
-
-// Goroutine scheduler
-// The scheduler's job is to distribute ready-to-run goroutines over worker threads.
-//
-// The main concepts are:
-// G - goroutine.
-// M - worker thread, or machine.
-// P - processor, a resource that is required to execute Go code.
-//     M must have an associated P to execute Go code, however it can be
-//     blocked or in a syscall w/o an associated P.
-//
-// Design doc at https://golang.org/s/go11sched.
-
-const (
-	// Number of goroutine ids to grab from sched.goidgen to local per-P cache at once.
-	// 16 seems to provide enough amortization, but other than that it's a mostly arbitrary number.
-	_GoidCacheBatch = 16
-)
-
-// The bootstrap sequence is:
-//
-//	call osinit
-//	call schedinit
-//	make & queue new G
-//	call runtime·mstart
-//
-// The new G calls runtime·main.
-func schedinit() {
-	// raceinit must be the first call to race detector.
-	// In particular, it must be done before mallocinit below calls racemapshadow.
-	_g_ := getg()
-	if raceenabled {
-		_g_.racectx = raceinit()
-	}
-
-	sched.maxmcount = 10000
-
-	// Cache the framepointer experiment.  This affects stack unwinding.
-	framepointer_enabled = haveexperiment("framepointer")
-
-	tracebackinit()
-	moduledataverify()
-	stackinit()
-	mallocinit()
-	mcommoninit(_g_.m)
-
-	goargs()
-	goenvs()
-	parsedebugvars()
-	gcinit()
-
-	sched.lastpoll = uint64(nanotime())
-	procs := int(ncpu)
-	if n := atoi(gogetenv("GOMAXPROCS")); n > 0 {
-		if n > _MaxGomaxprocs {
-			n = _MaxGomaxprocs
-		}
-		procs = n
-	}
-	if procresize(int32(procs)) != nil {
-		throw("unknown runnable goroutine during bootstrap")
-	}
-
-	if buildVersion == "" {
-		// Condition should never trigger.  This code just serves
-		// to ensure runtime·buildVersion is kept in the resulting binary.
-		buildVersion = "unknown"
-	}
-}
-
-func dumpgstatus(gp *g) {
-	_g_ := getg()
-	print("runtime: gp: gp=", gp, ", goid=", gp.goid, ", gp->atomicstatus=", readgstatus(gp), "\n")
-	print("runtime:  g:  g=", _g_, ", goid=", _g_.goid, ",  g->atomicstatus=", readgstatus(_g_), "\n")
-}
-
-func checkmcount() {
-	// sched lock is held
-	if sched.mcount > sched.maxmcount {
-		print("runtime: program exceeds ", sched.maxmcount, "-thread limit\n")
-		throw("thread exhaustion")
-	}
-}
-
-func mcommoninit(mp *m) {
-	_g_ := getg()
-
-	// g0 stack won't make sense for user (and is not necessarily unwindable).
-	if _g_ != _g_.m.g0 {
-		callers(1, mp.createstack[:])
-	}
-
-	mp.fastrand = 0x49f6428a + uint32(mp.id) + uint32(cputicks())
-	if mp.fastrand == 0 {
-		mp.fastrand = 0x49f6428a
-	}
-
-	lock(&sched.lock)
-	mp.id = sched.mcount
-	sched.mcount++
-	checkmcount()
-	mpreinit(mp)
-	if mp.gsignal != nil {
-		mp.gsignal.stackguard1 = mp.gsignal.stack.lo + _StackGuard
-	}
-
-	// Add to allm so garbage collector doesn't free g->m
-	// when it is just in a register or thread-local storage.
-	mp.alllink = allm
-
-	// NumCgoCall() iterates over allm w/o schedlock,
-	// so we need to publish it safely.
-	atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
-	unlock(&sched.lock)
-}
-
-// Mark gp ready to run.
-func ready(gp *g, traceskip int) {
-	if trace.enabled {
-		traceGoUnpark(gp, traceskip)
-	}
-
-	status := readgstatus(gp)
-
-	// Mark runnable.
-	_g_ := getg()
-	_g_.m.locks++ // disable preemption because it can be holding p in a local var
-	if status&^_Gscan != _Gwaiting {
-		dumpgstatus(gp)
-		throw("bad g->status in ready")
-	}
-
-	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
-	casgstatus(gp, _Gwaiting, _Grunnable)
-	runqput(_g_.m.p.ptr(), gp, true)
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 { // TODO: fast atomic
-		wakep()
-	}
-	_g_.m.locks--
-	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
-		_g_.stackguard0 = stackPreempt
-	}
-}
-
-func gcprocs() int32 {
-	// Figure out how many CPUs to use during GC.
-	// Limited by gomaxprocs, number of actual CPUs, and MaxGcproc.
-	lock(&sched.lock)
-	n := gomaxprocs
-	if n > ncpu {
-		n = ncpu
-	}
-	if n > _MaxGcproc {
-		n = _MaxGcproc
-	}
-	if n > sched.nmidle+1 { // one M is currently running
-		n = sched.nmidle + 1
-	}
-	unlock(&sched.lock)
-	return n
-}
-
-func needaddgcproc() bool {
-	lock(&sched.lock)
-	n := gomaxprocs
-	if n > ncpu {
-		n = ncpu
-	}
-	if n > _MaxGcproc {
-		n = _MaxGcproc
-	}
-	n -= sched.nmidle + 1 // one M is currently running
-	unlock(&sched.lock)
-	return n > 0
-}
-
-func helpgc(nproc int32) {
-	_g_ := getg()
-	lock(&sched.lock)
-	pos := 0
-	for n := int32(1); n < nproc; n++ { // one M is currently running
-		if allp[pos].mcache == _g_.m.mcache {
-			pos++
-		}
-		mp := mget()
-		if mp == nil {
-			throw("gcprocs inconsistency")
-		}
-		mp.helpgc = n
-		mp.p.set(allp[pos])
-		mp.mcache = allp[pos].mcache
-		pos++
-		notewakeup(&mp.park)
-	}
-	unlock(&sched.lock)
-}
-
-// freezeStopWait is a large value that freezetheworld sets
-// sched.stopwait to in order to request that all Gs permanently stop.
-const freezeStopWait = 0x7fffffff
-
-// Similar to stopTheWorld but best-effort and can be called several times.
-// There is no reverse operation, used during crashing.
-// This function must not lock any mutexes.
-func freezetheworld() {
-	// stopwait and preemption requests can be lost
-	// due to races with concurrently executing threads,
-	// so try several times
-	for i := 0; i < 5; i++ {
-		// this should tell the scheduler to not start any new goroutines
-		sched.stopwait = freezeStopWait
-		atomicstore(&sched.gcwaiting, 1)
-		// this should stop running goroutines
-		if !preemptall() {
-			break // no running goroutines
-		}
-		usleep(1000)
-	}
-	// to be sure
-	usleep(1000)
-	preemptall()
-	usleep(1000)
-}
-
-func isscanstatus(status uint32) bool {
-	if status == _Gscan {
-		throw("isscanstatus: Bad status Gscan")
-	}
-	return status&_Gscan == _Gscan
-}
-
-// All reads and writes of g's status go through readgstatus, casgstatus
-// castogscanstatus, casfrom_Gscanstatus.
-//go:nosplit
-func readgstatus(gp *g) uint32 {
-	return atomicload(&gp.atomicstatus)
-}
-
-// Ownership of gscanvalid:
-//
-// If gp is running (meaning status == _Grunning or _Grunning|_Gscan),
-// then gp owns gp.gscanvalid, and other goroutines must not modify it.
-//
-// Otherwise, a second goroutine can lock the scan state by setting _Gscan
-// in the status bit and then modify gscanvalid, and then unlock the scan state.
-//
-// Note that the first condition implies an exception to the second:
-// if a second goroutine changes gp's status to _Grunning|_Gscan,
-// that second goroutine still does not have the right to modify gscanvalid.
-
-// The Gscanstatuses are acting like locks and this releases them.
-// If it proves to be a performance hit we should be able to make these
-// simple atomic stores but for now we are going to throw if
-// we see an inconsistent state.
-func casfrom_Gscanstatus(gp *g, oldval, newval uint32) {
-	success := false
-
-	// Check that transition is valid.
-	switch oldval {
-	default:
-		print("runtime: casfrom_Gscanstatus bad oldval gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
-		dumpgstatus(gp)
-		throw("casfrom_Gscanstatus:top gp->status is not in scan state")
-	case _Gscanrunnable,
-		_Gscanwaiting,
-		_Gscanrunning,
-		_Gscansyscall:
-		if newval == oldval&^_Gscan {
-			success = cas(&gp.atomicstatus, oldval, newval)
-		}
-	case _Gscanenqueue:
-		if newval == _Gwaiting {
-			success = cas(&gp.atomicstatus, oldval, newval)
-		}
-	}
-	if !success {
-		print("runtime: casfrom_Gscanstatus failed gp=", gp, ", oldval=", hex(oldval), ", newval=", hex(newval), "\n")
-		dumpgstatus(gp)
-		throw("casfrom_Gscanstatus: gp->status is not in scan state")
-	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
-	}
-}
-
-// This will return false if the gp is not in the expected status and the cas fails.
-// This acts like a lock acquire while the casfromgstatus acts like a lock release.
-func castogscanstatus(gp *g, oldval, newval uint32) bool {
-	switch oldval {
-	case _Grunnable,
-		_Gwaiting,
-		_Gsyscall:
-		if newval == oldval|_Gscan {
-			return cas(&gp.atomicstatus, oldval, newval)
-		}
-	case _Grunning:
-		if newval == _Gscanrunning || newval == _Gscanenqueue {
-			return cas(&gp.atomicstatus, oldval, newval)
-		}
-	}
-	print("runtime: castogscanstatus oldval=", hex(oldval), " newval=", hex(newval), "\n")
-	throw("castogscanstatus")
-	panic("not reached")
-}
-
-// If asked to move to or from a Gscanstatus this will throw. Use the castogscanstatus
-// and casfrom_Gscanstatus instead.
-// casgstatus will loop if the g->atomicstatus is in a Gscan status until the routine that
-// put it in the Gscan state is finished.
-//go:nosplit
-func casgstatus(gp *g, oldval, newval uint32) {
-	if (oldval&_Gscan != 0) || (newval&_Gscan != 0) || oldval == newval {
-		systemstack(func() {
-			print("runtime: casgstatus: oldval=", hex(oldval), " newval=", hex(newval), "\n")
-			throw("casgstatus: bad incoming values")
-		})
-	}
-
-	if oldval == _Grunning && gp.gcscanvalid {
-		// If oldval == _Grunning, then the actual status must be
-		// _Grunning or _Grunning|_Gscan; either way,
-		// we own gp.gcscanvalid, so it's safe to read.
-		// gp.gcscanvalid must not be true when we are running.
-		print("runtime: casgstatus ", hex(oldval), "->", hex(newval), " gp.status=", hex(gp.atomicstatus), " gp.gcscanvalid=true\n")
-		throw("casgstatus")
-	}
-
-	// loop if gp->atomicstatus is in a scan state giving
-	// GC time to finish and change the state to oldval.
-	for !cas(&gp.atomicstatus, oldval, newval) {
-		if oldval == _Gwaiting && gp.atomicstatus == _Grunnable {
-			systemstack(func() {
-				throw("casgstatus: waiting for Gwaiting but is Grunnable")
-			})
-		}
-		// Help GC if needed.
-		// if gp.preemptscan && !gp.gcworkdone && (oldval == _Grunning || oldval == _Gsyscall) {
-		// 	gp.preemptscan = false
-		// 	systemstack(func() {
-		// 		gcphasework(gp)
-		// 	})
-		// }
-	}
-	if newval == _Grunning {
-		gp.gcscanvalid = false
-	}
-}
-
-// casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
-// Returns old status. Cannot call casgstatus directly, because we are racing with an
-// async wakeup that might come in from netpoll. If we see Gwaiting from the readgstatus,
-// it might have become Grunnable by the time we get to the cas. If we called casgstatus,
-// it would loop waiting for the status to go back to Gwaiting, which it never will.
-//go:nosplit
-func casgcopystack(gp *g) uint32 {
-	for {
-		oldstatus := readgstatus(gp) &^ _Gscan
-		if oldstatus != _Gwaiting && oldstatus != _Grunnable {
-			throw("copystack: bad status, not Gwaiting or Grunnable")
-		}
-		if cas(&gp.atomicstatus, oldstatus, _Gcopystack) {
-			return oldstatus
-		}
-	}
-}
-
-// scang blocks until gp's stack has been scanned.
-// It might be scanned by scang or it might be scanned by the goroutine itself.
-// Either way, the stack scan has completed when scang returns.
-func scang(gp *g) {
-	// Invariant: we (the caller, markroot for a specific goroutine) own gp.gcscandone.
-	// Nothing is racing with us now, but gcscandone might be set to true left over
-	// from an earlier round of stack scanning (we scan twice per GC).
-	// We use gcscandone to record whether the scan has been done during this round.
-	// It is important that the scan happens exactly once: if called twice,
-	// the installation of stack barriers will detect the double scan and die.
-
-	gp.gcscandone = false
-
-	// Endeavor to get gcscandone set to true,
-	// either by doing the stack scan ourselves or by coercing gp to scan itself.
-	// gp.gcscandone can transition from false to true when we're not looking
-	// (if we asked for preemption), so any time we lock the status using
-	// castogscanstatus we have to double-check that the scan is still not done.
-	for !gp.gcscandone {
-		switch s := readgstatus(gp); s {
-		default:
-			dumpgstatus(gp)
-			throw("stopg: invalid status")
-
-		case _Gdead:
-			// No stack.
-			gp.gcscandone = true
-
-		case _Gcopystack:
-			// Stack being switched. Go around again.
-
-		case _Grunnable, _Gsyscall, _Gwaiting:
-			// Claim goroutine by setting scan bit.
-			// Racing with execution or readying of gp.
-			// The scan bit keeps them from running
-			// the goroutine until we're done.
-			if castogscanstatus(gp, s, s|_Gscan) {
-				if !gp.gcscandone {
-					// Coordinate with traceback
-					// in sigprof.
-					for !cas(&gp.stackLock, 0, 1) {
-						osyield()
-					}
-					scanstack(gp)
-					atomicstore(&gp.stackLock, 0)
-					gp.gcscandone = true
-				}
-				restartg(gp)
-			}
-
-		case _Gscanwaiting:
-			// newstack is doing a scan for us right now. Wait.
-
-		case _Grunning:
-			// Goroutine running. Try to preempt execution so it can scan itself.
-			// The preemption handler (in newstack) does the actual scan.
-
-			// Optimization: if there is already a pending preemption request
-			// (from the previous loop iteration), don't bother with the atomics.
-			if gp.preemptscan && gp.preempt && gp.stackguard0 == stackPreempt {
-				break
-			}
-
-			// Ask for preemption and self scan.
-			if castogscanstatus(gp, _Grunning, _Gscanrunning) {
-				if !gp.gcscandone {
-					gp.preemptscan = true
-					gp.preempt = true
-					gp.stackguard0 = stackPreempt
-				}
-				casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
-			}
-		}
-	}
-
-	gp.preemptscan = false // cancel scan request if no longer needed
-}
-
-// The GC requests that this routine be moved from a scanmumble state to a mumble state.
-func restartg(gp *g) {
-	s := readgstatus(gp)
-	switch s {
-	default:
-		dumpgstatus(gp)
-		throw("restartg: unexpected status")
-
-	case _Gdead:
-		// ok
-
-	case _Gscanrunnable,
-		_Gscanwaiting,
-		_Gscansyscall:
-		casfrom_Gscanstatus(gp, s, s&^_Gscan)
-
-	// Scan is now completed.
-	// Goroutine now needs to be made runnable.
-	// We put it on the global run queue; ready blocks on the global scheduler lock.
-	case _Gscanenqueue:
-		casfrom_Gscanstatus(gp, _Gscanenqueue, _Gwaiting)
-		if gp != getg().m.curg {
-			throw("processing Gscanenqueue on wrong m")
-		}
-		dropg()
-		ready(gp, 0)
-	}
-}
-
-// stopTheWorld stops all P's from executing goroutines, interrupting
-// all goroutines at GC safe points and records reason as the reason
-// for the stop. On return, only the current goroutine's P is running.
-// stopTheWorld must not be called from a system stack and the caller
-// must not hold worldsema. The caller must call startTheWorld when
-// other P's should resume execution.
-//
-// stopTheWorld is safe for multiple goroutines to call at the
-// same time. Each will execute its own stop, and the stops will
-// be serialized.
-//
-// This is also used by routines that do stack dumps. If the system is
-// in panic or being exited, this may not reliably stop all
-// goroutines.
-func stopTheWorld(reason string) {
-	semacquire(&worldsema, false)
-	getg().m.preemptoff = reason
-	systemstack(stopTheWorldWithSema)
-}
-
-// startTheWorld undoes the effects of stopTheWorld.
-func startTheWorld() {
-	systemstack(startTheWorldWithSema)
-	// worldsema must be held over startTheWorldWithSema to ensure
-	// gomaxprocs cannot change while worldsema is held.
-	semrelease(&worldsema)
-	getg().m.preemptoff = ""
-}
-
-// Holding worldsema grants an M the right to try to stop the world
-// and prevents gomaxprocs from changing concurrently.
-var worldsema uint32 = 1
-
-// stopTheWorldWithSema is the core implementation of stopTheWorld.
-// The caller is responsible for acquiring worldsema and disabling
-// preemption first and then should call stopTheWorldWithSema on the system
-// stack:
-//
-//	semacquire(&worldsema, false)
-//	m.preemptoff = "reason"
-//	systemstack(stopTheWorldWithSema)
-//
-// When finished, the caller must either call startTheWorld or undo
-// these three operations separately:
-//
-//	m.preemptoff = ""
-//	systemstack(startTheWorldWithSema)
-//	semrelease(&worldsema)
-//
-// It is allowed to acquire worldsema once and then execute multiple
-// startTheWorldWithSema/stopTheWorldWithSema pairs.
-// Other P's are able to execute between successive calls to
-// startTheWorldWithSema and stopTheWorldWithSema.
-// Holding worldsema causes any other goroutines invoking
-// stopTheWorld to block.
-func stopTheWorldWithSema() {
-	_g_ := getg()
-
-	// If we hold a lock, then we won't be able to stop another M
-	// that is blocked trying to acquire the lock.
-	if _g_.m.locks > 0 {
-		throw("stopTheWorld: holding locks")
-	}
-
-	lock(&sched.lock)
-	sched.stopwait = gomaxprocs
-	atomicstore(&sched.gcwaiting, 1)
-	preemptall()
-	// stop current P
-	_g_.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
-	sched.stopwait--
-	// try to retake all P's in Psyscall status
-	for i := 0; i < int(gomaxprocs); i++ {
-		p := allp[i]
-		s := p.status
-		if s == _Psyscall && cas(&p.status, s, _Pgcstop) {
-			if trace.enabled {
-				traceGoSysBlock(p)
-				traceProcStop(p)
-			}
-			p.syscalltick++
-			sched.stopwait--
-		}
-	}
-	// stop idle P's
-	for {
-		p := pidleget()
-		if p == nil {
-			break
-		}
-		p.status = _Pgcstop
-		sched.stopwait--
-	}
-	wait := sched.stopwait > 0
-	unlock(&sched.lock)
-
-	// wait for remaining P's to stop voluntarily
-	if wait {
-		for {
-			// wait for 100us, then try to re-preempt in case of any races
-			if notetsleep(&sched.stopnote, 100*1000) {
-				noteclear(&sched.stopnote)
-				break
-			}
-			preemptall()
-		}
-	}
-	if sched.stopwait != 0 {
-		throw("stopTheWorld: not stopped")
-	}
-	for i := 0; i < int(gomaxprocs); i++ {
-		p := allp[i]
-		if p.status != _Pgcstop {
-			throw("stopTheWorld: not stopped")
-		}
-	}
-}
-
-func mhelpgc() {
-	_g_ := getg()
-	_g_.m.helpgc = -1
-}
-
-func startTheWorldWithSema() {
-	_g_ := getg()
-
-	_g_.m.locks++        // disable preemption because it can be holding p in a local var
-	gp := netpoll(false) // non-blocking
-	injectglist(gp)
-	add := needaddgcproc()
-	lock(&sched.lock)
-
-	procs := gomaxprocs
-	if newprocs != 0 {
-		procs = newprocs
-		newprocs = 0
-	}
-	p1 := procresize(procs)
-	sched.gcwaiting = 0
-	if sched.sysmonwait != 0 {
-		sched.sysmonwait = 0
-		notewakeup(&sched.sysmonnote)
-	}
-	unlock(&sched.lock)
-
-	for p1 != nil {
-		p := p1
-		p1 = p1.link.ptr()
-		if p.m != 0 {
-			mp := p.m.ptr()
-			p.m = 0
-			if mp.nextp != 0 {
-				throw("startTheWorld: inconsistent mp->nextp")
-			}
-			mp.nextp.set(p)
-			notewakeup(&mp.park)
-		} else {
-			// Start M to run P.  Do not start another M below.
-			newm(nil, p)
-			add = false
-		}
-	}
-
-	// Wakeup an additional proc in case we have excessive runnable goroutines
-	// in local queues or in the global queue. If we don't, the proc will park itself.
-	// If we have lots of excessive work, resetspinning will unpark additional procs as necessary.
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 {
-		wakep()
-	}
-
-	if add {
-		// If GC could have used another helper proc, start one now,
-		// in the hope that it will be available next time.
-		// It would have been even better to start it before the collection,
-		// but doing so requires allocating memory, so it's tricky to
-		// coordinate.  This lazy approach works out in practice:
-		// we don't mind if the first couple gc rounds don't have quite
-		// the maximum number of procs.
-		newm(mhelpgc, nil)
-	}
-	_g_.m.locks--
-	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
-		_g_.stackguard0 = stackPreempt
-	}
-}
-
-// Called to start an M.
-//go:nosplit
-func mstart() {
-	_g_ := getg()
-
-	if _g_.stack.lo == 0 {
-		// Initialize stack bounds from system stack.
-		// Cgo may have left stack size in stack.hi.
-		size := _g_.stack.hi
-		if size == 0 {
-			size = 8192 * stackGuardMultiplier
-		}
-		_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&size)))
-		_g_.stack.lo = _g_.stack.hi - size + 1024
-	}
-	// Initialize stack guards so that we can start calling
-	// both Go and C functions with stack growth prologues.
-	_g_.stackguard0 = _g_.stack.lo + _StackGuard
-	_g_.stackguard1 = _g_.stackguard0
-	mstart1()
-}
-
-func mstart1() {
-	_g_ := getg()
-
-	if _g_ != _g_.m.g0 {
-		throw("bad runtime·mstart")
-	}
-
-	// Record top of stack for use by mcall.
-	// Once we call schedule we're never coming back,
-	// so other calls can reuse this stack space.
-	gosave(&_g_.m.g0.sched)
-	_g_.m.g0.sched.pc = ^uintptr(0) // make sure it is never used
-	asminit()
-	minit()
-
-	// Install signal handlers; after minit so that minit can
-	// prepare the thread to be able to handle the signals.
-	if _g_.m == &m0 {
-		// Create an extra M for callbacks on threads not created by Go.
-		if iscgo && !cgoHasExtraM {
-			cgoHasExtraM = true
-			newextram()
-		}
-		initsig()
-	}
-
-	if fn := _g_.m.mstartfn; fn != nil {
-		fn()
-	}
-
-	if _g_.m.helpgc != 0 {
-		_g_.m.helpgc = 0
-		stopm()
-	} else if _g_.m != &m0 {
-		acquirep(_g_.m.nextp.ptr())
-		_g_.m.nextp = 0
-	}
-	schedule()
-}
-
-// forEachP calls fn(p) for every P p when p reaches a GC safe point.
-// If a P is currently executing code, this will bring the P to a GC
-// safe point and execute fn on that P. If the P is not executing code
-// (it is idle or in a syscall), this will call fn(p) directly while
-// preventing the P from exiting its state. This does not ensure that
-// fn will run on every CPU executing Go code, but it acts as a global
-// memory barrier. GC uses this as a "ragged barrier."
-//
-// The caller must hold worldsema.
-func forEachP(fn func(*p)) {
-	mp := acquirem()
-	_p_ := getg().m.p.ptr()
-
-	lock(&sched.lock)
-	if sched.safePointWait != 0 {
-		throw("forEachP: sched.safePointWait != 0")
-	}
-	sched.safePointWait = gomaxprocs - 1
-	sched.safePointFn = fn
-
-	// Ask all Ps to run the safe point function.
-	for _, p := range allp[:gomaxprocs] {
-		if p != _p_ {
-			atomicstore(&p.runSafePointFn, 1)
-		}
-	}
-	preemptall()
-
-	// Any P entering _Pidle or _Psyscall from now on will observe
-	// p.runSafePointFn == 1 and will call runSafePointFn when
-	// changing its status to _Pidle/_Psyscall.
-
-	// Run safe point function for all idle Ps. sched.pidle will
-	// not change because we hold sched.lock.
-	for p := sched.pidle.ptr(); p != nil; p = p.link.ptr() {
-		if cas(&p.runSafePointFn, 1, 0) {
-			fn(p)
-			sched.safePointWait--
-		}
-	}
-
-	wait := sched.safePointWait > 0
-	unlock(&sched.lock)
-
-	// Run fn for the current P.
-	fn(_p_)
-
-	// Force Ps currently in _Psyscall into _Pidle and hand them
-	// off to induce safe point function execution.
-	for i := 0; i < int(gomaxprocs); i++ {
-		p := allp[i]
-		s := p.status
-		if s == _Psyscall && p.runSafePointFn == 1 && cas(&p.status, s, _Pidle) {
-			if trace.enabled {
-				traceGoSysBlock(p)
-				traceProcStop(p)
-			}
-			p.syscalltick++
-			handoffp(p)
-		}
-	}
-
-	// Wait for remaining Ps to run fn.
-	if wait {
-		for {
-			// Wait for 100us, then try to re-preempt in
-			// case of any races.
-			if notetsleep(&sched.safePointNote, 100*1000) {
-				noteclear(&sched.safePointNote)
-				break
-			}
-			preemptall()
-		}
-	}
-	if sched.safePointWait != 0 {
-		throw("forEachP: not done")
-	}
-	for i := 0; i < int(gomaxprocs); i++ {
-		p := allp[i]
-		if p.runSafePointFn != 0 {
-			throw("forEachP: P did not run fn")
-		}
-	}
-
-	lock(&sched.lock)
-	sched.safePointFn = nil
-	unlock(&sched.lock)
-	releasem(mp)
-}
-
-// runSafePointFn runs the safe point function, if any, for this P.
-// This should be called like
-//
-//     if getg().m.p.runSafePointFn != 0 {
-//         runSafePointFn()
-//     }
-//
-// runSafePointFn must be checked on any transition in to _Pidle or
-// _Psyscall to avoid a race where forEachP sees that the P is running
-// just before the P goes into _Pidle/_Psyscall and neither forEachP
-// nor the P run the safe-point function.
-func runSafePointFn() {
-	p := getg().m.p.ptr()
-	// Resolve the race between forEachP running the safe-point
-	// function on this P's behalf and this P running the
-	// safe-point function directly.
-	if !cas(&p.runSafePointFn, 1, 0) {
-		return
-	}
-	sched.safePointFn(p)
-	lock(&sched.lock)
-	sched.safePointWait--
-	if sched.safePointWait == 0 {
-		notewakeup(&sched.safePointNote)
-	}
-	unlock(&sched.lock)
-}
-
-// When running with cgo, we call _cgo_thread_start
-// to start threads for us so that we can play nicely with
-// foreign code.
-var cgoThreadStart unsafe.Pointer
-
-type cgothreadstart struct {
-	g   guintptr
-	tls *uint64
-	fn  unsafe.Pointer
-}
-
-// Allocate a new m unassociated with any thread.
-// Can use p for allocation context if needed.
-// fn is recorded as the new m's m.mstartfn.
-func allocm(_p_ *p, fn func()) *m {
-	_g_ := getg()
-	_g_.m.locks++ // disable GC because it can be called from sysmon
-	if _g_.m.p == 0 {
-		acquirep(_p_) // temporarily borrow p for mallocs in this function
-	}
-	mp := new(m)
-	mp.mstartfn = fn
-	mcommoninit(mp)
-
-	// In case of cgo or Solaris, pthread_create will make us a stack.
-	// Windows and Plan 9 will lay out the sched stack on the OS stack.
-	if iscgo || GOOS == "solaris" || GOOS == "windows" || GOOS == "plan9" {
-		mp.g0 = malg(-1)
-	} else {
-		mp.g0 = malg(8192 * stackGuardMultiplier)
-	}
-	mp.g0.m = mp
-
-	if _p_ == _g_.m.p.ptr() {
-		releasep()
-	}
-	_g_.m.locks--
-	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
-		_g_.stackguard0 = stackPreempt
-	}
-
-	return mp
-}
-
-// needm is called when a cgo callback happens on a
-// thread without an m (a thread not created by Go).
-// In this case, needm is expected to find an m to use
-// and return with m, g initialized correctly.
-// Since m and g are not set now (likely nil, but see below)
-// needm is limited in what routines it can call. In particular
-// it can only call nosplit functions (textflag 7) and cannot
-// do any scheduling that requires an m.
-//
-// In order to avoid needing heavy lifting here, we adopt
-// the following strategy: there is a stack of available m's
-// that can be stolen. Using compare-and-swap
-// to pop from the stack has ABA races, so we simulate
-// a lock by doing an exchange (via casp) to steal the stack
-// head and replace the top pointer with MLOCKED (1).
-// This serves as a simple spin lock that we can use even
-// without an m. The thread that locks the stack in this way
-// unlocks the stack by storing a valid stack head pointer.
-//
-// In order to make sure that there is always an m structure
-// available to be stolen, we maintain the invariant that there
-// is always one more than needed. At the beginning of the
-// program (if cgo is in use) the list is seeded with a single m.
-// If needm finds that it has taken the last m off the list, its job
-// is - once it has installed its own m so that it can do things like
-// allocate memory - to create a spare m and put it on the list.
-//
-// Each of these extra m's also has a g0 and a curg that are
-// pressed into service as the scheduling stack and current
-// goroutine for the duration of the cgo callback.
-//
-// When the callback is done with the m, it calls dropm to
-// put the m back on the list.
-//go:nosplit
-func needm(x byte) {
-	if iscgo && !cgoHasExtraM {
-		// Can happen if C/C++ code calls Go from a global ctor.
-		// Can not throw, because scheduler is not initialized yet.
-		write(2, unsafe.Pointer(&earlycgocallback[0]), int32(len(earlycgocallback)))
-		exit(1)
-	}
-
-	// Lock extra list, take head, unlock popped list.
-	// nilokay=false is safe here because of the invariant above,
-	// that the extra list always contains or will soon contain
-	// at least one m.
-	mp := lockextra(false)
-
-	// Set needextram when we've just emptied the list,
-	// so that the eventual call into cgocallbackg will
-	// allocate a new m for the extra list. We delay the
-	// allocation until then so that it can be done
-	// after exitsyscall makes sure it is okay to be
-	// running at all (that is, there's no garbage collection
-	// running right now).
-	mp.needextram = mp.schedlink == 0
-	unlockextra(mp.schedlink.ptr())
-
-	// Install g (= m->g0) and set the stack bounds
-	// to match the current stack. We don't actually know
-	// how big the stack is, like we don't know how big any
-	// scheduling stack is, but we assume there's at least 32 kB,
-	// which is more than enough for us.
-	setg(mp.g0)
-	_g_ := getg()
-	_g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024
-	_g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
-	_g_.stackguard0 = _g_.stack.lo + _StackGuard
-
-	msigsave(mp)
-	// Initialize this thread to use the m.
-	asminit()
-	minit()
-}
-
-var earlycgocallback = []byte("fatal error: cgo callback before cgo call\n")
-
-// newextram allocates an m and puts it on the extra list.
-// It is called with a working local m, so that it can do things
-// like call schedlock and allocate.
-func newextram() {
-	// Create extra goroutine locked to extra m.
-	// The goroutine is the context in which the cgo callback will run.
-	// The sched.pc will never be returned to, but setting it to
-	// goexit makes clear to the traceback routines where
-	// the goroutine stack ends.
-	mp := allocm(nil, nil)
-	gp := malg(4096)
-	gp.sched.pc = funcPC(goexit) + _PCQuantum
-	gp.sched.sp = gp.stack.hi
-	gp.sched.sp -= 4 * regSize // extra space in case of reads slightly beyond frame
-	gp.sched.lr = 0
-	gp.sched.g = guintptr(unsafe.Pointer(gp))
-	gp.syscallpc = gp.sched.pc
-	gp.syscallsp = gp.sched.sp
-	// malg returns status as Gidle, change to Gsyscall before adding to allg
-	// where GC will see it.
-	casgstatus(gp, _Gidle, _Gsyscall)
-	gp.m = mp
-	mp.curg = gp
-	mp.locked = _LockInternal
-	mp.lockedg = gp
-	gp.lockedm = mp
-	gp.goid = int64(xadd64(&sched.goidgen, 1))
-	if raceenabled {
-		gp.racectx = racegostart(funcPC(newextram))
-	}
-	// put on allg for garbage collector
-	allgadd(gp)
-
-	// Add m to the extra list.
-	mnext := lockextra(true)
-	mp.schedlink.set(mnext)
-	unlockextra(mp)
-}
-
-// dropm is called when a cgo callback has called needm but is now
-// done with the callback and returning back into the non-Go thread.
-// It puts the current m back onto the extra list.
-//
-// The main expense here is the call to signalstack to release the
-// m's signal stack, and then the call to needm on the next callback
-// from this thread. It is tempting to try to save the m for next time,
-// which would eliminate both these costs, but there might not be
-// a next time: the current thread (which Go does not control) might exit.
-// If we saved the m for that thread, there would be an m leak each time
-// such a thread exited. Instead, we acquire and release an m on each
-// call. These should typically not be scheduling operations, just a few
-// atomics, so the cost should be small.
-//
-// TODO(rsc): An alternative would be to allocate a dummy pthread per-thread
-// variable using pthread_key_create. Unlike the pthread keys we already use
-// on OS X, this dummy key would never be read by Go code. It would exist
-// only so that we could register a thread-exit-time destructor.
-// That destructor would put the m back onto the extra list.
-// This is purely a performance optimization. The current version,
-// in which dropm happens on each cgo call, is still correct too.
-// We may have to keep the current version on systems with cgo
-// but without pthreads, like Windows.
-func dropm() {
-	// Undo whatever initialization minit did during needm.
-	unminit()
-
-	// Clear m and g, and return m to the extra list.
-	// After the call to setg we can only call nosplit functions
-	// with no pointer manipulation.
-	mp := getg().m
-	mnext := lockextra(true)
-	mp.schedlink.set(mnext)
-
-	setg(nil)
-	unlockextra(mp)
-}
-
-var extram uintptr
-
-// lockextra locks the extra list and returns the list head.
-// The caller must unlock the list by storing a new list head
-// to extram. If nilokay is true, then lockextra will
-// return a nil list head if that's what it finds. If nilokay is false,
-// lockextra will keep waiting until the list head is no longer nil.
-//go:nosplit
-func lockextra(nilokay bool) *m {
-	const locked = 1
-
-	for {
-		old := atomicloaduintptr(&extram)
-		if old == locked {
-			yield := osyield
-			yield()
-			continue
-		}
-		if old == 0 && !nilokay {
-			usleep(1)
-			continue
-		}
-		if casuintptr(&extram, old, locked) {
-			return (*m)(unsafe.Pointer(old))
-		}
-		yield := osyield
-		yield()
-		continue
-	}
-}
-
-//go:nosplit
-func unlockextra(mp *m) {
-	atomicstoreuintptr(&extram, uintptr(unsafe.Pointer(mp)))
-}
-
-// Create a new m.  It will start off with a call to fn, or else the scheduler.
-// fn needs to be static and not a heap allocated closure.
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func newm(fn func(), _p_ *p) {
-	mp := allocm(_p_, fn)
-	mp.nextp.set(_p_)
-	msigsave(mp)
-	if iscgo {
-		var ts cgothreadstart
-		if _cgo_thread_start == nil {
-			throw("_cgo_thread_start missing")
-		}
-		ts.g.set(mp.g0)
-		ts.tls = (*uint64)(unsafe.Pointer(&mp.tls[0]))
-		ts.fn = unsafe.Pointer(funcPC(mstart))
-		asmcgocall(_cgo_thread_start, unsafe.Pointer(&ts))
-		return
-	}
-	newosproc(mp, unsafe.Pointer(mp.g0.stack.hi))
-}
-
-// Stops execution of the current m until new work is available.
-// Returns with acquired P.
-func stopm() {
-	_g_ := getg()
-
-	if _g_.m.locks != 0 {
-		throw("stopm holding locks")
-	}
-	if _g_.m.p != 0 {
-		throw("stopm holding p")
-	}
-	if _g_.m.spinning {
-		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
-	}
-
-retry:
-	lock(&sched.lock)
-	mput(_g_.m)
-	unlock(&sched.lock)
-	notesleep(&_g_.m.park)
-	noteclear(&_g_.m.park)
-	if _g_.m.helpgc != 0 {
-		gchelper()
-		_g_.m.helpgc = 0
-		_g_.m.mcache = nil
-		_g_.m.p = 0
-		goto retry
-	}
-	acquirep(_g_.m.nextp.ptr())
-	_g_.m.nextp = 0
-}
-
-func mspinning() {
-	gp := getg()
-	if !runqempty(gp.m.nextp.ptr()) {
-		// Something (presumably the GC) was readied while the
-		// runtime was starting up this M, so the M is no
-		// longer spinning.
-		if int32(xadd(&sched.nmspinning, -1)) < 0 {
-			throw("mspinning: nmspinning underflowed")
-		}
-	} else {
-		gp.m.spinning = true
-	}
-}
-
-// Schedules some M to run the p (creates an M if necessary).
-// If p==nil, tries to get an idle P; if there are no idle P's, it does nothing.
-// May run with m.p==nil, so write barriers are not allowed.
-//go:nowritebarrier
-func startm(_p_ *p, spinning bool) {
-	lock(&sched.lock)
-	if _p_ == nil {
-		_p_ = pidleget()
-		if _p_ == nil {
-			unlock(&sched.lock)
-			if spinning {
-				xadd(&sched.nmspinning, -1)
-			}
-			return
-		}
-	}
-	mp := mget()
-	unlock(&sched.lock)
-	if mp == nil {
-		var fn func()
-		if spinning {
-			fn = mspinning
-		}
-		newm(fn, _p_)
-		return
-	}
-	if mp.spinning {
-		throw("startm: m is spinning")
-	}
-	if mp.nextp != 0 {
-		throw("startm: m has p")
-	}
-	if spinning && !runqempty(_p_) {
-		throw("startm: p has runnable gs")
-	}
-	mp.spinning = spinning
-	mp.nextp.set(_p_)
-	notewakeup(&mp.park)
-}
-
-// Hands off P from syscall or locked M.
-// Always runs without a P, so write barriers are not allowed.
-//go:nowritebarrier
-func handoffp(_p_ *p) {
-	// if it has local work, start it straight away
-	if !runqempty(_p_) || sched.runqsize != 0 {
-		startm(_p_, false)
-		return
-	}
-	// no local work, check that there are no spinning/idle M's,
-	// otherwise our help is not required
-	if atomicload(&sched.nmspinning)+atomicload(&sched.npidle) == 0 && cas(&sched.nmspinning, 0, 1) { // TODO: fast atomic
-		startm(_p_, true)
-		return
-	}
-	lock(&sched.lock)
-	if sched.gcwaiting != 0 {
-		_p_.status = _Pgcstop
-		sched.stopwait--
-		if sched.stopwait == 0 {
-			notewakeup(&sched.stopnote)
-		}
-		unlock(&sched.lock)
-		return
-	}
-	if _p_.runSafePointFn != 0 && cas(&_p_.runSafePointFn, 1, 0) {
-		sched.safePointFn(_p_)
-		sched.safePointWait--
-		if sched.safePointWait == 0 {
-			notewakeup(&sched.safePointNote)
-		}
-	}
-	if sched.runqsize != 0 {
-		unlock(&sched.lock)
-		startm(_p_, false)
-		return
-	}
-	// If this is the last running P and nobody is polling the network,
-	// we need to wake up another M to poll the network.
-	if sched.npidle == uint32(gomaxprocs-1) && atomicload64(&sched.lastpoll) != 0 {
-		unlock(&sched.lock)
-		startm(_p_, false)
-		return
-	}
-	pidleput(_p_)
-	unlock(&sched.lock)
-}
-
-// Tries to add one more P to execute G's.
-// Called when a G is made runnable (newproc, ready).
-func wakep() {
-	// be conservative about spinning threads
-	if !cas(&sched.nmspinning, 0, 1) {
-		return
-	}
-	startm(nil, true)
-}
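-
-// The CAS from 0 to 1 is the entire "conservative" policy: wakep starts at
-// most one new spinning M per call, and none while some M is already
-// spinning. The idiom in isolation, with sync/atomic and hypothetical names
-// (a sketch, not a runtime API):
-//
-//	// Wake at most one extra worker; do nothing if one is already spinning.
-//	if atomic.CompareAndSwapUint32(&nmSpinning, 0, 1) {
-//		startWorker()
-//	}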
-
-// Stops execution of the current m that is locked to a g until the g is runnable again.
-// Returns with acquired P.
-func stoplockedm() {
-	_g_ := getg()
-
-	if _g_.m.lockedg == nil || _g_.m.lockedg.lockedm != _g_.m {
-		throw("stoplockedm: inconsistent locking")
-	}
-	if _g_.m.p != 0 {
-		// Schedule another M to run this p.
-		_p_ := releasep()
-		handoffp(_p_)
-	}
-	incidlelocked(1)
-	// Wait until another thread schedules lockedg again.
-	notesleep(&_g_.m.park)
-	noteclear(&_g_.m.park)
-	status := readgstatus(_g_.m.lockedg)
-	if status&^_Gscan != _Grunnable {
-		print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
-		dumpgstatus(_g_)
-		throw("stoplockedm: not runnable")
-	}
-	acquirep(_g_.m.nextp.ptr())
-	_g_.m.nextp = 0
-}
-
-// Schedules the locked m to run the locked gp.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func startlockedm(gp *g) {
-	_g_ := getg()
-
-	mp := gp.lockedm
-	if mp == _g_.m {
-		throw("startlockedm: locked to me")
-	}
-	if mp.nextp != 0 {
-		throw("startlockedm: m has p")
-	}
-	// directly handoff current P to the locked m
-	incidlelocked(-1)
-	_p_ := releasep()
-	mp.nextp.set(_p_)
-	notewakeup(&mp.park)
-	stopm()
-}
-
-// Stops the current m for stopTheWorld.
-// Returns when the world is restarted.
-func gcstopm() {
-	_g_ := getg()
-
-	if sched.gcwaiting == 0 {
-		throw("gcstopm: not waiting for gc")
-	}
-	if _g_.m.spinning {
-		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
-	}
-	_p_ := releasep()
-	lock(&sched.lock)
-	_p_.status = _Pgcstop
-	sched.stopwait--
-	if sched.stopwait == 0 {
-		notewakeup(&sched.stopnote)
-	}
-	unlock(&sched.lock)
-	stopm()
-}
-
-// Schedules gp to run on the current M.
-// If inheritTime is true, gp inherits the remaining time in the
-// current time slice. Otherwise, it starts a new time slice.
-// Never returns.
-func execute(gp *g, inheritTime bool) {
-	_g_ := getg()
-
-	casgstatus(gp, _Grunnable, _Grunning)
-	gp.waitsince = 0
-	gp.preempt = false
-	gp.stackguard0 = gp.stack.lo + _StackGuard
-	if !inheritTime {
-		_g_.m.p.ptr().schedtick++
-	}
-	_g_.m.curg = gp
-	gp.m = _g_.m
-
-	// Check whether the profiler needs to be turned on or off.
-	hz := sched.profilehz
-	if _g_.m.profilehz != hz {
-		resetcpuprofiler(hz)
-	}
-
-	if trace.enabled {
-		// GoSysExit has to happen when we have a P, but before GoStart.
-		// So we emit it here.
-		if gp.syscallsp != 0 && gp.sysblocktraced {
-			// Since gp.sysblocktraced is true, we must emit an event.
-			// There is a race between the code that initializes sysexitseq
-			// and sysexitticks (in exitsyscall, which runs without a P,
-			// and therefore is not stopped with the rest of the world)
-			// and the code that initializes a new trace.
-			// The recorded sysexitseq and sysexitticks must therefore
-			// be treated as "best effort". If they are valid for this trace,
-			// then great, use them for greater accuracy.
-			// But if they're not valid for this trace, assume that the
-			// trace was started after the actual syscall exit (but before
-			// we actually managed to start the goroutine, aka right now),
-			// and assign a fresh time stamp to keep the log consistent.
-			seq, ts := gp.sysexitseq, gp.sysexitticks
-			if seq == 0 || int64(seq)-int64(trace.seqStart) < 0 {
-				seq, ts = tracestamp()
-			}
-			traceGoSysExit(seq, ts)
-		}
-		traceGoStart()
-	}
-
-	gogo(&gp.sched)
-}
-
-// Finds a runnable goroutine to execute.
-// Tries to steal from other P's, get g from global queue, poll network.
-func findrunnable() (gp *g, inheritTime bool) {
-	_g_ := getg()
-
-top:
-	if sched.gcwaiting != 0 {
-		gcstopm()
-		goto top
-	}
-	if _g_.m.p.ptr().runSafePointFn != 0 {
-		runSafePointFn()
-	}
-	if fingwait && fingwake {
-		if gp := wakefing(); gp != nil {
-			ready(gp, 0)
-		}
-	}
-
-	// local runq
-	if gp, inheritTime := runqget(_g_.m.p.ptr()); gp != nil {
-		return gp, inheritTime
-	}
-
-	// global runq
-	if sched.runqsize != 0 {
-		lock(&sched.lock)
-		gp := globrunqget(_g_.m.p.ptr(), 0)
-		unlock(&sched.lock)
-		if gp != nil {
-			return gp, false
-		}
-	}
-
-	// Poll network.
-	// This netpoll is only an optimization before we resort to stealing.
-	// We can safely skip it if there is a thread blocked in netpoll already.
-	// If there is any kind of logical race with that blocked thread
-	// (e.g. it has already returned from netpoll, but does not set lastpoll yet),
-	// this thread will do blocking netpoll below anyway.
-	if netpollinited() && sched.lastpoll != 0 {
-		if gp := netpoll(false); gp != nil { // non-blocking
-			// netpoll returns list of goroutines linked by schedlink.
-			injectglist(gp.schedlink.ptr())
-			casgstatus(gp, _Gwaiting, _Grunnable)
-			if trace.enabled {
-				traceGoUnpark(gp, 0)
-			}
-			return gp, false
-		}
-	}
-
-	// If number of spinning M's >= number of busy P's, block.
-	// This is necessary to prevent excessive CPU consumption
-	// when GOMAXPROCS>>1 but the program parallelism is low.
-	if !_g_.m.spinning && 2*atomicload(&sched.nmspinning) >= uint32(gomaxprocs)-atomicload(&sched.npidle) { // TODO: fast atomic
-		goto stop
-	}
-	if !_g_.m.spinning {
-		_g_.m.spinning = true
-		xadd(&sched.nmspinning, 1)
-	}
-	// random steal from other P's
-	for i := 0; i < int(4*gomaxprocs); i++ {
-		if sched.gcwaiting != 0 {
-			goto top
-		}
-		_p_ := allp[fastrand1()%uint32(gomaxprocs)]
-		var gp *g
-		if _p_ == _g_.m.p.ptr() {
-			gp, _ = runqget(_p_)
-		} else {
-			stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
-			gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
-		}
-		if gp != nil {
-			return gp, false
-		}
-	}
-
-stop:
-
-	// We have nothing to do. If we're in the GC mark phase and can
-	// safely scan and blacken objects, run idle-time marking
-	// rather than give up the P.
-	if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil && gcMarkWorkAvailable(_p_) {
-		_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
-		gp := _p_.gcBgMarkWorker
-		casgstatus(gp, _Gwaiting, _Grunnable)
-		if trace.enabled {
-			traceGoUnpark(gp, 0)
-		}
-		return gp, false
-	}
-
-	// return P and block
-	lock(&sched.lock)
-	if sched.gcwaiting != 0 || _g_.m.p.ptr().runSafePointFn != 0 {
-		unlock(&sched.lock)
-		goto top
-	}
-	if sched.runqsize != 0 {
-		gp := globrunqget(_g_.m.p.ptr(), 0)
-		unlock(&sched.lock)
-		return gp, false
-	}
-	_p_ := releasep()
-	pidleput(_p_)
-	unlock(&sched.lock)
-	if _g_.m.spinning {
-		_g_.m.spinning = false
-		xadd(&sched.nmspinning, -1)
-	}
-
-	// check all runqueues once again
-	for i := 0; i < int(gomaxprocs); i++ {
-		_p_ := allp[i]
-		if _p_ != nil && !runqempty(_p_) {
-			lock(&sched.lock)
-			_p_ = pidleget()
-			unlock(&sched.lock)
-			if _p_ != nil {
-				acquirep(_p_)
-				goto top
-			}
-			break
-		}
-	}
-
-	// poll network
-	if netpollinited() && xchg64(&sched.lastpoll, 0) != 0 {
-		if _g_.m.p != 0 {
-			throw("findrunnable: netpoll with p")
-		}
-		if _g_.m.spinning {
-			throw("findrunnable: netpoll with spinning")
-		}
-		gp := netpoll(true) // block until new work is available
-		atomicstore64(&sched.lastpoll, uint64(nanotime()))
-		if gp != nil {
-			lock(&sched.lock)
-			_p_ = pidleget()
-			unlock(&sched.lock)
-			if _p_ != nil {
-				acquirep(_p_)
-				injectglist(gp.schedlink.ptr())
-				casgstatus(gp, _Gwaiting, _Grunnable)
-				if trace.enabled {
-					traceGoUnpark(gp, 0)
-				}
-				return gp, false
-			}
-			injectglist(gp)
-		}
-	}
-	stopm()
-	goto top
-}
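-
-// A condensed model of the random-steal phase above (a sketch with
-// hypothetical types; the real loop also rechecks gcwaiting each iteration
-// and only steals a victim's runnext in the later attempts):
-//
-//	package main
-//
-//	import "math/rand"
-//
-//	type proc struct{ runq []int }
-//
-//	// steal takes half of the victim's queue, like runqsteal.
-//	func (p *proc) steal(victim *proc) bool {
-//		n := len(victim.runq) / 2
-//		if n == 0 {
-//			return false
-//		}
-//		p.runq = append(p.runq, victim.runq[:n]...)
-//		victim.runq = victim.runq[n:]
-//		return true
-//	}
-//
-//	func findWork(self int, procs []*proc) bool {
-//		// Up to 4*GOMAXPROCS random attempts before giving up and parking.
-//		for i := 0; i < 4*len(procs); i++ {
-//			v := rand.Intn(len(procs))
-//			if v != self && procs[self].steal(procs[v]) {
-//				return true
-//			}
-//		}
-//		return false
-//	}
-//
-//	func main() {
-//		procs := []*proc{{runq: []int{1, 2, 3, 4}}, {}}
-//		println(findWork(1, procs)) // almost always true
-//	}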
-
-func resetspinning() {
-	_g_ := getg()
-
-	var nmspinning uint32
-	if _g_.m.spinning {
-		_g_.m.spinning = false
-		nmspinning = xadd(&sched.nmspinning, -1)
-		if int32(nmspinning) < 0 {
-			throw("findrunnable: negative nmspinning")
-		}
-	} else {
-		nmspinning = atomicload(&sched.nmspinning)
-	}
-
-	// M wakeup policy is deliberately somewhat conservative (see nmspinning handling),
-	// so see if we need to wakeup another P here.
-	if nmspinning == 0 && atomicload(&sched.npidle) > 0 {
-		wakep()
-	}
-}
-
-// Injects the list of runnable G's into the scheduler.
-// Can run concurrently with GC.
-func injectglist(glist *g) {
-	if glist == nil {
-		return
-	}
-	if trace.enabled {
-		for gp := glist; gp != nil; gp = gp.schedlink.ptr() {
-			traceGoUnpark(gp, 0)
-		}
-	}
-	lock(&sched.lock)
-	var n int
-	for n = 0; glist != nil; n++ {
-		gp := glist
-		glist = gp.schedlink.ptr()
-		casgstatus(gp, _Gwaiting, _Grunnable)
-		globrunqput(gp)
-	}
-	unlock(&sched.lock)
-	for ; n != 0 && sched.npidle != 0; n-- {
-		startm(nil, false)
-	}
-}
-
-// One round of scheduler: find a runnable goroutine and execute it.
-// Never returns.
-func schedule() {
-	_g_ := getg()
-
-	if _g_.m.locks != 0 {
-		throw("schedule: holding locks")
-	}
-
-	if _g_.m.lockedg != nil {
-		stoplockedm()
-		execute(_g_.m.lockedg, false) // Never returns.
-	}
-
-top:
-	if sched.gcwaiting != 0 {
-		gcstopm()
-		goto top
-	}
-	if _g_.m.p.ptr().runSafePointFn != 0 {
-		runSafePointFn()
-	}
-
-	var gp *g
-	var inheritTime bool
-	if trace.enabled || trace.shutdown {
-		gp = traceReader()
-		if gp != nil {
-			casgstatus(gp, _Gwaiting, _Grunnable)
-			traceGoUnpark(gp, 0)
-			resetspinning()
-		}
-	}
-	if gp == nil && gcBlackenEnabled != 0 {
-		gp = gcController.findRunnableGCWorker(_g_.m.p.ptr())
-		if gp != nil {
-			resetspinning()
-		}
-	}
-	if gp == nil {
-		// Check the global runnable queue once in a while to ensure fairness.
-		// Otherwise two goroutines can completely occupy the local runqueue
-		// by constantly respawning each other.
-		if _g_.m.p.ptr().schedtick%61 == 0 && sched.runqsize > 0 {
-			lock(&sched.lock)
-			gp = globrunqget(_g_.m.p.ptr(), 1)
-			unlock(&sched.lock)
-			if gp != nil {
-				resetspinning()
-			}
-		}
-	}
-	if gp == nil {
-		gp, inheritTime = runqget(_g_.m.p.ptr())
-		if gp != nil && _g_.m.spinning {
-			throw("schedule: spinning with local work")
-		}
-	}
-	if gp == nil {
-		gp, inheritTime = findrunnable() // blocks until work is available
-		resetspinning()
-	}
-
-	if gp.lockedm != nil {
-		// Hands off own p to the locked m,
-		// then blocks waiting for a new p.
-		startlockedm(gp)
-		goto top
-	}
-
-	execute(gp, inheritTime)
-}
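-
-// A worked example of the schedtick%61 fairness check above: with two
-// goroutines that keep respawning each other, runqget would otherwise
-// succeed forever and the global queue would starve. Polling it on every
-// 61st tick bounds the wait; at roughly a million scheduling events per
-// second on a P, a goroutine parked in the global queue waits on the order
-// of 61 microseconds before that P is forced to look at it. (61 is prime,
-// presumably to avoid resonating with power-of-two patterns in workloads.)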
-
-// dropg removes the association between m and the current goroutine m->curg (gp for short).
-// Typically a caller sets gp's status away from Grunning and then
-// immediately calls dropg to finish the job. The caller is also responsible
-// for arranging that gp will be restarted using ready at an
-// appropriate time. After calling dropg and arranging for gp to be
-// readied later, the caller can do other work but eventually should
-// call schedule to restart the scheduling of goroutines on this m.
-func dropg() {
-	_g_ := getg()
-
-	if _g_.m.lockedg == nil {
-		_g_.m.curg.m = nil
-		_g_.m.curg = nil
-	}
-}
-
-func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
-	unlock((*mutex)(lock))
-	return true
-}
-
-// park continuation on g0.
-func park_m(gp *g) {
-	_g_ := getg()
-
-	if trace.enabled {
-		traceGoPark(_g_.m.waittraceev, _g_.m.waittraceskip, gp)
-	}
-
-	casgstatus(gp, _Grunning, _Gwaiting)
-	dropg()
-
-	if _g_.m.waitunlockf != nil {
-		fn := *(*func(*g, unsafe.Pointer) bool)(unsafe.Pointer(&_g_.m.waitunlockf))
-		ok := fn(gp, _g_.m.waitlock)
-		_g_.m.waitunlockf = nil
-		_g_.m.waitlock = nil
-		if !ok {
-			if trace.enabled {
-				traceGoUnpark(gp, 2)
-			}
-			casgstatus(gp, _Gwaiting, _Grunnable)
-			execute(gp, true) // Schedule it back, never returns.
-		}
-	}
-	schedule()
-}
-
-func goschedImpl(gp *g) {
-	status := readgstatus(gp)
-	if status&^_Gscan != _Grunning {
-		dumpgstatus(gp)
-		throw("bad g status")
-	}
-	casgstatus(gp, _Grunning, _Grunnable)
-	dropg()
-	lock(&sched.lock)
-	globrunqput(gp)
-	unlock(&sched.lock)
-
-	schedule()
-}
-
-// Gosched continuation on g0.
-func gosched_m(gp *g) {
-	if trace.enabled {
-		traceGoSched()
-	}
-	goschedImpl(gp)
-}
-
-func gopreempt_m(gp *g) {
-	if trace.enabled {
-		traceGoPreempt()
-	}
-	goschedImpl(gp)
-}
-
-// Finishes execution of the current goroutine.
-func goexit1() {
-	if raceenabled {
-		racegoend()
-	}
-	if trace.enabled {
-		traceGoEnd()
-	}
-	mcall(goexit0)
-}
-
-// goexit continuation on g0.
-func goexit0(gp *g) {
-	_g_ := getg()
-
-	casgstatus(gp, _Grunning, _Gdead)
-	gp.m = nil
-	gp.lockedm = nil
-	_g_.m.lockedg = nil
-	gp.paniconfault = false
-	gp._defer = nil // should be nil already but just in case.
-	gp._panic = nil // non-nil for Goexit during panic. points at stack-allocated data.
-	gp.writebuf = nil
-	gp.waitreason = ""
-	gp.param = nil
-
-	dropg()
-
-	if _g_.m.locked&^_LockExternal != 0 {
-		print("invalid m->locked = ", _g_.m.locked, "\n")
-		throw("internal lockOSThread error")
-	}
-	_g_.m.locked = 0
-	gfput(_g_.m.p.ptr(), gp)
-	schedule()
-}
-
-//go:nosplit
-//go:nowritebarrier
-func save(pc, sp uintptr) {
-	_g_ := getg()
-
-	_g_.sched.pc = pc
-	_g_.sched.sp = sp
-	_g_.sched.lr = 0
-	_g_.sched.ret = 0
-	_g_.sched.ctxt = nil
-	_g_.sched.g = guintptr(unsafe.Pointer(_g_))
-}
-
-// The goroutine g is about to enter a system call.
-// Record that it's not using the cpu anymore.
-// This is called only from the go syscall library and cgocall,
-// not from the low-level system calls used by the runtime.
-//
-// Entersyscall cannot split the stack: the gosave must
-// make g->sched refer to the caller's stack segment, because
-// entersyscall is going to return immediately after.
-//
-// Nothing entersyscall calls can split the stack either.
-// We cannot safely move the stack during an active call to syscall,
-// because we do not know which of the uintptr arguments are
-// really pointers (back into the stack).
-// In practice, this means that we make the fast path run through
-// entersyscall doing no-split things, and the slow path has to use systemstack
-// to run bigger things on the system stack.
-//
-// reentersyscall is the entry point used by cgo callbacks, where explicitly
-// saved SP and PC are restored. This is needed when exitsyscall will be called
-// from a function further up in the call stack than the parent, as g->syscallsp
-// must always point to a valid stack frame. entersyscall below is the normal
-// entry point for syscalls, which obtains the SP and PC from the caller.
-//
-// Syscall tracing:
-// At the start of a syscall we emit traceGoSysCall to capture the stack trace.
-// If the syscall does not block, that is it, we do not emit any other events.
-// If the syscall blocks (that is, P is retaken), retaker emits traceGoSysBlock;
-// when syscall returns we emit traceGoSysExit and when the goroutine starts running
-// (potentially instantly, if exitsyscallfast returns true) we emit traceGoStart.
-// To ensure that traceGoSysExit is emitted strictly after traceGoSysBlock,
-// we remember current value of syscalltick in m (_g_.m.syscalltick = _g_.m.p.ptr().syscalltick),
-// whoever emits traceGoSysBlock increments p.syscalltick afterwards;
-// and we wait for the increment before emitting traceGoSysExit.
-// Note that the increment is done even if tracing is not enabled,
-// because tracing can be enabled in the middle of syscall. We don't want the wait to hang.
-//
-//go:nosplit
-func reentersyscall(pc, sp uintptr) {
-	_g_ := getg()
-
-	// Disable preemption because during this function g is in Gsyscall status,
-	// but can have inconsistent g->sched, do not let GC observe it.
-	_g_.m.locks++
-
-	if trace.enabled {
-		systemstack(traceGoSysCall)
-	}
-
-	// Entersyscall must not call any function that might split/grow the stack.
-	// (See details in comment above.)
-	// Catch calls that might, by replacing the stack guard with something that
-	// will trip any stack check and leaving a flag to tell newstack to die.
-	_g_.stackguard0 = stackPreempt
-	_g_.throwsplit = true
-
-	// Leave SP around for GC and traceback.
-	save(pc, sp)
-	_g_.syscallsp = sp
-	_g_.syscallpc = pc
-	casgstatus(_g_, _Grunning, _Gsyscall)
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
-		systemstack(func() {
-			print("entersyscall inconsistent ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
-			throw("entersyscall")
-		})
-	}
-
-	if atomicload(&sched.sysmonwait) != 0 { // TODO: fast atomic
-		systemstack(entersyscall_sysmon)
-		save(pc, sp)
-	}
-
-	if _g_.m.p.ptr().runSafePointFn != 0 {
-		// runSafePointFn may stack split if run on this stack
-		systemstack(runSafePointFn)
-		save(pc, sp)
-	}
-
-	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
-	_g_.sysblocktraced = true
-	_g_.m.mcache = nil
-	_g_.m.p.ptr().m = 0
-	atomicstore(&_g_.m.p.ptr().status, _Psyscall)
-	if sched.gcwaiting != 0 {
-		systemstack(entersyscall_gcwait)
-		save(pc, sp)
-	}
-
-	// Goroutines must not split stacks in Gsyscall status (it would corrupt g->sched).
-	// We set _StackGuard to StackPreempt so that first split stack check calls morestack.
-	// Morestack detects this case and throws.
-	_g_.stackguard0 = stackPreempt
-	_g_.m.locks--
-}
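-
-// The syscalltick handshake described in the tracing comment above, in
-// miniature (hypothetical names; only the ordering matters):
-//
-//	// Retaker, i.e. whoever emits traceGoSysBlock:
-//	emit(traceGoSysBlock)
-//	atomic.AddUint32(&p.syscalltick, 1) // increment strictly after emitting
-//
-//	// Exiting goroutine:
-//	for atomic.LoadUint32(&p.syscalltick) == m.syscalltick {
-//		osyield() // wait for the increment...
-//	}
-//	emit(traceGoSysExit) // ...so Exit is ordered after Block in the trace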
-
-// Standard syscall entry used by the go syscall library and normal cgo calls.
-//go:nosplit
-func entersyscall(dummy int32) {
-	reentersyscall(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
-}
-
-func entersyscall_sysmon() {
-	lock(&sched.lock)
-	if atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
-		notewakeup(&sched.sysmonnote)
-	}
-	unlock(&sched.lock)
-}
-
-func entersyscall_gcwait() {
-	_g_ := getg()
-	_p_ := _g_.m.p.ptr()
-
-	lock(&sched.lock)
-	if sched.stopwait > 0 && cas(&_p_.status, _Psyscall, _Pgcstop) {
-		if trace.enabled {
-			traceGoSysBlock(_p_)
-			traceProcStop(_p_)
-		}
-		_p_.syscalltick++
-		if sched.stopwait--; sched.stopwait == 0 {
-			notewakeup(&sched.stopnote)
-		}
-	}
-	unlock(&sched.lock)
-}
-
-// The same as entersyscall(), but with a hint that the syscall is blocking.
-//go:nosplit
-func entersyscallblock(dummy int32) {
-	_g_ := getg()
-
-	_g_.m.locks++ // see comment in entersyscall
-	_g_.throwsplit = true
-	_g_.stackguard0 = stackPreempt // see comment in entersyscall
-	_g_.m.syscalltick = _g_.m.p.ptr().syscalltick
-	_g_.sysblocktraced = true
-	_g_.m.p.ptr().syscalltick++
-
-	// Leave SP around for GC and traceback.
-	pc := getcallerpc(unsafe.Pointer(&dummy))
-	sp := getcallersp(unsafe.Pointer(&dummy))
-	save(pc, sp)
-	_g_.syscallsp = _g_.sched.sp
-	_g_.syscallpc = _g_.sched.pc
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
-		sp1 := sp
-		sp2 := _g_.sched.sp
-		sp3 := _g_.syscallsp
-		systemstack(func() {
-			print("entersyscallblock inconsistent ", hex(sp1), " ", hex(sp2), " ", hex(sp3), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
-			throw("entersyscallblock")
-		})
-	}
-	casgstatus(_g_, _Grunning, _Gsyscall)
-	if _g_.syscallsp < _g_.stack.lo || _g_.stack.hi < _g_.syscallsp {
-		systemstack(func() {
-			print("entersyscallblock inconsistent ", hex(sp), " ", hex(_g_.sched.sp), " ", hex(_g_.syscallsp), " [", hex(_g_.stack.lo), ",", hex(_g_.stack.hi), "]\n")
-			throw("entersyscallblock")
-		})
-	}
-
-	systemstack(entersyscallblock_handoff)
-
-	// Resave for traceback during blocked call.
-	save(getcallerpc(unsafe.Pointer(&dummy)), getcallersp(unsafe.Pointer(&dummy)))
-
-	_g_.m.locks--
-}
-
-func entersyscallblock_handoff() {
-	if trace.enabled {
-		traceGoSysCall()
-		traceGoSysBlock(getg().m.p.ptr())
-	}
-	handoffp(releasep())
-}
-
-// The goroutine g exited its system call.
-// Arrange for it to run on a cpu again.
-// This is called only from the go syscall library, not
-// from the low-level system calls used by the
-//go:nosplit
-func exitsyscall(dummy int32) {
-	_g_ := getg()
-
-	_g_.m.locks++ // see comment in entersyscall
-	if getcallersp(unsafe.Pointer(&dummy)) > _g_.syscallsp {
-		throw("exitsyscall: syscall frame is no longer valid")
-	}
-
-	_g_.waitsince = 0
-	oldp := _g_.m.p.ptr()
-	if exitsyscallfast() {
-		if _g_.m.mcache == nil {
-			throw("lost mcache")
-		}
-		if trace.enabled {
-			if oldp != _g_.m.p.ptr() || _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
-				systemstack(traceGoStart)
-			}
-		}
-		// There's a cpu for us, so we can run.
-		_g_.m.p.ptr().syscalltick++
-		// We need to cas the status and scan before resuming...
-		casgstatus(_g_, _Gsyscall, _Grunning)
-
-		// Garbage collector isn't running (since we are),
-		// so okay to clear syscallsp.
-		_g_.syscallsp = 0
-		_g_.m.locks--
-		if _g_.preempt {
-			// restore the preemption request in case we've cleared it in newstack
-			_g_.stackguard0 = stackPreempt
-		} else {
-			// otherwise restore the real _StackGuard, we've spoiled it in entersyscall/entersyscallblock
-			_g_.stackguard0 = _g_.stack.lo + _StackGuard
-		}
-		_g_.throwsplit = false
-		return
-	}
-
-	_g_.sysexitticks = 0
-	_g_.sysexitseq = 0
-	if trace.enabled {
-		// Wait till traceGoSysBlock event is emitted.
-		// This ensures consistency of the trace (the goroutine is started after it is blocked).
-		for oldp != nil && oldp.syscalltick == _g_.m.syscalltick {
-			osyield()
-		}
-		// We can't trace syscall exit right now because we don't have a P.
-		// Tracing code can invoke write barriers that cannot run without a P.
-		// So instead we remember the syscall exit time and emit the event
-		// in execute when we have a P.
-		_g_.sysexitseq, _g_.sysexitticks = tracestamp()
-	}
-
-	_g_.m.locks--
-
-	// Call the scheduler.
-	mcall(exitsyscall0)
-
-	if _g_.m.mcache == nil {
-		throw("lost mcache")
-	}
-
-	// Scheduler returned, so we're allowed to run now.
-	// Delete the syscallsp information that we left for
-	// the garbage collector during the system call.
-	// Must wait until now because until gosched returns
-	// we don't know for sure that the garbage collector
-	// is not running.
-	_g_.syscallsp = 0
-	_g_.m.p.ptr().syscalltick++
-	_g_.throwsplit = false
-}
-
-//go:nosplit
-func exitsyscallfast() bool {
-	_g_ := getg()
-
-	// Freezetheworld sets stopwait but does not retake P's.
-	if sched.stopwait == freezeStopWait {
-		_g_.m.mcache = nil
-		_g_.m.p = 0
-		return false
-	}
-
-	// Try to re-acquire the last P.
-	if _g_.m.p != 0 && _g_.m.p.ptr().status == _Psyscall && cas(&_g_.m.p.ptr().status, _Psyscall, _Prunning) {
-		// There's a cpu for us, so we can run.
-		_g_.m.mcache = _g_.m.p.ptr().mcache
-		_g_.m.p.ptr().m.set(_g_.m)
-		if _g_.m.syscalltick != _g_.m.p.ptr().syscalltick {
-			if trace.enabled {
-				// The p was retaken and then entered a syscall again (since _g_.m.syscalltick has changed).
-				// traceGoSysBlock for this syscall was already emitted,
-				// but here we effectively retake the p from the new syscall running on the same p.
-				systemstack(func() {
-					// Denote blocking of the new syscall.
-					traceGoSysBlock(_g_.m.p.ptr())
-					// Denote completion of the current syscall.
-					traceGoSysExit(tracestamp())
-				})
-			}
-			_g_.m.p.ptr().syscalltick++
-		}
-		return true
-	}
-
-	// Try to get any other idle P.
-	oldp := _g_.m.p.ptr()
-	_g_.m.mcache = nil
-	_g_.m.p = 0
-	if sched.pidle != 0 {
-		var ok bool
-		systemstack(func() {
-			ok = exitsyscallfast_pidle()
-			if ok && trace.enabled {
-				if oldp != nil {
-					// Wait till traceGoSysBlock event is emitted.
-					// This ensures consistency of the trace (the goroutine is started after it is blocked).
-					for oldp.syscalltick == _g_.m.syscalltick {
-						osyield()
-					}
-				}
-				traceGoSysExit(tracestamp())
-			}
-		})
-		if ok {
-			return true
-		}
-	}
-	return false
-}
-
-func exitsyscallfast_pidle() bool {
-	lock(&sched.lock)
-	_p_ := pidleget()
-	if _p_ != nil && atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
-		notewakeup(&sched.sysmonnote)
-	}
-	unlock(&sched.lock)
-	if _p_ != nil {
-		acquirep(_p_)
-		return true
-	}
-	return false
-}
-
-// exitsyscall slow path on g0.
-// Failed to acquire P, enqueue gp as runnable.
-func exitsyscall0(gp *g) {
-	_g_ := getg()
-
-	casgstatus(gp, _Gsyscall, _Grunnable)
-	dropg()
-	lock(&sched.lock)
-	_p_ := pidleget()
-	if _p_ == nil {
-		globrunqput(gp)
-	} else if atomicload(&sched.sysmonwait) != 0 {
-		atomicstore(&sched.sysmonwait, 0)
-		notewakeup(&sched.sysmonnote)
-	}
-	unlock(&sched.lock)
-	if _p_ != nil {
-		acquirep(_p_)
-		execute(gp, false) // Never returns.
-	}
-	if _g_.m.lockedg != nil {
-		// Wait until another thread schedules gp and so m again.
-		stoplockedm()
-		execute(gp, false) // Never returns.
-	}
-	stopm()
-	schedule() // Never returns.
-}
-
-func beforefork() {
-	gp := getg().m.curg
-
-	// Fork can hang if preempted with signals frequently enough (see issue 5517).
-	// Ensure that we stay on the same M where we disable profiling.
-	gp.m.locks++
-	if gp.m.profilehz != 0 {
-		resetcpuprofiler(0)
-	}
-
-	// This function is called before fork in the syscall package.
-	// Code between fork and exec must not allocate memory or even try to grow the stack.
-	// Here we spoil g->stackguard0 to reliably detect any attempts to grow the stack.
-	// runtime_AfterFork will undo this in the parent process, but not in the child.
-	gp.stackguard0 = stackFork
-}
-
-// Called from syscall package before fork.
-//go:linkname syscall_runtime_BeforeFork syscall.runtime_BeforeFork
-//go:nosplit
-func syscall_runtime_BeforeFork() {
-	systemstack(beforefork)
-}
-
-func afterfork() {
-	gp := getg().m.curg
-
-	// See the comment in beforefork.
-	gp.stackguard0 = gp.stack.lo + _StackGuard
-
-	hz := sched.profilehz
-	if hz != 0 {
-		resetcpuprofiler(hz)
-	}
-	gp.m.locks--
-}
-
-// Called from syscall package after fork in parent.
-//go:linkname syscall_runtime_AfterFork syscall.runtime_AfterFork
-//go:nosplit
-func syscall_runtime_AfterFork() {
-	systemstack(afterfork)
-}
-
-// Allocate a new g, with a stack big enough for stacksize bytes.
-func malg(stacksize int32) *g {
-	newg := new(g)
-	if stacksize >= 0 {
-		stacksize = round2(_StackSystem + stacksize)
-		systemstack(func() {
-			newg.stack, newg.stkbar = stackalloc(uint32(stacksize))
-		})
-		newg.stackguard0 = newg.stack.lo + _StackGuard
-		newg.stackguard1 = ^uintptr(0)
-		newg.stackAlloc = uintptr(stacksize)
-	}
-	return newg
-}
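-
-// For example, assuming _StackSystem = 0 (true on most platforms) and the
-// usual _StackMin = 2048: malg(2048) allocates exactly a 2048-byte stack,
-// while malg(2049) would get 4096, because round2 returns the smallest
-// power of two greater than or equal to its argument.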
-
-// Create a new g running fn with siz bytes of arguments.
-// Put it on the queue of g's waiting to run.
-// The compiler turns a go statement into a call to this.
-// Cannot split the stack because it assumes that the arguments
-// are available sequentially after &fn; they would not be
-// copied if a stack split occurred.
-//go:nosplit
-func newproc(siz int32, fn *funcval) {
-	argp := add(unsafe.Pointer(&fn), ptrSize)
-	pc := getcallerpc(unsafe.Pointer(&siz))
-	systemstack(func() {
-		newproc1(fn, (*uint8)(argp), siz, 0, pc)
-	})
-}
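-
-// For reference, a statement such as
-//
-//	go f(x, y)
-//
-// is lowered by the compiler into (roughly) a call of newproc with siz set
-// to the byte size of x and y and those argument bytes laid out right after
-// fn in the caller's frame. That layout is exactly why newproc is nosplit:
-// a stack split here would move the frame without copying the untyped
-// argument bytes that newproc1 later memmoves onto the new stack.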
-
-// Create a new g running fn with narg bytes of arguments starting
-// at argp and returning nret bytes of results.  callerpc is the
-// address of the go statement that created this.  The new g is put
-// on the queue of g's waiting to run.
-func newproc1(fn *funcval, argp *uint8, narg int32, nret int32, callerpc uintptr) *g {
-	_g_ := getg()
-
-	if fn == nil {
-		_g_.m.throwing = -1 // do not dump full stacks
-		throw("go of nil func value")
-	}
-	_g_.m.locks++ // disable preemption because it can be holding p in a local var
-	siz := narg + nret
-	siz = (siz + 7) &^ 7
-
-	// We could allocate a larger initial stack if necessary.
-	// Not worth it: this is almost always an error.
-	// 4*sizeof(uintreg): extra space added below
-	// sizeof(uintreg): caller's LR (arm) or return address (x86, in gostartcall).
-	if siz >= _StackMin-4*regSize-regSize {
-		throw("newproc: function arguments too large for new goroutine")
-	}
-
-	_p_ := _g_.m.p.ptr()
-	newg := gfget(_p_)
-	if newg == nil {
-		newg = malg(_StackMin)
-		casgstatus(newg, _Gidle, _Gdead)
-		allgadd(newg) // publishes with a g->status of Gdead so GC scanner doesn't look at uninitialized stack.
-	}
-	if newg.stack.hi == 0 {
-		throw("newproc1: newg missing stack")
-	}
-
-	if readgstatus(newg) != _Gdead {
-		throw("newproc1: new g is not Gdead")
-	}
-
-	totalSize := 4*regSize + uintptr(siz) // extra space in case of reads slightly beyond frame
-	if hasLinkRegister {
-		totalSize += ptrSize
-	}
-	totalSize += -totalSize & (spAlign - 1) // align to spAlign
-	sp := newg.stack.hi - totalSize
-	spArg := sp
-	if hasLinkRegister {
-		// caller's LR
-		*(*unsafe.Pointer)(unsafe.Pointer(sp)) = nil
-		spArg += ptrSize
-	}
-	memmove(unsafe.Pointer(spArg), unsafe.Pointer(argp), uintptr(narg))
-
-	memclr(unsafe.Pointer(&newg.sched), unsafe.Sizeof(newg.sched))
-	newg.sched.sp = sp
-	newg.sched.pc = funcPC(goexit) + _PCQuantum // +PCQuantum so that previous instruction is in same function
-	newg.sched.g = guintptr(unsafe.Pointer(newg))
-	gostartcallfn(&newg.sched, fn)
-	newg.gopc = callerpc
-	newg.startpc = fn.fn
-	casgstatus(newg, _Gdead, _Grunnable)
-
-	if _p_.goidcache == _p_.goidcacheend {
-		// Sched.goidgen is the last allocated id;
-		// this batch must be [sched.goidgen+1, sched.goidgen+_GoidCacheBatch].
-		// At startup sched.goidgen=0, so main goroutine receives goid=1.
-		_p_.goidcache = xadd64(&sched.goidgen, _GoidCacheBatch)
-		_p_.goidcache -= _GoidCacheBatch - 1
-		_p_.goidcacheend = _p_.goidcache + _GoidCacheBatch
-	}
-	newg.goid = int64(_p_.goidcache)
-	_p_.goidcache++
-	if raceenabled {
-		newg.racectx = racegostart(callerpc)
-	}
-	if trace.enabled {
-		traceGoCreate(newg, newg.startpc)
-	}
-	runqput(_p_, newg, true)
-
-	if atomicload(&sched.npidle) != 0 && atomicload(&sched.nmspinning) == 0 && unsafe.Pointer(fn.fn) != unsafe.Pointer(funcPC(main)) { // TODO: fast atomic
-		wakep()
-	}
-	_g_.m.locks--
-	if _g_.m.locks == 0 && _g_.preempt { // restore the preemption request in case we've cleared it in newstack
-		_g_.stackguard0 = stackPreempt
-	}
-	return newg
-}
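-
-// A worked example of the goid batching above, assuming _GoidCacheBatch = 16:
-// at startup sched.goidgen is 0, so the first refill's xadd64 returns 16 and
-// the subtraction leaves goidcache = 1 with goidcacheend = 17, i.e. ids 1..16;
-// the main goroutine indeed receives goid 1, as the comment promises. The
-// next refill hands out 17..32, so each batch costs one atomic add instead
-// of one per goroutine created.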
-
-// Put on gfree list.
-// If local list is too long, transfer a batch to the global list.
-func gfput(_p_ *p, gp *g) {
-	if readgstatus(gp) != _Gdead {
-		throw("gfput: bad status (not Gdead)")
-	}
-
-	stksize := gp.stackAlloc
-
-	if stksize != _FixedStack {
-		// non-standard stack size - free it.
-		stackfree(gp.stack, gp.stackAlloc)
-		gp.stack.lo = 0
-		gp.stack.hi = 0
-		gp.stackguard0 = 0
-		gp.stkbar = nil
-		gp.stkbarPos = 0
-	} else {
-		// Reset stack barriers.
-		gp.stkbar = gp.stkbar[:0]
-		gp.stkbarPos = 0
-	}
-
-	gp.schedlink.set(_p_.gfree)
-	_p_.gfree = gp
-	_p_.gfreecnt++
-	if _p_.gfreecnt >= 64 {
-		lock(&sched.gflock)
-		for _p_.gfreecnt >= 32 {
-			_p_.gfreecnt--
-			gp = _p_.gfree
-			_p_.gfree = gp.schedlink.ptr()
-			gp.schedlink.set(sched.gfree)
-			sched.gfree = gp
-			sched.ngfree++
-		}
-		unlock(&sched.gflock)
-	}
-}
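-
-// The 64/32 thresholds implement simple hysteresis. A standalone sketch of
-// the same batching shape (illustrative; the real transfer pops one g at a
-// time while holding sched.gflock):
-//
-//	package main
-//
-//	var local, global []int
-//
-//	func put(x int) {
-//		local = append(local, x)
-//		if len(local) >= 64 {
-//			// Spill half to the shared list, keeping the rest local so
-//			// the next put/get pair needs no lock.
-//			global = append(global, local[32:]...)
-//			local = local[:32]
-//		}
-//	}
-//
-//	func main() {
-//		for i := 0; i < 100; i++ {
-//			put(i)
-//		}
-//		println(len(local), len(global)) // 36 64
-//	}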
-
-// Get from gfree list.
-// If local list is empty, grab a batch from global list.
-func gfget(_p_ *p) *g {
-retry:
-	gp := _p_.gfree
-	if gp == nil && sched.gfree != nil {
-		lock(&sched.gflock)
-		for _p_.gfreecnt < 32 && sched.gfree != nil {
-			_p_.gfreecnt++
-			gp = sched.gfree
-			sched.gfree = gp.schedlink.ptr()
-			sched.ngfree--
-			gp.schedlink.set(_p_.gfree)
-			_p_.gfree = gp
-		}
-		unlock(&sched.gflock)
-		goto retry
-	}
-	if gp != nil {
-		_p_.gfree = gp.schedlink.ptr()
-		_p_.gfreecnt--
-		if gp.stack.lo == 0 {
-			// Stack was deallocated in gfput.  Allocate a new one.
-			systemstack(func() {
-				gp.stack, gp.stkbar = stackalloc(_FixedStack)
-			})
-			gp.stackguard0 = gp.stack.lo + _StackGuard
-			gp.stackAlloc = _FixedStack
-		} else {
-			if raceenabled {
-				racemalloc(unsafe.Pointer(gp.stack.lo), gp.stackAlloc)
-			}
-		}
-	}
-	return gp
-}
-
-// Purge all cached G's from gfree list to the global list.
-func gfpurge(_p_ *p) {
-	lock(&sched.gflock)
-	for _p_.gfreecnt != 0 {
-		_p_.gfreecnt--
-		gp := _p_.gfree
-		_p_.gfree = gp.schedlink.ptr()
-		gp.schedlink.set(sched.gfree)
-		sched.gfree = gp
-		sched.ngfree++
-	}
-	unlock(&sched.gflock)
-}
-
-// Breakpoint executes a breakpoint trap.
-func Breakpoint() {
-	breakpoint()
-}
-
-// dolockOSThread is called by LockOSThread and lockOSThread below
-// after they modify m.locked. Do not allow preemption during this call,
-// or else the m might be different in this function than in the caller.
-//go:nosplit
-func dolockOSThread() {
-	_g_ := getg()
-	_g_.m.lockedg = _g_
-	_g_.lockedm = _g_.m
-}
-
-//go:nosplit
-
-// LockOSThread wires the calling goroutine to its current operating system thread.
-// Until the calling goroutine exits or calls UnlockOSThread, it will always
-// execute in that thread, and no other goroutine can.
-func LockOSThread() {
-	getg().m.locked |= _LockExternal
-	dolockOSThread()
-}
-
-//go:nosplit
-func lockOSThread() {
-	getg().m.locked += _LockInternal
-	dolockOSThread()
-}
-
-// dounlockOSThread is called by UnlockOSThread and unlockOSThread below
-// after they update m->locked. Do not allow preemption during this call,
-// or else the m might be different in this function than in the caller.
-//go:nosplit
-func dounlockOSThread() {
-	_g_ := getg()
-	if _g_.m.locked != 0 {
-		return
-	}
-	_g_.m.lockedg = nil
-	_g_.lockedm = nil
-}
-
-//go:nosplit
-
-// UnlockOSThread unwires the calling goroutine from its fixed operating system thread.
-// If the calling goroutine has not called LockOSThread, UnlockOSThread is a no-op.
-func UnlockOSThread() {
-	getg().m.locked &^= _LockExternal
-	dounlockOSThread()
-}
-
-//go:nosplit
-func unlockOSThread() {
-	_g_ := getg()
-	if _g_.m.locked < _LockInternal {
-		systemstack(badunlockosthread)
-	}
-	_g_.m.locked -= _LockInternal
-	dounlockOSThread()
-}
-
-func badunlockosthread() {
-	throw("runtime: internal error: misuse of lockOSThread/unlockOSThread")
-}
-
-func gcount() int32 {
-	n := int32(allglen) - sched.ngfree
-	for i := 0; ; i++ {
-		_p_ := allp[i]
-		if _p_ == nil {
-			break
-		}
-		n -= _p_.gfreecnt
-	}
-
-	// All these variables can be changed concurrently, so the result can be inconsistent.
-	// But at least the current goroutine is running.
-	if n < 1 {
-		n = 1
-	}
-	return n
-}
-
-func mcount() int32 {
-	return sched.mcount
-}
-
-var prof struct {
-	lock uint32
-	hz   int32
-}
-
-func _System()       { _System() }
-func _ExternalCode() { _ExternalCode() }
-func _GC()           { _GC() }
-
-// Called if we receive a SIGPROF signal.
-func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
-	if prof.hz == 0 {
-		return
-	}
-
-	// Profiling runs concurrently with GC, so it must not allocate.
-	mp.mallocing++
-
-	// Coordinate with stack barrier insertion in scanstack.
-	for !cas(&gp.stackLock, 0, 1) {
-		osyield()
-	}
-
-	// Define that a "user g" is a user-created goroutine, and a "system g"
-	// is one that is m->g0 or m->gsignal.
-	//
-	// We might be interrupted for profiling halfway through a
-	// goroutine switch. The switch involves updating three (or four) values:
-	// g, PC, SP, and (on arm) LR. The PC must be the last to be updated,
-	// because once it gets updated the new g is running.
-	//
-	// When switching from a user g to a system g, LR is not considered live,
-	// so the update only affects g, SP, and PC. Since PC must be last,
-	// the possible partial transitions in ordinary execution are (1) g alone is updated,
-	// (2) both g and SP are updated, and (3) SP alone is updated.
-	// If SP or g alone is updated, we can detect the partial transition by checking
-	// whether the SP is within g's stack bounds. (We could also require that SP
-	// be changed only after g, but the stack bounds check is needed by other
-	// cases, so there is no need to impose an additional requirement.)
-	//
-	// There is one exceptional transition to a system g, not in ordinary execution.
-	// When a signal arrives, the operating system starts the signal handler running
-	// with an updated PC and SP. The g is updated last, at the beginning of the
-	// handler. There are two reasons this is okay. First, until g is updated the
-	// g and SP do not match, so the stack bounds check detects the partial transition.
-	// Second, signal handlers currently run with signals disabled, so a profiling
-	// signal cannot arrive during the handler.
-	//
-	// When switching from a system g to a user g, there are three possibilities.
-	//
-	// First, it may be that the g switch has no PC update, because the SP
-	// either corresponds to a user g throughout (as in asmcgocall)
-	// or because it has been arranged to look like a user g frame
-	// (as in cgocallback_gofunc). In this case, since the entire
-	// transition is a g+SP update, a partial transition updating just one of
-	// those will be detected by the stack bounds check.
-	//
-	// Second, when returning from a signal handler, the PC and SP updates
-	// are performed by the operating system in an atomic update, so the g
-	// update must be done before them. The stack bounds check detects
-	// the partial transition here, and (again) signal handlers run with signals
-	// disabled, so a profiling signal cannot arrive then anyway.
-	//
-	// Third, the common case: it may be that the switch updates g, SP, and PC
-	// separately. If the PC is within any of the functions that do this,
-	// we don't ask for a traceback. Cf. the setsSP function for more about this.
-	//
-	// There is another apparently viable approach, recorded here in case
-	// the "PC within setsSP function" check turns out not to be usable.
-	// It would be possible to delay the update of either g or SP until immediately
-	// before the PC update instruction. Then, because of the stack bounds check,
-	// the only problematic interrupt point is just before that PC update instruction,
-	// and the sigprof handler can detect that instruction and simulate stepping past
-	// it in order to reach a consistent state. On ARM, the update of g must be made
-	// in two places (in R10 and also in a TLS slot), so the delayed update would
-	// need to be the SP update. The sigprof handler must read the instruction at
-	// the current PC and if it was the known instruction (for example, JMP BX or
-	// MOV R2, PC), use that other register in place of the PC value.
-	// The biggest drawback to this solution is that it requires that we can tell
-	// whether it's safe to read from the memory pointed at by PC.
-	// In a correct program, we can test PC == nil and otherwise read,
-	// but if a profiling signal happens at the instant that a program executes
-	// a bad jump (before the program manages to handle the resulting fault)
-	// the profiling handler could fault trying to read nonexistent memory.
-	//
-	// To recap, there are no constraints on the assembly being used for the
-	// transition. We simply require that g and SP match and that the PC is not
-	// in gogo.
-	traceback := true
-	if gp == nil || sp < gp.stack.lo || gp.stack.hi < sp || setsSP(pc) {
-		traceback = false
-	}
-	var stk [maxCPUProfStack]uintptr
-	n := 0
-	if mp.ncgo > 0 && mp.curg != nil && mp.curg.syscallpc != 0 && mp.curg.syscallsp != 0 {
-		// During a cgo call we can't unwind and symbolize arbitrary C code,
-		// so instead collect the Go stack that leads to the cgo call.
-		// This is especially important on windows, since all syscalls are cgo calls.
-		n = gentraceback(mp.curg.syscallpc, mp.curg.syscallsp, 0, mp.curg, 0, &stk[0], len(stk), nil, nil, 0)
-	} else if traceback {
-		n = gentraceback(pc, sp, lr, gp, 0, &stk[0], len(stk), nil, nil, _TraceTrap|_TraceJumpStack)
-	}
-	if !traceback || n <= 0 {
-		// Normal traceback is impossible or has failed.
-		// See if it falls into several common cases.
-		n = 0
-		if GOOS == "windows" && n == 0 && mp.libcallg != 0 && mp.libcallpc != 0 && mp.libcallsp != 0 {
-			// Libcall, i.e. runtime syscall on windows.
-			// Collect Go stack that leads to the call.
-			n = gentraceback(mp.libcallpc, mp.libcallsp, 0, mp.libcallg.ptr(), 0, &stk[0], len(stk), nil, nil, 0)
-		}
-		if n == 0 {
-			// If all of the above has failed, account it against abstract "System" or "GC".
-			n = 2
-			// "ExternalCode" is better than "etext".
-			if pc > firstmoduledata.etext {
-				pc = funcPC(_ExternalCode) + _PCQuantum
-			}
-			stk[0] = pc
-			if mp.preemptoff != "" || mp.helpgc != 0 {
-				stk[1] = funcPC(_GC) + _PCQuantum
-			} else {
-				stk[1] = funcPC(_System) + _PCQuantum
-			}
-		}
-	}
-	atomicstore(&gp.stackLock, 0)
-
-	if prof.hz != 0 {
-		// Simple cas-lock to coordinate with setcpuprofilerate.
-		for !cas(&prof.lock, 0, 1) {
-			osyield()
-		}
-		if prof.hz != 0 {
-			cpuprof.add(stk[:n])
-		}
-		atomicstore(&prof.lock, 0)
-	}
-	mp.mallocing--
-}
-
-// Reports whether a function will set the SP
-// to an absolute value. It is important that
-// we don't traceback when these are at the bottom
-// of the stack since we can't be sure that we will
-// find the caller.
-//
-// If the function is not on the bottom of the stack
-// we assume that it will have set it up so that traceback will be consistent,
-// either by being a traceback terminating function
-// or putting one on the stack at the right offset.
-func setsSP(pc uintptr) bool {
-	f := findfunc(pc)
-	if f == nil {
-		// couldn't find the function for this PC,
-		// so assume the worst and stop traceback
-		return true
-	}
-	switch f.entry {
-	case gogoPC, systemstackPC, mcallPC, morestackPC:
-		return true
-	}
-	return false
-}
-
-// Arrange to call fn with a traceback hz times a second.
-func setcpuprofilerate_m(hz int32) {
-	// Force sane arguments.
-	if hz < 0 {
-		hz = 0
-	}
-
-	// Disable preemption, otherwise we can be rescheduled to another thread
-	// that has profiling enabled.
-	_g_ := getg()
-	_g_.m.locks++
-
-	// Stop profiler on this thread so that it is safe to lock prof.
-	// if a profiling signal came in while we had prof locked,
-	// it would deadlock.
-	resetcpuprofiler(0)
-
-	for !cas(&prof.lock, 0, 1) {
-		osyield()
-	}
-	prof.hz = hz
-	atomicstore(&prof.lock, 0)
-
-	lock(&sched.lock)
-	sched.profilehz = hz
-	unlock(&sched.lock)
-
-	if hz != 0 {
-		resetcpuprofiler(hz)
-	}
-
-	_g_.m.locks--
-}
-
-// Change the number of processors. The world is stopped, sched is locked.
-// gcworkbufs are not being modified by either the GC or
-// the write barrier code.
-// Returns list of Ps with local work, they need to be scheduled by the caller.
-func procresize(nprocs int32) *p {
-	old := gomaxprocs
-	if old < 0 || old > _MaxGomaxprocs || nprocs <= 0 || nprocs > _MaxGomaxprocs {
-		throw("procresize: invalid arg")
-	}
-	if trace.enabled {
-		traceGomaxprocs(nprocs)
-	}
-
-	// update statistics
-	now := nanotime()
-	if sched.procresizetime != 0 {
-		sched.totaltime += int64(old) * (now - sched.procresizetime)
-	}
-	sched.procresizetime = now
-
-	// initialize new P's
-	for i := int32(0); i < nprocs; i++ {
-		pp := allp[i]
-		if pp == nil {
-			pp = new(p)
-			pp.id = i
-			pp.status = _Pgcstop
-			pp.sudogcache = pp.sudogbuf[:0]
-			for i := range pp.deferpool {
-				pp.deferpool[i] = pp.deferpoolbuf[i][:0]
-			}
-			atomicstorep(unsafe.Pointer(&allp[i]), unsafe.Pointer(pp))
-		}
-		if pp.mcache == nil {
-			if old == 0 && i == 0 {
-				if getg().m.mcache == nil {
-					throw("missing mcache?")
-				}
-				pp.mcache = getg().m.mcache // bootstrap
-			} else {
-				pp.mcache = allocmcache()
-			}
-		}
-	}
-
-	// free unused P's
-	for i := nprocs; i < old; i++ {
-		p := allp[i]
-		if trace.enabled {
-			if p == getg().m.p.ptr() {
-				// moving to p[0], pretend that we were descheduled
-				// and then scheduled again to keep the trace sane.
-				traceGoSched()
-				traceProcStop(p)
-			}
-		}
-		// move all runnable goroutines to the global queue
-		for p.runqhead != p.runqtail {
-			// pop from tail of local queue
-			p.runqtail--
-			gp := p.runq[p.runqtail%uint32(len(p.runq))]
-			// push onto head of global queue
-			globrunqputhead(gp)
-		}
-		if p.runnext != 0 {
-			globrunqputhead(p.runnext.ptr())
-			p.runnext = 0
-		}
-		// if there's a background worker, make it runnable and put
-		// it on the global queue so it can clean itself up
-		if p.gcBgMarkWorker != nil {
-			casgstatus(p.gcBgMarkWorker, _Gwaiting, _Grunnable)
-			if trace.enabled {
-				traceGoUnpark(p.gcBgMarkWorker, 0)
-			}
-			globrunqput(p.gcBgMarkWorker)
-			p.gcBgMarkWorker = nil
-		}
-		for i := range p.sudogbuf {
-			p.sudogbuf[i] = nil
-		}
-		p.sudogcache = p.sudogbuf[:0]
-		for i := range p.deferpool {
-			for j := range p.deferpoolbuf[i] {
-				p.deferpoolbuf[i][j] = nil
-			}
-			p.deferpool[i] = p.deferpoolbuf[i][:0]
-		}
-		freemcache(p.mcache)
-		p.mcache = nil
-		gfpurge(p)
-		traceProcFree(p)
-		p.status = _Pdead
-		// can't free P itself because it can be referenced by an M in syscall
-	}
-
-	_g_ := getg()
-	if _g_.m.p != 0 && _g_.m.p.ptr().id < nprocs {
-		// continue to use the current P
-		_g_.m.p.ptr().status = _Prunning
-	} else {
-		// release the current P and acquire allp[0]
-		if _g_.m.p != 0 {
-			_g_.m.p.ptr().m = 0
-		}
-		_g_.m.p = 0
-		_g_.m.mcache = nil
-		p := allp[0]
-		p.m = 0
-		p.status = _Pidle
-		acquirep(p)
-		if trace.enabled {
-			traceGoStart()
-		}
-	}
-	var runnablePs *p
-	for i := nprocs - 1; i >= 0; i-- {
-		p := allp[i]
-		if _g_.m.p.ptr() == p {
-			continue
-		}
-		p.status = _Pidle
-		if runqempty(p) {
-			pidleput(p)
-		} else {
-			p.m.set(mget())
-			p.link.set(runnablePs)
-			runnablePs = p
-		}
-	}
-	var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
-	atomicstore((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
-	return runnablePs
-}
-
-// Associate p and the current m.
-func acquirep(_p_ *p) {
-	acquirep1(_p_)
-
-	// have p; write barriers now allowed
-	_g_ := getg()
-	_g_.m.mcache = _p_.mcache
-
-	if trace.enabled {
-		traceProcStart()
-	}
-}
-
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func acquirep1(_p_ *p) {
-	_g_ := getg()
-
-	if _g_.m.p != 0 || _g_.m.mcache != nil {
-		throw("acquirep: already in go")
-	}
-	if _p_.m != 0 || _p_.status != _Pidle {
-		id := int32(0)
-		if _p_.m != 0 {
-			id = _p_.m.ptr().id
-		}
-		print("acquirep: p->m=", _p_.m, "(", id, ") p->status=", _p_.status, "\n")
-		throw("acquirep: invalid p state")
-	}
-	_g_.m.p.set(_p_)
-	_p_.m.set(_g_.m)
-	_p_.status = _Prunning
-}
-
-// Disassociate p and the current m.
-func releasep() *p {
-	_g_ := getg()
-
-	if _g_.m.p == 0 || _g_.m.mcache == nil {
-		throw("releasep: invalid arg")
-	}
-	_p_ := _g_.m.p.ptr()
-	if _p_.m.ptr() != _g_.m || _p_.mcache != _g_.m.mcache || _p_.status != _Prunning {
-		print("releasep: m=", _g_.m, " m->p=", _g_.m.p.ptr(), " p->m=", _p_.m, " m->mcache=", _g_.m.mcache, " p->mcache=", _p_.mcache, " p->status=", _p_.status, "\n")
-		throw("releasep: invalid p state")
-	}
-	if trace.enabled {
-		traceProcStop(_g_.m.p.ptr())
-	}
-	_g_.m.p = 0
-	_g_.m.mcache = nil
-	_p_.m = 0
-	_p_.status = _Pidle
-	return _p_
-}
-
-func incidlelocked(v int32) {
-	lock(&sched.lock)
-	sched.nmidlelocked += v
-	if v > 0 {
-		checkdead()
-	}
-	unlock(&sched.lock)
-}
-
-// Check for a deadlock situation.
-// The check is based on the number of running M's; if it is 0, we have a deadlock.
-func checkdead() {
-	// For -buildmode=c-shared or -buildmode=c-archive it's OK if
-	// there are no running goroutines.  The calling program is
-	// assumed to be running.
-	if islibrary || isarchive {
-		return
-	}
-
-	// If we are dying because of a signal caught on an already idle thread,
-	// freezetheworld will cause all running threads to block.
-	// And runtime will essentially enter into deadlock state,
-	// except that there is a thread that will call exit soon.
-	if panicking > 0 {
-		return
-	}
-
-	// -1 for sysmon
-	run := sched.mcount - sched.nmidle - sched.nmidlelocked - 1
-	if run > 0 {
-		return
-	}
-	if run < 0 {
-		print("runtime: checkdead: nmidle=", sched.nmidle, " nmidlelocked=", sched.nmidlelocked, " mcount=", sched.mcount, "\n")
-		throw("checkdead: inconsistent counts")
-	}
-
-	grunning := 0
-	lock(&allglock)
-	for i := 0; i < len(allgs); i++ {
-		gp := allgs[i]
-		if isSystemGoroutine(gp) {
-			continue
-		}
-		s := readgstatus(gp)
-		switch s &^ _Gscan {
-		case _Gwaiting:
-			grunning++
-		case _Grunnable,
-			_Grunning,
-			_Gsyscall:
-			unlock(&allglock)
-			print("runtime: checkdead: find g ", gp.goid, " in status ", s, "\n")
-			throw("checkdead: runnable g")
-		}
-	}
-	unlock(&allglock)
-	if grunning == 0 { // possible if main goroutine calls runtime·Goexit()
-		throw("no goroutines (main called runtime.Goexit) - deadlock!")
-	}
-
-	// Maybe jump time forward for playground.
-	gp := timejump()
-	if gp != nil {
-		casgstatus(gp, _Gwaiting, _Grunnable)
-		globrunqput(gp)
-		_p_ := pidleget()
-		if _p_ == nil {
-			throw("checkdead: no p for timer")
-		}
-		mp := mget()
-		if mp == nil {
-			newm(nil, _p_)
-		} else {
-			mp.nextp.set(_p_)
-			notewakeup(&mp.park)
-		}
-		return
-	}
-
-	getg().m.throwing = -1 // do not dump full stacks
-	throw("all goroutines are asleep - deadlock!")
-}
-
-func sysmon() {
-	// If we go two minutes without a garbage collection, force one to run.
-	forcegcperiod := int64(2 * 60 * 1e9)
-
-	// If a heap span goes unused for 5 minutes after a garbage collection,
-	// we hand it back to the operating system.
-	scavengelimit := int64(5 * 60 * 1e9)
-
-	if debug.scavenge > 0 {
-		// Scavenge-a-lot for testing.
-		forcegcperiod = 10 * 1e6
-		scavengelimit = 20 * 1e6
-	}
-
-	lastscavenge := nanotime()
-	nscavenge := 0
-
-	// Make wake-up period small enough for the sampling to be correct.
-	maxsleep := forcegcperiod / 2
-	if scavengelimit < forcegcperiod {
-		maxsleep = scavengelimit / 2
-	}
-
-	lasttrace := int64(0)
-	idle := 0 // how many cycles in succession we have not woken anyone up
-	delay := uint32(0)
-	for {
-		if idle == 0 { // start with 20us sleep...
-			delay = 20
-		} else if idle > 50 { // start doubling the sleep after 1ms...
-			delay *= 2
-		}
-		if delay > 10*1000 { // up to 10ms
-			delay = 10 * 1000
-		}
-		usleep(delay)
-		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs)) { // TODO: fast atomic
-			lock(&sched.lock)
-			if atomicload(&sched.gcwaiting) != 0 || atomicload(&sched.npidle) == uint32(gomaxprocs) {
-				atomicstore(&sched.sysmonwait, 1)
-				unlock(&sched.lock)
-				notetsleep(&sched.sysmonnote, maxsleep)
-				lock(&sched.lock)
-				atomicstore(&sched.sysmonwait, 0)
-				noteclear(&sched.sysmonnote)
-				idle = 0
-				delay = 20
-			}
-			unlock(&sched.lock)
-		}
-		// poll network if not polled for more than 10ms
-		lastpoll := int64(atomicload64(&sched.lastpoll))
-		now := nanotime()
-		unixnow := unixnanotime()
-		if lastpoll != 0 && lastpoll+10*1000*1000 < now {
-			cas64(&sched.lastpoll, uint64(lastpoll), uint64(now))
-			gp := netpoll(false) // non-blocking - returns list of goroutines
-			if gp != nil {
-				// Need to decrement number of idle locked M's
-				// (pretending that one more is running) before injectglist.
-				// Otherwise it can lead to the following situation:
-				// injectglist grabs all P's but before it starts M's to run the P's,
-				// another M returns from syscall, finishes running its G,
-				// observes that there is no work to do and no other running M's
-				// and reports deadlock.
-				incidlelocked(-1)
-				injectglist(gp)
-				incidlelocked(1)
-			}
-		}
-		// retake P's blocked in syscalls
-		// and preempt long running G's
-		if retake(now) != 0 {
-			idle = 0
-		} else {
-			idle++
-		}
-		// check if we need to force a GC
-		lastgc := int64(atomicload64(&memstats.last_gc))
-		if lastgc != 0 && unixnow-lastgc > forcegcperiod && atomicload(&forcegc.idle) != 0 && atomicloaduint(&bggc.working) == 0 {
-			lock(&forcegc.lock)
-			forcegc.idle = 0
-			forcegc.g.schedlink = 0
-			injectglist(forcegc.g)
-			unlock(&forcegc.lock)
-		}
-		// scavenge heap once in a while
-		if lastscavenge+scavengelimit/2 < now {
-			mHeap_Scavenge(int32(nscavenge), uint64(now), uint64(scavengelimit))
-			lastscavenge = now
-			nscavenge++
-		}
-		if debug.schedtrace > 0 && lasttrace+int64(debug.schedtrace*1000000) <= now {
-			lasttrace = now
-			schedtrace(debug.scheddetail > 0)
-		}
-	}
-}
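-
-// The delay ladder above works out as follows: the first 50 quiet cycles
-// each sleep 20us (about 1ms total); after that the sleep doubles every
-// cycle, 40us, 80us, ..., reaching the cap after nine doublings
-// (20us * 2^9 = 10.24ms, clamped to 10ms). An idle sysmon therefore settles
-// at roughly 100 wakeups per second until the sysmonwait path parks it on
-// sched.sysmonnote entirely.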
-
-var pdesc [_MaxGomaxprocs]struct {
-	schedtick   uint32
-	schedwhen   int64
-	syscalltick uint32
-	syscallwhen int64
-}
-
-// forcePreemptNS is the time slice given to a G before it is
-// preempted.
-const forcePreemptNS = 10 * 1000 * 1000 // 10ms
-
-func retake(now int64) uint32 {
-	n := 0
-	for i := int32(0); i < gomaxprocs; i++ {
-		_p_ := allp[i]
-		if _p_ == nil {
-			continue
-		}
-		pd := &pdesc[i]
-		s := _p_.status
-		if s == _Psyscall {
-			// Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
-			t := int64(_p_.syscalltick)
-			if int64(pd.syscalltick) != t {
-				pd.syscalltick = uint32(t)
-				pd.syscallwhen = now
-				continue
-			}
-			// On the one hand we don't want to retake Ps if there is no other work to do,
-			// but on the other hand we want to retake them eventually
-			// because they can prevent the sysmon thread from deep sleep.
-			if runqempty(_p_) && atomicload(&sched.nmspinning)+atomicload(&sched.npidle) > 0 && pd.syscallwhen+10*1000*1000 > now {
-				continue
-			}
-			// Need to decrement number of idle locked M's
-			// (pretending that one more is running) before the CAS.
-			// Otherwise the M from which we retake can exit the syscall,
-			// increment nmidle and report deadlock.
-			incidlelocked(-1)
-			if cas(&_p_.status, s, _Pidle) {
-				if trace.enabled {
-					traceGoSysBlock(_p_)
-					traceProcStop(_p_)
-				}
-				n++
-				_p_.syscalltick++
-				handoffp(_p_)
-			}
-			incidlelocked(1)
-		} else if s == _Prunning {
-			// Preempt G if it's running for too long.
-			t := int64(_p_.schedtick)
-			if int64(pd.schedtick) != t {
-				pd.schedtick = uint32(t)
-				pd.schedwhen = now
-				continue
-			}
-			if pd.schedwhen+forcePreemptNS > now {
-				continue
-			}
-			preemptone(_p_)
-		}
-	}
-	return uint32(n)
-}
-
-// Tell all goroutines that they have been preempted and they should stop.
-// This function is purely best-effort. It can fail to inform a goroutine if a
-// processor just started running it.
-// No locks need to be held.
-// Returns true if a preemption request was issued to at least one goroutine.
-func preemptall() bool {
-	res := false
-	for i := int32(0); i < gomaxprocs; i++ {
-		_p_ := allp[i]
-		if _p_ == nil || _p_.status != _Prunning {
-			continue
-		}
-		if preemptone(_p_) {
-			res = true
-		}
-	}
-	return res
-}
-
-// Tell the goroutine running on processor P to stop.
-// This function is purely best-effort. It can incorrectly fail to inform the
-// goroutine. It can inform the wrong goroutine. Even if it informs the
-// correct goroutine, that goroutine might ignore the request if it is
-// simultaneously executing newstack.
-// No lock needs to be held.
-// Returns true if a preemption request was issued.
-// The actual preemption will happen at some point in the future
-// and will be indicated by gp->status no longer being
-// Grunning.
-func preemptone(_p_ *p) bool {
-	mp := _p_.m.ptr()
-	if mp == nil || mp == getg().m {
-		return false
-	}
-	gp := mp.curg
-	if gp == nil || gp == mp.g0 {
-		return false
-	}
-
-	gp.preempt = true
-
-	// Every function call in a goroutine checks for stack overflow by
-	// comparing the current stack pointer to gp->stackguard0.
-	// Setting gp->stackguard0 to StackPreempt folds
-	// preemption into the normal stack overflow check.
-	gp.stackguard0 = stackPreempt
-	return true
-}
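
As the comment above notes, preemption piggybacks on the stack-overflow check that every function prologue already performs. A toy analogue, with invented names and an explicit flag standing in for stackguard0, might look like:

package main

import (
	"sync/atomic"
	"time"
)

// worker polls a flag at loop boundaries, the way compiled Go code
// implicitly polls the stack guard at function prologues.
func worker(preempt *int32, done chan<- int) {
	n := 0
	for atomic.LoadInt32(preempt) == 0 { // the "check" folded into each iteration
		n++
	}
	done <- n
}

func main() {
	var preempt int32
	done := make(chan int)
	go worker(&preempt, done)
	time.Sleep(10 * time.Millisecond)
	atomic.StoreInt32(&preempt, 1) // analogous to setting gp.stackguard0 = stackPreempt
	println("iterations before preemption:", <-done)
}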
-
-var starttime int64
-
-func schedtrace(detailed bool) {
-	now := nanotime()
-	if starttime == 0 {
-		starttime = now
-	}
-
-	lock(&sched.lock)
-	print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle, " threads=", sched.mcount, " spinningthreads=", sched.nmspinning, " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
-	if detailed {
-		print(" gcwaiting=", sched.gcwaiting, " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait, "\n")
-	}
-	// We must be careful while reading data from P's, M's and G's.
-	// Even if we hold schedlock, most data can be changed concurrently.
-	// E.g. (p->m ? p->m->id : -1) can crash if p->m changes from non-nil to nil.
-	for i := int32(0); i < gomaxprocs; i++ {
-		_p_ := allp[i]
-		if _p_ == nil {
-			continue
-		}
-		mp := _p_.m.ptr()
-		h := atomicload(&_p_.runqhead)
-		t := atomicload(&_p_.runqtail)
-		if detailed {
-			id := int32(-1)
-			if mp != nil {
-				id = mp.id
-			}
-			print("  P", i, ": status=", _p_.status, " schedtick=", _p_.schedtick, " syscalltick=", _p_.syscalltick, " m=", id, " runqsize=", t-h, " gfreecnt=", _p_.gfreecnt, "\n")
-		} else {
-			// In non-detailed mode, format the lengths of the per-P run queues as:
-			// [len1 len2 len3 len4]
-			print(" ")
-			if i == 0 {
-				print("[")
-			}
-			print(t - h)
-			if i == gomaxprocs-1 {
-				print("]\n")
-			}
-		}
-	}
-
-	if !detailed {
-		unlock(&sched.lock)
-		return
-	}
-
-	for mp := allm; mp != nil; mp = mp.alllink {
-		_p_ := mp.p.ptr()
-		gp := mp.curg
-		lockedg := mp.lockedg
-		id1 := int32(-1)
-		if _p_ != nil {
-			id1 = _p_.id
-		}
-		id2 := int64(-1)
-		if gp != nil {
-			id2 = gp.goid
-		}
-		id3 := int64(-1)
-		if lockedg != nil {
-			id3 = lockedg.goid
-		}
-		print("  M", mp.id, ": p=", id1, " curg=", id2, " mallocing=", mp.mallocing, " throwing=", mp.throwing, " preemptoff=", mp.preemptoff, ""+" locks=", mp.locks, " dying=", mp.dying, " helpgc=", mp.helpgc, " spinning=", mp.spinning, " blocked=", getg().m.blocked, " lockedg=", id3, "\n")
-	}
-
-	lock(&allglock)
-	for gi := 0; gi < len(allgs); gi++ {
-		gp := allgs[gi]
-		mp := gp.m
-		lockedm := gp.lockedm
-		id1 := int32(-1)
-		if mp != nil {
-			id1 = mp.id
-		}
-		id2 := int32(-1)
-		if lockedm != nil {
-			id2 = lockedm.id
-		}
-		print("  G", gp.goid, ": status=", readgstatus(gp), "(", gp.waitreason, ") m=", id1, " lockedm=", id2, "\n")
-	}
-	unlock(&allglock)
-	unlock(&sched.lock)
-}
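
Output in the format schedtrace prints can be observed from any ordinary program via the documented GODEBUG knobs; for example, building the sketch below and running it with GODEBUG=schedtrace=1000,scheddetail=1 emits one SCHED line per second while the goroutines keep the Ps busy:

package main

import "sync"

func main() {
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			n := 0
			for j := 0; j < 1e9; j++ { // busy work so the trace is interesting
				n += j
			}
			_ = n
		}()
	}
	wg.Wait()
}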
-
-// Put mp on midle list.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func mput(mp *m) {
-	mp.schedlink = sched.midle
-	sched.midle.set(mp)
-	sched.nmidle++
-	checkdead()
-}
-
-// Try to get an m from midle list.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func mget() *m {
-	mp := sched.midle.ptr()
-	if mp != nil {
-		sched.midle = mp.schedlink
-		sched.nmidle--
-	}
-	return mp
-}
-
-// Put gp on the global runnable queue.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func globrunqput(gp *g) {
-	gp.schedlink = 0
-	if sched.runqtail != 0 {
-		sched.runqtail.ptr().schedlink.set(gp)
-	} else {
-		sched.runqhead.set(gp)
-	}
-	sched.runqtail.set(gp)
-	sched.runqsize++
-}
-
-// Put gp at the head of the global runnable queue.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func globrunqputhead(gp *g) {
-	gp.schedlink = sched.runqhead
-	sched.runqhead.set(gp)
-	if sched.runqtail == 0 {
-		sched.runqtail.set(gp)
-	}
-	sched.runqsize++
-}
-
-// Put a batch of runnable goroutines on the global runnable queue.
-// Sched must be locked.
-func globrunqputbatch(ghead *g, gtail *g, n int32) {
-	gtail.schedlink = 0
-	if sched.runqtail != 0 {
-		sched.runqtail.ptr().schedlink.set(ghead)
-	} else {
-		sched.runqhead.set(ghead)
-	}
-	sched.runqtail.set(gtail)
-	sched.runqsize += n
-}
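
globrunqput and globrunqputbatch are a plain intrusive FIFO: each g carries its own link field (schedlink), so a pre-linked chain can be spliced in O(1) with no allocation. A standalone sketch of the same structure, with node standing in for g:

package main

// node plays the role of g, with link playing schedlink.
type node struct {
	link *node
	id   int
}

type queue struct {
	head, tail *node
	size       int
}

// put appends n, mirroring globrunqput's head/tail handling.
func (q *queue) put(n *node) {
	n.link = nil
	if q.tail != nil {
		q.tail.link = n
	} else {
		q.head = n
	}
	q.tail = n
	q.size++
}

// putBatch splices a pre-linked chain in one shot, like globrunqputbatch.
func (q *queue) putBatch(head, tail *node, n int) {
	tail.link = nil
	if q.tail != nil {
		q.tail.link = head
	} else {
		q.head = head
	}
	q.tail = tail
	q.size += n
}

func main() {
	var q queue
	a, b := &node{id: 1}, &node{id: 2}
	a.link = b
	q.putBatch(a, b, 2)
	q.put(&node{id: 3})
	for n := q.head; n != nil; n = n.link {
		println(n.id)
	}
}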
-
-// Try to get a batch of G's from the global runnable queue.
-// Sched must be locked.
-func globrunqget(_p_ *p, max int32) *g {
-	if sched.runqsize == 0 {
-		return nil
-	}
-
-	n := sched.runqsize/gomaxprocs + 1
-	if n > sched.runqsize {
-		n = sched.runqsize
-	}
-	if max > 0 && n > max {
-		n = max
-	}
-	if n > int32(len(_p_.runq))/2 {
-		n = int32(len(_p_.runq)) / 2
-	}
-
-	sched.runqsize -= n
-	if sched.runqsize == 0 {
-		sched.runqtail = 0
-	}
-
-	gp := sched.runqhead.ptr()
-	sched.runqhead = gp.schedlink
-	n--
-	for ; n > 0; n-- {
-		gp1 := sched.runqhead.ptr()
-		sched.runqhead = gp1.schedlink
-		runqput(_p_, gp1, false)
-	}
-	return gp
-}
-
-// Put p on the _Pidle list.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func pidleput(_p_ *p) {
-	if !runqempty(_p_) {
-		throw("pidleput: P has non-empty run queue")
-	}
-	_p_.link = sched.pidle
-	sched.pidle.set(_p_)
-	xadd(&sched.npidle, 1) // TODO: fast atomic
-}
-
-// Try to get a p from the _Pidle list.
-// Sched must be locked.
-// May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
-func pidleget() *p {
-	_p_ := sched.pidle.ptr()
-	if _p_ != nil {
-		sched.pidle = _p_.link
-		xadd(&sched.npidle, -1) // TODO: fast atomic
-	}
-	return _p_
-}
-
-// runqempty returns true if _p_ has no Gs on its local run queue.
-// Note that this test is generally racy.
-func runqempty(_p_ *p) bool {
-	return _p_.runqhead == _p_.runqtail && _p_.runnext == 0
-}
-
-// To shake out latent assumptions about scheduling order,
-// we introduce some randomness into scheduling decisions
-// when running with the race detector.
-// The need for this was made obvious by changing the
-// (deterministic) scheduling order in Go 1.5 and breaking
-// many poorly-written tests.
-// With the randomness here, as long as the tests pass
-// consistently with -race, they shouldn't have latent scheduling
-// assumptions.
-const randomizeScheduler = raceenabled
-
-// runqput tries to put g on the local runnable queue.
-// If next is false, runqput adds g to the tail of the runnable queue.
-// If next is true, runqput puts g in the _p_.runnext slot.
-// If the run queue is full, runqput puts g on the global queue.
-// Executed only by the owner P.
-func runqput(_p_ *p, gp *g, next bool) {
-	if randomizeScheduler && next && fastrand1()%2 == 0 {
-		next = false
-	}
-
-	if next {
-	retryNext:
-		oldnext := _p_.runnext
-		if !_p_.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
-			goto retryNext
-		}
-		if oldnext == 0 {
-			return
-		}
-		// Kick the old runnext out to the regular run queue.
-		gp = oldnext.ptr()
-	}
-
-retry:
-	h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
-	t := _p_.runqtail
-	if t-h < uint32(len(_p_.runq)) {
-		_p_.runq[t%uint32(len(_p_.runq))] = gp
-		atomicstore(&_p_.runqtail, t+1) // store-release, makes the item available for consumption
-		return
-	}
-	if runqputslow(_p_, gp, h, t) {
-		return
-	}
-	// The queue is not full now, so the put above must succeed.
-	goto retry
-}
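
runqput's fast path is one side of a fixed-size single-producer/multi-consumer ring: only the owner P advances runqtail, while thieves advance runqhead by CAS. A minimal sketch of the same discipline using sync/atomic (ring, put, and get are illustrative names; the fallback to a global queue is omitted):

package main

import "sync/atomic"

// ring is a fixed-size single-producer queue in the style of p.runq.
type ring struct {
	head, tail uint32
	buf        [8]int
}

// put is the fast path of runqput: reserve a slot, then publish it with a
// store-release on tail. It returns false when the ring is full (runqput
// would fall back to runqputslow at that point).
func (r *ring) put(v int) bool {
	h := atomic.LoadUint32(&r.head) // load-acquire, synchronize with consumers
	t := r.tail
	if t-h >= uint32(len(r.buf)) {
		return false
	}
	r.buf[t%uint32(len(r.buf))] = v
	atomic.StoreUint32(&r.tail, t+1) // store-release, makes the item visible
	return true
}

// get mirrors runqget's consumer side: claim the head slot with a CAS.
func (r *ring) get() (int, bool) {
	for {
		h := atomic.LoadUint32(&r.head)
		t := atomic.LoadUint32(&r.tail)
		if t == h {
			return 0, false
		}
		v := r.buf[h%uint32(len(r.buf))]
		if atomic.CompareAndSwapUint32(&r.head, h, h+1) { // cas-release, commits consume
			return v, true
		}
	}
}

func main() {
	var r ring
	for i := 0; r.put(i); i++ {
	}
	for {
		v, ok := r.get()
		if !ok {
			break
		}
		println(v)
	}
}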
-
-// Put g and a batch of work from local runnable queue on global queue.
-// Executed only by the owner P.
-func runqputslow(_p_ *p, gp *g, h, t uint32) bool {
-	var batch [len(_p_.runq)/2 + 1]*g
-
-	// First, grab a batch from local queue.
-	n := t - h
-	n = n / 2
-	if n != uint32(len(_p_.runq)/2) {
-		throw("runqputslow: queue is not full")
-	}
-	for i := uint32(0); i < n; i++ {
-		batch[i] = _p_.runq[(h+i)%uint32(len(_p_.runq))]
-	}
-	if !cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
-		return false
-	}
-	batch[n] = gp
-
-	if randomizeScheduler {
-		for i := uint32(1); i <= n; i++ {
-			j := fastrand1() % (i + 1)
-			batch[i], batch[j] = batch[j], batch[i]
-		}
-	}
-
-	// Link the goroutines.
-	for i := uint32(0); i < n; i++ {
-		batch[i].schedlink.set(batch[i+1])
-	}
-
-	// Now put the batch on global queue.
-	lock(&sched.lock)
-	globrunqputbatch(batch[0], batch[n], int32(n+1))
-	unlock(&sched.lock)
-	return true
-}
-
-// Get g from local runnable queue.
-// If inheritTime is true, gp should inherit the remaining time in the
-// current time slice. Otherwise, it should start a new time slice.
-// Executed only by the owner P.
-func runqget(_p_ *p) (gp *g, inheritTime bool) {
-	// If there's a runnext, it's the next G to run.
-	for {
-		next := _p_.runnext
-		if next == 0 {
-			break
-		}
-		if _p_.runnext.cas(next, 0) {
-			return next.ptr(), true
-		}
-	}
-
-	for {
-		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
-		t := _p_.runqtail
-		if t == h {
-			return nil, false
-		}
-		gp := _p_.runq[h%uint32(len(_p_.runq))]
-		if cas(&_p_.runqhead, h, h+1) { // cas-release, commits consume
-			return gp, false
-		}
-	}
-}
-
-// Grabs a batch of goroutines from _p_'s runnable queue into batch.
-// Batch is a ring buffer starting at batchHead.
-// Returns number of grabbed goroutines.
-// Can be executed by any P.
-func runqgrab(_p_ *p, batch *[256]*g, batchHead uint32, stealRunNextG bool) uint32 {
-	for {
-		h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers
-		t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer
-		n := t - h
-		n = n - n/2
-		if n == 0 {
-			if stealRunNextG {
-				// Try to steal from _p_.runnext.
-				if next := _p_.runnext; next != 0 {
-					// Sleep to ensure that _p_ isn't about to run the g we
-					// are about to steal.
-					// The important use case here is when the g running on _p_
-					// ready()s another g and then almost immediately blocks.
-					// Instead of stealing runnext in this window, back off
-					// to give _p_ a chance to schedule runnext. This will avoid
-					// thrashing gs between different Ps.
-					usleep(100)
-					if !_p_.runnext.cas(next, 0) {
-						continue
-					}
-					batch[batchHead%uint32(len(batch))] = next.ptr()
-					return 1
-				}
-			}
-			return 0
-		}
-		if n > uint32(len(_p_.runq)/2) { // read inconsistent h and t
-			continue
-		}
-		for i := uint32(0); i < n; i++ {
-			g := _p_.runq[(h+i)%uint32(len(_p_.runq))]
-			batch[(batchHead+i)%uint32(len(batch))] = g
-		}
-		if cas(&_p_.runqhead, h, h+n) { // cas-release, commits consume
-			return n
-		}
-	}
-}
-
-// Steal half of the elements from the local runnable queue of p2
-// and put them onto the local runnable queue of p.
-// Returns one of the stolen elements (or nil if it failed).
-func runqsteal(_p_, p2 *p, stealRunNextG bool) *g {
-	t := _p_.runqtail
-	n := runqgrab(p2, &_p_.runq, t, stealRunNextG)
-	if n == 0 {
-		return nil
-	}
-	n--
-	gp := _p_.runq[(t+n)%uint32(len(_p_.runq))]
-	if n == 0 {
-		return gp
-	}
-	h := atomicload(&_p_.runqhead) // load-acquire, synchronize with consumers
-	if t-h+n >= uint32(len(_p_.runq)) {
-		throw("runqsteal: runq overflow")
-	}
-	atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption
-	return gp
-}
-
-func testSchedLocalQueue() {
-	_p_ := new(p)
-	gs := make([]g, len(_p_.runq))
-	for i := 0; i < len(_p_.runq); i++ {
-		if g, _ := runqget(_p_); g != nil {
-			throw("runq is not empty initially")
-		}
-		for j := 0; j < i; j++ {
-			runqput(_p_, &gs[i], false)
-		}
-		for j := 0; j < i; j++ {
-			if g, _ := runqget(_p_); g != &gs[i] {
-				print("bad element at iter ", i, "/", j, "\n")
-				throw("bad element")
-			}
-		}
-		if g, _ := runqget(_p_); g != nil {
-			throw("runq is not empty afterwards")
-		}
-	}
-}
-
-func testSchedLocalQueueSteal() {
-	p1 := new(p)
-	p2 := new(p)
-	gs := make([]g, len(p1.runq))
-	for i := 0; i < len(p1.runq); i++ {
-		for j := 0; j < i; j++ {
-			gs[j].sig = 0
-			runqput(p1, &gs[j], false)
-		}
-		gp := runqsteal(p2, p1, true)
-		s := 0
-		if gp != nil {
-			s++
-			gp.sig++
-		}
-		for {
-			gp, _ = runqget(p2)
-			if gp == nil {
-				break
-			}
-			s++
-			gp.sig++
-		}
-		for {
-			gp, _ = runqget(p1)
-			if gp == nil {
-				break
-			}
-			gp.sig++
-		}
-		for j := 0; j < i; j++ {
-			if gs[j].sig != 1 {
-				print("bad element ", j, "(", gs[j].sig, ") at iter ", i, "\n")
-				throw("bad element")
-			}
-		}
-		if s != i/2 && s != i/2+1 {
-			print("bad steal ", s, ", want ", i/2, " or ", i/2+1, ", iter ", i, "\n")
-			throw("bad steal")
-		}
-	}
-}
-
-func setMaxThreads(in int) (out int) {
-	lock(&sched.lock)
-	out = int(sched.maxmcount)
-	sched.maxmcount = int32(in)
-	checkmcount()
-	unlock(&sched.lock)
-	return
-}
-
-func haveexperiment(name string) bool {
-	x := goexperiment
-	for x != "" {
-		xname := ""
-		i := index(x, ",")
-		if i < 0 {
-			xname, x = x, ""
-		} else {
-			xname, x = x[:i], x[i+1:]
-		}
-		if xname == name {
-			return true
-		}
-	}
-	return false
-}
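
haveexperiment is a hand-rolled scan of a comma-separated list, written that way because the runtime cannot import the strings package. Outside the runtime, the idiomatic equivalent is simply:

package main

import "strings"

// haveToken is the stdlib-flavored form of haveexperiment.
func haveToken(list, name string) bool {
	for _, tok := range strings.Split(list, ",") {
		if tok == name {
			return true
		}
	}
	return false
}

func main() {
	println(haveToken("framepointer,fieldtrack", "fieldtrack")) // true
	println(haveToken("framepointer,fieldtrack", "ssa"))        // false
}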
-
-//go:nosplit
-func procPin() int {
-	_g_ := getg()
-	mp := _g_.m
-
-	mp.locks++
-	return int(mp.p.ptr().id)
-}
-
-//go:nosplit
-func procUnpin() {
-	_g_ := getg()
-	_g_.m.locks--
-}
-
-//go:linkname sync_runtime_procPin sync.runtime_procPin
-//go:nosplit
-func sync_runtime_procPin() int {
-	return procPin()
-}
-
-//go:linkname sync_runtime_procUnpin sync.runtime_procUnpin
-//go:nosplit
-func sync_runtime_procUnpin() {
-	procUnpin()
-}
-
-//go:linkname sync_atomic_runtime_procPin sync/atomic.runtime_procPin
-//go:nosplit
-func sync_atomic_runtime_procPin() int {
-	return procPin()
-}
-
-//go:linkname sync_atomic_runtime_procUnpin sync/atomic.runtime_procUnpin
-//go:nosplit
-func sync_atomic_runtime_procUnpin() {
-	procUnpin()
-}
-
-// Active spinning for sync.Mutex.
-//go:linkname sync_runtime_canSpin sync.runtime_canSpin
-//go:nosplit
-func sync_runtime_canSpin(i int) bool {
-	// sync.Mutex is cooperative, so we are conservative with spinning.
-	// Spin only a few times and only if running on a multicore machine,
-	// GOMAXPROCS>1, there is at least one other running P, and the local runq is empty.
-	// As opposed to runtime mutex we don't do passive spinning here,
-	// because there can be work on the global runq or on other Ps.
-	if i >= active_spin || ncpu <= 1 || gomaxprocs <= int32(sched.npidle+sched.nmspinning)+1 {
-		return false
-	}
-	if p := getg().m.p.ptr(); !runqempty(p) {
-		return false
-	}
-	return true
-}
-
-//go:linkname sync_runtime_doSpin sync.runtime_doSpin
-//go:nosplit
-func sync_runtime_doSpin() {
-	procyield(active_spin_cnt)
-}
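
Put together, canSpin/doSpin implement a bounded active-spin phase before the blocking slow path. A user-level sketch of the same policy (spinLock is an invented type; runtime.Gosched and time.Sleep stand in for procyield and the real sleep path):

package main

import (
	"runtime"
	"sync/atomic"
	"time"
)

type spinLock int32

// lock tries a CAS a few times (active_spin is 4 in the runtime),
// yielding between attempts, before falling back to sleeping.
func (l *spinLock) lock() {
	const activeSpin = 4
	for i := 0; ; i++ {
		if atomic.CompareAndSwapInt32((*int32)(l), 0, 1) {
			return
		}
		if i < activeSpin {
			runtime.Gosched() // cheap active wait
		} else {
			time.Sleep(time.Microsecond) // passive fallback
		}
	}
}

func (l *spinLock) unlock() { atomic.StoreInt32((*int32)(l), 0) }

func main() {
	var l spinLock
	l.lock()
	go func() { l.unlock() }()
	l.lock() // acquired once the goroutine releases it
	println("ok")
}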
diff --git a/src/runtime/proc_runtime_test.go b/src/runtime/proc_runtime_test.go
new file mode 100644
index 0000000..a7bde2c
--- /dev/null
+++ b/src/runtime/proc_runtime_test.go
@@ -0,0 +1,33 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Proc unit tests. Placed in the runtime package so they can use runtime guts.
+
+package runtime
+
+func RunStealOrderTest() {
+	var ord randomOrder
+	for procs := 1; procs <= 64; procs++ {
+		ord.reset(uint32(procs))
+		if procs >= 3 && len(ord.coprimes) < 2 {
+			panic("too few coprimes")
+		}
+		for co := 0; co < len(ord.coprimes); co++ {
+			enum := ord.start(uint32(co))
+			checked := make([]bool, procs)
+			for p := 0; p < procs; p++ {
+				x := enum.position()
+				if checked[x] {
+					println("procs:", procs, "inc:", enum.inc)
+					panic("duplicate during enumeration")
+				}
+				checked[x] = true
+				enum.next()
+			}
+			if !enum.done() {
+				panic("not done")
+			}
+		}
+	}
+}
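
The property this test checks is elementary number theory: stepping through 0..n-1 by an increment coprime to n visits every position exactly once before returning to the start. A small demonstration:

package main

import "fmt"

func gcd(a, b uint32) uint32 {
	for b != 0 {
		a, b = b, a%b
	}
	return a
}

// enumerate walks 0..n-1 starting at start with stride inc (mod n).
func enumerate(n, start, inc uint32) []uint32 {
	order := make([]uint32, 0, n)
	pos := start % n
	for i := uint32(0); i < n; i++ {
		order = append(order, pos)
		pos = (pos + inc) % n
	}
	return order
}

func main() {
	const n = 6
	for inc := uint32(1); inc <= n; inc++ {
		if gcd(inc, n) != 1 {
			continue // not coprime: the walk would revisit positions
		}
		fmt.Println("inc", inc, "order", enumerate(n, 0, inc))
	}
}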
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 2be103e..22e4dca 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -6,8 +6,10 @@
 
 import (
 	"math"
+	"net"
 	"runtime"
 	"runtime/debug"
+	"strings"
 	"sync"
 	"sync/atomic"
 	"syscall"
@@ -132,6 +134,86 @@
 	}
 }
 
+// Test that all runnable goroutines are scheduled at the same time.
+func TestGoroutineParallelism2(t *testing.T) {
+	//testGoroutineParallelism2(t, false, false)
+	testGoroutineParallelism2(t, true, false)
+	testGoroutineParallelism2(t, false, true)
+	testGoroutineParallelism2(t, true, true)
+}
+
+func testGoroutineParallelism2(t *testing.T, load, netpoll bool) {
+	if runtime.NumCPU() == 1 {
+		// Takes too long, too easy to deadlock, etc.
+		t.Skip("skipping on uniprocessor")
+	}
+	P := 4
+	N := 10
+	if testing.Short() {
+		N = 3
+	}
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(P))
+	// If runtime triggers a forced GC during this test then it will deadlock,
+	// since the goroutines can't be stopped/preempted.
+	// Disable GC for this test (see issue #10958).
+	defer debug.SetGCPercent(debug.SetGCPercent(-1))
+	for try := 0; try < N; try++ {
+		if load {
+			// Create P goroutines and wait until they all run.
+			// When we run the actual test below, worker threads
+			// running the goroutines will start parking.
+			done := make(chan bool)
+			x := uint32(0)
+			for p := 0; p < P; p++ {
+				go func() {
+					if atomic.AddUint32(&x, 1) == uint32(P) {
+						done <- true
+						return
+					}
+					for atomic.LoadUint32(&x) != uint32(P) {
+					}
+				}()
+			}
+			<-done
+		}
+		if netpoll {
+			// Enable the netpoller; it affects scheduler behavior.
+			laddr := "localhost:0"
+			if runtime.GOOS == "android" {
+				// On some Android devices, there are no records for localhost,
+				// see https://golang.org/issues/14486.
+				// Don't use 127.0.0.1 for every case, it won't work on IPv6-only systems.
+				laddr = "127.0.0.1:0"
+			}
+			ln, err := net.Listen("tcp", laddr)
+			if err == nil {
+				defer ln.Close() // yup, defer in a loop
+			}
+		}
+		done := make(chan bool)
+		x := uint32(0)
+		// Spawn P goroutines in a nested fashion just to differ from TestGoroutineParallelism.
+		for p := 0; p < P/2; p++ {
+			go func(p int) {
+				for p2 := 0; p2 < 2; p2++ {
+					go func(p2 int) {
+						for i := 0; i < 3; i++ {
+							expected := uint32(P*i + p*2 + p2)
+							for atomic.LoadUint32(&x) != expected {
+							}
+							atomic.StoreUint32(&x, expected+1)
+						}
+						done <- true
+					}(p2)
+				}
+			}(p)
+		}
+		for p := 0; p < P; p++ {
+			<-done
+		}
+	}
+}
+
 func TestBlockLocked(t *testing.T) {
 	const N = 10
 	c := make(chan bool)
@@ -255,47 +337,52 @@
 }
 
 func TestGCFairness(t *testing.T) {
-	output := executeTest(t, testGCFairnessSource, nil)
+	output := runTestProg(t, "testprog", "GCFairness")
 	want := "OK\n"
 	if output != want {
 		t.Fatalf("want %s, got %s\n", want, output)
 	}
 }
 
-const testGCFairnessSource = `
-package main
-
-import (
-	"fmt"
-	"os"
-	"runtime"
-	"time"
-)
-
-func main() {
-	runtime.GOMAXPROCS(1)
-	f, err := os.Open("/dev/null")
-	if os.IsNotExist(err) {
-		// This test tests what it is intended to test only if writes are fast.
-		// If there is no /dev/null, we just don't execute the test.
-		fmt.Println("OK")
-		return
+func TestGCFairness2(t *testing.T) {
+	output := runTestProg(t, "testprog", "GCFairness2")
+	want := "OK\n"
+	if output != want {
+		t.Fatalf("want %s, got %s\n", want, output)
 	}
-	if err != nil {
-		fmt.Println(err)
-		os.Exit(1)
-	}
-	for i := 0; i < 2; i++ {
-		go func() {
-			for {
-				f.Write([]byte("."))
-			}
-		}()
-	}
-	time.Sleep(10 * time.Millisecond)
-	fmt.Println("OK")
 }
-`
+
+func TestNumGoroutine(t *testing.T) {
+	output := runTestProg(t, "testprog", "NumGoroutine")
+	want := "1\n"
+	if output != want {
+		t.Fatalf("want %q, got %q", want, output)
+	}
+
+	buf := make([]byte, 1<<20)
+
+	// Try up to 10 times for a match before giving up.
+	// This is a fundamentally racy check but it's important
+	// to notice if NumGoroutine and Stack are _always_ out of sync.
+	for i := 0; ; i++ {
+		// Give goroutines about to exit a chance to exit.
+		// The NumGoroutine and Stack below need to see
+		// the same state of the world, so anything we can do
+		// to keep it quiet is good.
+		runtime.Gosched()
+
+		n := runtime.NumGoroutine()
+		buf = buf[:runtime.Stack(buf, true)]
+
+		nstk := strings.Count(string(buf), "goroutine ")
+		if n == nstk {
+			break
+		}
+		if i >= 10 {
+			t.Fatalf("NumGoroutine=%d, but found %d goroutines in stack dump: %s", n, nstk, buf)
+		}
+	}
+}
 
 func TestPingPongHog(t *testing.T) {
 	if testing.Short() {
@@ -349,6 +436,9 @@
 }
 
 func BenchmarkPingPongHog(b *testing.B) {
+	if b.N == 0 {
+		return
+	}
 	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
 
 	// Create a CPU hog
@@ -471,6 +561,24 @@
 	runtime.RunSchedLocalQueueStealTest()
 }
 
+func TestSchedLocalQueueEmpty(t *testing.T) {
+	if runtime.NumCPU() == 1 {
+		// Takes too long and does not trigger the race.
+		t.Skip("skipping on uniprocessor")
+	}
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(4))
+
+	// If runtime triggers a forced GC during this test then it will deadlock,
+	// since the goroutines can't be stopped/preempted during spin wait.
+	defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+	iters := int(1e5)
+	if testing.Short() {
+		iters = 1e2
+	}
+	runtime.RunSchedLocalQueueEmptyTest(iters)
+}
+
 func benchmarkStackGrowth(b *testing.B, rec int) {
 	b.RunParallel(func(pb *testing.PB) {
 		for pb.Next() {
@@ -607,3 +715,7 @@
 		done <- struct{}{}
 	}
 }
+
+func TestStealOrder(t *testing.T) {
+	runtime.RunStealOrderTest()
+}
diff --git a/src/runtime/race.go b/src/runtime/race.go
index 923d611..42da936 100644
--- a/src/runtime/race.go
+++ b/src/runtime/race.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -58,7 +58,7 @@
 //go:noescape
 func racewritepc(addr unsafe.Pointer, callpc, pc uintptr)
 
-type symbolizeContext struct {
+type symbolizeCodeContext struct {
 	pc   uintptr
 	fn   *byte
 	file *byte
@@ -70,8 +70,27 @@
 var qq = [...]byte{'?', '?', 0}
 var dash = [...]byte{'-', 0}
 
+const (
+	raceGetProcCmd = iota
+	raceSymbolizeCodeCmd
+	raceSymbolizeDataCmd
+)
+
 // Callback from C into Go, runs on g0.
-func racesymbolize(ctx *symbolizeContext) {
+func racecallback(cmd uintptr, ctx unsafe.Pointer) {
+	switch cmd {
+	case raceGetProcCmd:
+		throw("should have been handled by racecallbackthunk")
+	case raceSymbolizeCodeCmd:
+		raceSymbolizeCode((*symbolizeCodeContext)(ctx))
+	case raceSymbolizeDataCmd:
+		raceSymbolizeData((*symbolizeDataContext)(ctx))
+	default:
+		throw("unknown command")
+	}
+}
+
+func raceSymbolizeCode(ctx *symbolizeCodeContext) {
 	f := findfunc(ctx.pc)
 	if f == nil {
 		ctx.fn = &qq[0]
@@ -90,3 +109,360 @@
 	ctx.res = 1
 	return
 }
+
+type symbolizeDataContext struct {
+	addr  uintptr
+	heap  uintptr
+	start uintptr
+	size  uintptr
+	name  *byte
+	file  *byte
+	line  uintptr
+	res   uintptr
+}
+
+func raceSymbolizeData(ctx *symbolizeDataContext) {
+	if _, x, n := findObject(unsafe.Pointer(ctx.addr)); x != nil {
+		ctx.heap = 1
+		ctx.start = uintptr(x)
+		ctx.size = n
+		ctx.res = 1
+	}
+}
+
+// Race runtime functions called via runtime·racecall.
+//go:linkname __tsan_init __tsan_init
+var __tsan_init byte
+
+//go:linkname __tsan_fini __tsan_fini
+var __tsan_fini byte
+
+//go:linkname __tsan_proc_create __tsan_proc_create
+var __tsan_proc_create byte
+
+//go:linkname __tsan_proc_destroy __tsan_proc_destroy
+var __tsan_proc_destroy byte
+
+//go:linkname __tsan_map_shadow __tsan_map_shadow
+var __tsan_map_shadow byte
+
+//go:linkname __tsan_finalizer_goroutine __tsan_finalizer_goroutine
+var __tsan_finalizer_goroutine byte
+
+//go:linkname __tsan_go_start __tsan_go_start
+var __tsan_go_start byte
+
+//go:linkname __tsan_go_end __tsan_go_end
+var __tsan_go_end byte
+
+//go:linkname __tsan_malloc __tsan_malloc
+var __tsan_malloc byte
+
+//go:linkname __tsan_free __tsan_free
+var __tsan_free byte
+
+//go:linkname __tsan_acquire __tsan_acquire
+var __tsan_acquire byte
+
+//go:linkname __tsan_release __tsan_release
+var __tsan_release byte
+
+//go:linkname __tsan_release_merge __tsan_release_merge
+var __tsan_release_merge byte
+
+//go:linkname __tsan_go_ignore_sync_begin __tsan_go_ignore_sync_begin
+var __tsan_go_ignore_sync_begin byte
+
+//go:linkname __tsan_go_ignore_sync_end __tsan_go_ignore_sync_end
+var __tsan_go_ignore_sync_end byte
+
+// Mimic what cmd/cgo would do.
+//go:cgo_import_static __tsan_init
+//go:cgo_import_static __tsan_fini
+//go:cgo_import_static __tsan_proc_create
+//go:cgo_import_static __tsan_proc_destroy
+//go:cgo_import_static __tsan_map_shadow
+//go:cgo_import_static __tsan_finalizer_goroutine
+//go:cgo_import_static __tsan_go_start
+//go:cgo_import_static __tsan_go_end
+//go:cgo_import_static __tsan_malloc
+//go:cgo_import_static __tsan_free
+//go:cgo_import_static __tsan_acquire
+//go:cgo_import_static __tsan_release
+//go:cgo_import_static __tsan_release_merge
+//go:cgo_import_static __tsan_go_ignore_sync_begin
+//go:cgo_import_static __tsan_go_ignore_sync_end
+
+// These are called from race_amd64.s.
+//go:cgo_import_static __tsan_read
+//go:cgo_import_static __tsan_read_pc
+//go:cgo_import_static __tsan_read_range
+//go:cgo_import_static __tsan_write
+//go:cgo_import_static __tsan_write_pc
+//go:cgo_import_static __tsan_write_range
+//go:cgo_import_static __tsan_func_enter
+//go:cgo_import_static __tsan_func_exit
+
+//go:cgo_import_static __tsan_go_atomic32_load
+//go:cgo_import_static __tsan_go_atomic64_load
+//go:cgo_import_static __tsan_go_atomic32_store
+//go:cgo_import_static __tsan_go_atomic64_store
+//go:cgo_import_static __tsan_go_atomic32_exchange
+//go:cgo_import_static __tsan_go_atomic64_exchange
+//go:cgo_import_static __tsan_go_atomic32_fetch_add
+//go:cgo_import_static __tsan_go_atomic64_fetch_add
+//go:cgo_import_static __tsan_go_atomic32_compare_exchange
+//go:cgo_import_static __tsan_go_atomic64_compare_exchange
+
+// start/end of global data (data+bss).
+var racedatastart uintptr
+var racedataend uintptr
+
+// start/end of heap for race_amd64.s
+var racearenastart uintptr
+var racearenaend uintptr
+
+func racefuncenter(uintptr)
+func racefuncexit()
+func racereadrangepc1(uintptr, uintptr, uintptr)
+func racewriterangepc1(uintptr, uintptr, uintptr)
+func racecallbackthunk(uintptr)
+
+// racecall allows calling an arbitrary function f from C race runtime
+// with up to 4 uintptr arguments.
+func racecall(*byte, uintptr, uintptr, uintptr, uintptr)
+
+// checks if the address has shadow (i.e. heap or data/bss)
+//go:nosplit
+func isvalidaddr(addr unsafe.Pointer) bool {
+	return racearenastart <= uintptr(addr) && uintptr(addr) < racearenaend ||
+		racedatastart <= uintptr(addr) && uintptr(addr) < racedataend
+}
+
+//go:nosplit
+func raceinit() (gctx, pctx uintptr) {
+	// cgo is required to initialize libc, which is used by race runtime
+	if !iscgo {
+		throw("raceinit: race build must use cgo")
+	}
+
+	racecall(&__tsan_init, uintptr(unsafe.Pointer(&gctx)), uintptr(unsafe.Pointer(&pctx)), funcPC(racecallbackthunk), 0)
+
+	// Round data segment to page boundaries, because it's used in mmap().
+	start := ^uintptr(0)
+	end := uintptr(0)
+	if start > firstmoduledata.noptrdata {
+		start = firstmoduledata.noptrdata
+	}
+	if start > firstmoduledata.data {
+		start = firstmoduledata.data
+	}
+	if start > firstmoduledata.noptrbss {
+		start = firstmoduledata.noptrbss
+	}
+	if start > firstmoduledata.bss {
+		start = firstmoduledata.bss
+	}
+	if end < firstmoduledata.enoptrdata {
+		end = firstmoduledata.enoptrdata
+	}
+	if end < firstmoduledata.edata {
+		end = firstmoduledata.edata
+	}
+	if end < firstmoduledata.enoptrbss {
+		end = firstmoduledata.enoptrbss
+	}
+	if end < firstmoduledata.ebss {
+		end = firstmoduledata.ebss
+	}
+	size := round(end-start, _PageSize)
+	racecall(&__tsan_map_shadow, start, size, 0, 0)
+	racedatastart = start
+	racedataend = start + size
+
+	return
+}
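
The round call above is the runtime's usual power-of-two round-up. For reference, the same one-liner in isolation (roundUp is an illustrative name):

package main

import "fmt"

// roundUp rounds n up to a multiple of a, where a is a power of two
// (e.g. _PageSize).
func roundUp(n, a uintptr) uintptr {
	return (n + a - 1) &^ (a - 1)
}

func main() {
	const pageSize = 4096
	fmt.Println(roundUp(1, pageSize), roundUp(4096, pageSize), roundUp(4097, pageSize)) // 4096 4096 8192
}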
+
+var raceFiniLock mutex
+
+//go:nosplit
+func racefini() {
+	// racefini() can only be called once to avoid races.
+	// This eventually (via __tsan_fini) calls C.exit which has
+// undefined behavior if called more than once. If the lock is
+// already held, it's assumed that the first caller exits the program,
+// so other calls can hang forever without an issue.
+	lock(&raceFiniLock)
+	racecall(&__tsan_fini, 0, 0, 0, 0)
+}
+
+//go:nosplit
+func raceproccreate() uintptr {
+	var ctx uintptr
+	racecall(&__tsan_proc_create, uintptr(unsafe.Pointer(&ctx)), 0, 0, 0)
+	return ctx
+}
+
+//go:nosplit
+func raceprocdestroy(ctx uintptr) {
+	racecall(&__tsan_proc_destroy, ctx, 0, 0, 0)
+}
+
+//go:nosplit
+func racemapshadow(addr unsafe.Pointer, size uintptr) {
+	if racearenastart == 0 {
+		racearenastart = uintptr(addr)
+	}
+	if racearenaend < uintptr(addr)+size {
+		racearenaend = uintptr(addr) + size
+	}
+	racecall(&__tsan_map_shadow, uintptr(addr), size, 0, 0)
+}
+
+//go:nosplit
+func racemalloc(p unsafe.Pointer, sz uintptr) {
+	racecall(&__tsan_malloc, 0, 0, uintptr(p), sz)
+}
+
+//go:nosplit
+func racefree(p unsafe.Pointer, sz uintptr) {
+	racecall(&__tsan_free, uintptr(p), sz, 0, 0)
+}
+
+//go:nosplit
+func racegostart(pc uintptr) uintptr {
+	_g_ := getg()
+	var spawng *g
+	if _g_.m.curg != nil {
+		spawng = _g_.m.curg
+	} else {
+		spawng = _g_
+	}
+
+	var racectx uintptr
+	racecall(&__tsan_go_start, spawng.racectx, uintptr(unsafe.Pointer(&racectx)), pc, 0)
+	return racectx
+}
+
+//go:nosplit
+func racegoend() {
+	racecall(&__tsan_go_end, getg().racectx, 0, 0, 0)
+}
+
+//go:nosplit
+func racewriterangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
+	_g_ := getg()
+	if _g_ != _g_.m.curg {
+		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
+		// Not interesting.
+		return
+	}
+	if callpc != 0 {
+		racefuncenter(callpc)
+	}
+	racewriterangepc1(uintptr(addr), sz, pc)
+	if callpc != 0 {
+		racefuncexit()
+	}
+}
+
+//go:nosplit
+func racereadrangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
+	_g_ := getg()
+	if _g_ != _g_.m.curg {
+		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
+		// Not interesting.
+		return
+	}
+	if callpc != 0 {
+		racefuncenter(callpc)
+	}
+	racereadrangepc1(uintptr(addr), sz, pc)
+	if callpc != 0 {
+		racefuncexit()
+	}
+}
+
+//go:nosplit
+func raceacquire(addr unsafe.Pointer) {
+	raceacquireg(getg(), addr)
+}
+
+//go:nosplit
+func raceacquireg(gp *g, addr unsafe.Pointer) {
+	if getg().raceignore != 0 || !isvalidaddr(addr) {
+		return
+	}
+	racecall(&__tsan_acquire, gp.racectx, uintptr(addr), 0, 0)
+}
+
+//go:nosplit
+func racerelease(addr unsafe.Pointer) {
+	racereleaseg(getg(), addr)
+}
+
+//go:nosplit
+func racereleaseg(gp *g, addr unsafe.Pointer) {
+	if getg().raceignore != 0 || !isvalidaddr(addr) {
+		return
+	}
+	racecall(&__tsan_release, gp.racectx, uintptr(addr), 0, 0)
+}
+
+//go:nosplit
+func racereleasemerge(addr unsafe.Pointer) {
+	racereleasemergeg(getg(), addr)
+}
+
+//go:nosplit
+func racereleasemergeg(gp *g, addr unsafe.Pointer) {
+	if getg().raceignore != 0 || !isvalidaddr(addr) {
+		return
+	}
+	racecall(&__tsan_release_merge, gp.racectx, uintptr(addr), 0, 0)
+}
+
+//go:nosplit
+func racefingo() {
+	racecall(&__tsan_finalizer_goroutine, getg().racectx, 0, 0, 0)
+}
+
+//go:nosplit
+
+func RaceAcquire(addr unsafe.Pointer) {
+	raceacquire(addr)
+}
+
+//go:nosplit
+
+func RaceRelease(addr unsafe.Pointer) {
+	racerelease(addr)
+}
+
+//go:nosplit
+
+func RaceReleaseMerge(addr unsafe.Pointer) {
+	racereleasemerge(addr)
+}
+
+//go:nosplit
+
+// RaceDisable disables handling of race events in the current goroutine.
+func RaceDisable() {
+	_g_ := getg()
+	if _g_.raceignore == 0 {
+		racecall(&__tsan_go_ignore_sync_begin, _g_.racectx, 0, 0, 0)
+	}
+	_g_.raceignore++
+}
+
+//go:nosplit
+
+// RaceEnable re-enables handling of race events in the current goroutine.
+func RaceEnable() {
+	_g_ := getg()
+	_g_.raceignore--
+	if _g_.raceignore == 0 {
+		racecall(&__tsan_go_ignore_sync_end, _g_.racectx, 0, 0, 0)
+	}
+}
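
RaceAcquire, RaceRelease, RaceReleaseMerge, RaceDisable and RaceEnable are exported for user code with intentional or externally-synchronized races. A minimal sketch of the Disable/Enable pair; it only compiles under -race (hence the build tag), and the names and best-effort-counter scenario are invented for illustration:

// +build race

package main

import (
	"runtime"
	"time"
)

var hits int // written by a worker without locks, on purpose

// snapshot takes a racy, best-effort read of hits. RaceDisable/RaceEnable
// bracket the read so the detector ignores this intentional race.
func snapshot() int {
	runtime.RaceDisable()
	v := hits
	runtime.RaceEnable()
	return v
}

func main() {
	go func() {
		for {
			hits++
		}
	}()
	time.Sleep(time.Millisecond)
	println("approximate hits:", snapshot())
}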
diff --git a/src/runtime/race/README b/src/runtime/race/README
index 52dd38e..3a506b0 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -1,12 +1,7 @@
 The runtime/race package contains the data race detector runtime library.
 It is based on the ThreadSanitizer race detector, which is currently part of
-the LLVM project.
+the LLVM project (http://llvm.org/git/compiler-rt.git).
 
-To update the .syso files you need to:
-$ svn co http://llvm.org/svn/llvm-project/compiler-rt/trunk
-$ cd compiler-rt/lib/tsan/go
-$ ./buildgo.sh
+To update the .syso files use golang.org/x/build/cmd/racebuild.
 
-Tested with gcc 4.6.1 and 4.7.0.  On Windows it's built with 64-bit MinGW.
-
-Current runtime is built on rev 229396.
+Current runtime is built on rev 9d79ea3416bfbe3acac50e47802ee9621bf53254.
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
index a9f9f0f..5157f7e 100644
--- a/src/runtime/race/output_test.go
+++ b/src/runtime/race/output_test.go
@@ -51,7 +51,10 @@
 			}
 			cmd.Env = append(cmd.Env, env)
 		}
-		cmd.Env = append(cmd.Env, "GORACE="+test.gorace)
+		cmd.Env = append(cmd.Env,
+			"GOMAXPROCS=1", // see comment in race_test.go
+			"GORACE="+test.gorace,
+		)
 		got, _ := cmd.CombinedOutput()
 		if !regexp.MustCompile(test.re).MatchString(string(got)) {
 			t.Fatalf("failed test case %v, expect:\n%v\ngot:\n%s",
@@ -90,13 +93,13 @@
 }
 `, `==================
 WARNING: DATA RACE
-Write by goroutine [0-9]:
+Write at 0x[0-9,a-f]+ by goroutine [0-9]:
   main\.store\(\)
       .+/main\.go:12 \+0x[0-9,a-f]+
   main\.racer\(\)
       .+/main\.go:19 \+0x[0-9,a-f]+
 
-Previous write by main goroutine:
+Previous write at 0x[0-9,a-f]+ by main goroutine:
   main\.store\(\)
       .+/main\.go:12 \+0x[0-9,a-f]+
   main\.main\(\)
@@ -177,4 +180,21 @@
 PASS
 Found 1 data race\(s\)
 FAIL`},
+
+	{"slicebytetostring_pc", "run", "atexit_sleep_ms=0", `
+package main
+func main() {
+	done := make(chan string)
+	data := make([]byte, 10)
+	go func() {
+		done <- string(data)
+	}()
+	data[0] = 1
+	<-done
+}
+`, `
+  runtime\.slicebytetostring\(\)
+      .*/runtime/string\.go:.*
+  main\.main\.func1\(\)
+      .*/main.go:7`},
 }
diff --git a/src/runtime/race/race.go b/src/runtime/race/race.go
index 31deedd..15e2011 100644
--- a/src/runtime/race/race.go
+++ b/src/runtime/race/race.go
@@ -9,7 +9,7 @@
 // This file merely ensures that we link in runtime/cgo in race build,
 // this in turn ensures that the runtime uses pthread_create to create threads.
 // The prebuilt race runtime lives in race_GOOS_GOARCH.syso.
-// Calls to the runtime are done directly from src/runtime/race.c.
+// Calls to the runtime are done directly from src/runtime/race.go.
 
 // void __race_unused_func(void);
 import "C"
diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso
index 9cf1ecc..1822486 100644
--- a/src/runtime/race/race_darwin_amd64.syso
+++ b/src/runtime/race/race_darwin_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso
index 50ae2d3..75d9495 100644
--- a/src/runtime/race/race_freebsd_amd64.syso
+++ b/src/runtime/race/race_freebsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso
index a141051..8f571af 100644
--- a/src/runtime/race/race_linux_amd64.syso
+++ b/src/runtime/race/race_linux_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_test.go b/src/runtime/race/race_linux_test.go
new file mode 100644
index 0000000..c00ce4d
--- /dev/null
+++ b/src/runtime/race/race_linux_test.go
@@ -0,0 +1,37 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux,race
+
+package race_test
+
+import (
+	"sync/atomic"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+func TestAtomicMmap(t *testing.T) {
+	// Test that atomic operations work on "external" memory. Previously they crashed (#16206).
+	// Also do a sanity correctness check: under race detector atomic operations
+	// are implemented inside of race runtime.
+	mem, err := syscall.Mmap(-1, 0, 1<<20, syscall.PROT_READ|syscall.PROT_WRITE, syscall.MAP_ANON|syscall.MAP_PRIVATE)
+	if err != nil {
+		t.Fatalf("mmap failed: %v", err)
+	}
+	defer syscall.Munmap(mem)
+	a := (*uint64)(unsafe.Pointer(&mem[0]))
+	if *a != 0 {
+		t.Fatalf("bad atomic value: %v, want 0", *a)
+	}
+	atomic.AddUint64(a, 1)
+	if *a != 1 {
+		t.Fatalf("bad atomic value: %v, want 1", *a)
+	}
+	atomic.AddUint64(a, 1)
+	if *a != 2 {
+		t.Fatalf("bad atomic value: %v, want 2", *a)
+	}
+}
diff --git a/src/runtime/race/race_test.go b/src/runtime/race/race_test.go
index 6898e74..81e51cc 100644
--- a/src/runtime/race/race_test.go
+++ b/src/runtime/race/race_test.go
@@ -17,10 +17,13 @@
 	"fmt"
 	"io"
 	"log"
+	"math/rand"
 	"os"
 	"os/exec"
 	"path/filepath"
 	"strings"
+	"sync"
+	"sync/atomic"
 	"testing"
 )
 
@@ -155,7 +158,20 @@
 		}
 		cmd.Env = append(cmd.Env, env)
 	}
-	cmd.Env = append(cmd.Env, `GORACE=suppress_equal_stacks=0 suppress_equal_addresses=0 exitcode=0`)
+	// We set GOMAXPROCS=1 to prevent test flakiness.
+	// There are two sources of flakiness:
+	// 1. Some tests rely on particular execution order.
+	//    If the order is different, race does not happen at all.
+	// 2. Ironically, ThreadSanitizer runtime contains a logical race condition
+	//    that can lead to false negatives if racy accesses happen literally at the same time.
+	// Tests used to work reliably in the good old days of GOMAXPROCS=1.
+	// So let's set it for now. A more reliable solution is to explicitly annotate tests
+	// with required execution order by means of a special "invisible" synchronization primitive
+	// (that's what is done for C++ ThreadSanitizer tests). This is issue #14119.
+	cmd.Env = append(cmd.Env,
+		"GOMAXPROCS=1",
+		"GORACE=suppress_equal_stacks=0 suppress_equal_addresses=0 exitcode=0",
+	)
 	return cmd.CombinedOutput()
 }
 
@@ -182,3 +198,26 @@
 		t.Errorf("mangled a: %q %q", a, a[:1])
 	}
 }
+
+func BenchmarkSyncLeak(b *testing.B) {
+	const (
+		G = 1000
+		S = 1000
+		H = 10
+	)
+	var wg sync.WaitGroup
+	wg.Add(G)
+	for g := 0; g < G; g++ {
+		go func() {
+			defer wg.Done()
+			hold := make([][]uint32, H)
+			for i := 0; i < b.N; i++ {
+				a := make([]uint32, S)
+				atomic.AddUint32(&a[rand.Intn(len(a))], 1)
+				hold[rand.Intn(len(hold))] = a
+			}
+			_ = hold
+		}()
+	}
+	wg.Wait()
+}
diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso
index 125115e..64c54b6 100644
--- a/src/runtime/race/race_windows_amd64.syso
+++ b/src/runtime/race/race_windows_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_windows_test.go b/src/runtime/race/race_windows_test.go
new file mode 100644
index 0000000..307a1ea
--- /dev/null
+++ b/src/runtime/race/race_windows_test.go
@@ -0,0 +1,46 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build windows,race
+
+package race_test
+
+import (
+	"sync/atomic"
+	"syscall"
+	"testing"
+	"unsafe"
+)
+
+func TestAtomicMmap(t *testing.T) {
+	// Test that atomic operations work on "external" memory. Previously they crashed (#16206).
+	// Also do a sanity correctness check: under the race detector, atomic
+	// operations are implemented inside the race runtime.
+	kernel32 := syscall.NewLazyDLL("kernel32.dll")
+	VirtualAlloc := kernel32.NewProc("VirtualAlloc")
+	VirtualFree := kernel32.NewProc("VirtualFree")
+	const (
+		MEM_COMMIT     = 0x00001000
+		MEM_RESERVE    = 0x00002000
+		MEM_RELEASE    = 0x8000
+		PAGE_READWRITE = 0x04
+	)
+	mem, _, err := syscall.Syscall6(VirtualAlloc.Addr(), 4, 0, 1<<20, MEM_COMMIT|MEM_RESERVE, PAGE_READWRITE, 0, 0)
+	if err != 0 {
+		t.Fatalf("VirtualAlloc failed: %v", err)
+	}
+	defer syscall.Syscall(VirtualFree.Addr(), 3, mem, 1<<20, MEM_RELEASE)
+	a := (*uint64)(unsafe.Pointer(mem))
+	if *a != 0 {
+		t.Fatalf("bad atomic value: %v, want 0", *a)
+	}
+	atomic.AddUint64(a, 1)
+	if *a != 1 {
+		t.Fatalf("bad atomic value: %v, want 1", *a)
+	}
+	atomic.AddUint64(a, 1)
+	if *a != 2 {
+		t.Fatalf("bad atomic value: %v, want 2", *a)
+	}
+}
diff --git a/src/runtime/race/sched_test.go b/src/runtime/race/sched_test.go
index aac8fed..d6bb323 100644
--- a/src/runtime/race/sched_test.go
+++ b/src/runtime/race/sched_test.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/race/testdata/chan_test.go b/src/runtime/race/testdata/chan_test.go
index eabd81f..4491916 100644
--- a/src/runtime/race/testdata/chan_test.go
+++ b/src/runtime/race/testdata/chan_test.go
@@ -285,17 +285,20 @@
 	v1 := 0
 	v2 := 0
 	c := make(chan int, 1)
+	done := make(chan bool)
 	go func() {
 		defer func() {
 			recover()
 		}()
 		v1 = 1
 		c <- 1
+		done <- true
 	}()
 	go func() {
 		time.Sleep(1e7)
 		v2 = 2
 		close(c)
+		done <- true
 	}()
 	time.Sleep(2e7)
 	if _, who := <-c; who {
@@ -303,6 +306,8 @@
 	} else {
 		v1 = 2
 	}
+	<-done
+	<-done
 }
 
 func TestRaceChanSendClose(t *testing.T) {
@@ -657,3 +662,42 @@
 		_ = data[i]
 	}
 }
+
+// Test that sender synchronizes with receiver even if the sender was blocked.
+func TestNoRaceBlockedSendSync(t *testing.T) {
+	c := make(chan *int, 1)
+	c <- nil
+	go func() {
+		i := 42
+		c <- &i
+	}()
+	// Give the sender time to actually block.
+	// This sleep is completely optional: a race report must not be printed
+	// regardless of whether the sender actually blocks or not.
+	// It cannot lead to flakiness.
+	time.Sleep(10 * time.Millisecond)
+	<-c
+	p := <-c
+	if *p != 42 {
+		t.Fatal()
+	}
+}
+
+// The same as TestNoRaceBlockedSendSync above, but sender unblock happens in a select.
+func TestNoRaceBlockedSelectSendSync(t *testing.T) {
+	c := make(chan *int, 1)
+	c <- nil
+	go func() {
+		i := 42
+		c <- &i
+	}()
+	time.Sleep(10 * time.Millisecond)
+	<-c
+	select {
+	case p := <-c:
+		if *p != 42 {
+			t.Fatal()
+		}
+	case <-make(chan int):
+	}
+}
diff --git a/src/runtime/race/testdata/io_test.go b/src/runtime/race/testdata/io_test.go
index 1b3ee38..30a121b 100644
--- a/src/runtime/race/testdata/io_test.go
+++ b/src/runtime/race/testdata/io_test.go
@@ -7,9 +7,11 @@
 import (
 	"fmt"
 	"io/ioutil"
+	"net"
 	"net/http"
 	"os"
 	"path/filepath"
+	"sync"
 	"testing"
 	"time"
 )
@@ -41,29 +43,34 @@
 	_ = x
 }
 
+var (
+	regHandler  sync.Once
+	handlerData int
+)
+
 func TestNoRaceIOHttp(t *testing.T) {
-	x := 0
-	go func() {
+	regHandler.Do(func() {
 		http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
-			x = 41
+			handlerData++
 			fmt.Fprintf(w, "test")
-			x = 42
+			handlerData++
 		})
-		err := http.ListenAndServe("127.0.0.1:23651", nil)
-		if err != nil {
-			t.Fatalf("http.ListenAndServe: %v", err)
-		}
-	}()
-	time.Sleep(1e7)
-	x = 1
-	_, err := http.Get("http://127.0.0.1:23651")
+	})
+	ln, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("net.Listen: %v", err)
+	}
+	defer ln.Close()
+	go http.Serve(ln, nil)
+	handlerData++
+	_, err = http.Get("http://" + ln.Addr().String())
 	if err != nil {
 		t.Fatalf("http.Get: %v", err)
 	}
-	x = 2
-	_, err = http.Get("http://127.0.0.1:23651")
+	handlerData++
+	_, err = http.Get("http://" + ln.Addr().String())
 	if err != nil {
 		t.Fatalf("http.Get: %v", err)
 	}
-	x = 3
+	handlerData++
 }
diff --git a/src/runtime/race/testdata/issue12225_test.go b/src/runtime/race/testdata/issue12225_test.go
new file mode 100644
index 0000000..0494493
--- /dev/null
+++ b/src/runtime/race/testdata/issue12225_test.go
@@ -0,0 +1,20 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package race_test
+
+import "unsafe"
+
+// golang.org/issue/12225
+// The test is that this compiles at all.
+
+//go:noinline
+func convert(s string) []byte {
+	return []byte(s)
+}
+
+func issue12225() {
+	println(*(*int)(unsafe.Pointer(&convert("")[0])))
+	println(*(*int)(unsafe.Pointer(&[]byte("")[0])))
+}
diff --git a/src/runtime/race/testdata/issue12664_test.go b/src/runtime/race/testdata/issue12664_test.go
new file mode 100644
index 0000000..c9f790e
--- /dev/null
+++ b/src/runtime/race/testdata/issue12664_test.go
@@ -0,0 +1,76 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package race_test
+
+import (
+	"fmt"
+	"testing"
+)
+
+var issue12664 = "hi"
+
+func TestRaceIssue12664(t *testing.T) {
+	c := make(chan struct{})
+	go func() {
+		issue12664 = "bye"
+		close(c)
+	}()
+	fmt.Println(issue12664)
+	<-c
+}
+
+type MyI interface {
+	foo()
+}
+
+type MyT int
+
+func (MyT) foo() {
+}
+
+var issue12664_2 MyT = 0
+
+func TestRaceIssue12664_2(t *testing.T) {
+	c := make(chan struct{})
+	go func() {
+		issue12664_2 = 1
+		close(c)
+	}()
+	func(x MyI) {
+		// Never true, but prevents inlining.
+		if x.(MyT) == -1 {
+			close(c)
+		}
+	}(issue12664_2)
+	<-c
+}
+
+var issue12664_3 MyT = 0
+
+func TestRaceIssue12664_3(t *testing.T) {
+	c := make(chan struct{})
+	go func() {
+		issue12664_3 = 1
+		close(c)
+	}()
+	var r MyT
+	var i interface{} = r
+	issue12664_3 = i.(MyT)
+	<-c
+}
+
+var issue12664_4 MyT = 0
+
+func TestRaceIssue12664_4(t *testing.T) {
+	c := make(chan struct{})
+	go func() {
+		issue12664_4 = 1
+		close(c)
+	}()
+	var r MyT
+	var i MyI = r
+	issue12664_4 = i.(MyT)
+	<-c
+}
diff --git a/src/runtime/race/testdata/issue13264_test.go b/src/runtime/race/testdata/issue13264_test.go
new file mode 100644
index 0000000..d42290d
--- /dev/null
+++ b/src/runtime/race/testdata/issue13264_test.go
@@ -0,0 +1,13 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package race_test
+
+// golang.org/issue/13264
+// The test is that this compiles at all.
+
+func issue13264() {
+	for ; ; []map[int]int{}[0][0] = 0 {
+	}
+}
diff --git a/src/runtime/race/testdata/mop_test.go b/src/runtime/race/testdata/mop_test.go
index d7cbc98..c96acb9 100644
--- a/src/runtime/race/testdata/mop_test.go
+++ b/src/runtime/race/testdata/mop_test.go
@@ -1356,14 +1356,8 @@
 	x, y int
 }
 
+//go:noinline
 func (p InterImpl) Foo(x int) {
-	// prevent inlining
-	z := 42
-	x = 85
-	y := x / z
-	z = y * z
-	x = z * y
-	_, _, _ = x, y, z
 }
 
 type InterImpl2 InterImpl
diff --git a/src/runtime/race/testdata/regression_test.go b/src/runtime/race/testdata/regression_test.go
index d461269..6a7802f 100644
--- a/src/runtime/race/testdata/regression_test.go
+++ b/src/runtime/race/testdata/regression_test.go
@@ -65,10 +65,8 @@
 	min, max Rect
 }
 
+//go:noinline
 func NewImage() Image {
-	var pleaseDoNotInlineMe stack
-	pleaseDoNotInlineMe.push(1)
-	_ = pleaseDoNotInlineMe.pop()
 	return Image{}
 }
 
@@ -113,11 +111,8 @@
 
 var makeChanCalls int
 
+//go:noinline
 func makeChan() *RpcChan {
-	var pleaseDoNotInlineMe stack
-	pleaseDoNotInlineMe.push(1)
-	_ = pleaseDoNotInlineMe.pop()
-
 	makeChanCalls++
 	c := &RpcChan{make(chan bool, 1)}
 	c.c <- true
diff --git a/src/runtime/race/testdata/select_test.go b/src/runtime/race/testdata/select_test.go
index b4b1991..9969f47 100644
--- a/src/runtime/race/testdata/select_test.go
+++ b/src/runtime/race/testdata/select_test.go
@@ -19,7 +19,7 @@
 		x = 1
 		// At least two channels are needed because
 		// otherwise the compiler optimizes select out.
-		// See comment in runtime/chan.c:^selectgo.
+		// See comment in runtime/select.go:^func selectgoImpl.
 		select {
 		case c <- true:
 		case c1 <- true:
diff --git a/src/runtime/race0.go b/src/runtime/race0.go
index 591d5d9..f1d3706 100644
--- a/src/runtime/race0.go
+++ b/src/runtime/race0.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -18,8 +18,10 @@
 
 func raceReadObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr)  { throw("race") }
 func raceWriteObjectPC(t *_type, addr unsafe.Pointer, callerpc, pc uintptr) { throw("race") }
-func raceinit() uintptr                                                     { throw("race"); return 0 }
+func raceinit() (uintptr, uintptr)                                          { throw("race"); return 0, 0 }
 func racefini()                                                             { throw("race") }
+func raceproccreate() uintptr                                               { throw("race"); return 0 }
+func raceprocdestroy(ctx uintptr)                                           { throw("race") }
 func racemapshadow(addr unsafe.Pointer, size uintptr)                       { throw("race") }
 func racewritepc(addr unsafe.Pointer, callerpc, pc uintptr)                 { throw("race") }
 func racereadpc(addr unsafe.Pointer, callerpc, pc uintptr)                  { throw("race") }
@@ -33,5 +35,6 @@
 func racereleasemergeg(gp *g, addr unsafe.Pointer)                          { throw("race") }
 func racefingo()                                                            { throw("race") }
 func racemalloc(p unsafe.Pointer, sz uintptr)                               { throw("race") }
+func racefree(p unsafe.Pointer, sz uintptr)                                 { throw("race") }
 func racegostart(pc uintptr) uintptr                                        { throw("race"); return 0 }
 func racegoend()                                                            { throw("race") }
diff --git a/src/runtime/race1.go b/src/runtime/race1.go
deleted file mode 100644
index 38afca7..0000000
--- a/src/runtime/race1.go
+++ /dev/null
@@ -1,315 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Implementation of the race detector API.
-// +build race
-
-package runtime
-
-import "unsafe"
-
-// Race runtime functions called via runtime·racecall.
-//go:linkname __tsan_init __tsan_init
-var __tsan_init byte
-
-//go:linkname __tsan_fini __tsan_fini
-var __tsan_fini byte
-
-//go:linkname __tsan_map_shadow __tsan_map_shadow
-var __tsan_map_shadow byte
-
-//go:linkname __tsan_finalizer_goroutine __tsan_finalizer_goroutine
-var __tsan_finalizer_goroutine byte
-
-//go:linkname __tsan_go_start __tsan_go_start
-var __tsan_go_start byte
-
-//go:linkname __tsan_go_end __tsan_go_end
-var __tsan_go_end byte
-
-//go:linkname __tsan_malloc __tsan_malloc
-var __tsan_malloc byte
-
-//go:linkname __tsan_acquire __tsan_acquire
-var __tsan_acquire byte
-
-//go:linkname __tsan_release __tsan_release
-var __tsan_release byte
-
-//go:linkname __tsan_release_merge __tsan_release_merge
-var __tsan_release_merge byte
-
-//go:linkname __tsan_go_ignore_sync_begin __tsan_go_ignore_sync_begin
-var __tsan_go_ignore_sync_begin byte
-
-//go:linkname __tsan_go_ignore_sync_end __tsan_go_ignore_sync_end
-var __tsan_go_ignore_sync_end byte
-
-// Mimic what cmd/cgo would do.
-//go:cgo_import_static __tsan_init
-//go:cgo_import_static __tsan_fini
-//go:cgo_import_static __tsan_map_shadow
-//go:cgo_import_static __tsan_finalizer_goroutine
-//go:cgo_import_static __tsan_go_start
-//go:cgo_import_static __tsan_go_end
-//go:cgo_import_static __tsan_malloc
-//go:cgo_import_static __tsan_acquire
-//go:cgo_import_static __tsan_release
-//go:cgo_import_static __tsan_release_merge
-//go:cgo_import_static __tsan_go_ignore_sync_begin
-//go:cgo_import_static __tsan_go_ignore_sync_end
-
-// These are called from race_amd64.s.
-//go:cgo_import_static __tsan_read
-//go:cgo_import_static __tsan_read_pc
-//go:cgo_import_static __tsan_read_range
-//go:cgo_import_static __tsan_write
-//go:cgo_import_static __tsan_write_pc
-//go:cgo_import_static __tsan_write_range
-//go:cgo_import_static __tsan_func_enter
-//go:cgo_import_static __tsan_func_exit
-
-//go:cgo_import_static __tsan_go_atomic32_load
-//go:cgo_import_static __tsan_go_atomic64_load
-//go:cgo_import_static __tsan_go_atomic32_store
-//go:cgo_import_static __tsan_go_atomic64_store
-//go:cgo_import_static __tsan_go_atomic32_exchange
-//go:cgo_import_static __tsan_go_atomic64_exchange
-//go:cgo_import_static __tsan_go_atomic32_fetch_add
-//go:cgo_import_static __tsan_go_atomic64_fetch_add
-//go:cgo_import_static __tsan_go_atomic32_compare_exchange
-//go:cgo_import_static __tsan_go_atomic64_compare_exchange
-
-// start/end of global data (data+bss).
-var racedatastart uintptr
-var racedataend uintptr
-
-// start/end of heap for race_amd64.s
-var racearenastart uintptr
-var racearenaend uintptr
-
-func racefuncenter(uintptr)
-func racefuncexit()
-func racereadrangepc1(uintptr, uintptr, uintptr)
-func racewriterangepc1(uintptr, uintptr, uintptr)
-func racesymbolizethunk(uintptr)
-
-// racecall allows calling an arbitrary function f from C race runtime
-// with up to 4 uintptr arguments.
-func racecall(*byte, uintptr, uintptr, uintptr, uintptr)
-
-// checks if the address has shadow (i.e. heap or data/bss)
-//go:nosplit
-func isvalidaddr(addr unsafe.Pointer) bool {
-	return racearenastart <= uintptr(addr) && uintptr(addr) < racearenaend ||
-		racedatastart <= uintptr(addr) && uintptr(addr) < racedataend
-}
-
-//go:nosplit
-func raceinit() uintptr {
-	// cgo is required to initialize libc, which is used by race runtime
-	if !iscgo {
-		throw("raceinit: race build must use cgo")
-	}
-
-	var racectx uintptr
-	racecall(&__tsan_init, uintptr(unsafe.Pointer(&racectx)), funcPC(racesymbolizethunk), 0, 0)
-
-	// Round data segment to page boundaries, because it's used in mmap().
-	start := ^uintptr(0)
-	end := uintptr(0)
-	if start > firstmoduledata.noptrdata {
-		start = firstmoduledata.noptrdata
-	}
-	if start > firstmoduledata.data {
-		start = firstmoduledata.data
-	}
-	if start > firstmoduledata.noptrbss {
-		start = firstmoduledata.noptrbss
-	}
-	if start > firstmoduledata.bss {
-		start = firstmoduledata.bss
-	}
-	if end < firstmoduledata.enoptrdata {
-		end = firstmoduledata.enoptrdata
-	}
-	if end < firstmoduledata.edata {
-		end = firstmoduledata.edata
-	}
-	if end < firstmoduledata.enoptrbss {
-		end = firstmoduledata.enoptrbss
-	}
-	if end < firstmoduledata.ebss {
-		end = firstmoduledata.ebss
-	}
-	size := round(end-start, _PageSize)
-	racecall(&__tsan_map_shadow, start, size, 0, 0)
-	racedatastart = start
-	racedataend = start + size
-
-	return racectx
-}
-
-//go:nosplit
-func racefini() {
-	racecall(&__tsan_fini, 0, 0, 0, 0)
-}
-
-//go:nosplit
-func racemapshadow(addr unsafe.Pointer, size uintptr) {
-	if racearenastart == 0 {
-		racearenastart = uintptr(addr)
-	}
-	if racearenaend < uintptr(addr)+size {
-		racearenaend = uintptr(addr) + size
-	}
-	racecall(&__tsan_map_shadow, uintptr(addr), size, 0, 0)
-}
-
-//go:nosplit
-func racemalloc(p unsafe.Pointer, sz uintptr) {
-	racecall(&__tsan_malloc, uintptr(p), sz, 0, 0)
-}
-
-//go:nosplit
-func racegostart(pc uintptr) uintptr {
-	_g_ := getg()
-	var spawng *g
-	if _g_.m.curg != nil {
-		spawng = _g_.m.curg
-	} else {
-		spawng = _g_
-	}
-
-	var racectx uintptr
-	racecall(&__tsan_go_start, spawng.racectx, uintptr(unsafe.Pointer(&racectx)), pc, 0)
-	return racectx
-}
-
-//go:nosplit
-func racegoend() {
-	racecall(&__tsan_go_end, getg().racectx, 0, 0, 0)
-}
-
-//go:nosplit
-func racewriterangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
-	_g_ := getg()
-	if _g_ != _g_.m.curg {
-		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
-		// Not interesting.
-		return
-	}
-	if callpc != 0 {
-		racefuncenter(callpc)
-	}
-	racewriterangepc1(uintptr(addr), sz, pc)
-	if callpc != 0 {
-		racefuncexit()
-	}
-}
-
-//go:nosplit
-func racereadrangepc(addr unsafe.Pointer, sz, callpc, pc uintptr) {
-	_g_ := getg()
-	if _g_ != _g_.m.curg {
-		// The call is coming from manual instrumentation of Go code running on g0/gsignal.
-		// Not interesting.
-		return
-	}
-	if callpc != 0 {
-		racefuncenter(callpc)
-	}
-	racereadrangepc1(uintptr(addr), sz, pc)
-	if callpc != 0 {
-		racefuncexit()
-	}
-}
-
-//go:nosplit
-func raceacquire(addr unsafe.Pointer) {
-	raceacquireg(getg(), addr)
-}
-
-//go:nosplit
-func raceacquireg(gp *g, addr unsafe.Pointer) {
-	if getg().raceignore != 0 || !isvalidaddr(addr) {
-		return
-	}
-	racecall(&__tsan_acquire, gp.racectx, uintptr(addr), 0, 0)
-}
-
-//go:nosplit
-func racerelease(addr unsafe.Pointer) {
-	_g_ := getg()
-	if _g_.raceignore != 0 || !isvalidaddr(addr) {
-		return
-	}
-	racereleaseg(_g_, addr)
-}
-
-//go:nosplit
-func racereleaseg(gp *g, addr unsafe.Pointer) {
-	if getg().raceignore != 0 || !isvalidaddr(addr) {
-		return
-	}
-	racecall(&__tsan_release, gp.racectx, uintptr(addr), 0, 0)
-}
-
-//go:nosplit
-func racereleasemerge(addr unsafe.Pointer) {
-	racereleasemergeg(getg(), addr)
-}
-
-//go:nosplit
-func racereleasemergeg(gp *g, addr unsafe.Pointer) {
-	if getg().raceignore != 0 || !isvalidaddr(addr) {
-		return
-	}
-	racecall(&__tsan_release_merge, gp.racectx, uintptr(addr), 0, 0)
-}
-
-//go:nosplit
-func racefingo() {
-	racecall(&__tsan_finalizer_goroutine, getg().racectx, 0, 0, 0)
-}
-
-//go:nosplit
-
-func RaceAcquire(addr unsafe.Pointer) {
-	raceacquire(addr)
-}
-
-//go:nosplit
-
-func RaceRelease(addr unsafe.Pointer) {
-	racerelease(addr)
-}
-
-//go:nosplit
-
-func RaceReleaseMerge(addr unsafe.Pointer) {
-	racereleasemerge(addr)
-}
-
-//go:nosplit
-
-// RaceDisable disables handling of race events in the current goroutine.
-func RaceDisable() {
-	_g_ := getg()
-	if _g_.raceignore == 0 {
-		racecall(&__tsan_go_ignore_sync_begin, _g_.racectx, 0, 0, 0)
-	}
-	_g_.raceignore++
-}
-
-//go:nosplit
-
-// RaceEnable re-enables handling of race events in the current goroutine.
-func RaceEnable() {
-	_g_ := getg()
-	_g_.raceignore--
-	if _g_.raceignore == 0 {
-		racecall(&__tsan_go_ignore_sync_end, _g_.racectx, 0, 0, 0)
-	}
-}
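The exported annotations removed above (RaceAcquire, RaceRelease, RaceReleaseMerge,
RaceDisable, RaceEnable) survive in the consolidated race runtime with the same
semantics. A minimal usage sketch; it only compiles with -race, since the symbols
live behind the race build tag, and the channel below already synchronizes, so the
annotations are purely illustrative:

	package main

	import (
		"runtime"
		"unsafe"
	)

	var (
		token  byte // any addressable byte can carry the happens-before edge
		shared int
	)

	func main() {
		done := make(chan struct{})
		go func() {
			shared = 42
			// Everything before this release happens-before a later
			// RaceAcquire on the same address.
			runtime.RaceRelease(unsafe.Pointer(&token))
			done <- struct{}{}
		}()
		<-done
		runtime.RaceAcquire(unsafe.Pointer(&token))
		_ = shared // covered by the annotated edge
	}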
diff --git a/src/runtime/race_amd64.s b/src/runtime/race_amd64.s
index d9e674b..cc1d92f 100644
--- a/src/runtime/race_amd64.s
+++ b/src/runtime/race_amd64.s
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -159,14 +159,28 @@
 ret:
 	RET
 
+// func runtime·racefuncenterfp(fp uintptr)
+// Called from instrumented code.
+// Like racefuncenter but passes FP, not PC
+TEXT	runtime·racefuncenterfp(SB), NOSPLIT, $0-8
+	MOVQ	fp+0(FP), R11
+	MOVQ	-8(R11), R11
+	JMP	racefuncenter<>(SB)
+
 // func runtime·racefuncenter(pc uintptr)
 // Called from instrumented code.
 TEXT	runtime·racefuncenter(SB), NOSPLIT, $0-8
+	MOVQ	callpc+0(FP), R11
+	JMP	racefuncenter<>(SB)
+
+// Common code for racefuncenter/racefuncenterfp
+// R11 = caller's return address
+TEXT	racefuncenter<>(SB), NOSPLIT, $0-0
 	MOVQ	DX, R15		// save function entry context (for closures)
 	get_tls(R12)
 	MOVQ	g(R12), R14
 	MOVQ	g_racectx(R14), RARG0	// goroutine context
-	MOVQ	callpc+0(FP), RARG1
+	MOVQ	R11, RARG1
 	// void __tsan_func_enter(ThreadState *thr, void *pc);
 	MOVQ	$__tsan_func_enter(SB), AX
 	// racecall<> preserves R15
@@ -324,6 +338,7 @@
 	// An attempt to synchronize on the address would cause crash.
 	MOVQ	AX, R15	// remember the original function
 	MOVQ	$__tsan_go_ignore_sync_begin(SB), AX
+	get_tls(R12)
 	MOVQ	g(R12), R14
 	MOVQ	g_racectx(R14), RARG0	// goroutine context
 	CALL	racecall<>(SB)
@@ -370,7 +385,24 @@
 // C->Go callback thunk that allows runtime·racesymbolize to be called from C code.
 // A direct Go->C race call has only switched SP; finish the g->g0 switch by setting the correct g.
 // The overall effect of the Go->C->Go call chain is similar to that of mcall.
-TEXT	runtime·racesymbolizethunk(SB), NOSPLIT, $56-8
+// RARG0 contains command code. RARG1 contains command-specific context.
+// See racecallback for command codes.
+TEXT	runtime·racecallbackthunk(SB), NOSPLIT, $56-8
+	// Handle command raceGetProcCmd (0) here.
+	// First, the code below assumes that we are on curg, while raceGetProcCmd
+	// can be executed on g0. Second, it is called frequently, so it will
+	// benefit from this fast path.
+	CMPQ	RARG0, $0
+	JNE	rest
+	get_tls(RARG0)
+	MOVQ	g(RARG0), RARG0
+	MOVQ	g_m(RARG0), RARG0
+	MOVQ	m_p(RARG0), RARG0
+	MOVQ	p_racectx(RARG0), RARG0
+	MOVQ	RARG0, (RARG1)
+	RET
+
+rest:
 	// Save callee-saved registers (Go code won't respect that).
 	// This is superset of darwin/linux/windows registers.
 	PUSHQ	BX
@@ -387,8 +419,10 @@
 	MOVQ	g_m(R13), R13
 	MOVQ	m_g0(R13), R14
 	MOVQ	R14, g(R12)	// g = m->g0
+	PUSHQ	RARG1	// func arg
 	PUSHQ	RARG0	// func arg
-	CALL	runtime·racesymbolize(SB)
+	CALL	runtime·racecallback(SB)
+	POPQ	R12
 	POPQ	R12
 	// All registers are smashed after Go code, reload.
 	get_tls(R12)
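racefuncenterfp recovers the caller's PC from a frame pointer instead of taking it
as an argument: the return address sits one machine word below the FP. A
self-contained Go sketch of that address arithmetic (the fake frame is
illustrative; real frame pointers come from compiler instrumentation, not Go code):

	package main

	import (
		"fmt"
		"unsafe"
	)

	// pcFromFP mirrors the two MOVQ instructions in racefuncenterfp: on amd64
	// the caller's return address is stored one word below the passed FP.
	func pcFromFP(fp uintptr) uintptr {
		return *(*uintptr)(unsafe.Pointer(fp - 8))
	}

	func main() {
		// Simulate the relevant two stack words: a "return address"
		// directly below a "frame pointer".
		frame := [2]uintptr{0xdeadbeef, 0} // frame[0] = fake return address
		fp := uintptr(unsafe.Pointer(&frame[1]))
		fmt.Printf("%#x\n", pcFromFP(fp)) // 0xdeadbeef
	}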
diff --git a/src/runtime/rdebug.go b/src/runtime/rdebug.go
index f2766d7..1b213f1 100644
--- a/src/runtime/rdebug.go
+++ b/src/runtime/rdebug.go
@@ -1,19 +1,22 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
+import _ "unsafe" // for go:linkname
+
+//go:linkname setMaxStack runtime/debug.setMaxStack
 func setMaxStack(in int) (out int) {
 	out = int(maxstacksize)
 	maxstacksize = uintptr(in)
 	return out
 }
 
+//go:linkname setPanicOnFault runtime/debug.setPanicOnFault
 func setPanicOnFault(new bool) (old bool) {
-	mp := acquirem()
-	old = mp.curg.paniconfault
-	mp.curg.paniconfault = new
-	releasem(mp)
+	_g_ := getg()
+	old = _g_.paniconfault
+	_g_.paniconfault = new
 	return old
 }
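From the caller's side these linknamed hooks are reached through the exported
runtime/debug API; nothing else changes. A short usage sketch:

	package main

	import (
		"fmt"
		"runtime/debug"
	)

	func main() {
		// Both calls bottom out in the linknamed runtime functions above.
		prev := debug.SetMaxStack(64 << 20) // raise the per-goroutine stack limit
		fmt.Println("previous limit:", prev)

		old := debug.SetPanicOnFault(true) // memory faults panic instead of crashing
		defer debug.SetPanicOnFault(old)
	}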
diff --git a/src/runtime/rt0_android_386.s b/src/runtime/rt0_android_386.s
new file mode 100644
index 0000000..9d20fc8
--- /dev/null
+++ b/src/runtime/rt0_android_386.s
@@ -0,0 +1,32 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_386_android(SB),NOSPLIT,$8
+	MOVL	8(SP), AX  // argc
+	LEAL	12(SP), BX  // argv
+	MOVL	AX, 0(SP)
+	MOVL	BX, 4(SP)
+	CALL	main(SB)
+	INT	$3
+
+TEXT _rt0_386_android_lib(SB),NOSPLIT,$0
+	PUSHL	$_rt0_386_android_argv(SB)  // argv
+	PUSHL	$1  // argc
+	CALL	_rt0_386_linux_lib(SB)
+	POPL	AX
+	POPL	AX
+	RET
+
+DATA _rt0_386_android_argv+0x00(SB)/4,$_rt0_386_android_argv0(SB)
+DATA _rt0_386_android_argv+0x04(SB)/4,$0  // argv terminate
+DATA _rt0_386_android_argv+0x08(SB)/4,$0  // envp terminate
+DATA _rt0_386_android_argv+0x0c(SB)/4,$0  // auxv terminate
+GLOBL _rt0_386_android_argv(SB),NOPTR,$0x10
+
+// TODO: wire up necessary VDSO (see os_linux_386.go)
+
+DATA _rt0_386_android_argv0(SB)/8, $"gojni"
+GLOBL _rt0_386_android_argv0(SB),RODATA,$8
diff --git a/src/runtime/rt0_android_amd64.s b/src/runtime/rt0_android_amd64.s
new file mode 100644
index 0000000..9af6cab
--- /dev/null
+++ b/src/runtime/rt0_android_amd64.s
@@ -0,0 +1,33 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_amd64_android(SB),NOSPLIT,$-8
+	MOVQ	0(SP), DI // argc
+	LEAQ	8(SP), SI // argv
+	MOVQ	$main(SB), AX
+	JMP	AX
+
+TEXT _rt0_amd64_android_lib(SB),NOSPLIT,$0
+	MOVQ	$1, DI // argc
+	MOVQ	$_rt0_amd64_android_argv(SB), SI  // argv
+	MOVQ	$_rt0_amd64_linux_lib(SB), AX
+	JMP	AX
+
+DATA _rt0_amd64_android_argv+0x00(SB)/8,$_rt0_amd64_android_argv0(SB)
+DATA _rt0_amd64_android_argv+0x08(SB)/8,$0
+DATA _rt0_amd64_android_argv+0x10(SB)/8,$0
+DATA _rt0_amd64_android_argv+0x18(SB)/8,$15  // AT_PLATFORM
+DATA _rt0_amd64_android_argv+0x20(SB)/8,$_rt0_amd64_android_auxv0(SB)
+DATA _rt0_amd64_android_argv+0x28(SB)/8,$0
+GLOBL _rt0_amd64_android_argv(SB),NOPTR,$0x30
+
+// TODO: AT_HWCAP necessary? If so, what value?
+
+DATA _rt0_amd64_android_argv0(SB)/8, $"gojni"
+GLOBL _rt0_amd64_android_argv0(SB),RODATA,$8
+
+DATA _rt0_amd64_android_auxv0(SB)/8, $"x86_64"
+GLOBL _rt0_amd64_android_auxv0(SB),RODATA,$8
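The argv block these stubs hand to rt0_go is synthetic: a fake argv0 ("gojni"),
an empty environment, and on amd64 a minimal auxv carrying AT_PLATFORM. The same
layout rebuilt in Go for clarity (values are illustrative; the real block lives
in read-only data above):

	package main

	import (
		"fmt"
		"unsafe"
	)

	const _AT_PLATFORM = 15 // the auxv key used above

	var (
		argv0    = [8]byte{'g', 'o', 'j', 'n', 'i'}
		platform = [8]byte{'x', '8', '6', '_', '6', '4'}

		block = [6]uintptr{
			uintptr(unsafe.Pointer(&argv0[0])),    // argv[0]
			0,                                     // argv terminator
			0,                                     // envp terminator
			_AT_PLATFORM,                          // auxv key
			uintptr(unsafe.Pointer(&platform[0])), // auxv value
			0,                                     // auxv terminator
		}
	)

	func main() {
		fmt.Printf("argc=1 argv=%p\n", &block[0])
	}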
diff --git a/src/runtime/rt0_android_arm64.s b/src/runtime/rt0_android_arm64.s
new file mode 100644
index 0000000..582fc5a
--- /dev/null
+++ b/src/runtime/rt0_android_arm64.s
@@ -0,0 +1,25 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_arm64_android(SB),NOSPLIT,$-8
+	MOVD	$_rt0_arm64_linux(SB), R4
+	B	(R4)
+
+// When building with -buildmode=c-shared, this symbol is called when the shared
+// library is loaded.
+TEXT _rt0_arm64_android_lib(SB),NOSPLIT,$-8
+	MOVW	$1, R0                            // argc
+	MOVD	$_rt0_arm64_android_argv(SB), R1  // **argv
+	MOVD	$_rt0_arm64_linux_lib(SB), R4
+	B	(R4)
+
+DATA _rt0_arm64_android_argv+0x00(SB)/8,$_rt0_arm64_android_argv0(SB)
+DATA _rt0_arm64_android_argv+0x08(SB)/8,$0
+DATA _rt0_arm64_android_argv+0x10(SB)/8,$0
+GLOBL _rt0_arm64_android_argv(SB),NOPTR,$0x18
+
+DATA _rt0_arm64_android_argv0(SB)/8, $"gojni"
+GLOBL _rt0_arm64_android_argv0(SB),RODATA,$8
diff --git a/src/runtime/rt0_darwin_386.s b/src/runtime/rt0_darwin_386.s
index 4c8c92d..6b404db 100644
--- a/src/runtime/rt0_darwin_386.s
+++ b/src/runtime/rt0_darwin_386.s
@@ -12,5 +12,64 @@
 	CALL	main(SB)
 	INT	$3
 
+// With -buildmode=c-archive, this symbol is called from a global constructor.
+TEXT _rt0_386_darwin_lib(SB),NOSPLIT,$0
+	PUSHL	BP
+	MOVL	SP, BP
+	PUSHL	BX
+	PUSHL	SI
+	PUSHL	DI
+
+	MOVL	8(BP), AX
+	MOVL	AX, _rt0_386_darwin_lib_argc<>(SB)
+	MOVL	12(BP), AX
+	MOVL	AX, _rt0_386_darwin_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOVL	$runtime·libpreinit(SB), AX
+	CALL	AX
+
+	SUBL	$12, SP
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVL	_cgo_sys_thread_create(SB), AX
+	TESTL	AX, AX
+	JZ	nocgo
+	MOVL	$_rt0_386_darwin_lib_go(SB), BX
+	MOVL	BX, 0(SP)
+	MOVL	$0, 4(SP)
+	CALL	AX
+	JMP	restore
+
+nocgo:
+	MOVL	$0x800000, 0(SP)               // stacksize = 8192KB
+	MOVL	$_rt0_386_darwin_lib_go(SB), AX
+	MOVL	AX, 4(SP)                      // fn
+	MOVL	$0, 8(SP)                      // fnarg
+	MOVL	$runtime·newosproc0(SB), AX
+	CALL	AX
+
+restore:
+	ADDL	$12, SP
+	POPL	DI
+	POPL	SI
+	POPL	BX
+	POPL	BP
+	RET
+
+TEXT _rt0_386_darwin_lib_go(SB),NOSPLIT,$12
+	MOVL	_rt0_386_darwin_lib_argc<>(SB), AX
+	MOVL	AX, 0(SP)
+	MOVL	_rt0_386_darwin_lib_argv<>(SB), AX
+	MOVL	AX, 4(SP)
+	MOVL	$runtime·rt0_go(SB), AX
+	CALL	AX
+	RET
+
+DATA _rt0_386_darwin_lib_argc<>(SB)/4, $0
+GLOBL _rt0_386_darwin_lib_argc<>(SB),NOPTR, $4
+DATA _rt0_386_darwin_lib_argv<>(SB)/4, $0
+GLOBL _rt0_386_darwin_lib_argv<>(SB),NOPTR, $4
+
 TEXT main(SB),NOSPLIT,$0
 	JMP	runtime·rt0_go(SB)
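_rt0_386_darwin_lib is the hook a c-archive's global constructor calls. On the Go
side such a library is an ordinary main package with exported functions; a minimal
sketch, built with go build -buildmode=c-archive:

	package main

	import "C" // required so exported functions get C linkage

	//export Add
	func Add(a, b C.int) C.int {
		return a + b
	}

	// main is mandatory for the build mode but never runs as a program;
	// initialization happens through the _rt0_*_lib entry points above.
	func main() {}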
diff --git a/src/runtime/rt0_darwin_amd64.s b/src/runtime/rt0_darwin_amd64.s
index 8d50e96..655e77a 100644
--- a/src/runtime/rt0_darwin_amd64.s
+++ b/src/runtime/rt0_darwin_amd64.s
@@ -12,10 +12,28 @@
 
 // When linking with -shared, this symbol is called when the shared library
 // is loaded.
-TEXT _rt0_amd64_darwin_lib(SB),NOSPLIT,$40
+TEXT _rt0_amd64_darwin_lib(SB),NOSPLIT,$0x58
+	// Align stack. We don't know whether Go is adding a frame pointer here or not.
+	MOVQ	SP, R8
+	SUBQ	$16, R8
+	ANDQ	$~15, R8
+	XCHGQ	SP, R8
+
+	MOVQ	R8, 0x48(SP)
+	MOVQ	BX, 0x18(SP)
+	MOVQ	BP, 0x20(SP)
+	MOVQ	R12, 0x28(SP)
+	MOVQ	R13, 0x30(SP)
+	MOVQ	R14, 0x38(SP)
+	MOVQ	R15, 0x40(SP)
+
 	MOVQ	DI, _rt0_amd64_darwin_lib_argc<>(SB)
 	MOVQ	SI, _rt0_amd64_darwin_lib_argv<>(SB)
 
+	// Synchronous initialization.
+	MOVQ	$runtime·libpreinit(SB), AX
+	CALL	AX
+
 	// Create a new thread to do the runtime initialization and return.
 	MOVQ	_cgo_sys_thread_create(SB), AX
 	TESTQ	AX, AX
@@ -23,7 +41,8 @@
 	MOVQ	$_rt0_amd64_darwin_lib_go(SB), DI
 	MOVQ	$0, SI
 	CALL	AX
-	RET
+	JMP	restore
+
 nocgo:
 	MOVQ	$8388608, 0(SP)                    // stacksize
 	MOVQ	$_rt0_amd64_darwin_lib_go(SB), AX
@@ -31,6 +50,17 @@
 	MOVQ	$0, 16(SP)                         // fnarg
 	MOVQ	$runtime·newosproc0(SB), AX
 	CALL	AX
+
+restore:
+	MOVQ	0x18(SP), BX
+	MOVQ	0x20(SP), BP
+	MOVQ	0x28(SP), R12
+	MOVQ	0x30(SP), R13
+	MOVQ	0x38(SP), R14
+	MOVQ	0x40(SP), R15
+
+	MOVQ	0x48(SP), R8
+	MOVQ	R8, SP
 	RET
 
 TEXT _rt0_amd64_darwin_lib_go(SB),NOSPLIT,$0
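The prologue above first forces SP to 16-byte alignment, since the dynamic loader
may call the constructor with any alignment. The same computation in Go, for
clarity (a sketch of the arithmetic only):

	package main

	import "fmt"

	// alignDown mirrors SUBQ $16 / ANDQ $~15: step down by 16 bytes, then
	// clear the low four bits, yielding a 16-byte-aligned value below sp.
	func alignDown(sp uintptr) uintptr {
		return (sp - 16) &^ 15
	}

	func main() {
		fmt.Printf("%#x -> %#x\n", uintptr(0x7fff1003), alignDown(0x7fff1003))
	}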
diff --git a/src/runtime/rt0_darwin_arm.s b/src/runtime/rt0_darwin_arm.s
index 95a2b17..526d88f 100644
--- a/src/runtime/rt0_darwin_arm.s
+++ b/src/runtime/rt0_darwin_arm.s
@@ -16,29 +16,67 @@
 //
 // Note that all currently shipping darwin/arm platforms require
 // cgo and do not support c-shared.
-TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$12
+TEXT _rt0_arm_darwin_lib(SB),NOSPLIT,$104
+	// Preserve callee-save registers.
+	MOVW    R4, 12(R13)
+	MOVW    R5, 16(R13)
+	MOVW    R6, 20(R13)
+	MOVW    R7, 24(R13)
+	MOVW    R8, 28(R13)
+	MOVW    R11, 32(R13)
+
+	MOVD	F8, (32+8*1)(R13)
+	MOVD	F9, (32+8*2)(R13)
+	MOVD	F10, (32+8*3)(R13)
+	MOVD	F11, (32+8*4)(R13)
+	MOVD	F12, (32+8*5)(R13)
+	MOVD	F13, (32+8*6)(R13)
+	MOVD	F14, (32+8*7)(R13)
+	MOVD	F15, (32+8*8)(R13)
+
 	MOVW  R0, _rt0_arm_darwin_lib_argc<>(SB)
 	MOVW  R1, _rt0_arm_darwin_lib_argv<>(SB)
 
+	// Synchronous initialization.
+	MOVW	$runtime·libpreinit(SB), R3
+	CALL	(R3)
+
 	// Create a new thread to do the runtime initialization and return.
-	MOVW  _cgo_sys_thread_create(SB), R4
-	CMP   $0, R4
+	MOVW  _cgo_sys_thread_create(SB), R3
+	CMP   $0, R3
 	B.EQ  nocgo
 	MOVW  $_rt0_arm_darwin_lib_go(SB), R0
 	MOVW  $0, R1
-	BL    (R4)
-	RET
+	BL    (R3)
+	B     rr
 nocgo:
 	MOVW  $0x400000, R0
-	MOVW  $_rt0_arm_darwin_lib_go(SB), R1
-	MOVW  $0, R2
-	MOVW  R0,  (R13) // stacksize
-	MOVW  R1, 4(R13) // fn
-	MOVW  R2, 8(R13) // fnarg
-	MOVW  $runtime·newosproc0(SB), R4
-	BL    (R4)
+	MOVW  R0, (R13) // stacksize
+	MOVW  $_rt0_arm_darwin_lib_go(SB), R0
+	MOVW  R0, 4(R13) // fn
+	MOVW  $0, R0
+	MOVW  R0, 8(R13) // fnarg
+	MOVW  $runtime·newosproc0(SB), R3
+	BL    (R3)
+rr:
+	// Restore callee-save registers and return.
+	MOVW    12(R13), R4
+	MOVW    16(R13), R5
+	MOVW    20(R13), R6
+	MOVW    24(R13), R7
+	MOVW    28(R13), R8
+	MOVW    32(R13), R11
+	MOVD	(32+8*1)(R13), F8
+	MOVD	(32+8*2)(R13), F9
+	MOVD	(32+8*3)(R13), F10
+	MOVD	(32+8*4)(R13), F11
+	MOVD	(32+8*5)(R13), F12
+	MOVD	(32+8*6)(R13), F13
+	MOVD	(32+8*7)(R13), F14
+	MOVD	(32+8*8)(R13), F15
 	RET
 
 TEXT _rt0_arm_darwin_lib_go(SB),NOSPLIT,$0
 	MOVW  _rt0_arm_darwin_lib_argc<>(SB), R0
 	MOVW  _rt0_arm_darwin_lib_argv<>(SB), R1
diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s
index e4e4a30..f607683 100644
--- a/src/runtime/rt0_darwin_arm64.s
+++ b/src/runtime/rt0_darwin_arm64.s
@@ -16,22 +16,57 @@
 //
 // Note that all currently shipping darwin/arm64 platforms require
 // cgo and do not support c-shared.
-TEXT _rt0_arm64_darwin_lib(SB),NOSPLIT,$0
-	// R27 is REGTMP, reserved for liblink. It is used below to
-	// move R0/R1 into globals. However in the standard ARM64 calling
-	// convention, it is a callee-saved register. So we save it to a
-	// temporary register.
-	MOVD  R27, R7
+TEXT _rt0_arm64_darwin_lib(SB),NOSPLIT,$168
+	// Preserve callee-save registers.
+	MOVD R19, 24(RSP)
+	MOVD R20, 32(RSP)
+	MOVD R21, 40(RSP)
+	MOVD R22, 48(RSP)
+	MOVD R23, 56(RSP)
+	MOVD R24, 64(RSP)
+	MOVD R25, 72(RSP)
+	MOVD R26, 80(RSP)
+	MOVD R27, 88(RSP)
+	FMOVD F8, 96(RSP)
+	FMOVD F9, 104(RSP)
+	FMOVD F10, 112(RSP)
+	FMOVD F11, 120(RSP)
+	FMOVD F12, 128(RSP)
+	FMOVD F13, 136(RSP)
+	FMOVD F14, 144(RSP)
+	FMOVD F15, 152(RSP)
 
 	MOVD  R0, _rt0_arm64_darwin_lib_argc<>(SB)
 	MOVD  R1, _rt0_arm64_darwin_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOVD	$runtime·libpreinit(SB), R4
+	BL	(R4)
+
 	// Create a new thread to do the runtime initialization and return.
 	MOVD  _cgo_sys_thread_create(SB), R4
 	MOVD  $_rt0_arm64_darwin_lib_go(SB), R0
 	MOVD  $0, R1
 	BL    (R4)
 
-	MOVD  R7, R27
+	// Restore callee-save registers.
+	MOVD 24(RSP), R19
+	MOVD 32(RSP), R20
+	MOVD 40(RSP), R21
+	MOVD 48(RSP), R22
+	MOVD 56(RSP), R23
+	MOVD 64(RSP), R24
+	MOVD 72(RSP), R25
+	MOVD 80(RSP), R26
+	MOVD 88(RSP), R27
+	FMOVD 96(RSP), F8
+	FMOVD 104(RSP), F9
+	FMOVD 112(RSP), F10
+	FMOVD 120(RSP), F11
+	FMOVD 128(RSP), F12
+	FMOVD 136(RSP), F13
+	FMOVD 144(RSP), F14
+	FMOVD 152(RSP), F15
 	RET
 
 TEXT _rt0_arm64_darwin_lib_go(SB),NOSPLIT,$0
diff --git a/src/runtime/rt0_linux_386.s b/src/runtime/rt0_linux_386.s
index 633e806..23bfc98 100644
--- a/src/runtime/rt0_linux_386.s
+++ b/src/runtime/rt0_linux_386.s
@@ -26,6 +26,10 @@
 	MOVL	12(BP), AX
 	MOVL	AX, _rt0_386_linux_lib_argv<>(SB)
 
+	// Synchronous initialization.
+	MOVL	$runtime·libpreinit(SB), AX
+	CALL	AX
+
 	SUBL	$8, SP
 
 	// Create a new thread to do the runtime initialization.
@@ -69,11 +73,3 @@
 
 TEXT main(SB),NOSPLIT,$0
 	JMP	runtime·rt0_go(SB)
-
-TEXT _fallback_vdso(SB),NOSPLIT,$0
-	INT	$0x80
-	RET
-
-DATA	runtime·_vdso(SB)/4, $_fallback_vdso(SB)
-GLOBL	runtime·_vdso(SB), NOPTR, $4
-
diff --git a/src/runtime/rt0_linux_amd64.s b/src/runtime/rt0_linux_amd64.s
index 726b550..564b51c 100644
--- a/src/runtime/rt0_linux_amd64.s
+++ b/src/runtime/rt0_linux_amd64.s
@@ -23,6 +23,10 @@
 	MOVQ	DI, _rt0_amd64_linux_lib_argc<>(SB)
 	MOVQ	SI, _rt0_amd64_linux_lib_argv<>(SB)
 
+	// Synchronous initialization.
+	MOVQ	$runtime·libpreinit(SB), AX
+	CALL	AX
+
 	// Create a new thread to do the runtime initialization and return.
 	MOVQ	_cgo_sys_thread_create(SB), AX
 	TESTQ	AX, AX
diff --git a/src/runtime/rt0_linux_arm.s b/src/runtime/rt0_linux_arm.s
index b71a3f9..597e642 100644
--- a/src/runtime/rt0_linux_arm.s
+++ b/src/runtime/rt0_linux_arm.s
@@ -12,8 +12,8 @@
 
 // When building with -buildmode=c-shared, this symbol is called when the shared
 // library is loaded.
-TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$32
-	// Preserve callee-save registers.  Raspberry Pi's dlopen(), for example,
+TEXT _rt0_arm_linux_lib(SB),NOSPLIT,$104
+	// Preserve callee-save registers. Raspberry Pi's dlopen(), for example,
 	// actually cares that R11 is preserved.
 	MOVW	R4, 12(R13)
 	MOVW	R5, 16(R13)
@@ -22,10 +22,27 @@
 	MOVW	R8, 28(R13)
 	MOVW	R11, 32(R13)
 
+	// Skip floating point registers on GOARM < 6.
+	MOVB	runtime·goarm(SB), R11
+	CMP	$6, R11
+	BLT	skipfpsave
+	MOVD	F8, (32+8*1)(R13)
+	MOVD	F9, (32+8*2)(R13)
+	MOVD	F10, (32+8*3)(R13)
+	MOVD	F11, (32+8*4)(R13)
+	MOVD	F12, (32+8*5)(R13)
+	MOVD	F13, (32+8*6)(R13)
+	MOVD	F14, (32+8*7)(R13)
+	MOVD	F15, (32+8*8)(R13)
+skipfpsave:
 	// Save argc/argv.
 	MOVW	R0, _rt0_arm_linux_lib_argc<>(SB)
 	MOVW	R1, _rt0_arm_linux_lib_argv<>(SB)
 
+	// Synchronous initialization.
+	MOVW	$runtime·libpreinit(SB), R2
+	CALL	(R2)
+
 	// Create a new thread to do the runtime initialization.
 	MOVW	_cgo_sys_thread_create(SB), R2
 	CMP	$0, R2
@@ -42,6 +59,18 @@
 	BL	runtime·newosproc0(SB)
 rr:
 	// Restore callee-save registers and return.
+	MOVB	runtime·goarm(SB), R11
+	CMP	$6, R11
+	BLT	skipfprest
+	MOVD	(32+8*1)(R13), F8
+	MOVD	(32+8*2)(R13), F9
+	MOVD	(32+8*3)(R13), F10
+	MOVD	(32+8*4)(R13), F11
+	MOVD	(32+8*5)(R13), F12
+	MOVD	(32+8*6)(R13), F13
+	MOVD	(32+8*7)(R13), F14
+	MOVD	(32+8*8)(R13), F15
+skipfprest:
 	MOVW	12(R13), R4
 	MOVW	16(R13), R5
 	MOVW	20(R13), R6
diff --git a/src/runtime/rt0_linux_arm64.s b/src/runtime/rt0_linux_arm64.s
index 1eb0352..d01d415 100644
--- a/src/runtime/rt0_linux_arm64.s
+++ b/src/runtime/rt0_linux_arm64.s
@@ -9,6 +9,85 @@
 	ADD	$8, RSP, R1	// argv
 	BL	main(SB)
 
+// When building with -buildmode=c-shared, this symbol is called when the shared
+// library is loaded.
+TEXT _rt0_arm64_linux_lib(SB),NOSPLIT,$168
+	// Preserve callee-save registers.
+	MOVD R19, 24(RSP)
+	MOVD R20, 32(RSP)
+	MOVD R21, 40(RSP)
+	MOVD R22, 48(RSP)
+	MOVD R23, 56(RSP)
+	MOVD R24, 64(RSP)
+	MOVD R25, 72(RSP)
+	MOVD R26, 80(RSP)
+	MOVD R27, 88(RSP)
+	FMOVD F8, 96(RSP)
+	FMOVD F9, 104(RSP)
+	FMOVD F10, 112(RSP)
+	FMOVD F11, 120(RSP)
+	FMOVD F12, 128(RSP)
+	FMOVD F13, 136(RSP)
+	FMOVD F14, 144(RSP)
+	FMOVD F15, 152(RSP)
+
+	MOVD	R0, _rt0_arm64_linux_lib_argc<>(SB)
+	MOVD	R1, _rt0_arm64_linux_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOVD	$runtime·libpreinit(SB), R4
+	BL	(R4)
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVD	_cgo_sys_thread_create(SB), R4
+	CMP	$0, R4
+	BEQ	nocgo
+	MOVD	$_rt0_arm64_linux_lib_go(SB), R0
+	MOVD	$0, R1
+	BL	(R4)
+	B	restore
+
+nocgo:
+	MOVD	$0x800000, R0                     // stacksize = 8192KB
+	MOVD	$_rt0_arm64_linux_lib_go(SB), R1
+	MOVD	R0, 8(RSP)
+	MOVD	R1, 16(RSP)
+	MOVD	$runtime·newosproc0(SB), R4
+	BL	(R4)
+
+restore:
+	// Restore callee-save registers.
+	MOVD 24(RSP), R19
+	MOVD 32(RSP), R20
+	MOVD 40(RSP), R21
+	MOVD 48(RSP), R22
+	MOVD 56(RSP), R23
+	MOVD 64(RSP), R24
+	MOVD 72(RSP), R25
+	MOVD 80(RSP), R26
+	MOVD 88(RSP), R27
+	FMOVD 96(RSP), F8
+	FMOVD 104(RSP), F9
+	FMOVD 112(RSP), F10
+	FMOVD 120(RSP), F11
+	FMOVD 128(RSP), F12
+	FMOVD 136(RSP), F13
+	FMOVD 144(RSP), F14
+	FMOVD 152(RSP), F15
+	RET
+
+TEXT _rt0_arm64_linux_lib_go(SB),NOSPLIT,$0
+	MOVD	_rt0_arm64_linux_lib_argc<>(SB), R0
+	MOVD	_rt0_arm64_linux_lib_argv<>(SB), R1
+	MOVD	$runtime·rt0_go(SB), R4
+	B	(R4)
+
+DATA _rt0_arm64_linux_lib_argc<>(SB)/8, $0
+GLOBL _rt0_arm64_linux_lib_argc<>(SB),NOPTR, $8
+DATA _rt0_arm64_linux_lib_argv<>(SB)/8, $0
+GLOBL _rt0_arm64_linux_lib_argv<>(SB),NOPTR, $8
+
 TEXT main(SB),NOSPLIT,$-8
 	MOVD	$runtime·rt0_go(SB), R2
 	BL	(R2)
diff --git a/src/runtime/rt0_linux_mips64x.s b/src/runtime/rt0_linux_mips64x.s
new file mode 100644
index 0000000..beb4ef2
--- /dev/null
+++ b/src/runtime/rt0_linux_mips64x.s
@@ -0,0 +1,39 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+#include "textflag.h"
+
+TEXT _rt0_mips64_linux(SB),NOSPLIT,$0
+	JMP	_main<>(SB)
+
+TEXT _rt0_mips64le_linux(SB),NOSPLIT,$0
+	JMP	_main<>(SB)
+
+TEXT _main<>(SB),NOSPLIT,$-8
+	// In a statically linked binary, the stack contains argc,
+	// argv as argc string pointers followed by a NULL, envv as a
+	// sequence of string pointers followed by a NULL, and auxv.
+	// There is no TLS base pointer.
+#ifdef GOARCH_mips64
+	MOVW	4(R29), R4 // argc, big-endian ABI places int32 at offset 4
+#else
+	MOVW	0(R29), R4 // argc
+#endif
+	ADDV	$8, R29, R5 // argv
+	JMP	main(SB)
+
+TEXT main(SB),NOSPLIT,$-8
+	// In external linking, glibc jumps to main with argc in R4
+	// and argv in R5.
+
+	// Initialize REGSB = PC&0xffffffff00000000.
+	BGEZAL	R0, 1(PC)
+	SRLV	$32, R31, RSB
+	SLLV	$32, RSB
+
+	MOVV	$runtime·rt0_go(SB), R1
+	JMP	(R1)
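The GOARCH_mips64 branch reads argc at offset 4 because on a big-endian machine
the low 32 bits of a 64-bit stack slot are the second word. A quick demonstration
of that layout:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	func main() {
		var slot [8]byte
		binary.BigEndian.PutUint64(slot[:], 3)          // argc in a 64-bit slot
		fmt.Println(binary.BigEndian.Uint32(slot[0:4])) // 0: high half
		fmt.Println(binary.BigEndian.Uint32(slot[4:8])) // 3: what MOVW 4(R29) sees
	}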
diff --git a/src/runtime/rt0_linux_ppc64.s b/src/runtime/rt0_linux_ppc64.s
index 33e973d..f814515 100644
--- a/src/runtime/rt0_linux_ppc64.s
+++ b/src/runtime/rt0_linux_ppc64.s
@@ -18,6 +18,6 @@
 	BR main(SB)
 
 TEXT main(SB),NOSPLIT,$-8
-	MOVD	$runtime·rt0_go(SB), R31
-	MOVD	R31, CTR
+	MOVD	$runtime·rt0_go(SB), R12
+	MOVD	R12, CTR
 	BR	(CTR)
diff --git a/src/runtime/rt0_linux_ppc64le.s b/src/runtime/rt0_linux_ppc64le.s
index f5c0af5..2c55413 100644
--- a/src/runtime/rt0_linux_ppc64le.s
+++ b/src/runtime/rt0_linux_ppc64le.s
@@ -1,8 +1,138 @@
+#include "go_asm.h"
 #include "textflag.h"
 
 TEXT _rt0_ppc64le_linux(SB),NOSPLIT,$0
 	BR _main<>(SB)
 
+TEXT _rt0_ppc64le_linux_lib(SB),NOSPLIT,$-8
+	// Start with standard C stack frame layout and linkage.
+	MOVD	LR, R0
+	MOVD	R0, 16(R1) // Save LR in caller's frame.
+	MOVD	R2, 24(R1) // Save TOC in caller's frame.
+	MOVDU	R1, -320(R1) // Allocate frame.
+
+	// Preserve callee-save registers.
+	MOVD	R14, 24(R1)
+	MOVD	R15, 32(R1)
+	MOVD	R16, 40(R1)
+	MOVD	R17, 48(R1)
+	MOVD	R18, 56(R1)
+	MOVD	R19, 64(R1)
+	MOVD	R20, 72(R1)
+	MOVD	R21, 80(R1)
+	MOVD	R22, 88(R1)
+	MOVD	R23, 96(R1)
+	MOVD	R24, 104(R1)
+	MOVD	R25, 112(R1)
+	MOVD	R26, 120(R1)
+	MOVD	R27, 128(R1)
+	MOVD	R28, 136(R1)
+	MOVD	R29, 144(R1)
+	MOVD	g, 152(R1) // R30
+	MOVD	R31, 160(R1)
+	FMOVD	F14, 168(R1)
+	FMOVD	F15, 176(R1)
+	FMOVD	F16, 184(R1)
+	FMOVD	F17, 192(R1)
+	FMOVD	F18, 200(R1)
+	FMOVD	F19, 208(R1)
+	FMOVD	F20, 216(R1)
+	FMOVD	F21, 224(R1)
+	FMOVD	F22, 232(R1)
+	FMOVD	F23, 240(R1)
+	FMOVD	F24, 248(R1)
+	FMOVD	F25, 256(R1)
+	FMOVD	F26, 264(R1)
+	FMOVD	F27, 272(R1)
+	FMOVD	F28, 280(R1)
+	FMOVD	F29, 288(R1)
+	FMOVD	F30, 296(R1)
+	FMOVD	F31, 304(R1)
+
+	MOVD	R3, _rt0_ppc64le_linux_lib_argc<>(SB)
+	MOVD	R4, _rt0_ppc64le_linux_lib_argv<>(SB)
+
+	// Synchronous initialization.
+	MOVD	$runtime·libpreinit(SB), R12
+	MOVD	R12, CTR
+	BL	(CTR)
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVD	_cgo_sys_thread_create(SB), R12
+	CMP	$0, R12
+	BEQ	nocgo
+	MOVD	$_rt0_ppc64le_linux_lib_go(SB), R3
+	MOVD	$0, R4
+	MOVD	R12, CTR
+	BL	(CTR)
+	BR	done
+
+nocgo:
+	MOVD	$0x800000, R12                     // stacksize = 8192KB
+	MOVD	R12, 8(R1)
+	MOVD	$_rt0_ppc64le_linux_lib_go(SB), R12
+	MOVD	R12, 16(R1)
+	MOVD	$runtime·newosproc0(SB), R12
+	MOVD	R12, CTR
+	BL	(CTR)
+
+done:
+	// Restore saved registers.
+	MOVD	24(R1), R14
+	MOVD	32(R1), R15
+	MOVD	40(R1), R16
+	MOVD	48(R1), R17
+	MOVD	56(R1), R18
+	MOVD	64(R1), R19
+	MOVD	72(R1), R20
+	MOVD	80(R1), R21
+	MOVD	88(R1), R22
+	MOVD	96(R1), R23
+	MOVD	104(R1), R24
+	MOVD	112(R1), R25
+	MOVD	120(R1), R26
+	MOVD	128(R1), R27
+	MOVD	136(R1), R28
+	MOVD	144(R1), R29
+	MOVD	152(R1), g // R30
+	MOVD	160(R1), R31
+	FMOVD	168(R1), F14
+	FMOVD	176(R1), F15
+	FMOVD	184(R1), F16
+	FMOVD	192(R1), F17
+	FMOVD	200(R1), F18
+	FMOVD	208(R1), F19
+	FMOVD	216(R1), F20
+	FMOVD	224(R1), F21
+	FMOVD	232(R1), F22
+	FMOVD	240(R1), F23
+	FMOVD	248(R1), F24
+	FMOVD	256(R1), F25
+	FMOVD	264(R1), F26
+	FMOVD	272(R1), F27
+	FMOVD	280(R1), F28
+	FMOVD	288(R1), F29
+	FMOVD	296(R1), F30
+	FMOVD	304(R1), F31
+
+	ADD	$320, R1
+	MOVD	24(R1), R2
+	MOVD	16(R1), R0
+	MOVD	R0, LR
+	RET
+
+TEXT _rt0_ppc64le_linux_lib_go(SB),NOSPLIT,$0
+	MOVD	_rt0_ppc64le_linux_lib_argc<>(SB), R3
+	MOVD	_rt0_ppc64le_linux_lib_argv<>(SB), R4
+	MOVD	$runtime·rt0_go(SB), R12
+	MOVD	R12, CTR
+	BR	(CTR)
+
+DATA _rt0_ppc64le_linux_lib_argc<>(SB)/8, $0
+GLOBL _rt0_ppc64le_linux_lib_argc<>(SB),NOPTR, $8
+DATA _rt0_ppc64le_linux_lib_argv<>(SB)/8, $0
+GLOBL _rt0_ppc64le_linux_lib_argv<>(SB),NOPTR, $8
+
 TEXT _main<>(SB),NOSPLIT,$-8
 	// In a statically linked binary, the stack contains argc,
 	// argv as argc string pointers followed by a NULL, envv as a
@@ -22,13 +152,13 @@
 	// Statically linked
 	MOVD	0(R1), R3 // argc
 	ADD	$8, R1, R4 // argv
-	MOVD	$runtime·tls0(SB), R13 // TLS
+	MOVD	$runtime·m0+m_tls(SB), R13 // TLS
 	ADD	$0x7000, R13
 
 dlink:
 	BR	main(SB)
 
 TEXT main(SB),NOSPLIT,$-8
-	MOVD	$runtime·rt0_go(SB), R31
-	MOVD	R31, CTR
+	MOVD	$runtime·rt0_go(SB), R12
+	MOVD	R12, CTR
 	BR	(CTR)
diff --git a/src/runtime/rt0_linux_s390x.s b/src/runtime/rt0_linux_s390x.s
new file mode 100644
index 0000000..aedd6c7
--- /dev/null
+++ b/src/runtime/rt0_linux_s390x.s
@@ -0,0 +1,20 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+TEXT _rt0_s390x_linux(SB),NOSPLIT|NOFRAME,$0
+	// In a statically linked binary, the stack contains argc,
+	// argv as argc string pointers followed by a NULL, envv as a
+	// sequence of string pointers followed by a NULL, and auxv.
+	// There is no TLS base pointer.
+	//
+	// TODO: Support dynamic linking entry point
+	MOVD 0(R15), R2 // argc
+	ADD $8, R15, R3 // argv
+	BR main(SB)
+
+TEXT main(SB),NOSPLIT|NOFRAME,$0
+	MOVD	$runtime·rt0_go(SB), R11
+	BR	R11
diff --git a/src/runtime/rt0_plan9_386.s b/src/runtime/rt0_plan9_386.s
index c451299..cbbf245 100644
--- a/src/runtime/rt0_plan9_386.s
+++ b/src/runtime/rt0_plan9_386.s
@@ -16,8 +16,6 @@
 	MOVL	AX, 4(SP)
 	CALL	runtime·rt0_go(SB)
 
-DATA  runtime·isplan9(SB)/4, $1
-GLOBL runtime·isplan9(SB), NOPTR, $4
 GLOBL _tos(SB), NOPTR, $4
 GLOBL _privates(SB), NOPTR, $4
 GLOBL _nprivates(SB), NOPTR, $4
diff --git a/src/runtime/rt0_plan9_amd64.s b/src/runtime/rt0_plan9_amd64.s
index ec2d9ec..6fd493a 100644
--- a/src/runtime/rt0_plan9_amd64.s
+++ b/src/runtime/rt0_plan9_amd64.s
@@ -14,8 +14,6 @@
 	MOVQ	$runtime·rt0_go(SB), AX
 	JMP	AX
 
-DATA runtime·isplan9(SB)/4, $1
-GLOBL runtime·isplan9(SB), NOPTR, $4
 GLOBL _tos(SB), NOPTR, $8
 GLOBL _privates(SB), NOPTR, $8
 GLOBL _nprivates(SB), NOPTR, $4
diff --git a/src/runtime/rt0_plan9_arm.s b/src/runtime/rt0_plan9_arm.s
new file mode 100644
index 0000000..2a35e4e
--- /dev/null
+++ b/src/runtime/rt0_plan9_arm.s
@@ -0,0 +1,17 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// In Plan 9, argc is at the top of the stack, followed by pointers to the arguments.
+
+TEXT _rt0_arm_plan9(SB),NOSPLIT,$-4
+	MOVW	R0, _tos(SB)
+	MOVW	0(R13), R0
+	MOVW	$4(R13), R1
+	MOVW.W	R1, -4(R13)
+	MOVW.W	R0, -4(R13)
+	B	runtime·rt0_go(SB)
+
+GLOBL _tos(SB), NOPTR, $4
diff --git a/src/runtime/rt0_solaris_amd64.s b/src/runtime/rt0_solaris_amd64.s
index 5997cbf..e2d1e71 100644
--- a/src/runtime/rt0_solaris_amd64.s
+++ b/src/runtime/rt0_solaris_amd64.s
@@ -13,6 +13,3 @@
 TEXT main(SB),NOSPLIT,$-8
 	MOVQ	$runtime·rt0_go(SB), AX
 	JMP	AX
-
-DATA runtime·issolaris(SB)/4, $1
-GLOBL runtime·issolaris(SB), NOPTR, $4
diff --git a/src/runtime/rt0_windows_386.s b/src/runtime/rt0_windows_386.s
index 03f95d1..b9407a9 100644
--- a/src/runtime/rt0_windows_386.s
+++ b/src/runtime/rt0_windows_386.s
@@ -12,9 +12,39 @@
 	MOVL	$-1, 0(SP) // return PC for main
 	JMP	_main(SB)
 
+// When building with -buildmode=(c-shared or c-archive), this
+// symbol is called. For dynamic libraries it is called when the
+// library is loaded. For static libraries it is called when the
+// final executable starts, during the C runtime initialization
+// phase.
+TEXT _rt0_386_windows_lib(SB),NOSPLIT,$0x1C
+	MOVL	BP, 0x08(SP)
+	MOVL	BX, 0x0C(SP)
+	MOVL	AX, 0x10(SP)
+	MOVL	CX, 0x14(SP)
+	MOVL	DX, 0x18(SP)
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVL	_cgo_sys_thread_create(SB), AX
+	MOVL	$_rt0_386_windows_lib_go(SB), 0x00(SP)
+	MOVL	$0, 0x04(SP)
+
+	// Top two items on the stack are passed to _cgo_sys_thread_create
+	// as parameters. This is the calling convention on 32-bit Windows.
+	CALL	AX
+
+	MOVL	0x08(SP), BP
+	MOVL	0x0C(SP), BX
+	MOVL	0x10(SP), AX
+	MOVL	0x14(SP), CX
+	MOVL	0x18(SP), DX
+	RET
+
+TEXT _rt0_386_windows_lib_go(SB),NOSPLIT,$0
+	MOVL	$0, DI
+	MOVL	$0, SI
+	MOVL	$runtime·rt0_go(SB), AX
+	JMP	AX
+
 TEXT _main(SB),NOSPLIT,$0
 	JMP	runtime·rt0_go(SB)
-
-
-DATA  runtime·iswindows(SB)/4, $1
-GLOBL runtime·iswindows(SB), NOPTR, $4
diff --git a/src/runtime/rt0_windows_amd64.s b/src/runtime/rt0_windows_amd64.s
index df956ba..2f73b37 100644
--- a/src/runtime/rt0_windows_amd64.s
+++ b/src/runtime/rt0_windows_amd64.s
@@ -12,9 +12,37 @@
 	MOVQ	$main(SB), AX
 	JMP	AX
 
-TEXT main(SB),NOSPLIT,$-8
+// When building with -buildmode=(c-shared or c-archive), this
+// symbol is called. For dynamic libraries it is called when the
+// library is loaded. For static libraries it is called when the
+// final executable starts, during the C runtime initialization
+// phase.
+TEXT _rt0_amd64_windows_lib(SB),NOSPLIT,$0x28
+	MOVQ	BP, 0x00(SP)
+	MOVQ	BX, 0x08(SP)
+	MOVQ	AX, 0x10(SP)
+	MOVQ	CX, 0x18(SP)
+	MOVQ	DX, 0x20(SP)
+
+	// Create a new thread to do the runtime initialization and return.
+	MOVQ	_cgo_sys_thread_create(SB), AX
+	MOVQ	$_rt0_amd64_windows_lib_go(SB), CX
+	MOVQ	$0, DX
+	CALL	AX
+
+	MOVQ	0x00(SP), BP
+	MOVQ	0x08(SP), BX
+	MOVQ	0x10(SP), AX
+	MOVQ	0x18(SP), CX
+	MOVQ	0x20(SP), DX
+	RET
+
+TEXT _rt0_amd64_windows_lib_go(SB),NOSPLIT,$0
+	MOVQ	$0, DI
+	MOVQ	$0, SI
 	MOVQ	$runtime·rt0_go(SB), AX
 	JMP	AX
 
-DATA  runtime·iswindows(SB)/4, $1
-GLOBL runtime·iswindows(SB), NOPTR, $4
+TEXT main(SB),NOSPLIT,$-8
+	MOVQ	$runtime·rt0_go(SB), AX
+	JMP	AX
diff --git a/src/runtime/runtime-gdb.py b/src/runtime/runtime-gdb.py
index e57fa00..5c9b2a0 100644
--- a/src/runtime/runtime-gdb.py
+++ b/src/runtime/runtime-gdb.py
@@ -448,15 +448,15 @@
 		except gdb.error:
 			pc = int(str(pc).split(None, 1)[0], 16)
 		save_frame = gdb.selected_frame()
-		gdb.parse_and_eval('$save_pc = $pc')
 		gdb.parse_and_eval('$save_sp = $sp')
-		gdb.parse_and_eval('$pc = {0}'.format(str(pc)))
+		gdb.parse_and_eval('$save_pc = $pc')
 		gdb.parse_and_eval('$sp = {0}'.format(str(sp)))
+		gdb.parse_and_eval('$pc = {0}'.format(str(pc)))
 		try:
 			gdb.execute(cmd)
 		finally:
-			gdb.parse_and_eval('$pc = $save_pc')
 			gdb.parse_and_eval('$sp = $save_sp')
+			gdb.parse_and_eval('$pc = $save_pc')
 			save_frame.select()
 
 
diff --git a/src/runtime/runtime-gdb_test.go b/src/runtime/runtime-gdb_test.go
index 2843633..aabe52d 100644
--- a/src/runtime/runtime-gdb_test.go
+++ b/src/runtime/runtime-gdb_test.go
@@ -1,8 +1,13 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
 package runtime_test
 
 import (
 	"bytes"
 	"fmt"
+	"internal/testenv"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -13,19 +18,22 @@
 	"testing"
 )
 
-func checkGdbPython(t *testing.T) {
-	cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
-	out, err := cmd.CombinedOutput()
-
-	if err != nil {
-		t.Skipf("skipping due to issue running gdb: %v", err)
+func checkGdbEnvironment(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	if runtime.GOOS == "darwin" {
+		t.Skip("gdb does not work on darwin")
 	}
-	if string(out) != "go gdb python support\n" {
-		t.Skipf("skipping due to lack of python gdb support: %s", out)
+	if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final {
+		t.Skip("gdb test can fail with GOROOT_FINAL pending")
 	}
+}
 
+func checkGdbVersion(t *testing.T) {
 	// Issue 11214 reports various failures with older versions of gdb.
-	out, err = exec.Command("gdb", "--version").CombinedOutput()
+	out, err := exec.Command("gdb", "--version").CombinedOutput()
+	if err != nil {
+		t.Skipf("skipping: error executing gdb: %v", err)
+	}
 	re := regexp.MustCompile(`([0-9]+)\.([0-9]+)`)
 	matches := re.FindSubmatch(out)
 	if len(matches) < 3 {
@@ -42,6 +50,18 @@
 	t.Logf("gdb version %d.%d", major, minor)
 }
 
+func checkGdbPython(t *testing.T) {
+	cmd := exec.Command("gdb", "-nx", "-q", "--batch", "-iex", "python import sys; print('go gdb python support')")
+	out, err := cmd.CombinedOutput()
+
+	if err != nil {
+		t.Skipf("skipping due to issue running gdb: %v", err)
+	}
+	if string(out) != "go gdb python support\n" {
+		t.Skipf("skipping due to lack of python gdb support: %s", out)
+	}
+}
+
 const helloSource = `
 package main
 import "fmt"
@@ -57,10 +77,8 @@
 `
 
 func TestGdbPython(t *testing.T) {
-	if runtime.GOOS == "darwin" {
-		t.Skip("gdb does not work on darwin")
-	}
-
+	checkGdbEnvironment(t)
+	checkGdbVersion(t)
 	checkGdbPython(t)
 
 	dir, err := ioutil.TempDir("", "go-build")
@@ -84,6 +102,9 @@
 
 	args := []string{"-nx", "-q", "--batch", "-iex",
 		fmt.Sprintf("add-auto-load-safe-path %s/src/runtime", runtime.GOROOT()),
+		"-ex", "set startup-with-shell off",
+		"-ex", "info auto-load python-scripts",
+		"-ex", "set python print-stack full",
 		"-ex", "br main.go:10",
 		"-ex", "run",
 		"-ex", "echo BEGIN info goroutines\n",
@@ -94,16 +115,13 @@
 		"-ex", "echo END\n",
 		"-ex", "echo BEGIN print strvar\n",
 		"-ex", "print strvar",
-		"-ex", "echo END\n",
-		"-ex", "echo BEGIN print ptrvar\n",
-		"-ex", "print ptrvar",
 		"-ex", "echo END\n"}
 
 	// without framepointer, gdb cannot backtrace our non-standard
 	// stack frames on RISC architectures.
 	canBackTrace := false
 	switch runtime.GOARCH {
-	case "amd64", "386", "ppc64", "ppc64le", "arm", "arm64":
+	case "amd64", "386", "ppc64", "ppc64le", "arm", "arm64", "mips64", "mips64le", "s390x":
 		canBackTrace = true
 		args = append(args,
 			"-ex", "echo BEGIN goroutine 2 bt\n",
@@ -126,7 +144,10 @@
 			t.Skipf("skipping because GOROOT=%s does not exist", runtime.GOROOT())
 		}
 
-		t.Fatalf("failed to load Go runtime support: %s", firstLine)
+		_, file, _, _ := runtime.Caller(1)
+
+		t.Logf("package testing source file: %s", file)
+		t.Fatalf("failed to load Go runtime support: %s\n%s", firstLine, got)
 	}
 
 	// Extract named BEGIN...END blocks from output
@@ -151,10 +172,6 @@
 		t.Fatalf("print strvar failed: %s", bl)
 	}
 
-	if bl := blocks["print ptrvar"]; !strVarRe.MatchString(bl) {
-		t.Fatalf("print ptrvar failed: %s", bl)
-	}
-
 	btGoroutineRe := regexp.MustCompile(`^#0\s+runtime.+at`)
 	if bl := blocks["goroutine 2 bt"]; canBackTrace && !btGoroutineRe.MatchString(bl) {
 		t.Fatalf("goroutine 2 bt failed: %s", bl)
@@ -162,3 +179,87 @@
 		t.Logf("gdb cannot backtrace for GOARCH=%s, skipped goroutine backtrace test", runtime.GOARCH)
 	}
 }
+
+const backtraceSource = `
+package main
+
+//go:noinline
+func aaa() bool { return bbb() }
+
+//go:noinline
+func bbb() bool { return ccc() }
+
+//go:noinline
+func ccc() bool { return ddd() }
+
+//go:noinline
+func ddd() bool { return f() }
+
+//go:noinline
+func eee() bool { return true }
+
+var f = eee
+
+func main() {
+	_ = aaa()
+}
+`
+
+// TestGdbBacktrace tests that gdb can unwind the stack correctly
+// using only the DWARF debug info.
+func TestGdbBacktrace(t *testing.T) {
+	checkGdbEnvironment(t)
+	checkGdbVersion(t)
+
+	if runtime.GOOS == "netbsd" {
+		testenv.SkipFlaky(t, 15603)
+	}
+
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	// Build the source code.
+	src := filepath.Join(dir, "main.go")
+	err = ioutil.WriteFile(src, []byte(backtraceSource), 0644)
+	if err != nil {
+		t.Fatalf("failed to create file: %v", err)
+	}
+	cmd := exec.Command("go", "build", "-o", "a.exe")
+	cmd.Dir = dir
+	out, err := testEnv(cmd).CombinedOutput()
+	if err != nil {
+		t.Fatalf("building source %v\n%s", err, out)
+	}
+
+	// Execute gdb commands.
+	args := []string{"-nx", "-batch",
+		"-ex", "set startup-with-shell off",
+		"-ex", "break main.eee",
+		"-ex", "run",
+		"-ex", "backtrace",
+		"-ex", "continue",
+		filepath.Join(dir, "a.exe"),
+	}
+	got, _ := exec.Command("gdb", args...).CombinedOutput()
+
+	// Check that the backtrace matches the source code.
+	bt := []string{
+		"eee",
+		"ddd",
+		"ccc",
+		"bbb",
+		"aaa",
+		"main",
+	}
+	for i, name := range bt {
+		s := fmt.Sprintf("#%v.*main\\.%v", i, name)
+		re := regexp.MustCompile(s)
+		if found := re.Find(got) != nil; !found {
+			t.Errorf("could not find '%v' in backtrace", s)
+			t.Fatalf("gdb output:\n%v", string(got))
+		}
+	}
+}
diff --git a/src/runtime/runtime-lldb_test.go b/src/runtime/runtime-lldb_test.go
new file mode 100644
index 0000000..4c379b9
--- /dev/null
+++ b/src/runtime/runtime-lldb_test.go
@@ -0,0 +1,262 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+	"debug/elf"
+	"debug/macho"
+	"encoding/binary"
+	"internal/testenv"
+	"io"
+	"io/ioutil"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+	"testing"
+)
+
+var lldbPath string
+
+func checkLldbPython(t *testing.T) {
+	cmd := exec.Command("lldb", "-P")
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Skipf("skipping due to issue running lldb: %v\n%s", err, out)
+	}
+	lldbPath = strings.TrimSpace(string(out))
+
+	cmd = exec.Command("/usr/bin/python2.7", "-c", "import sys;sys.path.append(sys.argv[1]);import lldb; print('go lldb python support')", lldbPath)
+	out, err = cmd.CombinedOutput()
+
+	if err != nil {
+		t.Skipf("skipping due to issue running python: %v\n%s", err, out)
+	}
+	if string(out) != "go lldb python support\n" {
+		t.Skipf("skipping due to lack of python lldb support: %s", out)
+	}
+
+	if runtime.GOOS == "darwin" {
+		// Try to see if we have debugging permissions.
+		cmd = exec.Command("/usr/sbin/DevToolsSecurity", "-status")
+		out, err = cmd.CombinedOutput()
+		if err != nil {
+			t.Skipf("DevToolsSecurity failed: %v", err)
+		} else if !strings.Contains(string(out), "enabled") {
+			t.Skip(string(out))
+		}
+		cmd = exec.Command("/usr/bin/groups")
+		out, err = cmd.CombinedOutput()
+		if err != nil {
+			t.Skipf("groups failed: %v", err)
+		} else if !strings.Contains(string(out), "_developer") {
+			t.Skip("Not in _developer group")
+		}
+	}
+}
+
+const lldbHelloSource = `
+package main
+import "fmt"
+func main() {
+	mapvar := make(map[string]string,5)
+	mapvar["abc"] = "def"
+	mapvar["ghi"] = "jkl"
+	intvar := 42
+	ptrvar := &intvar
+	fmt.Println("hi") // line 10
+	_ = ptrvar
+}
+`
+
+const lldbScriptSource = `
+import sys
+sys.path.append(sys.argv[1])
+import lldb
+import os
+
+TIMEOUT_SECS = 5
+
+debugger = lldb.SBDebugger.Create()
+debugger.SetAsync(True)
+target = debugger.CreateTargetWithFileAndArch("a.exe", None)
+if target:
+  print "Created target"
+  main_bp = target.BreakpointCreateByLocation("main.go", 10)
+  if main_bp:
+    print "Created breakpoint"
+  process = target.LaunchSimple(None, None, os.getcwd())
+  if process:
+    print "Process launched"
+    listener = debugger.GetListener()
+    process.broadcaster.AddListener(listener, lldb.SBProcess.eBroadcastBitStateChanged)
+    while True:
+      event = lldb.SBEvent()
+      if listener.WaitForEvent(TIMEOUT_SECS, event):
+        if lldb.SBProcess.GetRestartedFromEvent(event):
+          continue
+        state = process.GetState()
+        if state in [lldb.eStateUnloaded, lldb.eStateLaunching, lldb.eStateRunning]:
+          continue
+      else:
+        print "Timeout launching"
+      break
+    if state == lldb.eStateStopped:
+      for t in process.threads:
+        if t.GetStopReason() == lldb.eStopReasonBreakpoint:
+          print "Hit breakpoint"
+          frame = t.GetFrameAtIndex(0)
+          if frame:
+            if frame.line_entry:
+              print "Stopped at %s:%d" % (frame.line_entry.file.basename, frame.line_entry.line)
+            if frame.function:
+              print "Stopped in %s" % (frame.function.name,)
+            var = frame.FindVariable('intvar')
+            if var:
+              print "intvar = %s" % (var.GetValue(),)
+            else:
+              print "no intvar"
+    else:
+      print "Process state", state
+    process.Destroy()
+else:
+  print "Failed to create target a.exe"
+
+lldb.SBDebugger.Destroy(debugger)
+sys.exit()
+`
+
+const expectedLldbOutput = `Created target
+Created breakpoint
+Process launched
+Hit breakpoint
+Stopped at main.go:10
+Stopped in main.main
+intvar = 42
+`
+
+func TestLldbPython(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	if final := os.Getenv("GOROOT_FINAL"); final != "" && runtime.GOROOT() != final {
+		t.Skip("gdb test can fail with GOROOT_FINAL pending")
+	}
+
+	checkLldbPython(t)
+
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	src := filepath.Join(dir, "main.go")
+	err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644)
+	if err != nil {
+		t.Fatalf("failed to create file: %v", err)
+	}
+
+	cmd := exec.Command("go", "build", "-gcflags", "-N -l", "-o", "a.exe")
+	cmd.Dir = dir
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("building source %v\n%s", err, out)
+	}
+
+	src = filepath.Join(dir, "script.py")
+	err = ioutil.WriteFile(src, []byte(lldbScriptSource), 0755)
+	if err != nil {
+		t.Fatalf("failed to create script: %v", err)
+	}
+
+	cmd = exec.Command("/usr/bin/python2.7", "script.py", lldbPath)
+	cmd.Dir = dir
+	got, _ := cmd.CombinedOutput()
+
+	if string(got) != expectedLldbOutput {
+		if strings.Contains(string(got), "Timeout launching") {
+			t.Skip("Timeout launching")
+		}
+		t.Fatalf("Unexpected lldb output:\n%s", got)
+	}
+}
+
+// Check that aranges are valid even when lldb isn't installed.
+func TestDwarfAranges(t *testing.T) {
+	testenv.MustHaveGoBuild(t)
+	dir, err := ioutil.TempDir("", "go-build")
+	if err != nil {
+		t.Fatalf("failed to create temp directory: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	src := filepath.Join(dir, "main.go")
+	err = ioutil.WriteFile(src, []byte(lldbHelloSource), 0644)
+	if err != nil {
+		t.Fatalf("failed to create file: %v", err)
+	}
+
+	cmd := exec.Command("go", "build", "-o", "a.exe")
+	cmd.Dir = dir
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("building source %v\n%s", err, out)
+	}
+
+	filename := filepath.Join(dir, "a.exe")
+	if f, err := elf.Open(filename); err == nil {
+		sect := f.Section(".debug_aranges")
+		if sect == nil {
+			t.Fatal("Missing aranges section")
+		}
+		verifyAranges(t, f.ByteOrder, sect.Open())
+	} else if f, err := macho.Open(filename); err == nil {
+		sect := f.Section("__debug_aranges")
+		if sect == nil {
+			t.Fatal("Missing aranges section")
+		}
+		verifyAranges(t, f.ByteOrder, sect.Open())
+	} else {
+		t.Skip("Not an elf or macho binary.")
+	}
+}
+
+func verifyAranges(t *testing.T, byteorder binary.ByteOrder, data io.ReadSeeker) {
+	var header struct {
+		UnitLength  uint32 // does not include the UnitLength field
+		Version     uint16
+		Offset      uint32
+		AddressSize uint8
+		SegmentSize uint8
+	}
+	for {
+		offset, err := data.Seek(0, io.SeekCurrent)
+		if err != nil {
+			t.Fatalf("Seek error: %v", err)
+		}
+		if err = binary.Read(data, byteorder, &header); err == io.EOF {
+			return
+		} else if err != nil {
+			t.Fatalf("Error reading arange header: %v", err)
+		}
+		tupleSize := int64(header.SegmentSize) + 2*int64(header.AddressSize)
+		lastTupleOffset := offset + int64(header.UnitLength) + 4 - tupleSize
+		if lastTupleOffset%tupleSize != 0 {
+			t.Fatalf("Invalid arange length %d, (addr %d, seg %d)", header.UnitLength, header.AddressSize, header.SegmentSize)
+		}
+		if _, err = data.Seek(lastTupleOffset, io.SeekStart); err != nil {
+			t.Fatalf("Seek error: %v", err)
+		}
+		buf := make([]byte, tupleSize)
+		if n, err := data.Read(buf); err != nil || int64(n) < tupleSize {
+			t.Fatalf("Read error: %v", err)
+		}
+		for _, val := range buf {
+			if val != 0 {
+				t.Fatalf("Invalid terminator")
+			}
+		}
+	}
+}
diff --git a/src/runtime/runtime.go b/src/runtime/runtime.go
index 2387d9a..d9c26cc 100644
--- a/src/runtime/runtime.go
+++ b/src/runtime/runtime.go
@@ -1,13 +1,17 @@
-// Copyright 2009 The Go Authors.  All rights reserved.
+// Copyright 2009 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import _ "unsafe" // for go:linkname
+import (
+	"runtime/internal/atomic"
+	_ "unsafe" // for go:linkname
+)
 
 //go:generate go run wincallback.go
 //go:generate go run mkduff.go
+//go:generate go run mkfastlog2table.go
 
 var ticks struct {
 	lock mutex
@@ -15,11 +19,9 @@
 	val  uint64
 }
 
-var tls0 [8]uintptr // available storage for m0's TLS; not necessarily used; opaque to GC
-
 // Note: Called by runtime/pprof in addition to runtime code.
 func tickspersecond() int64 {
-	r := int64(atomicload64(&ticks.val))
+	r := int64(atomic.Load64(&ticks.val))
 	if r != 0 {
 		return r
 	}
@@ -38,7 +40,7 @@
 		if r == 0 {
 			r++
 		}
-		atomicstore64(&ticks.val, uint64(r))
+		atomic.Store64(&ticks.val, uint64(r))
 	}
 	unlock(&ticks.lock)
 	return r
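tickspersecond keeps its fast path lock-free: an atomic load satisfies repeat
callers, and the mutex only guards the one-time calibration. The same shape in
isolation (names are illustrative):

	package main

	import (
		"fmt"
		"sync"
		"sync/atomic"
	)

	var (
		mu     sync.Mutex
		cached uint64
	)

	func calibrate() uint64 { return 42 } // stands in for the timing loop

	func value() uint64 {
		if v := atomic.LoadUint64(&cached); v != 0 {
			return v // fast path, no lock
		}
		mu.Lock()
		defer mu.Unlock()
		if cached == 0 { // re-check under the lock
			atomic.StoreUint64(&cached, calibrate())
		}
		return cached
	}

	func main() { fmt.Println(value(), value()) }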
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 134c999..302f58d 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -4,34 +4,47 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Keep a cached value to make gotraceback fast,
 // since we call it on every call to gentraceback.
-// The cached value is a uint32 in which the low bit
-// is the "crash" setting and the top 31 bits are the
-// gotraceback value.
-var traceback_cache uint32 = 2 << 1
+// The cached value is a uint32 in which the low bits
+// are the "crash" and "all" settings and the remaining
+// bits are the traceback value (0 off, 1 on, 2 include system).
+const (
+	tracebackCrash = 1 << iota
+	tracebackAll
+	tracebackShift = iota
+)
 
-// The GOTRACEBACK environment variable controls the
-// behavior of a Go program that is crashing and exiting.
-//	GOTRACEBACK=0   suppress all tracebacks
-//	GOTRACEBACK=1   default behavior - show tracebacks but exclude runtime frames
-//	GOTRACEBACK=2   show tracebacks including runtime frames
-//	GOTRACEBACK=crash   show tracebacks including runtime frames, then crash (core dump etc)
+var traceback_cache uint32 = 2 << tracebackShift
+var traceback_env uint32
+
+// gotraceback returns the current traceback settings.
+//
+// If level is 0, suppress all tracebacks.
+// If level is 1, show tracebacks, but exclude runtime frames.
+// If level is 2, show tracebacks including runtime frames.
+// If all is set, print all goroutine stacks. Otherwise, print just the current goroutine.
+// If crash is set, crash (core dump, etc) after tracebacking.
+//
 //go:nosplit
-func gotraceback(crash *bool) int32 {
+func gotraceback() (level int32, all, crash bool) {
 	_g_ := getg()
-	if crash != nil {
-		*crash = false
-	}
+	all = _g_.m.throwing > 0
 	if _g_.m.traceback != 0 {
-		return int32(_g_.m.traceback)
+		level = int32(_g_.m.traceback)
+		return
 	}
-	if crash != nil {
-		*crash = traceback_cache&1 != 0
-	}
-	return int32(traceback_cache >> 1)
+	t := atomic.Load(&traceback_cache)
+	crash = t&tracebackCrash != 0
+	all = all || t&tracebackAll != 0
+	level = int32(t >> tracebackShift)
+	return
 }
 
 var (
@@ -39,10 +52,10 @@
 	argv **byte
 )
 
-// nosplit for use in linux/386 startup linux_setup_vdso
+// nosplit for use in linux startup sysargs
 //go:nosplit
 func argv_index(argv **byte, i int32) *byte {
-	return *(**byte)(add(unsafe.Pointer(argv), uintptr(i)*ptrSize))
+	return *(**byte)(add(unsafe.Pointer(argv), uintptr(i)*sys.PtrSize))
 }
 
 func args(c int32, v **byte) {
@@ -51,13 +64,6 @@
 	sysargs(c, v)
 }
 
-var (
-	// TODO: Retire in favor of GOOS== checks.
-	isplan9   int32
-	issolaris int32
-	iswindows int32
-)
-
 func goargs() {
 	if GOOS == "windows" {
 		return
@@ -71,7 +77,7 @@
 
 func goenvs_unix() {
 	// TODO(austin): ppc64 in dynamic linking mode doesn't
-	// guarantee env[] will immediately follow argv.  Might cause
+	// guarantee env[] will immediately follow argv. Might cause
 	// problems.
 	n := int32(0)
 	for argv_index(argv, argc+1+n) != nil {
@@ -99,36 +105,36 @@
 	prefetcht1(uintptr(unsafe.Pointer(&test_z64)))
 	prefetcht2(uintptr(unsafe.Pointer(&test_z64)))
 	prefetchnta(uintptr(unsafe.Pointer(&test_z64)))
-	if cas64(&test_z64, test_x64, 1) {
+	if atomic.Cas64(&test_z64, test_x64, 1) {
 		throw("cas64 failed")
 	}
 	if test_x64 != 0 {
 		throw("cas64 failed")
 	}
 	test_x64 = 42
-	if !cas64(&test_z64, test_x64, 1) {
+	if !atomic.Cas64(&test_z64, test_x64, 1) {
 		throw("cas64 failed")
 	}
 	if test_x64 != 42 || test_z64 != 1 {
 		throw("cas64 failed")
 	}
-	if atomicload64(&test_z64) != 1 {
+	if atomic.Load64(&test_z64) != 1 {
 		throw("load64 failed")
 	}
-	atomicstore64(&test_z64, (1<<40)+1)
-	if atomicload64(&test_z64) != (1<<40)+1 {
+	atomic.Store64(&test_z64, (1<<40)+1)
+	if atomic.Load64(&test_z64) != (1<<40)+1 {
 		throw("store64 failed")
 	}
-	if xadd64(&test_z64, (1<<40)+1) != (2<<40)+2 {
+	if atomic.Xadd64(&test_z64, (1<<40)+1) != (2<<40)+2 {
 		throw("xadd64 failed")
 	}
-	if atomicload64(&test_z64) != (2<<40)+2 {
+	if atomic.Load64(&test_z64) != (2<<40)+2 {
 		throw("xadd64 failed")
 	}
-	if xchg64(&test_z64, (3<<40)+3) != (2<<40)+2 {
+	if atomic.Xchg64(&test_z64, (3<<40)+3) != (2<<40)+2 {
 		throw("xchg64 failed")
 	}
-	if atomicload64(&test_z64) != (3<<40)+3 {
+	if atomic.Load64(&test_z64) != (3<<40)+3 {
 		throw("xchg64 failed")
 	}
 }
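check() now goes through runtime/internal/atomic, the runtime-private counterpart
of sync/atomic. Ordinary code expresses the same operations like this (a sketch
mirroring the 64-bit tests above):

	package main

	import (
		"fmt"
		"sync/atomic"
	)

	func main() {
		var z uint64
		if atomic.CompareAndSwapUint64(&z, 1, 2) { // z is 0, so this must fail
			panic("cas succeeded unexpectedly")
		}
		atomic.StoreUint64(&z, (1<<40)+1)
		if atomic.AddUint64(&z, (1<<40)+1) != (2<<40)+2 { // like Xadd64
			panic("add failed")
		}
		old := atomic.SwapUint64(&z, (3<<40)+3) // like Xchg64
		fmt.Println(old == (2<<40)+2, atomic.LoadUint64(&z) == (3<<40)+3)
	}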
@@ -189,10 +195,10 @@
 	if unsafe.Sizeof(j) != 8 {
 		throw("bad j")
 	}
-	if unsafe.Sizeof(k) != ptrSize {
+	if unsafe.Sizeof(k) != sys.PtrSize {
 		throw("bad k")
 	}
-	if unsafe.Sizeof(l) != ptrSize {
+	if unsafe.Sizeof(l) != sys.PtrSize {
 		throw("bad l")
 	}
 	if unsafe.Sizeof(x1) != 1 {
@@ -211,7 +217,7 @@
 
 	var z uint32
 	z = 1
-	if !cas(&z, 1, 2) {
+	if !atomic.Cas(&z, 1, 2) {
 		throw("cas1")
 	}
 	if z != 2 {
@@ -219,7 +225,7 @@
 	}
 
 	z = 4
-	if cas(&z, 5, 6) {
+	if atomic.Cas(&z, 5, 6) {
 		throw("cas3")
 	}
 	if z != 4 {
@@ -227,7 +233,7 @@
 	}
 
 	z = 0xffffffff
-	if !cas(&z, 0xffffffff, 0xfffffffe) {
+	if !atomic.Cas(&z, 0xffffffff, 0xfffffffe) {
 		throw("cas5")
 	}
 	if z != 0xfffffffe {
@@ -235,8 +241,8 @@
 	}
 
 	k = unsafe.Pointer(uintptr(0xfedcb123))
-	if ptrSize == 8 {
-		k = unsafe.Pointer(uintptr(unsafe.Pointer(k)) << 10)
+	if sys.PtrSize == 8 {
+		k = unsafe.Pointer(uintptr(k) << 10)
 	}
 	if casp(&k, nil, nil) {
 		throw("casp1")
@@ -250,7 +256,7 @@
 	}
 
 	m = [4]byte{1, 1, 1, 1}
-	atomicor8(&m[1], 0xf0)
+	atomic.Or8(&m[1], 0xf0)
 	if m[0] != 1 || m[1] != 0xf1 || m[2] != 1 || m[3] != 1 {
 		throw("atomicor8")
 	}
@@ -292,6 +298,10 @@
 	if _FixedStack != round2(_FixedStack) {
 		throw("FixedStack is not power-of-2")
 	}
+
+	if !checkASM() {
+		throw("assembly checks failed")
+	}
 }
 
 type dbgVar struct {
@@ -305,6 +315,7 @@
 // already have an initial value.
 var debug struct {
 	allocfreetrace    int32
+	cgocheck          int32
 	efence            int32
 	gccheckmark       int32
 	gcpacertrace      int32
@@ -323,6 +334,7 @@
 
 var dbgvars = []dbgVar{
 	{"allocfreetrace", &debug.allocfreetrace},
+	{"cgocheck", &debug.cgocheck},
 	{"efence", &debug.efence},
 	{"gccheckmark", &debug.gccheckmark},
 	{"gcpacertrace", &debug.gcpacertrace},
@@ -341,6 +353,7 @@
 
 func parsedebugvars() {
 	// defaults
+	debug.cgocheck = 1
 	debug.invalidptr = 1
 
 	for p := gogetenv("GODEBUG"); p != ""; {
@@ -371,23 +384,47 @@
 		}
 	}
 
-	switch p := gogetenv("GOTRACEBACK"); p {
-	case "":
-		traceback_cache = 1 << 1
-	case "crash":
-		traceback_cache = 2<<1 | 1
-	default:
-		traceback_cache = uint32(atoi(p)) << 1
-	}
-	// when C owns the process, simply exit'ing the process on fatal errors
-	// and panics is surprising. Be louder and abort instead.
-	if islibrary || isarchive {
-		traceback_cache |= 1
-	}
+	setTraceback(gogetenv("GOTRACEBACK"))
+	traceback_env = traceback_cache
 
 	if debug.gcstackbarrierall > 0 {
 		firstStackBarrierOffset = 0
 	}
+
+	// For cgocheck > 1, we turn on the write barrier at all times
+	// and check all pointer writes.
+	if debug.cgocheck > 1 {
+		writeBarrier.cgo = true
+		writeBarrier.enabled = true
+	}
+}
+
+//go:linkname setTraceback runtime/debug.SetTraceback
+func setTraceback(level string) {
+	var t uint32
+	switch level {
+	case "none":
+		t = 0
+	case "single", "":
+		t = 1 << tracebackShift
+	case "all":
+		t = 1<<tracebackShift | tracebackAll
+	case "system":
+		t = 2<<tracebackShift | tracebackAll
+	case "crash":
+		t = 2<<tracebackShift | tracebackAll | tracebackCrash
+	default:
+		t = uint32(atoi(level))<<tracebackShift | tracebackAll
+	}
+	// When C owns the process, simply exiting the process on fatal errors
+	// and panics is surprising. Be louder and abort instead.
+	if islibrary || isarchive {
+		t |= tracebackCrash
+	}
+
+	t |= traceback_env
+
+	atomic.Store(&traceback_cache, t)
 }
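The cache packs the crash and all flags into the low bits and the numeric level
above tracebackShift, so one atomic word carries the whole setting. Decoding is
just shifts and masks; a standalone illustration using the same constants:

	package main

	import "fmt"

	const (
		tracebackCrash = 1 << iota // 1
		tracebackAll               // 2
		tracebackShift = iota      // 2: the level occupies the remaining bits
	)

	func main() {
		t := uint32(2)<<tracebackShift | tracebackAll | tracebackCrash // "crash"
		level := int32(t >> tracebackShift)
		fmt.Println(level, t&tracebackAll != 0, t&tracebackCrash != 0) // 2 true true
	}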
 
 // Poor man's 64-bit division.
@@ -440,11 +477,51 @@
 }
 
 //go:linkname reflect_typelinks reflect.typelinks
-//go:nosplit
-func reflect_typelinks() [][]*_type {
-	ret := [][]*_type{firstmoduledata.typelinks}
+func reflect_typelinks() ([]unsafe.Pointer, [][]int32) {
+	sections := []unsafe.Pointer{unsafe.Pointer(firstmoduledata.types)}
+	ret := [][]int32{firstmoduledata.typelinks}
 	for datap := firstmoduledata.next; datap != nil; datap = datap.next {
+		sections = append(sections, unsafe.Pointer(datap.types))
 		ret = append(ret, datap.typelinks)
 	}
-	return ret
+	return sections, ret
+}
+
+// reflect_resolveNameOff resolves a name offset from a base pointer.
+//go:linkname reflect_resolveNameOff reflect.resolveNameOff
+func reflect_resolveNameOff(ptrInModule unsafe.Pointer, off int32) unsafe.Pointer {
+	return unsafe.Pointer(resolveNameOff(ptrInModule, nameOff(off)).bytes)
+}
+
+// reflect_resolveTypeOff resolves an *rtype offset from a base type.
+//go:linkname reflect_resolveTypeOff reflect.resolveTypeOff
+func reflect_resolveTypeOff(rtype unsafe.Pointer, off int32) unsafe.Pointer {
+	return unsafe.Pointer((*_type)(rtype).typeOff(typeOff(off)))
+}
+
+// reflect_resolveTextOff resolves a function pointer offset from a base type.
+//go:linkname reflect_resolveTextOff reflect.resolveTextOff
+func reflect_resolveTextOff(rtype unsafe.Pointer, off int32) unsafe.Pointer {
+	return (*_type)(rtype).textOff(textOff(off))
+}
+
+// reflect_addReflectOff adds a pointer to the reflection offset lookup map.
+//go:linkname reflect_addReflectOff reflect.addReflectOff
+func reflect_addReflectOff(ptr unsafe.Pointer) int32 {
+	reflectOffsLock()
+	if reflectOffs.m == nil {
+		reflectOffs.m = make(map[int32]unsafe.Pointer)
+		reflectOffs.minv = make(map[unsafe.Pointer]int32)
+		reflectOffs.next = -1
+	}
+	id, found := reflectOffs.minv[ptr]
+	if !found {
+		id = reflectOffs.next
+		reflectOffs.next-- // use negative offsets as IDs to aid debugging
+		reflectOffs.m[id] = ptr
+		reflectOffs.minv[ptr] = id
+	}
+	reflectOffsUnlock()
+	return id
 }
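reflect_addReflectOff is a small intern table: IDs grow downward from -1 so they
can never collide with real (non-negative) section offsets, and the inverse map
makes repeated registration idempotent. The pattern in isolation (names are
illustrative):

	package main

	import (
		"fmt"
		"sync"
		"unsafe"
	)

	var (
		mu   sync.Mutex
		m    = map[int32]unsafe.Pointer{}
		minv = map[unsafe.Pointer]int32{}
		next = int32(-1) // negative IDs to aid debugging, as above
	)

	func addOff(ptr unsafe.Pointer) int32 {
		mu.Lock()
		defer mu.Unlock()
		id, found := minv[ptr]
		if !found {
			id = next
			next--
			m[id] = ptr
			minv[ptr] = id
		}
		return id
	}

	func main() {
		p := unsafe.Pointer(new(int))
		fmt.Println(addOff(p), addOff(p)) // same pointer, same ID: -1 -1
	}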
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 57cd869..6119e75 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -4,36 +4,88 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
-/*
- * defined constants
- */
+// defined constants
 const (
 	// G status
 	//
+	// Beyond indicating the general state of a G, the G status
+	// acts like a lock on the goroutine's stack (and hence its
+	// ability to execute user code).
+	//
 	// If you add to this list, add to the list
 	// of "okay during garbage collection" status
 	// in mgcmark.go too.
-	_Gidle            = iota // 0
-	_Grunnable               // 1 runnable and on a run queue
-	_Grunning                // 2
-	_Gsyscall                // 3
-	_Gwaiting                // 4
-	_Gmoribund_unused        // 5 currently unused, but hardcoded in gdb scripts
-	_Gdead                   // 6
-	_Genqueue                // 7 Only the Gscanenqueue is used.
-	_Gcopystack              // 8 in this state when newstack is moving the stack
-	// the following encode that the GC is scanning the stack and what to do when it is done
-	_Gscan = 0x1000 // atomicstatus&~Gscan = the non-scan state,
-	// _Gscanidle =     _Gscan + _Gidle,      // Not used. Gidle only used with newly malloced gs
-	_Gscanrunnable = _Gscan + _Grunnable //  0x1001 When scanning completes make Grunnable (it is already on run queue)
-	_Gscanrunning  = _Gscan + _Grunning  //  0x1002 Used to tell preemption newstack routine to scan preempted stack.
-	_Gscansyscall  = _Gscan + _Gsyscall  //  0x1003 When scanning completes make it Gsyscall
-	_Gscanwaiting  = _Gscan + _Gwaiting  //  0x1004 When scanning completes make it Gwaiting
-	// _Gscanmoribund_unused,               //  not possible
-	// _Gscandead,                          //  not possible
-	_Gscanenqueue = _Gscan + _Genqueue //  When scanning completes make it Grunnable and put on runqueue
+
+	// _Gidle means this goroutine was just allocated and has not
+	// yet been initialized.
+	_Gidle = iota // 0
+
+	// _Grunnable means this goroutine is on a run queue. It is
+	// not currently executing user code. The stack is not owned.
+	_Grunnable // 1
+
+	// _Grunning means this goroutine may execute user code. The
+	// stack is owned by this goroutine. It is not on a run queue.
+	// It is assigned an M and a P.
+	_Grunning // 2
+
+	// _Gsyscall means this goroutine is executing a system call.
+	// It is not executing user code. The stack is owned by this
+	// goroutine. It is not on a run queue. It is assigned an M.
+	_Gsyscall // 3
+
+	// _Gwaiting means this goroutine is blocked in the runtime.
+	// It is not executing user code. It is not on a run queue,
+	// but should be recorded somewhere (e.g., a channel wait
+	// queue) so it can be ready()d when necessary. The stack is
+	// not owned *except* that a channel operation may read or
+	// write parts of the stack under the appropriate channel
+	// lock. Otherwise, it is not safe to access the stack after a
+	// goroutine enters _Gwaiting (e.g., it may get moved).
+	_Gwaiting // 4
+
+	// _Gmoribund_unused is currently unused, but hardcoded in gdb
+	// scripts.
+	_Gmoribund_unused // 5
+
+	// _Gdead means this goroutine is currently unused. It may be
+	// just exited, on a free list, or just being initialized. It
+	// is not executing user code. It may or may not have a stack
+	// allocated. The G and its stack (if any) are owned by the M
+	// that is exiting the G or that obtained the G from the free
+	// list.
+	_Gdead // 6
+
+	// _Genqueue_unused is currently unused.
+	_Genqueue_unused // 7
+
+	// _Gcopystack means this goroutine's stack is being moved. It
+	// is not executing user code and is not on a run queue. The
+	// stack is owned by the goroutine that put it in _Gcopystack.
+	_Gcopystack // 8
+
+	// _Gscan combined with one of the above states other than
+	// _Grunning indicates that GC is scanning the stack. The
+	// goroutine is not executing user code and the stack is owned
+	// by the goroutine that set the _Gscan bit.
+	//
+	// _Gscanrunning is different: it is used to briefly block
+	// state transitions while GC signals the G to scan its own
+	// stack. This is otherwise like _Grunning.
+	//
+	// atomicstatus&~Gscan gives the state the goroutine will
+	// return to when the scan completes.
+	_Gscan         = 0x1000
+	_Gscanrunnable = _Gscan + _Grunnable // 0x1001
+	_Gscanrunning  = _Gscan + _Grunning  // 0x1002
+	_Gscansyscall  = _Gscan + _Gsyscall  // 0x1003
+	_Gscanwaiting  = _Gscan + _Gwaiting  // 0x1004
 )
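
Because _Gscan is a single high bit, scan states decompose cheaply: masking the bit out recovers the state the goroutine returns to when scanning completes. A minimal sketch using the constants above:

package main

import "fmt"

const (
	_Gidle = iota
	_Grunnable
	_Grunning
	_Gsyscall
	_Gwaiting

	_Gscan        = 0x1000
	_Gscanwaiting = _Gscan + _Gwaiting
)

func main() {
	status := uint32(_Gscanwaiting)
	fmt.Println(status&_Gscan != 0)          // true: a stack scan is in progress
	fmt.Println(status&^_Gscan == _Gwaiting) // true: the state to return to
}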
 
 const (
@@ -45,14 +97,10 @@
 	_Pdead
 )
 
-// The next line makes 'go generate' write the zgen_*.go files with
-// per-OS and per-arch information, including constants
-// named goos_$GOOS and goarch_$GOARCH for every
-// known GOOS and GOARCH. The constant is 1 on the
-// current system, 0 otherwise; multiplying by them is
-// useful for defining GOOS- or GOARCH-specific constants.
-//go:generate go run gengoos.go
-
+// Mutual exclusion locks. In the uncontended case they are
+// as fast as spin locks (just a few user-level instructions),
+// but on the contention path they sleep in the kernel.
+// A zeroed Mutex is unlocked (no need to initialize each lock).
 type mutex struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
@@ -60,6 +108,26 @@
 	key uintptr
 }
 
+// Sleep and wakeup on one-time events.
+// Before any calls to notesleep or notewakeup,
+// noteclear must be called to initialize the note.
+// Then, exactly one thread can call notesleep
+// and exactly one thread can call notewakeup (once).
+// Once notewakeup has been called, notesleep
+// will return. Future notesleep calls return immediately.
+// A subsequent noteclear must be called only after
+// a previous notesleep has returned; for example, it is
+// disallowed to call noteclear straight after notewakeup.
+//
+// notetsleep is like notesleep but wakes up after
+// a given number of nanoseconds even if the event
+// has not yet happened. If a goroutine uses notetsleep to
+// wake up early, it must wait to call noteclear until it
+// can be sure that no other goroutine is calling
+// notewakeup.
+//
+// notesleep/notetsleep are generally called on g0;
+// notetsleepg is like notetsleep but is called on a user g.
 type note struct {
 	// Futex-based impl treats it as uint32 key,
 	// while sema-based impl as M* waitm.
@@ -67,11 +135,6 @@
 	key uintptr
 }
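
The note contract (clear once, then at most one sleeper and one waker per cycle) maps closely onto a one-shot channel. The analogue below is only a sketch of the protocol from user code, not the futex- or semaphore-based implementation:

package main

import "fmt"

// note is a channel-based analogue of the runtime note, for illustration.
type note struct{ ch chan struct{} }

func (n *note) clear()  { n.ch = make(chan struct{}) } // must precede sleep/wakeup
func (n *note) wakeup() { close(n.ch) }                // at most once per clear
func (n *note) sleep()  { <-n.ch }                     // returns once wakeup has run

func main() {
	var n note
	n.clear()
	go n.wakeup()
	n.sleep()
	n.sleep() // future sleeps return immediately, as specified above
	fmt.Println("woken twice")
}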
 
-type _string struct {
-	str *byte
-	len int
-}
-
 type funcval struct {
 	fn uintptr
 	// variable-size, fn-specific data here
@@ -87,6 +150,10 @@
 	data  unsafe.Pointer
 }
 
+func efaceOf(ep *interface{}) *eface {
+	return (*eface)(unsafe.Pointer(ep))
+}
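
efaceOf reinterprets a *interface{} as the two-word (type, data) pair the runtime stores. A user-space sketch of the same reinterpretation, assuming that layout (an implementation detail of the gc toolchain):

package main

import (
	"fmt"
	"unsafe"
)

// eface mirrors the runtime's assumed two-word layout for interface{}.
type eface struct {
	_type unsafe.Pointer // *_type in the runtime
	data  unsafe.Pointer
}

func efaceOf(ep *interface{}) *eface {
	return (*eface)(unsafe.Pointer(ep))
}

func main() {
	var i interface{} = 42
	e := efaceOf(&i)
	fmt.Println(e._type != nil, *(*int)(e.data)) // true 42
}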
+
 // The guintptr, muintptr, and puintptr are all used to bypass write barriers.
 // It is particularly important to avoid write barriers when the current P has
 // been released, because the GC thinks the world is stopped, and an
@@ -127,20 +194,31 @@
 // alternate arena. Using guintptr doesn't make that problem any worse.
 type guintptr uintptr
 
-func (gp guintptr) ptr() *g   { return (*g)(unsafe.Pointer(gp)) }
+//go:nosplit
+func (gp guintptr) ptr() *g { return (*g)(unsafe.Pointer(gp)) }
+
+//go:nosplit
 func (gp *guintptr) set(g *g) { *gp = guintptr(unsafe.Pointer(g)) }
+
+//go:nosplit
 func (gp *guintptr) cas(old, new guintptr) bool {
-	return casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
+	return atomic.Casuintptr((*uintptr)(unsafe.Pointer(gp)), uintptr(old), uintptr(new))
 }
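
These accessors exist because a store through a typed *g pointer would emit a write barrier, which is unsafe at the points where guintptr is used. A standalone sketch of the pattern, with the usual caveat that outside the runtime the GC cannot see through the uintptr and may free or move the target:

package main

import (
	"fmt"
	"runtime"
	"unsafe"
)

type g struct{ id int }

type guintptr uintptr

func (gp guintptr) ptr() *g    { return (*g)(unsafe.Pointer(gp)) }
func (gp *guintptr) set(g_ *g) { *gp = guintptr(unsafe.Pointer(g_)) }

func main() {
	myg := &g{id: 7}
	var gp guintptr
	gp.set(myg) // a plain integer store: no write barrier is emitted
	fmt.Println(gp.ptr().id) // 7
	runtime.KeepAlive(myg)   // the GC cannot see the uintptr reference
}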
 
 type puintptr uintptr
 
-func (pp puintptr) ptr() *p   { return (*p)(unsafe.Pointer(pp)) }
+//go:nosplit
+func (pp puintptr) ptr() *p { return (*p)(unsafe.Pointer(pp)) }
+
+//go:nosplit
 func (pp *puintptr) set(p *p) { *pp = puintptr(unsafe.Pointer(p)) }
 
 type muintptr uintptr
 
-func (mp muintptr) ptr() *m   { return (*m)(unsafe.Pointer(mp)) }
+//go:nosplit
+func (mp muintptr) ptr() *m { return (*m)(unsafe.Pointer(mp)) }
+
+//go:nosplit
 func (mp *muintptr) set(m *m) { *mp = muintptr(unsafe.Pointer(m)) }
 
 type gobuf struct {
@@ -149,22 +227,39 @@
 	pc   uintptr
 	g    guintptr
 	ctxt unsafe.Pointer // this has to be a pointer so that gc scans it
-	ret  uintreg
+	ret  sys.Uintreg
 	lr   uintptr
 	bp   uintptr // for GOEXPERIMENT=framepointer
 }
 
-// Known to compiler.
-// Changes here must also be made in src/cmd/internal/gc/select.go's selecttype.
+// sudog represents a g in a wait list, such as for sending/receiving
+// on a channel.
+//
+// sudog is necessary because the g ↔ synchronization object relation
+// is many-to-many. A g can be on many wait lists, so there may be
+// many sudogs for one g; and many gs may be waiting on the same
+// synchronization object, so there may be many sudogs for one object.
+//
+// sudogs are allocated from a special pool. Use acquireSudog and
+// releaseSudog to allocate and free them.
 type sudog struct {
-	g           *g
-	selectdone  *uint32
-	next        *sudog
-	prev        *sudog
-	elem        unsafe.Pointer // data element
+	// The following fields are protected by the hchan.lock of the
+	// channel this sudog is blocking on. shrinkstack depends on
+	// this.
+
+	g          *g
+	selectdone *uint32 // CAS to 1 to win select race (may point to stack)
+	next       *sudog
+	prev       *sudog
+	elem       unsafe.Pointer // data element (may point to stack)
+
+	// The following fields are never accessed concurrently.
+	// waitlink is only accessed by g.
+
 	releasetime int64
-	nrelease    int32  // -1 for acquire
+	ticket      uint32
 	waitlink    *sudog // g.waiting list
+	c           *hchan // channel
 }
 
 type gcstats struct {
@@ -227,8 +322,9 @@
 	sched          gobuf
 	syscallsp      uintptr        // if status==Gsyscall, syscallsp = sched.sp to use during gc
 	syscallpc      uintptr        // if status==Gsyscall, syscallpc = sched.pc to use during gc
-	stkbar         []stkbar       // stack barriers, from low to high
+	stkbar         []stkbar       // stack barriers, from low to high (see top of mstkbar.go)
 	stkbarPos      uintptr        // index of lowest stack barrier not hit
+	stktopsp       uintptr        // expected sp at top of stack, to check in traceback
 	param          unsafe.Pointer // passed parameter on wakeup
 	atomicstatus   uint32
 	stackLock      uint32 // sigprof/scang lock; TODO: fold in to atomicstatus
@@ -236,16 +332,17 @@
 	waitsince      int64  // approx time when the g become blocked
 	waitreason     string // if status==Gwaiting
 	schedlink      guintptr
-	preempt        bool   // preemption signal, duplicates stackguard0 = stackpreempt
-	paniconfault   bool   // panic (instead of crash) on unexpected fault address
-	preemptscan    bool   // preempted g does scan for gc
-	gcscandone     bool   // g has scanned stack; protected by _Gscan bit in status
-	gcscanvalid    bool   // false at start of gc cycle, true if G has not run since last scan
-	throwsplit     bool   // must not split stack
-	raceignore     int8   // ignore race detection events
-	sysblocktraced bool   // StartTrace has emitted EvGoInSyscall about this goroutine
-	sysexitticks   int64  // cputicks when syscall has returned (for tracing)
-	sysexitseq     uint64 // trace seq when syscall has returned (for tracing)
+	preempt        bool     // preemption signal, duplicates stackguard0 = stackpreempt
+	paniconfault   bool     // panic (instead of crash) on unexpected fault address
+	preemptscan    bool     // preempted g does scan for gc
+	gcscandone     bool     // g has scanned stack; protected by _Gscan bit in status
+	gcscanvalid    bool     // false at start of gc cycle, true if G has not run since last scan; transition from true to false by calling queueRescan and false to true by calling dequeueRescan
+	throwsplit     bool     // must not split stack
+	raceignore     int8     // ignore race detection events
+	sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
+	sysexitticks   int64    // cputicks when syscall has returned (for tracing)
+	traceseq       uint64   // trace event sequencer
+	tracelastp     puintptr // last P that emitted an event for this goroutine
 	lockedm        *m
 	sig            uint32
 	writebuf       []byte
@@ -255,21 +352,26 @@
 	gopc           uintptr // pc of go statement that created this goroutine
 	startpc        uintptr // pc of goroutine function
 	racectx        uintptr
-	waiting        *sudog // sudog structures this g is waiting on (that have a valid elem ptr)
-	readyg         *g     // scratch for readyExecute
+	waiting        *sudog    // sudog structures this g is waiting on (that have a valid elem ptr); in lock order
+	cgoCtxt        []uintptr // cgo traceback context
 
-	// Per-G gcController state
-	gcalloc    uintptr // bytes allocated during this GC cycle
-	gcscanwork int64   // scan work done (or stolen) this GC cycle
-}
+	// Per-G GC state
 
-type mts struct {
-	tv_sec  int64
-	tv_nsec int64
-}
+	// gcRescan is this G's index in work.rescan.list. If this is
+	// -1, this G is not on the rescan list.
+	//
+	// If gcphase != _GCoff and this G is visible to the garbage
+	// collector, writes to this are protected by work.rescan.lock.
+	gcRescan int32
 
-type mscratch struct {
-	v [6]uintptr
+	// gcAssistBytes is this G's GC assist credit in terms of
+	// bytes allocated. If this is positive, then the G has credit
+	// to allocate gcAssistBytes bytes without assisting. If this
+	// is negative, then the G must correct this by performing
+	// scan work. We track this in bytes to make it fast to update
+	// and check for debt in the malloc hot path. The assist ratio
+	// determines how this corresponds to scan work debt.
+	gcAssistBytes int64
 }
 
 type m struct {
@@ -280,8 +382,8 @@
 	// Fields not known to debuggers.
 	procid        uint64     // for debuggers, but offset not hard-coded
 	gsignal       *g         // signal-handling g
-	sigmask       [4]uintptr // storage for saved signal mask
-	tls           [4]uintptr // thread-local storage (for x86 extern register)
+	sigmask       sigset     // storage for saved signal mask
+	tls           [6]uintptr // thread-local storage (for x86 extern register)
 	mstartfn      func()
 	curg          *g       // current running goroutine
 	caughtsig     guintptr // goroutine running during fatal signal
@@ -299,14 +401,16 @@
 	spinning      bool // m is out of work and is actively looking for work
 	blocked       bool // m is blocked on a note
 	inwb          bool // m is executing a write barrier
+	newSigstack   bool // minit on C thread called sigaltstack
 	printlock     int8
 	fastrand      uint32
-	ncgocall      uint64 // number of cgo calls in total
-	ncgo          int32  // number of cgo calls currently in progress
+	ncgocall      uint64      // number of cgo calls in total
+	ncgo          int32       // number of cgo calls currently in progress
+	cgoCallersUse uint32      // if non-zero, cgoCallers in use temporarily
+	cgoCallers    *cgoCallers // cgo traceback if crashing in cgo call
 	park          note
 	alllink       *m // on allm
 	schedlink     muintptr
-	machport      uint32 // return address for mach ipc (os x)
 	mcache        *mcache
 	lockedg       *g
 	createstack   [32]uintptr // stack that created this thread.
@@ -315,9 +419,6 @@
 	fflag         uint32      // floating point compare flags
 	locked        uint32      // tracking for lockosthread
 	nextwaitm     uintptr     // next m waiting for lock
-	waitsema      uintptr     // semaphore for parking on locks
-	waitsemacount uint32
-	waitsemalock  uint32
 	gcstats       gcstats
 	needextram    bool
 	traceback     uint8
@@ -327,8 +428,8 @@
 	waittraceskip int
 	startingtrace bool
 	syscalltick   uint32
-	//#ifdef GOOS_windows
-	thread uintptr // thread handle
+	thread        uintptr // thread handle
+
 	// these are here because they are too large to be on the stack
 	// of low-level NOSPLIT functions.
 	libcall   libcall
@@ -336,19 +437,8 @@
 	libcallsp uintptr
 	libcallg  guintptr
 	syscall   libcall // stores syscall parameters on windows
-	//#endif
-	//#ifdef GOOS_solaris
-	perrno *int32 // pointer to tls errno
-	// these are here because they are too large to be on the stack
-	// of low-level NOSPLIT functions.
-	//LibCall	libcall;
-	ts      mts
-	scratch mscratch
-	//#endif
-	//#ifdef GOOS_plan9
-	notesig *int8
-	errstr  *byte
-	//#endif
+
+	mOS
 }
 
 type p struct {
@@ -361,6 +451,7 @@
 	syscalltick uint32   // incremented on every system call
 	m           muintptr // back-link to associated m (nil if idle)
 	mcache      *mcache
+	racectx     uintptr
 
 	deferpool    [5][]*_defer // pool of available defer structs of different sizes (see panic.go)
 	deferpoolbuf [5][32]*_defer
@@ -372,7 +463,7 @@
 	// Queue of runnable goroutines. Accessed without lock.
 	runqhead uint32
 	runqtail uint32
-	runq     [256]*g
+	runq     [256]guintptr
 	// runnext, if non-nil, is a runnable G that was ready'd by
 	// the current G and should be run next instead of what's in
 	// runq if there's time remaining in the running G's time
@@ -391,13 +482,13 @@
 	sudogcache []*sudog
 	sudogbuf   [128]*sudog
 
-	tracebuf *traceBuf
+	tracebuf traceBufPtr
 
 	palloc persistentAlloc // per-P to avoid mutex
 
 	// Per-P GC state
 	gcAssistTime     int64 // Nanoseconds in assistAlloc
-	gcBgMarkWorker   *g
+	gcBgMarkWorker   guintptr
 	gcMarkWorkerMode gcMarkWorkerMode
 
 	// gcw is this P's GC work buffer cache. The work buffer is
@@ -417,9 +508,11 @@
 )
 
 type schedt struct {
-	lock mutex
+	// Accessed atomically. Keep at top to ensure alignment on 32-bit systems.
+	goidgen  uint64
+	lastpoll uint64
 
-	goidgen uint64
+	lock mutex
 
 	midle        muintptr // idle m's waiting for work
 	nmidle       int32    // number of idle m's waiting for work
@@ -427,9 +520,11 @@
 	mcount       int32    // number of m's that have been created
 	maxmcount    int32    // maximum number of m's allowed (or die)
 
+	ngsys uint32 // number of system goroutines; updated atomically
+
 	pidle      puintptr // idle p's
 	npidle     uint32
-	nmspinning uint32
+	nmspinning uint32 // See "Worker thread parking/unparking" comment in proc.go.
 
 	// Global runnable queue.
 	runqhead guintptr
@@ -437,9 +532,10 @@
 	runqsize int32
 
 	// Global cache of dead G's.
-	gflock mutex
-	gfree  *g
-	ngfree int32
+	gflock       mutex
+	gfreeStack   *g
+	gfreeNoStack *g
+	ngfree       int32
 
 	// Central cache of sudog structs.
 	sudoglock  mutex
@@ -454,7 +550,6 @@
 	stopnote   note
 	sysmonwait uint32
 	sysmonnote note
-	lastpoll   uint64
 
 	// safepointFn should be called on each P at the next GC
 	// safepoint if p.runSafePointFn is set.
@@ -468,10 +563,10 @@
 	totaltime      int64 // ∫gomaxprocs dt up to procresizetime
 }
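
The "keep at top" comment reflects a documented constraint: 64-bit atomic operations require 8-byte alignment, and on 32-bit platforms Go only guarantees that for the first word of an allocated struct, array, or slice. A minimal sketch of the idiom:

package main

import (
	"fmt"
	"sync/atomic"
	"unsafe"
)

type sched struct {
	goidgen  uint64 // accessed atomically; must stay first on 32-bit platforms
	lastpoll uint64

	lock int32 // stand-in for the runtime mutex
}

func main() {
	s := new(sched)
	fmt.Println(unsafe.Offsetof(s.goidgen)) // 0: 64-bit aligned everywhere
	fmt.Println(atomic.AddUint64(&s.goidgen, 1))
}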
 
-// The m->locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
+// The m.locked word holds two pieces of state counting active calls to LockOSThread/lockOSThread.
 // The low bit (LockExternal) is a boolean reporting whether any LockOSThread call is active.
 // External locks are not recursive; a second lock is silently ignored.
-// The upper bits of m->locked record the nesting depth of calls to lockOSThread
+// The upper bits of m.locked record the nesting depth of calls to lockOSThread
 // (counting up by LockInternal), popped by unlockOSThread (counting down by LockInternal).
 // Internal locks can be recursive. For instance, a lock for cgo can occur while the main
 // goroutine is holding the lock during the initialization phase.
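
A sketch of that encoding, assuming the conventional split of bit 0 for the external flag and the bits above it as the internal nesting counter:

package main

import "fmt"

const (
	lockExternal = 1 // LockOSThread: a boolean flag
	lockInternal = 2 // lockOSThread: one unit of nesting depth
)

func main() {
	var locked uint32
	locked |= lockExternal // a second external lock would be a silent no-op
	locked += lockInternal // internal locks nest...
	locked += lockInternal
	locked -= lockInternal // ...and unlockOSThread pops one level
	fmt.Println(locked&lockExternal != 0, locked/lockInternal) // true 1
}
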
@@ -492,7 +587,6 @@
 	_SigPanic                // if the signal is from the kernel, panic
 	_SigDefault              // if the signal isn't explicitly requested, don't monitor it
 	_SigHandling             // our signal handler is registered
-	_SigIgnored              // the signal was ignored before we registered for it
 	_SigGoExit               // cause all runtime procs to exit (only used on Plan 9).
 	_SigSetStack             // add SA_ONSTACK to libc handler
 	_SigUnblock              // unblocked in minit
@@ -506,8 +600,8 @@
 	entry   uintptr // start pc
 	nameoff int32   // function name
 
-	args  int32 // in/out args size
-	frame int32 // legacy frame size; use pcsp if possible
+	args int32 // in/out args size
+	_    int32 // previously legacy frame size; kept for layout compatibility
 
 	pcsp      int32
 	pcfile    int32
@@ -518,6 +612,8 @@
 
 // layout of Itab known to compilers
 // allocated in non-garbage-collected memory
+// Needs to be in sync with
+// ../cmd/compile/internal/gc/reflect.go:/^func.dumptypestructs.
 type itab struct {
 	inter  *interfacetype
 	_type  *_type
@@ -540,14 +636,7 @@
 	idle uint32
 }
 
-/*
- * known to compiler
- */
-const (
-	_Structrnd = regSize
-)
-
-// startup_random_data holds random bytes initialized at startup.  These come from
+// startup_random_data holds random bytes initialized at startup. These come from
 // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
 var startupRandomData []byte
 
@@ -564,7 +653,7 @@
 			w = 16
 		}
 		h := memhash(unsafe.Pointer(&r[n-w]), uintptr(nanotime()), uintptr(w))
-		for i := 0; i < ptrSize && n < len(r); i++ {
+		for i := 0; i < sys.PtrSize && n < len(r); i++ {
 			r[n] = byte(h)
 			n++
 			h >>= 8
@@ -572,9 +661,7 @@
 	}
 }
 
-/*
- * deferred subroutine calls
- */
+// deferred subroutine calls
 type _defer struct {
 	siz     int32
 	started bool
@@ -585,9 +672,7 @@
 	link    *_defer
 }
 
-/*
- * panics
- */
+// panics
 type _panic struct {
 	argp      unsafe.Pointer // pointer to arguments of deferred call run during panic; cannot move - known to liblink
 	arg       interface{}    // argument to panic
@@ -596,10 +681,7 @@
 	aborted   bool           // the panic was aborted
 }
 
-/*
- * stack traces
- */
-
+// stack traces
 type stkframe struct {
 	fn       *_func     // function being run
 	pc       uintptr    // program counter within fn
@@ -619,23 +701,17 @@
 	_TraceJumpStack                 // if traceback is on a systemstack, resume trace at g that called into it
 )
 
-const (
-	// The maximum number of frames we print for a traceback
-	_TracebackMaxFrames = 100
-)
+// The maximum number of frames we print for a traceback
+const _TracebackMaxFrames = 100
 
 var (
 	emptystring string
-	allg        **g
 	allglen     uintptr
-	lastg       *g
 	allm        *m
 	allp        [_MaxGomaxprocs + 1]*p
 	gomaxprocs  int32
 	panicking   uint32
-	goos        *int8
 	ncpu        int32
-	signote     note
 	forcegc     forcegcstate
 	sched       schedt
 	newprocs    int32
@@ -644,9 +720,13 @@
 	// Set on startup in asm_{x86,amd64}.s.
 	cpuid_ecx         uint32
 	cpuid_edx         uint32
+	cpuid_ebx7        uint32
 	lfenceBeforeRdtsc bool
+	support_avx       bool
+	support_avx2      bool
 
-	goarm uint8 // set by cmd/link on arm systems
+	goarm                uint8 // set by cmd/link on arm systems
+	framepointer_enabled bool  // set by cmd/link
 )
 
 // Set by the linker so the runtime can determine the buildmode.
@@ -654,46 +734,3 @@
 	islibrary bool // -buildmode=c-shared
 	isarchive bool // -buildmode=c-archive
 )
-
-/*
- * mutual exclusion locks.  in the uncontended case,
- * as fast as spin locks (just a few user-level instructions),
- * but on the contention path they sleep in the kernel.
- * a zeroed Mutex is unlocked (no need to initialize each lock).
- */
-
-/*
- * sleep and wakeup on one-time events.
- * before any calls to notesleep or notewakeup,
- * must call noteclear to initialize the Note.
- * then, exactly one thread can call notesleep
- * and exactly one thread can call notewakeup (once).
- * once notewakeup has been called, the notesleep
- * will return.  future notesleep will return immediately.
- * subsequent noteclear must be called only after
- * previous notesleep has returned, e.g. it's disallowed
- * to call noteclear straight after notewakeup.
- *
- * notetsleep is like notesleep but wakes up after
- * a given number of nanoseconds even if the event
- * has not yet happened.  if a goroutine uses notetsleep to
- * wake up early, it must wait to call noteclear until it
- * can be sure that no other goroutine is calling
- * notewakeup.
- *
- * notesleep/notetsleep are generally called on g0,
- * notetsleepg is similar to notetsleep but is called on user g.
- */
-// bool	runtime·notetsleep(Note*, int64);  // false - timeout
-// bool	runtime·notetsleepg(Note*, int64);  // false - timeout
-
-/*
- * Lock-free stack.
- * Initialize uint64 head to 0, compare with 0 to test for emptiness.
- * The stack does not keep pointers to nodes,
- * so they can be garbage collected if there are no other pointers to nodes.
- */
-
-// for mmap, we only pass the lower 32 bits of file offset to the
-// assembly routine; the higher bits (if required), should be provided
-// by the assembly routine as 0.
diff --git a/src/runtime/runtime_linux_test.go b/src/runtime/runtime_linux_test.go
index 5344ed2..2b6daec 100644
--- a/src/runtime/runtime_linux_test.go
+++ b/src/runtime/runtime_linux_test.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -8,6 +8,7 @@
 	. "runtime"
 	"syscall"
 	"testing"
+	"unsafe"
 )
 
 var pid, tid int
@@ -27,3 +28,15 @@
 		t.Fatalf("pid=%d but tid=%d", pid, tid)
 	}
 }
+
+// Test that error values are negative. Use address 1 (a misaligned
+// pointer) to get -EINVAL.
+func TestMincoreErrorSign(t *testing.T) {
+	var dst byte
+	v := Mincore(unsafe.Pointer(uintptr(1)), 1, &dst)
+
+	const EINVAL = 0x16
+	if v != -EINVAL {
+		t.Errorf("mincore = %v, want %v", v, -EINVAL)
+	}
+}
diff --git a/src/runtime/runtime_mmap_test.go b/src/runtime/runtime_mmap_test.go
new file mode 100644
index 0000000..cf240c1
--- /dev/null
+++ b/src/runtime/runtime_mmap_test.go
@@ -0,0 +1,30 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux nacl netbsd openbsd solaris
+
+package runtime_test
+
+import (
+	"runtime"
+	"runtime/internal/sys"
+	"testing"
+)
+
+// Test that the error value returned by mmap is positive, as that is
+// what the code in mem_bsd.go, mem_darwin.go, and mem_linux.go expects.
+// See the uses of ENOMEM in sysMap in those files.
+func TestMmapErrorSign(t *testing.T) {
+	p := runtime.Mmap(nil, ^uintptr(0)&^(sys.PhysPageSize-1), 0, runtime.MAP_ANON|runtime.MAP_PRIVATE, -1, 0)
+
+	// The runtime.mmap function is nosplit, but t.Errorf is not.
+	// Reset the pointer so that we don't get an "invalid stack
+	// pointer" error from t.Errorf if we call it.
+	v := uintptr(p)
+	p = nil
+
+	if v != runtime.ENOMEM {
+		t.Errorf("mmap = %v, want %v", v, runtime.ENOMEM)
+	}
+}
diff --git a/src/runtime/runtime_test.go b/src/runtime/runtime_test.go
index 75fc9bc..cd078c7 100644
--- a/src/runtime/runtime_test.go
+++ b/src/runtime/runtime_test.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -12,6 +12,13 @@
 	"unsafe"
 )
 
+func init() {
+	// We're testing the runtime, so make tracebacks show things
+	// in the runtime. This only raises the level, so it won't
+	// override GOTRACEBACK=crash from the user.
+	SetTracebackEnv("system")
+}
+
 var errf error
 
 func errfn() error {
@@ -97,7 +104,7 @@
 // of the larger addresses must themselves be invalid addresses.
 // We might get unlucky and the OS might have mapped one of these
 // addresses, but probably not: they're all in the first page, very high
-// adderesses that normally an OS would reserve for itself, or malformed
+// addresses that normally an OS would reserve for itself, or malformed
 // addresses. Even so, we might have to remove one or two on different
 // systems. We will see.
 
@@ -240,8 +247,8 @@
 	if GOOS == "windows" || GOOS == "nacl" {
 		t.Skip("skipping OS that doesn't have open/read/write/close")
 	}
-	// make sure we get the correct error code if open fails.  Same for
-	// read/write/close on the resulting -1 fd.  See issue 10052.
+	// make sure we get the correct error code if open fails. Same for
+	// read/write/close on the resulting -1 fd. See issue 10052.
 	nonfile := []byte("/notreallyafile")
 	fd := Open(&nonfile[0], 0, 0)
 	if fd != -1 {
@@ -301,3 +308,24 @@
 		}
 	}
 }
+
+func TestGoroutineProfileTrivial(t *testing.T) {
+	// Calling GoroutineProfile twice in a row should find the same number of goroutines,
+	// but it's possible there are goroutines just about to exit, so we might end up
+	// with fewer in the second call. Try a few times; it should converge once those
+	// zombies are gone.
+	for i := 0; ; i++ {
+		n1, ok := GoroutineProfile(nil) // should fail, there's at least 1 goroutine
+		if n1 < 1 || ok {
+			t.Fatalf("GoroutineProfile(nil) = %d, %v, want >0, false", n1, ok)
+		}
+		n2, ok := GoroutineProfile(make([]StackRecord, n1))
+		if n2 == n1 && ok {
+			break
+		}
+		t.Logf("GoroutineProfile(%d) = %d, %v, want %d, true", n1, n2, ok, n1)
+		if i >= 10 {
+			t.Fatalf("GoroutineProfile not converging")
+		}
+	}
+}
diff --git a/src/runtime/runtime_unix_test.go b/src/runtime/runtime_unix_test.go
index cfec332..e912163 100644
--- a/src/runtime/runtime_unix_test.go
+++ b/src/runtime/runtime_unix_test.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/select.go b/src/runtime/select.go
index b18b44c..433048f 100644
--- a/src/runtime/select.go
+++ b/src/runtime/select.go
@@ -6,7 +6,10 @@
 
 // This file contains the implementation of Go select statements.
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 const (
 	debugSelect = false
@@ -24,7 +27,7 @@
 	tcase     uint16   // total count of scase[]
 	ncase     uint16   // currently filled scase[]
 	pollorder *uint16  // case poll order
-	lockorder **hchan  // channel lock order
+	lockorder *uint16  // channel lock order
 	scase     [1]scase // one per case (in order of appearance)
 }
 
@@ -51,7 +54,7 @@
 		(size-1)*unsafe.Sizeof(hselect{}.scase[0]) +
 		size*unsafe.Sizeof(*hselect{}.lockorder) +
 		size*unsafe.Sizeof(*hselect{}.pollorder)
-	return round(selsize, _Int64Align)
+	return round(selsize, sys.Int64Align)
 }
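
selectsize flattens the whole select descriptor into one allocation: the hselect header (which already embeds the first scase), size-1 further scases, and the two uint16 order arrays, rounded up so the 64-bit fields stay aligned. A sketch of the arithmetic with made-up element sizes (the real ones come from unsafe.Sizeof):

package main

import "fmt"

// round rounds n up to a multiple of a, which must be a power of two.
func round(n, a uintptr) uintptr { return (n + a - 1) &^ (a - 1) }

// selectsize mirrors the layout above with illustrative sizes.
func selectsize(header, scase, order, n uintptr) uintptr {
	sel := header + (n-1)*scase + n*order + n*order // scases, lockorder, pollorder
	return round(sel, 8)                            // sys.Int64Align on most targets
}

func main() {
	fmt.Println(selectsize(80, 64, 2, 3)) // 224
}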
 
 func newselect(sel *hselect, selsize int64, size int32) {
@@ -61,7 +64,7 @@
 	}
 	sel.tcase = uint16(size)
 	sel.ncase = 0
-	sel.lockorder = (**hchan)(add(unsafe.Pointer(&sel.scase), uintptr(size)*unsafe.Sizeof(hselect{}.scase[0])))
+	sel.lockorder = (*uint16)(add(unsafe.Pointer(&sel.scase), uintptr(size)*unsafe.Sizeof(hselect{}.scase[0])))
 	sel.pollorder = (*uint16)(add(unsafe.Pointer(sel.lockorder), uintptr(size)*unsafe.Sizeof(*hselect{}.lockorder)))
 
 	if debugSelect {
@@ -158,11 +161,10 @@
 	}
 }
 
-func sellock(sel *hselect) {
-	lockslice := slice{unsafe.Pointer(sel.lockorder), int(sel.ncase), int(sel.ncase)}
-	lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice))
+func sellock(scases []scase, lockorder []uint16) {
 	var c *hchan
-	for _, c0 := range lockorder {
+	for _, o := range lockorder {
+		c0 := scases[o].c
 		if c0 != nil && c0 != c {
 			c = c0
 			lock(&c.lock)
@@ -170,7 +172,7 @@
 	}
 }
 
-func selunlock(sel *hselect) {
+func selunlock(scases []scase, lockorder []uint16) {
 	// We must be very careful here to not touch sel after we have unlocked
 	// the last lock, because sel can be freed right after the last unlock.
 	// Consider the following situation.
@@ -179,25 +181,42 @@
 	// the G that calls select runnable again and schedules it for execution.
 	// When the G runs on another M, it locks all the locks and frees sel.
 	// Now if the first M touches sel, it will access freed memory.
-	n := int(sel.ncase)
+	n := len(scases)
 	r := 0
-	lockslice := slice{unsafe.Pointer(sel.lockorder), n, n}
-	lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice))
 	// skip the default case
-	if n > 0 && lockorder[0] == nil {
+	if n > 0 && scases[lockorder[0]].c == nil {
 		r = 1
 	}
 	for i := n - 1; i >= r; i-- {
-		c := lockorder[i]
-		if i > 0 && c == lockorder[i-1] {
+		c := scases[lockorder[i]].c
+		if i > 0 && c == scases[lockorder[i-1]].c {
 			continue // will unlock it on the next iteration
 		}
 		unlock(&c.lock)
 	}
 }
 
-func selparkcommit(gp *g, sel unsafe.Pointer) bool {
-	selunlock((*hselect)(sel))
+func selparkcommit(gp *g, _ unsafe.Pointer) bool {
+	// This must not access gp's stack (see gopark). In
+	// particular, it must not access the *hselect. That's okay,
+	// because by the time this is called, gp.waiting has all
+	// channels in lock order.
+	var lastc *hchan
+	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
+		if sg.c != lastc && lastc != nil {
+			// As soon as we unlock the channel, fields in
+			// any sudog with that channel may change,
+			// including c and waitlink. Since multiple
+			// sudogs may have the same channel, we unlock
+			// only after we've passed the last instance
+			// of a channel.
+			unlock(&lastc.lock)
+		}
+		lastc = sg.c
+	}
+	if lastc != nil {
+		unlock(&lastc.lock)
+	}
 	return true
 }
 
@@ -205,8 +224,15 @@
 	gopark(nil, nil, "select (no cases)", traceEvGoStop, 1) // forever
 }
 
-// overwrites return pc on stack to signal which case of the select
-// to run, so cannot appear at the top of a split stack.
+// selectgo implements the select statement.
+//
+// *sel is on the current goroutine's stack (regardless of any
+// escaping in selectgo).
+//
+// selectgo does not return. Instead, it overwrites its return PC and
+// returns directly to the triggered select case. Because of this, it
+// cannot appear at the top of a split stack.
+//
 //go:nosplit
 func selectgo(sel *hselect) {
 	pc, offset := selectgoImpl(sel)
@@ -236,7 +262,7 @@
 	// only 0 or 1 cases plus default into simpler constructs.
 	// The only way we can end up with such small sel.ncase
 	// values here is for a larger select in which most channels
-	// have been nilled out.  The general code handles those
+	// have been nilled out. The general code handles those
 	// cases correctly, and they are rare enough not to bother
 	// optimizing (and needing to test).
 
@@ -252,19 +278,21 @@
 	// sort the cases by Hchan address to get the locking order.
 	// simple heap sort, to guarantee n log n time and constant stack footprint.
 	lockslice := slice{unsafe.Pointer(sel.lockorder), int(sel.ncase), int(sel.ncase)}
-	lockorder := *(*[]*hchan)(unsafe.Pointer(&lockslice))
+	lockorder := *(*[]uint16)(unsafe.Pointer(&lockslice))
 	for i := 0; i < int(sel.ncase); i++ {
 		j := i
-		c := scases[j].c
-		for j > 0 && lockorder[(j-1)/2].sortkey() < c.sortkey() {
+		// Start with the pollorder to permute cases on the same channel.
+		c := scases[pollorder[i]].c
+		for j > 0 && scases[lockorder[(j-1)/2]].c.sortkey() < c.sortkey() {
 			k := (j - 1) / 2
 			lockorder[j] = lockorder[k]
 			j = k
 		}
-		lockorder[j] = c
+		lockorder[j] = pollorder[i]
 	}
 	for i := int(sel.ncase) - 1; i >= 0; i-- {
-		c := lockorder[i]
+		o := lockorder[i]
+		c := scases[o].c
 		lockorder[i] = lockorder[0]
 		j := 0
 		for {
@@ -272,21 +300,21 @@
 			if k >= i {
 				break
 			}
-			if k+1 < i && lockorder[k].sortkey() < lockorder[k+1].sortkey() {
+			if k+1 < i && scases[lockorder[k]].c.sortkey() < scases[lockorder[k+1]].c.sortkey() {
 				k++
 			}
-			if c.sortkey() < lockorder[k].sortkey() {
+			if c.sortkey() < scases[lockorder[k]].c.sortkey() {
 				lockorder[j] = lockorder[k]
 				j = k
 				continue
 			}
 			break
 		}
-		lockorder[j] = c
+		lockorder[j] = o
 	}
 	/*
 		for i := 0; i+1 < int(sel.ncase); i++ {
-			if lockorder[i].sortkey() > lockorder[i+1].sortkey() {
+			if scases[lockorder[i]].c.sortkey() > scases[lockorder[i+1]].c.sortkey() {
 				print("i=", i, " x=", lockorder[i], " y=", lockorder[i+1], "\n")
 				throw("select: broken sort")
 			}
@@ -294,7 +322,7 @@
 	*/
 
 	// lock all the channels involved in the select
-	sellock(sel)
+	sellock(scases, lockorder)
 
 	var (
 		gp     *g
@@ -304,7 +332,8 @@
 		k      *scase
 		sglist *sudog
 		sgnext *sudog
-		futile byte
+		qp     unsafe.Pointer
+		nextp  **sudog
 	)
 
 loop:
@@ -317,15 +346,12 @@
 
 		switch cas.kind {
 		case caseRecv:
-			if c.dataqsiz > 0 {
-				if c.qcount > 0 {
-					goto asyncrecv
-				}
-			} else {
-				sg = c.sendq.dequeue()
-				if sg != nil {
-					goto syncrecv
-				}
+			sg = c.sendq.dequeue()
+			if sg != nil {
+				goto recv
+			}
+			if c.qcount > 0 {
+				goto bufrecv
 			}
 			if c.closed != 0 {
 				goto rclose
@@ -338,15 +364,12 @@
 			if c.closed != 0 {
 				goto sclose
 			}
-			if c.dataqsiz > 0 {
-				if c.qcount < c.dataqsiz {
-					goto asyncsend
-				}
-			} else {
-				sg = c.recvq.dequeue()
-				if sg != nil {
-					goto syncsend
-				}
+			sg = c.recvq.dequeue()
+			if sg != nil {
+				goto send
+			}
+			if c.qcount < c.dataqsiz {
+				goto bufsend
 			}
 
 		case caseDefault:
@@ -355,7 +378,7 @@
 	}
 
 	if dfl != nil {
-		selunlock(sel)
+		selunlock(scases, lockorder)
 		cas = dfl
 		goto retc
 	}
@@ -363,20 +386,28 @@
 	// pass 2 - enqueue on all chans
 	gp = getg()
 	done = 0
-	for i := 0; i < int(sel.ncase); i++ {
-		cas = &scases[pollorder[i]]
+	if gp.waiting != nil {
+		throw("gp.waiting != nil")
+	}
+	nextp = &gp.waiting
+	for _, casei := range lockorder {
+		cas = &scases[casei]
 		c = cas.c
 		sg := acquireSudog()
 		sg.g = gp
 		// Note: selectdone is adjusted for stack copies in stack1.go:adjustsudogs
 		sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done)))
+		// No stack splits between assigning elem and enqueuing
+		// sg on gp.waiting where copystack can find it.
 		sg.elem = cas.elem
 		sg.releasetime = 0
 		if t0 != 0 {
 			sg.releasetime = -1
 		}
-		sg.waitlink = gp.waiting
-		gp.waiting = sg
+		sg.c = c
+		// Construct waiting list in lock order.
+		*nextp = sg
+		nextp = &sg.waitlink
 
 		switch cas.kind {
 		case caseRecv:
@@ -389,28 +420,29 @@
 
 	// wait for someone to wake us up
 	gp.param = nil
-	gopark(selparkcommit, unsafe.Pointer(sel), "select", traceEvGoBlockSelect|futile, 2)
+	gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 2)
 
 	// someone woke us up
-	sellock(sel)
+	sellock(scases, lockorder)
 	sg = (*sudog)(gp.param)
 	gp.param = nil
 
 	// pass 3 - dequeue from unsuccessful chans
 	// otherwise they stack up on quiet channels
 	// record the successful case, if any.
-	// We singly-linked up the SudoGs in case order, so when
-	// iterating through the linked list they are in reverse order.
+	// We singly-linked up the SudoGs in lock order.
 	cas = nil
 	sglist = gp.waiting
 	// Clear all elem before unlinking from gp.waiting.
 	for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink {
 		sg1.selectdone = nil
 		sg1.elem = nil
+		sg1.c = nil
 	}
 	gp.waiting = nil
-	for i := int(sel.ncase) - 1; i >= 0; i-- {
-		k = &scases[pollorder[i]]
+
+	for _, casei := range lockorder {
+		k = &scases[casei]
 		if sglist.releasetime > 0 {
 			k.releasetime = sglist.releasetime
 		}
@@ -432,16 +464,13 @@
 	}
 
 	if cas == nil {
-		futile = traceFutileWakeup
+		// This can happen if we were woken up by a close().
+		// TODO: figure that out explicitly so we don't need this loop.
 		goto loop
 	}
 
 	c = cas.c
 
-	if c.dataqsiz > 0 {
-		throw("selectgo: shouldn't happen")
-	}
-
 	if debugSelect {
 		print("wait-return: sel=", sel, " c=", c, " cas=", cas, " kind=", cas.kind, "\n")
 	}
@@ -459,11 +488,18 @@
 			raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
 		}
 	}
+	if msanenabled {
+		if cas.kind == caseRecv && cas.elem != nil {
+			msanwrite(cas.elem, c.elemtype.size)
+		} else if cas.kind == caseSend {
+			msanread(cas.elem, c.elemtype.size)
+		}
+	}
 
-	selunlock(sel)
+	selunlock(scases, lockorder)
 	goto retc
 
-asyncrecv:
+bufrecv:
 	// can receive from buffer
 	if raceenabled {
 		if cas.elem != nil {
@@ -472,87 +508,58 @@
 		raceacquire(chanbuf(c, c.recvx))
 		racerelease(chanbuf(c, c.recvx))
 	}
+	if msanenabled && cas.elem != nil {
+		msanwrite(cas.elem, c.elemtype.size)
+	}
 	if cas.receivedp != nil {
 		*cas.receivedp = true
 	}
+	qp = chanbuf(c, c.recvx)
 	if cas.elem != nil {
-		typedmemmove(c.elemtype, cas.elem, chanbuf(c, c.recvx))
+		typedmemmove(c.elemtype, cas.elem, qp)
 	}
-	memclr(chanbuf(c, c.recvx), uintptr(c.elemsize))
+	memclr(qp, uintptr(c.elemsize))
 	c.recvx++
 	if c.recvx == c.dataqsiz {
 		c.recvx = 0
 	}
 	c.qcount--
-	sg = c.sendq.dequeue()
-	if sg != nil {
-		gp = sg.g
-		selunlock(sel)
-		if sg.releasetime != 0 {
-			sg.releasetime = cputicks()
-		}
-		goready(gp, 3)
-	} else {
-		selunlock(sel)
-	}
+	selunlock(scases, lockorder)
 	goto retc
 
-asyncsend:
+bufsend:
 	// can send to buffer
 	if raceenabled {
 		raceacquire(chanbuf(c, c.sendx))
 		racerelease(chanbuf(c, c.sendx))
 		raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
 	}
+	if msanenabled {
+		msanread(cas.elem, c.elemtype.size)
+	}
 	typedmemmove(c.elemtype, chanbuf(c, c.sendx), cas.elem)
 	c.sendx++
 	if c.sendx == c.dataqsiz {
 		c.sendx = 0
 	}
 	c.qcount++
-	sg = c.recvq.dequeue()
-	if sg != nil {
-		gp = sg.g
-		selunlock(sel)
-		if sg.releasetime != 0 {
-			sg.releasetime = cputicks()
-		}
-		goready(gp, 3)
-	} else {
-		selunlock(sel)
-	}
+	selunlock(scases, lockorder)
 	goto retc
 
-syncrecv:
+recv:
 	// can receive from sleeping sender (sg)
-	if raceenabled {
-		if cas.elem != nil {
-			raceWriteObjectPC(c.elemtype, cas.elem, cas.pc, chanrecvpc)
-		}
-		racesync(c, sg)
-	}
-	selunlock(sel)
+	recv(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
 	if debugSelect {
 		print("syncrecv: sel=", sel, " c=", c, "\n")
 	}
 	if cas.receivedp != nil {
 		*cas.receivedp = true
 	}
-	if cas.elem != nil {
-		typedmemmove(c.elemtype, cas.elem, sg.elem)
-	}
-	sg.elem = nil
-	gp = sg.g
-	gp.param = unsafe.Pointer(sg)
-	if sg.releasetime != 0 {
-		sg.releasetime = cputicks()
-	}
-	goready(gp, 3)
 	goto retc
 
 rclose:
 	// read at end of closed channel
-	selunlock(sel)
+	selunlock(scases, lockorder)
 	if cas.receivedp != nil {
 		*cas.receivedp = false
 	}
@@ -564,26 +571,19 @@
 	}
 	goto retc
 
-syncsend:
-	// can send to sleeping receiver (sg)
+send:
+	// can send to a sleeping receiver (sg)
 	if raceenabled {
 		raceReadObjectPC(c.elemtype, cas.elem, cas.pc, chansendpc)
-		racesync(c, sg)
 	}
-	selunlock(sel)
+	if msanenabled {
+		msanread(cas.elem, c.elemtype.size)
+	}
+	send(c, sg, cas.elem, func() { selunlock(scases, lockorder) })
 	if debugSelect {
 		print("syncsend: sel=", sel, " c=", c, "\n")
 	}
-	if sg.elem != nil {
-		syncsend(c, sg, cas.elem)
-	}
-	sg.elem = nil
-	gp = sg.g
-	gp.param = unsafe.Pointer(sg)
-	if sg.releasetime != 0 {
-		sg.releasetime = cputicks()
-	}
-	goready(gp, 3)
+	goto retc
 
 retc:
 	if cas.releasetime > 0 {
@@ -593,8 +593,8 @@
 
 sclose:
 	// send on closed channel
-	selunlock(sel)
-	panic("send on closed channel")
+	selunlock(scases, lockorder)
+	panic(plainError("send on closed channel"))
 }
 
 func (c *hchan) sortkey() uintptr {
@@ -626,7 +626,7 @@
 func reflect_rselect(cases []runtimeSelect) (chosen int, recvOK bool) {
 	// flagNoScan is safe here, because all objects are also referenced from cases.
 	size := selectsize(uintptr(len(cases)))
-	sel := (*hselect)(mallocgc(size, nil, flagNoScan))
+	sel := (*hselect)(mallocgc(size, nil, true))
 	newselect(sel, int64(size), int32(len(cases)))
 	r := new(bool)
 	for i := range cases {
@@ -679,8 +679,8 @@
 		return
 	}
 
-	// x==y==nil.  Either sgp is the only element in the queue,
-	// or it has already been removed.  Use q.first to disambiguate.
+	// x==y==nil. Either sgp is the only element in the queue,
+	// or it has already been removed. Use q.first to disambiguate.
 	if q.first == sgp {
 		q.first = nil
 		q.last = nil
diff --git a/src/runtime/sema.go b/src/runtime/sema.go
index 8ae51b4..45fbbca 100644
--- a/src/runtime/sema.go
+++ b/src/runtime/sema.go
@@ -19,7 +19,11 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Asynchronous semaphore for sync.Mutex.
 
@@ -35,7 +39,7 @@
 
 var semtable [semTabSize]struct {
 	root semaRoot
-	pad  [_CacheLineSize - unsafe.Sizeof(semaRoot{})]byte
+	pad  [sys.CacheLineSize - unsafe.Sizeof(semaRoot{})]byte
 }
 
 //go:linkname sync_runtime_Semacquire sync.runtime_Semacquire
@@ -58,6 +62,13 @@
 	semrelease(addr)
 }
 
+func readyWithTime(s *sudog, traceskip int) {
+	if s.releasetime != 0 {
+		s.releasetime = cputicks()
+	}
+	goready(s.g, traceskip)
+}
+
 // Called from runtime.
 func semacquire(addr *uint32, profile bool) {
 	gp := getg()
@@ -87,10 +98,10 @@
 	for {
 		lock(&root.lock)
 		// Add ourselves to nwait to disable "easy case" in semrelease.
-		xadd(&root.nwait, 1)
+		atomic.Xadd(&root.nwait, 1)
 		// Check cansemacquire to avoid missed wakeup.
 		if cansemacquire(addr) {
-			xadd(&root.nwait, -1)
+			atomic.Xadd(&root.nwait, -1)
 			unlock(&root.lock)
 			break
 		}
@@ -103,25 +114,25 @@
 		}
 	}
 	if s.releasetime > 0 {
-		blockevent(int64(s.releasetime)-t0, 3)
+		blockevent(s.releasetime-t0, 3)
 	}
 	releaseSudog(s)
 }
 
 func semrelease(addr *uint32) {
 	root := semroot(addr)
-	xadd(addr, 1)
+	atomic.Xadd(addr, 1)
 
 	// Easy case: no waiters?
 	// This check must happen after the xadd, to avoid a missed wakeup
 	// (see loop in semacquire).
-	if atomicload(&root.nwait) == 0 {
+	if atomic.Load(&root.nwait) == 0 {
 		return
 	}
 
 	// Harder case: search for a waiter and wake it.
 	lock(&root.lock)
-	if atomicload(&root.nwait) == 0 {
+	if atomic.Load(&root.nwait) == 0 {
 		// The count is already consumed by another goroutine,
 		// so no need to wake up another goroutine.
 		unlock(&root.lock)
@@ -130,17 +141,14 @@
 	s := root.head
 	for ; s != nil; s = s.next {
 		if s.elem == unsafe.Pointer(addr) {
-			xadd(&root.nwait, -1)
+			atomic.Xadd(&root.nwait, -1)
 			root.dequeue(s)
 			break
 		}
 	}
 	unlock(&root.lock)
 	if s != nil {
-		if s.releasetime != 0 {
-			s.releasetime = cputicks()
-		}
-		goready(s.g, 4)
+		readyWithTime(s, 5)
 	}
 }
 
@@ -150,11 +158,11 @@
 
 func cansemacquire(addr *uint32) bool {
 	for {
-		v := atomicload(addr)
+		v := atomic.Load(addr)
 		if v == 0 {
 			return false
 		}
-		if cas(addr, v, v-1) {
+		if atomic.Cas(addr, v, v-1) {
 			return true
 		}
 	}
@@ -189,101 +197,158 @@
 	s.prev = nil
 }
 
-// Synchronous semaphore for sync.Cond.
-type syncSema struct {
+// notifyList is a ticket-based notification list used to implement sync.Cond.
+//
+// It must be kept in sync with the sync package.
+type notifyList struct {
+	// wait is the ticket number of the next waiter. It is atomically
+	// incremented outside the lock.
+	wait uint32
+
+	// notify is the ticket number of the next waiter to be notified. It can
+	// be read outside the lock, but is only written to with lock held.
+	//
+	// Both wait & notify can wrap around, and such cases will be correctly
+	// handled as long as their "unwrapped" difference is bounded by 2^31.
+	// For this not to be the case, we'd need to have 2^31+ goroutines
+	// blocked on the same condvar, which is currently not possible.
+	notify uint32
+
+	// List of parked waiters.
 	lock mutex
 	head *sudog
 	tail *sudog
 }
 
-// syncsemacquire waits for a pairing syncsemrelease on the same semaphore s.
-//go:linkname syncsemacquire sync.runtime_Syncsemacquire
-func syncsemacquire(s *syncSema) {
-	lock(&s.lock)
-	if s.head != nil && s.head.nrelease > 0 {
-		// Have pending release, consume it.
-		var wake *sudog
-		s.head.nrelease--
-		if s.head.nrelease == 0 {
-			wake = s.head
-			s.head = wake.next
-			if s.head == nil {
-				s.tail = nil
+// less reports whether a < b, treating a and b as running counts that may wrap
+// around the 32-bit range, provided their "unwrapped" difference is always less than 2^31.
+func less(a, b uint32) bool {
+	return int32(a-b) < 0
+}
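
The signed reinterpretation makes the comparison immune to wraparound as long as the two counters stay within 2^31 of each other, which the comment above argues is guaranteed. A quick check at the boundary:

package main

import "fmt"

func less(a, b uint32) bool {
	return int32(a-b) < 0
}

func main() {
	fmt.Println(less(5, 6), less(6, 5)) // true false
	fmt.Println(less(0xFFFFFFFF, 1))    // true: 1 is "ahead of" the wrapped counter
	fmt.Println(less(1, 0xFFFFFFFF))    // false
}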
+
+// notifyListAdd adds the caller to a notify list such that it can receive
+// notifications. The caller must eventually call notifyListWait to wait for
+// such a notification, passing the returned ticket number.
+//go:linkname notifyListAdd sync.runtime_notifyListAdd
+func notifyListAdd(l *notifyList) uint32 {
+	// This may be called concurrently, for example, when called from
+	// sync.Cond.Wait while holding an RWMutex in read mode.
+	return atomic.Xadd(&l.wait, 1) - 1
+}
+
+// notifyListWait waits for a notification. If one has been sent since
+// notifyListAdd was called, it returns immediately. Otherwise, it blocks.
+//go:linkname notifyListWait sync.runtime_notifyListWait
+func notifyListWait(l *notifyList, t uint32) {
+	lock(&l.lock)
+
+	// Return right away if this ticket has already been notified.
+	if less(t, l.notify) {
+		unlock(&l.lock)
+		return
+	}
+
+	// Enqueue itself.
+	s := acquireSudog()
+	s.g = getg()
+	s.ticket = t
+	s.releasetime = 0
+	t0 := int64(0)
+	if blockprofilerate > 0 {
+		t0 = cputicks()
+		s.releasetime = -1
+	}
+	if l.tail == nil {
+		l.head = s
+	} else {
+		l.tail.next = s
+	}
+	l.tail = s
+	goparkunlock(&l.lock, "semacquire", traceEvGoBlockCond, 3)
+	if t0 != 0 {
+		blockevent(s.releasetime-t0, 2)
+	}
+	releaseSudog(s)
+}
+
+// notifyListNotifyAll notifies all entries in the list.
+//go:linkname notifyListNotifyAll sync.runtime_notifyListNotifyAll
+func notifyListNotifyAll(l *notifyList) {
+	// Fast-path: if there are no new waiters since the last notification
+	// we don't need to acquire the lock.
+	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
+		return
+	}
+
+	// Pull the list out into a local variable, waiters will be readied
+	// outside the lock.
+	lock(&l.lock)
+	s := l.head
+	l.head = nil
+	l.tail = nil
+
+	// Update the next ticket to be notified. We can set it to the current
+	// value of wait because any previous waiters are already in the list
+	// or will notice that they have already been notified when trying to
+	// add themselves to the list.
+	atomic.Store(&l.notify, atomic.Load(&l.wait))
+	unlock(&l.lock)
+
+	// Go through the local list and ready all waiters.
+	for s != nil {
+		next := s.next
+		s.next = nil
+		readyWithTime(s, 4)
+		s = next
+	}
+}
+
+// notifyListNotifyOne notifies one entry in the list.
+//go:linkname notifyListNotifyOne sync.runtime_notifyListNotifyOne
+func notifyListNotifyOne(l *notifyList) {
+	// Fast-path: if there are no new waiters since the last notification
+	// we don't need to acquire the lock at all.
+	if atomic.Load(&l.wait) == atomic.Load(&l.notify) {
+		return
+	}
+
+	lock(&l.lock)
+
+	// Re-check under the lock if we need to do anything.
+	t := l.notify
+	if t == atomic.Load(&l.wait) {
+		unlock(&l.lock)
+		return
+	}
+
+	// Update the next notify ticket number, and try to find the G that
+	// needs to be notified. If it hasn't made it to the list yet we won't
+	// find it, but it won't park itself once it sees the new notify number.
+	atomic.Store(&l.notify, t+1)
+	for p, s := (*sudog)(nil), l.head; s != nil; p, s = s, s.next {
+		if s.ticket == t {
+			n := s.next
+			if p != nil {
+				p.next = n
+			} else {
+				l.head = n
 			}
+			if n == nil {
+				l.tail = p
+			}
+			unlock(&l.lock)
+			s.next = nil
+			readyWithTime(s, 4)
+			return
 		}
-		unlock(&s.lock)
-		if wake != nil {
-			wake.next = nil
-			goready(wake.g, 4)
-		}
-	} else {
-		// Enqueue itself.
-		w := acquireSudog()
-		w.g = getg()
-		w.nrelease = -1
-		w.next = nil
-		w.releasetime = 0
-		t0 := int64(0)
-		if blockprofilerate > 0 {
-			t0 = cputicks()
-			w.releasetime = -1
-		}
-		if s.tail == nil {
-			s.head = w
-		} else {
-			s.tail.next = w
-		}
-		s.tail = w
-		goparkunlock(&s.lock, "semacquire", traceEvGoBlockCond, 3)
-		if t0 != 0 {
-			blockevent(int64(w.releasetime)-t0, 2)
-		}
-		releaseSudog(w)
 	}
+	unlock(&l.lock)
 }
 
-// syncsemrelease waits for n pairing syncsemacquire on the same semaphore s.
-//go:linkname syncsemrelease sync.runtime_Syncsemrelease
-func syncsemrelease(s *syncSema, n uint32) {
-	lock(&s.lock)
-	for n > 0 && s.head != nil && s.head.nrelease < 0 {
-		// Have pending acquire, satisfy it.
-		wake := s.head
-		s.head = wake.next
-		if s.head == nil {
-			s.tail = nil
-		}
-		if wake.releasetime != 0 {
-			wake.releasetime = cputicks()
-		}
-		wake.next = nil
-		goready(wake.g, 4)
-		n--
-	}
-	if n > 0 {
-		// enqueue itself
-		w := acquireSudog()
-		w.g = getg()
-		w.nrelease = int32(n)
-		w.next = nil
-		w.releasetime = 0
-		if s.tail == nil {
-			s.head = w
-		} else {
-			s.tail.next = w
-		}
-		s.tail = w
-		goparkunlock(&s.lock, "semarelease", traceEvGoBlockCond, 3)
-		releaseSudog(w)
-	} else {
-		unlock(&s.lock)
-	}
-}
-
-//go:linkname syncsemcheck sync.runtime_Syncsemcheck
-func syncsemcheck(sz uintptr) {
-	if sz != unsafe.Sizeof(syncSema{}) {
-		print("runtime: bad syncSema size - sync=", sz, " runtime=", unsafe.Sizeof(syncSema{}), "\n")
-		throw("bad syncSema size")
+//go:linkname notifyListCheck sync.runtime_notifyListCheck
+func notifyListCheck(sz uintptr) {
+	if sz != unsafe.Sizeof(notifyList{}) {
+		print("runtime: bad notifyList size - sync=", sz, " runtime=", unsafe.Sizeof(notifyList{}), "\n")
+		throw("bad notifyList size")
 	}
 }
diff --git a/src/runtime/signal1_unix.go b/src/runtime/signal1_unix.go
index 56d9755..5080202 100644
--- a/src/runtime/signal1_unix.go
+++ b/src/runtime/signal1_unix.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,6 +6,11 @@
 
 package runtime
 
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
 const (
 	_SIG_DFL uintptr = 0
 	_SIG_IGN uintptr = 1
@@ -16,7 +21,7 @@
 // handle a particular signal (e.g., signal occurred on a non-Go thread).
 // See sigfwdgo() for more information on when the signals are forwarded.
 //
-// Signal forwarding is currently available only on Linux.
+// Signal forwarding is currently available only on Darwin and Linux.
 var fwdSig [_NSIG]uintptr
 
 // sigmask represents a general signal mask compatible with the GOOS
@@ -32,34 +37,46 @@
 	maskUpdatedChan chan struct{}
 )
 
-func initsig() {
+func init() {
 	// _NSIG is the number of signals on this operating system.
 	// sigtable should describe what to do for all the possible signals.
 	if len(sigtable) != _NSIG {
 		print("runtime: len(sigtable)=", len(sigtable), " _NSIG=", _NSIG, "\n")
-		throw("initsig")
+		throw("bad sigtable len")
+	}
+}
+
+var signalsOK bool
+
+// Initialize signals.
+// Called by libpreinit, so the runtime may not yet be initialized.
+//go:nosplit
+//go:nowritebarrierrec
+func initsig(preinit bool) {
+	if !preinit {
+		// It's now OK for signal handlers to run.
+		signalsOK = true
 	}
 
-	// First call: basic setup.
+	// For c-archive/c-shared this is called by libpreinit with
+	// preinit == true.
+	if (isarchive || islibrary) && !preinit {
+		return
+	}
+
 	for i := int32(0); i < _NSIG; i++ {
 		t := &sigtable[i]
 		if t.flags == 0 || t.flags&_SigDefault != 0 {
 			continue
 		}
 		fwdSig[i] = getsig(i)
-		// For some signals, we respect an inherited SIG_IGN handler
-		// rather than insist on installing our own default handler.
-		// Even these signals can be fetched using the os/signal package.
-		switch i {
-		case _SIGHUP, _SIGINT:
-			if getsig(i) == _SIG_IGN {
-				t.flags = _SigNotify | _SigIgnored
-				continue
-			}
-		}
 
-		if t.flags&_SigSetStack != 0 {
-			setsigstack(i)
+		if !sigInstallGoHandler(i) {
+			// Even if we are not installing a signal handler,
+			// set SA_ONSTACK if necessary.
+			if fwdSig[i] != _SIG_DFL && fwdSig[i] != _SIG_IGN {
+				setsigstack(i)
+			}
 			continue
 		}
 
@@ -68,6 +85,33 @@
 	}
 }
 
+//go:nosplit
+//go:nowritebarrierrec
+func sigInstallGoHandler(sig int32) bool {
+	// For some signals, we respect an inherited SIG_IGN handler
+	// rather than insist on installing our own default handler.
+	// Even these signals can be fetched using the os/signal package.
+	switch sig {
+	case _SIGHUP, _SIGINT:
+		if fwdSig[sig] == _SIG_IGN {
+			return false
+		}
+	}
+
+	t := &sigtable[sig]
+	if t.flags&_SigSetStack != 0 {
+		return false
+	}
+
+	// When built using c-archive or c-shared, only install signal
+	// handlers for synchronous signals.
+	if (isarchive || islibrary) && t.flags&_SigPanic == 0 {
+		return false
+	}
+
+	return true
+}
+
 func sigenable(sig uint32) {
 	if sig >= uint32(len(sigtable)) {
 		return
@@ -80,9 +124,7 @@
 		<-maskUpdatedChan
 		if t.flags&_SigHandling == 0 {
 			t.flags |= _SigHandling
-			if getsig(int32(sig)) == _SIG_IGN {
-				t.flags |= _SigIgnored
-			}
+			fwdSig[sig] = getsig(int32(sig))
 			setsig(int32(sig), funcPC(sighandler), true)
 		}
 	}
@@ -98,13 +140,13 @@
 		ensureSigM()
 		disableSigChan <- sig
 		<-maskUpdatedChan
-		if t.flags&_SigHandling != 0 {
+
+		// If initsig would not install a Go signal handler for
+		// this signal, then returning to the state before Notify
+		// means removing the handler that Notify installed.
+		if !sigInstallGoHandler(int32(sig)) {
 			t.flags &^= _SigHandling
-			if t.flags&_SigIgnored != 0 {
-				setsig(int32(sig), _SIG_IGN, true)
-			} else {
-				setsig(int32(sig), _SIG_DFL, true)
-			}
+			setsig(int32(sig), fwdSig[sig], true)
 		}
 	}
 }
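From user code, the sigenable/sigdisable pair above is driven by os/signal. A small sketch of the visible behavior; the fwdSig bookkeeping itself is internal to the runtime:

	package main

	import (
		"os"
		"os/signal"
		"syscall"
	)

	func main() {
		ch := make(chan os.Signal, 1)
		// sigenable: the runtime snapshots the current disposition into
		// fwdSig before installing sighandler.
		signal.Notify(ch, syscall.SIGHUP)

		// sigdisable: instead of guessing SIG_IGN vs SIG_DFL from the old
		// _SigIgnored flag, the runtime re-installs whatever fwdSig recorded.
		signal.Reset(syscall.SIGHUP)
	}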
@@ -136,14 +178,39 @@
 }
 
 func sigpipe() {
-	setsig(_SIGPIPE, _SIG_DFL, false)
-	raise(_SIGPIPE)
+	if sigsend(_SIGPIPE) {
+		return
+	}
+	dieFromSignal(_SIGPIPE)
+}
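With sigpipe now routed through sigsend, a program that has asked for SIGPIPE via os/signal receives it on its channel instead of dying. A minimal Unix-only sketch:

	package main

	import (
		"fmt"
		"os"
		"os/signal"
		"syscall"
	)

	func main() {
		ch := make(chan os.Signal, 1)
		signal.Notify(ch, syscall.SIGPIPE)
		syscall.Kill(os.Getpid(), syscall.SIGPIPE)
		fmt.Println("got", <-ch) // delivered to the channel; the process survives
	}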
+
+// dieFromSignal kills the program with a signal.
+// This provides the expected exit status for the shell.
+// This is only called with fatal signals expected to kill the process.
+//go:nosplit
+//go:nowritebarrierrec
+func dieFromSignal(sig int32) {
+	setsig(sig, _SIG_DFL, false)
+	updatesigmask(sigmask{})
+	raise(sig)
+
+	// That should have killed us. On some systems, though, raise
+	// sends the signal to the whole process rather than to just
+	// the current thread, which means that the signal may not yet
+	// have been delivered. Give other threads a chance to run and
+	// pick up the signal.
+	osyield()
+	osyield()
+	osyield()
+
+	// If we are still somehow running, just exit with the wrong status.
+	exit(2)
 }
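dieFromSignal exists so that a parent process (typically the shell) observes death-by-signal rather than a plain exit code. A hedged, Unix-only sketch of what a parent sees; ./child is a hypothetical program that dies this way:

	package main

	import (
		"fmt"
		"os/exec"
		"syscall"
	)

	func main() {
		cmd := exec.Command("./child") // hypothetical: dies via dieFromSignal
		err := cmd.Run()
		if ee, ok := err.(*exec.ExitError); ok {
			ws := ee.Sys().(syscall.WaitStatus)
			if ws.Signaled() {
				fmt.Println("killed by signal:", ws.Signal())
			} else {
				fmt.Println("exit status:", ws.ExitStatus()) // the exit(2) fallback
			}
		}
	}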
 
 // raisebadsignal is called when a signal is received on a non-Go
 // thread, and the Go program does not want to handle it (that is, the
 // program has not called os/signal.Notify for the signal).
-func raisebadsignal(sig int32) {
+func raisebadsignal(sig int32, c *sigctxt) {
 	if sig == _SIGPROF {
 		// Ignore profiling signals that arrive on non-Go threads.
 		return
@@ -158,14 +225,26 @@
 
 	// Reset the signal handler and raise the signal.
 	// We are currently running inside a signal handler, so the
-	// signal is blocked.  We need to unblock it before raising the
+	// signal is blocked. We need to unblock it before raising the
 	// signal, or the signal we raise will be ignored until we return
-	// from the signal handler.  We know that the signal was unblocked
+	// from the signal handler. We know that the signal was unblocked
 	// before entering the handler, or else we would not have received
-	// it.  That means that we don't have to worry about blocking it
+	// it. That means that we don't have to worry about blocking it
 	// again.
 	unblocksig(sig)
 	setsig(sig, handler, false)
+
+	// If we're linked into a non-Go program we want to try to
+	// avoid modifying the original context in which the signal
+	// was raised. If the handler is the default, we know it
+	// is non-recoverable, so we don't have to worry about
+	// re-installing sighandler. At this point we can just
+	// return and the signal will be re-raised and caught by
+	// the default handler with the correct context.
+	if (isarchive || islibrary) && handler == _SIG_DFL && c.sigcode() != _SI_USER {
+		return
+	}
+
 	raise(sig)
 
 	// If the signal didn't cause the program to exit, restore the
@@ -185,17 +264,15 @@
 		// this means the OS X core file will be >128 GB and even on a zippy
 		// workstation can take OS X well over an hour to write (uninterruptible).
 		// Save users from making that mistake.
-		if ptrSize == 8 {
+		if sys.PtrSize == 8 {
 			return
 		}
 	}
 
-	updatesigmask(sigmask{})
-	setsig(_SIGABRT, _SIG_DFL, false)
-	raise(_SIGABRT)
+	dieFromSignal(_SIGABRT)
 }
 
-// createSigM starts one global, sleeping thread to make sure at least one thread
+// ensureSigM starts one global, sleeping thread to make sure at least one thread
 // is available to catch signals enabled for os/signal.
 func ensureSigM() {
 	if maskUpdatedChan != nil {
@@ -226,11 +303,11 @@
 		for {
 			select {
 			case sig := <-enableSigChan:
-				if b := sig - 1; b >= 0 {
+				if b := sig - 1; sig > 0 {
 					sigBlocked[b/32] &^= (1 << (b & 31))
 				}
 			case sig := <-disableSigChan:
-				if b := sig - 1; b >= 0 {
+				if b := sig - 1; sig > 0 {
 					sigBlocked[b/32] |= (1 << (b & 31))
 				}
 			}
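The guard change from b >= 0 to sig > 0 matters because sig is unsigned: sig - 1 wraps at zero, so the old comparison was vacuously true and an errant sig == 0 would have indexed far out of bounds. A short demonstration:

	package main

	import "fmt"

	func main() {
		var sig uint32 // zero value
		b := sig - 1   // unsigned subtraction wraps around
		fmt.Println(b)       // 4294967295, so "b >= 0" could never be false
		fmt.Println(sig > 0) // false: the guard the code uses now
	}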
@@ -239,3 +316,35 @@
 		}
 	}()
 }
+
+// This is called when we receive a signal when there is no signal stack.
+// This can only happen if non-Go code calls sigaltstack to disable the
+// signal stack. This is called via cgocallback to establish a stack.
+func noSignalStack(sig uint32) {
+	println("signal", sig, "received on thread with no signal stack")
+	throw("non-Go code disabled sigaltstack")
+}
+
+// This is called if we receive a signal when there is a signal stack
+// but we are not on it. This can only happen if non-Go code called
+// sigaction without setting the SS_ONSTACK flag.
+func sigNotOnStack(sig uint32) {
+	println("signal", sig, "received but handler not on signal stack")
+	throw("non-Go code set up signal handler without SA_ONSTACK flag")
+}
+
+// This runs on a foreign stack, without an m or a g. No stack split.
+//go:nosplit
+//go:norace
+//go:nowritebarrierrec
+func badsignal(sig uintptr, c *sigctxt) {
+	cgocallback(unsafe.Pointer(funcPC(badsignalgo)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig)+unsafe.Sizeof(c))
+}
+
+func badsignalgo(sig uintptr, c *sigctxt) {
+	if !sigsend(uint32(sig)) {
+		// A foreign thread received the signal sig, and the
+		// Go code does not want to handle it.
+		raisebadsignal(int32(sig), c)
+	}
+}
diff --git a/src/runtime/signal2_unix.go b/src/runtime/signal2_unix.go
new file mode 100644
index 0000000..b137169
--- /dev/null
+++ b/src/runtime/signal2_unix.go
@@ -0,0 +1,69 @@
+// Copyright 2012 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build darwin dragonfly freebsd linux netbsd openbsd
+
+package runtime
+
+import "unsafe"
+
+//go:noescape
+func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
+
+// Determines if the signal should be handled by Go and if not, forwards the
+// signal to the handler that was installed before Go's. Returns whether the
+// signal was forwarded.
+// This is called by the signal handler, and the world may be stopped.
+//go:nosplit
+//go:nowritebarrierrec
+func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool {
+	if sig >= uint32(len(sigtable)) {
+		return false
+	}
+	fwdFn := fwdSig[sig]
+
+	if !signalsOK {
+		// The only way we can get here is if we are in a
+		// library or archive, we installed a signal handler
+		// at program startup, but the Go runtime has not yet
+		// been initialized.
+		if fwdFn == _SIG_DFL {
+			dieFromSignal(int32(sig))
+		} else {
+			sigfwd(fwdFn, sig, info, ctx)
+		}
+		return true
+	}
+
+	flags := sigtable[sig].flags
+
+	// If there is no handler to forward to, no need to forward.
+	if fwdFn == _SIG_DFL {
+		return false
+	}
+
+	// If we aren't handling the signal, forward it.
+	if flags&_SigHandling == 0 {
+		sigfwd(fwdFn, sig, info, ctx)
+		return true
+	}
+
+	// Only forward synchronous signals.
+	c := &sigctxt{info, ctx}
+	if c.sigcode() == _SI_USER || flags&_SigPanic == 0 {
+		return false
+	}
+	// Determine if the signal occurred inside Go code. We test that:
+	//   (1) we were in a goroutine (i.e., m.curg != nil), and
+	//   (2) we weren't in CGO (i.e., m.curg.syscallsp == 0).
+	g := getg()
+	if g != nil && g.m != nil && g.m.curg != nil && g.m.curg.syscallsp == 0 {
+		return false
+	}
+	// Signal not handled by Go, forward it.
+	if fwdFn != _SIG_IGN {
+		sigfwd(fwdFn, sig, info, ctx)
+	}
+	return true
+}
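The forwarding rules in sigfwdgo reduce to a small decision table. A hypothetical condensation as a pure function (illustrative names; it omits the not-yet-initialized library path guarded by signalsOK above):

	package sigpolicy

	// forwarded reports whether the signal should be treated as handled by
	// a pre-Go handler. inGo means the signal landed while running Go code
	// on a goroutine rather than in a cgo callout.
	func forwarded(fwdFn uintptr, handling, panics, userSent, inGo bool) bool {
		const sigDFL uintptr = 0
		if fwdFn == sigDFL {
			return false // nothing to forward to
		}
		if !handling {
			return true // Go never took this signal over
		}
		if userSent || !panics {
			return false // only synchronous, fault-like signals leave Go
		}
		return !inGo // faults inside Go code become panics instead
	}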
diff --git a/src/runtime/signal_386.go b/src/runtime/signal_386.go
index ca18942..f27cf9d 100644
--- a/src/runtime/signal_386.go
+++ b/src/runtime/signal_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 func dumpregs(c *sigctxt) {
 	print("eax    ", hex(c.eax()), "\n")
@@ -27,7 +30,8 @@
 var crashing int32
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := &sigctxt{info, ctxt}
@@ -80,16 +84,16 @@
 
 		// Only push runtime.sigpanic if pc != 0.
 		// If pc == 0, probably panicked because of a
-		// call to a nil func.  Not pushing that onto sp will
+		// call to a nil func. Not pushing that onto sp will
 		// make the trace look like a call to runtime.sigpanic instead.
 		// (Otherwise the trace will end at runtime.sigpanic and we
 		// won't get to see who faulted.)
 		if pc != 0 {
-			if regSize > ptrSize {
-				sp -= ptrSize
+			if sys.RegSize > sys.PtrSize {
+				sp -= sys.PtrSize
 				*(*uintptr)(unsafe.Pointer(sp)) = 0
 			}
-			sp -= ptrSize
+			sp -= sys.PtrSize
 			*(*uintptr)(unsafe.Pointer(sp)) = pc
 			c.set_esp(uint32(sp))
 		}
@@ -103,8 +107,12 @@
 		}
 	}
 
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
 	if flags&_SigKill != 0 {
-		exit(2)
+		dieFromSignal(int32(sig))
 	}
 
 	if flags&_SigThrow == 0 {
@@ -131,33 +139,10 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
-
-		// On Linux/386, all system calls go through the vdso kernel_vsyscall routine.
-		// Normally we don't see those PCs, but during signals we can.
-		// If we see a PC in the vsyscall area (it moves around, but near the top of memory),
-		// assume we're blocked in the vsyscall routine, which has saved
-		// three words on the stack after the initial call saved the caller PC.
-		// Pop all four words off SP and use the saved PC.
-		// The check of the stack bounds here should suffice to avoid a fault
-		// during the actual PC pop.
-		// If we do load a bogus PC, not much harm done: we weren't going
-		// to get a decent traceback anyway.
-		// TODO(rsc): Make this more precise: we should do more checks on the PC,
-		// and we should find out whether different versions of the vdso page
-		// use different prologues that store different amounts on the stack.
-		pc := uintptr(c.eip())
-		sp := uintptr(c.esp())
-		if GOOS == "linux" && pc >= 0xf4000000 && gp.stack.lo <= sp && sp+16 <= gp.stack.hi {
-			// Assume in vsyscall page.
-			sp += 16
-			pc = *(*uintptr)(unsafe.Pointer(sp - 4))
-			print("runtime: unwind vdso kernel_vsyscall: pc=", hex(pc), " sp=", hex(sp), "\n")
-		}
-
-		tracebacktrap(pc, sp, 0, gp)
+		tracebacktrap(uintptr(c.eip()), uintptr(c.esp()), 0, gp)
 		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
 			// tracebackothers on original m skipped this one; trace it now.
 			goroutineheader(_g_.m.curg)
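The new gotraceback() returns a (level, all, docrash) triple instead of filling a *bool out parameter. As a hedged reference, the GOTRACEBACK settings are believed to map onto that triple roughly as follows; the authoritative parsing lives in runtime.gotraceback:

	package tracebackenv

	func tracebackTriple(env string) (level int, all, docrash bool) {
		switch env {
		case "none":
			return 0, false, false
		case "", "single": // the default
			return 1, false, false
		case "all":
			return 1, true, false
		case "system":
			return 2, true, false
		case "crash":
			return 2, true, true
		}
		return 1, false, false
	}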
diff --git a/src/runtime/signal_amd64x.go b/src/runtime/signal_amd64x.go
index 3e14480..7b51fcc 100644
--- a/src/runtime/signal_amd64x.go
+++ b/src/runtime/signal_amd64x.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -8,6 +8,7 @@
 package runtime
 
 import (
+	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -38,7 +39,8 @@
 var crashing int32
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := &sigctxt{info, ctxt}
@@ -114,16 +116,16 @@
 
 		// Only push runtime.sigpanic if pc != 0.
 		// If pc == 0, probably panicked because of a
-		// call to a nil func.  Not pushing that onto sp will
+		// call to a nil func. Not pushing that onto sp will
 		// make the trace look like a call to runtime.sigpanic instead.
 		// (Otherwise the trace will end at runtime.sigpanic and we
 		// won't get to see who faulted.)
 		if pc != 0 {
-			if regSize > ptrSize {
-				sp -= ptrSize
+			if sys.RegSize > sys.PtrSize {
+				sp -= sys.PtrSize
 				*(*uintptr)(unsafe.Pointer(sp)) = 0
 			}
-			sp -= ptrSize
+			sp -= sys.PtrSize
 			*(*uintptr)(unsafe.Pointer(sp)) = pc
 			c.set_rsp(uint64(sp))
 		}
@@ -137,8 +139,12 @@
 		}
 	}
 
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
 	if flags&_SigKill != 0 {
-		exit(2)
+		dieFromSignal(int32(sig))
 	}
 
 	if flags&_SigThrow == 0 {
@@ -165,8 +171,8 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
 		tracebacktrap(uintptr(c.rip()), uintptr(c.rsp()), 0, gp)
 		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
diff --git a/src/runtime/signal_arm.go b/src/runtime/signal_arm.go
index 1b8a2f5..3b8eaf6 100644
--- a/src/runtime/signal_arm.go
+++ b/src/runtime/signal_arm.go
@@ -35,7 +35,8 @@
 var crashing int32
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := &sigctxt{info, ctxt}
@@ -69,7 +70,7 @@
 		c.set_sp(sp)
 		*(*uint32)(unsafe.Pointer(uintptr(sp))) = c.lr()
 
-		pc := uintptr(gp.sigpc)
+		pc := gp.sigpc
 
 		// If we don't recognize the PC as code
 		// but we do recognize the link register as code,
@@ -98,8 +99,12 @@
 		}
 	}
 
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
 	if flags&_SigKill != 0 {
-		exit(2)
+		dieFromSignal(int32(sig))
 	}
 
 	if flags&_SigThrow == 0 {
@@ -126,8 +131,8 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
 		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.lr()), gp)
 		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
diff --git a/src/runtime/signal_arm64.go b/src/runtime/signal_arm64.go
index 4a7c8b9..0e08623 100644
--- a/src/runtime/signal_arm64.go
+++ b/src/runtime/signal_arm64.go
@@ -6,7 +6,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 func dumpregs(c *sigctxt) {
 	print("r0      ", hex(c.r0()), "\n")
@@ -48,7 +51,8 @@
 var crashing int32
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := &sigctxt{info, ctxt}
@@ -78,11 +82,11 @@
 		// functions are correctly handled. This smashes
 		// the stack frame but we're not going back there
 		// anyway.
-		sp := c.sp() - spAlign // needs only sizeof uint64, but must align the stack
+		sp := c.sp() - sys.SpAlign // needs only sizeof uint64, but must align the stack
 		c.set_sp(sp)
 		*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.lr()
 
-		pc := uintptr(gp.sigpc)
+		pc := gp.sigpc
 
 		// If we don't recognize the PC as code
 		// but we do recognize the link register as code,
@@ -111,8 +115,12 @@
 		}
 	}
 
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
 	if flags&_SigKill != 0 {
-		exit(2)
+		dieFromSignal(int32(sig))
 	}
 
 	if flags&_SigThrow == 0 {
@@ -139,8 +147,8 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
 		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.lr()), gp)
 		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
diff --git a/src/runtime/signal_darwin.go b/src/runtime/signal_darwin.go
index 6cd1865..c8534ff 100644
--- a/src/runtime/signal_darwin.go
+++ b/src/runtime/signal_darwin.go
@@ -31,7 +31,7 @@
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
 	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
@@ -47,12 +47,10 @@
 }
 
 //go:noescape
-func sigfwd(fn uintptr, sig uint32, info *siginfo, ctx unsafe.Pointer)
-
-//go:noescape
 func sigreturn(ctx unsafe.Pointer, infostyle uint32)
 
 //go:nosplit
+//go:nowritebarrierrec
 func sigtrampgo(fn uintptr, infostyle, sig uint32, info *siginfo, ctx unsafe.Pointer) {
 	if sigfwdgo(sig, info, ctx) {
 		sigreturn(ctx, infostyle)
@@ -60,11 +58,36 @@
 	}
 	g := getg()
 	if g == nil {
-		badsignal(uintptr(sig))
+		badsignal(uintptr(sig), &sigctxt{info, ctx})
 		sigreturn(ctx, infostyle)
 		return
 	}
+
+	// If some non-Go code called sigaltstack, adjust.
+	sp := uintptr(unsafe.Pointer(&sig))
+	if sp < g.m.gsignal.stack.lo || sp >= g.m.gsignal.stack.hi {
+		var st stackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(noSignalStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		if sp < stsp || sp >= stsp+st.ss_size {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(sigNotOnStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		g.m.gsignal.stack.lo = stsp
+		g.m.gsignal.stack.hi = stsp + st.ss_size
+		g.m.gsignal.stackguard0 = stsp + _StackGuard
+		g.m.gsignal.stackguard1 = stsp + _StackGuard
+		g.m.gsignal.stackAlloc = st.ss_size
+		g.m.gsignal.stktopsp = getcallersp(unsafe.Pointer(&sig))
+	}
+
 	setg(g.m.gsignal)
+	c := &sigctxt{info, ctx}
+	c.fixsigcode(sig)
 	sighandler(sig, info, ctx, g)
 	setg(g)
 	sigreturn(ctx, infostyle)
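The sigaltstack adjustment above distinguishes four cases according to where the handler's sp landed. A hypothetical pure-function model with illustrative names:

	package sigstack

	type altAction int

	const (
		onGsignal    altAction = iota // already on the runtime's signal stack
		adoptForeign                  // adopt the sigaltstack non-Go code set up
		noStack                       // SS_DISABLE: report via noSignalStack
		offStack                      // a stack is registered but sp is off it
	)

	func classify(sp, gLo, gHi, stSp, stSize uintptr, disabled bool) altAction {
		if sp >= gLo && sp < gHi {
			return onGsignal
		}
		if disabled {
			return noStack
		}
		if sp < stSp || sp >= stSp+stSize {
			return offStack
		}
		return adoptForeign
	}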
diff --git a/src/runtime/signal_darwin_386.go b/src/runtime/signal_darwin_386.go
index 302b3aa..056f09a 100644
--- a/src/runtime/signal_darwin_386.go
+++ b/src/runtime/signal_darwin_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -32,3 +32,25 @@
 func (c *sigctxt) set_esp(x uint32)     { c.regs().esp = x }
 func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint32) { c.info.si_addr = x }
+
+func (c *sigctxt) fixsigcode(sig uint32) {
+	switch sig {
+	case _SIGTRAP:
+		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
+		// leaving no way to distinguish a breakpoint-induced SIGTRAP
+		// from an asynchronous signal SIGTRAP.
+		// They all look breakpoint-induced by default.
+		// Try looking at the code to see if it's a breakpoint.
+		// The assumption is that we're very unlikely to get an
+		// asynchronous SIGTRAP at just the moment that the
+		// PC started to point at unmapped memory.
+		pc := uintptr(c.eip())
+		// OS X will leave the pc just after the INT 3 instruction.
+		// INT 3 is usually 1 byte, but there is a 2-byte form.
+		code := (*[2]byte)(unsafe.Pointer(pc - 2))
+		if code[1] != 0xCC && (code[0] != 0xCD || code[1] != 3) {
+			// SIGTRAP on something other than INT 3.
+			c.set_sigcode(_SI_USER)
+		}
+	}
+}
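The byte test above accepts both x86 encodings of a breakpoint: the one-byte 0xCC and the two-byte 0xCD 0x03 (INT imm8 with imm8 == 3). The same check in positive form:

	package trapcheck

	// looksLikeInt3 reports whether the two bytes ending at the faulting pc
	// encode an INT 3 breakpoint.
	func looksLikeInt3(code [2]byte) bool {
		return code[1] == 0xCC || (code[0] == 0xCD && code[1] == 3)
	}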
diff --git a/src/runtime/signal_darwin_amd64.go b/src/runtime/signal_darwin_amd64.go
index dbf0448..d219fa4 100644
--- a/src/runtime/signal_darwin_amd64.go
+++ b/src/runtime/signal_darwin_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -40,3 +40,25 @@
 func (c *sigctxt) set_rsp(x uint64)     { c.regs().rsp = x }
 func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint64) { c.info.si_addr = x }
+
+func (c *sigctxt) fixsigcode(sig uint32) {
+	switch sig {
+	case _SIGTRAP:
+		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
+		// leaving no way to distinguish a breakpoint-induced SIGTRAP
+		// from an asynchronous signal SIGTRAP.
+		// They all look breakpoint-induced by default.
+		// Try looking at the code to see if it's a breakpoint.
+		// The assumption is that we're very unlikely to get an
+		// asynchronous SIGTRAP at just the moment that the
+		// PC started to point at unmapped memory.
+		pc := uintptr(c.rip())
+		// OS X will leave the pc just after the INT 3 instruction.
+		// INT 3 is usually 1 byte, but there is a 2-byte form.
+		code := (*[2]byte)(unsafe.Pointer(pc - 2))
+		if code[1] != 0xCC && (code[0] != 0xCD || code[1] != 3) {
+			// SIGTRAP on something other than INT 3.
+			c.set_sigcode(_SI_USER)
+		}
+	}
+}
diff --git a/src/runtime/signal_darwin_arm.go b/src/runtime/signal_darwin_arm.go
index 0f10971..82c7c93 100644
--- a/src/runtime/signal_darwin_arm.go
+++ b/src/runtime/signal_darwin_arm.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -42,3 +42,24 @@
 
 func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint32) { c.info.si_addr = x }
+
+func (c *sigctxt) fixsigcode(sig uint32) {
+	switch sig {
+	case _SIGTRAP:
+		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
+		// leaving no way to distinguish a breakpoint-induced SIGTRAP
+		// from an asynchronous signal SIGTRAP.
+		// They all look breakpoint-induced by default.
+		// Try looking at the code to see if it's a breakpoint.
+		// The assumption is that we're very unlikely to get an
+		// asynchronous SIGTRAP at just the moment that the
+		// PC started to point at unmapped memory.
+		pc := uintptr(c.pc())
+		// OS X will leave the pc just after the instruction.
+		code := (*uint32)(unsafe.Pointer(pc - 4))
+		if *code != 0xe7f001f0 {
+			// SIGTRAP on something other than breakpoint.
+			c.set_sigcode(_SI_USER)
+		}
+	}
+}
diff --git a/src/runtime/signal_darwin_arm64.go b/src/runtime/signal_darwin_arm64.go
index 2df4229..12fa520 100644
--- a/src/runtime/signal_darwin_arm64.go
+++ b/src/runtime/signal_darwin_arm64.go
@@ -1,4 +1,4 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -55,6 +55,28 @@
 func (c *sigctxt) set_lr(x uint64)  { c.regs().lr = x }
 func (c *sigctxt) set_r28(x uint64) { c.regs().x[28] = x }
 
+func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint64) {
 	c.info.si_addr = (*byte)(unsafe.Pointer(uintptr(x)))
 }
+
+func (c *sigctxt) fixsigcode(sig uint32) {
+	switch sig {
+	case _SIGTRAP:
+		// OS X sets c.sigcode() == TRAP_BRKPT unconditionally for all SIGTRAPs,
+		// leaving no way to distinguish a breakpoint-induced SIGTRAP
+		// from an asynchronous signal SIGTRAP.
+		// They all look breakpoint-induced by default.
+		// Try looking at the code to see if it's a breakpoint.
+		// The assumption is that we're very unlikely to get an
+		// asynchronous SIGTRAP at just the moment that the
+		// PC started to point at unmapped memory.
+		pc := uintptr(c.pc())
+		// OS X will leave the pc just after the instruction.
+		code := (*uint32)(unsafe.Pointer(pc - 4))
+		if *code != 0xd4200000 {
+			// SIGTRAP on something other than breakpoint.
+			c.set_sigcode(_SI_USER)
+		}
+	}
+}
diff --git a/src/runtime/signal_dragonfly.go b/src/runtime/signal_dragonfly.go
index d37e11a..8e9ce17 100644
--- a/src/runtime/signal_dragonfly.go
+++ b/src/runtime/signal_dragonfly.go
@@ -14,14 +14,14 @@
 	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/* 9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
 	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -29,15 +29,15 @@
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
diff --git a/src/runtime/signal_dragonfly_amd64.go b/src/runtime/signal_dragonfly_amd64.go
index 740959c..b32df29 100644
--- a/src/runtime/signal_dragonfly_amd64.go
+++ b/src/runtime/signal_dragonfly_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -32,11 +32,11 @@
 func (c *sigctxt) r15() uint64     { return c.regs().mc_r15 }
 func (c *sigctxt) rip() uint64     { return c.regs().mc_rip }
 func (c *sigctxt) rflags() uint64  { return c.regs().mc_rflags }
-func (c *sigctxt) cs() uint64      { return uint64(c.regs().mc_cs) }
-func (c *sigctxt) fs() uint64      { return uint64(c.regs().mc_ss) }
-func (c *sigctxt) gs() uint64      { return uint64(c.regs().mc_ss) }
+func (c *sigctxt) cs() uint64      { return c.regs().mc_cs }
+func (c *sigctxt) fs() uint64      { return c.regs().mc_ss }
+func (c *sigctxt) gs() uint64      { return c.regs().mc_ss }
 func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) }
-func (c *sigctxt) sigaddr() uint64 { return uint64(c.info.si_addr) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
 
 func (c *sigctxt) set_rip(x uint64)     { c.regs().mc_rip = x }
 func (c *sigctxt) set_rsp(x uint64)     { c.regs().mc_rsp = x }
diff --git a/src/runtime/signal_freebsd.go b/src/runtime/signal_freebsd.go
index 1dbdb1b..c4cb687 100644
--- a/src/runtime/signal_freebsd.go
+++ b/src/runtime/signal_freebsd.go
@@ -4,6 +4,8 @@
 
 package runtime
 
+import "unsafe"
+
 type sigTabT struct {
 	flags int32
 	name  string
@@ -14,33 +16,72 @@
 	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/* 4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/* 5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/* 7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/* 8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/* 9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
-	/* 12 */ {_SigNotify, "SIGSYS: bad system call"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
+	/* 12 */ {_SigNotify, "SIGSYS: bad system call"}, // see golang.org/issues/15204
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
 	/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
 	/* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"},
 	/* 32 */ {_SigNotify, "SIGTHR: reserved"},
 }
+
+//go:nosplit
+//go:nowritebarrierrec
+func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
+	if sigfwdgo(sig, info, ctx) {
+		return
+	}
+	g := getg()
+	if g == nil {
+		badsignal(uintptr(sig), &sigctxt{info, ctx})
+		return
+	}
+
+	// If some non-Go code called sigaltstack, adjust.
+	sp := uintptr(unsafe.Pointer(&sig))
+	if sp < g.m.gsignal.stack.lo || sp >= g.m.gsignal.stack.hi {
+		var st stackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(noSignalStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		if sp < stsp || sp >= stsp+st.ss_size {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(sigNotOnStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		g.m.gsignal.stack.lo = stsp
+		g.m.gsignal.stack.hi = stsp + st.ss_size
+		g.m.gsignal.stackguard0 = stsp + _StackGuard
+		g.m.gsignal.stackguard1 = stsp + _StackGuard
+		g.m.gsignal.stackAlloc = st.ss_size
+		g.m.gsignal.stktopsp = getcallersp(unsafe.Pointer(&sig))
+	}
+
+	setg(g.m.gsignal)
+	sighandler(sig, info, ctx, g)
+	setg(g)
+}
diff --git a/src/runtime/signal_freebsd_386.go b/src/runtime/signal_freebsd_386.go
index a0fec13..092e6df 100644
--- a/src/runtime/signal_freebsd_386.go
+++ b/src/runtime/signal_freebsd_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -22,9 +22,9 @@
 func (c *sigctxt) esp() uint32     { return c.regs().mc_esp }
 func (c *sigctxt) eip() uint32     { return c.regs().mc_eip }
 func (c *sigctxt) eflags() uint32  { return c.regs().mc_eflags }
-func (c *sigctxt) cs() uint32      { return uint32(c.regs().mc_cs) }
-func (c *sigctxt) fs() uint32      { return uint32(c.regs().mc_fs) }
-func (c *sigctxt) gs() uint32      { return uint32(c.regs().mc_gs) }
+func (c *sigctxt) cs() uint32      { return c.regs().mc_cs }
+func (c *sigctxt) fs() uint32      { return c.regs().mc_fs }
+func (c *sigctxt) gs() uint32      { return c.regs().mc_gs }
 func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
 func (c *sigctxt) sigaddr() uint32 { return uint32(c.info.si_addr) }
 
diff --git a/src/runtime/signal_freebsd_amd64.go b/src/runtime/signal_freebsd_amd64.go
index d10c883..a0b4a72 100644
--- a/src/runtime/signal_freebsd_amd64.go
+++ b/src/runtime/signal_freebsd_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -32,11 +32,11 @@
 func (c *sigctxt) r15() uint64     { return c.regs().mc_r15 }
 func (c *sigctxt) rip() uint64     { return c.regs().mc_rip }
 func (c *sigctxt) rflags() uint64  { return c.regs().mc_rflags }
-func (c *sigctxt) cs() uint64      { return uint64(c.regs().mc_cs) }
+func (c *sigctxt) cs() uint64      { return c.regs().mc_cs }
 func (c *sigctxt) fs() uint64      { return uint64(c.regs().mc_fs) }
 func (c *sigctxt) gs() uint64      { return uint64(c.regs().mc_gs) }
 func (c *sigctxt) sigcode() uint64 { return uint64(c.info.si_code) }
-func (c *sigctxt) sigaddr() uint64 { return uint64(c.info.si_addr) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
 
 func (c *sigctxt) set_rip(x uint64)     { c.regs().mc_rip = x }
 func (c *sigctxt) set_rsp(x uint64)     { c.regs().mc_rsp = x }
diff --git a/src/runtime/signal_freebsd_arm.go b/src/runtime/signal_freebsd_arm.go
index 12de23d..0357304 100644
--- a/src/runtime/signal_freebsd_arm.go
+++ b/src/runtime/signal_freebsd_arm.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_linux_386.go b/src/runtime/signal_linux_386.go
index 085f66e..415f361 100644
--- a/src/runtime/signal_linux_386.go
+++ b/src/runtime/signal_linux_386.go
@@ -1,10 +1,13 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type sigctxt struct {
 	info *siginfo
@@ -32,5 +35,5 @@
 func (c *sigctxt) set_esp(x uint32)     { c.regs().esp = x }
 func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint32) {
-	*(*uintptr)(add(unsafe.Pointer(c.info), 2*ptrSize)) = uintptr(x)
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
 }
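set_sigaddr hard-codes a 2*sys.PtrSize offset because the runtime's siginfo declaration cannot name the kernel's union member. A portable sketch of the same write-through-offset idiom against a hypothetical struct, using unsafe.Offsetof instead of a magic constant:

	package main

	import (
		"fmt"
		"unsafe"
	)

	type info struct {
		signo, code int32
		addr        uintptr
	}

	func add(p unsafe.Pointer, off uintptr) unsafe.Pointer {
		return unsafe.Pointer(uintptr(p) + off)
	}

	func main() {
		var i info
		*(*uintptr)(add(unsafe.Pointer(&i), unsafe.Offsetof(i.addr))) = 0x1234
		fmt.Printf("addr = %#x\n", i.addr) // addr = 0x1234
	}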
diff --git a/src/runtime/signal_linux_amd64.go b/src/runtime/signal_linux_amd64.go
index 5e339b8..433747f 100644
--- a/src/runtime/signal_linux_amd64.go
+++ b/src/runtime/signal_linux_amd64.go
@@ -1,10 +1,13 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type sigctxt struct {
 	info *siginfo
@@ -42,5 +45,5 @@
 func (c *sigctxt) set_rsp(x uint64)     { c.regs().rsp = x }
 func (c *sigctxt) set_sigcode(x uint64) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint64) {
-	*(*uintptr)(add(unsafe.Pointer(c.info), 2*ptrSize)) = uintptr(x)
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
 }
diff --git a/src/runtime/signal_linux_arm.go b/src/runtime/signal_linux_arm.go
index bdb4314..cd6a0d7 100644
--- a/src/runtime/signal_linux_arm.go
+++ b/src/runtime/signal_linux_arm.go
@@ -1,10 +1,13 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type sigctxt struct {
 	info *siginfo
@@ -44,5 +47,5 @@
 
 func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint32) {
-	*(*uintptr)(add(unsafe.Pointer(c.info), 2*ptrSize)) = uintptr(x)
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
 }
diff --git a/src/runtime/signal_linux_arm64.go b/src/runtime/signal_linux_arm64.go
index 7d8b010..4964e7b 100644
--- a/src/runtime/signal_linux_arm64.go
+++ b/src/runtime/signal_linux_arm64.go
@@ -1,10 +1,13 @@
-// Copyright 2015 The Go Authors.  All rights reserved.
+// Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type sigctxt struct {
 	info *siginfo
@@ -57,5 +60,5 @@
 func (c *sigctxt) set_r28(x uint64) { c.regs().regs[28] = x }
 
 func (c *sigctxt) set_sigaddr(x uint64) {
-	*(*uintptr)(add(unsafe.Pointer(c.info), 2*ptrSize)) = uintptr(x)
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
 }
diff --git a/src/runtime/signal_linux_mips64x.go b/src/runtime/signal_linux_mips64x.go
new file mode 100644
index 0000000..0f590e4
--- /dev/null
+++ b/src/runtime/signal_linux_mips64x.go
@@ -0,0 +1,70 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+func (c *sigctxt) regs() *sigcontext { return &(*ucontext)(c.ctxt).uc_mcontext }
+func (c *sigctxt) r0() uint64        { return c.regs().sc_regs[0] }
+func (c *sigctxt) r1() uint64        { return c.regs().sc_regs[1] }
+func (c *sigctxt) r2() uint64        { return c.regs().sc_regs[2] }
+func (c *sigctxt) r3() uint64        { return c.regs().sc_regs[3] }
+func (c *sigctxt) r4() uint64        { return c.regs().sc_regs[4] }
+func (c *sigctxt) r5() uint64        { return c.regs().sc_regs[5] }
+func (c *sigctxt) r6() uint64        { return c.regs().sc_regs[6] }
+func (c *sigctxt) r7() uint64        { return c.regs().sc_regs[7] }
+func (c *sigctxt) r8() uint64        { return c.regs().sc_regs[8] }
+func (c *sigctxt) r9() uint64        { return c.regs().sc_regs[9] }
+func (c *sigctxt) r10() uint64       { return c.regs().sc_regs[10] }
+func (c *sigctxt) r11() uint64       { return c.regs().sc_regs[11] }
+func (c *sigctxt) r12() uint64       { return c.regs().sc_regs[12] }
+func (c *sigctxt) r13() uint64       { return c.regs().sc_regs[13] }
+func (c *sigctxt) r14() uint64       { return c.regs().sc_regs[14] }
+func (c *sigctxt) r15() uint64       { return c.regs().sc_regs[15] }
+func (c *sigctxt) r16() uint64       { return c.regs().sc_regs[16] }
+func (c *sigctxt) r17() uint64       { return c.regs().sc_regs[17] }
+func (c *sigctxt) r18() uint64       { return c.regs().sc_regs[18] }
+func (c *sigctxt) r19() uint64       { return c.regs().sc_regs[19] }
+func (c *sigctxt) r20() uint64       { return c.regs().sc_regs[20] }
+func (c *sigctxt) r21() uint64       { return c.regs().sc_regs[21] }
+func (c *sigctxt) r22() uint64       { return c.regs().sc_regs[22] }
+func (c *sigctxt) r23() uint64       { return c.regs().sc_regs[23] }
+func (c *sigctxt) r24() uint64       { return c.regs().sc_regs[24] }
+func (c *sigctxt) r25() uint64       { return c.regs().sc_regs[25] }
+func (c *sigctxt) r26() uint64       { return c.regs().sc_regs[26] }
+func (c *sigctxt) r27() uint64       { return c.regs().sc_regs[27] }
+func (c *sigctxt) r28() uint64       { return c.regs().sc_regs[28] }
+func (c *sigctxt) r29() uint64       { return c.regs().sc_regs[29] }
+func (c *sigctxt) r30() uint64       { return c.regs().sc_regs[30] }
+func (c *sigctxt) r31() uint64       { return c.regs().sc_regs[31] }
+func (c *sigctxt) sp() uint64        { return c.regs().sc_regs[29] }
+func (c *sigctxt) pc() uint64        { return c.regs().sc_pc }
+func (c *sigctxt) link() uint64      { return c.regs().sc_regs[31] }
+func (c *sigctxt) lo() uint64        { return c.regs().sc_mdlo }
+func (c *sigctxt) hi() uint64        { return c.regs().sc_mdhi }
+
+func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_r30(x uint64)  { c.regs().sc_regs[30] = x }
+func (c *sigctxt) set_pc(x uint64)   { c.regs().sc_pc = x }
+func (c *sigctxt) set_sp(x uint64)   { c.regs().sc_regs[29] = x }
+func (c *sigctxt) set_link(x uint64) { c.regs().sc_regs[31] = x }
+
+func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) {
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
+}
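For reference (standard MIPS calling convention, not runtime code): the accessors above alias general registers to their ABI roles, which is why sp() and link() reuse sc_regs slots:

	package mipsregs

	const (
		regSP = 29 // $sp: stack pointer, backing sp() and set_sp
		regRA = 31 // $ra: return address, backing link() and set_link
	)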
diff --git a/src/runtime/signal_linux_ppc64x.go b/src/runtime/signal_linux_ppc64x.go
index da3afc9..95835ca 100644
--- a/src/runtime/signal_linux_ppc64x.go
+++ b/src/runtime/signal_linux_ppc64x.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -7,7 +7,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 type sigctxt struct {
 	info *siginfo
@@ -60,6 +63,7 @@
 func (c *sigctxt) fault() uint64   { return c.regs().dar }
 
 func (c *sigctxt) set_r0(x uint64)   { c.regs().gpr[0] = x }
+func (c *sigctxt) set_r12(x uint64)  { c.regs().gpr[12] = x }
 func (c *sigctxt) set_r30(x uint64)  { c.regs().gpr[30] = x }
 func (c *sigctxt) set_pc(x uint64)   { c.regs().nip = x }
 func (c *sigctxt) set_sp(x uint64)   { c.regs().gpr[1] = x }
@@ -67,5 +71,5 @@
 
 func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
 func (c *sigctxt) set_sigaddr(x uint64) {
-	*(*uintptr)(add(unsafe.Pointer(c.info), 2*ptrSize)) = uintptr(x)
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
 }
diff --git a/src/runtime/signal_linux_s390x.go b/src/runtime/signal_linux_s390x.go
new file mode 100644
index 0000000..155d3a3
--- /dev/null
+++ b/src/runtime/signal_linux_s390x.go
@@ -0,0 +1,208 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+type sigctxt struct {
+	info *siginfo
+	ctxt unsafe.Pointer
+}
+
+func (c *sigctxt) regs() *sigcontext {
+	return (*sigcontext)(unsafe.Pointer(&(*ucontext)(c.ctxt).uc_mcontext))
+}
+func (c *sigctxt) r0() uint64      { return c.regs().gregs[0] }
+func (c *sigctxt) r1() uint64      { return c.regs().gregs[1] }
+func (c *sigctxt) r2() uint64      { return c.regs().gregs[2] }
+func (c *sigctxt) r3() uint64      { return c.regs().gregs[3] }
+func (c *sigctxt) r4() uint64      { return c.regs().gregs[4] }
+func (c *sigctxt) r5() uint64      { return c.regs().gregs[5] }
+func (c *sigctxt) r6() uint64      { return c.regs().gregs[6] }
+func (c *sigctxt) r7() uint64      { return c.regs().gregs[7] }
+func (c *sigctxt) r8() uint64      { return c.regs().gregs[8] }
+func (c *sigctxt) r9() uint64      { return c.regs().gregs[9] }
+func (c *sigctxt) r10() uint64     { return c.regs().gregs[10] }
+func (c *sigctxt) r11() uint64     { return c.regs().gregs[11] }
+func (c *sigctxt) r12() uint64     { return c.regs().gregs[12] }
+func (c *sigctxt) r13() uint64     { return c.regs().gregs[13] }
+func (c *sigctxt) r14() uint64     { return c.regs().gregs[14] }
+func (c *sigctxt) r15() uint64     { return c.regs().gregs[15] }
+func (c *sigctxt) link() uint64    { return c.regs().gregs[14] }
+func (c *sigctxt) sp() uint64      { return c.regs().gregs[15] }
+func (c *sigctxt) pc() uint64      { return c.regs().psw_addr }
+func (c *sigctxt) sigcode() uint32 { return uint32(c.info.si_code) }
+func (c *sigctxt) sigaddr() uint64 { return c.info.si_addr }
+
+func (c *sigctxt) set_r0(x uint64)      { c.regs().gregs[0] = x }
+func (c *sigctxt) set_r13(x uint64)     { c.regs().gregs[13] = x }
+func (c *sigctxt) set_link(x uint64)    { c.regs().gregs[14] = x }
+func (c *sigctxt) set_sp(x uint64)      { c.regs().gregs[15] = x }
+func (c *sigctxt) set_pc(x uint64)      { c.regs().psw_addr = x }
+func (c *sigctxt) set_sigcode(x uint32) { c.info.si_code = int32(x) }
+func (c *sigctxt) set_sigaddr(x uint64) {
+	*(*uintptr)(add(unsafe.Pointer(c.info), 2*sys.PtrSize)) = uintptr(x)
+}
+
+func dumpregs(c *sigctxt) {
+	print("r0   ", hex(c.r0()), "\t")
+	print("r1   ", hex(c.r1()), "\n")
+	print("r2   ", hex(c.r2()), "\t")
+	print("r3   ", hex(c.r3()), "\n")
+	print("r4   ", hex(c.r4()), "\t")
+	print("r5   ", hex(c.r5()), "\n")
+	print("r6   ", hex(c.r6()), "\t")
+	print("r7   ", hex(c.r7()), "\n")
+	print("r8   ", hex(c.r8()), "\t")
+	print("r9   ", hex(c.r9()), "\n")
+	print("r10  ", hex(c.r10()), "\t")
+	print("r11  ", hex(c.r11()), "\n")
+	print("r12  ", hex(c.r12()), "\t")
+	print("r13  ", hex(c.r13()), "\n")
+	print("r14  ", hex(c.r14()), "\t")
+	print("r15  ", hex(c.r15()), "\n")
+	print("pc   ", hex(c.pc()), "\t")
+	print("link ", hex(c.link()), "\n")
+}
+
+var crashing int32
+
+// May run during STW, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
+	_g_ := getg()
+	c := &sigctxt{info, ctxt}
+
+	if sig == _SIGPROF {
+		sigprof(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp, _g_.m)
+		return
+	}
+	flags := int32(_SigThrow)
+	if sig < uint32(len(sigtable)) {
+		flags = sigtable[sig].flags
+	}
+	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
+		// Make it look like a call to the signal func.
+		// Have to pass arguments out of band since
+		// augmenting the stack frame would break
+		// the unwinding code.
+		gp.sig = sig
+		gp.sigcode0 = uintptr(c.sigcode())
+		gp.sigcode1 = uintptr(c.sigaddr())
+		gp.sigpc = uintptr(c.pc())
+
+		// We arrange link and pc to pretend the panicking
+		// function calls sigpanic directly.
+		// Always save LINK to stack so that panics in leaf
+		// functions are correctly handled. This smashes
+		// the stack frame but we're not going back there
+		// anyway.
+		sp := c.sp() - sys.MinFrameSize
+		c.set_sp(sp)
+		*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+
+		pc := uintptr(gp.sigpc)
+
+		// If we don't recognize the PC as code
+		// but we do recognize the link register as code,
+		// then assume this was a call to non-code and treat like
+		// pc == 0, to make unwinding show the context.
+		if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+			pc = 0
+		}
+
+		// Don't bother saving PC if it's zero, which is
+		// probably a call to a nil func: the old link register
+		// is more useful in the stack trace.
+		if pc != 0 {
+			c.set_link(uint64(pc))
+		}
+
+		// In case we are panicking from external C code
+		c.set_r0(0)
+		c.set_r13(uint64(uintptr(unsafe.Pointer(gp))))
+		c.set_pc(uint64(funcPC(sigpanic)))
+		return
+	}
+
+	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
+		if sigsend(sig) {
+			return
+		}
+	}
+
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
+	if flags&_SigKill != 0 {
+		dieFromSignal(int32(sig))
+	}
+
+	if flags&_SigThrow == 0 {
+		return
+	}
+
+	_g_.m.throwing = 1
+	_g_.m.caughtsig.set(gp)
+
+	if crashing == 0 {
+		startpanic()
+	}
+
+	if sig < uint32(len(sigtable)) {
+		print(sigtable[sig].name, "\n")
+	} else {
+		print("Signal ", sig, "\n")
+	}
+
+	print("PC=", hex(c.pc()), " m=", _g_.m.id, "\n")
+	if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
+		print("signal arrived during cgo execution\n")
+		gp = _g_.m.lockedg
+	}
+	print("\n")
+
+	level, _, docrash := gotraceback()
+	if level > 0 {
+		goroutineheader(gp)
+		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp)
+		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
+			// tracebackothers on original m skipped this one; trace it now.
+			goroutineheader(_g_.m.curg)
+			traceback(^uintptr(0), ^uintptr(0), 0, gp)
+		} else if crashing == 0 {
+			tracebackothers(gp)
+			print("\n")
+		}
+		dumpregs(c)
+	}
+
+	if docrash {
+		crashing++
+		if crashing < sched.mcount {
+			// There are other m's that need to dump their stacks.
+			// Relay SIGQUIT to the next m by sending it to the current process.
+			// All m's that have already received SIGQUIT have signal masks blocking
+			// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
+			// When the last m receives the SIGQUIT, it will fall through to the call to
+			// crash below. Just in case the relaying gets botched, each m involved in
+			// the relay sleeps for 5 seconds and then does the crash/exit itself.
+			// In expected operation, the last m has received the SIGQUIT and run
+			// crash/exit and the process is gone, all long before any of the
+			// 5-second sleeps have finished.
+			print("\n-----\n\n")
+			raiseproc(_SIGQUIT)
+			usleep(5 * 1000 * 1000)
+		}
+		crash()
+	}
+
+	exit(2)
+}
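The crash relay above uses raiseproc so the SIGQUIT is process-directed and the kernel may deliver it to any m that has not yet blocked it. A hedged user-level analogue of that step:

	package main

	import (
		"os"
		"syscall"
	)

	func main() {
		// Process-directed, unlike raise/tgkill which target one thread.
		syscall.Kill(os.Getpid(), syscall.SIGQUIT)
	}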
diff --git a/src/runtime/signal_mips64x.go b/src/runtime/signal_mips64x.go
new file mode 100644
index 0000000..4dbeb42
--- /dev/null
+++ b/src/runtime/signal_mips64x.go
@@ -0,0 +1,188 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+package runtime
+
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+func dumpregs(c *sigctxt) {
+	print("r0   ", hex(c.r0()), "\t")
+	print("r1   ", hex(c.r1()), "\n")
+	print("r2   ", hex(c.r2()), "\t")
+	print("r3   ", hex(c.r3()), "\n")
+	print("r4   ", hex(c.r4()), "\t")
+	print("r5   ", hex(c.r5()), "\n")
+	print("r6   ", hex(c.r6()), "\t")
+	print("r7   ", hex(c.r7()), "\n")
+	print("r8   ", hex(c.r8()), "\t")
+	print("r9   ", hex(c.r9()), "\n")
+	print("r10  ", hex(c.r10()), "\t")
+	print("r11  ", hex(c.r11()), "\n")
+	print("r12  ", hex(c.r12()), "\t")
+	print("r13  ", hex(c.r13()), "\n")
+	print("r14  ", hex(c.r14()), "\t")
+	print("r15  ", hex(c.r15()), "\n")
+	print("r16  ", hex(c.r16()), "\t")
+	print("r17  ", hex(c.r17()), "\n")
+	print("r18  ", hex(c.r18()), "\t")
+	print("r19  ", hex(c.r19()), "\n")
+	print("r20  ", hex(c.r20()), "\t")
+	print("r21  ", hex(c.r21()), "\n")
+	print("r22  ", hex(c.r22()), "\t")
+	print("r23  ", hex(c.r23()), "\n")
+	print("r24  ", hex(c.r24()), "\t")
+	print("r25  ", hex(c.r25()), "\n")
+	print("r26  ", hex(c.r26()), "\t")
+	print("r27  ", hex(c.r27()), "\n")
+	print("r28  ", hex(c.r28()), "\t")
+	print("r29  ", hex(c.r29()), "\n")
+	print("r30  ", hex(c.r30()), "\t")
+	print("r31  ", hex(c.r31()), "\n")
+	print("pc   ", hex(c.pc()), "\t")
+	print("link ", hex(c.link()), "\n")
+	print("lo   ", hex(c.lo()), "\t")
+	print("hi   ", hex(c.hi()), "\n")
+}
+
+var crashing int32
+
+// May run during STW, so write barriers are not allowed.
+//
+//go:nowritebarrierrec
+func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
+	_g_ := getg()
+	c := &sigctxt{info, ctxt}
+
+	if sig == _SIGPROF {
+		sigprof(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp, _g_.m)
+		return
+	}
+	flags := int32(_SigThrow)
+	if sig < uint32(len(sigtable)) {
+		flags = sigtable[sig].flags
+	}
+	if c.sigcode() != _SI_USER && flags&_SigPanic != 0 {
+		// Make it look like a call to the signal func.
+		// Have to pass arguments out of band since
+		// augmenting the stack frame would break
+		// the unwinding code.
+		gp.sig = sig
+		gp.sigcode0 = uintptr(c.sigcode())
+		gp.sigcode1 = uintptr(c.sigaddr())
+		gp.sigpc = uintptr(c.pc())
+
+		// We arrange link and pc to pretend the panicking
+		// function calls sigpanic directly.
+		// Always save LINK to stack so that panics in leaf
+		// functions are correctly handled. This smashes
+		// the stack frame but we're not going back there
+		// anyway.
+		sp := c.sp() - sys.PtrSize
+		c.set_sp(sp)
+		*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
+
+		pc := gp.sigpc
+
+		// If we don't recognize the PC as code
+		// but we do recognize the link register as code,
+		// then assume this was a call to non-code and treat like
+		// pc == 0, to make unwinding show the context.
+		if pc != 0 && findfunc(pc) == nil && findfunc(uintptr(c.link())) != nil {
+			pc = 0
+		}
+
+		// Don't bother saving PC if it's zero, which is
+		// probably a call to a nil func: the old link register
+		// is more useful in the stack trace.
+		if pc != 0 {
+			c.set_link(uint64(pc))
+		}
+
+		// In case we are panicking from external C code
+		c.set_r30(uint64(uintptr(unsafe.Pointer(gp))))
+		c.set_pc(uint64(funcPC(sigpanic)))
+		return
+	}
+
+	if c.sigcode() == _SI_USER || flags&_SigNotify != 0 {
+		if sigsend(sig) {
+			return
+		}
+	}
+
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
+	if flags&_SigKill != 0 {
+		dieFromSignal(int32(sig))
+	}
+
+	if flags&_SigThrow == 0 {
+		return
+	}
+
+	_g_.m.throwing = 1
+	_g_.m.caughtsig.set(gp)
+
+	if crashing == 0 {
+		startpanic()
+	}
+
+	if sig < uint32(len(sigtable)) {
+		print(sigtable[sig].name, "\n")
+	} else {
+		print("Signal ", sig, "\n")
+	}
+
+	print("PC=", hex(c.pc()), " m=", _g_.m.id, "\n")
+	if _g_.m.lockedg != nil && _g_.m.ncgo > 0 && gp == _g_.m.g0 {
+		print("signal arrived during cgo execution\n")
+		gp = _g_.m.lockedg
+	}
+	print("\n")
+
+	level, _, docrash := gotraceback()
+	if level > 0 {
+		goroutineheader(gp)
+		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp)
+		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
+			// tracebackothers on original m skipped this one; trace it now.
+			goroutineheader(_g_.m.curg)
+			traceback(^uintptr(0), ^uintptr(0), 0, gp)
+		} else if crashing == 0 {
+			tracebackothers(gp)
+			print("\n")
+		}
+		dumpregs(c)
+	}
+
+	if docrash {
+		crashing++
+		if crashing < sched.mcount {
+			// There are other m's that need to dump their stacks.
+			// Relay SIGQUIT to the next m by sending it to the current process.
+			// All m's that have already received SIGQUIT have signal masks blocking
+			// receipt of any signals, so the SIGQUIT will go to an m that hasn't seen it yet.
+			// When the last m receives the SIGQUIT, it will fall through to the call to
+			// crash below. Just in case the relaying gets botched, each m involved in
+			// the relay sleeps for 5 seconds and then does the crash/exit itself.
+			// In expected operation, the last m has received the SIGQUIT and run
+			// crash/exit and the process is gone, all long before any of the
+			// 5-second sleeps have finished.
+			print("\n-----\n\n")
+			raiseproc(_SIGQUIT)
+			usleep(5 * 1000 * 1000)
+		}
+		crash()
+	}
+
+	exit(2)
+}
diff --git a/src/runtime/signal_nacl.go b/src/runtime/signal_nacl.go
index 122648b..4793075 100644
--- a/src/runtime/signal_nacl.go
+++ b/src/runtime/signal_nacl.go
@@ -29,7 +29,7 @@
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
 	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
diff --git a/src/runtime/signal_nacl_386.go b/src/runtime/signal_nacl_386.go
index 0a1e7c6..1f48590 100644
--- a/src/runtime/signal_nacl_386.go
+++ b/src/runtime/signal_nacl_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_nacl_amd64p32.go b/src/runtime/signal_nacl_amd64p32.go
index 024ceba..cb72dd0 100644
--- a/src/runtime/signal_nacl_amd64p32.go
+++ b/src/runtime/signal_nacl_amd64p32.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_nacl_arm.go b/src/runtime/signal_nacl_arm.go
index 1aeaa4e..b99827c 100644
--- a/src/runtime/signal_nacl_arm.go
+++ b/src/runtime/signal_nacl_arm.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_netbsd.go b/src/runtime/signal_netbsd.go
index d93a450..30a3b8e 100644
--- a/src/runtime/signal_netbsd.go
+++ b/src/runtime/signal_netbsd.go
@@ -29,7 +29,7 @@
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
 	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
diff --git a/src/runtime/signal_netbsd_386.go b/src/runtime/signal_netbsd_386.go
index 6702336..af49d5d 100644
--- a/src/runtime/signal_netbsd_386.go
+++ b/src/runtime/signal_netbsd_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -22,12 +22,12 @@
 func (c *sigctxt) esp() uint32      { return c.regs().__gregs[_REG_UESP] }
 func (c *sigctxt) eip() uint32      { return c.regs().__gregs[_REG_EIP] }
 func (c *sigctxt) eflags() uint32   { return c.regs().__gregs[_REG_EFL] }
-func (c *sigctxt) cs() uint32       { return uint32(c.regs().__gregs[_REG_CS]) }
-func (c *sigctxt) fs() uint32       { return uint32(c.regs().__gregs[_REG_FS]) }
-func (c *sigctxt) gs() uint32       { return uint32(c.regs().__gregs[_REG_GS]) }
+func (c *sigctxt) cs() uint32       { return c.regs().__gregs[_REG_CS] }
+func (c *sigctxt) fs() uint32       { return c.regs().__gregs[_REG_FS] }
+func (c *sigctxt) gs() uint32       { return c.regs().__gregs[_REG_GS] }
 func (c *sigctxt) sigcode() uint32  { return uint32(c.info._code) }
 func (c *sigctxt) sigaddr() uint32 {
-	return uint32(*(*uint32)(unsafe.Pointer(&c.info._reason[0])))
+	return *(*uint32)(unsafe.Pointer(&c.info._reason[0]))
 }
 
 func (c *sigctxt) set_eip(x uint32)     { c.regs().__gregs[_REG_EIP] = x }
diff --git a/src/runtime/signal_netbsd_amd64.go b/src/runtime/signal_netbsd_amd64.go
index e22f4a7..db230f8 100644
--- a/src/runtime/signal_netbsd_amd64.go
+++ b/src/runtime/signal_netbsd_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -37,7 +37,7 @@
 func (c *sigctxt) gs() uint64      { return c.regs().__gregs[_REG_GS] }
 func (c *sigctxt) sigcode() uint64 { return uint64(c.info._code) }
 func (c *sigctxt) sigaddr() uint64 {
-	return uint64(*(*uint64)(unsafe.Pointer(&c.info._reason[0])))
+	return *(*uint64)(unsafe.Pointer(&c.info._reason[0]))
 }
 
 func (c *sigctxt) set_rip(x uint64)     { c.regs().__gregs[_REG_RIP] = x }
diff --git a/src/runtime/signal_netbsd_arm.go b/src/runtime/signal_netbsd_arm.go
index 9b114c8..4e58d3b 100644
--- a/src/runtime/signal_netbsd_arm.go
+++ b/src/runtime/signal_netbsd_arm.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_openbsd.go b/src/runtime/signal_openbsd.go
index 78afc59..9275279 100644
--- a/src/runtime/signal_openbsd.go
+++ b/src/runtime/signal_openbsd.go
@@ -4,6 +4,8 @@
 
 package runtime
 
+import "unsafe"
+
 type sigTabT struct {
 	flags int32
 	name  string
@@ -14,14 +16,14 @@
 	/*  1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
 	/*  2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
 	/*  3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
-	/*  4 */ {_SigThrow, "SIGILL: illegal instruction"},
-	/*  5 */ {_SigThrow, "SIGTRAP: trace trap"},
+	/*  4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/*  5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
 	/*  6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
 	/*  7 */ {_SigThrow, "SIGEMT: emulate instruction executed"},
-	/*  8 */ {_SigPanic, "SIGFPE: floating-point exception"},
+	/*  8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
 	/*  9 */ {0, "SIGKILL: kill"},
-	/* 10 */ {_SigPanic, "SIGBUS: bus error"},
-	/* 11 */ {_SigPanic, "SIGSEGV: segmentation violation"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
 	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
 	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
 	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
@@ -29,18 +31,57 @@
 	/* 16 */ {_SigNotify, "SIGURG: urgent condition on socket"},
 	/* 17 */ {0, "SIGSTOP: stop"},
 	/* 18 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
-	/* 19 */ {0, "SIGCONT: continue after stop"},
-	/* 20 */ {_SigNotify, "SIGCHLD: child status has changed"},
+	/* 19 */ {_SigNotify + _SigDefault, "SIGCONT: continue after stop"},
+	/* 20 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
 	/* 22 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
 	/* 23 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 24 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
 	/* 25 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
 	/* 26 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
-	/* 27 */ {_SigNotify, "SIGPROF: profiling alarm clock"},
+	/* 27 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGINFO: status request from keyboard"},
 	/* 30 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
 	/* 31 */ {_SigNotify, "SIGUSR2: user-defined signal 2"},
 	/* 32 */ {_SigNotify, "SIGTHR: reserved"},
 }
+
+//go:nosplit
+//go:nowritebarrierrec
+func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
+	if sigfwdgo(sig, info, ctx) {
+		return
+	}
+	g := getg()
+	if g == nil {
+		badsignal(uintptr(sig), &sigctxt{info, ctx})
+		return
+	}
+
+	// If some non-Go code called sigaltstack, adjust.
+	sp := uintptr(unsafe.Pointer(&sig))
+	if sp < g.m.gsignal.stack.lo || sp >= g.m.gsignal.stack.hi {
+		var st stackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(noSignalStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		if sp < stsp || sp >= stsp+st.ss_size {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(sigNotOnStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		g.m.gsignal.stack.lo = stsp
+		g.m.gsignal.stack.hi = stsp + st.ss_size
+		g.m.gsignal.stackguard0 = stsp + _StackGuard
+		g.m.gsignal.stackguard1 = stsp + _StackGuard
+		g.m.gsignal.stackAlloc = st.ss_size
+		g.m.gsignal.stktopsp = getcallersp(unsafe.Pointer(&sig))
+	}
+
+	setg(g.m.gsignal)
+	sighandler(sig, info, ctx, g)
+	setg(g)
+}
diff --git a/src/runtime/signal_openbsd_386.go b/src/runtime/signal_openbsd_386.go
index c582a44..dbbb2c8 100644
--- a/src/runtime/signal_openbsd_386.go
+++ b/src/runtime/signal_openbsd_386.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_openbsd_amd64.go b/src/runtime/signal_openbsd_amd64.go
index 4f0d19d..5631ab4 100644
--- a/src/runtime/signal_openbsd_amd64.go
+++ b/src/runtime/signal_openbsd_amd64.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_openbsd_arm.go b/src/runtime/signal_openbsd_arm.go
index 8ee255c..3158a4e 100644
--- a/src/runtime/signal_openbsd_arm.go
+++ b/src/runtime/signal_openbsd_arm.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/signal_plan9.go b/src/runtime/signal_plan9.go
index 302f156..d3894c8 100644
--- a/src/runtime/signal_plan9.go
+++ b/src/runtime/signal_plan9.go
@@ -22,8 +22,8 @@
 	{_SigThrow, "sys: trap: invalid opcode"},
 
 	// We can recover from some memory errors in runtime·sigpanic.
-	{_SigPanic, "sys: trap: fault read addr"},  // SIGRFAULT
-	{_SigPanic, "sys: trap: fault write addr"}, // SIGWFAULT
+	{_SigPanic, "sys: trap: fault read"},  // SIGRFAULT
+	{_SigPanic, "sys: trap: fault write"}, // SIGWFAULT
 
 	// We can also recover from math errors.
 	{_SigPanic, "sys: trap: divide error"}, // SIGINTDIV
@@ -51,4 +51,7 @@
 
 	// Alarms can be handled if desired, otherwise they're ignored.
 	{_SigNotify, "alarm"},
+
+	// Aborts can be handled if desired, otherwise they cause a stack trace.
+	{_SigNotify + _SigThrow, "abort"},
 }
diff --git a/src/runtime/signal_ppc64x.go b/src/runtime/signal_ppc64x.go
index bad9fe6..01a4af7 100644
--- a/src/runtime/signal_ppc64x.go
+++ b/src/runtime/signal_ppc64x.go
@@ -7,7 +7,10 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 func dumpregs(c *sigctxt) {
 	print("r0   ", hex(c.r0()), "\t")
@@ -53,7 +56,8 @@
 var crashing int32
 
 // May run during STW, so write barriers are not allowed.
-//go:nowritebarrier
+//
+//go:nowritebarrierrec
 func sighandler(sig uint32, info *siginfo, ctxt unsafe.Pointer, gp *g) {
 	_g_ := getg()
 	c := &sigctxt{info, ctxt}
@@ -82,11 +86,11 @@
 		// functions are correctly handled. This smashes
 		// the stack frame but we're not going back there
 		// anyway.
-		sp := c.sp() - ptrSize
+		sp := c.sp() - sys.MinFrameSize
 		c.set_sp(sp)
 		*(*uint64)(unsafe.Pointer(uintptr(sp))) = c.link()
 
-		pc := uintptr(gp.sigpc)
+		pc := gp.sigpc
 
 		// If we don't recognize the PC as code
 		// but we do recognize the link register as code,
@@ -106,6 +110,7 @@
 		// In case we are panicking from external C code
 		c.set_r0(0)
 		c.set_r30(uint64(uintptr(unsafe.Pointer(gp))))
+		c.set_r12(uint64(funcPC(sigpanic)))
 		c.set_pc(uint64(funcPC(sigpanic)))
 		return
 	}
@@ -116,8 +121,12 @@
 		}
 	}
 
+	if c.sigcode() == _SI_USER && signal_ignored(sig) {
+		return
+	}
+
 	if flags&_SigKill != 0 {
-		exit(2)
+		dieFromSignal(int32(sig))
 	}
 
 	if flags&_SigThrow == 0 {
@@ -144,8 +153,8 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		goroutineheader(gp)
 		tracebacktrap(uintptr(c.pc()), uintptr(c.sp()), uintptr(c.link()), gp)
 		if crashing > 0 && gp != _g_.m.curg && _g_.m.curg != nil && readgstatus(_g_.m.curg)&^_Gscan == _Grunning {
diff --git a/src/runtime/signal_sigtramp.go b/src/runtime/signal_sigtramp.go
new file mode 100644
index 0000000..3e0b104
--- /dev/null
+++ b/src/runtime/signal_sigtramp.go
@@ -0,0 +1,58 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build dragonfly linux netbsd
+
+package runtime
+
+import "unsafe"
+
+// Continuation of the (assembly) sigtramp() logic.
+// This may be called with the world stopped.
+//go:nosplit
+//go:nowritebarrierrec
+func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
+	if sigfwdgo(sig, info, ctx) {
+		return
+	}
+	g := getg()
+	if g == nil {
+		if sig == _SIGPROF {
+			// Ignore profiling signals that arrive on
+			// non-Go threads. On some systems they will
+			// be handled directly by the signal handler,
+			// by calling sigprofNonGo, in which case we won't
+			// get here anyhow.
+			return
+		}
+		badsignal(uintptr(sig), &sigctxt{info, ctx})
+		return
+	}
+
+	// If some non-Go code called sigaltstack, adjust.
+	sp := uintptr(unsafe.Pointer(&sig))
+	if sp < g.m.gsignal.stack.lo || sp >= g.m.gsignal.stack.hi {
+		var st sigaltstackt
+		sigaltstack(nil, &st)
+		if st.ss_flags&_SS_DISABLE != 0 {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(noSignalStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		stsp := uintptr(unsafe.Pointer(st.ss_sp))
+		if sp < stsp || sp >= stsp+st.ss_size {
+			setg(nil)
+			cgocallback(unsafe.Pointer(funcPC(sigNotOnStack)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
+		}
+		g.m.gsignal.stack.lo = stsp
+		g.m.gsignal.stack.hi = stsp + st.ss_size
+		g.m.gsignal.stackguard0 = stsp + _StackGuard
+		g.m.gsignal.stackguard1 = stsp + _StackGuard
+		g.m.gsignal.stackAlloc = st.ss_size
+		g.m.gsignal.stktopsp = getcallersp(unsafe.Pointer(&sig))
+	}
+
+	setg(g.m.gsignal)
+	sighandler(sig, info, ctx, g)
+	setg(g)
+}
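
The `sp := uintptr(unsafe.Pointer(&sig))` line in sigtrampgo (here and in the OpenBSD copy above) uses the address of an incoming argument as a cheap approximation of the current stack pointer, then range-checks it against the known gsignal stack bounds to detect a foreign sigaltstack. A hedged, self-contained sketch of just that check; `bounds` and `onStack` are illustrative names, not runtime APIs:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// bounds stands in for gsignal.stack in the code above.
	type bounds struct{ lo, hi uintptr }

	// onStack mimics the sigtrampgo check: the address of an argument
	// approximates SP and is range-checked against [lo, hi).
	func onStack(b bounds, probe *uint32) bool {
		sp := uintptr(unsafe.Pointer(probe))
		return b.lo <= sp && sp < b.hi
	}

	func main() {
		var sig uint32
		sp := uintptr(unsafe.Pointer(&sig))
		b := bounds{lo: sp - 1024, hi: sp + 1024}
		fmt.Println(onStack(b, &sig)) // true: &sig lies inside the bounds
	}
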
diff --git a/src/runtime/signal_solaris.go b/src/runtime/signal_solaris.go
index d8ac676..a86f7bf 100644
--- a/src/runtime/signal_solaris.go
+++ b/src/runtime/signal_solaris.go
@@ -32,10 +32,10 @@
 	/* 19 */ {_SigNotify, "SIGPWR: power-fail restart"},
 	/* 20 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 21 */ {_SigNotify, "SIGURG: urgent socket condition"},
-	/* 22 */ {_SigNotify, "SIGPOLL: pollable event occured"},
+	/* 22 */ {_SigNotify, "SIGPOLL: pollable event occurred"},
 	/* 23 */ {_SigNotify + _SigDefault, "SIGSTOP: stop (cannot be caught or ignored)"},
-	/* 24 */ {0, "SIGTSTP: user stop requested from tty"},
-	/* 25 */ {0, "SIGCONT: stopped process has been continued"},
+	/* 24 */ {_SigNotify + _SigDefault, "SIGTSTP: user stop requested from tty"},
+	/* 25 */ {_SigNotify + _SigDefault, "SIGCONT: stopped process has been continued"},
 	/* 26 */ {_SigNotify + _SigDefault, "SIGTTIN: background tty read attempted"},
 	/* 27 */ {_SigNotify + _SigDefault, "SIGTTOU: background tty write attempted"},
 	/* 28 */ {_SigNotify, "SIGVTALRM: virtual timer expired"},
@@ -46,7 +46,7 @@
 	/* 33 */ {_SigNotify, "SIGLWP: reserved signal no longer used by"},
 	/* 34 */ {_SigNotify, "SIGFREEZE: special signal used by CPR"},
 	/* 35 */ {_SigNotify, "SIGTHAW: special signal used by CPR"},
-	/* 36 */ {0, "SIGCANCEL: reserved signal for thread cancellation"},
+	/* 36 */ {_SigSetStack + _SigUnblock, "SIGCANCEL: reserved signal for thread cancellation"}, // Oracle's spelling of cancelation.
 	/* 37 */ {_SigNotify, "SIGLOST: resource lost (eg, record-lock lost)"},
 	/* 38 */ {_SigNotify, "SIGXRES: resource control exceeded"},
 	/* 39 */ {_SigNotify, "SIGJVM1: reserved signal for Java Virtual Machine"},
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index ad3ab31..f59c9b9 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,43 +6,16 @@
 
 package runtime
 
-import "unsafe"
+import _ "unsafe" // for go:linkname
 
 //go:linkname os_sigpipe os.sigpipe
 func os_sigpipe() {
 	systemstack(sigpipe)
 }
 
-// Determines if the signal should be handled by Go and if not, forwards the
-// signal to the handler that was installed before Go's.  Returns whether the
-// signal was forwarded.
-//go:nosplit
-func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool {
-	g := getg()
-	c := &sigctxt{info, ctx}
+func signame(sig uint32) string {
 	if sig >= uint32(len(sigtable)) {
-		return false
+		return ""
 	}
-	fwdFn := fwdSig[sig]
-	flags := sigtable[sig].flags
-
-	// If there is no handler to forward to, no need to forward.
-	if fwdFn == _SIG_DFL {
-		return false
-	}
-	// Only forward synchronous signals.
-	if c.sigcode() == _SI_USER || flags&_SigPanic == 0 {
-		return false
-	}
-	// Determine if the signal occurred inside Go code.  We test that:
-	//   (1) we were in a goroutine (i.e., m.curg != nil), and
-	//   (2) we weren't in CGO (i.e., m.curg.syscallsp == 0).
-	if g != nil && g.m != nil && g.m.curg != nil && g.m.curg.syscallsp == 0 {
-		return false
-	}
-	// Signal not handled by Go, forward it.
-	if fwdFn != _SIG_IGN {
-		sigfwd(fwdFn, sig, info, ctx)
-	}
-	return true
+	return sigtable[sig].name
 }
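
The new signame is a bounds-checked table lookup that degrades to the empty string rather than faulting in a signal context. The same pattern in miniature, with a made-up three-entry table:

	package main

	import "fmt"

	var names = [...]string{
		0: "SIGNONE: no trap",
		1: "SIGHUP: terminal line hangup",
		2: "SIGINT: interrupt",
	}

	// name follows signame above: out-of-range lookups yield "".
	func name(sig uint32) string {
		if sig >= uint32(len(names)) {
			return ""
		}
		return names[sig]
	}

	func main() {
		fmt.Println(name(2), name(99) == "") // SIGINT: interrupt true
	}
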
diff --git a/src/runtime/signal_windows.go b/src/runtime/signal_windows.go
index d80cc97..298dcc9 100644
--- a/src/runtime/signal_windows.go
+++ b/src/runtime/signal_windows.go
@@ -82,13 +82,13 @@
 
 	// Only push runtime·sigpanic if r.ip() != 0.
 	// If r.ip() == 0, probably panicked because of a
-	// call to a nil func.  Not pushing that onto sp will
+	// call to a nil func. Not pushing that onto sp will
 	// make the trace look like a call to runtime·sigpanic instead.
 	// (Otherwise the trace will end at runtime·sigpanic and we
 	// won't get to see who faulted.)
 	if r.ip() != 0 {
 		sp := unsafe.Pointer(r.sp())
-		sp = add(sp, ^uintptr(unsafe.Sizeof(uintptr(0))-1)) // sp--
+		sp = add(sp, ^(unsafe.Sizeof(uintptr(0)) - 1)) // sp--
 		*((*uintptr)(sp)) = r.ip()
 		r.setsp(uintptr(sp))
 	}
@@ -134,8 +134,8 @@
 	}
 	print("\n")
 
-	var docrash bool
-	if gotraceback(&docrash) > 0 {
+	level, _, docrash := gotraceback()
+	if level > 0 {
 		tracebacktrap(r.ip(), r.sp(), 0, gp)
 		tracebackothers(gp)
 		dumpregs(r)
@@ -155,7 +155,7 @@
 		throw("unexpected signal during runtime execution")
 	}
 
-	switch uint32(g.sig) {
+	switch g.sig {
 	case _EXCEPTION_ACCESS_VIOLATION:
 		if g.sigcode1 < 0x1000 || g.paniconfault {
 			panicmem()
@@ -191,7 +191,7 @@
 
 // Following are not implemented.
 
-func initsig() {
+func initsig(preinit bool) {
 }
 
 func sigenable(sig uint32) {
@@ -209,6 +209,10 @@
 	badsignal2()
 }
 
+func signame(sig uint32) string {
+	return ""
+}
+
 func crash() {
 	// TODO: This routine should do whatever is needed
 	// to make the Windows program abort/crash as it
diff --git a/src/runtime/sigpanic_unix.go b/src/runtime/sigpanic_unix.go
index 1ce6223..4cd8615 100644
--- a/src/runtime/sigpanic_unix.go
+++ b/src/runtime/sigpanic_unix.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 3f50a59..0162ffa 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -12,7 +12,7 @@
 // sigsend is called by the signal handler to queue a new signal.
 // signal_recv is called by the Go program to receive a newly queued signal.
 // Synchronization between sigsend and signal_recv is based on the sig.state
-// variable.  It can be in 3 states: sigIdle, sigReceiving and sigSending.
+// variable. It can be in 3 states: sigIdle, sigReceiving and sigSending.
 // sigReceiving means that signal_recv is blocked on sig.Note and there are no
 // new pending signals.
 // sigSending means that sig.mask *may* contain new pending signals,
@@ -28,15 +28,19 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	_ "unsafe" // for go:linkname
+)
 
 var sig struct {
-	note   note
-	mask   [(_NSIG + 31) / 32]uint32
-	wanted [(_NSIG + 31) / 32]uint32
-	recv   [(_NSIG + 31) / 32]uint32
-	state  uint32
-	inuse  bool
+	note    note
+	mask    [(_NSIG + 31) / 32]uint32
+	wanted  [(_NSIG + 31) / 32]uint32
+	ignored [(_NSIG + 31) / 32]uint32
+	recv    [(_NSIG + 31) / 32]uint32
+	state   uint32
+	inuse   bool
 }
 
 const (
@@ -49,7 +53,7 @@
 // Reports whether the signal was sent. If not, the caller typically crashes the program.
 func sigsend(s uint32) bool {
 	bit := uint32(1) << uint(s&31)
-	if !sig.inuse || s < 0 || int(s) >= 32*len(sig.wanted) || sig.wanted[s/32]&bit == 0 {
+	if !sig.inuse || s >= uint32(32*len(sig.wanted)) || sig.wanted[s/32]&bit == 0 {
 		return false
 	}
 
@@ -59,7 +63,7 @@
 		if mask&bit != 0 {
 			return true // signal already in queue
 		}
-		if cas(&sig.mask[s/32], mask, mask|bit) {
+		if atomic.Cas(&sig.mask[s/32], mask, mask|bit) {
 			break
 		}
 	}
@@ -67,18 +71,18 @@
 	// Notify receiver that queue has new bit.
 Send:
 	for {
-		switch atomicload(&sig.state) {
+		switch atomic.Load(&sig.state) {
 		default:
 			throw("sigsend: inconsistent state")
 		case sigIdle:
-			if cas(&sig.state, sigIdle, sigSending) {
+			if atomic.Cas(&sig.state, sigIdle, sigSending) {
 				break Send
 			}
 		case sigSending:
 			// notification already pending
 			break Send
 		case sigReceiving:
-			if cas(&sig.state, sigReceiving, sigIdle) {
+			if atomic.Cas(&sig.state, sigReceiving, sigIdle) {
 				notewakeup(&sig.note)
 				break Send
 			}
@@ -90,6 +94,7 @@
 
 // Called to receive the next queued signal.
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_recv os/signal.signal_recv
 func signal_recv() uint32 {
 	for {
 		// Serve any signals from local copy.
@@ -103,17 +108,17 @@
 		// Wait for updates to be available from signal sender.
 	Receive:
 		for {
-			switch atomicload(&sig.state) {
+			switch atomic.Load(&sig.state) {
 			default:
 				throw("signal_recv: inconsistent state")
 			case sigIdle:
-				if cas(&sig.state, sigIdle, sigReceiving) {
+				if atomic.Cas(&sig.state, sigIdle, sigReceiving) {
 					notetsleepg(&sig.note, -1)
 					noteclear(&sig.note)
 					break Receive
 				}
 			case sigSending:
-				if cas(&sig.state, sigSending, sigIdle) {
+				if atomic.Cas(&sig.state, sigSending, sigIdle) {
 					break Receive
 				}
 			}
@@ -121,32 +126,35 @@
 
 		// Incorporate updates from sender into local copy.
 		for i := range sig.mask {
-			sig.recv[i] = xchg(&sig.mask[i], 0)
+			sig.recv[i] = atomic.Xchg(&sig.mask[i], 0)
 		}
 	}
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_enable os/signal.signal_enable
 func signal_enable(s uint32) {
 	if !sig.inuse {
 		// The first call to signal_enable is for us
-		// to use for initialization.  It does not pass
+		// to use for initialization. It does not pass
 		// signal information in m.
 		sig.inuse = true // enable reception of signals; cannot disable
 		noteclear(&sig.note)
 		return
 	}
 
-	if int(s) >= len(sig.wanted)*32 {
+	if s >= uint32(len(sig.wanted)*32) {
 		return
 	}
 	sig.wanted[s/32] |= 1 << (s & 31)
+	sig.ignored[s/32] &^= 1 << (s & 31)
 	sigenable(s)
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_disable os/signal.signal_disable
 func signal_disable(s uint32) {
-	if int(s) >= len(sig.wanted)*32 {
+	if s >= uint32(len(sig.wanted)*32) {
 		return
 	}
 	sig.wanted[s/32] &^= 1 << (s & 31)
@@ -154,25 +162,17 @@
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_ignore os/signal.signal_ignore
 func signal_ignore(s uint32) {
-	if int(s) >= len(sig.wanted)*32 {
+	if s >= uint32(len(sig.wanted)*32) {
 		return
 	}
 	sig.wanted[s/32] &^= 1 << (s & 31)
+	sig.ignored[s/32] |= 1 << (s & 31)
 	sigignore(s)
 }
 
-// This runs on a foreign stack, without an m or a g.  No stack split.
-//go:nosplit
-//go:norace
-func badsignal(sig uintptr) {
-	cgocallback(unsafe.Pointer(funcPC(badsignalgo)), noescape(unsafe.Pointer(&sig)), unsafe.Sizeof(sig))
-}
-
-func badsignalgo(sig uintptr) {
-	if !sigsend(uint32(sig)) {
-		// A foreign thread received the signal sig, and the
-		// Go code does not want to handle it.
-		raisebadsignal(int32(sig))
-	}
+// Checked by signal handlers.
+func signal_ignored(s uint32) bool {
+	return sig.ignored[s/32]&(1<<(s&31)) != 0
 }
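
signal_enable, signal_ignore, and the new signal_ignored above keep parallel bitsets, with signal s stored in word s/32 at bit s&31; enabling a signal clears its ignored bit and ignoring it clears the wanted bit. A runnable sketch of that bookkeeping (nsig and the function names are illustrative):

	package main

	import "fmt"

	const nsig = 65

	var wanted, ignored [(nsig + 31) / 32]uint32

	func enable(s uint32) {
		wanted[s/32] |= 1 << (s & 31)
		ignored[s/32] &^= 1 << (s & 31)
	}

	func ignore(s uint32) {
		wanted[s/32] &^= 1 << (s & 31)
		ignored[s/32] |= 1 << (s & 31)
	}

	func isIgnored(s uint32) bool {
		return ignored[s/32]&(1<<(s&31)) != 0
	}

	func main() {
		enable(34)
		fmt.Println(isIgnored(34)) // false
		ignore(34)
		fmt.Println(isIgnored(34)) // true
	}
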
diff --git a/src/runtime/sigqueue_plan9.go b/src/runtime/sigqueue_plan9.go
index f000fab..575d26a 100644
--- a/src/runtime/sigqueue_plan9.go
+++ b/src/runtime/sigqueue_plan9.go
@@ -6,6 +6,8 @@
 
 package runtime
 
+import _ "unsafe"
+
 const qsize = 64
 
 var sig struct {
@@ -92,6 +94,7 @@
 
 // Called to receive the next queued signal.
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_recv os/signal.signal_recv
 func signal_recv() string {
 	for {
 		note := sig.q.pop()
@@ -108,10 +111,11 @@
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_enable os/signal.signal_enable
 func signal_enable(s uint32) {
 	if !sig.inuse {
 		// The first call to signal_enable is for us
-		// to use for initialization.  It does not pass
+		// to use for initialization. It does not pass
 		// signal information in m.
 		sig.inuse = true // enable reception of signals; cannot disable
 		noteclear(&sig.note)
@@ -120,9 +124,11 @@
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_disable os/signal.signal_disable
 func signal_disable(s uint32) {
 }
 
 // Must only be called from a single goroutine at a time.
+//go:linkname signal_ignore os/signal.signal_ignore
 func signal_ignore(s uint32) {
 }
diff --git a/src/runtime/signal_linux.go b/src/runtime/sigtab_linux_generic.go
similarity index 84%
rename from src/runtime/signal_linux.go
rename to src/runtime/sigtab_linux_generic.go
index 2f25b59..ea36bf3 100644
--- a/src/runtime/signal_linux.go
+++ b/src/runtime/sigtab_linux_generic.go
@@ -2,9 +2,11 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-package runtime
+// +build !mips64
+// +build !mips64le
+// +build linux
 
-import "unsafe"
+package runtime
 
 type sigTabT struct {
 	flags int32
@@ -30,7 +32,7 @@
 	/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
 	/* 16 */ {_SigThrow + _SigUnblock, "SIGSTKFLT: stack fault"},
 	/* 17 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
-	/* 18 */ {0, "SIGCONT: continue"},
+	/* 18 */ {_SigNotify + _SigDefault, "SIGCONT: continue"},
 	/* 19 */ {0, "SIGSTOP: stop, unblockable"},
 	/* 20 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
 	/* 21 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
@@ -43,9 +45,9 @@
 	/* 28 */ {_SigNotify, "SIGWINCH: window size change"},
 	/* 29 */ {_SigNotify, "SIGIO: i/o now possible"},
 	/* 30 */ {_SigNotify, "SIGPWR: power failure restart"},
-	/* 31 */ {_SigNotify, "SIGSYS: bad system call"},
-	/* 32 */ {_SigSetStack, "signal 32"}, /* SIGCANCEL; see issue 6997 */
-	/* 33 */ {_SigSetStack, "signal 33"}, /* SIGSETXID; see issue 3871, 9400 */
+	/* 31 */ {_SigThrow, "SIGSYS: bad system call"},
+	/* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */
+	/* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */
 	/* 34 */ {_SigNotify, "signal 34"},
 	/* 35 */ {_SigNotify, "signal 35"},
 	/* 36 */ {_SigNotify, "signal 36"},
@@ -78,19 +80,3 @@
 	/* 63 */ {_SigNotify, "signal 63"},
 	/* 64 */ {_SigNotify, "signal 64"},
 }
-
-// Continuation of the (assembly) sigtramp() logic.
-//go:nosplit
-func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
-	if sigfwdgo(sig, info, ctx) {
-		return
-	}
-	g := getg()
-	if g == nil {
-		badsignal(uintptr(sig))
-		return
-	}
-	setg(g.m.gsignal)
-	sighandler(sig, info, ctx, g)
-	setg(g)
-}
diff --git a/src/runtime/sigtab_linux_mips64x.go b/src/runtime/sigtab_linux_mips64x.go
new file mode 100644
index 0000000..201fe3d
--- /dev/null
+++ b/src/runtime/sigtab_linux_mips64x.go
@@ -0,0 +1,81 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+// +build linux
+
+package runtime
+
+type sigTabT struct {
+	flags int32
+	name  string
+}
+
+var sigtable = [...]sigTabT{
+	/* 0 */ {0, "SIGNONE: no trap"},
+	/* 1 */ {_SigNotify + _SigKill, "SIGHUP: terminal line hangup"},
+	/* 2 */ {_SigNotify + _SigKill, "SIGINT: interrupt"},
+	/* 3 */ {_SigNotify + _SigThrow, "SIGQUIT: quit"},
+	/* 4 */ {_SigThrow + _SigUnblock, "SIGILL: illegal instruction"},
+	/* 5 */ {_SigThrow + _SigUnblock, "SIGTRAP: trace trap"},
+	/* 6 */ {_SigNotify + _SigThrow, "SIGABRT: abort"},
+	/* 7 */ {_SigThrow, "SIGEMT"},
+	/* 8 */ {_SigPanic + _SigUnblock, "SIGFPE: floating-point exception"},
+	/* 9 */ {0, "SIGKILL: kill"},
+	/* 10 */ {_SigPanic + _SigUnblock, "SIGBUS: bus error"},
+	/* 11 */ {_SigPanic + _SigUnblock, "SIGSEGV: segmentation violation"},
+	/* 12 */ {_SigThrow, "SIGSYS: bad system call"},
+	/* 13 */ {_SigNotify, "SIGPIPE: write to broken pipe"},
+	/* 14 */ {_SigNotify, "SIGALRM: alarm clock"},
+	/* 15 */ {_SigNotify + _SigKill, "SIGTERM: termination"},
+	/* 16 */ {_SigNotify, "SIGUSR1: user-defined signal 1"},
+	/* 17 */ {_SigNotify, "SIGUSR2: user-defined signal 2"},
+	/* 18 */ {_SigNotify + _SigUnblock, "SIGCHLD: child status has changed"},
+	/* 19 */ {_SigNotify, "SIGPWR: power failure restart"},
+	/* 20 */ {_SigNotify, "SIGWINCH: window size change"},
+	/* 21 */ {_SigNotify, "SIGURG: urgent condition on socket"},
+	/* 22 */ {_SigNotify, "SIGIO: i/o now possible"},
+	/* 23 */ {0, "SIGSTOP: stop, unblockable"},
+	/* 24 */ {_SigNotify + _SigDefault, "SIGTSTP: keyboard stop"},
+	/* 25 */ {_SigNotify + _SigDefault, "SIGCONT: continue"},
+	/* 26 */ {_SigNotify + _SigDefault, "SIGTTIN: background read from tty"},
+	/* 27 */ {_SigNotify + _SigDefault, "SIGTTOU: background write to tty"},
+	/* 28 */ {_SigNotify, "SIGVTALRM: virtual alarm clock"},
+	/* 29 */ {_SigNotify + _SigUnblock, "SIGPROF: profiling alarm clock"},
+	/* 30 */ {_SigNotify, "SIGXCPU: cpu limit exceeded"},
+	/* 31 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
+	/* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */
+	/* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */
+	/* 34 */ {_SigNotify, "signal 34"},
+	/* 35 */ {_SigNotify, "signal 35"},
+	/* 36 */ {_SigNotify, "signal 36"},
+	/* 37 */ {_SigNotify, "signal 37"},
+	/* 38 */ {_SigNotify, "signal 38"},
+	/* 39 */ {_SigNotify, "signal 39"},
+	/* 40 */ {_SigNotify, "signal 40"},
+	/* 41 */ {_SigNotify, "signal 41"},
+	/* 42 */ {_SigNotify, "signal 42"},
+	/* 43 */ {_SigNotify, "signal 43"},
+	/* 44 */ {_SigNotify, "signal 44"},
+	/* 45 */ {_SigNotify, "signal 45"},
+	/* 46 */ {_SigNotify, "signal 46"},
+	/* 47 */ {_SigNotify, "signal 47"},
+	/* 48 */ {_SigNotify, "signal 48"},
+	/* 49 */ {_SigNotify, "signal 49"},
+	/* 50 */ {_SigNotify, "signal 50"},
+	/* 51 */ {_SigNotify, "signal 51"},
+	/* 52 */ {_SigNotify, "signal 52"},
+	/* 53 */ {_SigNotify, "signal 53"},
+	/* 54 */ {_SigNotify, "signal 54"},
+	/* 55 */ {_SigNotify, "signal 55"},
+	/* 56 */ {_SigNotify, "signal 56"},
+	/* 57 */ {_SigNotify, "signal 57"},
+	/* 58 */ {_SigNotify, "signal 58"},
+	/* 59 */ {_SigNotify, "signal 59"},
+	/* 60 */ {_SigNotify, "signal 60"},
+	/* 61 */ {_SigNotify, "signal 61"},
+	/* 62 */ {_SigNotify, "signal 62"},
+	/* 63 */ {_SigNotify, "signal 63"},
+	/* 64 */ {_SigNotify, "signal 64"},
+}
diff --git a/src/runtime/slice.go b/src/runtime/slice.go
index 5cda11d..e15e6c4 100644
--- a/src/runtime/slice.go
+++ b/src/runtime/slice.go
@@ -14,88 +14,122 @@
 	cap   int
 }
 
+// maxElems is a lookup table containing the maximum capacity for a slice.
+// The index is the size of the slice element.
+var maxElems = [...]uintptr{
+	^uintptr(0),
+	_MaxMem / 1, _MaxMem / 2, _MaxMem / 3, _MaxMem / 4,
+	_MaxMem / 5, _MaxMem / 6, _MaxMem / 7, _MaxMem / 8,
+	_MaxMem / 9, _MaxMem / 10, _MaxMem / 11, _MaxMem / 12,
+	_MaxMem / 13, _MaxMem / 14, _MaxMem / 15, _MaxMem / 16,
+	_MaxMem / 17, _MaxMem / 18, _MaxMem / 19, _MaxMem / 20,
+	_MaxMem / 21, _MaxMem / 22, _MaxMem / 23, _MaxMem / 24,
+	_MaxMem / 25, _MaxMem / 26, _MaxMem / 27, _MaxMem / 28,
+	_MaxMem / 29, _MaxMem / 30, _MaxMem / 31, _MaxMem / 32,
+}
+
+// maxSliceCap returns the maximum capacity for a slice.
+func maxSliceCap(elemsize uintptr) uintptr {
+	if elemsize < uintptr(len(maxElems)) {
+		return maxElems[elemsize]
+	}
+	return _MaxMem / elemsize
+}
+
 // TODO: take uintptrs instead of int64s?
-func makeslice(t *slicetype, len64, cap64 int64) slice {
-	// NOTE: The len > MaxMem/elemsize check here is not strictly necessary,
+func makeslice(et *_type, len64, cap64 int64) slice {
+	// NOTE: The len > maxElements check here is not strictly necessary,
 	// but it produces a 'len out of range' error instead of a 'cap out of range' error
 	// when someone does make([]T, bignumber). 'cap out of range' is true too,
 	// but since the cap is only being supplied implicitly, saying len is clearer.
 	// See issue 4085.
+	maxElements := maxSliceCap(et.size)
 	len := int(len64)
-	if len64 < 0 || int64(len) != len64 || t.elem.size > 0 && uintptr(len) > _MaxMem/uintptr(t.elem.size) {
+	if len64 < 0 || int64(len) != len64 || uintptr(len) > maxElements {
 		panic(errorString("makeslice: len out of range"))
 	}
+
 	cap := int(cap64)
-	if cap < len || int64(cap) != cap64 || t.elem.size > 0 && uintptr(cap) > _MaxMem/uintptr(t.elem.size) {
+	if cap < len || int64(cap) != cap64 || uintptr(cap) > maxElements {
 		panic(errorString("makeslice: cap out of range"))
 	}
-	p := newarray(t.elem, uintptr(cap))
+
+	p := mallocgc(et.size*uintptr(cap), et, true)
 	return slice{p, len, cap}
 }
 
-// growslice_n is a variant of growslice that takes the number of new elements
-// instead of the new minimum capacity.
-// TODO(rsc): This is used by append(slice, slice...).
-// The compiler should change that code to use growslice directly (issue #11419).
-func growslice_n(t *slicetype, old slice, n int) slice {
-	if n < 1 {
-		panic(errorString("growslice: invalid n"))
-	}
-	return growslice(t, old, old.cap+n)
-}
-
 // growslice handles slice growth during append.
-// It is passed the slice type, the old slice, and the desired new minimum capacity,
+// It is passed the slice element type, the old slice, and the desired new minimum capacity,
 // and it returns a new slice with at least that capacity, with the old data
 // copied into it.
-func growslice(t *slicetype, old slice, cap int) slice {
-	if cap < old.cap || t.elem.size > 0 && uintptr(cap) > _MaxMem/uintptr(t.elem.size) {
-		panic(errorString("growslice: cap out of range"))
-	}
-
+// The new slice's length is set to the old slice's length,
+// NOT to the new requested capacity.
+// This is for codegen convenience. The old slice's length is used immediately
+// to calculate where to write new values during an append.
+// TODO: When the old backend is gone, reconsider this decision.
+// The SSA backend might prefer the new length or to return only ptr/cap and save stack space.
+func growslice(et *_type, old slice, cap int) slice {
 	if raceenabled {
-		callerpc := getcallerpc(unsafe.Pointer(&t))
-		racereadrangepc(old.array, uintptr(old.len*int(t.elem.size)), callerpc, funcPC(growslice))
+		callerpc := getcallerpc(unsafe.Pointer(&et))
+		racereadrangepc(old.array, uintptr(old.len*int(et.size)), callerpc, funcPC(growslice))
+	}
+	if msanenabled {
+		msanread(old.array, uintptr(old.len*int(et.size)))
 	}
 
-	et := t.elem
 	if et.size == 0 {
+		if cap < old.cap {
+			panic(errorString("growslice: cap out of range"))
+		}
 		// append should not create a slice with nil pointer but non-zero len.
 		// We assume that append doesn't need to preserve old.array in this case.
 		return slice{unsafe.Pointer(&zerobase), old.len, cap}
 	}
 
 	newcap := old.cap
-	if newcap+newcap < cap {
+	doublecap := newcap + newcap
+	if cap > doublecap {
 		newcap = cap
 	} else {
-		for {
-			if old.len < 1024 {
-				newcap += newcap
-			} else {
+		if old.len < 1024 {
+			newcap = doublecap
+		} else {
+			for newcap < cap {
 				newcap += newcap / 4
 			}
-			if newcap >= cap {
-				break
-			}
 		}
 	}
 
-	if uintptr(newcap) >= _MaxMem/uintptr(et.size) {
+	var lenmem, capmem uintptr
+	const ptrSize = unsafe.Sizeof((*byte)(nil))
+	switch et.size {
+	case 1:
+		lenmem = uintptr(old.len)
+		capmem = roundupsize(uintptr(newcap))
+		newcap = int(capmem)
+	case ptrSize:
+		lenmem = uintptr(old.len) * ptrSize
+		capmem = roundupsize(uintptr(newcap) * ptrSize)
+		newcap = int(capmem / ptrSize)
+	default:
+		lenmem = uintptr(old.len) * et.size
+		capmem = roundupsize(uintptr(newcap) * et.size)
+		newcap = int(capmem / et.size)
+	}
+
+	if cap < old.cap || uintptr(newcap) > maxSliceCap(et.size) {
 		panic(errorString("growslice: cap out of range"))
 	}
-	lenmem := uintptr(old.len) * uintptr(et.size)
-	capmem := roundupsize(uintptr(newcap) * uintptr(et.size))
-	newcap = int(capmem / uintptr(et.size))
+
 	var p unsafe.Pointer
 	if et.kind&kindNoPointers != 0 {
-		p = rawmem(capmem)
+		p = mallocgc(capmem, nil, false)
 		memmove(p, old.array, lenmem)
 		memclr(add(p, lenmem), capmem-lenmem)
 	} else {
 		// Note: can't use rawmem (which avoids zeroing of memory), because then GC can scan uninitialized memory.
-		p = newarray(et, uintptr(newcap))
-		if !writeBarrierEnabled {
+		p = mallocgc(capmem, et, true)
+		if !writeBarrier.enabled {
 			memmove(p, old.array, lenmem)
 		} else {
 			for i := uintptr(0); i < lenmem; i += et.size {
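
The rewritten growslice separates the capacity policy from the size computation: double the capacity while the slice is shorter than 1024 elements, otherwise grow by 25% per step until the requested capacity is reached, and only afterwards round for the allocator. A standalone sketch of just the policy (roundupsize omitted; nextCap is an illustrative name):

	package main

	import "fmt"

	// nextCap mirrors the capacity policy in growslice above.
	func nextCap(oldLen, oldCap, want int) int {
		newcap := oldCap
		doublecap := newcap + newcap
		if want > doublecap {
			return want
		}
		if oldLen < 1024 {
			return doublecap
		}
		for newcap < want {
			newcap += newcap / 4
		}
		return newcap
	}

	func main() {
		fmt.Println(nextCap(4, 4, 5))          // 8: small slices double
		fmt.Println(nextCap(2000, 2000, 2001)) // 2500: large slices grow 25%
	}
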
@@ -127,6 +161,10 @@
 		racewriterangepc(to.array, uintptr(n*int(width)), callerpc, pc)
 		racereadrangepc(fm.array, uintptr(n*int(width)), callerpc, pc)
 	}
+	if msanenabled {
+		msanwrite(to.array, uintptr(n*int(width)))
+		msanread(fm.array, uintptr(n*int(width)))
+	}
 
 	size := uintptr(n) * width
 	if size == 1 { // common case worth about 2x to do here
@@ -135,7 +173,7 @@
 	} else {
 		memmove(to.array, fm.array, size)
 	}
-	return int(n)
+	return n
 }
 
 func slicestringcopy(to []byte, fm string) int {
@@ -153,7 +191,10 @@
 		pc := funcPC(slicestringcopy)
 		racewriterangepc(unsafe.Pointer(&to[0]), uintptr(n), callerpc, pc)
 	}
+	if msanenabled {
+		msanwrite(unsafe.Pointer(&to[0]), uintptr(n))
+	}
 
-	memmove(unsafe.Pointer(&to[0]), unsafe.Pointer((*stringStruct)(unsafe.Pointer(&fm)).str), uintptr(n))
+	memmove(unsafe.Pointer(&to[0]), stringStructOf(&fm).str, uintptr(n))
 	return n
 }
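
maxElems/maxSliceCap at the top of this file trade a small lookup table for the division in the old `_MaxMem/uintptr(t.elem.size)` checks, so the common small element sizes avoid a divide on every makeslice. The idea in miniature, with an assumed placeholder limit instead of the runtime's real _MaxMem:

	package main

	import "fmt"

	const maxMem = 1 << 30 // assumed limit for this sketch only

	var maxElems = [...]uintptr{
		^uintptr(0),
		maxMem / 1, maxMem / 2, maxMem / 3, maxMem / 4,
	}

	// maxCap follows maxSliceCap above: table load for small element
	// sizes, division for the rest.
	func maxCap(elemsize uintptr) uintptr {
		if elemsize < uintptr(len(maxElems)) {
			return maxElems[elemsize]
		}
		return maxMem / elemsize
	}

	func main() {
		fmt.Println(maxCap(2)) // 1<<29, from the table
		fmt.Println(maxCap(8)) // 1<<27, computed by division
	}
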
diff --git a/src/runtime/softfloat64.go b/src/runtime/softfloat64.go
index 790dbda..1678e8f 100644
--- a/src/runtime/softfloat64.go
+++ b/src/runtime/softfloat64.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/softfloat64_test.go b/src/runtime/softfloat64_test.go
index e108872..7347aff 100644
--- a/src/runtime/softfloat64_test.go
+++ b/src/runtime/softfloat64_test.go
@@ -1,4 +1,4 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/softfloat_arm.go b/src/runtime/softfloat_arm.go
index c6eba58..5f609c8 100644
--- a/src/runtime/softfloat_arm.go
+++ b/src/runtime/softfloat_arm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// Software floating point interpretaton of ARM 7500 FP instructions.
+// Software floating point interpretation of ARM 7500 FP instructions.
 // The interpretation is not bit compatible with the 7500.
 // It uses true little-endian doubles, while the 7500 used mixed-endian.
 
@@ -157,14 +157,26 @@
 		}
 		return 1
 	}
-	if i == 0xe08bb00d {
-		// add sp to r11.
-		// might be part of a large stack offset address
+	if i == 0xe08fb00b {
+		// add pc to r11
+		// might be part of a PIC floating point move
 		// (or might not, but again no harm done).
-		regs[11] += regs[13]
+		regs[11] += uint32(uintptr(unsafe.Pointer(pc))) + 8
 
 		if fptrace > 0 {
-			print("*** cpu R[11] += R[13] ", hex(regs[11]), "\n")
+			print("*** cpu R[11] += pc ", hex(regs[11]), "\n")
+		}
+		return 1
+	}
+	if i&0xfffffff0 == 0xe08bb000 {
+		r := i & 0xf
+		// add r to r11.
+		// might be part of a large offset address calculation
+		// (or might not, but again no harm done).
+		regs[11] += regs[r]
+
+		if fptrace > 0 {
+			print("*** cpu R[11] += R[", r, "] ", hex(regs[11]), "\n")
 		}
 		return 1
 	}
@@ -518,7 +530,7 @@
 	case 0xeeb80ac0: // D[regd] = S[regm] (MOVWF)
 		cmp := int32(m.freglo[regm])
 		if cmp < 0 {
-			fputf(regd, f64to32(fintto64(int64(-cmp))))
+			fputf(regd, f64to32(fintto64(-int64(cmp))))
 			m.freglo[regd] ^= 0x80000000
 		} else {
 			fputf(regd, f64to32(fintto64(int64(cmp))))
@@ -540,7 +552,7 @@
 	case 0xeeb80bc0: // D[regd] = S[regm] (MOVWD)
 		cmp := int32(m.freglo[regm])
 		if cmp < 0 {
-			fputd(regd, fintto64(int64(-cmp)))
+			fputd(regd, fintto64(-int64(cmp)))
 			m.freghi[regd] ^= 0x80000000
 		} else {
 			fputd(regd, fintto64(int64(cmp)))
@@ -598,7 +610,7 @@
 			pc = uint32(funcPC(_sfloatpanic))
 			break
 		}
-		pc += 4 * uint32(skip)
+		pc += 4 * skip
 	}
 	if first {
 		print("sfloat2 ", pc, " ", hex(*(*uint32)(unsafe.Pointer(uintptr(pc)))), "\n")
diff --git a/src/runtime/sqrt.go b/src/runtime/sqrt.go
index 7452a61..d71a498 100644
--- a/src/runtime/sqrt.go
+++ b/src/runtime/sqrt.go
@@ -11,7 +11,7 @@
 
 // The original C code and the long comment below are
 // from FreeBSD's /usr/src/lib/msun/src/e_sqrt.c and
-// came with this notice.  The go code is a simplified
+// came with this notice. The go code is a simplified
 // version of the original C.
 //
 // ====================================================
@@ -117,7 +117,7 @@
 	// normalize x
 	exp := int((ix >> float64Shift) & float64Mask)
 	if exp == 0 { // subnormal x
-		for ix&1<<float64Shift == 0 {
+		for ix&(1<<float64Shift) == 0 {
 			ix <<= 1
 			exp--
 		}
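
The added parentheses are a real fix, not cosmetics: `&` and `<<` share a precedence level in Go, so the old `ix&1<<float64Shift` parsed as `(ix&1)<<float64Shift` and tested the mantissa's low bit, mishandling subnormal inputs like the math.Float64frombits(2) case added to sqrt_test.go below. A sketch of what the corrected loop computes:

	package main

	import "fmt"

	const float64Shift = 52 // mantissa width, as in the code above

	// normalize mirrors the fixed subnormal loop: shift the mantissa
	// left until the implicit leading bit reaches bit float64Shift,
	// decrementing the exponent for every shift.
	func normalize(ix uint64, exp int) (uint64, int) {
		for ix&(1<<float64Shift) == 0 {
			ix <<= 1
			exp--
		}
		return ix, exp
	}

	func main() {
		ix, exp := normalize(2, 0)      // the bits of Float64frombits(2)
		fmt.Printf("%#x %d\n", ix, exp) // 0x10000000000000 -51
	}
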
diff --git a/src/runtime/sqrt_test.go b/src/runtime/sqrt_test.go
index d5ccc7f..54539e1 100644
--- a/src/runtime/sqrt_test.go
+++ b/src/runtime/sqrt_test.go
@@ -74,6 +74,7 @@
 	0,
 	math.Inf(1),
 	math.NaN(),
+	math.Float64frombits(2),
 }
 var sqrtSC = []float64{
 	math.NaN(),
@@ -82,4 +83,5 @@
 	0,
 	math.Inf(1),
 	math.NaN(),
+	3.1434555694052576e-162,
 }
diff --git a/src/runtime/stack1.go b/src/runtime/stack.go
similarity index 63%
rename from src/runtime/stack1.go
rename to src/runtime/stack.go
index efcb5f2..8398a10 100644
--- a/src/runtime/stack1.go
+++ b/src/runtime/stack.go
@@ -4,7 +4,112 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+/*
+Stack layout parameters.
+Included both by runtime (compiled via 6c) and linkers (compiled via gcc).
+
+The per-goroutine g->stackguard is set to point StackGuard bytes
+above the bottom of the stack. Each function compares its stack
+pointer against g->stackguard to check for overflow. To cut one
+instruction from the check sequence for functions with tiny frames,
+the stack is allowed to protrude StackSmall bytes below the stack
+guard. Functions with large frames don't bother with the check and
+always call morestack. The sequences are (for amd64, others are
+similar):
+
+	guard = g->stackguard
+	frame = function's stack frame size
+	argsize = size of function arguments (call + return)
+
+	stack frame size <= StackSmall:
+		CMPQ guard, SP
+		JHI 3(PC)
+		MOVQ m->morearg, $(argsize << 32)
+		CALL morestack(SB)
+
+	stack frame size > StackSmall but < StackBig
+		LEAQ (frame-StackSmall)(SP), R0
+		CMPQ guard, R0
+		JHI 3(PC)
+		MOVQ m->morearg, $(argsize << 32)
+		CALL morestack(SB)
+
+	stack frame size >= StackBig:
+		MOVQ m->morearg, $((argsize << 32) | frame)
+		CALL morestack(SB)
+
+The bottom StackGuard - StackSmall bytes are important: there has
+to be enough room to execute functions that refuse to check for
+stack overflow, either because they need to be adjacent to the
+actual caller's frame (deferproc) or because they handle the imminent
+stack overflow (morestack).
+
+For example, deferproc might call malloc, which does one of the
+above checks (without allocating a full frame), which might trigger
+a call to morestack. This sequence needs to fit in the bottom
+section of the stack. On amd64, morestack's frame is 40 bytes, and
+deferproc's frame is 56 bytes. That fits well within the
+StackGuard - StackSmall bytes at the bottom.
+The linkers explore all possible call traces involving non-splitting
+functions to make sure that this limit cannot be violated.
+*/
+
+const (
+	// StackSystem is a number of additional bytes to add
+	// to each stack below the usual guard area for OS-specific
+	// purposes like signal handling. Used on Windows, Plan 9,
+	// and Darwin/ARM because they do not use a separate stack.
+	_StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + sys.GoosDarwin*sys.GoarchArm*1024
+
+	// The minimum size of stack used by Go code
+	_StackMin = 2048
+
+	// The minimum stack size to allocate.
+	// The hackery here rounds FixedStack0 up to a power of 2.
+	_FixedStack0 = _StackMin + _StackSystem
+	_FixedStack1 = _FixedStack0 - 1
+	_FixedStack2 = _FixedStack1 | (_FixedStack1 >> 1)
+	_FixedStack3 = _FixedStack2 | (_FixedStack2 >> 2)
+	_FixedStack4 = _FixedStack3 | (_FixedStack3 >> 4)
+	_FixedStack5 = _FixedStack4 | (_FixedStack4 >> 8)
+	_FixedStack6 = _FixedStack5 | (_FixedStack5 >> 16)
+	_FixedStack  = _FixedStack6 + 1
+
+	// Functions that need frames bigger than this use an extra
+	// instruction to do the stack split check, to avoid overflow
+	// in case SP - framesize wraps below zero.
+	// This value can be no bigger than the size of the unmapped
+	// space at zero.
+	_StackBig = 4096
+
+	// The stack guard is a pointer this many bytes above the
+	// bottom of the stack.
+	_StackGuard = 720*sys.StackGuardMultiplier + _StackSystem
+
+	// After a stack split check the SP is allowed to be this
+	// many bytes below the stack guard. This saves an instruction
+	// in the checking sequence for tiny frames.
+	_StackSmall = 128
+
+	// The maximum number of bytes that a chain of NOSPLIT
+	// functions can use.
+	_StackLimit = _StackGuard - _StackSystem - _StackSmall
+)
+
+// Goroutine preemption request.
+// Stored into g->stackguard0 to cause split stack check failure.
+// Must be greater than any real sp.
+// 0xfffffade in hex.
+const (
+	_StackPreempt = uintptrMask & -1314
+	_StackFork    = uintptrMask & -1234
+)
 
 const (
 	// stackDebug == 0: no logging
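
The _FixedStack0.._FixedStack6 ladder above is the standard round-up-to-a-power-of-two bit trick unrolled into constants: subtract one, smear the highest set bit rightward with or-shifts, add one back. As a function (roundPow2 is an illustrative name; the constant version stops at >>16 because stack sizes fit easily in 32 bits):

	package main

	import "fmt"

	// roundPow2 rounds x up to the next power of two, exactly as the
	// _FixedStack constants do.
	func roundPow2(x uint32) uint32 {
		x--
		x |= x >> 1
		x |= x >> 2
		x |= x >> 4
		x |= x >> 8
		x |= x >> 16
		return x + 1
	}

	func main() {
		fmt.Println(roundPow2(2048))       // 2048: already a power of two
		fmt.Println(roundPow2(2048 + 512)) // 4096: e.g. _StackMin+_StackSystem on Plan 9
	}
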
@@ -21,8 +126,7 @@
 )
 
 const (
-	uintptrMask = 1<<(8*ptrSize) - 1
-	poisonStack = uintptrMask & 0x6868686868686868
+	uintptrMask = 1<<(8*sys.PtrSize) - 1
 
 	// Goroutine preemption request.
 	// Stored into g->stackguard0 to cause split stack check failure.
@@ -41,77 +145,88 @@
 //     order = log_2(size/FixedStack)
 // There is a free list for each order.
 // TODO: one lock per order?
-var stackpool [_NumStackOrders]mspan
+var stackpool [_NumStackOrders]mSpanList
 var stackpoolmu mutex
 
-// List of stack spans to be freed at the end of GC. Protected by
-// stackpoolmu.
-var stackFreeQueue mspan
-
-// Cached value of haveexperiment("framepointer")
-var framepointer_enabled bool
+// Global pool of large stack spans.
+var stackLarge struct {
+	lock mutex
+	free [_MHeapMap_Bits]mSpanList // free lists by log_2(s.npages)
+}
 
 func stackinit() {
 	if _StackCacheSize&_PageMask != 0 {
 		throw("cache size must be a multiple of page size")
 	}
 	for i := range stackpool {
-		mSpanList_Init(&stackpool[i])
+		stackpool[i].init()
 	}
-	mSpanList_Init(&stackFreeQueue)
+	for i := range stackLarge.free {
+		stackLarge.free[i].init()
+	}
 }
 
-// Allocates a stack from the free pool.  Must be called with
+// stacklog2 returns ⌊log_2(n)⌋.
+func stacklog2(n uintptr) int {
+	log2 := 0
+	for n > 1 {
+		n >>= 1
+		log2++
+	}
+	return log2
+}
+
+// Allocates a stack from the free pool. Must be called with
 // stackpoolmu held.
 func stackpoolalloc(order uint8) gclinkptr {
 	list := &stackpool[order]
-	s := list.next
-	if s == list {
-		// no free stacks.  Allocate another span worth.
-		s = mHeap_AllocStack(&mheap_, _StackCacheSize>>_PageShift)
+	s := list.first
+	if s == nil {
+		// no free stacks. Allocate another span worth.
+		s = mheap_.allocStack(_StackCacheSize >> _PageShift)
 		if s == nil {
 			throw("out of memory")
 		}
-		if s.ref != 0 {
-			throw("bad ref")
+		if s.allocCount != 0 {
+			throw("bad allocCount")
 		}
-		if s.freelist.ptr() != nil {
-			throw("bad freelist")
+		if s.stackfreelist.ptr() != nil {
+			throw("bad stackfreelist")
 		}
 		for i := uintptr(0); i < _StackCacheSize; i += _FixedStack << order {
-			x := gclinkptr(uintptr(s.start)<<_PageShift + i)
-			x.ptr().next = s.freelist
-			s.freelist = x
+			x := gclinkptr(s.base() + i)
+			x.ptr().next = s.stackfreelist
+			s.stackfreelist = x
 		}
-		mSpanList_Insert(list, s)
+		list.insert(s)
 	}
-	x := s.freelist
+	x := s.stackfreelist
 	if x.ptr() == nil {
 		throw("span has no free stacks")
 	}
-	s.freelist = x.ptr().next
-	s.ref++
-	if s.freelist.ptr() == nil {
+	s.stackfreelist = x.ptr().next
+	s.allocCount++
+	if s.stackfreelist.ptr() == nil {
 		// all stacks in s are allocated.
-		mSpanList_Remove(s)
+		list.remove(s)
 	}
 	return x
 }
 
-// Adds stack x to the free pool.  Must be called with stackpoolmu held.
+// Adds stack x to the free pool. Must be called with stackpoolmu held.
 func stackpoolfree(x gclinkptr, order uint8) {
-	s := mHeap_Lookup(&mheap_, (unsafe.Pointer)(x))
+	s := mheap_.lookup(unsafe.Pointer(x))
 	if s.state != _MSpanStack {
 		throw("freeing stack not in a stack span")
 	}
-	if s.freelist.ptr() == nil {
+	if s.stackfreelist.ptr() == nil {
 		// s will now have a free stack
-		mSpanList_Insert(&stackpool[order], s)
+		stackpool[order].insert(s)
 	}
-	x.ptr().next = s.freelist
-	s.freelist = x
-	s.ref--
-	if gcphase == _GCoff && s.ref == 0 {
+	x.ptr().next = s.stackfreelist
+	s.stackfreelist = x
+	s.allocCount--
+	if gcphase == _GCoff && s.allocCount == 0 {
 		// Span is completely free. Return it to the heap
 		// immediately if we're sweeping.
 		//
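
stackLarge, introduced above, bins free large-stack spans by ⌊log2⌋ of their page count using the stacklog2 helper defined earlier in this hunk, so a span of npages pages lands in free[stacklog2(npages)]. A runnable illustration of the binning:

	package main

	import "fmt"

	// stacklog2 returns ⌊log2(n)⌋ by repeated shifting, as above.
	func stacklog2(n uintptr) int {
		log2 := 0
		for n > 1 {
			n >>= 1
			log2++
		}
		return log2
	}

	func main() {
		for _, npages := range []uintptr{1, 2, 3, 4, 7, 8} {
			fmt.Println(npages, "->", stacklog2(npages)) // bins 0,1,1,2,2,3
		}
	}
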
@@ -127,14 +242,16 @@
 		//    pointer into a free span.
 		//
 		// By not freeing, we prevent step #4 until GC is done.
-		mSpanList_Remove(s)
-		s.freelist = 0
-		mHeap_FreeStack(&mheap_, s)
+		stackpool[order].remove(s)
+		s.stackfreelist = 0
+		mheap_.freeStack(s)
 	}
 }
 
 // stackcacherefill/stackcacherelease implement a global pool of stack segments.
 // The pool is required to prevent unlimited growth of per-thread caches.
+//
+//go:systemstack
 func stackcacherefill(c *mcache, order uint8) {
 	if stackDebug >= 1 {
 		print("stackcacherefill order=", order, "\n")
@@ -156,6 +273,7 @@
 	c.stackcache[order].size = size
 }
 
+//go:systemstack
 func stackcacherelease(c *mcache, order uint8) {
 	if stackDebug >= 1 {
 		print("stackcacherelease order=", order, "\n")
@@ -174,6 +292,7 @@
 	c.stackcache[order].size = size
 }
 
+//go:systemstack
 func stackcache_clear(c *mcache) {
 	if stackDebug >= 1 {
 		print("stackcache clear\n")
@@ -192,6 +311,12 @@
 	unlock(&stackpoolmu)
 }
 
+// stackalloc allocates an n byte stack.
+//
+// stackalloc must run on the system stack because it uses per-P
+// resources and must not split the stack.
+//
+//go:systemstack
 func stackalloc(n uint32) (stack, []stkbar) {
 	// Stackalloc must be called on scheduler stack, so that we
 	// never try to grow the stack during the code that stackalloc runs.
@@ -251,18 +376,36 @@
 			c.stackcache[order].list = x.ptr().next
 			c.stackcache[order].size -= uintptr(n)
 		}
-		v = (unsafe.Pointer)(x)
+		v = unsafe.Pointer(x)
 	} else {
-		s := mHeap_AllocStack(&mheap_, round(uintptr(n), _PageSize)>>_PageShift)
-		if s == nil {
-			throw("out of memory")
+		var s *mspan
+		npage := uintptr(n) >> _PageShift
+		log2npage := stacklog2(npage)
+
+		// Try to get a stack from the large stack cache.
+		lock(&stackLarge.lock)
+		if !stackLarge.free[log2npage].isEmpty() {
+			s = stackLarge.free[log2npage].first
+			stackLarge.free[log2npage].remove(s)
 		}
-		v = (unsafe.Pointer)(s.start << _PageShift)
+		unlock(&stackLarge.lock)
+
+		if s == nil {
+			// Allocate a new stack from the heap.
+			s = mheap_.allocStack(npage)
+			if s == nil {
+				throw("out of memory")
+			}
+		}
+		v = unsafe.Pointer(s.base())
 	}
 
 	if raceenabled {
 		racemalloc(v, uintptr(n))
 	}
+	if msanenabled {
+		msanmalloc(v, uintptr(n))
+	}
 	if stackDebug >= 1 {
 		print("  allocated ", v, "\n")
 	}
@@ -271,9 +414,15 @@
 	return stack{uintptr(v), uintptr(v) + top}, *(*[]stkbar)(unsafe.Pointer(&stkbarSlice))
 }
 
+// stackfree frees an n byte stack allocation at stk.
+//
+// stackfree must run on the system stack because it uses per-P
+// resources and must not split the stack.
+//
+//go:systemstack
 func stackfree(stk stack, n uintptr) {
 	gp := getg()
-	v := (unsafe.Pointer)(stk.lo)
+	v := unsafe.Pointer(stk.lo)
 	if n&(n-1) != 0 {
 		throw("stack not a power of 2")
 	}
@@ -292,6 +441,9 @@
 		}
 		return
 	}
+	if msanenabled {
+		msanfree(v, n)
+	}
 	if stackCache != 0 && n < _FixedStack<<_NumStackOrders && n < _StackCacheSize {
 		order := uint8(0)
 		n2 := n
@@ -314,25 +466,25 @@
 			c.stackcache[order].size += n
 		}
 	} else {
-		s := mHeap_Lookup(&mheap_, v)
+		s := mheap_.lookup(v)
 		if s.state != _MSpanStack {
-			println(hex(s.start<<_PageShift), v)
+			println(hex(s.base()), v)
 			throw("bad span state")
 		}
 		if gcphase == _GCoff {
 			// Free the stack immediately if we're
 			// sweeping.
-			mHeap_FreeStack(&mheap_, s)
+			mheap_.freeStack(s)
 		} else {
-			// Otherwise, add it to a list of stack spans
-			// to be freed at the end of GC.
-			//
-			// TODO(austin): Make it possible to re-use
-			// these spans as stacks, like we do for small
-			// stack spans. (See issue #11466.)
-			lock(&stackpoolmu)
-			mSpanList_Insert(&stackFreeQueue, s)
-			unlock(&stackpoolmu)
+			// If the GC is running, we can't return a
+			// stack span to the heap because it could be
+			// reused as a heap span, and this state
+			// change would race with GC. Add it to the
+			// large stack cache instead.
+			log2npage := stacklog2(s.npages)
+			lock(&stackLarge.lock)
+			stackLarge.free[log2npage].insert(s)
+			unlock(&stackLarge.lock)
 		}
 	}
 }
@@ -375,20 +527,24 @@
 type adjustinfo struct {
 	old   stack
 	delta uintptr // ptr distance from old to new stack (newbase - oldbase)
+	cache pcvalueCache
+
+	// sghi is the highest sudog.elem on the stack.
+	sghi uintptr
 }
 
 // Adjustpointer checks whether *vpp is in the old stack described by adjinfo.
 // If so, it rewrites *vpp to point into the new stack.
 func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
-	pp := (*unsafe.Pointer)(vpp)
+	pp := (*uintptr)(vpp)
 	p := *pp
 	if stackDebug >= 4 {
-		print("        ", pp, ":", p, "\n")
+		print("        ", pp, ":", hex(p), "\n")
 	}
-	if adjinfo.old.lo <= uintptr(p) && uintptr(p) < adjinfo.old.hi {
-		*pp = add(p, adjinfo.delta)
+	if adjinfo.old.lo <= p && p < adjinfo.old.hi {
+		*pp = p + adjinfo.delta
 		if stackDebug >= 3 {
-			print("        adjust ptr ", pp, ":", p, " -> ", *pp, "\n")
+			print("        adjust ptr ", pp, ":", hex(p), " -> ", hex(*pp), "\n")
 		}
 	}
 }
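
adjustpointer's switch from unsafe.Pointer arithmetic to plain uintptr math doesn't change the algorithm: any word whose value lands inside the old stack's [lo, hi) interval is slid by delta = newbase - oldbase; everything else is left alone. A self-contained sketch (the stack struct here carries only the two fields the check needs):

	package main

	import "fmt"

	type stack struct{ lo, hi uintptr }

	// adjust mirrors adjustpointer above, minus the debug printing.
	func adjust(p uintptr, old stack, delta uintptr) uintptr {
		if old.lo <= p && p < old.hi {
			return p + delta
		}
		return p
	}

	func main() {
		old := stack{lo: 0x1000, hi: 0x2000}
		const delta = 0x8000
		fmt.Printf("%#x\n", adjust(0x1500, old, delta)) // 0x9500: moved
		fmt.Printf("%#x\n", adjust(0x3000, old, delta)) // 0x3000: untouched
	}
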
@@ -422,15 +578,22 @@
 	minp := adjinfo.old.lo
 	maxp := adjinfo.old.hi
 	delta := adjinfo.delta
-	num := uintptr(bv.n)
+	num := bv.n
+	// If this frame might contain channel receive slots, use CAS
+	// to adjust pointers. If the slot hasn't been received into
+	// yet, it may contain stack pointers and a concurrent send
+	// could race with adjusting those pointers. (The sent value
+	// itself can never contain stack pointers.)
+	useCAS := uintptr(scanp) < adjinfo.sghi
 	for i := uintptr(0); i < num; i++ {
 		if stackDebug >= 4 {
-			print("        ", add(scanp, i*ptrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*ptrSize))), " # ", i, " ", bv.bytedata[i/8], "\n")
+			print("        ", add(scanp, i*sys.PtrSize), ":", ptrnames[ptrbit(&bv, i)], ":", hex(*(*uintptr)(add(scanp, i*sys.PtrSize))), " # ", i, " ", bv.bytedata[i/8], "\n")
 		}
 		if ptrbit(&bv, i) == 1 {
-			pp := (*uintptr)(add(scanp, i*ptrSize))
+			pp := (*uintptr)(add(scanp, i*sys.PtrSize))
+		retry:
 			p := *pp
-			if f != nil && 0 < p && p < _PageSize && debug.invalidptr != 0 || p == poisonStack {
+			if f != nil && 0 < p && p < _PageSize && debug.invalidptr != 0 {
 				// Looks like a junk value in a pointer slot.
 				// Live analysis wrong?
 				getg().m.traceback = 2
@@ -441,7 +604,14 @@
 				if stackDebug >= 3 {
 					print("adjust ptr ", p, " ", funcname(f), "\n")
 				}
-				*pp = p + delta
+				if useCAS {
+					ppu := (*unsafe.Pointer)(unsafe.Pointer(pp))
+					if !atomic.Casp1(ppu, unsafe.Pointer(p), unsafe.Pointer(p+delta)) {
+						goto retry
+					}
+				} else {
+					*pp = p + delta
+				}
 			}
 		}
 	}
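
The retry label above implements a standard compare-and-swap loop. A sketch of the same pattern written against sync/atomic (the runtime uses its internal atomic package; the function name here is invented):

	package stacksketch

	import (
		"sync/atomic"
		"unsafe"
	)

	// adjustSlotCAS adjusts a slot that a concurrent channel send may
	// write to. The CAS only succeeds if the slot still holds the value
	// we read; on failure the slot changed under us, so we reread and
	// retry rather than clobber the newly sent value.
	func adjustSlotCAS(pp *unsafe.Pointer, lo, hi, delta uintptr) {
		for {
			p := uintptr(atomic.LoadPointer(pp))
			if p < lo || p >= hi {
				return // not a pointer into the old stack
			}
			if atomic.CompareAndSwapPointer(pp, unsafe.Pointer(p), unsafe.Pointer(p+delta)) {
				return
			}
		}
	}
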
@@ -468,7 +638,7 @@
 	if targetpc != f.entry {
 		targetpc--
 	}
-	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc)
+	pcdata := pcdatavalue(f, _PCDATA_StackMapIndex, targetpc, &adjinfo.cache)
 	if pcdata == -1 {
 		pcdata = 0 // in prologue
 	}
@@ -476,13 +646,11 @@
 	// Adjust local variables if stack frame has been allocated.
 	size := frame.varp - frame.sp
 	var minsize uintptr
-	switch thechar {
-	case '6', '8':
-		minsize = 0
-	case '7':
-		minsize = spAlign
+	switch sys.ArchFamily {
+	case sys.ARM64:
+		minsize = sys.SpAlign
 	default:
-		minsize = ptrSize
+		minsize = sys.MinFrameSize
 	}
 	if size > minsize {
 		var bv bitvector
@@ -498,15 +666,15 @@
 			throw("bad symbol table")
 		}
 		bv = stackmapdata(stackmap, pcdata)
-		size = uintptr(bv.n) * ptrSize
+		size = uintptr(bv.n) * sys.PtrSize
 		if stackDebug >= 3 {
-			print("      locals ", pcdata, "/", stackmap.n, " ", size/ptrSize, " words ", bv.bytedata, "\n")
+			print("      locals ", pcdata, "/", stackmap.n, " ", size/sys.PtrSize, " words ", bv.bytedata, "\n")
 		}
 		adjustpointers(unsafe.Pointer(frame.varp-size), &bv, adjinfo, f)
 	}
 
 	// Adjust saved base pointer if there is one.
-	if thechar == '6' && frame.argp-frame.varp == 2*regSize {
+	if sys.ArchFamily == sys.AMD64 && frame.argp-frame.varp == 2*sys.RegSize {
 		if !framepointer_enabled {
 			print("runtime: found space for saved base pointer, but no framepointer experiment\n")
 			print("argp=", hex(frame.argp), " varp=", hex(frame.varp), "\n")
@@ -526,7 +694,7 @@
 		} else {
 			stackmap := (*stackmap)(funcdata(f, _FUNCDATA_ArgsPointerMaps))
 			if stackmap == nil || stackmap.n <= 0 {
-				print("runtime: frame ", funcname(f), " untyped args ", frame.argp, "+", uintptr(frame.arglen), "\n")
+				print("runtime: frame ", funcname(f), " untyped args ", frame.argp, "+", frame.arglen, "\n")
 				throw("missing stackmap")
 			}
 			if pcdata < 0 || pcdata >= stackmap.n {
@@ -545,7 +713,7 @@
 }
 
 func adjustctxt(gp *g, adjinfo *adjustinfo) {
-	adjustpointer(adjinfo, (unsafe.Pointer)(&gp.sched.ctxt))
+	adjustpointer(adjinfo, unsafe.Pointer(&gp.sched.ctxt))
 }
 
 func adjustdefers(gp *g, adjinfo *adjustinfo) {
@@ -555,30 +723,30 @@
 	// Adjust pointers in the Defer structs.
 	// Defer structs themselves are never on the stack.
 	for d := gp._defer; d != nil; d = d.link {
-		adjustpointer(adjinfo, (unsafe.Pointer)(&d.fn))
-		adjustpointer(adjinfo, (unsafe.Pointer)(&d.sp))
-		adjustpointer(adjinfo, (unsafe.Pointer)(&d._panic))
+		adjustpointer(adjinfo, unsafe.Pointer(&d.fn))
+		adjustpointer(adjinfo, unsafe.Pointer(&d.sp))
+		adjustpointer(adjinfo, unsafe.Pointer(&d._panic))
 	}
 }
 
 func adjustpanics(gp *g, adjinfo *adjustinfo) {
 	// Panics are on stack and already adjusted.
 	// Update pointer to head of list in G.
-	adjustpointer(adjinfo, (unsafe.Pointer)(&gp._panic))
+	adjustpointer(adjinfo, unsafe.Pointer(&gp._panic))
 }
 
 func adjustsudogs(gp *g, adjinfo *adjustinfo) {
 	// the data elements pointed to by a SudoG structure
 	// might be in the stack.
 	for s := gp.waiting; s != nil; s = s.waitlink {
-		adjustpointer(adjinfo, (unsafe.Pointer)(&s.elem))
-		adjustpointer(adjinfo, (unsafe.Pointer)(&s.selectdone))
+		adjustpointer(adjinfo, unsafe.Pointer(&s.elem))
+		adjustpointer(adjinfo, unsafe.Pointer(&s.selectdone))
 	}
 }
 
 func adjuststkbar(gp *g, adjinfo *adjustinfo) {
 	for i := int(gp.stkbarPos); i < len(gp.stkbar); i++ {
-		adjustpointer(adjinfo, (unsafe.Pointer)(&gp.stkbar[i].savedLRPtr))
+		adjustpointer(adjinfo, unsafe.Pointer(&gp.stkbar[i].savedLRPtr))
 	}
 }
 
@@ -588,9 +756,76 @@
 	}
 }
 
+func findsghi(gp *g, stk stack) uintptr {
+	var sghi uintptr
+	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
+		p := uintptr(sg.elem) + uintptr(sg.c.elemsize)
+		if stk.lo <= p && p < stk.hi && p > sghi {
+			sghi = p
+		}
+		p = uintptr(unsafe.Pointer(sg.selectdone)) + unsafe.Sizeof(sg.selectdone)
+		if stk.lo <= p && p < stk.hi && p > sghi {
+			sghi = p
+		}
+	}
+	return sghi
+}
+
+// syncadjustsudogs adjusts gp's sudogs and copies the part of gp's
+// stack they refer to while synchronizing with concurrent channel
+// operations. It returns the number of bytes of stack copied.
+func syncadjustsudogs(gp *g, used uintptr, adjinfo *adjustinfo) uintptr {
+	if gp.waiting == nil {
+		return 0
+	}
+
+	// Lock channels to prevent concurrent send/receive.
+	// It's important that we *only* do this for async
+	// copystack; otherwise, gp may be in the middle of
+	// putting itself on wait queues and this would
+	// self-deadlock.
+	var lastc *hchan
+	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
+		if sg.c != lastc {
+			lock(&sg.c.lock)
+		}
+		lastc = sg.c
+	}
+
+	// Adjust sudogs.
+	adjustsudogs(gp, adjinfo)
+
+	// Copy the part of the stack the sudogs point into
+	// while holding the lock to prevent races on
+	// send/receive slots.
+	var sgsize uintptr
+	if adjinfo.sghi != 0 {
+		oldBot := adjinfo.old.hi - used
+		newBot := oldBot + adjinfo.delta
+		sgsize = adjinfo.sghi - oldBot
+		memmove(unsafe.Pointer(newBot), unsafe.Pointer(oldBot), sgsize)
+	}
+
+	// Unlock channels.
+	lastc = nil
+	for sg := gp.waiting; sg != nil; sg = sg.waitlink {
+		if sg.c != lastc {
+			unlock(&sg.c.lock)
+		}
+		lastc = sg.c
+	}
+
+	return sgsize
+}
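
The lastc bookkeeping is a small but load-bearing detail: it assumes sudogs on the same channel are adjacent in gp.waiting, so each distinct channel is locked exactly once, in list order, without a set or recursion. A sketch of just that pattern, with stand-in types:

	package stacksketch

	import "sync"

	type hchanSketch struct{ lock sync.Mutex }

	type sudogSketch struct {
		c        *hchanSketch
		waitlink *sudogSketch
	}

	// lockWaiting locks every distinct channel on the wait list,
	// skipping consecutive duplicates via lastc.
	func lockWaiting(head *sudogSketch) {
		var lastc *hchanSketch
		for sg := head; sg != nil; sg = sg.waitlink {
			if sg.c != lastc {
				sg.c.lock.Lock()
			}
			lastc = sg.c
		}
	}
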
+
 // Copies gp's stack to a new stack of a different size.
 // Caller must have changed gp status to Gcopystack.
-func copystack(gp *g, newsize uintptr) {
+//
+// If sync is true, this is a self-triggered stack growth and, in
+// particular, no other G may be writing to gp's stack (e.g., via a
+// channel operation). If sync is false, copystack protects against
+// concurrent channel operations.
+func copystack(gp *g, newsize uintptr, sync bool) {
 	if gp.syscallsp != 0 {
 		throw("stack growth not allowed in system call")
 	}
@@ -609,24 +844,46 @@
 		print("copystack gp=", gp, " [", hex(old.lo), " ", hex(old.hi-used), " ", hex(old.hi), "]/", gp.stackAlloc, " -> [", hex(new.lo), " ", hex(new.hi-used), " ", hex(new.hi), "]/", newsize, "\n")
 	}
 
-	// adjust pointers in the to-be-copied frames
+	// Compute adjustment.
 	var adjinfo adjustinfo
 	adjinfo.old = old
 	adjinfo.delta = new.hi - old.hi
-	gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0)
 
-	// adjust other miscellaneous things that have pointers into stacks.
+	// Adjust sudogs, synchronizing with channel ops if necessary.
+	ncopy := used
+	if sync {
+		adjustsudogs(gp, &adjinfo)
+	} else {
+		// sudogs can point into the stack. During concurrent
+		// shrinking, these areas may be written to. Find the
+		// highest such pointer so we can handle everything
+		// there and below carefully. (This shouldn't be far
+		// from the bottom of the stack, so there's little
+		// cost in handling everything below it carefully.)
+		adjinfo.sghi = findsghi(gp, old)
+
+		// Synchronize with channel ops and copy the part of
+		// the stack they may interact with.
+		ncopy -= syncadjustsudogs(gp, used, &adjinfo)
+	}
+
+	// Copy the stack (or the rest of it) to the new location.
+	memmove(unsafe.Pointer(new.hi-ncopy), unsafe.Pointer(old.hi-ncopy), ncopy)
+
+	// Disallow sigprof scans of this stack and block if there's
+	// one in progress.
+	gcLockStackBarriers(gp)
+
+	// Adjust remaining structures that have pointers into stacks.
+	// We have to do most of these before we traceback the new
+	// stack because gentraceback uses them.
 	adjustctxt(gp, &adjinfo)
 	adjustdefers(gp, &adjinfo)
 	adjustpanics(gp, &adjinfo)
-	adjustsudogs(gp, &adjinfo)
 	adjuststkbar(gp, &adjinfo)
-
-	// copy the stack to the new location
-	if stackPoisonCopy != 0 {
-		fillstack(new, 0xfb)
+	if adjinfo.sghi != 0 {
+		adjinfo.sghi += adjinfo.delta
 	}
-	memmove(unsafe.Pointer(new.hi-used), unsafe.Pointer(old.hi-used), used)
 
 	// copy old stack barriers to new stack barrier array
 	newstkbar = newstkbar[:len(gp.stkbar)]
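
How much the plain memmove above has to copy depends on the sync flag; a model of the arithmetic (function and parameter names invented):

	package stacksketch

	// splitCopy models copystack's ncopy computation: in the async
	// (concurrent shrink) case, the bottom of the stack up to sghi was
	// already copied under channel locks by syncadjustsudogs, so the
	// unsynchronized memmove only has to cover the rest.
	func splitCopy(used, oldHi, sghi uintptr, sync bool) (locked, plain uintptr) {
		plain = used
		if !sync && sghi != 0 {
			oldBot := oldHi - used
			locked = sghi - oldBot // bytes copied under channel locks
			plain -= locked
		}
		return
	}
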
@@ -639,6 +896,12 @@
 	oldsize := gp.stackAlloc
 	gp.stackAlloc = newsize
 	gp.stkbar = newstkbar
+	gp.stktopsp += adjinfo.delta
+
+	// Adjust pointers in the new stack.
+	gentraceback(^uintptr(0), ^uintptr(0), 0, gp, 0, nil, 0x7fffffff, adjustframe, noescape(unsafe.Pointer(&adjinfo)), 0)
+
+	gcUnlockStackBarriers(gp)
 
 	// free old stack
 	if stackPoisonCopy != 0 {
@@ -669,7 +932,7 @@
 		throw("stack growth after fork")
 	}
 	if thisg.m.morebuf.g.ptr() != thisg.m.curg {
-		print("runtime: newstack called from g=", thisg.m.morebuf.g, "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n")
+		print("runtime: newstack called from g=", hex(thisg.m.morebuf.g), "\n"+"\tm=", thisg.m, " m->curg=", thisg.m.curg, " m->g0=", thisg.m.g0, " m->gsignal=", thisg.m.gsignal, "\n")
 		morebuf := thisg.m.morebuf
 		traceback(morebuf.pc, morebuf.sp, morebuf.lr, morebuf.g.ptr())
 		throw("runtime: wrong goroutine in newstack")
@@ -699,7 +962,7 @@
 	// NOTE: stackguard0 may change underfoot, if another thread
 	// is about to try to preempt gp. Read it just once and use that same
 	// value now and below.
-	preempt := atomicloaduintptr(&gp.stackguard0) == stackPreempt
+	preempt := atomic.Loaduintptr(&gp.stackguard0) == stackPreempt
 
 	// Be conservative about where we preempt.
 	// We are interested in preempting user Go code, not runtime code.
@@ -722,18 +985,13 @@
 		}
 	}
 
-	// The goroutine must be executing in order to call newstack,
-	// so it must be Grunning (or Gscanrunning).
-	casgstatus(gp, _Grunning, _Gwaiting)
-	gp.waitreason = "stack growth"
-
 	if gp.stack.lo == 0 {
 		throw("missing stack in newstack")
 	}
 	sp := gp.sched.sp
-	if thechar == '6' || thechar == '8' {
+	if sys.ArchFamily == sys.AMD64 || sys.ArchFamily == sys.I386 {
 		// The call to morestack cost a word.
-		sp -= ptrSize
+		sp -= sys.PtrSize
 	}
 	if stackDebug >= 1 || sp < gp.stack.lo {
 		print("runtime: newstack sp=", hex(sp), " stack=[", hex(gp.stack.lo), ", ", hex(gp.stack.hi), "]\n",
@@ -761,6 +1019,8 @@
 		if thisg.m.p == 0 && thisg.m.locks == 0 {
 			throw("runtime: g is running but p is not")
 		}
+		// Synchronize with scang.
+		casgstatus(gp, _Grunning, _Gwaiting)
 		if gp.preemptscan {
 			for !castogscanstatus(gp, _Gwaiting, _Gscanwaiting) {
 				// Likely to be racing with the GC as
@@ -770,12 +1030,19 @@
 				// return.
 			}
 			if !gp.gcscandone {
-				scanstack(gp)
+				// gcw is safe because we're on the
+				// system stack.
+				gcw := &gp.m.p.ptr().gcw
+				scanstack(gp, gcw)
+				if gcBlackenPromptly {
+					gcw.dispose()
+				}
 				gp.gcscandone = true
 			}
 			gp.preemptscan = false
 			gp.preempt = false
 			casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
+			// This clears gcscanvalid.
 			casgstatus(gp, _Gwaiting, _Grunning)
 			gp.stackguard0 = gp.stack.lo + _StackGuard
 			gogo(&gp.sched) // never return
@@ -794,11 +1061,13 @@
 		throw("stack overflow")
 	}
 
-	casgstatus(gp, _Gwaiting, _Gcopystack)
+	// The goroutine must be executing in order to call newstack,
+	// so it must be Grunning (or Gscanrunning).
+	casgstatus(gp, _Grunning, _Gcopystack)
 
 	// The concurrent GC will not scan the stack while we are doing the copy since
 	// the gp is in a Gcopystack status.
-	copystack(gp, uintptr(newsize))
+	copystack(gp, uintptr(newsize), true)
 	if stackDebug >= 1 {
 		print("stack grow done\n")
 	}
@@ -816,17 +1085,19 @@
 func gostartcallfn(gobuf *gobuf, fv *funcval) {
 	var fn unsafe.Pointer
 	if fv != nil {
-		fn = (unsafe.Pointer)(fv.fn)
+		fn = unsafe.Pointer(fv.fn)
 	} else {
 		fn = unsafe.Pointer(funcPC(nilfunc))
 	}
-	gostartcall(gobuf, fn, (unsafe.Pointer)(fv))
+	gostartcall(gobuf, fn, unsafe.Pointer(fv))
 }
 
 // Maybe shrink the stack being used by gp.
 // Called at garbage collection time.
+// gp must be stopped, but the world need not be.
 func shrinkstack(gp *g) {
-	if readgstatus(gp) == _Gdead {
+	gstatus := readgstatus(gp)
+	if gstatus&^_Gscan == _Gdead {
 		if gp.stack.lo != 0 {
 			// Free whole stack - it will get reallocated
 			// if G is used again.
@@ -841,6 +1112,9 @@
 	if gp.stack.lo == 0 {
 		throw("missing stack in shrinkstack")
 	}
+	if gstatus&_Gscan == 0 {
+		throw("bad status in shrinkstack")
+	}
 
 	if debug.gcshrinkstackoff > 0 {
 		return
@@ -868,7 +1142,7 @@
 	if gp.syscallsp != 0 {
 		return
 	}
-	if goos_windows != 0 && gp.m != nil && gp.m.libcallsp != 0 {
+	if sys.GoosWindows != 0 && gp.m != nil && gp.m.libcallsp != 0 {
 		return
 	}
 
@@ -876,9 +1150,7 @@
 		print("shrinking stack ", oldsize, "->", newsize, "\n")
 	}
 
-	oldstatus := casgcopystack(gp)
-	copystack(gp, newsize)
-	casgstatus(gp, _Gcopystack, oldstatus)
+	copystack(gp, newsize, false)
 }
 
 // freeStackSpans frees unused stack spans at the end of GC.
@@ -888,25 +1160,30 @@
 	// Scan stack pools for empty stack spans.
 	for order := range stackpool {
 		list := &stackpool[order]
-		for s := list.next; s != list; {
+		for s := list.first; s != nil; {
 			next := s.next
-			if s.ref == 0 {
-				mSpanList_Remove(s)
-				s.freelist = 0
-				mHeap_FreeStack(&mheap_, s)
+			if s.allocCount == 0 {
+				list.remove(s)
+				s.stackfreelist = 0
+				mheap_.freeStack(s)
 			}
 			s = next
 		}
 	}
 
-	// Free queued stack spans.
-	for stackFreeQueue.next != &stackFreeQueue {
-		s := stackFreeQueue.next
-		mSpanList_Remove(s)
-		mHeap_FreeStack(&mheap_, s)
-	}
-
 	unlock(&stackpoolmu)
+
+	// Free large stack spans.
+	lock(&stackLarge.lock)
+	for i := range stackLarge.free {
+		for s := stackLarge.free[i].first; s != nil; {
+			next := s.next
+			stackLarge.free[i].remove(s)
+			mheap_.freeStack(s)
+			s = next
+		}
+	}
+	unlock(&stackLarge.lock)
 }
 
 //go:nosplit
diff --git a/src/runtime/stack2.go b/src/runtime/stack2.go
deleted file mode 100644
index 5ec8d8d..0000000
--- a/src/runtime/stack2.go
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright 2011 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-/*
-Stack layout parameters.
-Included both by runtime (compiled via 6c) and linkers (compiled via gcc).
-
-The per-goroutine g->stackguard is set to point StackGuard bytes
-above the bottom of the stack.  Each function compares its stack
-pointer against g->stackguard to check for overflow.  To cut one
-instruction from the check sequence for functions with tiny frames,
-the stack is allowed to protrude StackSmall bytes below the stack
-guard.  Functions with large frames don't bother with the check and
-always call morestack.  The sequences are (for amd64, others are
-similar):
-
-	guard = g->stackguard
-	frame = function's stack frame size
-	argsize = size of function arguments (call + return)
-
-	stack frame size <= StackSmall:
-		CMPQ guard, SP
-		JHI 3(PC)
-		MOVQ m->morearg, $(argsize << 32)
-		CALL morestack(SB)
-
-	stack frame size > StackSmall but < StackBig
-		LEAQ (frame-StackSmall)(SP), R0
-		CMPQ guard, R0
-		JHI 3(PC)
-		MOVQ m->morearg, $(argsize << 32)
-		CALL morestack(SB)
-
-	stack frame size >= StackBig:
-		MOVQ m->morearg, $((argsize << 32) | frame)
-		CALL morestack(SB)
-
-The bottom StackGuard - StackSmall bytes are important: there has
-to be enough room to execute functions that refuse to check for
-stack overflow, either because they need to be adjacent to the
-actual caller's frame (deferproc) or because they handle the imminent
-stack overflow (morestack).
-
-For example, deferproc might call malloc, which does one of the
-above checks (without allocating a full frame), which might trigger
-a call to morestack.  This sequence needs to fit in the bottom
-section of the stack.  On amd64, morestack's frame is 40 bytes, and
-deferproc's frame is 56 bytes.  That fits well within the
-StackGuard - StackSmall bytes at the bottom.
-The linkers explore all possible call traces involving non-splitting
-functions to make sure that this limit cannot be violated.
-*/
-
-const (
-	// StackSystem is a number of additional bytes to add
-	// to each stack below the usual guard area for OS-specific
-	// purposes like signal handling. Used on Windows, Plan 9,
-	// and Darwin/ARM because they do not use a separate stack.
-	_StackSystem = goos_windows*512*ptrSize + goos_plan9*512 + goos_darwin*goarch_arm*1024
-
-	// The minimum size of stack used by Go code
-	_StackMin = 2048
-
-	// The minimum stack size to allocate.
-	// The hackery here rounds FixedStack0 up to a power of 2.
-	_FixedStack0 = _StackMin + _StackSystem
-	_FixedStack1 = _FixedStack0 - 1
-	_FixedStack2 = _FixedStack1 | (_FixedStack1 >> 1)
-	_FixedStack3 = _FixedStack2 | (_FixedStack2 >> 2)
-	_FixedStack4 = _FixedStack3 | (_FixedStack3 >> 4)
-	_FixedStack5 = _FixedStack4 | (_FixedStack4 >> 8)
-	_FixedStack6 = _FixedStack5 | (_FixedStack5 >> 16)
-	_FixedStack  = _FixedStack6 + 1
-
-	// Functions that need frames bigger than this use an extra
-	// instruction to do the stack split check, to avoid overflow
-	// in case SP - framesize wraps below zero.
-	// This value can be no bigger than the size of the unmapped
-	// space at zero.
-	_StackBig = 4096
-
-	// The stack guard is a pointer this many bytes above the
-	// bottom of the stack.
-	_StackGuard = 640*stackGuardMultiplier + _StackSystem
-
-	// After a stack split check the SP is allowed to be this
-	// many bytes below the stack guard.  This saves an instruction
-	// in the checking sequence for tiny frames.
-	_StackSmall = 128
-
-	// The maximum number of bytes that a chain of NOSPLIT
-	// functions can use.
-	_StackLimit = _StackGuard - _StackSystem - _StackSmall
-)
-
-// Goroutine preemption request.
-// Stored into g->stackguard0 to cause split stack check failure.
-// Must be greater than any real sp.
-// 0xfffffade in hex.
-const (
-	_StackPreempt = uintptrMask & -1314
-	_StackFork    = uintptrMask & -1234
-)
diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index fa073f1..a32b68b 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -1,4 +1,4 @@
-// Copyright 2012 The Go Authors.  All rights reserved.
+// Copyright 2012 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -111,7 +111,8 @@
 		select {
 		case <-done:
 		case <-time.After(20 * time.Second):
-			t.Fatal("finalizer did not run")
+			t.Error("finalizer did not run")
+			return
 		}
 	}()
 	wg.Wait()
@@ -191,7 +192,6 @@
 			<-done
 		})
 	}()
-
 	wg.Wait()
 }
 
@@ -416,9 +416,9 @@
 }
 
 func TestStackPanic(t *testing.T) {
-	// Test that stack copying copies panics correctly.  This is difficult
+	// Test that stack copying copies panics correctly. This is difficult
 	// to test because it is very unlikely that the stack will be copied
-	// in the middle of gopanic.  But it can happen.
+	// in the middle of gopanic. But it can happen.
 	// To make this test effective, edit panic.go:gopanic and uncomment
 	// the GC() call just before freedefer(d).
 	defer func() {
diff --git a/src/runtime/string.go b/src/runtime/string.go
index a5851b7..ef28ba9 100644
--- a/src/runtime/string.go
+++ b/src/runtime/string.go
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+	"runtime/internal/atomic"
 	"unsafe"
 )
 
@@ -83,9 +84,12 @@
 	if raceenabled && l > 0 {
 		racereadrangepc(unsafe.Pointer(&b[0]),
 			uintptr(l),
-			getcallerpc(unsafe.Pointer(&b)),
+			getcallerpc(unsafe.Pointer(&buf)),
 			funcPC(slicebytetostring))
 	}
+	if msanenabled && l > 0 {
+		msanread(unsafe.Pointer(&b[0]), uintptr(l))
+	}
 	s, c := rawstringtmp(buf, l)
 	copy(c, b)
 	return s
@@ -94,7 +98,7 @@
 // stringDataOnStack reports whether the string's data is
 // stored on the current goroutine's stack.
 func stringDataOnStack(s string) bool {
-	ptr := uintptr((*stringStruct)(unsafe.Pointer(&s)).str)
+	ptr := uintptr(stringStructOf(&s).str)
 	stk := getg().stack
 	return stk.lo <= ptr && ptr < stk.hi
 }
@@ -126,12 +130,16 @@
 			getcallerpc(unsafe.Pointer(&b)),
 			funcPC(slicebytetostringtmp))
 	}
+	if msanenabled && len(b) > 0 {
+		msanread(unsafe.Pointer(&b[0]), uintptr(len(b)))
+	}
 	return *(*string)(unsafe.Pointer(&b))
 }
 
 func stringtoslicebyte(buf *tmpBuf, s string) []byte {
 	var b []byte
 	if buf != nil && len(s) <= len(buf) {
+		*buf = tmpBuf{}
 		b = buf[:len(s)]
 	} else {
 		b = rawbyteslice(len(s))
@@ -147,8 +155,8 @@
 	// The only such case today is:
 	// for i, c := range []byte(str)
 
-	str := (*stringStruct)(unsafe.Pointer(&s))
-	ret := slice{array: unsafe.Pointer(str.str), len: str.len, cap: str.len}
+	str := stringStructOf(&s)
+	ret := slice{array: str.str, len: str.len, cap: str.len}
 	return *(*[]byte)(unsafe.Pointer(&ret))
 }
 
@@ -164,6 +172,7 @@
 	}
 	var a []rune
 	if buf != nil && n <= len(buf) {
+		*buf = [tmpStringBufSize]rune{}
 		a = buf[:n]
 	} else {
 		a = rawruneslice(n)
@@ -182,9 +191,12 @@
 	if raceenabled && len(a) > 0 {
 		racereadrangepc(unsafe.Pointer(&a[0]),
 			uintptr(len(a))*unsafe.Sizeof(a[0]),
-			getcallerpc(unsafe.Pointer(&a)),
+			getcallerpc(unsafe.Pointer(&buf)),
 			funcPC(slicerunetostring))
 	}
+	if msanenabled && len(a) > 0 {
+		msanread(unsafe.Pointer(&a[0]), uintptr(len(a))*unsafe.Sizeof(a[0]))
+	}
 	var dum [4]byte
 	size1 := 0
 	for _, r := range a {
@@ -207,6 +219,16 @@
 	len int
 }
 
+// Variant with *byte pointer type for DWARF debugging.
+type stringStructDWARF struct {
+	str *byte
+	len int
+}
+
+func stringStructOf(sp *string) *stringStruct {
+	return (*stringStruct)(unsafe.Pointer(sp))
+}
+
 func intstring(buf *[4]byte, v int64) string {
 	var s string
 	var b []byte
@@ -216,6 +238,9 @@
 	} else {
 		s, b = rawstring(4)
 	}
+	if int64(rune(v)) != v {
+		v = runeerror
+	}
 	n := runetochar(b, rune(v))
 	return s[:n]
 }
@@ -261,16 +286,16 @@
 // The storage is not zeroed. Callers should use
 // b to set the string contents and then drop b.
 func rawstring(size int) (s string, b []byte) {
-	p := mallocgc(uintptr(size), nil, flagNoScan|flagNoZero)
+	p := mallocgc(uintptr(size), nil, false)
 
-	(*stringStruct)(unsafe.Pointer(&s)).str = p
-	(*stringStruct)(unsafe.Pointer(&s)).len = size
+	stringStructOf(&s).str = p
+	stringStructOf(&s).len = size
 
 	*(*slice)(unsafe.Pointer(&b)) = slice{p, size, size}
 
 	for {
 		ms := maxstring
-		if uintptr(size) <= uintptr(ms) || casuintptr((*uintptr)(unsafe.Pointer(&maxstring)), uintptr(ms), uintptr(size)) {
+		if uintptr(size) <= ms || atomic.Casuintptr((*uintptr)(unsafe.Pointer(&maxstring)), ms, uintptr(size)) {
 			return
 		}
 	}
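
The loop above is a lock-free monotonic-max update. The same pattern written against sync/atomic, for reference (the name is invented):

	package stringsketch

	import "sync/atomic"

	// storeMax records v at *addr only if it is larger than the value
	// already there. The CAS only succeeds if the location still holds
	// the value we read, so a larger value stored by a concurrent
	// writer is never overwritten with a smaller one.
	func storeMax(addr *uintptr, v uintptr) {
		for {
			old := atomic.LoadUintptr(addr)
			if v <= old || atomic.CompareAndSwapUintptr(addr, old, v) {
				return
			}
		}
	}
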
@@ -279,7 +304,7 @@
 // rawbyteslice allocates a new byte slice. The byte slice is not zeroed.
 func rawbyteslice(size int) (b []byte) {
 	cap := roundupsize(uintptr(size))
-	p := mallocgc(cap, nil, flagNoScan|flagNoZero)
+	p := mallocgc(cap, nil, false)
 	if cap != uintptr(size) {
 		memclr(add(p, uintptr(size)), cap-uintptr(size))
 	}
@@ -294,7 +319,7 @@
 		throw("out of memory")
 	}
 	mem := roundupsize(uintptr(size) * 4)
-	p := mallocgc(mem, nil, flagNoScan|flagNoZero)
+	p := mallocgc(mem, nil, false)
 	if mem != uintptr(size)*4 {
 		memclr(add(p, uintptr(size)*4), mem-uintptr(size)*4)
 	}
@@ -360,3 +385,63 @@
 	}
 	return n
 }
+
+//go:nosplit
+func findnull(s *byte) int {
+	if s == nil {
+		return 0
+	}
+	p := (*[_MaxMem/2 - 1]byte)(unsafe.Pointer(s))
+	l := 0
+	for p[l] != 0 {
+		l++
+	}
+	return l
+}
+
+func findnullw(s *uint16) int {
+	if s == nil {
+		return 0
+	}
+	p := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(s))
+	l := 0
+	for p[l] != 0 {
+		l++
+	}
+	return l
+}
+
+var maxstring uintptr = 256 // a hint for print
+
+//go:nosplit
+func gostringnocopy(str *byte) string {
+	ss := stringStruct{str: unsafe.Pointer(str), len: findnull(str)}
+	s := *(*string)(unsafe.Pointer(&ss))
+	for {
+		ms := maxstring
+		if uintptr(len(s)) <= ms || atomic.Casuintptr(&maxstring, ms, uintptr(len(s))) {
+			break
+		}
+	}
+	return s
+}
+
+func gostringw(strw *uint16) string {
+	var buf [8]byte
+	str := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(strw))
+	n1 := 0
+	for i := 0; str[i] != 0; i++ {
+		n1 += runetochar(buf[:], rune(str[i]))
+	}
+	s, b := rawstring(n1 + 4)
+	n2 := 0
+	for i := 0; str[i] != 0; i++ {
+		// check for race
+		if n2 >= n1 {
+			break
+		}
+		n2 += runetochar(b[n2:], rune(str[i]))
+	}
+	b[n2] = 0 // for luck
+	return s[:n2]
+}
diff --git a/src/runtime/string1.go b/src/runtime/string1.go
deleted file mode 100644
index 4bfa3d9..0000000
--- a/src/runtime/string1.go
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-import "unsafe"
-
-//go:nosplit
-func findnull(s *byte) int {
-	if s == nil {
-		return 0
-	}
-	p := (*[_MaxMem/2 - 1]byte)(unsafe.Pointer(s))
-	l := 0
-	for p[l] != 0 {
-		l++
-	}
-	return l
-}
-
-func findnullw(s *uint16) int {
-	if s == nil {
-		return 0
-	}
-	p := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(s))
-	l := 0
-	for p[l] != 0 {
-		l++
-	}
-	return l
-}
-
-var maxstring uintptr = 256 // a hint for print
-
-//go:nosplit
-func gostringnocopy(str *byte) string {
-	ss := stringStruct{str: unsafe.Pointer(str), len: findnull(str)}
-	s := *(*string)(unsafe.Pointer(&ss))
-	for {
-		ms := maxstring
-		if uintptr(len(s)) <= ms || casuintptr(&maxstring, ms, uintptr(len(s))) {
-			break
-		}
-	}
-	return s
-}
-
-func gostringw(strw *uint16) string {
-	var buf [8]byte
-	str := (*[_MaxMem/2/2 - 1]uint16)(unsafe.Pointer(strw))
-	n1 := 0
-	for i := 0; str[i] != 0; i++ {
-		n1 += runetochar(buf[:], rune(str[i]))
-	}
-	s, b := rawstring(n1 + 4)
-	n2 := 0
-	for i := 0; str[i] != 0; i++ {
-		// check for race
-		if n2 >= n1 {
-			break
-		}
-		n2 += runetochar(b[n2:], rune(str[i]))
-	}
-	b[n2] = 0 // for luck
-	return s[:n2]
-}
diff --git a/src/runtime/string_test.go b/src/runtime/string_test.go
index dfda950..0f1d82a 100644
--- a/src/runtime/string_test.go
+++ b/src/runtime/string_test.go
@@ -10,6 +10,10 @@
 	"testing"
 )
 
+// Strings and slices that don't escape and fit into tmpBuf are stack allocated,
+// which defeats using AllocsPerRun to test other optimizations.
+const sizeNoStack = 100
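
The effect is easy to observe outside the test suite; a sketch (the 32-byte buffer size matches this tree's tmpStringBufSize, everything else is invented and depends on the compiler's escape analysis):

	package main

	import (
		"fmt"
		"strings"
		"testing"
	)

	var sink byte

	func main() {
		// A string short enough for the stack tmpBuf converts with zero
		// heap allocations; one longer than the buffer takes the heap path.
		for _, s := range []string{"foo", strings.Repeat("x", 100)} {
			n := testing.AllocsPerRun(1000, func() {
				b := []byte(s)
				sink = b[0]
			})
			fmt.Printf("len %d: %v allocs/op\n", len(s), n)
		}
	}
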
+
 func BenchmarkCompareStringEqual(b *testing.B) {
 	bytes := []byte("Hello Gophers!")
 	s1, s2 := string(bytes), string(bytes)
@@ -102,6 +106,17 @@
 	}
 }
 
+func BenchmarkArrayEqual(b *testing.B) {
+	a1 := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+	a2 := [16]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		if a1 != a2 {
+			b.Fatal("not equal")
+		}
+	}
+}
+
 func TestStringW(t *testing.T) {
 	strings := []string{
 		"hello",
@@ -125,7 +140,7 @@
 }
 
 func TestLargeStringConcat(t *testing.T) {
-	output := executeTest(t, largeStringConcatSource, nil)
+	output := runTestProg(t, "testprog", "stringconcat")
 	want := "panic: " + strings.Repeat("0", 1<<10) + strings.Repeat("1", 1<<10) +
 		strings.Repeat("2", 1<<10) + strings.Repeat("3", 1<<10)
 	if !strings.HasPrefix(output, want) {
@@ -133,19 +148,6 @@
 	}
 }
 
-var largeStringConcatSource = `
-package main
-import "strings"
-func main() {
-	s0 := strings.Repeat("0", 1<<10)
-	s1 := strings.Repeat("1", 1<<10)
-	s2 := strings.Repeat("2", 1<<10)
-	s3 := strings.Repeat("3", 1<<10)
-	s := s0 + s1 + s2 + s3
-	panic(s)
-}
-`
-
 func TestGostringnocopy(t *testing.T) {
 	max := *runtime.Maxstring
 	b := make([]byte, max+10)
@@ -160,7 +162,7 @@
 }
 
 func TestCompareTempString(t *testing.T) {
-	s := "foo"
+	s := strings.Repeat("x", sizeNoStack)
 	b := []byte(s)
 	n := testing.AllocsPerRun(1000, func() {
 		if string(b) != s {
@@ -223,7 +225,7 @@
 }
 
 func TestRangeStringCast(t *testing.T) {
-	s := "abc"
+	s := strings.Repeat("x", sizeNoStack)
 	n := testing.AllocsPerRun(1000, func() {
 		for i, c := range []byte(s) {
 			if c != s[i] {
@@ -235,3 +237,36 @@
 		t.Fatalf("want 0 allocs, got %v", n)
 	}
 }
+
+func isZeroed(b []byte) bool {
+	for _, x := range b {
+		if x != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func isZeroedR(r []rune) bool {
+	for _, x := range r {
+		if x != 0 {
+			return false
+		}
+	}
+	return true
+}
+
+func TestString2Slice(t *testing.T) {
+	// Make sure we don't return slices that expose
+	// an unzeroed section of stack-allocated temp buf
+	// between len and cap. See issue 14232.
+	s := "foož"
+	b := ([]byte)(s)
+	if !isZeroed(b[len(b):cap(b)]) {
+		t.Errorf("extra bytes not zeroed")
+	}
+	r := ([]rune)(s)
+	if !isZeroedR(r[len(r):cap(r)]) {
+		t.Errorf("extra runes not zeroed")
+	}
+}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index d725bb1..6c28fd2 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -6,12 +6,6 @@
 
 import "unsafe"
 
-// Declarations for runtime services implemented in C or assembly.
-
-const ptrSize = 4 << (^uintptr(0) >> 63)             // unsafe.Sizeof(uintptr(0)) but an ideal const
-const regSize = 4 << (^uintreg(0) >> 63)             // unsafe.Sizeof(uintreg(0)) but an ideal const
-const spAlign = 1*(1-goarch_arm64) + 16*goarch_arm64 // SP alignment: 1 normally, 16 for ARM64
-
 // Should be a built-in for unsafe.Pointer?
 //go:nosplit
 func add(p unsafe.Pointer, x uintptr) unsafe.Pointer {
@@ -91,7 +85,7 @@
 
 // in asm_*.s
 //go:noescape
-func memeq(a, b unsafe.Pointer, size uintptr) bool
+func memequal(a, b unsafe.Pointer, size uintptr) bool
 
 // noescape hides a pointer from escape analysis.  noescape is
 // the identity function but escape analysis doesn't think the
@@ -151,42 +145,6 @@
 // See the assembly implementations for more details.
 func cgocallback_gofunc(fv uintptr, frame uintptr, framesize uintptr)
 
-//go:noescape
-func cas(ptr *uint32, old, new uint32) bool
-
-// NO go:noescape annotation; see atomic_pointer.go.
-func casp1(ptr *unsafe.Pointer, old, new unsafe.Pointer) bool
-
-func nop() // call to prevent inlining of function body
-
-//go:noescape
-func casuintptr(ptr *uintptr, old, new uintptr) bool
-
-//go:noescape
-func atomicstoreuintptr(ptr *uintptr, new uintptr)
-
-//go:noescape
-func atomicloaduintptr(ptr *uintptr) uintptr
-
-//go:noescape
-func atomicloaduint(ptr *uint) uint
-
-// TODO: Write native implementations of int64 atomic ops (or improve
-// inliner). These portable ones can't be inlined right now, so we're
-// taking an extra function call hit.
-
-func atomicstoreint64(ptr *int64, new int64) {
-	atomicstore64((*uint64)(unsafe.Pointer(ptr)), uint64(new))
-}
-
-func atomicloadint64(ptr *int64) int64 {
-	return int64(atomicload64((*uint64)(unsafe.Pointer(ptr))))
-}
-
-func xaddint64(ptr *int64, delta int64) int64 {
-	return int64(xadd64((*uint64)(unsafe.Pointer(ptr)), delta))
-}
-
 // publicationBarrier performs a store/store barrier (a "publication"
 // or "export" barrier). Some form of synchronization is required
 // between initializing an object and making that object accessible to
@@ -312,3 +270,6 @@
 func round(n, a uintptr) uintptr {
 	return (n + a - 1) &^ (a - 1)
 }
+
+// checkASM returns whether assembly runtime checks have passed.
+func checkASM() bool
diff --git a/src/runtime/stubs2.go b/src/runtime/stubs2.go
index 1cb6f91..95db924 100644
--- a/src/runtime/stubs2.go
+++ b/src/runtime/stubs2.go
@@ -18,7 +18,6 @@
 func nanotime() int64
 func usleep(usec uint32)
 
-func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
 func munmap(addr unsafe.Pointer, n uintptr)
 
 //go:noescape
diff --git a/src/runtime/stubs_android.go b/src/runtime/stubs_android.go
index e372377..0380dca 100644
--- a/src/runtime/stubs_android.go
+++ b/src/runtime/stubs_android.go
@@ -2,9 +2,15 @@
 
 import "unsafe"
 
+// Return values of access/connect/socket are the return values of the syscall
+// (may encode error numbers).
+
+// int access(const char *, int)
 //go:noescape
 func access(name *byte, mode int32) int32
 
-func connect(fd uintptr, addr unsafe.Pointer, len int32) int32
+// int connect(int, const struct sockaddr*, socklen_t)
+func connect(fd int32, addr unsafe.Pointer, len int32) int32
 
+// int socket(int, int, int)
 func socket(domain int32, typ int32, prot int32) int32
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 400ab6d..4f6fae2 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -4,7 +4,154 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
+
+// Frames may be used to get function/file/line information for a
+// slice of PC values returned by Callers.
+type Frames struct {
+	callers []uintptr
+
+	// If previous caller in iteration was a panic, then
+	// ci.callers[0] is the address of the faulting instruction
+	// instead of the return address of the call.
+	wasPanic bool
+
+	// Frames to return for subsequent calls to the Next method.
+	// Used for non-Go frames.
+	frames *[]Frame
+}
+
+// Frame is the information returned by Frames for each call frame.
+type Frame struct {
+	// Program counter for this frame; multiple frames may have
+	// the same PC value.
+	PC uintptr
+
+	// Func for this frame; may be nil for non-Go code or fully
+	// inlined functions.
+	Func *Func
+
+	// Function name, file name, and line number for this call frame.
+	// May be the empty string or zero if not known.
+	// If Func is not nil then Function == Func.Name().
+	Function string
+	File     string
+	Line     int
+
+	// Entry point for the function; may be zero if not known.
+	// If Func is not nil then Entry == Func.Entry().
+	Entry uintptr
+}
+
+// CallersFrames takes a slice of PC values returned by Callers and
+// prepares to return function/file/line information.
+// Do not change the slice until you are done with the Frames.
+func CallersFrames(callers []uintptr) *Frames {
+	return &Frames{callers: callers}
+}
+
+// Next returns frame information for the next caller.
+// If more is false, there are no more callers (the Frame value is valid).
+func (ci *Frames) Next() (frame Frame, more bool) {
+	if ci.frames != nil {
+		// We have saved up frames to return.
+		f := (*ci.frames)[0]
+		if len(*ci.frames) == 1 {
+			ci.frames = nil
+		} else {
+			*ci.frames = (*ci.frames)[1:]
+		}
+		return f, ci.frames != nil || len(ci.callers) > 0
+	}
+
+	if len(ci.callers) == 0 {
+		ci.wasPanic = false
+		return Frame{}, false
+	}
+	pc := ci.callers[0]
+	ci.callers = ci.callers[1:]
+	more = len(ci.callers) > 0
+	f := FuncForPC(pc)
+	if f == nil {
+		ci.wasPanic = false
+		if cgoSymbolizer != nil {
+			return ci.cgoNext(pc, more)
+		}
+		return Frame{}, more
+	}
+
+	entry := f.Entry()
+	xpc := pc
+	if xpc > entry && !ci.wasPanic {
+		xpc--
+	}
+	file, line := f.FileLine(xpc)
+
+	function := f.Name()
+	ci.wasPanic = entry == sigpanicPC
+
+	frame = Frame{
+		PC:       xpc,
+		Func:     f,
+		Function: function,
+		File:     file,
+		Line:     line,
+		Entry:    entry,
+	}
+
+	return frame, more
+}
+
+// cgoNext returns frame information for pc, known to be a non-Go function,
+// using the cgoSymbolizer hook.
+func (ci *Frames) cgoNext(pc uintptr, more bool) (Frame, bool) {
+	arg := cgoSymbolizerArg{pc: pc}
+	callCgoSymbolizer(&arg)
+
+	if arg.file == nil && arg.funcName == nil {
+		// No useful information from symbolizer.
+		return Frame{}, more
+	}
+
+	var frames []Frame
+	for {
+		frames = append(frames, Frame{
+			PC:       pc,
+			Func:     nil,
+			Function: gostring(arg.funcName),
+			File:     gostring(arg.file),
+			Line:     int(arg.lineno),
+			Entry:    arg.entry,
+		})
+		if arg.more == 0 {
+			break
+		}
+		callCgoSymbolizer(&arg)
+	}
+
+	// No more frames for this PC. Tell the symbolizer we are done.
+	// We don't try to maintain a single cgoSymbolizerArg for the
+	// whole use of Frames, because there would be no good way to tell
+	// the symbolizer when we are done.
+	arg.pc = 0
+	callCgoSymbolizer(&arg)
+
+	if len(frames) == 1 {
+		// Return a single frame.
+		return frames[0], more
+	}
+
+	// Return the first frame we saw and store the rest to be
+	// returned by later calls to Next.
+	rf := frames[0]
+	frames = frames[1:]
+	ci.frames = new([]Frame)
+	*ci.frames = frames
+	return rf, true
+}
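
Typical use of the new API, for reference (a sketch, not part of this CL):

	package main

	import (
		"fmt"
		"runtime"
	)

	// printStack expands a PC slice from Callers into
	// function/file/line via CallersFrames/Next, which (unlike
	// FuncForPC alone) can also report cgo frames through the
	// symbolizer hook.
	func printStack() {
		pc := make([]uintptr, 16)
		n := runtime.Callers(1, pc) // skip the runtime.Callers frame itself
		frames := runtime.CallersFrames(pc[:n])
		for {
			frame, more := frames.Next()
			fmt.Printf("%s\n\t%s:%d\n", frame.Function, frame.File, frame.Line)
			if !more {
				break
			}
		}
	}

	func main() { printStack() }
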
 
 // NOTE: Func does not expose the actual unexported fields, because we return *Func
 // values to users, and we want to keep them from being able to overwrite the data
@@ -25,7 +172,6 @@
 	_PCDATA_StackMapIndex       = 0
 	_FUNCDATA_ArgsPointerMaps   = 0
 	_FUNCDATA_LocalsPointerMaps = 1
-	_FUNCDATA_DeadValueMaps     = 2
 	_ArgsSizeUnknown            = -0x80000000
 )
 
@@ -47,14 +193,18 @@
 	bss, ebss             uintptr
 	noptrbss, enoptrbss   uintptr
 	end, gcdata, gcbss    uintptr
+	types, etypes         uintptr
 
-	typelinks []*_type
+	typelinks []int32 // offsets from types
+	itablinks []*itab
 
 	modulename   string
 	modulehashes []modulehash
 
 	gcdatamask, gcbssmask bitvector
 
+	typemap map[typeOff]*_type // offset to *_rtype in previous module
+
 	next *moduledata
 }
 
@@ -83,8 +233,8 @@
 // Each bucket represents 4096 bytes of the text segment.
 // Each subbucket represents 256 bytes of the text segment.
 // To find a function given a pc, locate the bucket and subbucket for
-// that pc.  Add together the idx and subbucket value to obtain a
-// function index.  Then scan the functab array starting at that
+// that pc. Add together the idx and subbucket value to obtain a
+// function index. Then scan the functab array starting at that
 // index to find the target function.
 // This table uses 20 bytes for every 4096 bytes of code, or ~0.5% overhead.
 type findfuncbucket struct {
@@ -106,13 +256,14 @@
 	// and a byte giving the pointer width in bytes.
 	pcln := *(**[8]byte)(unsafe.Pointer(&datap.pclntable))
 	pcln32 := *(**[2]uint32)(unsafe.Pointer(&datap.pclntable))
-	if pcln32[0] != 0xfffffffb || pcln[4] != 0 || pcln[5] != 0 || pcln[6] != _PCQuantum || pcln[7] != ptrSize {
+	if pcln32[0] != 0xfffffffb || pcln[4] != 0 || pcln[5] != 0 || pcln[6] != sys.PCQuantum || pcln[7] != sys.PtrSize {
 		println("runtime: function symbol table header:", hex(pcln32[0]), hex(pcln[4]), hex(pcln[5]), hex(pcln[6]), hex(pcln[7]))
 		throw("invalid function symbol table\n")
 	}
 
 	// ftab is lookup table for function by program counter.
 	nftab := len(datap.ftab) - 1
+	var pcCache pcvalueCache
 	for i := 0; i < nftab; i++ {
 		// NOTE: ftab[nftab].entry is legal; it is the address beyond the final function.
 		if datap.ftab[i].entry > datap.ftab[i+1].entry {
@@ -148,9 +299,9 @@
 					}
 				}
 			}
-			pcvalue(f, f.pcfile, end, true)
-			pcvalue(f, f.pcln, end, true)
-			pcvalue(f, f.pcsp, end, true)
+			pcvalue(f, f.pcfile, end, &pcCache, true)
+			pcvalue(f, f.pcln, end, &pcCache, true)
+			pcvalue(f, f.pcsp, end, &pcCache, true)
 		}
 	}
 
@@ -196,7 +347,7 @@
 
 func findmoduledatap(pc uintptr) *moduledata {
 	for datap := &firstmoduledata; datap != nil; datap = datap.next {
-		if datap.minpc <= pc && pc <= datap.maxpc {
+		if datap.minpc <= pc && pc < datap.maxpc {
 			return datap
 		}
 	}
@@ -227,10 +378,42 @@
 	return (*_func)(unsafe.Pointer(&datap.pclntable[datap.ftab[idx].funcoff]))
 }
 
-func pcvalue(f *_func, off int32, targetpc uintptr, strict bool) int32 {
+type pcvalueCache struct {
+	entries [16]pcvalueCacheEnt
+}
+
+type pcvalueCacheEnt struct {
+	// targetpc and off together are the key of this cache entry.
+	targetpc uintptr
+	off      int32
+	// val is the value of this cached pcvalue entry.
+	val int32
+}
+
+func pcvalue(f *_func, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
 	if off == 0 {
 		return -1
 	}
+
+	// Check the cache. This speeds up walks of deep stacks, which
+	// tend to have the same recursive functions over and over.
+	//
+	// This cache is small enough that full associativity is
+	// cheaper than doing the hashing for a less associative
+	// cache.
+	if cache != nil {
+		for _, ent := range cache.entries {
+			// We check off first because we're more
+			// likely to have multiple entries with
+			// different offsets for the same targetpc
+			// than the other way around, so we'll usually
+			// fail in the first clause.
+			if ent.off == off && ent.targetpc == targetpc {
+				return ent.val
+			}
+		}
+	}
+
 	datap := findmoduledatap(f.entry) // inefficient
 	if datap == nil {
 		if strict && panicking == 0 {
@@ -249,6 +432,19 @@
 			break
 		}
 		if targetpc < pc {
+			// Replace a random entry in the cache. Random
+			// replacement prevents a performance cliff if
+			// a recursive stack's cycle is slightly
+			// larger than the cache.
+			if cache != nil {
+				ci := fastrand1() % uint32(len(cache.entries))
+				cache.entries[ci] = pcvalueCacheEnt{
+					targetpc: targetpc,
+					off:      off,
+					val:      val,
+				}
+			}
+
 			return val
 		}
 	}
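
A standalone model of this cache policy (types and names invented; the real key is the (targetpc, off) pair exactly as above):

	package pcsketch

	import "math/rand"

	// entry and cache model pcvalueCache: a handful of entries, fully
	// associative (lookup is a linear scan, cheap at this size), with
	// random replacement. Random replacement avoids the cliff a FIFO
	// or LRU policy hits when a recursion cycle is one entry larger
	// than the cache and every lookup misses.
	type entry struct {
		targetpc uintptr
		off      int32
		val      int32
		ok       bool
	}

	type cache struct{ entries [16]entry }

	func (c *cache) lookup(targetpc uintptr, off int32) (int32, bool) {
		for _, e := range c.entries {
			if e.ok && e.off == off && e.targetpc == targetpc {
				return e.val, true
			}
		}
		return 0, false
	}

	func (c *cache) insert(targetpc uintptr, off int32, val int32) {
		i := rand.Intn(len(c.entries))
		c.entries[i] = entry{targetpc: targetpc, off: off, val: val, ok: true}
	}
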
@@ -285,7 +481,7 @@
 	if datap == nil {
 		return nil
 	}
-	return (*byte)(unsafe.Pointer(&datap.pclntable[f.nameoff]))
+	return &datap.pclntable[f.nameoff]
 }
 
 func funcname(f *_func) string {
@@ -297,8 +493,8 @@
 	if datap == nil {
 		return "?", 0
 	}
-	fileno := int(pcvalue(f, f.pcfile, targetpc, strict))
-	line = pcvalue(f, f.pcln, targetpc, strict)
+	fileno := int(pcvalue(f, f.pcfile, targetpc, nil, strict))
+	line = pcvalue(f, f.pcln, targetpc, nil, strict)
 	if fileno == -1 || line == -1 || fileno >= len(datap.filetab) {
 		// print("looking for ", hex(targetpc), " in ", funcname(f), " got file=", fileno, " line=", lineno, "\n")
 		return "?", 0
@@ -311,20 +507,20 @@
 	return funcline1(f, targetpc, true)
 }
 
-func funcspdelta(f *_func, targetpc uintptr) int32 {
-	x := pcvalue(f, f.pcsp, targetpc, true)
-	if x&(ptrSize-1) != 0 {
+func funcspdelta(f *_func, targetpc uintptr, cache *pcvalueCache) int32 {
+	x := pcvalue(f, f.pcsp, targetpc, cache, true)
+	if x&(sys.PtrSize-1) != 0 {
 		print("invalid spdelta ", funcname(f), " ", hex(f.entry), " ", hex(targetpc), " ", hex(f.pcsp), " ", x, "\n")
 	}
 	return x
 }
 
-func pcdatavalue(f *_func, table int32, targetpc uintptr) int32 {
+func pcdatavalue(f *_func, table int32, targetpc uintptr, cache *pcvalueCache) int32 {
 	if table < 0 || table >= f.npcdata {
 		return -1
 	}
 	off := *(*int32)(add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(table)*4))
-	return pcvalue(f, off, targetpc, true)
+	return pcvalue(f, off, targetpc, cache, true)
 }
 
 func funcdata(f *_func, i int32) unsafe.Pointer {
@@ -332,13 +528,13 @@
 		return nil
 	}
 	p := add(unsafe.Pointer(&f.nfuncdata), unsafe.Sizeof(f.nfuncdata)+uintptr(f.npcdata)*4)
-	if ptrSize == 8 && uintptr(p)&4 != 0 {
+	if sys.PtrSize == 8 && uintptr(p)&4 != 0 {
 		if uintptr(unsafe.Pointer(f))&4 != 0 {
 			println("runtime: misaligned func", f)
 		}
 		p = add(p, 4)
 	}
-	return *(*unsafe.Pointer)(add(p, uintptr(i)*ptrSize))
+	return *(*unsafe.Pointer)(add(p, uintptr(i)*sys.PtrSize))
 }
 
 // step advances to the next pc, value pair in the encoded table.
@@ -354,7 +550,7 @@
 	}
 	vdelta := int32(uvdelta)
 	p, pcdelta := readvarint(p)
-	*pc += uintptr(pcdelta * _PCQuantum)
+	*pc += uintptr(pcdelta * sys.PCQuantum)
 	*val += vdelta
 	return p, true
 }
diff --git a/src/runtime/sys_darwin_386.s b/src/runtime/sys_darwin_386.s
index abc5d32..83f4709 100644
--- a/src/runtime/sys_darwin_386.s
+++ b/src/runtime/sys_darwin_386.s
@@ -201,6 +201,11 @@
 	MOVL	$0, 8(SP)	// time zone pointer
 	MOVL	$116, AX
 	INT	$0x80
+	CMPL	AX, $0
+	JNE	inreg
+	MOVL	12(SP), AX
+	MOVL	16(SP), DX
+inreg:
 	// sec is in AX, usec in DX
 	// convert to DX:AX nsec
 	MOVL	DX, BX
@@ -242,53 +247,55 @@
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
+	MOVL	fn+0(FP), AX
+	MOVL	sig+4(FP), BX
+	MOVL	info+8(FP), CX
+	MOVL	ctx+12(FP), DX
+	MOVL	SP, SI
+	SUBL	$32, SP		// align stack; handler might be C code
+	ANDL	$~15, SP
+	MOVL	BX, 0(SP)
+	MOVL	CX, 4(SP)
+	MOVL	DX, 8(SP)
+	MOVL	SI, 12(SP)
+	CALL	AX
+	MOVL	12(SP), AX
+	MOVL	AX, SP
+	RET
+
+TEXT runtime·sigreturn(SB),NOSPLIT,$12-8
+	MOVL	ctx+0(FP), CX
+	MOVL	infostyle+4(FP), BX
+	MOVL	$0, 0(SP)	// "caller PC" - ignored
+	MOVL	CX, 4(SP)
+	MOVL	BX, 8(SP)
+	MOVL	$184, AX	// sigreturn(ucontext, infostyle)
+	INT	$0x80
+	MOVL	$0xf1, 0xf1  // crash
+	RET
+
 // Sigtramp's job is to call the actual signal handler.
 // It is called with the following arguments on the stack:
-//	0(FP)	"return address" - ignored
-//	4(FP)	actual handler
-//	8(FP)	signal number
-//	12(FP)	siginfo style
-//	16(FP)	siginfo
-//	20(FP)	context
-TEXT runtime·sigtramp(SB),NOSPLIT,$40
-	get_tls(CX)
-	
-	// check that g exists
-	MOVL	g(CX), DI
-	CMPL	DI, $0
-	JNE	6(PC)
-	MOVL	sig+8(FP), BX
+//	0(SP)	"return address" - ignored
+//	4(SP)	actual handler
+//	8(SP)	signal number
+//	12(SP)	siginfo style
+//	16(SP)	siginfo
+//	20(SP)	context
+TEXT runtime·sigtramp(SB),NOSPLIT,$20
+	MOVL	fn+0(FP), BX
 	MOVL	BX, 0(SP)
-	MOVL	$runtime·badsignal(SB), AX
-	CALL	AX
-	JMP 	ret
-
-	// save g
-	MOVL	DI, 20(SP)
-
-	// g = m->gsignal
-	MOVL	g_m(DI), BP
-	MOVL	m_gsignal(BP), BP
-	MOVL	BP, g(CX)
-
-	// copy arguments to sighandler
-	MOVL	sig+8(FP), BX
-	MOVL	BX, 0(SP)
-	MOVL	info+12(FP), BX
+	MOVL	style+4(FP), BX
 	MOVL	BX, 4(SP)
-	MOVL	context+16(FP), BX
+	MOVL	sig+8(FP), BX
 	MOVL	BX, 8(SP)
-	MOVL	DI, 12(SP)
+	MOVL	info+12(FP), BX
+	MOVL	BX, 12(SP)
+	MOVL	context+16(FP), BX
+	MOVL	BX, 16(SP)
+	CALL	runtime·sigtrampgo(SB)
 
-	MOVL	handler+0(FP), BX
-	CALL	BX
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), DI
-	MOVL	DI, g(CX)
-
-ret:
 	// call sigreturn
 	MOVL	context+16(FP), CX
 	MOVL	style+4(FP), BX
@@ -361,10 +368,8 @@
 //	SP = stack - C_32_STK_ALIGN
 TEXT runtime·bsdthread_start(SB),NOSPLIT,$0
 	// set up ldt 7+id to point at m->tls.
-	// m->tls is at m+40.  newosproc left
-	// the m->id in tls[0].
 	LEAL	m_tls(DX), BP
-	MOVL	0(BP), DI
+	MOVL	m_id(DX), DI
 	ADDL	$7, DI	// m0 is LDT#7. count up.
 	// setldt(tls#, &tls, sizeof tls)
 	PUSHAL	// save registers
@@ -377,7 +382,7 @@
 	POPL	AX
 	POPAL
 
-	// Now segment is established.  Initialize m, g.
+	// Now segment is established. Initialize m, g.
 	get_tls(BP)
 	MOVL    m_g0(DX), AX
 	MOVL	AX, g(BP)
diff --git a/src/runtime/sys_darwin_amd64.s b/src/runtime/sys_darwin_amd64.s
index 692dbca..e09b906 100644
--- a/src/runtime/sys_darwin_amd64.s
+++ b/src/runtime/sys_darwin_amd64.s
@@ -155,10 +155,15 @@
 
 systime:
 	// Fall back to system call (usually first call in this thread).
-	MOVQ	SP, DI	// must be non-nil, unused
+	MOVQ	SP, DI
 	MOVQ	$0, SI
 	MOVL	$(0x2000000+116), AX
 	SYSCALL
+	CMPQ	AX, $0
+	JNE	inreg
+	MOVQ	0(SP), AX
+	MOVL	8(SP), DX
+inreg:
 	// sec is in AX, usec in DX
 	// return nsec in AX
 	IMULQ	$1000000000, AX
@@ -214,10 +219,14 @@
 
 TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
 	MOVQ fn+0(FP),    AX
-	MOVQ sig+8(FP),   DI
+	MOVL sig+8(FP),   DI
 	MOVQ info+16(FP), SI
 	MOVQ ctx+24(FP),  DX
+	MOVQ SP, BP
+	SUBQ $64, SP
+	ANDQ $~15, SP     // alignment for x86_64 ABI
 	CALL AX
+	MOVQ BP, SP
 	RET
 
 TEXT runtime·sigreturn(SB),NOSPLIT,$0-12
diff --git a/src/runtime/sys_darwin_arm.s b/src/runtime/sys_darwin_arm.s
index 087dec5..6b6437d 100644
--- a/src/runtime/sys_darwin_arm.s
+++ b/src/runtime/sys_darwin_arm.s
@@ -24,9 +24,9 @@
 #define	SYS_kill           37
 #define	SYS_getpid         20
 #define	SYS___pthread_kill 328
+#define	SYS_pthread_sigmask 329
 #define	SYS_setitimer      83
 #define	SYS___sysctl       202
-#define	SYS_sigprocmask    48
 #define	SYS_sigaction      46
 #define	SYS_sigreturn      184
 #define	SYS_select         93
@@ -194,6 +194,18 @@
 	MOVW	R1, ret_hi+4(FP)
 	RET
 
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
+	MOVW	sig+4(FP), R0
+	MOVW	info+8(FP), R1
+	MOVW	ctx+12(FP), R2
+	MOVW	fn+0(FP), R11
+	MOVW	R13, R4
+	SUB	$24, R13
+	BIC	$0x7, R13 // alignment for ELF ABI
+	BL	(R11)
+	MOVW	R4, R13
+	RET
+
 // Sigtramp's job is to call the actual signal handler.
 // It is called with the following arguments on the stack:
 //	 LR  	"return address" - ignored
@@ -249,7 +261,7 @@
 	MOVW    R1, 24(R6)
 
 	// switch stack and g
-	MOVW	R6, R13 // sigtramp can not re-entrant, so no need to back up R13.
+	MOVW	R6, R13 // sigtramp is not re-entrant, so no need to back up R13.
 	MOVW	R5, g
 
 	BL	(R0)
@@ -268,7 +280,7 @@
 	MOVW	sig+0(FP), R0
 	MOVW	new+4(FP), R1
 	MOVW	old+8(FP), R2
-	MOVW	$SYS_sigprocmask, R12
+	MOVW	$SYS_pthread_sigmask, R12
 	SWI	$0x80
 	BL.CS	notok<>(SB)
 	RET
@@ -297,12 +309,6 @@
 	SWI	$0x80
 	RET
 
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B	runtime·armcas(SB)
-
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
 
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index 0f9da85..a3b851d 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -24,9 +24,9 @@
 #define	SYS_kill           37
 #define	SYS_getpid         20
 #define	SYS___pthread_kill 328
+#define	SYS_pthread_sigmask 329
 #define	SYS_setitimer      83
 #define	SYS___sysctl       202
-#define	SYS_sigprocmask    48
 #define	SYS_sigaction      46
 #define	SYS_sigreturn      184
 #define	SYS_select         93
@@ -180,6 +180,14 @@
 	MOVD	R0, ret+0(FP)
 	RET
 
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), R0
+	MOVD	info+16(FP), R1
+	MOVD	ctx+24(FP), R2
+	MOVD	fn+0(FP), R11
+	BL	(R11)
+	RET
+
 // Sigtramp's job is to call the actual signal handler.
 // It is called with the following arguments on the stack:
 //	LR	"return address" - ignored
@@ -237,7 +245,7 @@
 	MOVD	R1, 48(R6)
 
 	// switch stack and g
-	MOVD	R6, RSP	// sigtramp can not re-entrant, so no need to back up RSP.
+	MOVD	R6, RSP	// sigtramp is not re-entrant, so no need to back up RSP.
 	MOVD	R5, g
 
 	BL	(R0)
@@ -256,7 +264,7 @@
 	MOVW	sig+0(FP), R0
 	MOVD	new+8(FP), R1
 	MOVD	old+16(FP), R2
-	MOVW	$SYS_sigprocmask, R16
+	MOVW	$SYS_pthread_sigmask, R16
 	SVC	$0x80
 	BCC	2(PC)
 	BL	notok<>(SB)
diff --git a/src/runtime/sys_dragonfly_amd64.s b/src/runtime/sys_dragonfly_amd64.s
index 26c9784..be964cb 100644
--- a/src/runtime/sys_dragonfly_amd64.s
+++ b/src/runtime/sys_dragonfly_amd64.s
@@ -51,6 +51,18 @@
 	MOVQ	R13, g_m(DI)
 	MOVQ	DI, g(CX)
 
+	// On DragonFly, a new thread inherits the signal stack of the
+	// creating thread. That confuses minit, so we remove that
+	// signal stack here before calling the regular mstart. It's
+	// a bit baroque to remove a signal stack here only to add one
+	// in minit, but it's a simple change that keeps DragonFly
+	// working like other OS's. At this point all signals are
+	// blocked, so there is no race.
+	SUBQ	$8, SP
+	MOVQ	$0, 0(SP)
+	CALL	runtime·signalstack(SB)
+	ADDQ	$8, SP
+
 	CALL	runtime·stackcheck(SB)
 	CALL	runtime·mstart(SB)
 
@@ -187,37 +199,19 @@
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$64
-	get_tls(BX)
-
-	// check that g exists
-	MOVQ	g(BX), R10
-	CMPQ	R10, $0
-	JNE	5(PC)
-	MOVQ	DI, 0(SP)
-	MOVQ	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVL	sig+8(FP), DI
+	MOVQ	info+16(FP), SI
+	MOVQ	ctx+24(FP), DX
+	MOVQ	fn+0(FP), AX
 	CALL	AX
 	RET
 
-	// save g
-	MOVQ	R10, 40(SP)
-	
-	// g = m->signal
-	MOVQ	g_m(R10), AX
-	MOVQ	m_gsignal(AX), AX
-	MOVQ	AX, g(BX)
-	
+TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	MOVQ	DI, 0(SP)
 	MOVQ	SI, 8(SP)
 	MOVQ	DX, 16(SP)
-	MOVQ	R10, 24(SP)
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(BX)
-	MOVQ	40(SP), R10
-	MOVQ	R10, g(BX)
+	CALL	runtime·sigtrampgo(SB)
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
diff --git a/src/runtime/sys_freebsd_386.s b/src/runtime/sys_freebsd_386.s
index b2dd780..b37abce 100644
--- a/src/runtime/sys_freebsd_386.s
+++ b/src/runtime/sys_freebsd_386.s
@@ -25,7 +25,7 @@
 	MOVL	mm+0(FP), AX
 	MOVL	m_g0(AX), BX
 	LEAL	m_tls(AX), BP
-	MOVL	0(BP), DI
+	MOVL	m_id(AX), DI
 	ADDL	$7, DI
 	PUSHAL
 	PUSHL	$32
@@ -207,44 +207,26 @@
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$44
-	get_tls(CX)
-
-	// check that g exists
-	MOVL	g(CX), DI
-	CMPL	DI, $0
-	JNE	6(PC)
-	MOVL	signo+0(FP), BX
-	MOVL	BX, 0(SP)
-	MOVL	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$12-16
+	MOVL	sig+4(FP), AX
+	MOVL	AX, 0(SP)
+	MOVL	info+8(FP), AX
+	MOVL	AX, 4(SP)
+	MOVL	ctx+12(FP), AX
+	MOVL	AX, 8(SP)
+	MOVL	fn+0(FP), AX
 	CALL	AX
-	JMP 	ret
+	RET
 
-	// save g
-	MOVL	DI, 20(SP)
-	
-	// g = m->gsignal
-	MOVL	g_m(DI), BX
-	MOVL	m_gsignal(BX), BX
-	MOVL	BX, g(CX)
-
-	// copy arguments for call to sighandler
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
 	MOVL	info+4(FP), BX
 	MOVL	BX, 4(SP)
 	MOVL	context+8(FP), BX
 	MOVL	BX, 8(SP)
-	MOVL	DI, 12(SP)
+	CALL	runtime·sigtrampgo(SB)
 
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), BX
-	MOVL	BX, g(CX)
-
-ret:
 	// call sigreturn
 	MOVL	context+8(FP), AX
 	MOVL	$0, 0(SP)	// syscall gap
diff --git a/src/runtime/sys_freebsd_amd64.s b/src/runtime/sys_freebsd_amd64.s
index b1c67c7..9700117 100644
--- a/src/runtime/sys_freebsd_amd64.s
+++ b/src/runtime/sys_freebsd_amd64.s
@@ -183,37 +183,19 @@
 	MOVL	$0xf1, 0xf1  // crash
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$64
-	get_tls(BX)
-
-	// check that g exists
-	MOVQ	g(BX), R10
-	CMPQ	R10, $0
-	JNE	5(PC)
-	MOVQ	DI, 0(SP)
-	MOVQ	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVL	sig+8(FP), DI
+	MOVQ	info+16(FP), SI
+	MOVQ	ctx+24(FP), DX
+	MOVQ	fn+0(FP), AX
 	CALL	AX
 	RET
 
-	// save g
-	MOVQ	R10, 40(SP)
-	
-	// g = m->signal
-	MOVQ	g_m(R10), AX
-	MOVQ	m_gsignal(AX), AX
-	MOVQ	AX, g(BX)
-	
+TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	MOVQ	DI, 0(SP)
 	MOVQ	SI, 8(SP)
 	MOVQ	DX, 16(SP)
-	MOVQ	R10, 24(SP)
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(BX)
-	MOVQ	40(SP), R10
-	MOVQ	R10, g(BX)
+	CALL	runtime·sigtrampgo(SB)
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
diff --git a/src/runtime/sys_freebsd_arm.s b/src/runtime/sys_freebsd_arm.s
index bd6ff96..e7dfb28 100644
--- a/src/runtime/sys_freebsd_arm.s
+++ b/src/runtime/sys_freebsd_arm.s
@@ -216,7 +216,7 @@
 	MOVW.CS R8, (R8)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$24
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	// this might be called in external code context,
 	// where g is not set.
 	// first save R0, because runtime·load_g will clobber it
@@ -225,30 +225,9 @@
 	CMP 	$0, R0
 	BL.NE	runtime·load_g(SB)
 
-	CMP $0, g
-	BNE 4(PC)
-	// signal number is already prepared in 4(R13)
-	MOVW $runtime·badsignal(SB), R11
-	BL (R11)
-	RET
-
-	// save g
-	MOVW g, R4
-	MOVW g, 20(R13)
-
-	// g = m->signal
-	MOVW g_m(g), R8
-	MOVW m_gsignal(R8), g
-
-	// R0 is already saved
-	MOVW R1, 8(R13) // info
-	MOVW R2, 12(R13) // context
-	MOVW R4, 16(R13) // oldg
-
-	BL runtime·sighandler(SB)
-
-	// restore g
-	MOVW 20(R13), g
+	MOVW	R1, 8(R13)
+	MOVW	R2, 12(R13)
+	BL	runtime·sigtrampgo(SB)
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$16
@@ -300,6 +279,18 @@
 	MOVW.CS R8, (R8)
 	RET
 
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
+	MOVW	sig+4(FP), R0
+	MOVW	info+8(FP), R1
+	MOVW	ctx+12(FP), R2
+	MOVW	fn+0(FP), R11
+	MOVW	R13, R4
+	SUB	$24, R13
+	BIC	$0x7, R13 // alignment for ELF ABI
+	BL	(R11)
+	MOVW	R4, R13
+	RET
+
 TEXT runtime·usleep(SB),NOSPLIT,$16
 	MOVW usec+0(FP), R0
 	CALL runtime·usplitR0(SB)
@@ -377,20 +368,6 @@
 	SWI $0
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// TODO(minux): this is only valid for ARMv6+
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_linux_386.s b/src/runtime/sys_linux_386.s
index 98a1a0e..4fe07e0 100644
--- a/src/runtime/sys_linux_386.s
+++ b/src/runtime/sys_linux_386.s
@@ -10,17 +10,31 @@
 #include "go_tls.h"
 #include "textflag.h"
 
+// Most linux systems use glibc's dynamic linker, which puts the
+// __kernel_vsyscall vdso helper at 0x10(GS) for easy access from
+// position-independent code, and setldt in the runtime does the same in
+// the statically linked case. However, systems that use an alternative
+// libc, such as Android's bionic and musl, do not save the helper
+// anywhere, so the only way to invoke a syscall from
+// position-independent code is boring old int $0x80 (which is also what
+// the syscall wrappers in bionic/musl use).
+//
+// Benchmarks showed that using int $0x80 is as fast as calling
+// *%gs:0x10 except on AMD Opteron. See https://golang.org/cl/19833
+// for the benchmark program and raw data.
+//#define INVOKE_SYSCALL	CALL	0x10(GS) // non-portable
+#define INVOKE_SYSCALL	INT	$0x80
+
 TEXT runtime·exit(SB),NOSPLIT,$0
 	MOVL	$252, AX	// syscall number
 	MOVL	code+0(FP), BX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	INT $3	// not reached
 	RET
 
 TEXT runtime·exit1(SB),NOSPLIT,$0
 	MOVL	$1, AX	// exit - exit the current os thread
 	MOVL	code+0(FP), BX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	INT $3	// not reached
 	RET
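
As the comment above explains, every former CALL *runtime·_vdso(SB) in
this file becomes INVOKE_SYSCALL, that is INT $0x80, so the stubs keep
working under bionic and musl. User code that bypasses the wrappers ends
up in the same place; a tiny check that only makes sense on linux/386
(syscall numbers differ per arch):

	package main

	import (
		"fmt"
		"syscall"
	)

	func main() {
		// 20 = getpid on linux/386, the same number raiseproc uses below.
		pid, _, _ := syscall.RawSyscall(20, 0, 0, 0)
		fmt.Println("pid:", pid)
	}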
 
@@ -29,7 +43,7 @@
 	MOVL	name+0(FP), BX
 	MOVL	mode+4(FP), CX
 	MOVL	perm+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	MOVL	$-1, AX
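
The CMPL AX, $0xfffff001 / JLS 2(PC) pair that follows each
INVOKE_SYSCALL is the standard Linux raw-return check: the kernel reports
errors by returning -4095..-1, which as unsigned 32-bit values occupy
0xfffff001..0xffffffff. The same test in Go, as a sketch:

	package main

	import "fmt"

	// sysRet386 mirrors the assembly's error check: values in the top
	// 4095 of the unsigned range are negated errnos; everything else
	// is a genuine result.
	func sysRet386(ax uint32) (ret int32, errno int32) {
		if ax >= 0xfffff001 {
			return -1, -int32(ax)
		}
		return int32(ax), 0
	}

	func main() {
		fmt.Println(sysRet386(3))          // 3 0   (e.g. a new fd)
		fmt.Println(sysRet386(0xfffffffe)) // -1 2  (ENOENT)
	}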
@@ -39,7 +53,7 @@
 TEXT runtime·closefd(SB),NOSPLIT,$0
 	MOVL	$6, AX		// syscall - close
 	MOVL	fd+0(FP), BX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	MOVL	$-1, AX
@@ -51,7 +65,7 @@
 	MOVL	fd+0(FP), BX
 	MOVL	p+4(FP), CX
 	MOVL	n+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	MOVL	$-1, AX
@@ -63,7 +77,7 @@
 	MOVL	fd+0(FP), BX
 	MOVL	p+4(FP), CX
 	MOVL	n+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	MOVL	$-1, AX
@@ -74,7 +88,7 @@
 	MOVL	$191, AX		// syscall - ugetrlimit
 	MOVL	kind+0(FP), BX
 	MOVL	limit+4(FP), CX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+8(FP)
 	RET
 
@@ -93,31 +107,31 @@
 	MOVL	$0, DX
 	MOVL	$0, SI
 	LEAL	0(SP), DI
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·gettid(SB),NOSPLIT,$0-4
 	MOVL	$224, AX	// syscall - gettid
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+0(FP)
 	RET
 
 TEXT runtime·raise(SB),NOSPLIT,$12
 	MOVL	$224, AX	// syscall - gettid
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, BX	// arg 1 tid
 	MOVL	sig+0(FP), CX	// arg 2 signal
 	MOVL	$238, AX	// syscall - tkill
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·raiseproc(SB),NOSPLIT,$12
 	MOVL	$20, AX	// syscall - getpid
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, BX	// arg 1 pid
 	MOVL	sig+0(FP), CX	// arg 2 signal
 	MOVL	$37, AX	// syscall - kill
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·setitimer(SB),NOSPLIT,$0-12
@@ -125,7 +139,7 @@
 	MOVL	mode+0(FP), BX
 	MOVL	new+4(FP), CX
 	MOVL	old+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·mincore(SB),NOSPLIT,$0-16
@@ -133,7 +147,7 @@
 	MOVL	addr+0(FP), BX
 	MOVL	n+4(FP), CX
 	MOVL	dst+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+12(FP)
 	RET
 
@@ -143,7 +157,7 @@
 	MOVL	$0, BX		// CLOCK_REALTIME
 	LEAL	8(SP), CX
 	MOVL	$0, DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	8(SP), AX	// sec
 	MOVL	12(SP), BX	// nsec
 
@@ -160,7 +174,7 @@
 	MOVL	$1, BX		// CLOCK_MONOTONIC
 	LEAL	8(SP), CX
 	MOVL	$0, DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	8(SP), AX	// sec
 	MOVL	12(SP), BX	// nsec
 
@@ -181,7 +195,7 @@
 	MOVL	new+4(FP), CX
 	MOVL	old+8(FP), DX
 	MOVL	size+12(FP), SI
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	INT $3
@@ -193,7 +207,7 @@
 	MOVL	new+4(FP), CX
 	MOVL	old+8(FP), DX
 	MOVL	size+12(FP), SI
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+16(FP)
 	RET
 
@@ -218,12 +232,15 @@
 	CALL	runtime·sigtrampgo(SB)
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	JMP	runtime·sigtramp(SB)
+
 TEXT runtime·sigreturn(SB),NOSPLIT,$0
 	MOVL	$173, AX	// rt_sigreturn
 	// Sigreturn expects same SP as signal handler,
-	// so cannot CALL *runtime._vsdo(SB) here.
+	// so cannot CALL 0x10(GS) here.
 	INT	$0x80
-	INT $3	// not reached
+	INT	$3	// not reached
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
@@ -235,7 +252,7 @@
 	MOVL	fd+16(FP), DI
 	MOVL	off+20(FP), BP
 	SHRL	$12, BP
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	3(PC)
 	NOTL	AX
@@ -247,7 +264,7 @@
 	MOVL	$91, AX	// munmap
 	MOVL	addr+0(FP), BX
 	MOVL	n+4(FP), CX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	INT $3
@@ -258,7 +275,7 @@
 	MOVL	addr+0(FP), BX
 	MOVL	n+4(FP), CX
 	MOVL	flags+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	// ignore failure - maybe pages are locked
 	RET
 
@@ -272,7 +289,7 @@
 	MOVL	ts+12(FP), SI
 	MOVL	addr2+16(FP), DI
 	MOVL	val3+20(FP), BP
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+24(FP)
 	RET
 
@@ -294,10 +311,9 @@
 	MOVL	SI, 8(CX)
 	MOVL	$1234, 12(CX)
 
-	// cannot use CALL *runtime·_vdso(SB) here, because
-	// the stack changes during the system call (after
-	// CALL *runtime·_vdso(SB), the child is still using
-	// the parent's stack when executing its RET instruction).
+	// cannot use CALL 0x10(GS) here, because the stack changes during the
+	// system call (after CALL 0x10(GS), the child is still using the
+	// parent's stack when executing its RET instruction).
 	INT	$0x80
 
 	// In parent, return.
@@ -314,7 +330,7 @@
 
 	// Initialize AX to Linux tid
 	MOVL	$224, AX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 
 	MOVL	0(SP), BX	    // m
 	MOVL	4(SP), DX	    // g
@@ -328,9 +344,8 @@
 	MOVL	AX, m_procid(BX)	// save tid as m->procid
 
 	// set up ldt 7+id to point at m->tls.
-	// newosproc left the id in tls[0].
 	LEAL	m_tls(BX), BP
-	MOVL	0(BP), DI
+	MOVL	m_id(BX), DI
 	ADDL	$7, DI	// m0 is LDT#7. count up.
 	// setldt(tls#, &tls, sizeof tls)
 	PUSHAL	// save registers
@@ -343,7 +358,7 @@
 	POPL	AX
 	POPAL
 
-	// Now segment is established.  Initialize m, g.
+	// Now segment is established. Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
 	MOVL	BX, g_m(DX)
@@ -366,7 +381,7 @@
 	MOVL	$186, AX	// sigaltstack
 	MOVL	new+4(SP), BX
 	MOVL	old+8(SP), CX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	CMPL	AX, $0xfffff001
 	JLS	2(PC)
 	INT	$3
@@ -394,11 +409,29 @@
 #define SEG_NOT_PRESENT 0x20
 #define USEABLE 0x40
 
-// setldt(int entry, int address, int limit)
-TEXT runtime·setldt(SB),NOSPLIT,$32
-	MOVL	entry+0(FP), BX	// entry
-	MOVL	address+4(FP), CX	// base address
+// `-1` means the kernel will pick a TLS entry on the first setldt call,
+// which happens during runtime init; we then store the allocated entry
+// number back here and reuse it on subsequent calls when creating new
+// threads.
+DATA  runtime·tls_entry_number+0(SB)/4, $-1
+GLOBL runtime·tls_entry_number(SB), NOPTR, $4
 
+// setldt(int entry, int address, int limit)
+// We use set_thread_area, which mucks with the GDT, instead of modify_ldt,
+// which would modify the LDT, but is disabled on some kernels.
+// The name setldt is a misnomer; we keep it for compatibility with other
+// platforms.
+TEXT runtime·setldt(SB),NOSPLIT,$32
+	MOVL	address+4(FP), DX	// base address
+
+#ifdef GOOS_android
+	/*
+	 * Same as in sys_darwin_386.s:/ugliness, different constant.
+	 * address currently holds m->tls, which must be %gs:0xf8.
+	 * See cgo/gcc_android_386.c for the derivation of the constant.
+	 */
+	SUBL	$0xf8, DX
+	MOVL	DX, 0(DX)
+#else
 	/*
 	 * When linking against the system libraries,
 	 * we use its pthread_create and let it set up %gs
@@ -409,42 +442,53 @@
 	 * To accommodate that rewrite, we translate
 	 * the address here and bump the limit to 0xffffffff (no limit)
 	 * so that -4(GS) maps to 0(address).
-	 * Also, the final 0(GS) (current 4(CX)) has to point
+	 * Also, the final 0(GS) (current 4(DX)) has to point
 	 * to itself, to mimic ELF.
 	 */
-	ADDL	$0x4, CX	// address
-	MOVL	CX, 0(CX)
+	ADDL	$0x4, DX	// address
+	MOVL	DX, 0(DX)
+#endif
+
+	// get entry number
+	MOVL	runtime·tls_entry_number(SB), CX
 
 	// set up user_desc
 	LEAL	16(SP), AX	// struct user_desc
-	MOVL	BX, 0(AX)
-	MOVL	CX, 4(AX)
-	MOVL	$0xfffff, 8(AX)
+	MOVL	CX, 0(AX)	// unsigned int entry_number
+	MOVL	DX, 4(AX)	// unsigned long base_addr
+	MOVL	$0xfffff, 8(AX)	// unsigned int limit
 	MOVL	$(SEG_32BIT|LIMIT_IN_PAGES|USEABLE|CONTENTS_DATA), 12(AX)	// flag bits
 
-	// call modify_ldt
-	MOVL	$1, BX	// func = 1 (write)
-	MOVL	AX, CX	// user_desc
-	MOVL	$16, DX	// sizeof(user_desc)
-	MOVL	$123, AX	// syscall - modify_ldt
-	CALL	*runtime·_vdso(SB)
+	// call set_thread_area
+	MOVL	AX, BX	// user_desc
+	MOVL	$243, AX	// syscall - set_thread_area
+	// We can't call this via 0x10(GS) because this is called from setldt0 to set that up.
+	INT     $0x80
 
 	// breakpoint on error
 	CMPL AX, $0xfffff001
 	JLS 2(PC)
 	INT $3
 
-	// compute segment selector - (entry*8+7)
-	MOVL	entry+0(FP), AX
+	// read allocated entry number back out of user_desc
+	LEAL	16(SP), AX	// get our user_desc back
+	MOVL	0(AX), AX
+
+	// store entry number if the kernel allocated it
+	CMPL	CX, $-1
+	JNE	2(PC)
+	MOVL	AX, runtime·tls_entry_number(SB)
+
+	// compute segment selector - (entry*8+3)
 	SHLL	$3, AX
-	ADDL	$7, AX
+	ADDL	$3, AX
 	MOVW	AX, GS
 
 	RET
 
 TEXT runtime·osyield(SB),NOSPLIT,$0
 	MOVL	$158, AX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	RET
 
 TEXT runtime·sched_getaffinity(SB),NOSPLIT,$0
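
The switch from modify_ldt to set_thread_area above also changes the
selector arithmetic: an x86 segment selector is index<<3 | TI<<2 | RPL,
and moving the entry from the LDT (table indicator 1) to the GDT (table
indicator 0) turns the old entry*8+7 into entry*8+3. A worked check:

	package main

	import "fmt"

	func selector(index, ti, rpl uint16) uint16 { return index<<3 | ti<<2 | rpl }

	func main() {
		fmt.Printf("LDT entry 7: %#x\n", selector(7, 1, 3)) // 0x3f == 7*8+7
		fmt.Printf("GDT entry 7: %#x\n", selector(7, 0, 3)) // 0x3b == 7*8+3
	}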
@@ -452,7 +496,7 @@
 	MOVL	pid+0(FP), BX
 	MOVL	len+4(FP), CX
 	MOVL	buf+8(FP), DX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+12(FP)
 	RET
 
@@ -460,7 +504,7 @@
 TEXT runtime·epollcreate(SB),NOSPLIT,$0
 	MOVL    $254, AX
 	MOVL	size+0(FP), BX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+4(FP)
 	RET
 
@@ -468,7 +512,7 @@
 TEXT runtime·epollcreate1(SB),NOSPLIT,$0
 	MOVL    $329, AX
 	MOVL	flags+0(FP), BX
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+4(FP)
 	RET
 
@@ -479,7 +523,7 @@
 	MOVL	op+4(FP), CX
 	MOVL	fd+8(FP), DX
 	MOVL	ev+12(FP), SI
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+16(FP)
 	RET
 
@@ -490,7 +534,7 @@
 	MOVL	ev+4(FP), CX
 	MOVL	nev+8(FP), DX
 	MOVL	timeout+12(FP), SI
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
 	MOVL	AX, ret+16(FP)
 	RET
 
@@ -500,5 +544,36 @@
 	MOVL	fd+0(FP), BX  // fd
 	MOVL	$2, CX  // F_SETFD
 	MOVL	$1, DX  // FD_CLOEXEC
-	CALL	*runtime·_vdso(SB)
+	INVOKE_SYSCALL
+	RET
+
+// int access(const char *name, int mode)
+TEXT runtime·access(SB),NOSPLIT,$0
+	MOVL	$33, AX  // syscall - access
+	MOVL	name+0(FP), BX
+	MOVL	mode+4(FP), CX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+8(FP)
+	RET
+
+// int connect(int fd, const struct sockaddr *addr, socklen_t addrlen)
+TEXT runtime·connect(SB),NOSPLIT,$0-16
+	// connect is implemented as socketcall (syscall 102) with call
+	// number 3 (SYS_CONNECT); the stack already holds fd, addr, addrlen
+	// in the order the kernel expects.
+	MOVL	$102, AX  // syscall - socketcall
+	MOVL	$3, BX  // connect
+	LEAL	fd+0(FP), CX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+12(FP)
+	RET
+
+// int socket(int domain, int type, int protocol)
+TEXT runtime·socket(SB),NOSPLIT,$0-16
+	// socket is implemented as socketcall (syscall 102) with call
+	// number 1 (SYS_SOCKET); the stack already holds domain, type,
+	// protocol in the order the kernel expects.
+	MOVL	$102, AX  // syscall - socketcall
+	MOVL	$1, BX  // socket
+	LEAL	domain+0(FP), CX
+	INVOKE_SYSCALL
+	MOVL	AX, ret+12(FP)
 	RET
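
The new access stub is an ordinary syscall, but socket and connect go
through socketcall (syscall 102), the 386 multiplexer that takes a
sub-call number plus a pointer to an argument block; the stubs above
simply point CX at their own incoming arguments with LEAL fd+0(FP). The
equivalent from user code, with the block built by hand (linux/386 only,
numbers as in this file):

	package main

	import (
		"fmt"
		"syscall"
		"unsafe"
	)

	func main() {
		args := [3]uintptr{syscall.AF_INET, syscall.SOCK_STREAM, 0}
		fd, _, errno := syscall.RawSyscall(102, 1, // socketcall, SYS_SOCKET
			uintptr(unsafe.Pointer(&args[0])), 0)
		fmt.Println(fd, errno)
	}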
diff --git a/src/runtime/sys_linux_amd64.s b/src/runtime/sys_linux_amd64.s
index 59c21c5..8a8f3cc 100644
--- a/src/runtime/sys_linux_amd64.s
+++ b/src/runtime/sys_linux_amd64.s
@@ -234,12 +234,93 @@
 	CALL AX
 	RET
 
+// Used instead of sigtramp in programs that use cgo.
+// Arguments from kernel are in DI, SI, DX.
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	// If no traceback function, do usual sigtramp.
+	MOVQ	runtime·cgoTraceback(SB), AX
+	TESTQ	AX, AX
+	JZ	sigtramp
+
+	// If no traceback support function, which means that
+	// runtime/cgo was not linked in, do usual sigtramp.
+	MOVQ	_cgo_callers(SB), AX
+	TESTQ	AX, AX
+	JZ	sigtramp
+
+	// Figure out if we are currently in a cgo call.
+	// If not, just do usual sigtramp.
+	get_tls(CX)
+	MOVQ	g(CX),AX
+	TESTQ	AX, AX
+	JZ	sigtrampnog     // g == nil
+	MOVQ	g_m(AX), AX
+	TESTQ	AX, AX
+	JZ	sigtramp        // g.m == nil
+	MOVL	m_ncgo(AX), CX
+	TESTL	CX, CX
+	JZ	sigtramp        // g.m.ncgo == 0
+	MOVQ	m_curg(AX), CX
+	TESTQ	CX, CX
+	JZ	sigtramp        // g.m.curg == nil
+	MOVQ	g_syscallsp(CX), CX
+	TESTQ	CX, CX
+	JZ	sigtramp        // g.m.curg.syscallsp == 0
+	MOVQ	m_cgoCallers(AX), R8
+	TESTQ	R8, R8
+	JZ	sigtramp        // g.m.cgoCallers == nil
+	MOVL	m_cgoCallersUse(AX), CX
+	TESTL	CX, CX
+	JNZ	sigtramp	// g.m.cgoCallersUse != 0
+
+	// Jump to a function in runtime/cgo.
+	// That function, written in C, will call the user's traceback
+	// function with proper unwind info, and will then call back here.
+	// The first three arguments, and the fifth, are already in registers.
+	// Set the two remaining arguments now.
+	MOVQ	runtime·cgoTraceback(SB), CX
+	MOVQ	$runtime·sigtramp(SB), R9
+	MOVQ	_cgo_callers(SB), AX
+	JMP	AX
+
+sigtramp:
+	JMP	runtime·sigtramp(SB)
+
+sigtrampnog:
+	// Signal arrived on a non-Go thread. If this is SIGPROF, get a
+	// stack trace.
+	CMPL	DI, $27 // 27 == SIGPROF
+	JNZ	sigtramp
+
+	// Lock sigprofCallersUse.
+	MOVL	$0, AX
+	MOVL	$1, CX
+	MOVQ	$runtime·sigprofCallersUse(SB), BX
+	LOCK
+	CMPXCHGL	CX, 0(BX)
+	JNZ	sigtramp  // Skip stack trace if already locked.
+
+	// Jump to the traceback function in runtime/cgo.
+	// It will call back to sigprofNonGo, which will ignore the
+	// arguments passed in registers.
+	// First three arguments to traceback function are in registers already.
+	MOVQ	runtime·cgoTraceback(SB), CX
+	MOVQ	$runtime·sigprofCallers(SB), R8
+	MOVQ	$runtime·sigprofNonGo(SB), R9
+	MOVQ	_cgo_callers(SB), AX
+	JMP	AX
+
+// For cgo unwinding to work, this function must look precisely like
+// the one in glibc.  The glibc source code is:
+// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/x86_64/sigaction.c
+// The code that cares about the precise instructions used is:
+// https://gcc.gnu.org/viewcvs/gcc/trunk/libgcc/config/i386/linux-unwind.h?revision=219188&view=markup
 TEXT runtime·sigreturn(SB),NOSPLIT,$0
-	MOVL	$15, AX	// rt_sigreturn
+	MOVQ	$15, AX	// rt_sigreturn
 	SYSCALL
 	INT $3	// not reached
 
-TEXT runtime·mmap(SB),NOSPLIT,$0
+TEXT runtime·sysMmap(SB),NOSPLIT,$0
 	MOVQ	addr+0(FP), DI
 	MOVQ	n+8(FP), SI
 	MOVL	prot+16(FP), DX
@@ -256,6 +337,24 @@
 	MOVQ	AX, ret+32(FP)
 	RET
 
+// Call the function stored in _cgo_mmap using the GCC calling convention.
+// This must be called on the system stack.
+TEXT runtime·callCgoMmap(SB),NOSPLIT,$16
+	MOVQ	addr+0(FP), DI
+	MOVQ	n+8(FP), SI
+	MOVL	prot+16(FP), DX
+	MOVL	flags+20(FP), CX
+	MOVL	fd+24(FP), R8
+	MOVL	off+28(FP), R9
+	MOVQ	_cgo_mmap(SB), AX
+	MOVQ	SP, BX
+	ANDQ	$~15, SP	// alignment as per amd64 psABI
+	MOVQ	BX, 0(SP)
+	CALL	AX
+	MOVQ	0(SP), SP
+	MOVQ	AX, ret+32(FP)
+	RET
+
 TEXT runtime·munmap(SB),NOSPLIT,$0
 	MOVQ	addr+0(FP), DI
 	MOVQ	n+8(FP), SI
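
The rename of runtime·mmap to runtime·sysMmap, paired with the new
callCgoMmap (which realigns the stack to 16 bytes for the C ABI and calls
through _cgo_mmap), lets a cgo-registered mmap interceptor, such as a
sanitizer's, observe the runtime's own mappings. The Go-side dispatch
presumably has the shape of the following sketch; the stubbed names stand
in for runtime internals:

	package main

	import (
		"fmt"
		"unsafe"
	)

	// Stand-ins for runtime internals; illustrative only.
	var _cgo_mmap unsafe.Pointer // non-nil when an interceptor is registered

	func systemstack(fn func()) { fn() }
	func sysMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
		return nil
	}
	func callCgoMmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
		return nil
	}

	// mmap routes through the interceptor when present, otherwise it
	// takes the raw stub renamed sysMmap in this diff.
	func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer {
		if _cgo_mmap != nil {
			var ret unsafe.Pointer
			systemstack(func() { ret = callCgoMmap(addr, n, prot, flags, fd, off) })
			return ret
		}
		return sysMmap(addr, n, prot, flags, fd, off)
	}

	func main() {
		fmt.Println(mmap(nil, 4096, 0, 0, -1, 0))
	}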
@@ -339,7 +438,7 @@
 	// Call fn
 	CALL	R12
 
-	// It shouldn't return.  If it does, exit that thread.
+	// It shouldn't return. If it does, exit that thread.
 	MOVL	$111, DI
 	MOVL	$60, AX
 	SYSCALL
@@ -357,8 +456,14 @@
 
 // set tls base to DI
 TEXT runtime·settls(SB),NOSPLIT,$32
+#ifdef GOOS_android
+	// Same as in sys_darwin_386.s:/ugliness, different constant.
+	// DI currently holds m->tls, which must be fs:0x1d0.
+	// See cgo/gcc_android_amd64.c for the derivation of the constant.
+	SUBQ	$0x1d0, DI  // In android, the tls base is at fs:0x1d0.
+#else
 	ADDQ	$8, DI	// ELF wants to use -8(FS)
-
+#endif
 	MOVQ	DI, SI
 	MOVQ	$0x1002, DI	// ARCH_SET_FS
 	MOVQ	$158, AX	// arch_prctl
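
settls now satisfies two TLS conventions: on ordinary ELF systems g is
addressed as -8(FS), so the base passed to ARCH_SET_FS is the tls slot
address plus eight; on Android, bionic owns the TLS layout, so the
runtime instead aims a fixed bionic slot at m->tls (fs:0x1d0 here,
gs:0xf8 in the 386 file above). The address arithmetic, as a sketch:

	package main

	import "fmt"

	// fsBase returns the value handed to ARCH_SET_FS given the address
	// of m.tls[0]; the constants mirror the assembly above.
	func fsBase(tls uintptr, android bool) uintptr {
		if android {
			return tls - 0x1d0 // so that fs:0x1d0 resolves to m.tls
		}
		return tls + 8 // so that -8(FS) resolves to m.tls
	}

	func main() {
		fmt.Printf("elf %#x  android %#x\n", fsBase(0x10000, false), fsBase(0x10000, true))
	}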
@@ -428,3 +533,33 @@
 	MOVL	$72, AX  // fcntl
 	SYSCALL
 	RET
+
+
+// int access(const char *name, int mode)
+TEXT runtime·access(SB),NOSPLIT,$0
+	MOVQ	name+0(FP), DI
+	MOVL	mode+8(FP), SI
+	MOVL	$21, AX  // syscall entry
+	SYSCALL
+	MOVL	AX, ret+16(FP)
+	RET
+
+// int connect(int fd, const struct sockaddr *addr, socklen_t addrlen)
+TEXT runtime·connect(SB),NOSPLIT,$0-28
+	MOVL	fd+0(FP), DI
+	MOVQ	addr+8(FP), SI
+	MOVL	addrlen+16(FP), DX
+	MOVL	$42, AX  // syscall entry
+	SYSCALL
+	MOVL	AX, ret+24(FP)
+	RET
+
+// int socket(int domain, int type, int protocol)
+TEXT runtime·socket(SB),NOSPLIT,$0-20
+	MOVL	domain+0(FP), DI
+	MOVL	type+4(FP), SI
+	MOVL	protocol+8(FP), DX
+	MOVL	$41, AX  // syscall entry
+	SYSCALL
+	MOVL	AX, ret+16(FP)
+	RET
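
The amd64 cgoSigtramp above encodes a chain of guards before it hands a
signal to the user's cgo traceback function; if any fails it falls back
to the normal sigtramp, and a nil g additionally diverts SIGPROF into the
sigprofCallers capture path. The same chain in Go, with stub types so it
compiles; this is a sketch of the control flow only, since the real code
must stay in assembly to run on threads that may have no g at all:

	package main

	type m struct {
		ncgo          int32
		curg          *g
		cgoCallers    *uintptr
		cgoCallersUse uint32
	}

	type g struct {
		m         *m
		syscallsp uintptr
	}

	// useCgoTraceback mirrors the TESTQ/JZ ladder in cgoSigtramp.
	func useCgoTraceback(gp *g) bool {
		return gp != nil && // nil g: non-Go thread (sigtrampnog)
			gp.m != nil &&
			gp.m.ncgo != 0 && // cgo has been used on this m
			gp.m.curg != nil &&
			gp.m.curg.syscallsp != 0 && // currently in a cgo/syscall frame
			gp.m.cgoCallers != nil &&
			gp.m.cgoCallersUse == 0 // traceback buffer not already in use
	}

	func main() {
		println(useCgoTraceback(nil)) // false
	}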
diff --git a/src/runtime/sys_linux_arm.s b/src/runtime/sys_linux_arm.s
index 29eb8eb..5e5fcf0 100644
--- a/src/runtime/sys_linux_arm.s
+++ b/src/runtime/sys_linux_arm.s
@@ -313,7 +313,7 @@
 	MOVW	$16(R13), R13
 	BL	(R0)
 
-	// It shouldn't return.  If it does, exit that thread.
+	// It shouldn't return. If it does, exit that thread.
 	SUB	$16, R13 // restore the stack pointer to avoid memory corruption
 	MOVW	$0, R0
 	MOVW	R0, 4(R13)
@@ -339,7 +339,11 @@
 	MOVW	info+8(FP), R1
 	MOVW	ctx+12(FP), R2
 	MOVW	fn+0(FP), R11
+	MOVW	R13, R4
+	SUB	$24, R13
+	BIC	$0x7, R13 // alignment for ELF ABI
 	BL	(R11)
+	MOVW	R4, R13
 	RET
 
 TEXT runtime·sigtramp(SB),NOSPLIT,$12
@@ -357,6 +361,10 @@
 	BL	(R11)
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	MOVW  	$runtime·sigtramp(SB), R11
+	B	(R11)
+
 TEXT runtime·rtsigprocmask(SB),NOSPLIT,$0
 	MOVW	sig+0(FP), R0
 	MOVW	new+4(FP), R1
@@ -390,35 +398,6 @@
 	SWI	$0
 	RET
 
-// Use kernel version instead of native armcas in asm_arm.s.
-// See ../sync/atomic/asm_linux_arm.s for details.
-TEXT cas<>(SB),NOSPLIT,$0
-	MOVW	$0xffff0fc0, R15 // R15 is hardware PC.
-
-TEXT runtime·cas(SB),NOSPLIT,$0
-	MOVW	ptr+0(FP), R2
-	MOVW	old+4(FP), R0
-loop:
-	MOVW	new+8(FP), R1
-	BL	cas<>(SB)
-	BCC	check
-	MOVW	$1, R0
-	MOVB	R0, ret+12(FP)
-	RET
-check:
-	// Kernel lies; double-check.
-	MOVW	ptr+0(FP), R2
-	MOVW	old+4(FP), R0
-	MOVW	0(R2), R3
-	CMP	R0, R3
-	BEQ	loop
-	MOVW	$0, R0
-	MOVB	R0, ret+12(FP)
-	RET
-
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
 // As for cas, memory barriers are complicated on ARM, but the kernel
 // provides a user helper. ARMv5 does not support SMP and has no
 // memory barrier instruction at all. ARMv6 added SMP support and has
diff --git a/src/runtime/sys_linux_arm64.s b/src/runtime/sys_linux_arm64.s
index 7b58d67..1bee847 100644
--- a/src/runtime/sys_linux_arm64.s
+++ b/src/runtime/sys_linux_arm64.s
@@ -43,6 +43,9 @@
 #define SYS_epoll_ctl		21
 #define SYS_epoll_pwait		22
 #define SYS_clock_gettime	113
+#define SYS_faccessat		48
+#define SYS_socket		198
+#define SYS_connect		203
 
 TEXT runtime·exit(SB),NOSPLIT,$-8-4
 	MOVW	code+0(FP), R0
@@ -256,6 +259,10 @@
 	BL	(R0)
 	RET
 
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	MOVD	$runtime·sigtramp(SB), R3
+	B	(R3)
+
 TEXT runtime·mmap(SB),NOSPLIT,$-8
 	MOVD	addr+0(FP), R0
 	MOVD	n+8(FP), R1
@@ -266,6 +273,9 @@
 
 	MOVD	$SYS_mmap, R8
 	SVC
+	CMN	$4095, R0
+	BCC	2(PC)
+	NEG	R0,R0
 	MOVD	R0, ret+32(FP)
 	RET
 
@@ -445,3 +455,33 @@
 	MOVD	$SYS_fcntl, R8
 	SVC
 	RET
+
+// int access(const char *name, int mode)
+TEXT runtime·access(SB),NOSPLIT,$0-20
+	MOVD	$AT_FDCWD, R0
+	MOVD	name+0(FP), R1
+	MOVW	mode+8(FP), R2
+	MOVD	$SYS_faccessat, R8
+	SVC
+	MOVW	R0, ret+16(FP)
+	RET
+
+// int connect(int fd, const struct sockaddr *addr, socklen_t len)
+TEXT runtime·connect(SB),NOSPLIT,$0-28
+	MOVW	fd+0(FP), R0
+	MOVD	addr+8(FP), R1
+	MOVW	len+16(FP), R2
+	MOVD	$SYS_connect, R8
+	SVC
+	MOVW	R0, ret+24(FP)
+	RET
+
+// int socket(int domain, int typ, int prot)
+TEXT runtime·socket(SB),NOSPLIT,$0-20
+	MOVW	domain+0(FP), R0
+	MOVW	typ+4(FP), R1
+	MOVW	prot+8(FP), R2
+	MOVD	$SYS_socket, R8
+	SVC
+	MOVW	R0, ret+16(FP)
+	RET
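
arm64 has no plain access(2), so the new stub emulates it with
faccessat(AT_FDCWD, name, mode); the kernel's faccessat takes no flags
argument (faccessat2 is the four-argument form), which is why only three
registers are loaded. The mmap stub also gains the standard
negative-errno normalization (CMN $4095 / NEG). The same access emulation
from user code, a sketch that only runs on linux/arm64 (syscall number
from the #defines above):

	package main

	import (
		"fmt"
		"syscall"
		"unsafe"
	)

	func access(path string, mode uint32) syscall.Errno {
		p, _ := syscall.BytePtrFromString(path)
		dirfd := -100 // AT_FDCWD: resolve path relative to the cwd
		_, _, errno := syscall.RawSyscall(48, // SYS_faccessat
			uintptr(dirfd), uintptr(unsafe.Pointer(p)), uintptr(mode))
		return errno
	}

	func main() {
		fmt.Println(access("/etc/hosts", 4)) // 4 = R_OK
	}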
diff --git a/src/runtime/sys_linux_mips64x.s b/src/runtime/sys_linux_mips64x.s
new file mode 100644
index 0000000..d4a81ca
--- /dev/null
+++ b/src/runtime/sys_linux_mips64x.s
@@ -0,0 +1,436 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+// +build mips64 mips64le
+
+//
+// System calls and other sys.stuff for mips64, Linux
+//
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define SYS_exit		5058
+#define SYS_read		5000
+#define SYS_write		5001
+#define SYS_open		5002
+#define SYS_close		5003
+#define SYS_getpid		5038
+#define SYS_kill		5060
+#define SYS_fcntl		5080
+#define SYS_gettimeofday	5094
+#define SYS_mmap		5009
+#define SYS_munmap		5011
+#define SYS_setitimer		5036
+#define SYS_clone		5055
+#define SYS_newselect		5022
+#define SYS_sched_yield		5023
+#define SYS_rt_sigreturn	5211
+#define SYS_rt_sigaction	5013
+#define SYS_rt_sigprocmask	5014
+#define SYS_sigaltstack		5129
+#define SYS_getrlimit		5095
+#define SYS_madvise		5027
+#define SYS_mincore		5026
+#define SYS_gettid		5178
+#define SYS_tkill		5192
+#define SYS_futex		5194
+#define SYS_sched_getaffinity	5196
+#define SYS_exit_group		5205
+#define SYS_epoll_create	5207
+#define SYS_epoll_ctl		5208
+#define SYS_epoll_wait		5209
+#define SYS_clock_gettime	5222
+#define SYS_epoll_create1	5285
+
+TEXT runtime·exit(SB),NOSPLIT,$-8-4
+	MOVW	code+0(FP), R4
+	MOVV	$SYS_exit_group, R2
+	SYSCALL
+	RET
+
+TEXT runtime·exit1(SB),NOSPLIT,$-8-4
+	MOVW	code+0(FP), R4
+	MOVV	$SYS_exit, R2
+	SYSCALL
+	RET
+
+TEXT runtime·open(SB),NOSPLIT,$-8-20
+	MOVV	name+0(FP), R4
+	MOVW	mode+8(FP), R5
+	MOVW	perm+12(FP), R6
+	MOVV	$SYS_open, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·closefd(SB),NOSPLIT,$-8-12
+	MOVW	fd+0(FP), R4
+	MOVV	$SYS_close, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+8(FP)
+	RET
+
+TEXT runtime·write(SB),NOSPLIT,$-8-28
+	MOVV	fd+0(FP), R4
+	MOVV	p+8(FP), R5
+	MOVW	n+16(FP), R6
+	MOVV	$SYS_write, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·read(SB),NOSPLIT,$-8-28
+	MOVW	fd+0(FP), R4
+	MOVV	p+8(FP), R5
+	MOVW	n+16(FP), R6
+	MOVV	$SYS_read, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·getrlimit(SB),NOSPLIT,$-8-20
+	MOVW	kind+0(FP), R4	// _RLIMIT_AS = 6 on linux/mips
+	MOVV	limit+8(FP), R5
+	MOVV	$SYS_getrlimit, R2
+	SYSCALL
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·usleep(SB),NOSPLIT,$16-4
+	MOVWU	usec+0(FP), R3
+	MOVV	R3, R5
+	MOVW	$1000000, R4
+	DIVVU	R4, R3
+	MOVV	LO, R3
+	MOVV	R3, 8(R29)
+	MULVU	R3, R4
+	MOVV	LO, R4
+	SUBVU	R4, R5
+	MOVV	R5, 16(R29)
+
+	// select(0, 0, 0, 0, &tv)
+	MOVW	$0, R4
+	MOVW	$0, R5
+	MOVW	$0, R6
+	MOVW	$0, R7
+	ADDV	$8, R29, R8
+	MOVV	$SYS_newselect, R2
+	SYSCALL
+	RET
+
+TEXT runtime·gettid(SB),NOSPLIT,$0-4
+	MOVV	$SYS_gettid, R2
+	SYSCALL
+	MOVW	R2, ret+0(FP)
+	RET
+
+TEXT runtime·raise(SB),NOSPLIT,$-8
+	MOVV	$SYS_gettid, R2
+	SYSCALL
+	MOVW	R2, R4	// arg 1 tid
+	MOVW	sig+0(FP), R5	// arg 2
+	MOVV	$SYS_tkill, R2
+	SYSCALL
+	RET
+
+TEXT runtime·raiseproc(SB),NOSPLIT,$-8
+	MOVV	$SYS_getpid, R2
+	SYSCALL
+	MOVW	R2, R4	// arg 1 pid
+	MOVW	sig+0(FP), R5	// arg 2
+	MOVV	$SYS_kill, R2
+	SYSCALL
+	RET
+
+TEXT runtime·setitimer(SB),NOSPLIT,$-8-24
+	MOVW	mode+0(FP), R4
+	MOVV	new+8(FP), R5
+	MOVV	old+16(FP), R6
+	MOVV	$SYS_setitimer, R2
+	SYSCALL
+	RET
+
+TEXT runtime·mincore(SB),NOSPLIT,$-8-28
+	MOVV	addr+0(FP), R4
+	MOVV	n+8(FP), R5
+	MOVV	dst+16(FP), R6
+	MOVV	$SYS_mincore, R2
+	SYSCALL
+	SUBVU	R2, R0, R2	// caller expects negative errno
+	MOVW	R2, ret+24(FP)
+	RET
+
+// func now() (sec int64, nsec int32)
+TEXT time·now(SB),NOSPLIT,$16
+	MOVV	$0(R29), R4
+	MOVV	$0, R5
+	MOVV	$SYS_gettimeofday, R2
+	SYSCALL
+	MOVV	0(R29), R3	// sec
+	MOVV	8(R29), R5	// usec
+	MOVV	$1000, R4
+	MULVU	R4, R5
+	MOVV	LO, R5
+	MOVV	R3, sec+0(FP)
+	MOVW	R5, nsec+8(FP)
+	RET
+
+TEXT runtime·nanotime(SB),NOSPLIT,$16
+	MOVW	$1, R4 // CLOCK_MONOTONIC
+	MOVV	$0(R29), R5
+	MOVV	$SYS_clock_gettime, R2
+	SYSCALL
+	MOVV	0(R29), R3	// sec
+	MOVV	8(R29), R5	// nsec
+	// sec is in R3, nsec in R5
+	// return nsec in R3
+	MOVV	$1000000000, R4
+	MULVU	R4, R3
+	MOVV	LO, R3
+	ADDVU	R5, R3
+	MOVV	R3, ret+0(FP)
+	RET
+
+TEXT runtime·rtsigprocmask(SB),NOSPLIT,$-8-28
+	MOVW	sig+0(FP), R4
+	MOVV	new+8(FP), R5
+	MOVV	old+16(FP), R6
+	MOVW	size+24(FP), R7
+	MOVV	$SYS_rt_sigprocmask, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVV	R0, 0xf1(R0)	// crash
+	RET
+
+TEXT runtime·rt_sigaction(SB),NOSPLIT,$-8-36
+	MOVV	sig+0(FP), R4
+	MOVV	new+8(FP), R5
+	MOVV	old+16(FP), R6
+	MOVV	size+24(FP), R7
+	MOVV	$SYS_rt_sigaction, R2
+	SYSCALL
+	MOVW	R2, ret+32(FP)
+	RET
+
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), R4
+	MOVV	info+16(FP), R5
+	MOVV	ctx+24(FP), R6
+	MOVV	fn+0(FP), R25
+	JAL	(R25)
+	RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$64
+	// initialize REGSB = PC&0xffffffff00000000
+	BGEZAL	R0, 1(PC)
+	SRLV	$32, R31, RSB
+	SLLV	$32, RSB
+
+	// initialize essential registers (just in case)
+	JAL	runtime·reginit(SB)
+
+	// this might be called in external code context,
+	// where g is not set.
+	MOVB	runtime·iscgo(SB), R1
+	BEQ	R1, 2(PC)
+	JAL	runtime·load_g(SB)
+
+	MOVW	R4, 8(R29)
+	MOVV	R5, 16(R29)
+	MOVV	R6, 24(R29)
+	MOVV	$runtime·sigtrampgo(SB), R1
+	JAL	(R1)
+	RET
+
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	JMP	runtime·sigtramp(SB)
+
+TEXT runtime·mmap(SB),NOSPLIT,$-8
+	MOVV	addr+0(FP), R4
+	MOVV	n+8(FP), R5
+	MOVW	prot+16(FP), R6
+	MOVW	flags+20(FP), R7
+	MOVW	fd+24(FP), R8
+	MOVW	off+28(FP), R9
+
+	MOVV	$SYS_mmap, R2
+	SYSCALL
+	MOVV	R2, ret+32(FP)
+	RET
+
+TEXT runtime·munmap(SB),NOSPLIT,$-8
+	MOVV	addr+0(FP), R4
+	MOVV	n+8(FP), R5
+	MOVV	$SYS_munmap, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVV	R0, 0xf3(R0)	// crash
+	RET
+
+TEXT runtime·madvise(SB),NOSPLIT,$-8
+	MOVV	addr+0(FP), R4
+	MOVV	n+8(FP), R5
+	MOVW	flags+16(FP), R6
+	MOVV	$SYS_madvise, R2
+	SYSCALL
+	// ignore failure - maybe pages are locked
+	RET
+
+// int64 futex(int32 *uaddr, int32 op, int32 val,
+//	struct timespec *timeout, int32 *uaddr2, int32 val2);
+TEXT runtime·futex(SB),NOSPLIT,$-8
+	MOVV	addr+0(FP), R4
+	MOVW	op+8(FP), R5
+	MOVW	val+12(FP), R6
+	MOVV	ts+16(FP), R7
+	MOVV	addr2+24(FP), R8
+	MOVW	val3+32(FP), R9
+	MOVV	$SYS_futex, R2
+	SYSCALL
+	MOVW	R2, ret+40(FP)
+	RET
+
+// int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void));
+TEXT runtime·clone(SB),NOSPLIT,$-8
+	MOVW	flags+0(FP), R4
+	MOVV	stk+8(FP), R5
+
+	// Copy mp, gp, fn off parent stack for use by child.
+	// Careful: Linux system call clobbers ???.
+	MOVV	mm+16(FP), R16
+	MOVV	gg+24(FP), R17
+	MOVV	fn+32(FP), R18
+
+	MOVV	R16, -8(R5)
+	MOVV	R17, -16(R5)
+	MOVV	R18, -24(R5)
+	MOVV	$1234, R16
+	MOVV	R16, -32(R5)
+
+	MOVV	$SYS_clone, R2
+	SYSCALL
+
+	// In parent, return.
+	BEQ	R2, 3(PC)
+	MOVW	R2, ret+40(FP)
+	RET
+
+	// In child, on new stack.
+	// initialize essential registers
+	JAL	runtime·reginit(SB)
+	MOVV	-32(R29), R16
+	MOVV	$1234, R1
+	BEQ	R16, R1, 2(PC)
+	MOVV	R0, 0(R0)
+
+	// Initialize m->procid to Linux tid
+	MOVV	$SYS_gettid, R2
+	SYSCALL
+
+	MOVV	-24(R29), R18		// fn
+	MOVV	-16(R29), R17		// g
+	MOVV	-8(R29), R16		// m
+
+	BEQ	R16, nog
+	BEQ	R17, nog
+
+	MOVV	R2, m_procid(R16)
+
+	// TODO: setup TLS.
+
+	// In child, set up new stack
+	MOVV	R16, g_m(R17)
+	MOVV	R17, g
+	//CALL	runtime·stackcheck(SB)
+
+nog:
+	// Call fn
+	JAL	(R18)
+
+	// It shouldn't return. If it does, exit that thread.
+	MOVW	$111, R4
+	MOVV	$SYS_exit, R2
+	SYSCALL
+	JMP	-3(PC)	// keep exiting
+
+TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
+	MOVV	new+0(FP), R4
+	MOVV	old+8(FP), R5
+	MOVV	$SYS_sigaltstack, R2
+	SYSCALL
+	BEQ	R7, 2(PC)
+	MOVV	R0, 0xf1(R0)	// crash
+	RET
+
+TEXT runtime·osyield(SB),NOSPLIT,$-8
+	MOVV	$SYS_sched_yield, R2
+	SYSCALL
+	RET
+
+TEXT runtime·sched_getaffinity(SB),NOSPLIT,$-8
+	MOVV	pid+0(FP), R4
+	MOVV	len+8(FP), R5
+	MOVV	buf+16(FP), R6
+	MOVV	$SYS_sched_getaffinity, R2
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollcreate(int32 size);
+TEXT runtime·epollcreate(SB),NOSPLIT,$-8
+	MOVW    size+0(FP), R4
+	MOVV	$SYS_epoll_create, R2
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// int32 runtime·epollcreate1(int32 flags);
+TEXT runtime·epollcreate1(SB),NOSPLIT,$-8
+	MOVW	flags+0(FP), R4
+	MOVV	$SYS_epoll_create1, R2
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// func epollctl(epfd, op, fd int32, ev *epollEvent) int
+TEXT runtime·epollctl(SB),NOSPLIT,$-8
+	MOVW	epfd+0(FP), R4
+	MOVW	op+4(FP), R5
+	MOVW	fd+8(FP), R6
+	MOVV	ev+16(FP), R7
+	MOVV	$SYS_epoll_ctl, R2
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
+TEXT runtime·epollwait(SB),NOSPLIT,$-8
+	MOVW	epfd+0(FP), R4
+	MOVV	ev+8(FP), R5
+	MOVW	nev+16(FP), R6
+	MOVW	timeout+20(FP), R7
+	MOVV	$SYS_epoll_wait, R2
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// void runtime·closeonexec(int32 fd);
+TEXT runtime·closeonexec(SB),NOSPLIT,$-8
+	MOVW    fd+0(FP), R4  // fd
+	MOVV    $2, R5  // F_SETFD
+	MOVV    $1, R6  // FD_CLOEXEC
+	MOVV	$SYS_fcntl, R2
+	SYSCALL
+	RET
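
runtime·clone's handoff to the child deserves a note: the parent stashes
mp, gp, and fn, plus a 1234 sentinel, in the four words below the child's
stack pointer, and the child re-reads them and deliberately faults if the
sentinel is gone, which would mean the kernel clobbered the new stack. A
sketch of the layout the assembly above builds:

	package main

	import "fmt"

	func main() {
		// Offsets are relative to the child's initial stack pointer.
		for _, s := range []struct {
			off  int
			what string
		}{
			{-8, "mp"},
			{-16, "gp"},
			{-24, "fn"},
			{-32, "sentinel 1234 (checked with BEQ in the child)"},
		} {
			fmt.Printf("%4d(SP): %s\n", s.off, s.what)
		}
	}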
diff --git a/src/runtime/sys_linux_ppc64x.s b/src/runtime/sys_linux_ppc64x.s
index 01575f8..56b842a 100644
--- a/src/runtime/sys_linux_ppc64x.s
+++ b/src/runtime/sys_linux_ppc64x.s
@@ -12,6 +12,7 @@
 #include "go_asm.h"
 #include "go_tls.h"
 #include "textflag.h"
+#include "asm_ppc64x.h"
 
 #define SYS_exit		  1
 #define SYS_read		  3
@@ -47,17 +48,17 @@
 #define SYS_clock_gettime	246
 #define SYS_epoll_create1	315
 
-TEXT runtime·exit(SB),NOSPLIT,$-8-4
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	code+0(FP), R3
 	SYSCALL	$SYS_exit_group
 	RET
 
-TEXT runtime·exit1(SB),NOSPLIT,$-8-4
+TEXT runtime·exit1(SB),NOSPLIT|NOFRAME,$0-4
 	MOVW	code+0(FP), R3
 	SYSCALL	$SYS_exit
 	RET
 
-TEXT runtime·open(SB),NOSPLIT,$-8-20
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
 	MOVD	name+0(FP), R3
 	MOVW	mode+8(FP), R4
 	MOVW	perm+12(FP), R5
@@ -67,7 +68,7 @@
 	MOVW	R3, ret+16(FP)
 	RET
 
-TEXT runtime·closefd(SB),NOSPLIT,$-8-12
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
 	MOVW	fd+0(FP), R3
 	SYSCALL	$SYS_close
 	BVC	2(PC)
@@ -75,7 +76,7 @@
 	MOVW	R3, ret+8(FP)
 	RET
 
-TEXT runtime·write(SB),NOSPLIT,$-8-28
+TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
 	MOVD	fd+0(FP), R3
 	MOVD	p+8(FP), R4
 	MOVW	n+16(FP), R5
@@ -85,7 +86,7 @@
 	MOVW	R3, ret+24(FP)
 	RET
 
-TEXT runtime·read(SB),NOSPLIT,$-8-28
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
 	MOVW	fd+0(FP), R3
 	MOVD	p+8(FP), R4
 	MOVW	n+16(FP), R5
@@ -95,7 +96,7 @@
 	MOVW	R3, ret+24(FP)
 	RET
 
-TEXT runtime·getrlimit(SB),NOSPLIT,$-8-20
+TEXT runtime·getrlimit(SB),NOSPLIT|NOFRAME,$0-20
 	MOVW	kind+0(FP), R3
 	MOVD	limit+8(FP), R4
 	SYSCALL	$SYS_ugetrlimit
@@ -126,32 +127,33 @@
 	MOVW	R3, ret+0(FP)
 	RET
 
-TEXT runtime·raise(SB),NOSPLIT,$-8
+TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0
 	SYSCALL	$SYS_gettid
 	MOVW	R3, R3	// arg 1 tid
 	MOVW	sig+0(FP), R4	// arg 2
 	SYSCALL	$SYS_tkill
 	RET
 
-TEXT runtime·raiseproc(SB),NOSPLIT,$-8
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
 	SYSCALL	$SYS_getpid
 	MOVW	R3, R3	// arg 1 pid
 	MOVW	sig+0(FP), R4	// arg 2
 	SYSCALL	$SYS_kill
 	RET
 
-TEXT runtime·setitimer(SB),NOSPLIT,$-8-24
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
 	MOVW	mode+0(FP), R3
 	MOVD	new+8(FP), R4
 	MOVD	old+16(FP), R5
 	SYSCALL	$SYS_setitimer
 	RET
 
-TEXT runtime·mincore(SB),NOSPLIT,$-8-28
+TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
 	MOVD	addr+0(FP), R3
 	MOVD	n+8(FP), R4
 	MOVD	dst+16(FP), R5
 	SYSCALL	$SYS_mincore
+	NEG	R3		// caller expects negative errno
 	MOVW	R3, ret+24(FP)
 	RET
 
@@ -182,7 +184,7 @@
 	MOVD	R3, ret+0(FP)
 	RET
 
-TEXT runtime·rtsigprocmask(SB),NOSPLIT,$-8-28
+TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
 	MOVW	sig+0(FP), R3
 	MOVD	new+8(FP), R4
 	MOVD	old+16(FP), R5
@@ -192,7 +194,7 @@
 	MOVD	R0, 0xf1(R0)	// crash
 	RET
 
-TEXT runtime·rt_sigaction(SB),NOSPLIT,$-8-36
+TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
 	MOVD	sig+0(FP), R3
 	MOVD	new+8(FP), R4
 	MOVD	old+16(FP), R5
@@ -205,9 +207,10 @@
 	MOVW	sig+8(FP), R3
 	MOVD	info+16(FP), R4
 	MOVD	ctx+24(FP), R5
-	MOVD	fn+0(FP), R31
-	MOVD	R31, CTR
+	MOVD	fn+0(FP), R12
+	MOVD	R12, CTR
 	BL	(CTR)
+	MOVD	24(R1), R2
 	RET
 
 #ifdef GOARCH_ppc64le
@@ -215,7 +218,7 @@
 TEXT runtime·sigtramp(SB),NOSPLIT,$64
 #else
 // function descriptor for the real sigtramp
-TEXT runtime·sigtramp(SB),NOSPLIT,$-8
+TEXT runtime·sigtramp(SB),NOSPLIT|NOFRAME,$0
 	DWORD	$runtime·_sigtramp(SB)
 	DWORD	$0
 	DWORD	$0
@@ -231,15 +234,31 @@
 	BEQ	2(PC)
 	BL	runtime·load_g(SB)
 
-	MOVW	R3, 8(R1)
-	MOVD	R4, 16(R1)
-	MOVD	R5, 24(R1)
-	MOVD	$runtime·sigtrampgo(SB), R31
-	MOVD	R31, CTR
+	MOVW	R3, FIXED_FRAME+0(R1)
+	MOVD	R4, FIXED_FRAME+8(R1)
+	MOVD	R5, FIXED_FRAME+16(R1)
+	MOVD	$runtime·sigtrampgo(SB), R12
+	MOVD	R12, CTR
 	BL	(CTR)
+	MOVD	24(R1), R2
 	RET
 
-TEXT runtime·mmap(SB),NOSPLIT,$-8
+#ifdef GOARCH_ppc64le
+// ppc64le doesn't need function descriptors
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+#else
+// function descriptor for the real sigtramp
+TEXT runtime·cgoSigtramp(SB),NOSPLIT|NOFRAME,$0
+	DWORD	$runtime·_cgoSigtramp(SB)
+	DWORD	$0
+	DWORD	$0
+TEXT runtime·_cgoSigtramp(SB),NOSPLIT,$0
+#endif
+	MOVD	$runtime·sigtramp(SB), R12
+	MOVD	R12, CTR
+	JMP	(CTR)
+
+TEXT runtime·mmap(SB),NOSPLIT|NOFRAME,$0
 	MOVD	addr+0(FP), R3
 	MOVD	n+8(FP), R4
 	MOVW	prot+16(FP), R5
@@ -251,7 +270,7 @@
 	MOVD	R3, ret+32(FP)
 	RET
 
-TEXT runtime·munmap(SB),NOSPLIT,$-8
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
 	MOVD	addr+0(FP), R3
 	MOVD	n+8(FP), R4
 	SYSCALL	$SYS_munmap
@@ -259,7 +278,7 @@
 	MOVD	R0, 0xf3(R0)
 	RET
 
-TEXT runtime·madvise(SB),NOSPLIT,$-8
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
 	MOVD	addr+0(FP), R3
 	MOVD	n+8(FP), R4
 	MOVW	flags+16(FP), R5
@@ -269,7 +288,7 @@
 
 // int64 futex(int32 *uaddr, int32 op, int32 val,
 //	struct timespec *timeout, int32 *uaddr2, int32 val2);
-TEXT runtime·futex(SB),NOSPLIT,$-8
+TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
 	MOVD	addr+0(FP), R3
 	MOVW	op+8(FP), R4
 	MOVW	val+12(FP), R5
@@ -281,7 +300,7 @@
 	RET
 
 // int64 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void));
-TEXT runtime·clone(SB),NOSPLIT,$-8
+TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
 	MOVW	flags+0(FP), R3
 	MOVD	stk+8(FP), R4
 
@@ -344,7 +363,7 @@
 	SYSCALL	$SYS_exit
 	BR	-2(PC)	// keep exiting
 
-TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
 	MOVD	new+0(FP), R3
 	MOVD	old+8(FP), R4
 	SYSCALL	$SYS_sigaltstack
@@ -352,11 +371,11 @@
 	MOVD	R0, 0xf1(R0)  // crash
 	RET
 
-TEXT runtime·osyield(SB),NOSPLIT,$-8
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
 	SYSCALL	$SYS_sched_yield
 	RET
 
-TEXT runtime·sched_getaffinity(SB),NOSPLIT,$-8
+TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
 	MOVD	pid+0(FP), R3
 	MOVD	len+8(FP), R4
 	MOVD	buf+16(FP), R5
@@ -365,21 +384,21 @@
 	RET
 
 // int32 runtime·epollcreate(int32 size);
-TEXT runtime·epollcreate(SB),NOSPLIT,$-8
+TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
 	MOVW    size+0(FP), R3
 	SYSCALL	$SYS_epoll_create
 	MOVW	R3, ret+8(FP)
 	RET
 
 // int32 runtime·epollcreate1(int32 flags);
-TEXT runtime·epollcreate1(SB),NOSPLIT,$-8
+TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
 	MOVW	flags+0(FP), R3
 	SYSCALL	$SYS_epoll_create1
 	MOVW	R3, ret+8(FP)
 	RET
 
 // func epollctl(epfd, op, fd int32, ev *epollEvent) int
-TEXT runtime·epollctl(SB),NOSPLIT,$-8
+TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
 	MOVW	epfd+0(FP), R3
 	MOVW	op+4(FP), R4
 	MOVW	fd+8(FP), R5
@@ -389,7 +408,7 @@
 	RET
 
 // int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
-TEXT runtime·epollwait(SB),NOSPLIT,$-8
+TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
 	MOVW	epfd+0(FP), R3
 	MOVD	ev+8(FP), R4
 	MOVW	nev+16(FP), R5
@@ -399,7 +418,7 @@
 	RET
 
 // void runtime·closeonexec(int32 fd);
-TEXT runtime·closeonexec(SB),NOSPLIT,$-8
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
 	MOVW    fd+0(FP), R3  // fd
 	MOVD    $2, R4  // F_SETFD
 	MOVD    $1, R5  // FD_CLOEXEC
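
The ppc64 changes are mostly mechanical: $-8 frames become
NOSPLIT|NOFRAME,$0, indirect calls now load R12 and restore the TOC
pointer (R2) from its save slot afterwards, and sigtramp's outgoing
arguments move from raw offsets 8/16/24 to FIXED_FRAME+0/8/16.
FIXED_FRAME comes from the newly included asm_ppc64x.h; assuming the
ELFv2 layout, the reserved area it skips looks like this sketch:

	package main

	import "fmt"

	// Assumed ELFv2 fixed-frame slots; fixedFrame plays the role of the
	// FIXED_FRAME macro from asm_ppc64x.h.
	const (
		backChain  = 0  // caller's stack pointer
		crSave     = 8  // saved condition register
		lrSave     = 16 // saved link register
		tocSave    = 24 // saved TOC (R2); reloaded via MOVD 24(R1), R2
		fixedFrame = 32 // first outgoing argument slot
	)

	func main() {
		fmt.Println("args start at", fixedFrame, "(R1)")
	}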
diff --git a/src/runtime/sys_linux_s390x.s b/src/runtime/sys_linux_s390x.s
new file mode 100644
index 0000000..f43792b
--- /dev/null
+++ b/src/runtime/sys_linux_s390x.s
@@ -0,0 +1,440 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// System calls and other system stuff for Linux s390x; see
+// /usr/include/asm/unistd.h for the syscall number definitions.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+#define SYS_exit                  1
+#define SYS_read                  3
+#define SYS_write                 4
+#define SYS_open                  5
+#define SYS_close                 6
+#define SYS_getpid               20
+#define SYS_kill                 37
+#define SYS_fcntl                55
+#define SYS_gettimeofday         78
+#define SYS_mmap                 90
+#define SYS_munmap               91
+#define SYS_setitimer           104
+#define SYS_clone               120
+#define SYS_select              142
+#define SYS_sched_yield         158
+#define SYS_rt_sigreturn        173
+#define SYS_rt_sigaction        174
+#define SYS_rt_sigprocmask      175
+#define SYS_sigaltstack         186
+#define SYS_ugetrlimit          191
+#define SYS_madvise             219
+#define SYS_mincore             218
+#define SYS_gettid              236
+#define SYS_tkill               237
+#define SYS_futex               238
+#define SYS_sched_getaffinity   240
+#define SYS_exit_group          248
+#define SYS_epoll_create        249
+#define SYS_epoll_ctl           250
+#define SYS_epoll_wait          251
+#define SYS_clock_gettime       260
+#define SYS_epoll_create1       327
+
+TEXT runtime·exit(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), R2
+	MOVW	$SYS_exit_group, R1
+	SYSCALL
+	RET
+
+TEXT runtime·exit1(SB),NOSPLIT|NOFRAME,$0-4
+	MOVW	code+0(FP), R2
+	MOVW	$SYS_exit, R1
+	SYSCALL
+	RET
+
+TEXT runtime·open(SB),NOSPLIT|NOFRAME,$0-20
+	MOVD	name+0(FP), R2
+	MOVW	mode+8(FP), R3
+	MOVW	perm+12(FP), R4
+	MOVW	$SYS_open, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·closefd(SB),NOSPLIT|NOFRAME,$0-12
+	MOVW	fd+0(FP), R2
+	MOVW	$SYS_close, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+8(FP)
+	RET
+
+TEXT runtime·write(SB),NOSPLIT|NOFRAME,$0-28
+	MOVD	fd+0(FP), R2
+	MOVD	p+8(FP), R3
+	MOVW	n+16(FP), R4
+	MOVW	$SYS_write, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·read(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	fd+0(FP), R2
+	MOVD	p+8(FP), R3
+	MOVW	n+16(FP), R4
+	MOVW	$SYS_read, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVW	$-1, R2
+	MOVW	R2, ret+24(FP)
+	RET
+
+TEXT runtime·getrlimit(SB),NOSPLIT|NOFRAME,$0-20
+	MOVW	kind+0(FP), R2
+	MOVD	limit+8(FP), R3
+	MOVW	$SYS_ugetrlimit, R1
+	SYSCALL
+	MOVW	R2, ret+16(FP)
+	RET
+
+TEXT runtime·usleep(SB),NOSPLIT,$16-4
+	MOVW	usec+0(FP), R2
+	MOVD	R2, R4
+	MOVW	$1000000, R3
+	DIVD	R3, R2
+	MOVD	R2, 8(R15)
+	MULLD	R2, R3
+	SUB	R3, R4
+	MOVD	R4, 16(R15)
+
+	// select(0, 0, 0, 0, &tv)
+	MOVW	$0, R2
+	MOVW	$0, R3
+	MOVW	$0, R4
+	MOVW	$0, R5
+	ADD	$8, R15, R6
+	MOVW	$SYS_select, R1
+	SYSCALL
+	RET
+
+TEXT runtime·gettid(SB),NOSPLIT,$0-4
+	MOVW	$SYS_gettid, R1
+	SYSCALL
+	MOVW	R2, ret+0(FP)
+	RET
+
+TEXT runtime·raise(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_gettid, R1
+	SYSCALL
+	MOVW	R2, R2	// arg 1 tid
+	MOVW	sig+0(FP), R3	// arg 2
+	MOVW	$SYS_tkill, R1
+	SYSCALL
+	RET
+
+TEXT runtime·raiseproc(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_getpid, R1
+	SYSCALL
+	MOVW	R2, R2	// arg 1 pid
+	MOVW	sig+0(FP), R3	// arg 2
+	MOVW	$SYS_kill, R1
+	SYSCALL
+	RET
+
+TEXT runtime·setitimer(SB),NOSPLIT|NOFRAME,$0-24
+	MOVW	mode+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVW	$SYS_setitimer, R1
+	SYSCALL
+	RET
+
+TEXT runtime·mincore(SB),NOSPLIT|NOFRAME,$0-28
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVD	dst+16(FP), R4
+	MOVW	$SYS_mincore, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// func now() (sec int64, nsec int32)
+TEXT time·now(SB),NOSPLIT,$16
+	MOVD	$0(R15), R2
+	MOVD	$0, R3
+	MOVW	$SYS_gettimeofday, R1
+	SYSCALL
+	MOVD	0(R15), R2	// sec
+	MOVD	8(R15), R4	// usec
+	MOVD	$1000, R3
+	MULLD	R3, R4
+	MOVD	R2, sec+0(FP)
+	MOVW	R4, nsec+8(FP)
+	RET
+
+TEXT runtime·nanotime(SB),NOSPLIT,$16
+	MOVW	$1, R2 // CLOCK_MONOTONIC
+	MOVD	$0(R15), R3
+	MOVW	$SYS_clock_gettime, R1
+	SYSCALL
+	MOVD	0(R15), R2	// sec
+	MOVD	8(R15), R4	// nsec
+	// sec is in R2, nsec in R4
+	// return nsec in R2
+	MOVD	$1000000000, R3
+	MULLD	R3, R2
+	ADD	R4, R2
+	MOVD	R2, ret+0(FP)
+	RET
+
+TEXT runtime·rtsigprocmask(SB),NOSPLIT|NOFRAME,$0-28
+	MOVW	sig+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVW	size+24(FP), R5
+	MOVW	$SYS_rt_sigprocmask, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·rt_sigaction(SB),NOSPLIT|NOFRAME,$0-36
+	MOVD	sig+0(FP), R2
+	MOVD	new+8(FP), R3
+	MOVD	old+16(FP), R4
+	MOVD	size+24(FP), R5
+	MOVW	$SYS_rt_sigaction, R1
+	SYSCALL
+	MOVW	R2, ret+32(FP)
+	RET
+
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVW	sig+8(FP), R2
+	MOVD	info+16(FP), R3
+	MOVD	ctx+24(FP), R4
+	MOVD	fn+0(FP), R5
+	BL	R5
+	RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$64
+	// initialize essential registers (just in case)
+	XOR	R0, R0
+
+	// this might be called in external code context,
+	// where g is not set.
+	MOVB	runtime·iscgo(SB), R6
+	CMPBEQ	R6, $0, 2(PC)
+	BL	runtime·load_g(SB)
+
+	MOVW	R2, 8(R15)
+	MOVD	R3, 16(R15)
+	MOVD	R4, 24(R15)
+	MOVD	$runtime·sigtrampgo(SB), R5
+	BL	R5
+	RET
+
+TEXT runtime·cgoSigtramp(SB),NOSPLIT,$0
+	BR	runtime·sigtramp(SB)
+
+// func mmap(addr unsafe.Pointer, n uintptr, prot, flags, fd int32, off uint32) unsafe.Pointer
+TEXT runtime·mmap(SB),NOSPLIT,$48-40
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	prot+16(FP), R4
+	MOVW	flags+20(FP), R5
+	MOVW	fd+24(FP), R6
+	MOVWZ	off+28(FP), R7
+
+	// s390x uses old_mmap, so the arguments need to be placed into
+	// a struct and a pointer to the struct passed to mmap.
+	MOVD	R2, addr-48(SP)
+	MOVD	R3, n-40(SP)
+	MOVD	R4, prot-32(SP)
+	MOVD	R5, flags-24(SP)
+	MOVD	R6, fd-16(SP)
+	MOVD	R7, off-8(SP)
+
+	MOVD	$addr-48(SP), R2
+	MOVW	$SYS_mmap, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	NEG	R2
+	MOVD	R2, ret+32(FP)
+	RET
+
+TEXT runtime·munmap(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	$SYS_munmap, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVD	n+8(FP), R3
+	MOVW	flags+16(FP), R4
+	MOVW	$SYS_madvise, R1
+	SYSCALL
+	// ignore failure - maybe pages are locked
+	RET
+
+// int64 futex(int32 *uaddr, int32 op, int32 val,
+//	struct timespec *timeout, int32 *uaddr2, int32 val2);
+TEXT runtime·futex(SB),NOSPLIT|NOFRAME,$0
+	MOVD	addr+0(FP), R2
+	MOVW	op+8(FP), R3
+	MOVW	val+12(FP), R4
+	MOVD	ts+16(FP), R5
+	MOVD	addr2+24(FP), R6
+	MOVW	val3+32(FP),  R7
+	MOVW	$SYS_futex, R1
+	SYSCALL
+	MOVW	R2, ret+40(FP)
+	RET
+
+// int32 clone(int32 flags, void *stk, M *mp, G *gp, void (*fn)(void));
+TEXT runtime·clone(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), R3
+	MOVD	stk+8(FP), R2
+
+	// Copy mp, gp, fn off parent stack for use by child.
+	// Careful: Linux system call clobbers ???.
+	MOVD	mm+16(FP), R7
+	MOVD	gg+24(FP), R8
+	MOVD	fn+32(FP), R9
+
+	MOVD	R7, -8(R2)
+	MOVD	R8, -16(R2)
+	MOVD	R9, -24(R2)
+	MOVD	$1234, R7
+	MOVD	R7, -32(R2)
+
+	SYSCALL $SYS_clone
+
+	// In parent, return.
+	CMPBEQ	R2, $0, 3(PC)
+	MOVW	R2, ret+40(FP)
+	RET
+
+	// In child, on new stack.
+	// initialize essential registers
+	XOR	R0, R0
+	MOVD	-32(R15), R7
+	CMP	R7, $1234
+	BEQ	2(PC)
+	MOVD	R0, 0(R0)
+
+	// Initialize m->procid to Linux tid
+	SYSCALL $SYS_gettid
+
+	MOVD	-24(R15), R9        // fn
+	MOVD	-16(R15), R8        // g
+	MOVD	-8(R15), R7         // m
+
+	CMPBEQ	R7, $0, nog
+	CMP	R8, $0
+	BEQ	nog
+
+	MOVD	R2, m_procid(R7)
+
+	// In child, set up new stack
+	MOVD	R7, g_m(R8)
+	MOVD	R8, g
+	//CALL	runtime·stackcheck(SB)
+
+nog:
+	// Call fn
+	BL	R9
+
+	// It shouldn't return. If it does, exit that thread.
+	MOVW	$111, R2
+	MOVW	$SYS_exit, R1
+	SYSCALL
+	BR	-2(PC)	// keep exiting
+
+TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
+	MOVD	new+0(FP), R2
+	MOVD	old+8(FP), R3
+	MOVW	$SYS_sigaltstack, R1
+	SYSCALL
+	MOVD	$-4095, R3
+	CMPUBLT	R2, R3, 2(PC)
+	MOVD	R0, 0(R0) // crash
+	RET
+
+TEXT runtime·osyield(SB),NOSPLIT|NOFRAME,$0
+	MOVW	$SYS_sched_yield, R1
+	SYSCALL
+	RET
+
+TEXT runtime·sched_getaffinity(SB),NOSPLIT|NOFRAME,$0
+	MOVD	pid+0(FP), R2
+	MOVD	len+8(FP), R3
+	MOVD	buf+16(FP), R4
+	MOVW	$SYS_sched_getaffinity, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollcreate(int32 size);
+TEXT runtime·epollcreate(SB),NOSPLIT|NOFRAME,$0
+	MOVW    size+0(FP), R2
+	MOVW	$SYS_epoll_create, R1
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// int32 runtime·epollcreate1(int32 flags);
+TEXT runtime·epollcreate1(SB),NOSPLIT|NOFRAME,$0
+	MOVW	flags+0(FP), R2
+	MOVW	$SYS_epoll_create1, R1
+	SYSCALL
+	MOVW	R2, ret+8(FP)
+	RET
+
+// func epollctl(epfd, op, fd int32, ev *epollEvent) int
+TEXT runtime·epollctl(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), R2
+	MOVW	op+4(FP), R3
+	MOVW	fd+8(FP), R4
+	MOVD	ev+16(FP), R5
+	MOVW	$SYS_epoll_ctl, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
+TEXT runtime·epollwait(SB),NOSPLIT|NOFRAME,$0
+	MOVW	epfd+0(FP), R2
+	MOVD	ev+8(FP), R3
+	MOVW	nev+16(FP), R4
+	MOVW	timeout+20(FP), R5
+	MOVW	$SYS_epoll_wait, R1
+	SYSCALL
+	MOVW	R2, ret+24(FP)
+	RET
+
+// void runtime·closeonexec(int32 fd);
+TEXT runtime·closeonexec(SB),NOSPLIT|NOFRAME,$0
+	MOVW    fd+0(FP), R2  // fd
+	MOVD    $2, R3  // F_SETFD
+	MOVD    $1, R4  // FD_CLOEXEC
+	MOVW	$SYS_fcntl, R1
+	SYSCALL
+	RET
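
s390x routes mmap through old_mmap (syscall 90), which takes a single
pointer to a six-word argument block rather than six registers, hence the
stub above copying its arguments into the frame and passing their address
in R2. The same shape from user code (linux/s390x only, constants
hand-written for the sketch):

	package main

	import (
		"fmt"
		"syscall"
		"unsafe"
	)

	func oldMmap(addr, length, prot, flags, fd, offset uintptr) (uintptr, syscall.Errno) {
		args := [6]uintptr{addr, length, prot, flags, fd, offset}
		r, _, errno := syscall.RawSyscall(90, // SYS_mmap, old_mmap form
			uintptr(unsafe.Pointer(&args[0])), 0, 0)
		return r, errno
	}

	func main() {
		const (
			protRead  = 0x1
			protWrite = 0x2
			mapPriv   = 0x2
			mapAnon   = 0x20
		)
		p, errno := oldMmap(0, 4096, protRead|protWrite, mapPriv|mapAnon, ^uintptr(0), 0)
		fmt.Printf("%#x %v\n", p, errno)
	}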
diff --git a/src/runtime/sys_mips64x.go b/src/runtime/sys_mips64x.go
new file mode 100644
index 0000000..9e7d805
--- /dev/null
+++ b/src/runtime/sys_mips64x.go
@@ -0,0 +1,43 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+package runtime
+
+import "unsafe"
+
+// adjust Gobuf as if it executed a call to fn with context ctxt
+// and then did an immediate Gosave.
+func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
+	if buf.lr != 0 {
+		throw("invalid use of gostartcall")
+	}
+	buf.lr = buf.pc
+	buf.pc = uintptr(fn)
+	buf.ctxt = ctxt
+}
+
+// Called to rewind context saved during morestack back to beginning of function.
+// To help us, the linker emits a jmp back to the beginning right after the
+// call to morestack. We just have to decode and apply that jump.
+func rewindmorestack(buf *gobuf) {
+	var inst uint32
+	if buf.pc&3 == 0 && buf.pc != 0 {
+		inst = *(*uint32)(unsafe.Pointer(buf.pc))
+		if inst>>26 == 2 { // JMP addr
+			//print("runtime: rewind pc=", hex(buf.pc), " to pc=", hex(buf.pc &^ uintptr(1<<28-1) | uintptr((inst&^0xfc000000)<<2)), "\n");
+			buf.pc &^= 1<<28 - 1
+			buf.pc |= uintptr((inst &^ 0xfc000000) << 2)
+			return
+		}
+		if inst>>16 == 0x1000 { // BEQ	R0, R0, offset
+			//print("runtime: rewind pc=", hex(buf.pc), " to pc=", hex(buf.pc + uintptr(int32(int16(inst&0xffff))<<2 + 4)), "\n");
+			buf.pc += uintptr(int32(int16(inst&0xffff))<<2 + 4)
+			return
+		}
+	}
+	print("runtime: pc=", hex(buf.pc), " ", hex(inst), "\n")
+	throw("runtime: misuse of rewindmorestack")
+}
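
rewindmorestack only has to recognize the two instruction forms the
linker emits after a morestack call: an absolute J (opcode 2), whose
26-bit index replaces bits 2..27 of the pc within the same 256MB-aligned
region, and a BEQ R0, R0 with a signed 16-bit word offset. A runnable
check of the J case (64-bit target):

	package main

	import "fmt"

	// jumpTarget applies the J-instruction rewrite from rewindmorestack:
	// keep the upper bits of pc, substitute index<<2 for the low 28 bits.
	func jumpTarget(pc uintptr, inst uint32) uintptr {
		pc &^= 1<<28 - 1
		pc |= uintptr((inst &^ 0xfc000000) << 2)
		return pc
	}

	func main() {
		// 0x08000c80: top six bits 000010 (J), index 0xc80 -> offset 0x3200.
		fmt.Printf("%#x\n", jumpTarget(0x120003344, 0x08000c80)) // 0x120003200
	}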
diff --git a/src/runtime/sys_nacl_386.s b/src/runtime/sys_nacl_386.s
index bf2d36e..e69a0b7 100644
--- a/src/runtime/sys_nacl_386.s
+++ b/src/runtime/sys_nacl_386.s
@@ -227,6 +227,9 @@
 	LEAL	24(SP), AX
 	MOVL	AX, 20(SP)
 	NACL_SYSCALL(SYS_mmap)
+	CMPL	AX, $-4095
+	JNA	2(PC)
+	NEGL	AX
 	MOVL	AX, ret+24(FP)
 	RET
 
diff --git a/src/runtime/sys_nacl_arm.s b/src/runtime/sys_nacl_arm.s
index cf4804f..474d9fe 100644
--- a/src/runtime/sys_nacl_arm.s
+++ b/src/runtime/sys_nacl_arm.s
@@ -308,21 +308,6 @@
 	NACL_SYSCALL(SYS_get_random_bytes)
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// This is only valid for ARMv6+, however, NaCl/ARM is only defined
-// for ARMv7A anyway.
-// bool armcas(int32 *val, int32 old, int32 new)
-// AtomiBLy:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // Likewise, this is only valid for ARMv7+, but that's okay.
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_netbsd_386.s b/src/runtime/sys_netbsd_386.s
index 13b8428..0322c36 100644
--- a/src/runtime/sys_netbsd_386.s
+++ b/src/runtime/sys_netbsd_386.s
@@ -215,42 +215,25 @@
 	MOVL	$0xf1, 0xf1		// crash
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$44
-	get_tls(CX)
-
-	// check that g exists
-	MOVL	g(CX), DI
-	CMPL	DI, $0
-	JNE	6(PC)
-	MOVL	signo+0(FP), BX
-	MOVL	BX, 0(SP)
-	MOVL	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$12-16
+	MOVL	sig+4(FP), AX
+	MOVL	AX, 0(SP)
+	MOVL	info+8(FP), AX
+	MOVL	AX, 4(SP)
+	MOVL	ctx+12(FP), AX
+	MOVL	AX, 8(SP)
+	MOVL	fn+0(FP), AX
 	CALL	AX
 	RET
 
-	// save g
-	MOVL	DI, 20(SP)
-
-	// g = m->gsignal
-	MOVL	g_m(DI), BX
-	MOVL	m_gsignal(BX), BX
-	MOVL	BX, g(CX)
-
-	// copy arguments for call to sighandler
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
 	MOVL	info+4(FP), BX
 	MOVL	BX, 4(SP)
 	MOVL	context+8(FP), BX
 	MOVL	BX, 8(SP)
-	MOVL	DI, 12(SP)
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), BX
-	MOVL	BX, g(CX)
+	CALL	runtime·sigtrampgo(SB)
 	RET
 
 // int32 lwp_create(void *context, uintptr flags, void *lwpid);
@@ -279,7 +262,7 @@
 	POPL	AX
 	POPAL
 
-	// Now segment is established.  Initialize m, g.
+	// Now segment is established. Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
 	MOVL	BX, g_m(DX)
diff --git a/src/runtime/sys_netbsd_amd64.s b/src/runtime/sys_netbsd_amd64.s
index d0640db..d6b5d35 100644
--- a/src/runtime/sys_netbsd_amd64.s
+++ b/src/runtime/sys_netbsd_amd64.s
@@ -37,7 +37,7 @@
 	// Call fn
 	CALL	R12
 
-	// It shouldn't return.  If it does, exit.
+	// It shouldn't return. If it does, exit.
 	MOVL	$310, AX		// sys__lwp_exit
 	SYSCALL
 	JMP	-3(PC)			// keep exiting
@@ -237,37 +237,21 @@
 	MOVL	$0xf1, 0xf1		// crash
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$64
-	get_tls(BX)
-
-	// check that g exists
-	MOVQ	g(BX), R10
-	CMPQ	R10, $0
-	JNE	5(PC)
-	MOVQ	DI, 0(SP)
-	MOVQ	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVL	sig+8(FP), DI
+	MOVQ	info+16(FP), SI
+	MOVQ	ctx+24(FP), DX
+	MOVQ	fn+0(FP), AX
 	CALL	AX
 	RET
 
-	// save g
-	MOVQ	R10, 40(SP)
-
-	// g = m->signal
-	MOVQ	g_m(R10), AX
-	MOVQ	m_gsignal(AX), AX
-	MOVQ	AX, g(BX)
-
-	MOVQ	DI, 0(SP)
-	MOVQ	SI, 8(SP)
-	MOVQ	DX, 16(SP)
-	MOVQ	R10, 24(SP)
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(BX)
-	MOVQ	40(SP), R10
-	MOVQ	R10, g(BX)
+TEXT runtime·sigtramp(SB),NOSPLIT,$32
+	MOVQ	DI, 0(SP)   // signum
+	MOVQ	SI, 8(SP)   // info
+	MOVQ	DX, 16(SP)  // ctx
+	MOVQ	R15, 24(SP) // for sigreturn
+	CALL	runtime·sigtrampgo(SB)
+	MOVQ	24(SP), R15
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
diff --git a/src/runtime/sys_netbsd_arm.s b/src/runtime/sys_netbsd_arm.s
index ae669ce..3d3b65f 100644
--- a/src/runtime/sys_netbsd_arm.s
+++ b/src/runtime/sys_netbsd_arm.s
@@ -213,7 +213,19 @@
 	MOVW.CS R8, (R8)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$24
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
+	MOVW	sig+4(FP), R0
+	MOVW	info+8(FP), R1
+	MOVW	ctx+12(FP), R2
+	MOVW	fn+0(FP), R11
+	MOVW	R13, R4
+	SUB	$24, R13
+	BIC	$0x7, R13 // alignment for ELF ABI
+	BL	(R11)
+	MOVW	R4, R13
+	RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	// this might be called in external code context,
 	// where g is not set.
 	// first save R0, because runtime·load_g will clobber it
@@ -222,30 +234,9 @@
 	CMP 	$0, R0
 	BL.NE	runtime·load_g(SB)
 
-	CMP $0, g
-	BNE 4(PC)
-	// signal number is already prepared in 4(R13)
-	MOVW $runtime·badsignal(SB), R11
-	BL (R11)
-	RET
-
-	// save g
-	MOVW g, R4
-	MOVW g, 20(R13)
-
-	// g = m->signal
-	MOVW g_m(g), R8
-	MOVW m_gsignal(R8), g
-
-	// R0 is already saved
-	MOVW R1, 8(R13) // info
-	MOVW R2, 12(R13) // context
-	MOVW R4, 16(R13) // gp
-
-	BL runtime·sighandler(SB)
-
-	// restore g
-	MOVW 20(R13), g
+	MOVW	R1, 8(R13)
+	MOVW	R2, 12(R13)
+	BL	runtime·sigtrampgo(SB)
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$12
@@ -338,20 +329,6 @@
 	SWI $0xa0005c	// sys_fcntl
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	B	runtime·cas(SB)
-
-// TODO(minux): this is only valid for ARMv6+
-// bool armcas(int32 *val, int32 old, int32 new)
-// Atomically:
-//	if(*val == old){
-//		*val = new;
-//		return 1;
-//	}else
-//		return 0;
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B runtime·armcas(SB)
-
 // TODO: this is only valid for ARMv7+
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
diff --git a/src/runtime/sys_nonppc64x.go b/src/runtime/sys_nonppc64x.go
new file mode 100644
index 0000000..4409374
--- /dev/null
+++ b/src/runtime/sys_nonppc64x.go
@@ -0,0 +1,10 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !ppc64,!ppc64le
+
+package runtime
+
+func prepGoExitFrame(sp uintptr) {
+}
diff --git a/src/runtime/sys_openbsd_386.s b/src/runtime/sys_openbsd_386.s
index bdf18d8..2bb818f 100644
--- a/src/runtime/sys_openbsd_386.s
+++ b/src/runtime/sys_openbsd_386.s
@@ -195,51 +195,25 @@
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$44
-	get_tls(CX)
-
-	// check that g exists
-	MOVL	g(CX), DI
-	CMPL	DI, $0
-	JNE	6(PC)
-	MOVL	signo+0(FP), BX
-	MOVL	BX, 0(SP)
-	MOVL	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$12-16
+	MOVL	sig+4(FP), AX
+	MOVL	AX, 0(SP)
+	MOVL	info+8(FP), AX
+	MOVL	AX, 4(SP)
+	MOVL	ctx+12(FP), AX
+	MOVL	AX, 8(SP)
+	MOVL	fn+0(FP), AX
 	CALL	AX
-	JMP 	ret
+	RET
 
-	// save g
-	MOVL	DI, 20(SP)
-	
-	// g = m->gsignal
-	MOVL	g_m(DI), BX
-	MOVL	m_gsignal(BX), BX
-	MOVL	BX, g(CX)
-
-	// copy arguments for call to sighandler
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	MOVL	signo+0(FP), BX
 	MOVL	BX, 0(SP)
 	MOVL	info+4(FP), BX
 	MOVL	BX, 4(SP)
 	MOVL	context+8(FP), BX
 	MOVL	BX, 8(SP)
-	MOVL	DI, 12(SP)
-
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(CX)
-	MOVL	20(SP), BX
-	MOVL	BX, g(CX)
-
-ret:
-	// call sigreturn
-	MOVL	context+8(FP), AX
-	MOVL	$0, 0(SP)		// syscall gap
-	MOVL	AX, 4(SP)		// arg 1 - sigcontext
-	MOVL	$103, AX		// sys_sigreturn
-	INT	$0x80
-	MOVL	$0xf1, 0xf1		// crash
+	CALL	runtime·sigtrampgo(SB)
 	RET
 
 // int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void));
@@ -297,7 +271,7 @@
 	POPL	AX
 	POPAL
 	
-	// Now segment is established.  Initialize m, g.
+	// Now segment is established. Initialize m, g.
 	get_tls(AX)
 	MOVL	DX, g(AX)
 	MOVL	BX, g_m(DX)
diff --git a/src/runtime/sys_openbsd_amd64.s b/src/runtime/sys_openbsd_amd64.s
index 213ffc1..c9fb832 100644
--- a/src/runtime/sys_openbsd_amd64.s
+++ b/src/runtime/sys_openbsd_amd64.s
@@ -50,7 +50,7 @@
 	// Call fn
 	CALL	R12
 
-	// It shouldn't return.  If it does, exit
+	// It shouldn't return. If it does, exit
 	MOVQ	$0, DI			// arg 1 - notdead
 	MOVL	$302, AX		// sys___threxit
 	SYSCALL
@@ -228,37 +228,19 @@
 	MOVL	AX, ret+8(FP)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$64
-	get_tls(BX)
-	
-	// check that g exists
-	MOVQ	g(BX), R10
-	CMPQ	R10, $0
-	JNE	5(PC)
-	MOVQ	DI, 0(SP)
-	MOVQ	$runtime·badsignal(SB), AX
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-32
+	MOVL	sig+8(FP), DI
+	MOVQ	info+16(FP), SI
+	MOVQ	ctx+24(FP), DX
+	MOVQ	fn+0(FP), AX
 	CALL	AX
 	RET
 
-	// save g
-	MOVQ	R10, 40(SP)
-	
-	// g = m->signal
-	MOVQ	g_m(R10), AX
-	MOVQ	m_gsignal(AX), AX
-	MOVQ	AX, g(BX)
-	
+TEXT runtime·sigtramp(SB),NOSPLIT,$24
 	MOVQ	DI, 0(SP)
 	MOVQ	SI, 8(SP)
 	MOVQ	DX, 16(SP)
-	MOVQ	R10, 24(SP)
-	
-	CALL	runtime·sighandler(SB)
-
-	// restore g
-	get_tls(BX)
-	MOVQ	40(SP), R10
-	MOVQ	R10, g(BX)
+	CALL	runtime·sigtrampgo(SB)
 	RET
 
 TEXT runtime·mmap(SB),NOSPLIT,$0
@@ -306,7 +288,7 @@
 
 // set tls base to DI
 TEXT runtime·settls(SB),NOSPLIT,$0
-	// adjust for ELF: wants to use -16(FS) and -8(FS) for g and m
+	// adjust for ELF: wants to use -8(FS) for g
 	ADDQ	$8, DI
 	MOVQ	$329, AX		// sys___settcb
 	SYSCALL
diff --git a/src/runtime/sys_openbsd_arm.s b/src/runtime/sys_openbsd_arm.s
index 60deb8f..29e8971 100644
--- a/src/runtime/sys_openbsd_arm.s
+++ b/src/runtime/sys_openbsd_arm.s
@@ -208,7 +208,19 @@
 	MOVW	R0, ret+8(FP)
 	RET
 
-TEXT runtime·sigtramp(SB),NOSPLIT,$24
+TEXT runtime·sigfwd(SB),NOSPLIT,$0-16
+	MOVW	sig+4(FP), R0
+	MOVW	info+8(FP), R1
+	MOVW	ctx+12(FP), R2
+	MOVW	fn+0(FP), R11
+	MOVW	R13, R4
+	SUB	$24, R13
+	BIC	$0x7, R13 // alignment for ELF ABI
+	BL	(R11)
+	MOVW	R4, R13
+	RET
+
+TEXT runtime·sigtramp(SB),NOSPLIT,$12
 	// If called from an external code context, g will not be set.
 	// Save R0, since runtime·load_g will clobber it.
 	MOVW	R0, 4(R13)		// signum
@@ -216,30 +228,9 @@
 	CMP	$0, R0
 	BL.NE	runtime·load_g(SB)
 
-	CMP	$0, g
-	BNE	4(PC)
-	// Signal number saved in 4(R13).
-	MOVW	runtime·badsignal(SB), R11
-	BL	(R11)
-	RET
-
-	// Save g.
-	MOVW	g, R3
-	MOVW	g, 20(R13)
-
-	// g = m->signal
-	MOVW	g_m(g), R8
-	MOVW	m_gsignal(R8), g
-
-	// R0 already saved.
-	MOVW	R1, 8(R13)		// info
-	MOVW	R2, 12(R13)		// context
-	MOVW	R3, 16(R13)		// gp (original g)
-
-	BL	runtime·sighandler(SB)
-
-	// Restore g.
-	MOVW	20(R13), g
+	MOVW	R1, 8(R13)
+	MOVW	R2, 12(R13)
+	BL	runtime·sigtrampgo(SB)
 	RET
 
 // int32 tfork(void *param, uintptr psize, M *mp, G *gp, void (*fn)(void));
@@ -373,13 +364,6 @@
 	MOVW	R0, ret+4(FP)
 	RET
 
-TEXT runtime·casp1(SB),NOSPLIT,$0
-	//B	runtime·armcas(SB)
-	B	runtime·cas(SB)
-
-TEXT runtime·cas(SB),NOSPLIT,$0
-	B	runtime·armcas(SB)
-
 TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
 	B	runtime·armPublicationBarrier(SB)
 
diff --git a/src/runtime/sys_plan9_386.s b/src/runtime/sys_plan9_386.s
index cae326a..1af3cb1 100644
--- a/src/runtime/sys_plan9_386.s
+++ b/src/runtime/sys_plan9_386.s
@@ -238,7 +238,7 @@
 	get_tls(AX)
 	MOVL	g(AX), BX
 	MOVL	g_m(BX), BX
-	MOVL	m_errstr(BX), CX
+	MOVL	(m_mOS+mOS_errstr)(BX), CX
 	MOVL	CX, 0(SP)
 	MOVL	$ERRMAX, 4(SP)
 	CALL	errstr<>(SB)
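
The new (m_mOS+mOS_errstr)(CX-style) addressing reflects the Go 1.7 move of
OS-specific fields of m into an embedded mOS struct; assembly now reaches
them through the summed offsets. A minimal, hypothetical sketch of that
shape (only errstr is attested by this patch; the surrounding field is a
stand-in):

// Illustrative layout only -- not the real runtime definitions.
type mOS struct {
	errstr *byte // buffer for the Plan 9 errstr syscall
}

type m struct {
	procid uint64 // stand-in for the portable fields that precede mOS
	mOS           // embedded; assembly uses offset m_mOS plus mOS_errstr
}
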
diff --git a/src/runtime/sys_plan9_amd64.s b/src/runtime/sys_plan9_amd64.s
index 6aefe5f..1492ef2 100644
--- a/src/runtime/sys_plan9_amd64.s
+++ b/src/runtime/sys_plan9_amd64.s
@@ -243,7 +243,7 @@
 	get_tls(AX)
 	MOVQ	g(AX), BX
 	MOVQ	g_m(BX), BX
-	MOVQ	m_errstr(BX), CX
+	MOVQ	(m_mOS+mOS_errstr)(BX), CX
 	MOVQ	CX, 0(SP)
 	MOVQ	$ERRMAX, 8(SP)
 	CALL	errstr<>(SB)
diff --git a/src/runtime/sys_plan9_arm.s b/src/runtime/sys_plan9_arm.s
new file mode 100644
index 0000000..6dee611
--- /dev/null
+++ b/src/runtime/sys_plan9_arm.s
@@ -0,0 +1,318 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "textflag.h"
+
+// from ../syscall/zsysnum_plan9.go
+
+#define SYS_SYSR1       0
+#define SYS_BIND        2
+#define SYS_CHDIR       3
+#define SYS_CLOSE       4
+#define SYS_DUP         5
+#define SYS_ALARM       6
+#define SYS_EXEC        7
+#define SYS_EXITS       8
+#define SYS_FAUTH       10
+#define SYS_SEGBRK      12
+#define SYS_OPEN        14
+#define SYS_OSEEK       16
+#define SYS_SLEEP       17
+#define SYS_RFORK       19
+#define SYS_PIPE        21
+#define SYS_CREATE      22
+#define SYS_FD2PATH     23
+#define SYS_BRK_        24
+#define SYS_REMOVE      25
+#define SYS_NOTIFY      28
+#define SYS_NOTED       29
+#define SYS_SEGATTACH   30
+#define SYS_SEGDETACH   31
+#define SYS_SEGFREE     32
+#define SYS_SEGFLUSH    33
+#define SYS_RENDEZVOUS  34
+#define SYS_UNMOUNT     35
+#define SYS_SEMACQUIRE  37
+#define SYS_SEMRELEASE  38
+#define SYS_SEEK        39
+#define SYS_FVERSION    40
+#define SYS_ERRSTR      41
+#define SYS_STAT        42
+#define SYS_FSTAT       43
+#define SYS_WSTAT       44
+#define SYS_FWSTAT      45
+#define SYS_MOUNT       46
+#define SYS_AWAIT       47
+#define SYS_PREAD       50
+#define SYS_PWRITE      51
+#define SYS_TSEMACQUIRE 52
+#define SYS_NSEC        53
+
+//func open(name *byte, mode, perm int32) int32
+TEXT runtime·open(SB),NOSPLIT,$0-16
+	MOVW    $SYS_OPEN, R0
+	SWI	0
+	MOVW	R0, ret+12(FP)
+	RET
+
+//func pread(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32
+TEXT runtime·pread(SB),NOSPLIT,$0-24
+	MOVW    $SYS_PREAD, R0
+	SWI	0
+	MOVW	R0, ret+20(FP)
+	RET
+
+//func pwrite(fd int32, buf unsafe.Pointer, nbytes int32, offset int64) int32
+TEXT runtime·pwrite(SB),NOSPLIT,$0-24
+	MOVW    $SYS_PWRITE, R0
+	SWI	0
+	MOVW	R0, ret+20(FP)
+	RET
+
+//func seek(fd int32, offset int64, whence int32) int64
+TEXT runtime·seek(SB),NOSPLIT,$0-24
+	MOVW	$ret_lo+16(FP), R0
+	MOVW	0(R13), R1
+	MOVW	R0, 0(R13)
+	MOVW.W	R1, -4(R13)
+	MOVW	$SYS_SEEK, R0
+	SWI	0
+	MOVW.W	R1, 4(R13)
+	CMP	$-1, R0
+	MOVW.EQ	R0, ret_lo+16(FP)
+	MOVW.EQ	R0, ret_hi+20(FP)
+	RET
+
+//func closefd(fd int32) int32
+TEXT runtime·closefd(SB),NOSPLIT,$0-8
+	MOVW	$SYS_CLOSE, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func exits(msg *byte)
+TEXT runtime·exits(SB),NOSPLIT,$0-4
+	MOVW    $SYS_EXITS, R0
+	SWI	0
+	RET
+
+//func brk_(addr unsafe.Pointer) int32
+TEXT runtime·brk_(SB),NOSPLIT,$0-8
+	MOVW    $SYS_BRK_, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func sleep(ms int32) int32
+TEXT runtime·sleep(SB),NOSPLIT,$0-8
+	MOVW    $SYS_SLEEP, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func plan9_semacquire(addr *uint32, block int32) int32
+TEXT runtime·plan9_semacquire(SB),NOSPLIT,$0-12
+	MOVW	$SYS_SEMACQUIRE, R0
+	SWI	0
+	MOVW	R0, ret+8(FP)
+	RET
+
+//func plan9_tsemacquire(addr *uint32, ms int32) int32
+TEXT runtime·plan9_tsemacquire(SB),NOSPLIT,$0-12
+	MOVW	$SYS_TSEMACQUIRE, R0
+	SWI	0
+	MOVW	R0, ret+8(FP)
+	RET
+
+//func nsec(*int64) int64
+TEXT runtime·nsec(SB),NOSPLIT,$-4-12
+	MOVW	$SYS_NSEC, R0
+	SWI	0
+	MOVW	unnamed+0(FP), R1
+	MOVW	0(R1), R0
+	MOVW	R0, ret_lo+4(FP)
+	MOVW	4(R1), R0
+	MOVW	R0, ret_hi+8(FP)
+	RET
+
+// time.now() (sec int64, nsec int32)
+TEXT time·now(SB),NOSPLIT,$12-12
+	// use nsec system call to get current time in nanoseconds
+	MOVW	$sysnsec_lo-8(SP), R0	// destination addr
+	MOVW	R0,res-12(SP)
+	MOVW	$SYS_NSEC, R0
+	SWI	0
+	MOVW	sysnsec_lo-8(SP), R1	// R1:R2 = nsec
+	MOVW	sysnsec_hi-4(SP), R2
+
+	// multiply nanoseconds by reciprocal of 10**9 (scaled by 2**61)
+	// to get seconds (96 bit scaled result)
+	MOVW	$0x89705f41, R3		// 2**61 * 10**-9
+	MULLU	R1,R3,(R6,R5)		// R5:R6:R7 = R1:R2 * R3
+	MOVW	$0,R7
+	MULALU	R2,R3,(R7,R6)
+
+	// unscale by discarding low 32 bits, shifting the rest by 29
+	MOVW	R6>>29,R6		// R6:R7 = (R5:R6:R7 >> 61)
+	ORR	R7<<3,R6
+	MOVW	R7>>29,R7
+
+	// subtract (10**9 * sec) from nsec to get nanosecond remainder
+	MOVW	$1000000000, R5		// 10**9
+	MULLU	R6,R5,(R9,R8)		// R8:R9 = R6:R7 * R5
+	MULA	R7,R5,R9,R9
+	SUB.S	R8,R1			// R1:R2 -= R8:R9
+	SBC	R9,R2
+
+	// because reciprocal was a truncated repeating fraction, quotient
+	// may be slightly too small -- adjust to make remainder < 10**9
+	CMP	R5,R1			// if remainder > 10**9
+	SUB.HS	R5,R1			//    remainder -= 10**9
+	ADD.HS	$1,R6			//    sec += 1
+
+	MOVW	R6,sec_lo+0(FP)
+	MOVW	R7,sec_hi+4(FP)
+	MOVW	R1,nsec+8(FP)
+	RET
+
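
The constant 0x89705f41 is floor(2**61 / 10**9), so the MULLU/MULALU
sequence above divides the 64-bit nanosecond count by 10**9 without a
hardware divider: multiply by the scaled reciprocal, shift right 61, then
correct once because the reciprocal was truncated. The same arithmetic in
Go, using math/bits for the 128-bit product (a checkable sketch, not part
of the patch):

package main

import (
	"fmt"
	"math/bits"
)

const nsPerSec = 1000000000

// splitNsec mirrors the reciprocal trick in time·now: compute
// nsec/10**9 as (nsec * floor(2**61/10**9)) >> 61, then adjust once.
func splitNsec(nsec uint64) (sec, rem uint64) {
	const recip = 0x89705f41 // floor(2**61 / 10**9)
	hi, lo := bits.Mul64(nsec, recip)
	sec = hi<<3 | lo>>61 // the 96-bit product shifted right by 61
	rem = nsec - sec*nsPerSec
	if rem >= nsPerSec { // truncated reciprocal: quotient can be 1 low
		rem -= nsPerSec
		sec++
	}
	return sec, rem
}

func main() {
	for _, ns := range []uint64{0, 999999999, 1467389000123456789} {
		sec, rem := splitNsec(ns)
		fmt.Println(ns, "->", sec, "s +", rem, "ns") // == ns/1e9, ns%1e9
	}
}
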
+//func notify(fn unsafe.Pointer) int32
+TEXT runtime·notify(SB),NOSPLIT,$0-8
+	MOVW	$SYS_NOTIFY, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func noted(mode int32) int32
+TEXT runtime·noted(SB),NOSPLIT,$0-8
+	MOVW	$SYS_NOTED, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func plan9_semrelease(addr *uint32, count int32) int32
+TEXT runtime·plan9_semrelease(SB),NOSPLIT,$0-12
+	MOVW	$SYS_SEMRELEASE, R0
+	SWI	0
+	MOVW	R0, ret+8(FP)
+	RET
+
+//func rfork(flags int32) int32
+TEXT runtime·rfork(SB),NOSPLIT,$0-8
+	MOVW	$SYS_RFORK, R0
+	SWI	0
+	MOVW	R0, ret+4(FP)
+	RET
+
+//func tstart_plan9(newm *m)
+TEXT runtime·tstart_plan9(SB),NOSPLIT,$0-4
+	MOVW	newm+0(FP), R1
+	MOVW	m_g0(R1), g
+
+	// Layout new m scheduler stack on os stack.
+	MOVW	R13, R0
+	MOVW	R0, g_stack+stack_hi(g)
+	SUB	$(64*1024), R0
+	MOVW	R0, (g_stack+stack_lo)(g)
+	MOVW	R0, g_stackguard0(g)
+	MOVW	R0, g_stackguard1(g)
+
+	// Initialize procid from TOS struct.
+	MOVW	_tos(SB), R0
+	MOVW	48(R0), R0
+	MOVW	R0, m_procid(R1)	// save pid as m->procid
+
+	BL	runtime·mstart(SB)
+
+	MOVW	$0x1234, R0
+	MOVW	R0, 0(R0)		// not reached
+	RET
+
+//func sigtramp(ureg, msg unsafe.Pointer)
+TEXT runtime·sigtramp(SB),NOSPLIT,$0-8
+	// check that g and m exist
+	CMP	$0, g
+	BEQ	4(PC)
+	MOVW	g_m(g), R0
+	CMP 	$0, R0
+	BNE	2(PC)
+	BL	runtime·badsignal2(SB)	// will exit
+
+	// save args
+	MOVW	ureg+0(FP), R1
+	MOVW	msg+4(FP), R2
+
+	// change stack
+	MOVW	m_gsignal(R0), R3
+	MOVW	(g_stack+stack_hi)(R3), R13
+
+	// make room for args, retval and g
+	SUB	$24, R13
+
+	// save g
+	MOVW	g, R3
+	MOVW	R3, 20(R13)
+
+	// g = m->gsignal
+	MOVW	m_gsignal(R0), g
+
+	// load args and call sighandler
+	ADD	$4,R13,R5
+	MOVM.IA	[R1-R3], (R5)
+	BL	runtime·sighandler(SB)
+	MOVW	16(R13), R0			// retval
+
+	// restore g
+	MOVW	20(R13), g
+
+	// call noted(R0)
+	MOVW	R0, 4(R13)
+	BL	runtime·noted(SB)
+	RET
+
+//func sigpanictramp()
+TEXT  runtime·sigpanictramp(SB),NOSPLIT,$0-0
+	MOVW.W	R0, -4(R13)
+	B	runtime·sigpanic(SB)
+
+//func setfpmasks()
+// Only used by the 64-bit runtime.
+TEXT runtime·setfpmasks(SB),NOSPLIT,$0
+	RET
+
+#define ERRMAX 128	/* from os_plan9.h */
+
+// func errstr() string
+// Only used by package syscall.
+// Grab error string due to a syscall made
+// in entersyscall mode, without going
+// through the allocator (issue 4994).
+// See ../syscall/asm_plan9_arm.s:/·Syscall/
+TEXT runtime·errstr(SB),NOSPLIT,$0-8
+	MOVW	g_m(g), R0
+	MOVW	(m_mOS+mOS_errstr)(R0), R1
+	MOVW	R1, ret_base+0(FP)
+	MOVW	$ERRMAX, R2
+	MOVW	R2, ret_len+4(FP)
+	MOVW    $SYS_ERRSTR, R0
+	SWI	0
+	MOVW	R1, R2
+	MOVBU	0(R2), R0
+	CMP	$0, R0
+	BEQ	3(PC)
+	ADD	$1, R2
+	B	-4(PC)
+	SUB	R1, R2
+	MOVW	R2, ret_len+4(FP)
+	RET
+
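
The MOVBU/CMP/ADD loop after the ERRSTR syscall is an inline strlen over
the NUL-terminated buffer, so the returned string header carries the true
length rather than ERRMAX. Equivalent Go, for illustration:

// cstrlen mirrors the scan loop above: count bytes up to the first NUL.
func cstrlen(buf []byte) int {
	n := 0
	for n < len(buf) && buf[n] != 0 {
		n++
	}
	return n
}
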
+TEXT ·publicationBarrier(SB),NOSPLIT,$-4-0
+	B	runtime·armPublicationBarrier(SB)
+
+// never called (cgo not supported)
+TEXT runtime·read_tls_fallback(SB),NOSPLIT,$-4
+	MOVW	$0, R0
+	MOVW	R0, (R0)
+	RET
diff --git a/src/runtime/sys_ppc64x.go b/src/runtime/sys_ppc64x.go
index bd182e3..2ea1f81 100644
--- a/src/runtime/sys_ppc64x.go
+++ b/src/runtime/sys_ppc64x.go
@@ -35,3 +35,5 @@
 	print("runtime: pc=", hex(buf.pc), " ", hex(inst), "\n")
 	throw("runtime: misuse of rewindmorestack")
 }
+
+func prepGoExitFrame(sp uintptr)
diff --git a/src/runtime/sys_s390x.go b/src/runtime/sys_s390x.go
new file mode 100644
index 0000000..2aa81e7
--- /dev/null
+++ b/src/runtime/sys_s390x.go
@@ -0,0 +1,45 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import "unsafe"
+
+// adjust Gobuf as if it executed a call to fn with context ctxt
+// and then did an immediate Gosave.
+func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
+	if buf.lr != 0 {
+		throw("invalid use of gostartcall")
+	}
+	buf.lr = buf.pc
+	buf.pc = uintptr(fn)
+	buf.ctxt = ctxt
+}
+
+// Called to rewind context saved during morestack back to beginning of function.
+// To help us, the linker emits a jmp back to the beginning right after the
+// call to morestack. We just have to decode and apply that jump.
+func rewindmorestack(buf *gobuf) {
+	var inst uint64
+	if buf.pc&1 == 0 && buf.pc != 0 {
+		inst = *(*uint64)(unsafe.Pointer(buf.pc))
+		switch inst >> 48 {
+		case 0xa7f4: // BRC (branch relative on condition) instruction.
+			inst >>= 32
+			inst &= 0xFFFF
+			offset := int64(int16(inst))
+			offset <<= 1
+			buf.pc += uintptr(offset)
+			return
+		case 0xc0f4: // BRCL (branch relative on condition long) instruction.
+			inst >>= 16
+			inst = inst & 0xFFFFFFFF
+			inst = (inst << 1) & 0xFFFFFFFF
+			buf.pc += uintptr(int32(inst))
+			return
+		}
+	}
+	print("runtime: pc=", hex(buf.pc), " ", hex(inst), "\n")
+	throw("runtime: misuse of rewindmorestack")
+}
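
rewindmorestack decodes the branch that the linker emits right after the
call to morestack. For the 4-byte BRC form, the 16-bit immediate sits in
bits 16..31 of the instruction and counts halfwords, hence the sign
extension and the shift by one. A small decoding sketch (hypothetical
helper, matching the 0xa7f4 case above):

package main

import "fmt"

// brcOffset extracts the signed byte offset from an s390x BRC
// instruction, given the 8 bytes at pc as a big-endian uint64
// (the 4-byte BRC occupies the high half, as in rewindmorestack).
func brcOffset(inst uint64) int64 {
	imm := uint16(inst >> 32)     // I2 field: relative offset in halfwords
	return int64(int16(imm)) << 1 // halfwords -> bytes, sign-extended
}

func main() {
	inst := uint64(0xa7f4fffe) << 32 // BRC 15,-2: branch 4 bytes back
	fmt.Println(brcOffset(inst))     // -4
}
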
diff --git a/src/runtime/sys_solaris_amd64.s b/src/runtime/sys_solaris_amd64.s
index e431564..07a7ace 100644
--- a/src/runtime/sys_solaris_amd64.s
+++ b/src/runtime/sys_solaris_amd64.s
@@ -26,7 +26,7 @@
 	get_tls(CX)
 	MOVQ	g(CX), BX
 	MOVQ	g_m(BX), BX
-	MOVQ	AX,	m_perrno(BX)
+	MOVQ	AX,	(m_mOS+mOS_perrno)(BX)
 	RET
 
 // int64 runtime·nanotime1(void);
@@ -52,7 +52,7 @@
 // pipe(3c) wrapper that returns fds in AX, DX.
 // NOT USING GO CALLING CONVENTION.
 TEXT runtime·pipe1(SB),NOSPLIT,$0
-	SUBQ	$16, SP // 8 bytes will do, but stack has to be 16-byte alligned
+	SUBQ	$16, SP // 8 bytes will do, but stack has to be 16-byte aligned
 	MOVQ	SP, DI
 	LEAQ	libc_pipe(SB), AX
 	CALL	AX
@@ -80,8 +80,10 @@
 
 	get_tls(CX)
 	MOVQ	g(CX), BX
+	CMPQ	BX, $0
+	JEQ	skiperrno1
 	MOVQ	g_m(BX), BX
-	MOVQ	m_perrno(BX), DX
+	MOVQ	(m_mOS+mOS_perrno)(BX), DX
 	CMPQ	DX, $0
 	JEQ	skiperrno1
 	MOVL	$0, 0(DX)
@@ -108,8 +110,10 @@
 
 	get_tls(CX)
 	MOVQ	g(CX), BX
+	CMPQ	BX, $0
+	JEQ	skiperrno2
 	MOVQ	g_m(BX), BX
-	MOVQ	m_perrno(BX), AX
+	MOVQ	(m_mOS+mOS_perrno)(BX), AX
 	CMPQ	AX, $0
 	JEQ	skiperrno2
 	MOVL	0(AX), AX
@@ -169,7 +173,11 @@
 	MOVQ	g(BX), R10
 	CMPQ	R10, $0
 	JNE	allgood
+	MOVQ	SI, 80(SP)
+	MOVQ	DX, 88(SP)
+	LEAQ	80(SP), AX
 	MOVQ	DI, 0(SP)
+	MOVQ	AX, 8(SP)
 	MOVQ	$runtime·badsignal(SB), AX
 	CALL	AX
 	JMP	exit
@@ -196,7 +204,7 @@
 	MOVQ    R10, 176(SP)
 
 	// save m->scratch
-	LEAQ	m_scratch(BP), R11
+	LEAQ	(m_mOS+mOS_scratch)(BP), R11
 	MOVQ	0(R11), R10
 	MOVQ	R10, 112(SP)
 	MOVQ	8(R11), R10
@@ -211,7 +219,7 @@
 	MOVQ	R10, 152(SP)
 
 	// save errno, it might be EINTR; stuff we do here might reset it.
-	MOVQ	m_perrno(BP), R10
+	MOVQ	(m_mOS+mOS_perrno)(BP), R10
 	MOVL	0(R10), R10
 	MOVQ	R10, 160(SP)
 
@@ -220,6 +228,8 @@
 	MOVQ	m_gsignal(BP), BP
 	MOVQ	BP, g(BX)
 
+	// TODO: If current SP is not in gsignal.stack, then adjust.
+
 	// prepare call
 	MOVQ	DI, 0(SP)
 	MOVQ	SI, 8(SP)
@@ -244,7 +254,7 @@
 	MOVQ    R10, libcall_r2(R11)
 
 	// restore scratch
-	LEAQ	m_scratch(BP), R11
+	LEAQ	(m_mOS+mOS_scratch)(BP), R11
 	MOVQ	112(SP), R10
 	MOVQ	R10, 0(R11)
 	MOVQ	120(SP), R10
@@ -259,7 +269,7 @@
 	MOVQ	R10, 40(R11)
 
 	// restore errno
-	MOVQ	m_perrno(BP), R11
+	MOVQ	(m_mOS+mOS_perrno)(BP), R11
 	MOVQ	160(SP), R10
 	MOVL	R10, 0(R11)
 
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index e5fe88a..95130b7 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -206,7 +206,8 @@
 	CALL	runtime·memclr(SB)	// smashes AX,BX,CX
 	LEAL	g__size(SP), BX
 	MOVL	BX, g_m(SP)
-	LEAL	-8192(SP), CX
+
+	LEAL	-32768(SP), CX		// must be less than SizeOfStackReserve set by linker
 	MOVL	CX, (g_stack+stack_lo)(SP)
 	ADDL	$const__StackGuard, CX
 	MOVL	CX, g_stackguard0(SP)
@@ -357,10 +358,11 @@
 	MOVL	CX, 0x14(FS)
 	RET
 
-// Sleep duration is in 100ns units.
-TEXT runtime·usleep1(SB),NOSPLIT,$0
-	MOVL	usec+0(FP), BX
-	MOVL	$runtime·usleep2(SB), AX // to hide from 8l
+// onosstack calls fn on the OS stack.
+// func onosstack(fn unsafe.Pointer, arg uint32)
+TEXT runtime·onosstack(SB),NOSPLIT,$0
+	MOVL	fn+0(FP), AX		// to hide from 8l
+	MOVL	arg+4(FP), BX
 
 	// Execute call on m->g0 stack, in case we are not actually
 	// calling a system call wrapper, like when running under WINE.
@@ -422,6 +424,14 @@
 	MOVL	BP, SP
 	RET
 
+// Runs on OS stack.
+TEXT runtime·switchtothread(SB),NOSPLIT,$0
+	MOVL	SP, BP
+	MOVL	runtime·_SwitchToThread(SB), AX
+	CALL	AX
+	MOVL	BP, SP
+	RET
+
 // func now() (sec int64, nsec int32)
 TEXT time·now(SB),NOSPLIT,$8-12
 	CALL	runtime·unixnano(SB)
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index ea4f3e0..9c19737 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -11,7 +11,7 @@
 #define maxargs 16
 
 // void runtime·asmstdcall(void *c);
-TEXT runtime·asmstdcall(SB),NOSPLIT,$0
+TEXT runtime·asmstdcall(SB),NOSPLIT|NOFRAME,$0
 	// asmcgocall will put first argument into CX.
 	PUSHQ	CX			// save for later
 	MOVQ	libcall_fn(CX), AX
@@ -62,7 +62,7 @@
 
 	RET
 
-TEXT runtime·badsignal2(SB),NOSPLIT,$48
+TEXT runtime·badsignal2(SB),NOSPLIT|NOFRAME,$48
 	// stderr
 	MOVQ	$-12, CX // stderr
 	MOVQ	CX, 0(SP)
@@ -102,7 +102,7 @@
 // exception record and context pointers.
 // Handler function is stored in AX.
 // Return 0 for 'not handled', -1 for handled.
-TEXT runtime·sigtramp(SB),NOSPLIT,$0-0
+TEXT runtime·sigtramp(SB),NOSPLIT|NOFRAME,$0-0
 	// CX: PEXCEPTION_POINTERS ExceptionInfo
 
 	// DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved
@@ -190,32 +190,32 @@
 
 	RET
 
-TEXT runtime·exceptiontramp(SB),NOSPLIT,$0
+TEXT runtime·exceptiontramp(SB),NOSPLIT|NOFRAME,$0
 	MOVQ	$runtime·exceptionhandler(SB), AX
 	JMP	runtime·sigtramp(SB)
 
-TEXT runtime·firstcontinuetramp(SB),NOSPLIT,$0-0
+TEXT runtime·firstcontinuetramp(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ	$runtime·firstcontinuehandler(SB), AX
 	JMP	runtime·sigtramp(SB)
 
-TEXT runtime·lastcontinuetramp(SB),NOSPLIT,$0-0
+TEXT runtime·lastcontinuetramp(SB),NOSPLIT|NOFRAME,$0-0
 	MOVQ	$runtime·lastcontinuehandler(SB), AX
 	JMP	runtime·sigtramp(SB)
 
-TEXT runtime·ctrlhandler(SB),NOSPLIT,$8
+TEXT runtime·ctrlhandler(SB),NOSPLIT|NOFRAME,$8
 	MOVQ	CX, 16(SP)		// spill
 	MOVQ	$runtime·ctrlhandler1(SB), CX
 	MOVQ	CX, 0(SP)
 	CALL	runtime·externalthreadhandler(SB)
 	RET
 
-TEXT runtime·profileloop(SB),NOSPLIT,$8
+TEXT runtime·profileloop(SB),NOSPLIT|NOFRAME,$8
 	MOVQ	$runtime·profileloop1(SB), CX
 	MOVQ	CX, 0(SP)
 	CALL	runtime·externalthreadhandler(SB)
 	RET
 
-TEXT runtime·externalthreadhandler(SB),NOSPLIT,$0
+TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME,$0
 	PUSHQ	BP
 	MOVQ	SP, BP
 	PUSHQ	BX
@@ -228,7 +228,7 @@
 	SUBQ	$m__size, SP		// space for M
 	MOVQ	SP, 0(SP)
 	MOVQ	$m__size, 8(SP)
-	CALL	runtime·memclr(SB)	// smashes AX,BX,CX
+	CALL	runtime·memclr(SB)	// smashes AX,BX,CX, maybe BP
 
 	LEAQ	m_tls(SP), CX
 	MOVQ	CX, 0x28(GS)
@@ -239,11 +239,11 @@
 
 	MOVQ	SP, 0(SP)
 	MOVQ	$g__size, 8(SP)
-	CALL	runtime·memclr(SB)	// smashes AX,BX,CX
+	CALL	runtime·memclr(SB)	// smashes AX,BX,CX, maybe BP
 	LEAQ	g__size(SP), BX
 	MOVQ	BX, g_m(SP)
 
-	LEAQ	-8192(SP), CX
+	LEAQ	-32768(SP), CX		// must be less than SizeOfStackReserve set by linker
 	MOVQ	CX, (g_stack+stack_lo)(SP)
 	ADDQ	$const__StackGuard, CX
 	MOVQ	CX, g_stackguard0(SP)
@@ -381,10 +381,10 @@
 	MOVQ	DI, 0x28(GS)
 	RET
 
-// Sleep duration is in 100ns units.
-TEXT runtime·usleep1(SB),NOSPLIT,$0
-	MOVL	usec+0(FP), BX
-	MOVQ	$runtime·usleep2(SB), AX // to hide from 6l
+// func onosstack(fn unsafe.Pointer, arg uint32)
+TEXT runtime·onosstack(SB),NOSPLIT,$0
+	MOVQ	fn+0(FP), AX		// to hide from 6l
+	MOVL	arg+8(FP), BX
 
 	// Execute call on m->g0 stack, in case we are not actually
 	// calling a system call wrapper, like when running under WINE.
@@ -428,19 +428,33 @@
 	RET
 
 // Runs on OS stack. duration (in 100ns units) is in BX.
-TEXT runtime·usleep2(SB),NOSPLIT,$16
+// The function leaves room for 4 syscall parameters
+// (as per the Windows amd64 calling convention).
+TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$48
 	MOVQ	SP, AX
 	ANDQ	$~15, SP	// alignment as per Windows requirement
-	MOVQ	AX, 8(SP)
+	MOVQ	AX, 40(SP)
 	// Want negative 100ns units.
 	NEGQ	BX
-	MOVQ	SP, R8 // ptime
+	LEAQ	32(SP), R8  // ptime
 	MOVQ	BX, (R8)
 	MOVQ	$-1, CX // handle
 	MOVQ	$0, DX // alertable
 	MOVQ	runtime·_NtWaitForSingleObject(SB), AX
 	CALL	AX
-	MOVQ	8(SP), SP
+	MOVQ	40(SP), SP
+	RET
+
+// Runs on OS stack.
+TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
+	MOVQ	SP, AX
+	ANDQ	$~15, SP	// alignment as per Windows requirement
+	SUBQ	$(48), SP	// room for SP and 4 args as per Windows requirement
+				// plus one extra word to keep stack 16 bytes aligned
+	MOVQ	AX, 32(SP)
+	MOVQ	runtime·_SwitchToThread(SB), AX
+	CALL	AX
+	MOVQ	32(SP), SP
 	RET
 
 // func now() (sec int64, nsec int32)
diff --git a/src/runtime/sys_x86.go b/src/runtime/sys_x86.go
index 3f0771f..f6e45cc 100644
--- a/src/runtime/sys_x86.go
+++ b/src/runtime/sys_x86.go
@@ -1,4 +1,4 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -6,17 +6,20 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // adjust Gobuf as if it executed a call to fn with context ctxt
 // and then did an immediate gosave.
 func gostartcall(buf *gobuf, fn, ctxt unsafe.Pointer) {
 	sp := buf.sp
-	if regSize > ptrSize {
-		sp -= ptrSize
+	if sys.RegSize > sys.PtrSize {
+		sp -= sys.PtrSize
 		*(*uintptr)(unsafe.Pointer(sp)) = 0
 	}
-	sp -= ptrSize
+	sp -= sys.PtrSize
 	*(*uintptr)(unsafe.Pointer(sp)) = buf.pc
 	buf.sp = sp
 	buf.pc = uintptr(fn)
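
gostartcall fakes the stack effect of a CALL: it pushes buf.pc where a
return address would go and then retargets buf.pc at fn, so fn appears to
have been called from the saved pc. A toy model of the pointer arithmetic
(hypothetical types, not runtime code):

package main

import "fmt"

// gobufSketch stands in for the runtime's gobuf; stack is a pretend
// word-addressed stack and sp counts bytes into it.
type gobufSketch struct {
	sp, pc uintptr
}

func gostartcallSketch(buf *gobufSketch, stack []uintptr, fn uintptr) {
	buf.sp -= 8              // sys.PtrSize on amd64
	stack[buf.sp/8] = buf.pc // what a CALL would have pushed
	buf.pc = fn
}

func main() {
	stack := make([]uintptr, 8)
	buf := gobufSketch{sp: 64, pc: 0x1234}
	gostartcallSketch(&buf, stack, 0x5678)
	fmt.Printf("sp=%d pc=%#x ret=%#x\n", buf.sp, buf.pc, stack[buf.sp/8])
	// sp=56 pc=0x5678 ret=0x1234
}
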
@@ -37,13 +40,13 @@
 		return
 	}
 	if pc[0] == 0xcc {
-		// This is a breakpoint inserted by gdb.  We could use
-		// runtime·findfunc to find the function.  But if we
+		// This is a breakpoint inserted by gdb. We could use
+		// runtime·findfunc to find the function. But if we
 		// do that, then we will continue execution at the
 		// function entry point, and we will not hit the gdb
-		// breakpoint.  So for this case we don't change
+		// breakpoint. So for this case we don't change
 		// buf.pc, so that when we return we will execute
-		// the jump instruction and carry on.  This means that
+		// the jump instruction and carry on. This means that
 		// stack unwinding may not work entirely correctly
 		// (https://golang.org/issue/5723) but the user is
 		// running under gdb anyhow.
diff --git a/src/runtime/syscall2_solaris.go b/src/runtime/syscall2_solaris.go
index df72996..e19035a 100644
--- a/src/runtime/syscall2_solaris.go
+++ b/src/runtime/syscall2_solaris.go
@@ -9,9 +9,6 @@
 //go:cgo_import_dynamic libc_chdir chdir "libc.so"
 //go:cgo_import_dynamic libc_chroot chroot "libc.so"
 //go:cgo_import_dynamic libc_close close "libc.so"
-//go:cgo_import_dynamic libc_dlclose dlclose "libc.so"
-//go:cgo_import_dynamic libc_dlopen dlopen "libc.so"
-//go:cgo_import_dynamic libc_dlsym dlsym "libc.so"
 //go:cgo_import_dynamic libc_execve execve "libc.so"
 //go:cgo_import_dynamic libc_fcntl fcntl "libc.so"
 //go:cgo_import_dynamic libc_gethostname gethostname "libc.so"
@@ -30,9 +27,6 @@
 //go:linkname libc_chdir libc_chdir
 //go:linkname libc_chroot libc_chroot
 //go:linkname libc_close libc_close
-//go:linkname libc_dlclose libc_dlclose
-//go:linkname libc_dlopen libc_dlopen
-//go:linkname libc_dlsym libc_dlsym
 //go:linkname libc_execve libc_execve
 //go:linkname libc_fcntl libc_fcntl
 //go:linkname libc_gethostname libc_gethostname
diff --git a/src/runtime/syscall_solaris.go b/src/runtime/syscall_solaris.go
index ae1f334..6c9dbe2 100644
--- a/src/runtime/syscall_solaris.go
+++ b/src/runtime/syscall_solaris.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 8e069cd..cd23b8d 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -45,16 +45,16 @@
 		panic("compileCallback: not a function")
 	}
 	ft := (*functype)(unsafe.Pointer(fn._type))
-	if ft.out.len != 1 {
+	if len(ft.out()) != 1 {
 		panic("compileCallback: function must have one output parameter")
 	}
 	uintptrSize := unsafe.Sizeof(uintptr(0))
-	if t := (**_type)(unsafe.Pointer(ft.out.array)); (*t).size != uintptrSize {
+	if ft.out()[0].size != uintptrSize {
 		panic("compileCallback: output parameter size is wrong")
 	}
 	argsize := uintptr(0)
-	for _, t := range (*[1024](*_type))(unsafe.Pointer(ft.in.array))[:ft.in.len] {
-		if (*t).size > uintptrSize {
+	for _, t := range ft.in() {
+		if t.size > uintptrSize {
 			panic("compileCallback: input parameter size is wrong")
 		}
 		argsize += uintptrSize
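
The rewritten checks above spell out the documented contract of
syscall.NewCallback: exactly one uintptr-sized result, and every parameter
at most uintptr-sized (each still occupies a full word of argument space).
For illustration, a typical EnumWindows callback satisfies both rules:

// Illustrative only: a callback shape that passes the checks above.
func enumWindowsCallback(hwnd uintptr, lparam uintptr) uintptr {
	_ = hwnd
	_ = lparam
	return 1 // non-zero tells EnumWindows to keep enumerating
}

// cb := syscall.NewCallback(enumWindowsCallback)
// user32.NewProc("EnumWindows").Call(cb, 0)
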
@@ -88,6 +88,41 @@
 	return callbackasmAddr(n)
 }
 
+const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+
+//go:linkname syscall_loadsystemlibrary syscall.loadsystemlibrary
+//go:nosplit
+func syscall_loadsystemlibrary(filename *uint16) (handle, err uintptr) {
+	c := &getg().m.syscall
+
+	if useLoadLibraryEx {
+		c.fn = getLoadLibraryEx()
+		c.n = 3
+		args := struct {
+			lpFileName *uint16
+			hFile      uintptr // always 0
+			flags      uint32
+		}{filename, 0, _LOAD_LIBRARY_SEARCH_SYSTEM32}
+		c.args = uintptr(noescape(unsafe.Pointer(&args)))
+	} else {
+		// User is on Windows XP or something ancient.
+		// The caller wanted to only load the filename DLL
+		// from the System32 directory but that facility
+		// doesn't exist, so just load it the normal way. This
+		// is a potential security risk, but so is Windows XP.
+		c.fn = getLoadLibrary()
+		c.n = 1
+		c.args = uintptr(noescape(unsafe.Pointer(&filename)))
+	}
+
+	cgocall(asmstdcallAddr, unsafe.Pointer(c))
+	handle = c.r1
+	if handle == 0 {
+		err = c.err
+	}
+	return
+}
+
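
loadsystemlibrary above is runtime plumbing for the syscall package, but
user code can apply the same mitigation by calling LoadLibraryExW itself
with LOAD_LIBRARY_SEARCH_SYSTEM32. A hedged, Windows-only sketch
(hypothetical helper; on pre-KB2533623 systems the flag is unsupported and
the call simply fails rather than falling back):

package main

import (
	"fmt"
	"syscall"
	"unsafe"
)

const loadLibrarySearchSystem32 = 0x00000800

// loadSystemDLL loads name from the System32 directory only.
func loadSystemDLL(name string) (syscall.Handle, error) {
	k32 := syscall.MustLoadDLL("kernel32.dll")
	loadEx := k32.MustFindProc("LoadLibraryExW")
	p, err := syscall.UTF16PtrFromString(name)
	if err != nil {
		return 0, err
	}
	h, _, callErr := loadEx.Call(uintptr(unsafe.Pointer(p)), 0, loadLibrarySearchSystem32)
	if h == 0 {
		return 0, callErr // flag unsupported or DLL absent from System32
	}
	return syscall.Handle(h), nil
}

func main() {
	h, err := loadSystemDLL("winmm.dll")
	fmt.Println(h, err)
}
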
 //go:linkname syscall_loadlibrary syscall.loadlibrary
 //go:nosplit
 func syscall_loadlibrary(filename *uint16) (handle, err uintptr) {
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 677eb5f..4a10749 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -1,11 +1,14 @@
-// Copyright 2010 The Go Authors.  All rights reserved.
+// Copyright 2010 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime_test
 
 import (
+	"bytes"
 	"fmt"
+	"internal/syscall/windows/sysdll"
+	"internal/testenv"
 	"io/ioutil"
 	"os"
 	"os/exec"
@@ -499,7 +502,7 @@
 }
 
 func TestRaiseException(t *testing.T) {
-	o := executeTest(t, raiseExceptionSource, nil)
+	o := runTestProg(t, "testprog", "RaiseException")
 	if strings.Contains(o, "RaiseException should not return") {
 		t.Fatalf("RaiseException did not crash program: %v", o)
 	}
@@ -508,35 +511,13 @@
 	}
 }
 
-const raiseExceptionSource = `
-package main
-import "syscall"
-func main() {
-	const EXCEPTION_NONCONTINUABLE = 1
-	mod := syscall.MustLoadDLL("kernel32.dll")
-	proc := mod.MustFindProc("RaiseException")
-	proc.Call(0xbad, EXCEPTION_NONCONTINUABLE, 0, 0)
-	println("RaiseException should not return")
-}
-`
-
 func TestZeroDivisionException(t *testing.T) {
-	o := executeTest(t, zeroDivisionExceptionSource, nil)
+	o := runTestProg(t, "testprog", "ZeroDivisionException")
 	if !strings.Contains(o, "panic: runtime error: integer divide by zero") {
 		t.Fatalf("No stack trace: %v", o)
 	}
 }
 
-const zeroDivisionExceptionSource = `
-package main
-func main() {
-	x := 1
-	y := 0
-	z := x / y
-	println(z)
-}
-`
-
 func TestWERDialogue(t *testing.T) {
 	if os.Getenv("TESTING_WER_DIALOGUE") == "1" {
 		defer os.Exit(0)
@@ -640,3 +621,354 @@
 		t.Errorf("got %d want %d", got, want)
 	}
 }
+
+func TestTimeBeginPeriod(t *testing.T) {
+	const TIMERR_NOERROR = 0
+	if *runtime.TimeBeginPeriodRetValue != TIMERR_NOERROR {
+		t.Fatalf("timeBeginPeriod failed: it returned %d", *runtime.TimeBeginPeriodRetValue)
+	}
+}
+
+// removeOneCPU removes one (any) CPU from the affinity mask.
+// It returns the new affinity mask.
+func removeOneCPU(mask uintptr) (uintptr, error) {
+	if mask == 0 {
+		return 0, fmt.Errorf("cpu affinity mask is empty")
+	}
+	maskbits := int(unsafe.Sizeof(mask) * 8)
+	for i := 0; i < maskbits; i++ {
+		newmask := mask & ^(1 << uint(i))
+		if newmask != mask {
+			return newmask, nil
+		}
+	}
+	panic("not reached")
+}
+
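
The scan above clears the first set bit it finds; the classic bit trick
x & (x-1) does the same in one step, noted here purely as an aside:

// Equivalent one-liner (well-known bit trick, illustration only):
// x & (x-1) clears the lowest set bit of a non-zero mask.
func removeOneCPUFast(mask uintptr) uintptr {
	return mask & (mask - 1)
}
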
+func resumeChildThread(kernel32 *syscall.DLL, childpid int) error {
+	_OpenThread := kernel32.MustFindProc("OpenThread")
+	_ResumeThread := kernel32.MustFindProc("ResumeThread")
+	_Thread32First := kernel32.MustFindProc("Thread32First")
+	_Thread32Next := kernel32.MustFindProc("Thread32Next")
+
+	snapshot, err := syscall.CreateToolhelp32Snapshot(syscall.TH32CS_SNAPTHREAD, 0)
+	if err != nil {
+		return err
+	}
+	defer syscall.CloseHandle(snapshot)
+
+	const _THREAD_SUSPEND_RESUME = 0x0002
+
+	type ThreadEntry32 struct {
+		Size           uint32
+		tUsage         uint32
+		ThreadID       uint32
+		OwnerProcessID uint32
+		BasePri        int32
+		DeltaPri       int32
+		Flags          uint32
+	}
+
+	var te ThreadEntry32
+	te.Size = uint32(unsafe.Sizeof(te))
+	ret, _, err := _Thread32First.Call(uintptr(snapshot), uintptr(unsafe.Pointer(&te)))
+	if ret == 0 {
+		return err
+	}
+	for te.OwnerProcessID != uint32(childpid) {
+		ret, _, err = _Thread32Next.Call(uintptr(snapshot), uintptr(unsafe.Pointer(&te)))
+		if ret == 0 {
+			return err
+		}
+	}
+	h, _, err := _OpenThread.Call(_THREAD_SUSPEND_RESUME, 1, uintptr(te.ThreadID))
+	if h == 0 {
+		return err
+	}
+	defer syscall.Close(syscall.Handle(h))
+
+	ret, _, err = _ResumeThread.Call(h)
+	if ret == 0xffffffff {
+		return err
+	}
+	return nil
+}
+
+func TestNumCPU(t *testing.T) {
+	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
+		// in child process
+		fmt.Fprintf(os.Stderr, "%d", runtime.NumCPU())
+		os.Exit(0)
+	}
+
+	switch n := runtime.NumberOfProcessors(); {
+	case n < 1:
+		t.Fatalf("system cannot have %d cpu(s)", n)
+	case n == 1:
+		if runtime.NumCPU() != 1 {
+			t.Fatalf("runtime.NumCPU() returns %d on single cpu system", runtime.NumCPU())
+		}
+		return
+	}
+
+	const (
+		_CREATE_SUSPENDED   = 0x00000004
+		_PROCESS_ALL_ACCESS = syscall.STANDARD_RIGHTS_REQUIRED | syscall.SYNCHRONIZE | 0xfff
+	)
+
+	kernel32 := syscall.MustLoadDLL("kernel32.dll")
+	_GetProcessAffinityMask := kernel32.MustFindProc("GetProcessAffinityMask")
+	_SetProcessAffinityMask := kernel32.MustFindProc("SetProcessAffinityMask")
+
+	cmd := exec.Command(os.Args[0], "-test.run=TestNumCPU")
+	cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1")
+	var buf bytes.Buffer
+	cmd.Stdout = &buf
+	cmd.Stderr = &buf
+	cmd.SysProcAttr = &syscall.SysProcAttr{CreationFlags: _CREATE_SUSPENDED}
+	err := cmd.Start()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		err = cmd.Wait()
+		childOutput := string(buf.Bytes())
+		if err != nil {
+			t.Fatalf("child failed: %v: %v", err, childOutput)
+		}
+		// removeOneCPU should have decreased child cpu count by 1
+		want := fmt.Sprintf("%d", runtime.NumCPU()-1)
+		if childOutput != want {
+			t.Fatalf("child output: want %q, got %q", want, childOutput)
+		}
+	}()
+
+	defer func() {
+		err = resumeChildThread(kernel32, cmd.Process.Pid)
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	ph, err := syscall.OpenProcess(_PROCESS_ALL_ACCESS, false, uint32(cmd.Process.Pid))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer syscall.CloseHandle(ph)
+
+	var mask, sysmask uintptr
+	ret, _, err := _GetProcessAffinityMask.Call(uintptr(ph), uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
+	if ret == 0 {
+		t.Fatal(err)
+	}
+
+	newmask, err := removeOneCPU(mask)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	ret, _, err = _SetProcessAffinityMask.Call(uintptr(ph), newmask)
+	if ret == 0 {
+		t.Fatal(err)
+	}
+	ret, _, err = _GetProcessAffinityMask.Call(uintptr(ph), uintptr(unsafe.Pointer(&mask)), uintptr(unsafe.Pointer(&sysmask)))
+	if ret == 0 {
+		t.Fatal(err)
+	}
+	if newmask != mask {
+		t.Fatalf("SetProcessAffinityMask didn't set newmask of 0x%x. Current mask is 0x%x.", newmask, mask)
+	}
+}
+
+// See Issue 14959
+func TestDLLPreloadMitigation(t *testing.T) {
+	if _, err := exec.LookPath("gcc"); err != nil {
+		t.Skip("skipping test: gcc is missing")
+	}
+
+	tmpdir, err := ioutil.TempDir("", "TestDLLPreloadMitigation")
+	if err != nil {
+		t.Fatal("TempDir failed: ", err)
+	}
+	defer func() {
+		err := os.RemoveAll(tmpdir)
+		if err != nil {
+			t.Error(err)
+		}
+	}()
+
+	dir0, err := os.Getwd()
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer os.Chdir(dir0)
+
+	const src = `
+#include <stdint.h>
+#include <windows.h>
+
+uintptr_t cfunc() {
+   SetLastError(123);
+}
+`
+	srcname := "nojack.c"
+	err = ioutil.WriteFile(filepath.Join(tmpdir, srcname), []byte(src), 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	name := "nojack.dll"
+	cmd := exec.Command("gcc", "-shared", "-s", "-Werror", "-o", name, srcname)
+	cmd.Dir = tmpdir
+	out, err := cmd.CombinedOutput()
+	if err != nil {
+		t.Fatalf("failed to build dll: %v - %v", err, string(out))
+	}
+	dllpath := filepath.Join(tmpdir, name)
+
+	dll := syscall.MustLoadDLL(dllpath)
+	dll.MustFindProc("cfunc")
+	dll.Release()
+
+	// Get into the directory with the DLL we'll load by base name
+	// ("nojack.dll"). Think of this as the user double-clicking an
+	// installer from their Downloads directory where a browser
+	// silently downloaded some malicious DLLs.
+	os.Chdir(tmpdir)
+
+	// First, check that we can load a DLL from the current directory,
+	// referring to it only as "nojack.dll", without an absolute path.
+	delete(sysdll.IsSystemDLL, name) // in case test was run repeatedly
+	dll, err = syscall.LoadDLL(name)
+	if err != nil {
+		t.Fatalf("failed to load %s by base name before sysdll registration: %v", name, err)
+	}
+	dll.Release()
+
+	// And now verify that if we register it as a system32-only
+	// DLL, the implicit loading from the current directory no
+	// longer works.
+	sysdll.IsSystemDLL[name] = true
+	dll, err = syscall.LoadDLL(name)
+	if err == nil {
+		dll.Release()
+		if wantLoadLibraryEx() {
+			t.Fatalf("Bad: insecure load of DLL by base name %q after sysdll registration: %v", name, err)
+		}
+		t.Skip("insecure load of DLL, but expected")
+	}
+}
+
+// wantLoadLibraryEx reports whether we expect LoadLibraryEx to work for tests.
+func wantLoadLibraryEx() bool {
+	return testenv.Builder() == "windows-amd64-gce" || testenv.Builder() == "windows-386-gce"
+}
+
+func TestLoadLibraryEx(t *testing.T) {
+	use, have, flags := runtime.LoadLibraryExStatus()
+	if use {
+		return // success.
+	}
+	if wantLoadLibraryEx() {
+		t.Fatalf("Expected LoadLibraryEx+flags to be available. (LoadLibraryEx=%v; flags=%v)",
+			have, flags)
+	}
+	t.Skipf("LoadLibraryEx not usable, but not expected. (LoadLibraryEx=%v; flags=%v)",
+		have, flags)
+}
+
+var (
+	modwinmm    = syscall.NewLazyDLL("winmm.dll")
+	modkernel32 = syscall.NewLazyDLL("kernel32.dll")
+
+	procCreateEvent = modkernel32.NewProc("CreateEventW")
+	procSetEvent    = modkernel32.NewProc("SetEvent")
+)
+
+func createEvent() (syscall.Handle, error) {
+	r0, _, e0 := syscall.Syscall6(procCreateEvent.Addr(), 4, 0, 0, 0, 0, 0, 0)
+	if r0 == 0 {
+		return 0, syscall.Errno(e0)
+	}
+	return syscall.Handle(r0), nil
+}
+
+func setEvent(h syscall.Handle) error {
+	r0, _, e0 := syscall.Syscall(procSetEvent.Addr(), 1, uintptr(h), 0, 0)
+	if r0 == 0 {
+		return syscall.Errno(e0)
+	}
+	return nil
+}
+
+func BenchmarkChanToSyscallPing(b *testing.B) {
+	n := b.N
+	ch := make(chan int)
+	event, err := createEvent()
+	if err != nil {
+		b.Fatal(err)
+	}
+	go func() {
+		for i := 0; i < n; i++ {
+			syscall.WaitForSingleObject(event, syscall.INFINITE)
+			ch <- 1
+		}
+	}()
+	for i := 0; i < n; i++ {
+		err := setEvent(event)
+		if err != nil {
+			b.Fatal(err)
+		}
+		<-ch
+	}
+}
+
+func BenchmarkSyscallToSyscallPing(b *testing.B) {
+	n := b.N
+	event1, err := createEvent()
+	if err != nil {
+		b.Fatal(err)
+	}
+	event2, err := createEvent()
+	if err != nil {
+		b.Fatal(err)
+	}
+	go func() {
+		for i := 0; i < n; i++ {
+			syscall.WaitForSingleObject(event1, syscall.INFINITE)
+			err := setEvent(event2)
+			if err != nil {
+				b.Fatal(err)
+			}
+		}
+	}()
+	for i := 0; i < n; i++ {
+		err := setEvent(event1)
+		if err != nil {
+			b.Fatal(err)
+		}
+		syscall.WaitForSingleObject(event2, syscall.INFINITE)
+	}
+}
+
+func BenchmarkChanToChanPing(b *testing.B) {
+	n := b.N
+	ch1 := make(chan int)
+	ch2 := make(chan int)
+	go func() {
+		for i := 0; i < n; i++ {
+			<-ch1
+			ch2 <- 1
+		}
+	}()
+	for i := 0; i < n; i++ {
+		ch1 <- 1
+		<-ch2
+	}
+}
+
+func BenchmarkOsYield(b *testing.B) {
+	for i := 0; i < b.N; i++ {
+		runtime.OsYield()
+	}
+}
diff --git a/src/runtime/testdata/testprog/crash.go b/src/runtime/testdata/testprog/crash.go
new file mode 100644
index 0000000..4d83132
--- /dev/null
+++ b/src/runtime/testdata/testprog/crash.go
@@ -0,0 +1,45 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"runtime"
+)
+
+func init() {
+	register("Crash", Crash)
+}
+
+func test(name string) {
+	defer func() {
+		if x := recover(); x != nil {
+			fmt.Printf(" recovered")
+		}
+		fmt.Printf(" done\n")
+	}()
+	fmt.Printf("%s:", name)
+	var s *string
+	_ = *s
+	fmt.Print("SHOULD NOT BE HERE")
+}
+
+func testInNewThread(name string) {
+	c := make(chan bool)
+	go func() {
+		runtime.LockOSThread()
+		test(name)
+		c <- true
+	}()
+	<-c
+}
+
+func Crash() {
+	runtime.LockOSThread()
+	test("main")
+	testInNewThread("new-thread")
+	testInNewThread("second-new-thread")
+	test("main-again")
+}
diff --git a/src/runtime/testdata/testprog/deadlock.go b/src/runtime/testdata/testprog/deadlock.go
new file mode 100644
index 0000000..c938fcf
--- /dev/null
+++ b/src/runtime/testdata/testprog/deadlock.go
@@ -0,0 +1,216 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"runtime"
+	"runtime/debug"
+	"time"
+)
+
+func init() {
+	registerInit("InitDeadlock", InitDeadlock)
+	registerInit("NoHelperGoroutines", NoHelperGoroutines)
+
+	register("SimpleDeadlock", SimpleDeadlock)
+	register("LockedDeadlock", LockedDeadlock)
+	register("LockedDeadlock2", LockedDeadlock2)
+	register("GoexitDeadlock", GoexitDeadlock)
+	register("StackOverflow", StackOverflow)
+	register("ThreadExhaustion", ThreadExhaustion)
+	register("RecursivePanic", RecursivePanic)
+	register("GoexitExit", GoexitExit)
+	register("GoNil", GoNil)
+	register("MainGoroutineID", MainGoroutineID)
+	register("Breakpoint", Breakpoint)
+	register("GoexitInPanic", GoexitInPanic)
+	register("PanicAfterGoexit", PanicAfterGoexit)
+	register("RecoveredPanicAfterGoexit", RecoveredPanicAfterGoexit)
+	register("PanicTraceback", PanicTraceback)
+	register("GoschedInPanic", GoschedInPanic)
+	register("SyscallInPanic", SyscallInPanic)
+}
+
+func SimpleDeadlock() {
+	select {}
+	panic("not reached")
+}
+
+func InitDeadlock() {
+	select {}
+	panic("not reached")
+}
+
+func LockedDeadlock() {
+	runtime.LockOSThread()
+	select {}
+}
+
+func LockedDeadlock2() {
+	go func() {
+		runtime.LockOSThread()
+		select {}
+	}()
+	time.Sleep(time.Millisecond)
+	select {}
+}
+
+func GoexitDeadlock() {
+	F := func() {
+		for i := 0; i < 10; i++ {
+		}
+	}
+
+	go F()
+	go F()
+	runtime.Goexit()
+}
+
+func StackOverflow() {
+	var f func() byte
+	f = func() byte {
+		var buf [64 << 10]byte
+		return buf[0] + f()
+	}
+	debug.SetMaxStack(1474560)
+	f()
+}
+
+func ThreadExhaustion() {
+	debug.SetMaxThreads(10)
+	c := make(chan int)
+	for i := 0; i < 100; i++ {
+		go func() {
+			runtime.LockOSThread()
+			c <- 0
+			select {}
+		}()
+		<-c
+	}
+}
+
+func RecursivePanic() {
+	func() {
+		defer func() {
+			fmt.Println(recover())
+		}()
+		var x [8192]byte
+		func(x [8192]byte) {
+			defer func() {
+				if err := recover(); err != nil {
+					panic("wrap: " + err.(string))
+				}
+			}()
+			panic("bad")
+		}(x)
+	}()
+	panic("again")
+}
+
+func GoexitExit() {
+	go func() {
+		time.Sleep(time.Millisecond)
+	}()
+	i := 0
+	runtime.SetFinalizer(&i, func(p *int) {})
+	runtime.GC()
+	runtime.Goexit()
+}
+
+func GoNil() {
+	defer func() {
+		recover()
+	}()
+	var f func()
+	go f()
+	select {}
+}
+
+func MainGoroutineID() {
+	panic("test")
+}
+
+func NoHelperGoroutines() {
+	i := 0
+	runtime.SetFinalizer(&i, func(p *int) {})
+	time.AfterFunc(time.Hour, func() {})
+	panic("oops")
+}
+
+func Breakpoint() {
+	runtime.Breakpoint()
+}
+
+func GoexitInPanic() {
+	go func() {
+		defer func() {
+			runtime.Goexit()
+		}()
+		panic("hello")
+	}()
+	runtime.Goexit()
+}
+
+type errorThatGosched struct{}
+
+func (errorThatGosched) Error() string {
+	runtime.Gosched()
+	return "errorThatGosched"
+}
+
+func GoschedInPanic() {
+	panic(errorThatGosched{})
+}
+
+type errorThatPrint struct{}
+
+func (errorThatPrint) Error() string {
+	fmt.Println("1")
+	fmt.Println("2")
+	return "3"
+}
+
+func SyscallInPanic() {
+	panic(errorThatPrint{})
+}
+
+func PanicAfterGoexit() {
+	defer func() {
+		panic("hello")
+	}()
+	runtime.Goexit()
+}
+
+func RecoveredPanicAfterGoexit() {
+	defer func() {
+		defer func() {
+			r := recover()
+			if r == nil {
+				panic("bad recover")
+			}
+		}()
+		panic("hello")
+	}()
+	runtime.Goexit()
+}
+
+func PanicTraceback() {
+	pt1()
+}
+
+func pt1() {
+	defer func() {
+		panic("panic pt1")
+	}()
+	pt2()
+}
+
+func pt2() {
+	defer func() {
+		panic("panic pt2")
+	}()
+	panic("hello")
+}
diff --git a/src/runtime/testdata/testprog/gc.go b/src/runtime/testdata/testprog/gc.go
new file mode 100644
index 0000000..a0c1f82
--- /dev/null
+++ b/src/runtime/testdata/testprog/gc.go
@@ -0,0 +1,109 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"os"
+	"runtime"
+	"runtime/debug"
+	"sync/atomic"
+	"time"
+)
+
+func init() {
+	register("GCFairness", GCFairness)
+	register("GCFairness2", GCFairness2)
+	register("GCSys", GCSys)
+}
+
+func GCSys() {
+	runtime.GOMAXPROCS(1)
+	memstats := new(runtime.MemStats)
+	runtime.GC()
+	runtime.ReadMemStats(memstats)
+	sys := memstats.Sys
+
+	runtime.MemProfileRate = 0 // disable profiler
+
+	itercount := 100000
+	for i := 0; i < itercount; i++ {
+		workthegc()
+	}
+
+	// Should only be using a few MB.
+	// We allocated 100 MB or (if not short) 1 GB.
+	runtime.ReadMemStats(memstats)
+	if sys > memstats.Sys {
+		sys = 0
+	} else {
+		sys = memstats.Sys - sys
+	}
+	if sys > 16<<20 {
+		fmt.Printf("using too much memory: %d bytes\n", sys)
+		return
+	}
+	fmt.Printf("OK\n")
+}
+
+func workthegc() []byte {
+	return make([]byte, 1029)
+}
+
+func GCFairness() {
+	runtime.GOMAXPROCS(1)
+	f, err := os.Open("/dev/null")
+	if os.IsNotExist(err) {
+		// This test is meaningful only if writes are fast, hence /dev/null.
+		// If there is no /dev/null, we just don't execute the test.
+		fmt.Println("OK")
+		return
+	}
+	if err != nil {
+		fmt.Println(err)
+		os.Exit(1)
+	}
+	for i := 0; i < 2; i++ {
+		go func() {
+			for {
+				f.Write([]byte("."))
+			}
+		}()
+	}
+	time.Sleep(10 * time.Millisecond)
+	fmt.Println("OK")
+}
+
+func GCFairness2() {
+	// Make sure user code can't exploit the GC's high priority
+	// scheduling to make scheduling of user code unfair. See
+	// issue #15706.
+	runtime.GOMAXPROCS(1)
+	debug.SetGCPercent(1)
+	var count [3]int64
+	var sink [3]interface{}
+	for i := range count {
+		go func(i int) {
+			for {
+				sink[i] = make([]byte, 1024)
+				atomic.AddInt64(&count[i], 1)
+			}
+		}(i)
+	}
+	// Note: If the unfairness is really bad, it may not even get
+	// past the sleep.
+	//
+	// If the scheduling rules change, this may not be enough time
+	// to let all goroutines run, but for now we cycle through
+	// them rapidly.
+	time.Sleep(30 * time.Millisecond)
+	for i := range count {
+		if atomic.LoadInt64(&count[i]) == 0 {
+			fmt.Printf("goroutine %d did not run\n", i)
+			return
+		}
+	}
+	fmt.Println("OK")
+}
diff --git a/src/runtime/testdata/testprog/main.go b/src/runtime/testdata/testprog/main.go
new file mode 100644
index 0000000..ae491a2
--- /dev/null
+++ b/src/runtime/testdata/testprog/main.go
@@ -0,0 +1,35 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "os"
+
+var cmds = map[string]func(){}
+
+func register(name string, f func()) {
+	if cmds[name] != nil {
+		panic("duplicate registration: " + name)
+	}
+	cmds[name] = f
+}
+
+func registerInit(name string, f func()) {
+	if len(os.Args) >= 2 && os.Args[1] == name {
+		f()
+	}
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		println("usage: " + os.Args[0] + " name-of-test")
+		return
+	}
+	f := cmds[os.Args[1]]
+	if f == nil {
+		println("unknown function: " + os.Args[1])
+		return
+	}
+	f()
+}
diff --git a/src/runtime/testdata/testprog/memprof.go b/src/runtime/testdata/testprog/memprof.go
new file mode 100644
index 0000000..a22fee6
--- /dev/null
+++ b/src/runtime/testdata/testprog/memprof.go
@@ -0,0 +1,49 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"runtime"
+	"runtime/pprof"
+)
+
+func init() {
+	register("MemProf", MemProf)
+}
+
+var memProfBuf bytes.Buffer
+var memProfStr string
+
+func MemProf() {
+	for i := 0; i < 1000; i++ {
+		fmt.Fprintf(&memProfBuf, "%*d\n", i, i)
+	}
+	memProfStr = memProfBuf.String()
+
+	runtime.GC()
+
+	f, err := ioutil.TempFile("", "memprof")
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	if err := pprof.WriteHeapProfile(f); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	name := f.Name()
+	if err := f.Close(); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	fmt.Println(name)
+}
diff --git a/src/runtime/testdata/testprog/misc.go b/src/runtime/testdata/testprog/misc.go
new file mode 100644
index 0000000..7ccd389
--- /dev/null
+++ b/src/runtime/testdata/testprog/misc.go
@@ -0,0 +1,15 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "runtime"
+
+func init() {
+	register("NumGoroutine", NumGoroutine)
+}
+
+func NumGoroutine() {
+	println(runtime.NumGoroutine())
+}
diff --git a/src/runtime/testdata/testprog/signal.go b/src/runtime/testdata/testprog/signal.go
new file mode 100644
index 0000000..2ccbada
--- /dev/null
+++ b/src/runtime/testdata/testprog/signal.go
@@ -0,0 +1,29 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !windows,!plan9,!nacl
+
+package main
+
+import (
+	"syscall"
+	"time"
+)
+
+func init() {
+	register("SignalExitStatus", SignalExitStatus)
+}
+
+func SignalExitStatus() {
+	syscall.Kill(syscall.Getpid(), syscall.SIGTERM)
+
+	// Should die immediately, but we've seen flakiness on various
+	// systems (see issue 14063). It's possible that the signal is
+	// being delivered to a different thread and we are returning
+	// and exiting before that thread runs again. Give the program
+	// a little while to die to make sure we pick up the signal
+	// before we return and exit the program. The time here
+	// shouldn't matter--we'll never really sleep this long.
+	time.Sleep(time.Second)
+}
diff --git a/src/runtime/testdata/testprog/stringconcat.go b/src/runtime/testdata/testprog/stringconcat.go
new file mode 100644
index 0000000..f233e66
--- /dev/null
+++ b/src/runtime/testdata/testprog/stringconcat.go
@@ -0,0 +1,20 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "strings"
+
+func init() {
+	register("stringconcat", stringconcat)
+}
+
+func stringconcat() {
+	s0 := strings.Repeat("0", 1<<10)
+	s1 := strings.Repeat("1", 1<<10)
+	s2 := strings.Repeat("2", 1<<10)
+	s3 := strings.Repeat("3", 1<<10)
+	s := s0 + s1 + s2 + s3
+	panic(s)
+}
diff --git a/src/runtime/testdata/testprog/syscall_windows.go b/src/runtime/testdata/testprog/syscall_windows.go
new file mode 100644
index 0000000..6e6782e
--- /dev/null
+++ b/src/runtime/testdata/testprog/syscall_windows.go
@@ -0,0 +1,27 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "syscall"
+
+func init() {
+	register("RaiseException", RaiseException)
+	register("ZeroDivisionException", ZeroDivisionException)
+}
+
+func RaiseException() {
+	const EXCEPTION_NONCONTINUABLE = 1
+	mod := syscall.MustLoadDLL("kernel32.dll")
+	proc := mod.MustFindProc("RaiseException")
+	proc.Call(0xbad, EXCEPTION_NONCONTINUABLE, 0, 0)
+	println("RaiseException should not return")
+}
+
+func ZeroDivisionException() {
+	x := 1
+	y := 0
+	z := x / y
+	println(z)
+}
diff --git a/src/runtime/testdata/testprogcgo/aprof.go b/src/runtime/testdata/testprogcgo/aprof.go
new file mode 100644
index 0000000..aabca9e
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/aprof.go
@@ -0,0 +1,53 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// Test that SIGPROF received in C code does not crash the process
+// looking for the C code's func pointer.
+
+// The test fails when the function is the first C function.
+// The exported functions are the first C functions, so we use that.
+
+// extern void GoNop();
+import "C"
+
+import (
+	"bytes"
+	"fmt"
+	"runtime/pprof"
+	"time"
+)
+
+func init() {
+	register("CgoCCodeSIGPROF", CgoCCodeSIGPROF)
+}
+
+//export GoNop
+func GoNop() {}
+
+func CgoCCodeSIGPROF() {
+	c := make(chan bool)
+	go func() {
+		<-c
+		start := time.Now()
+		for i := 0; i < 1e7; i++ {
+			if i%1000 == 0 {
+				if time.Since(start) > time.Second {
+					break
+				}
+			}
+			C.GoNop()
+		}
+		c <- true
+	}()
+
+	var buf bytes.Buffer
+	pprof.StartCPUProfile(&buf)
+	c <- true
+	<-c
+	pprof.StopCPUProfile()
+
+	fmt.Println("OK")
+}
diff --git a/src/runtime/testdata/testprogcgo/callback.go b/src/runtime/testdata/testprogcgo/callback.go
new file mode 100644
index 0000000..7d9d68d
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/callback.go
@@ -0,0 +1,89 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <pthread.h>
+
+void go_callback();
+
+static void *thr(void *arg) {
+    go_callback();
+    return 0;
+}
+
+static void foo() {
+    pthread_t th;
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setstacksize(&attr, 256 << 10);
+    pthread_create(&th, &attr, thr, 0);
+    pthread_join(th, 0);
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"runtime"
+)
+
+func init() {
+	register("CgoCallbackGC", CgoCallbackGC)
+}
+
+//export go_callback
+func go_callback() {
+	runtime.GC()
+	grow()
+	runtime.GC()
+}
+
+var cnt int
+
+func grow() {
+	x := 10000
+	sum := 0
+	if grow1(&x, &sum) == 0 {
+		panic("bad")
+	}
+}
+
+func grow1(x, sum *int) int {
+	if *x == 0 {
+		return *sum + 1
+	}
+	*x--
+	sum1 := *sum + *x
+	return grow1(x, &sum1)
+}
+
+func CgoCallbackGC() {
+	const P = 100
+	done := make(chan bool)
+	// allocate a bunch of stack frames and spray them with pointers
+	for i := 0; i < P; i++ {
+		go func() {
+			grow()
+			done <- true
+		}()
+	}
+	for i := 0; i < P; i++ {
+		<-done
+	}
+	// now give these stack frames to cgo callbacks
+	for i := 0; i < P; i++ {
+		go func() {
+			C.foo()
+			done <- true
+		}()
+	}
+	for i := 0; i < P; i++ {
+		<-done
+	}
+	fmt.Printf("OK\n")
+}
diff --git a/src/runtime/testdata/testprogcgo/cgo.go b/src/runtime/testdata/testprogcgo/cgo.go
new file mode 100644
index 0000000..870d4ef
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/cgo.go
@@ -0,0 +1,100 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+/*
+void foo1(void) {}
+void foo2(void* p) {}
+*/
+import "C"
+import (
+	"fmt"
+	"os"
+	"runtime"
+	"strconv"
+	"time"
+	"unsafe"
+)
+
+func init() {
+	register("CgoSignalDeadlock", CgoSignalDeadlock)
+	register("CgoTraceback", CgoTraceback)
+	register("CgoCheckBytes", CgoCheckBytes)
+}
+
+func CgoSignalDeadlock() {
+	runtime.GOMAXPROCS(100)
+	ping := make(chan bool)
+	go func() {
+		for i := 0; ; i++ {
+			runtime.Gosched()
+			select {
+			case done := <-ping:
+				if done {
+					ping <- true
+					return
+				}
+				ping <- true
+			default:
+			}
+			func() {
+				defer func() {
+					recover()
+				}()
+				var s *string
+				*s = ""
+			}()
+		}
+	}()
+	time.Sleep(time.Millisecond)
+	for i := 0; i < 64; i++ {
+		go func() {
+			runtime.LockOSThread()
+			select {}
+		}()
+		go func() {
+			runtime.LockOSThread()
+			select {}
+		}()
+		time.Sleep(time.Millisecond)
+		ping <- false
+		select {
+		case <-ping:
+		case <-time.After(time.Second):
+			fmt.Printf("HANG\n")
+			return
+		}
+	}
+	ping <- true
+	select {
+	case <-ping:
+	case <-time.After(time.Second):
+		fmt.Printf("HANG\n")
+		return
+	}
+	fmt.Printf("OK\n")
+}
+
+func CgoTraceback() {
+	C.foo1()
+	buf := make([]byte, 1)
+	runtime.Stack(buf, true)
+	fmt.Printf("OK\n")
+}
+
+func CgoCheckBytes() {
+	try, _ := strconv.Atoi(os.Getenv("GO_CGOCHECKBYTES_TRY"))
+	if try <= 0 {
+		try = 1
+	}
+	b := make([]byte, 1e6*try)
+	start := time.Now()
+	for i := 0; i < 1e3*try; i++ {
+		C.foo2(unsafe.Pointer(&b[0]))
+		if time.Since(start) > time.Second {
+			break
+		}
+	}
+}
diff --git a/src/runtime/testdata/testprogcgo/crash.go b/src/runtime/testdata/testprogcgo/crash.go
new file mode 100644
index 0000000..4d83132
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/crash.go
@@ -0,0 +1,45 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"runtime"
+)
+
+func init() {
+	register("Crash", Crash)
+}
+
+func test(name string) {
+	defer func() {
+		if x := recover(); x != nil {
+			fmt.Printf(" recovered")
+		}
+		fmt.Printf(" done\n")
+	}()
+	fmt.Printf("%s:", name)
+	var s *string
+	_ = *s
+	fmt.Print("SHOULD NOT BE HERE")
+}
+
+func testInNewThread(name string) {
+	c := make(chan bool)
+	go func() {
+		runtime.LockOSThread()
+		test(name)
+		c <- true
+	}()
+	<-c
+}
+
+func Crash() {
+	runtime.LockOSThread()
+	test("main")
+	testInNewThread("new-thread")
+	testInNewThread("second-new-thread")
+	test("main-again")
+}
diff --git a/src/runtime/testdata/testprogcgo/deadlock.go b/src/runtime/testdata/testprogcgo/deadlock.go
new file mode 100644
index 0000000..2cc68a8
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/deadlock.go
@@ -0,0 +1,30 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+/*
+char *geterror() {
+	return "cgo error";
+}
+*/
+import "C"
+import (
+	"fmt"
+)
+
+func init() {
+	register("CgoPanicDeadlock", CgoPanicDeadlock)
+}
+
+type cgoError struct{}
+
+func (cgoError) Error() string {
+	fmt.Print("") // necessary to trigger the deadlock
+	return C.GoString(C.geterror())
+}
+
+func CgoPanicDeadlock() {
+	panic(cgoError{})
+}
diff --git a/src/runtime/testdata/testprogcgo/dll_windows.go b/src/runtime/testdata/testprogcgo/dll_windows.go
new file mode 100644
index 0000000..aed2410
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/dll_windows.go
@@ -0,0 +1,25 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+/*
+#include <windows.h>
+
+DWORD getthread() {
+	return GetCurrentThreadId();
+}
+*/
+import "C"
+import "./windows"
+
+func init() {
+	register("CgoDLLImportsMain", CgoDLLImportsMain)
+}
+
+func CgoDLLImportsMain() {
+	C.getthread()
+	windows.GetThread()
+	println("OK")
+}
diff --git a/src/runtime/testdata/testprogcgo/dropm.go b/src/runtime/testdata/testprogcgo/dropm.go
new file mode 100644
index 0000000..9e782f5
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/dropm.go
@@ -0,0 +1,59 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+// Test that a sequence of callbacks from C to Go get the same m.
+// This failed to be true on arm and arm64, which was the root cause
+// of issue 13881.
+
+package main
+
+/*
+#include <stddef.h>
+#include <pthread.h>
+
+extern void GoCheckM();
+
+static void* thread(void* arg __attribute__ ((unused))) {
+	GoCheckM();
+	return NULL;
+}
+
+static void CheckM() {
+	pthread_t tid;
+	pthread_create(&tid, NULL, thread, NULL);
+	pthread_join(tid, NULL);
+	pthread_create(&tid, NULL, thread, NULL);
+	pthread_join(tid, NULL);
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"os"
+)
+
+func init() {
+	register("EnsureDropM", EnsureDropM)
+}
+
+var savedM uintptr
+
+//export GoCheckM
+func GoCheckM() {
+	m := runtime_getm_for_test()
+	if savedM == 0 {
+		savedM = m
+	} else if savedM != m {
+		fmt.Printf("m == %x want %x\n", m, savedM)
+		os.Exit(1)
+	}
+}
+
+func EnsureDropM() {
+	C.CheckM()
+	fmt.Println("OK")
+}
diff --git a/src/runtime/testdata/testprogcgo/dropm_stub.go b/src/runtime/testdata/testprogcgo/dropm_stub.go
new file mode 100644
index 0000000..f7f142c
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/dropm_stub.go
@@ -0,0 +1,11 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import _ "unsafe" // for go:linkname
+
+// Defined in the runtime package.
+//go:linkname runtime_getm_for_test runtime.getm
+func runtime_getm_for_test() uintptr
diff --git a/src/runtime/testdata/testprogcgo/exec.go b/src/runtime/testdata/testprogcgo/exec.go
new file mode 100644
index 0000000..2e94840
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/exec.go
@@ -0,0 +1,89 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <stddef.h>
+#include <signal.h>
+#include <pthread.h>
+
+// Save the signal mask at startup so that we see what it is before
+// the Go runtime starts setting up signals.
+
+static sigset_t mask;
+
+static void init(void) __attribute__ ((constructor));
+
+static void init() {
+	sigemptyset(&mask);
+	pthread_sigmask(SIG_SETMASK, NULL, &mask);
+}
+
+int SIGINTBlocked() {
+	return sigismember(&mask, SIGINT);
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"os/signal"
+	"sync"
+	"syscall"
+)
+
+func init() {
+	register("CgoExecSignalMask", CgoExecSignalMask)
+}
+
+func CgoExecSignalMask() {
+	if len(os.Args) > 2 && os.Args[2] == "testsigint" {
+		if C.SIGINTBlocked() != 0 {
+			os.Exit(1)
+		}
+		os.Exit(0)
+	}
+
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, syscall.SIGTERM)
+	go func() {
+		for range c {
+		}
+	}()
+
+	const goCount = 10
+	const execCount = 10
+	var wg sync.WaitGroup
+	wg.Add(goCount*execCount + goCount)
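+	// Stress the signal machinery across fork/exec: each goroutine
+	// repeatedly registers and unregisters a SIGUSR1 handler and raises
+	// SIGTERM while spawning a child that checks, via testsigint, that
+	// SIGINT is not blocked at startup.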
+	for i := 0; i < goCount; i++ {
+		go func() {
+			defer wg.Done()
+			for j := 0; j < execCount; j++ {
+				c2 := make(chan os.Signal, 1)
+				signal.Notify(c2, syscall.SIGUSR1)
+				syscall.Kill(os.Getpid(), syscall.SIGTERM)
+				go func(j int) {
+					defer wg.Done()
+					cmd := exec.Command(os.Args[0], "CgoExecSignalMask", "testsigint")
+					cmd.Stdin = os.Stdin
+					cmd.Stdout = os.Stdout
+					cmd.Stderr = os.Stderr
+					if err := cmd.Run(); err != nil {
+						fmt.Printf("iteration %d: %v\n", j, err)
+						os.Exit(1)
+					}
+				}(j)
+				signal.Stop(c2)
+			}
+		}()
+	}
+	wg.Wait()
+
+	fmt.Println("OK")
+}
diff --git a/src/runtime/testdata/testprogcgo/main.go b/src/runtime/testdata/testprogcgo/main.go
new file mode 100644
index 0000000..ae491a2
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/main.go
@@ -0,0 +1,35 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "os"
+
+var cmds = map[string]func(){}
+
+func register(name string, f func()) {
+	if cmds[name] != nil {
+		panic("duplicate registration: " + name)
+	}
+	cmds[name] = f
+}
+
+func registerInit(name string, f func()) {
+	if len(os.Args) >= 2 && os.Args[1] == name {
+		f()
+	}
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		println("usage: " + os.Args[0] + " name-of-test")
+		return
+	}
+	f := cmds[os.Args[1]]
+	if f == nil {
+		println("unknown function: " + os.Args[1])
+		return
+	}
+	f()
+}
diff --git a/src/runtime/testdata/testprogcgo/pprof.go b/src/runtime/testdata/testprogcgo/pprof.go
new file mode 100644
index 0000000..cb30ec5
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/pprof.go
@@ -0,0 +1,97 @@
+// Copyright 2016 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// Run a slow C function while saving a CPU profile.
+
+/*
+#include <stdint.h>
+
+int salt1;
+int salt2;
+
+void cpuHog() {
+	int foo = salt1;
+	int i;
+
+	for (i = 0; i < 100000; i++) {
+		if (foo > 0) {
+			foo *= foo;
+		} else {
+			foo *= foo + 1;
+		}
+	}
+	salt2 = foo;
+}
+
+static int cpuHogCount;
+
+struct cgoTracebackArg {
+	uintptr_t  context;
+	uintptr_t  sigContext;
+	uintptr_t* buf;
+	uintptr_t  max;
+};
+
+// pprofCgoTraceback is passed to runtime.SetCgoTraceback.
+// For testing purposes it pretends that all CPU hits in C code are in cpuHog.
+void pprofCgoTraceback(void* parg) {
+	struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
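+	// Report a PC inside the body of cpuHog (past its entry point) so
+	// symbolization attributes the sample to cpuHog.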
+	arg->buf[0] = (uintptr_t)(cpuHog) + 0x10;
+	arg->buf[1] = 0;
+	++cpuHogCount;
+}
+
+// getCpuHogCount fetches the number of times we've seen cpuHog in the
+// traceback.
+int getCpuHogCount() {
+	return cpuHogCount;
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"runtime"
+	"runtime/pprof"
+	"time"
+	"unsafe"
+)
+
+func init() {
+	register("CgoPprof", CgoPprof)
+}
+
+func CgoPprof() {
+	runtime.SetCgoTraceback(0, unsafe.Pointer(C.pprofCgoTraceback), nil, nil)
+
+	f, err := ioutil.TempFile("", "prof")
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	if err := pprof.StartCPUProfile(f); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	t0 := time.Now()
+	for C.getCpuHogCount() < 2 && time.Since(t0) < time.Second {
+		C.cpuHog()
+	}
+
+	pprof.StopCPUProfile()
+
+	name := f.Name()
+	if err := f.Close(); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	fmt.Println(name)
+}
diff --git a/src/runtime/testdata/testprogcgo/threadpanic.go b/src/runtime/testdata/testprogcgo/threadpanic.go
new file mode 100644
index 0000000..f9b48a9
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/threadpanic.go
@@ -0,0 +1,24 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9
+
+package main
+
+// void start(void);
+import "C"
+
+func init() {
+	register("CgoExternalThreadPanic", CgoExternalThreadPanic)
+}
+
+func CgoExternalThreadPanic() {
+	C.start()
+	select {}
+}
+
+//export gopanic
+func gopanic() {
+	panic("BOOM")
+}
diff --git a/src/runtime/testdata/testprogcgo/threadpanic_unix.c b/src/runtime/testdata/testprogcgo/threadpanic_unix.c
new file mode 100644
index 0000000..c426452
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/threadpanic_unix.c
@@ -0,0 +1,26 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+
+void gopanic(void);
+
+static void*
+die(void* x)
+{
+	gopanic();
+	return 0;
+}
+
+void
+start(void)
+{
+	pthread_t t;
+	if(pthread_create(&t, 0, die, 0) != 0)
+		printf("pthread_create failed\n");
+}
diff --git a/src/runtime/testdata/testprogcgo/threadpanic_windows.c b/src/runtime/testdata/testprogcgo/threadpanic_windows.c
new file mode 100644
index 0000000..ba66d0f
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/threadpanic_windows.c
@@ -0,0 +1,23 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include <process.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+void gopanic(void);
+
+static unsigned int __attribute__((__stdcall__))
+die(void* x)
+{
+	gopanic();
+	return 0;
+}
+
+void
+start(void)
+{
+	if(_beginthreadex(0, 0, die, 0, 0, 0) != 0)
+		printf("_beginthreadex failed\n");
+}
diff --git a/src/runtime/testdata/testprogcgo/threadpprof.go b/src/runtime/testdata/testprogcgo/threadpprof.go
new file mode 100644
index 0000000..fdeee69
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/threadpprof.go
@@ -0,0 +1,112 @@
+// Copyright 2016 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+// Run a slow C function while saving a CPU profile.
+
+/*
+#include <stdint.h>
+#include <time.h>
+#include <pthread.h>
+
+int threadSalt1;
+int threadSalt2;
+
+void cpuHogThread() {
+	int foo = threadSalt1;
+	int i;
+
+	for (i = 0; i < 100000; i++) {
+		if (foo > 0) {
+			foo *= foo;
+		} else {
+			foo *= foo + 1;
+		}
+	}
+	threadSalt2 = foo;
+}
+
+static int cpuHogThreadCount;
+
+struct cgoTracebackArg {
+	uintptr_t  context;
+	uintptr_t  sigContext;
+	uintptr_t* buf;
+	uintptr_t  max;
+};
+
+static void *pprofThread(void* p) {
+	time_t start;
+
+	(void)p;
+	start = time(NULL);
+	while (__sync_add_and_fetch(&cpuHogThreadCount, 0) < 2 && time(NULL) - start < 2) {
+		cpuHogThread();
+	}
+	return NULL;
+}
+
+
+// pprofCgoThreadTraceback is passed to runtime.SetCgoTraceback.
+// For testing purposes it pretends that all CPU hits in C code are in cpuHogThread.
+void pprofCgoThreadTraceback(void* parg) {
+	struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
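+	// As in pprof.go, report a PC inside cpuHogThread's body.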
+	arg->buf[0] = (uintptr_t)(cpuHogThread) + 0x10;
+	arg->buf[1] = 0;
+	__sync_add_and_fetch(&cpuHogThreadCount, 1);
+}
+
+// getCPUHogThreadCount fetches the number of times we've seen cpuHogThread
+// in the traceback.
+int getCPUHogThreadCount() {
+	return __sync_add_and_fetch(&cpuHogThreadCount, 0);
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"io/ioutil"
+	"os"
+	"runtime"
+	"runtime/pprof"
+	"time"
+	"unsafe"
+)
+
+func init() {
+	register("CgoPprofThread", CgoPprofThread)
+}
+
+func CgoPprofThread() {
+	runtime.SetCgoTraceback(0, unsafe.Pointer(C.pprofCgoThreadTraceback), nil, nil)
+
+	f, err := ioutil.TempFile("", "prof")
+	if err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	if err := pprof.StartCPUProfile(f); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	t0 := time.Now()
+	for C.getCPUHogThreadCount() < 2 && time.Since(t0) < time.Second {
+		time.Sleep(100 * time.Millisecond)
+	}
+
+	pprof.StopCPUProfile()
+
+	name := f.Name()
+	if err := f.Close(); err != nil {
+		fmt.Fprintln(os.Stderr, err)
+		os.Exit(2)
+	}
+
+	fmt.Println(name)
+}
diff --git a/src/runtime/testdata/testprogcgo/threadprof.go b/src/runtime/testdata/testprogcgo/threadprof.go
new file mode 100644
index 0000000..a77479d
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/threadprof.go
@@ -0,0 +1,93 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+/*
+#include <stdint.h>
+#include <signal.h>
+#include <pthread.h>
+
+volatile int32_t spinlock;
+
+static void *thread1(void *p) {
+	(void)p;
+	while (spinlock == 0)
+		;
+	pthread_kill(pthread_self(), SIGPROF);
+	spinlock = 0;
+	return NULL;
+}
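+// issue9456 runs as a C constructor, so the thread starts before any
+// cgo call is made from Go (see https://golang.org/issue/9456).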
+__attribute__((constructor)) void issue9456() {
+	pthread_t tid;
+	pthread_create(&tid, 0, thread1, NULL);
+}
+
+void **nullptr;
+
+void *crash(void *p) {
+	*nullptr = p;
+	return 0;
+}
+
+int start_crashing_thread(void) {
+	pthread_t tid;
+	return pthread_create(&tid, 0, crash, 0);
+}
+*/
+import "C"
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"runtime"
+	"sync/atomic"
+	"time"
+	"unsafe"
+)
+
+func init() {
+	register("CgoExternalThreadSIGPROF", CgoExternalThreadSIGPROF)
+	register("CgoExternalThreadSignal", CgoExternalThreadSignal)
+}
+
+func CgoExternalThreadSIGPROF() {
+	// This test checks that sending SIGPROF to foreign threads before
+	// we make any cgo call does not abort the whole process, so we
+	// must not make any cgo call here. See https://golang.org/issue/9456.
+	atomic.StoreInt32((*int32)(unsafe.Pointer(&C.spinlock)), 1)
+	for atomic.LoadInt32((*int32)(unsafe.Pointer(&C.spinlock))) == 1 {
+		runtime.Gosched()
+	}
+	println("OK")
+}
+
+func CgoExternalThreadSignal() {
+	if len(os.Args) > 2 && os.Args[2] == "crash" {
+		i := C.start_crashing_thread()
+		if i != 0 {
+			fmt.Println("pthread_create failed:", i)
+			// Exit with 0 because parent expects us to crash.
+			return
+		}
+
+		// We should crash immediately, but give it plenty of
+		// time before failing (by exiting 0) in case we are
+		// running on a slow system.
+		time.Sleep(5 * time.Second)
+		return
+	}
+
+	out, err := exec.Command(os.Args[0], "CgoExternalThreadSignal", "crash").CombinedOutput()
+	if err == nil {
+		fmt.Println("C signal did not crash as expected\n")
+		fmt.Printf("%s\n", out)
+		os.Exit(1)
+	}
+
+	fmt.Println("OK")
+}
diff --git a/src/runtime/testdata/testprogcgo/traceback.go b/src/runtime/testdata/testprogcgo/traceback.go
new file mode 100644
index 0000000..e8b0a04
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/traceback.go
@@ -0,0 +1,81 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+// This program will crash.
+// We want the stack trace to include the C functions.
+// We use a fake traceback, and a symbolizer that dumps a string we recognize.
+
+/*
+#cgo CFLAGS: -g -O0
+
+#include <stdint.h>
+
+char *p;
+
+static int f3() {
+	*p = 0;
+	return 0;
+}
+
+static int f2() {
+	return f3();
+}
+
+static int f1() {
+	return f2();
+}
+
+struct cgoTracebackArg {
+	uintptr_t  context;
+	uintptr_t  sigContext;
+	uintptr_t* buf;
+	uintptr_t  max;
+};
+
+struct cgoSymbolizerArg {
+	uintptr_t   pc;
+	const char* file;
+	uintptr_t   lineno;
+	const char* func;
+	uintptr_t   entry;
+	uintptr_t   more;
+	uintptr_t   data;
+};
+
+void cgoTraceback(void* parg) {
+	struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
+	arg->buf[0] = 1;
+	arg->buf[1] = 2;
+	arg->buf[2] = 3;
+	arg->buf[3] = 0;
+}
+
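+// cgoSymbolizer checks that the PCs arrive in the order cgoTraceback
+// produced them (1, 2, 3), using the data field, which the runtime
+// preserves across calls within a single traceback.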
+void cgoSymbolizer(void* parg) {
+	struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg);
+	if (arg->pc != arg->data + 1) {
+		arg->file = "unexpected data";
+	} else {
+		arg->file = "cgo symbolizer";
+	}
+	arg->lineno = arg->data + 1;
+	arg->data++;
+}
+*/
+import "C"
+
+import (
+	"runtime"
+	"unsafe"
+)
+
+func init() {
+	register("CrashTraceback", CrashTraceback)
+}
+
+func CrashTraceback() {
+	runtime.SetCgoTraceback(0, unsafe.Pointer(C.cgoTraceback), nil, unsafe.Pointer(C.cgoSymbolizer))
+	C.f1()
+}
diff --git a/src/runtime/testdata/testprogcgo/tracebackctxt.go b/src/runtime/testdata/testprogcgo/tracebackctxt.go
new file mode 100644
index 0000000..51fa4ad
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/tracebackctxt.go
@@ -0,0 +1,107 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The __attribute__((weak)) used below doesn't seem to work on Windows.
+
+package main
+
+// Test the context argument to SetCgoTraceback.
+// Use fake context, traceback, and symbolizer functions.
+
+/*
+// Defined in tracebackctxt_c.c.
+extern void C1(void);
+extern void C2(void);
+extern void tcContext(void*);
+extern void tcTraceback(void*);
+extern void tcSymbolizer(void*);
+extern int getContextCount(void);
+*/
+import "C"
+
+import (
+	"fmt"
+	"runtime"
+	"unsafe"
+)
+
+func init() {
+	register("TracebackContext", TracebackContext)
+}
+
+var tracebackOK bool
+
+func TracebackContext() {
+	runtime.SetCgoTraceback(0, unsafe.Pointer(C.tcTraceback), unsafe.Pointer(C.tcContext), unsafe.Pointer(C.tcSymbolizer))
+	C.C1()
+	if got := C.getContextCount(); got != 0 {
+		fmt.Printf("at end contextCount == %d, expected 0\n", got)
+		tracebackOK = false
+	}
+	if tracebackOK {
+		fmt.Println("OK")
+	}
+}
+
+//export G1
+func G1() {
+	C.C2()
+}
+
+//export G2
+func G2() {
+	pc := make([]uintptr, 32)
+	n := runtime.Callers(0, pc)
+	cf := runtime.CallersFrames(pc[:n])
+	var frames []runtime.Frame
+	for {
+		frame, more := cf.Next()
+		frames = append(frames, frame)
+		if !more {
+			break
+		}
+	}
+
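+	// The fake line numbers encode what tcTraceback reported: each
+	// context N yields PCs (N<<8)+i, and tcSymbolizer reports two lines
+	// per PC, adding 1<<16 to the first ("more") entry.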
+	want := []struct {
+		function string
+		line     int
+	}{
+		{"main.G2", 0},
+		{"cFunction", 0x10200},
+		{"cFunction", 0x200},
+		{"cFunction", 0x10201},
+		{"cFunction", 0x201},
+		{"main.G1", 0},
+		{"cFunction", 0x10100},
+		{"cFunction", 0x100},
+		{"main.TracebackContext", 0},
+	}
+
+	ok := true
+	i := 0
+wantLoop:
+	for _, w := range want {
+		for ; i < len(frames); i++ {
+			if w.function == frames[i].Function {
+				if w.line != 0 && w.line != frames[i].Line {
+					fmt.Printf("found function %s at wrong line %#x (expected %#x)\n", w.function, frames[i].Line, w.line)
+					ok = false
+				}
+				i++
+				continue wantLoop
+			}
+		}
+		fmt.Printf("did not find function %s in\n", w.function)
+		for _, f := range frames {
+			fmt.Println(f)
+		}
+		ok = false
+		break
+	}
+	tracebackOK = ok
+	if got := C.getContextCount(); got != 2 {
+		fmt.Printf("at bottom contextCount == %d, expected 2\n", got)
+		tracebackOK = false
+	}
+}
diff --git a/src/runtime/testdata/testprogcgo/tracebackctxt_c.c b/src/runtime/testdata/testprogcgo/tracebackctxt_c.c
new file mode 100644
index 0000000..900cada
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/tracebackctxt_c.c
@@ -0,0 +1,91 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// The C definitions for tracebackctxt.go. That file uses //export so
+// it can't put function definitions in the "C" import comment.
+
+#include <stdlib.h>
+#include <stdint.h>
+
+// Functions exported from Go.
+extern void G1(void);
+extern void G2(void);
+
+void C1() {
+	G1();
+}
+
+void C2() {
+	G2();
+}
+
+struct cgoContextArg {
+	uintptr_t context;
+};
+
+struct cgoTracebackArg {
+	uintptr_t  context;
+	uintptr_t  sigContext;
+	uintptr_t* buf;
+	uintptr_t  max;
+};
+
+struct cgoSymbolizerArg {
+	uintptr_t   pc;
+	const char* file;
+	uintptr_t   lineno;
+	const char* func;
+	uintptr_t   entry;
+	uintptr_t   more;
+	uintptr_t   data;
+};
+
+// Uses atomic adds and subtracts to catch the possibility of
+// erroneous calls from multiple threads; that should be impossible in
+// this test case, but we check just in case.
+static int contextCount;
+
+int getContextCount() {
+	return __sync_add_and_fetch(&contextCount, 0);
+}
+
+void tcContext(void* parg) {
+	struct cgoContextArg* arg = (struct cgoContextArg*)(parg);
+	if (arg->context == 0) {
+		arg->context = __sync_add_and_fetch(&contextCount, 1);
+	} else {
+		if (arg->context != __sync_add_and_fetch(&contextCount, 0)) {
+			abort();
+		}
+		__sync_sub_and_fetch(&contextCount, 1);
+	}
+}
+
+void tcTraceback(void* parg) {
+	int base, i;
+	struct cgoTracebackArg* arg = (struct cgoTracebackArg*)(parg);
+	if (arg->context == 0) {
+		// This shouldn't happen in this program.
+		abort();
+	}
+	// Return a variable number of PC values.
+	base = arg->context << 8;
+	for (i = 0; i < arg->context; i++) {
+		if (i < arg->max) {
+			arg->buf[i] = base + i;
+		}
+	}
+}
+
+void tcSymbolizer(void *parg) {
+	struct cgoSymbolizerArg* arg = (struct cgoSymbolizerArg*)(parg);
+	if (arg->pc == 0) {
+		return;
+	}
+	// Report two lines per PC returned by traceback, to test handling of the more field.
+	arg->more = arg->file == NULL;
+	arg->file = "tracebackctxt.go";
+	arg->func = "cFunction";
+	arg->lineno = arg->pc + (arg->more << 16);
+}
diff --git a/src/runtime/testdata/testprogcgo/windows/win.go b/src/runtime/testdata/testprogcgo/windows/win.go
new file mode 100644
index 0000000..f2eabb9
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/windows/win.go
@@ -0,0 +1,16 @@
+package windows
+
+/*
+#cgo CFLAGS: -mnop-fun-dllimport
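+
+// -mnop-fun-dllimport tells gcc (on Windows targets) to ignore the
+// dllimport attribute on functions, which is what this package exercises.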
+
+#include <windows.h>
+
+DWORD agetthread() {
+	return GetCurrentThreadId();
+}
+*/
+import "C"
+
+func GetThread() uint32 {
+	return uint32(C.agetthread())
+}
diff --git a/src/runtime/testdata/testprognet/main.go b/src/runtime/testdata/testprognet/main.go
new file mode 100644
index 0000000..ae491a2
--- /dev/null
+++ b/src/runtime/testdata/testprognet/main.go
@@ -0,0 +1,35 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "os"
+
+var cmds = map[string]func(){}
+
+func register(name string, f func()) {
+	if cmds[name] != nil {
+		panic("duplicate registration: " + name)
+	}
+	cmds[name] = f
+}
+
+func registerInit(name string, f func()) {
+	if len(os.Args) >= 2 && os.Args[1] == name {
+		f()
+	}
+}
+
+func main() {
+	if len(os.Args) < 2 {
+		println("usage: " + os.Args[0] + " name-of-test")
+		return
+	}
+	f := cmds[os.Args[1]]
+	if f == nil {
+		println("unknown function: " + os.Args[1])
+		return
+	}
+	f()
+}
diff --git a/src/runtime/testdata/testprognet/net.go b/src/runtime/testdata/testprognet/net.go
new file mode 100644
index 0000000..714b101
--- /dev/null
+++ b/src/runtime/testdata/testprognet/net.go
@@ -0,0 +1,29 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import (
+	"fmt"
+	"net"
+)
+
+func init() {
+	registerInit("NetpollDeadlock", NetpollDeadlockInit)
+	register("NetpollDeadlock", NetpollDeadlock)
+}
+
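+// NetpollDeadlockInit forces the network poller to start during
+// package initialization; the test verifies that this does not
+// deadlock the runtime.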
+func NetpollDeadlockInit() {
+	fmt.Println("dialing")
+	c, err := net.Dial("tcp", "localhost:14356")
+	if err == nil {
+		c.Close()
+	} else {
+		fmt.Println("error: ", err)
+	}
+}
+
+func NetpollDeadlock() {
+	fmt.Println("done")
+}
diff --git a/src/runtime/testdata/testprognet/signal.go b/src/runtime/testdata/testprognet/signal.go
new file mode 100644
index 0000000..a1559fe
--- /dev/null
+++ b/src/runtime/testdata/testprognet/signal.go
@@ -0,0 +1,26 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !windows,!plan9,!nacl
+
+// This is in testprognet instead of testprog because testprog
+// must not import anything (like net, but also like os/signal)
+// that kicks off background goroutines during init.
+
+package main
+
+import (
+	"os/signal"
+	"syscall"
+)
+
+func init() {
+	register("SignalIgnoreSIGTRAP", SignalIgnoreSIGTRAP)
+}
+
+func SignalIgnoreSIGTRAP() {
+	signal.Ignore(syscall.SIGTRAP)
+	syscall.Kill(syscall.Getpid(), syscall.SIGTRAP)
+	println("OK")
+}
diff --git a/src/runtime/textflag.h b/src/runtime/textflag.h
index 2a76e76..929e9b3 100644
--- a/src/runtime/textflag.h
+++ b/src/runtime/textflag.h
@@ -1,14 +1,16 @@
-// Copyright 2013 The Go Authors.  All rights reserved.
+// Copyright 2013 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 // This file defines flags attached to various functions
-// and data objects.  The compilers, assemblers, and linker must
+// and data objects. The compilers, assemblers, and linker must
 // all agree on these values.
+//
+// Keep in sync with src/cmd/internal/obj/textflag.go.
 
-// Don't profile the marked routine.  This flag is deprecated.
+// Don't profile the marked routine. This flag is deprecated.
 #define NOPROF	1
-// It is ok for the linker to get multiple of these symbols.  It will
+// It is ok for the linker to get multiple of these symbols. It will
 // pick one of the duplicates to use.
 #define DUPOK	2
 // Don't insert stack check preamble.
@@ -21,3 +23,12 @@
 #define WRAPPER 32
 // This function uses its incoming context register.
 #define NEEDCTXT 64
+// Allocate a word of thread local storage and store the offset from the
+// thread local base to the thread local storage in this variable.
+#define TLSBSS	256
+// Do not insert instructions to allocate a stack frame for this function.
+// Only valid on functions that declare a frame size of 0.
+// TODO(mwhudson): only implemented for ppc64x at present.
+#define NOFRAME 512
+// Function can call reflect.Type.Method or reflect.Type.MethodByName.
+#define REFLECTMETHOD 1024
diff --git a/src/runtime/time.go b/src/runtime/time.go
index ffe7590..8df185d 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -16,7 +16,7 @@
 	i int // heap index
 
 	// Timer wakes up at when, and then at when+period, ... (period > 0 only)
-	// each time calling f(now, arg) in the timer goroutine, so f must be
+	// each time calling f(arg, now) in the timer goroutine, so f must be
 	// a well-behaved function and not block.
 	when   int64
 	period int64
@@ -202,7 +202,7 @@
 			goparkunlock(&timers.lock, "timer goroutine (idle)", traceEvGoBlock, 1)
 			continue
 		}
-		// At least one timer pending.  Sleep until then.
+		// At least one timer pending. Sleep until then.
 		timers.sleeping = true
 		noteclear(&timers.waitnote)
 		unlock(&timers.lock)
diff --git a/src/runtime/tls_arm.s b/src/runtime/tls_arm.s
index d37970e..32bfcf8 100644
--- a/src/runtime/tls_arm.s
+++ b/src/runtime/tls_arm.s
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -15,7 +15,7 @@
 // Note: both functions will clobber R0 and R11 and
 // can be called from 5c ABI code.
 
-// On android and darwin, runtime.tlsg is a normal variable.
+// On android and darwin, runtime.tls_g is a normal variable.
 // TLS offset is computed in x_cgo_inittls.
 #ifdef GOOS_android
 #define TLSG_IS_VARIABLE
@@ -41,14 +41,7 @@
 	// The replacement function saves LR in R11 over the call to read_tls_fallback.
 	MRC	15, 0, R0, C13, C0, 3 // fetch TLS base pointer
 	BIC $3, R0 // Darwin/ARM might return unaligned pointer
-	// $runtime.tlsg(SB) is a special linker symbol.
-	// It is the offset from the TLS base pointer to our
-	// thread-local storage for g.
-#ifdef TLSG_IS_VARIABLE
-	MOVW	runtime·tlsg(SB), R11
-#else
-	MOVW	$runtime·tlsg(SB), R11
-#endif
+	MOVW	runtime·tls_g(SB), R11
 	ADD	R11, R0
 	MOVW	g, 0(R0)
 	MOVW	g, R0 // preserve R0 across call to setg<>
@@ -65,14 +58,7 @@
 	// See save_g
 	MRC	15, 0, R0, C13, C0, 3 // fetch TLS base pointer
 	BIC $3, R0 // Darwin/ARM might return unaligned pointer
-	// $runtime.tlsg(SB) is a special linker symbol.
-	// It is the offset from the TLS base pointer to our
-	// thread-local storage for g.
-#ifdef TLSG_IS_VARIABLE
-	MOVW	runtime·tlsg(SB), R11
-#else
-	MOVW	$runtime·tlsg(SB), R11
-#endif
+	MOVW	runtime·tls_g(SB), R11
 	ADD	R11, R0
 	MOVW	0(R0), g
 	RET
@@ -95,7 +81,11 @@
 	B.EQ	nocgo
 	MRC     15, 0, R0, C13, C0, 3 	// load TLS base pointer
 	MOVW 	R0, R3 			// arg 3: TLS base pointer
-	MOVW 	$runtime·tlsg(SB), R2 	// arg 2: tlsg
+#ifdef TLSG_IS_VARIABLE
+	MOVW 	$runtime·tls_g(SB), R2 	// arg 2: &tls_g
+#else
+	MOVW	$0, R2			// arg 2: not used when using platform TLS
+#endif
 	MOVW	$setg_gcc<>(SB), R1 	// arg 1: setg
 	MOVW	g, R0 			// arg 0: G
 	BL	(R4) // will clobber R0-R3
@@ -109,5 +99,7 @@
 	B		runtime·save_g(SB)
 
 #ifdef TLSG_IS_VARIABLE
-GLOBL runtime·tlsg+0(SB), NOPTR, $4
+GLOBL runtime·tls_g+0(SB), NOPTR, $4
+#else
+GLOBL runtime·tls_g+0(SB), TLSBSS, $4
 #endif
diff --git a/src/runtime/tls_arm64.h b/src/runtime/tls_arm64.h
index d5676ab..c29fa7f 100644
--- a/src/runtime/tls_arm64.h
+++ b/src/runtime/tls_arm64.h
@@ -2,7 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#ifdef GOOS_android
+#define TLS_linux
+#define TLSG_IS_VARIABLE
+#endif
 #ifdef GOOS_linux
+#define TLS_linux
+#endif
+#ifdef TLS_linux
 #define TPIDR TPIDR_EL0
 #define MRS_TPIDR_R0 WORD $0xd53bd040 // MRS TPIDR_EL0, R0
 #endif
diff --git a/src/runtime/tls_arm64.s b/src/runtime/tls_arm64.s
index a5f86c4..62ae6fa 100644
--- a/src/runtime/tls_arm64.s
+++ b/src/runtime/tls_arm64.s
@@ -18,13 +18,8 @@
 	// Darwin sometimes returns unaligned pointers
 	AND	$0xfffffffffffffff8, R0
 #endif
-#ifdef TLSG_IS_VARIABLE
 	MOVD	runtime·tls_g(SB), R27
 	ADD	R27, R0
-#else
-	// TODO(minux): use real TLS relocation, instead of hard-code for Linux
-	ADD	$0x10, R0
-#endif
 	MOVD	0(R0), g
 
 nocgo:
@@ -40,21 +35,15 @@
 	// Darwin sometimes returns unaligned pointers
 	AND	$0xfffffffffffffff8, R0
 #endif
-#ifdef TLSG_IS_VARIABLE
 	MOVD	runtime·tls_g(SB), R27
 	ADD	R27, R0
-#else
-	// TODO(minux): use real TLS relocation, instead of hard-code for Linux
-	ADD	$0x10, R0
-#endif
 	MOVD	g, 0(R0)
 
 nocgo:
 	RET
 
 #ifdef TLSG_IS_VARIABLE
-// The runtime.tlsg name is being handled specially in the
-// linker. As we just need a regular variable here, don't
-// use that name.
 GLOBL runtime·tls_g+0(SB), NOPTR, $8
+#else
+GLOBL runtime·tls_g+0(SB), TLSBSS, $8
 #endif
diff --git a/src/runtime/tls_mips64x.s b/src/runtime/tls_mips64x.s
new file mode 100644
index 0000000..53bd6f2
--- /dev/null
+++ b/src/runtime/tls_mips64x.s
@@ -0,0 +1,30 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips64 mips64le
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// If !iscgo, this is a no-op.
+//
+// NOTE: mcall() assumes this clobbers only R23 (REGTMP).
+TEXT runtime·save_g(SB),NOSPLIT,$-8-0
+	MOVB	runtime·iscgo(SB), R23
+	BEQ	R23, nocgo
+
+	MOVV	R3, R23	// save R3
+	MOVV	g, runtime·tls_g(SB) // TLS relocation clobbers R3
+	MOVV	R23, R3	// restore R3
+
+nocgo:
+	RET
+
+TEXT runtime·load_g(SB),NOSPLIT,$-8-0
+	MOVV	runtime·tls_g(SB), g // TLS relocation clobbers R3
+	RET
+
+GLOBL runtime·tls_g(SB), TLSBSS, $8
diff --git a/src/runtime/tls_ppc64x.s b/src/runtime/tls_ppc64x.s
index fc1718f..69c0d9e 100644
--- a/src/runtime/tls_ppc64x.s
+++ b/src/runtime/tls_ppc64x.s
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -22,21 +22,12 @@
 // If !iscgo, this is a no-op.
 //
 // NOTE: setg_gcc<> assume this clobbers only R31.
-TEXT runtime·save_g(SB),NOSPLIT,$-8-0
+TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
 	MOVB	runtime·iscgo(SB), R31
 	CMP	R31, $0
 	BEQ	nocgo
-
-	// $runtime.tlsg(SB) is a special linker symbol.
-	// It is the offset from the start of TLS to our
-	// thread-local storage for g.
-	MOVD	$runtime·tlsg(SB), R31
-	ADD	R13, R31
-	// The actual TLS base is 0x7000 below R13
-	SUB	$0x7000, R31
-
-	// Store g in TLS
-	MOVD	g, 0(R31)
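+	// tls_g holds the offset from the thread pointer (R13) to the
+	// thread-local slot for g (see TLSBSS in textflag.h).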
+	MOVD	runtime·tls_g(SB), R31
+	MOVD	g, 0(R13)(R31*1)
 
 nocgo:
 	RET
@@ -50,11 +41,9 @@
 // usual Go registers aren't set up.
 //
 // NOTE: _cgo_topofstack assumes this only clobbers g (R30), and R31.
-TEXT runtime·load_g(SB),NOSPLIT,$-8-0
-	MOVD	$runtime·tlsg(SB), R31
-	// R13 is the C ABI TLS base pointer + 0x7000
-	ADD	R13, R31
-	SUB	$0x7000, R31
-
-	MOVD	0(R31), g
+TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
+	MOVD	runtime·tls_g(SB), R31
+	MOVD	0(R13)(R31*1), g
 	RET
+
+GLOBL runtime·tls_g+0(SB), TLSBSS, $8
diff --git a/src/runtime/tls_s390x.s b/src/runtime/tls_s390x.s
new file mode 100644
index 0000000..cb6a21c
--- /dev/null
+++ b/src/runtime/tls_s390x.s
@@ -0,0 +1,51 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "go_tls.h"
+#include "funcdata.h"
+#include "textflag.h"
+
+// We have to resort to a TLS variable to save g (R13).
+// One reason is that external code might trigger
+// SIGSEGV, and our runtime.sigtramp doesn't even know we
+// are in external code and will continue to use R13;
+// this might well result in another SIGSEGV.
+
+// save_g saves the g register into pthread-provided
+// thread-local memory, so that we can call externally compiled
+// s390x code that will overwrite this register.
+//
+// If !iscgo, this is a no-op.
+//
+// NOTE: setg_gcc<> assumes this clobbers only R10 and R11.
+TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0
+	MOVB	runtime·iscgo(SB), R10
+	CMPBEQ	R10, $0, nocgo
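+	// Assemble the 64-bit TLS pointer from access registers
+	// AR0 (high 32 bits) and AR1 (low 32 bits).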
+	MOVW	AR0, R11
+	SLD	$32, R11
+	MOVW	AR1, R11
+	MOVD	runtime·tls_g(SB), R10
+	MOVD	g, 0(R10)(R11*1)
+nocgo:
+	RET
+
+// load_g loads the g register from pthread-provided
+// thread-local memory, for use after calling externally compiled
+// s390x code that overwrote those registers.
+//
+// This is never called directly from C code (it doesn't have to
+// follow the C ABI), but it may be called from a C context, where the
+// usual Go registers aren't set up.
+//
+// NOTE: _cgo_topofstack assumes this only clobbers g (R13), R10 and R11.
+TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0
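+	// Reconstruct the TLS pointer from AR0/AR1, as in save_g.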
+	MOVW	AR0, R11
+	SLD	$32, R11
+	MOVW	AR1, R11
+	MOVD	runtime·tls_g(SB), R10
+	MOVD	0(R10)(R11*1), g
+	RET
+
+GLOBL runtime·tls_g+0(SB),TLSBSS,$8
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index c818462..092f941 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -12,32 +12,35 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // Event types in the trace, args are given in square brackets.
 const (
 	traceEvNone           = 0  // unused
 	traceEvBatch          = 1  // start of per-P batch of events [pid, timestamp]
 	traceEvFrequency      = 2  // contains tracer timer frequency [frequency (ticks per second)]
-	traceEvStack          = 3  // stack [stack id, number of PCs, array of PCs]
+	traceEvStack          = 3  // stack [stack id, number of PCs, array of {PC, func string ID, file string ID, line}]
 	traceEvGomaxprocs     = 4  // current value of GOMAXPROCS [timestamp, GOMAXPROCS, stack id]
 	traceEvProcStart      = 5  // start of P [timestamp, thread id]
 	traceEvProcStop       = 6  // stop of P [timestamp]
-	traceEvGCStart        = 7  // GC start [timestamp, stack id]
+	traceEvGCStart        = 7  // GC start [timestamp, seq, stack id]
 	traceEvGCDone         = 8  // GC done [timestamp]
 	traceEvGCScanStart    = 9  // GC scan start [timestamp]
 	traceEvGCScanDone     = 10 // GC scan done [timestamp]
 	traceEvGCSweepStart   = 11 // GC sweep start [timestamp, stack id]
 	traceEvGCSweepDone    = 12 // GC sweep done [timestamp]
-	traceEvGoCreate       = 13 // goroutine creation [timestamp, new goroutine id, start PC, stack id]
-	traceEvGoStart        = 14 // goroutine starts running [timestamp, goroutine id]
+	traceEvGoCreate       = 13 // goroutine creation [timestamp, new goroutine id, new stack id, stack id]
+	traceEvGoStart        = 14 // goroutine starts running [timestamp, goroutine id, seq]
 	traceEvGoEnd          = 15 // goroutine ends [timestamp]
 	traceEvGoStop         = 16 // goroutine stops (like in select{}) [timestamp, stack]
 	traceEvGoSched        = 17 // goroutine calls Gosched [timestamp, stack]
 	traceEvGoPreempt      = 18 // goroutine is preempted [timestamp, stack]
 	traceEvGoSleep        = 19 // goroutine calls Sleep [timestamp, stack]
 	traceEvGoBlock        = 20 // goroutine blocks [timestamp, stack]
-	traceEvGoUnblock      = 21 // goroutine is unblocked [timestamp, goroutine id, stack]
+	traceEvGoUnblock      = 21 // goroutine is unblocked [timestamp, goroutine id, seq, stack]
 	traceEvGoBlockSend    = 22 // goroutine blocks on chan send [timestamp, stack]
 	traceEvGoBlockRecv    = 23 // goroutine blocks on chan recv [timestamp, stack]
 	traceEvGoBlockSelect  = 24 // goroutine blocks on select [timestamp, stack]
@@ -45,15 +48,19 @@
 	traceEvGoBlockCond    = 26 // goroutine blocks on Cond [timestamp, stack]
 	traceEvGoBlockNet     = 27 // goroutine blocks on network [timestamp, stack]
 	traceEvGoSysCall      = 28 // syscall enter [timestamp, stack]
-	traceEvGoSysExit      = 29 // syscall exit [timestamp, goroutine id, real timestamp]
+	traceEvGoSysExit      = 29 // syscall exit [timestamp, goroutine id, seq, real timestamp]
 	traceEvGoSysBlock     = 30 // syscall blocks [timestamp]
-	traceEvGoWaiting      = 31 // denotes that goroutine is blocked when tracing starts [goroutine id]
-	traceEvGoInSyscall    = 32 // denotes that goroutine is in syscall when tracing starts [goroutine id]
+	traceEvGoWaiting      = 31 // denotes that goroutine is blocked when tracing starts [timestamp, goroutine id]
+	traceEvGoInSyscall    = 32 // denotes that goroutine is in syscall when tracing starts [timestamp, goroutine id]
 	traceEvHeapAlloc      = 33 // memstats.heap_live change [timestamp, heap_alloc]
 	traceEvNextGC         = 34 // memstats.next_gc change [timestamp, next_gc]
 	traceEvTimerGoroutine = 35 // denotes timer goroutine [timer goroutine id]
 	traceEvFutileWakeup   = 36 // denotes that the previous wakeup of this goroutine was futile [timestamp]
-	traceEvCount          = 37
+	traceEvString         = 37 // string dictionary entry [ID, length, string]
+	traceEvGoStartLocal   = 38 // goroutine starts running on the same P as the last event [timestamp, goroutine id]
+	traceEvGoUnblockLocal = 39 // goroutine is unblocked on the same P as the last event [timestamp, goroutine id, stack]
+	traceEvGoSysExitLocal = 40 // syscall exit on the same P as the last event [timestamp, goroutine id, real timestamp]
+	traceEvCount          = 41
 )
 
 const (
@@ -66,7 +73,7 @@
 	// and ppc64le.
 	// Tracing won't work reliably for architectures where cputicks is emulated
 	// by nanotime, so the value doesn't matter for those architectures.
-	traceTickDiv = 16 + 48*(goarch_386|goarch_amd64|goarch_amd64p32)
+	traceTickDiv = 16 + 48*(sys.Goarch386|sys.GoarchAmd64|sys.GoarchAmd64p32)
 	// Maximum number of PCs in a single stack trace.
 	// Since events contain only stack id rather than whole stack trace,
 	// we can allow quite large values here.
@@ -88,56 +95,41 @@
 
 // trace is global tracing context.
 var trace struct {
-	lock          mutex     // protects the following members
-	lockOwner     *g        // to avoid deadlocks during recursive lock locks
-	enabled       bool      // when set runtime traces events
-	shutdown      bool      // set when we are waiting for trace reader to finish after setting enabled to false
-	headerWritten bool      // whether ReadTrace has emitted trace header
-	footerWritten bool      // whether ReadTrace has emitted trace footer
-	shutdownSema  uint32    // used to wait for ReadTrace completion
-	seqStart      uint64    // sequence number when tracing was started
-	ticksStart    int64     // cputicks when tracing was started
-	ticksEnd      int64     // cputicks when tracing was stopped
-	timeStart     int64     // nanotime when tracing was started
-	timeEnd       int64     // nanotime when tracing was stopped
-	reading       *traceBuf // buffer currently handed off to user
-	empty         *traceBuf // stack of empty buffers
-	fullHead      *traceBuf // queue of full buffers
-	fullTail      *traceBuf
+	lock          mutex       // protects the following members
+	lockOwner     *g          // to avoid deadlocks during recursive lock acquisition
+	enabled       bool        // when set runtime traces events
+	shutdown      bool        // set when we are waiting for trace reader to finish after setting enabled to false
+	headerWritten bool        // whether ReadTrace has emitted trace header
+	footerWritten bool        // whether ReadTrace has emitted trace footer
+	shutdownSema  uint32      // used to wait for ReadTrace completion
+	seqStart      uint64      // sequence number when tracing was started
+	ticksStart    int64       // cputicks when tracing was started
+	ticksEnd      int64       // cputicks when tracing was stopped
+	timeStart     int64       // nanotime when tracing was started
+	timeEnd       int64       // nanotime when tracing was stopped
+	seqGC         uint64      // GC start/done sequencer
+	reading       traceBufPtr // buffer currently handed off to user
+	empty         traceBufPtr // stack of empty buffers
+	fullHead      traceBufPtr // queue of full buffers
+	fullTail      traceBufPtr
 	reader        *g              // goroutine that called ReadTrace, or nil
 	stackTab      traceStackTable // maps stack traces to unique ids
 
-	bufLock mutex     // protects buf
-	buf     *traceBuf // global trace buffer, used when running without a p
-}
+	// Dictionary for traceEvString.
+	// Currently this is used only for func/file:line info after tracing session,
+	// so we assume single-threaded access.
+	strings   map[string]uint64
+	stringSeq uint64
 
-var traceseq uint64 // global trace sequence number
-
-// tracestamp returns a consistent sequence number, time stamp pair
-// for use in a trace. We need to make sure that time stamp ordering
-// (assuming synchronized CPUs) and sequence ordering match.
-// To do that, we increment traceseq, grab ticks, and increment traceseq again.
-// We treat odd traceseq as a sign that another thread is in the middle
-// of the sequence and spin until it is done.
-// Not splitting stack to avoid preemption, just in case the call sites
-// that used to call xadd64 and cputicks are sensitive to that.
-//go:nosplit
-func tracestamp() (seq uint64, ts int64) {
-	seq = atomicload64(&traceseq)
-	for seq&1 != 0 || !cas64(&traceseq, seq, seq+1) {
-		seq = atomicload64(&traceseq)
-	}
-	ts = cputicks()
-	atomicstore64(&traceseq, seq+2)
-	return seq >> 1, ts
+	bufLock mutex       // protects buf
+	buf     traceBufPtr // global trace buffer, used when running without a p
 }
 
 // traceBufHeader is per-P tracing buffer.
 type traceBufHeader struct {
-	link      *traceBuf               // in trace.empty/full
-	lastSeq   uint64                  // sequence number of last event
+	link      traceBufPtr             // in trace.empty/full
 	lastTicks uint64                  // when we wrote the last event
-	buf       []byte                  // trace data, always points to traceBuf.arr
+	pos       int                     // next write offset in arr
 	stk       [traceStackSize]uintptr // scratch buffer for traceback
 }
 
@@ -147,6 +139,19 @@
 	arr [64<<10 - unsafe.Sizeof(traceBufHeader{})]byte // underlying buffer for traceBufHeader.buf
 }
 
+// traceBufPtr is a *traceBuf that is not traced by the garbage
+// collector and doesn't have write barriers. traceBufs are not
+// allocated from the GC'd heap, so this is safe; they are also often
+// manipulated in contexts where write barriers are not allowed, so
+// this is necessary.
+type traceBufPtr uintptr
+
+func (tp traceBufPtr) ptr() *traceBuf   { return (*traceBuf)(unsafe.Pointer(tp)) }
+func (tp *traceBufPtr) set(b *traceBuf) { *tp = traceBufPtr(unsafe.Pointer(b)) }
+func traceBufPtrOf(b *traceBuf) traceBufPtr {
+	return traceBufPtr(unsafe.Pointer(b))
+}
+
 // StartTrace enables tracing for the current process.
 // While tracing, the data will be buffered and available via ReadTrace.
 // StartTrace returns an error if tracing is already enabled.
@@ -170,11 +175,6 @@
 		return errorString("tracing is already enabled")
 	}
 
-	trace.seqStart, trace.ticksStart = tracestamp()
-	trace.timeStart = nanotime()
-	trace.headerWritten = false
-	trace.footerWritten = false
-
 	// Can't set trace.enabled yet. While the world is stopped, exitsyscall could
 	// already emit a delayed event (see exitTicks in exitsyscall) if we set trace.enabled here.
 	// That would lead to an inconsistent trace:
@@ -187,12 +187,15 @@
 	for _, gp := range allgs {
 		status := readgstatus(gp)
 		if status != _Gdead {
-			traceGoCreate(gp, gp.startpc)
+			traceGoCreate(gp, gp.startpc) // also resets gp.traceseq/tracelastp
 		}
 		if status == _Gwaiting {
+			// traceEvGoWaiting is implied to have seq=1.
+			gp.traceseq++
 			traceEvent(traceEvGoWaiting, -1, uint64(gp.goid))
 		}
 		if status == _Gsyscall {
+			gp.traceseq++
 			traceEvent(traceEvGoInSyscall, -1, uint64(gp.goid))
 		} else {
 			gp.sysblocktraced = false
@@ -200,6 +203,17 @@
 	}
 	traceProcStart()
 	traceGoStart()
+	// Note: ticksStart needs to be set after we emit traceEvGoInSyscall events.
+	// If we do it the other way around, it is possible that exitsyscall will
+	// query sysexitticks after ticksStart but before traceEvGoInSyscall timestamp.
+	// That would lead to a false conclusion that cputicks is broken.
+	trace.ticksStart = cputicks()
+	trace.timeStart = nanotime()
+	trace.headerWritten = false
+	trace.footerWritten = false
+	trace.strings = make(map[string]uint64)
+	trace.stringSeq = 0
+	trace.seqGC = 0
 	_g_.m.startingtrace = false
 	trace.enabled = true
 
@@ -232,14 +246,14 @@
 			break
 		}
 		buf := p.tracebuf
-		if buf != nil {
+		if buf != 0 {
 			traceFullQueue(buf)
-			p.tracebuf = nil
+			p.tracebuf = 0
 		}
 	}
-	if trace.buf != nil && len(trace.buf.buf) != 0 {
+	if trace.buf != 0 && trace.buf.ptr().pos != 0 {
 		buf := trace.buf
-		trace.buf = nil
+		trace.buf = 0
 		traceFullQueue(buf)
 	}
 
@@ -255,8 +269,6 @@
 
 	trace.enabled = false
 	trace.shutdown = true
-	trace.stackTab.dump()
-
 	unlock(&trace.bufLock)
 
 	startTheWorld()
@@ -274,24 +286,25 @@
 		if p == nil {
 			break
 		}
-		if p.tracebuf != nil {
+		if p.tracebuf != 0 {
 			throw("trace: non-empty trace buffer in proc")
 		}
 	}
-	if trace.buf != nil {
+	if trace.buf != 0 {
 		throw("trace: non-empty global trace buffer")
 	}
-	if trace.fullHead != nil || trace.fullTail != nil {
+	if trace.fullHead != 0 || trace.fullTail != 0 {
 		throw("trace: non-empty full trace buffer")
 	}
-	if trace.reading != nil || trace.reader != nil {
+	if trace.reading != 0 || trace.reader != nil {
 		throw("trace: reading after shutdown")
 	}
-	for trace.empty != nil {
+	for trace.empty != 0 {
 		buf := trace.empty
-		trace.empty = buf.link
-		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
+		trace.empty = buf.ptr().link
+		sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
 	}
+	trace.strings = nil
 	trace.shutdown = false
 	unlock(&trace.lock)
 }
@@ -321,31 +334,31 @@
 		return nil
 	}
 	// Recycle the old buffer.
-	if buf := trace.reading; buf != nil {
-		buf.link = trace.empty
+	if buf := trace.reading; buf != 0 {
+		buf.ptr().link = trace.empty
 		trace.empty = buf
-		trace.reading = nil
+		trace.reading = 0
 	}
 	// Write trace header.
 	if !trace.headerWritten {
 		trace.headerWritten = true
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return []byte("go 1.5 trace\x00\x00\x00\x00")
+		return []byte("go 1.7 trace\x00\x00\x00\x00")
 	}
 	// Wait for new data.
-	if trace.fullHead == nil && !trace.shutdown {
+	if trace.fullHead == 0 && !trace.shutdown {
 		trace.reader = getg()
 		goparkunlock(&trace.lock, "trace reader (blocked)", traceEvGoBlock, 2)
 		lock(&trace.lock)
 	}
 	// Write a buffer.
-	if trace.fullHead != nil {
+	if trace.fullHead != 0 {
 		buf := traceFullDequeue()
 		trace.reading = buf
 		trace.lockOwner = nil
 		unlock(&trace.lock)
-		return buf.buf
+		return buf.ptr().arr[:buf.ptr().pos]
 	}
 	// Write footer with timer frequency.
 	if !trace.footerWritten {
@@ -357,12 +370,13 @@
 		var data []byte
 		data = append(data, traceEvFrequency|0<<traceArgCountShift)
 		data = traceAppend(data, uint64(freq))
-		data = traceAppend(data, 0)
 		if timers.gp != nil {
 			data = append(data, traceEvTimerGoroutine|0<<traceArgCountShift)
 			data = traceAppend(data, uint64(timers.gp.goid))
-			data = traceAppend(data, 0)
 		}
+		// This will emit a bunch of full buffers; we will pick them up
+		// on the next iteration.
+		trace.stackTab.dump()
 		return data
 	}
 	// Done.
@@ -388,11 +402,11 @@
 
 // traceReader returns the trace reader that should be woken up, if any.
 func traceReader() *g {
-	if trace.reader == nil || (trace.fullHead == nil && !trace.shutdown) {
+	if trace.reader == nil || (trace.fullHead == 0 && !trace.shutdown) {
 		return nil
 	}
 	lock(&trace.lock)
-	if trace.reader == nil || (trace.fullHead == nil && !trace.shutdown) {
+	if trace.reader == nil || (trace.fullHead == 0 && !trace.shutdown) {
 		unlock(&trace.lock)
 		return nil
 	}
@@ -405,8 +419,8 @@
 // traceProcFree frees trace buffer associated with pp.
 func traceProcFree(pp *p) {
 	buf := pp.tracebuf
-	pp.tracebuf = nil
-	if buf == nil {
+	pp.tracebuf = 0
+	if buf == 0 {
 		return
 	}
 	lock(&trace.lock)
@@ -415,27 +429,27 @@
 }
 
 // traceFullQueue queues buf into queue of full buffers.
-func traceFullQueue(buf *traceBuf) {
-	buf.link = nil
-	if trace.fullHead == nil {
+func traceFullQueue(buf traceBufPtr) {
+	buf.ptr().link = 0
+	if trace.fullHead == 0 {
 		trace.fullHead = buf
 	} else {
-		trace.fullTail.link = buf
+		trace.fullTail.ptr().link = buf
 	}
 	trace.fullTail = buf
 }
 
 // traceFullDequeue dequeues from queue of full buffers.
-func traceFullDequeue() *traceBuf {
+func traceFullDequeue() traceBufPtr {
 	buf := trace.fullHead
-	if buf == nil {
-		return nil
+	if buf == 0 {
+		return 0
 	}
-	trace.fullHead = buf.link
-	if trace.fullHead == nil {
-		trace.fullTail = nil
+	trace.fullHead = buf.ptr().link
+	if trace.fullHead == 0 {
+		trace.fullTail = 0
 	}
-	buf.link = nil
+	buf.ptr().link = 0
 	return buf
 }
 
@@ -459,28 +473,21 @@
 		traceReleaseBuffer(pid)
 		return
 	}
-	buf := *bufp
+	buf := (*bufp).ptr()
 	const maxSize = 2 + 5*traceBytesPerNumber // event type, length, sequence, timestamp, stack id and two add params
-	if buf == nil || cap(buf.buf)-len(buf.buf) < maxSize {
-		buf = traceFlush(buf)
-		*bufp = buf
+	if buf == nil || len(buf.arr)-buf.pos < maxSize {
+		buf = traceFlush(traceBufPtrOf(buf)).ptr()
+		(*bufp).set(buf)
 	}
 
-	seq, ticksraw := tracestamp()
-	seqDiff := seq - buf.lastSeq
-	ticks := uint64(ticksraw) / traceTickDiv
+	ticks := uint64(cputicks()) / traceTickDiv
 	tickDiff := ticks - buf.lastTicks
-	if len(buf.buf) == 0 {
-		data := buf.buf
-		data = append(data, traceEvBatch|1<<traceArgCountShift)
-		data = traceAppend(data, uint64(pid))
-		data = traceAppend(data, seq)
-		data = traceAppend(data, ticks)
-		buf.buf = data
-		seqDiff = 0
+	if buf.pos == 0 {
+		buf.byte(traceEvBatch | 1<<traceArgCountShift)
+		buf.varint(uint64(pid))
+		buf.varint(ticks)
 		tickDiff = 0
 	}
-	buf.lastSeq = seq
 	buf.lastTicks = ticks
 	narg := byte(len(args))
 	if skip >= 0 {
@@ -491,21 +498,20 @@
 	if narg > 3 {
 		narg = 3
 	}
-	data := buf.buf
-	data = append(data, ev|narg<<traceArgCountShift)
+	startPos := buf.pos
+	buf.byte(ev | narg<<traceArgCountShift)
 	var lenp *byte
 	if narg == 3 {
 		// Reserve the byte for length assuming that length < 128.
-		data = append(data, 0)
-		lenp = &data[len(data)-1]
+		buf.varint(0)
+		lenp = &buf.arr[buf.pos-1]
 	}
-	data = traceAppend(data, seqDiff)
-	data = traceAppend(data, tickDiff)
+	buf.varint(tickDiff)
 	for _, a := range args {
-		data = traceAppend(data, a)
+		buf.varint(a)
 	}
 	if skip == 0 {
-		data = append(data, 0)
+		buf.varint(0)
 	} else if skip > 0 {
 		_g_ := getg()
 		gp := mp.curg
@@ -514,7 +520,12 @@
 			nstk = callers(skip, buf.stk[:])
 		} else if gp != nil {
 			gp = mp.curg
-			nstk = gcallers(gp, skip, buf.stk[:])
+			// This may happen when tracing a system call,
+			// so we must lock the stack.
+			if gcTryLockStackBarriers(gp) {
+				nstk = gcallers(gp, skip, buf.stk[:])
+				gcUnlockStackBarriers(gp)
+			}
 		}
 		if nstk > 0 {
 			nstk-- // skip runtime.goexit
@@ -523,9 +534,9 @@
 			nstk-- // skip runtime.main
 		}
 		id := trace.stackTab.put(buf.stk[:nstk])
-		data = traceAppend(data, uint64(id))
+		buf.varint(uint64(id))
 	}
-	evSize := len(data) - len(buf.buf)
+	evSize := buf.pos - startPos
 	if evSize > maxSize {
 		throw("invalid length of trace event")
 	}
@@ -533,12 +544,11 @@
 		// Fill in actual length.
 		*lenp = byte(evSize - 2)
 	}
-	buf.buf = data
 	traceReleaseBuffer(pid)
 }
 
 // traceAcquireBuffer returns trace buffer to use and, if necessary, locks it.
-func traceAcquireBuffer() (mp *m, pid int32, bufp **traceBuf) {
+func traceAcquireBuffer() (mp *m, pid int32, bufp *traceBufPtr) {
 	mp = acquirem()
 	if p := mp.p.ptr(); p != nil {
 		return mp, p.id, &p.tracebuf
@@ -556,36 +566,57 @@
 }
 
 // traceFlush puts buf onto stack of full buffers and returns an empty buffer.
-func traceFlush(buf *traceBuf) *traceBuf {
+func traceFlush(buf traceBufPtr) traceBufPtr {
 	owner := trace.lockOwner
 	dolock := owner == nil || owner != getg().m.curg
 	if dolock {
 		lock(&trace.lock)
 	}
-	if buf != nil {
-		if &buf.buf[0] != &buf.arr[0] {
-			throw("trace buffer overflow")
-		}
+	if buf != 0 {
 		traceFullQueue(buf)
 	}
-	if trace.empty != nil {
+	if trace.empty != 0 {
 		buf = trace.empty
-		trace.empty = buf.link
+		trace.empty = buf.ptr().link
 	} else {
-		buf = (*traceBuf)(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys))
-		if buf == nil {
+		buf = traceBufPtr(sysAlloc(unsafe.Sizeof(traceBuf{}), &memstats.other_sys))
+		if buf == 0 {
 			throw("trace: out of memory")
 		}
 	}
-	buf.link = nil
-	buf.buf = buf.arr[:0]
-	buf.lastTicks = 0
+	bufp := buf.ptr()
+	bufp.link.set(nil)
+	bufp.pos = 0
+	bufp.lastTicks = 0
 	if dolock {
 		unlock(&trace.lock)
 	}
 	return buf
 }
 
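+// traceString adds a string to the trace.strings dictionary if it is
+// not already present and returns its ID, emitting a traceEvString
+// event for new entries.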
+func traceString(buf *traceBuf, s string) (uint64, *traceBuf) {
+	if s == "" {
+		return 0, buf
+	}
+	if id, ok := trace.strings[s]; ok {
+		return id, buf
+	}
+
+	trace.stringSeq++
+	id := trace.stringSeq
+	trace.strings[s] = id
+
+	size := 1 + 2*traceBytesPerNumber + len(s)
+	if len(buf.arr)-buf.pos < size {
+		buf = traceFlush(traceBufPtrOf(buf)).ptr()
+	}
+	buf.byte(traceEvString)
+	buf.varint(id)
+	buf.varint(uint64(len(s)))
+	buf.pos += copy(buf.arr[buf.pos:], s)
+	return id, buf
+}
+
 // traceAppend appends v to buf in little-endian-base-128 encoding.
 func traceAppend(buf []byte, v uint64) []byte {
 	for ; v >= 0x80; v >>= 7 {
@@ -595,24 +626,46 @@
 	return buf
 }
 
+// varint appends v to buf in little-endian-base-128 encoding.
+func (buf *traceBuf) varint(v uint64) {
+	pos := buf.pos
+	for ; v >= 0x80; v >>= 7 {
+		buf.arr[pos] = 0x80 | byte(v)
+		pos++
+	}
+	buf.arr[pos] = byte(v)
+	pos++
+	buf.pos = pos
+}
+
+// byte appends v to buf.
+func (buf *traceBuf) byte(v byte) {
+	buf.arr[buf.pos] = v
+	buf.pos++
+}
+
 // traceStackTable maps stack traces (arrays of PC's) to unique uint32 ids.
 // It is lock-free for reading.
 type traceStackTable struct {
 	lock mutex
 	seq  uint32
 	mem  traceAlloc
-	tab  [1 << 13]*traceStack
+	tab  [1 << 13]traceStackPtr
 }
 
 // traceStack is a single stack in traceStackTable.
 type traceStack struct {
-	link *traceStack
+	link traceStackPtr
 	hash uintptr
 	id   uint32
 	n    int
 	stk  [0]uintptr // real type [n]uintptr
 }
 
+type traceStackPtr uintptr
+
+func (tp traceStackPtr) ptr() *traceStack { return (*traceStack)(unsafe.Pointer(tp)) }
+
 // stack returns slice of PCs.
 func (ts *traceStack) stack() []uintptr {
 	return (*[traceStackSize]uintptr)(unsafe.Pointer(&ts.stk))[:ts.n]
@@ -624,7 +677,7 @@
 	if len(pcs) == 0 {
 		return 0
 	}
-	hash := memhash(unsafe.Pointer(&pcs[0]), uintptr(len(pcs))*unsafe.Sizeof(pcs[0]), 0)
+	hash := memhash(unsafe.Pointer(&pcs[0]), 0, uintptr(len(pcs))*unsafe.Sizeof(pcs[0]))
 	// First, search the hashtable w/o the mutex.
 	if id := tab.find(pcs, hash); id != 0 {
 		return id
@@ -656,7 +709,7 @@
 func (tab *traceStackTable) find(pcs []uintptr, hash uintptr) uint32 {
 	part := int(hash % uintptr(len(tab.tab)))
 Search:
-	for stk := tab.tab[part]; stk != nil; stk = stk.link {
+	for stk := tab.tab[part].ptr(); stk != nil; stk = stk.link.ptr() {
 		if stk.hash == hash && stk.n == len(pcs) {
 			for i, stkpc := range stk.stack() {
 				if stkpc != pcs[i] {
@@ -671,82 +724,128 @@
 
 // newStack allocates a new stack of size n.
 func (tab *traceStackTable) newStack(n int) *traceStack {
-	return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*ptrSize))
+	return (*traceStack)(tab.mem.alloc(unsafe.Sizeof(traceStack{}) + uintptr(n)*sys.PtrSize))
 }
 
 // dump writes all previously cached stacks to trace buffers,
 // releases all memory and resets state.
 func (tab *traceStackTable) dump() {
-	var tmp [(2 + traceStackSize) * traceBytesPerNumber]byte
-	buf := traceFlush(nil)
+	frames := make(map[uintptr]traceFrame)
+	var tmp [(2 + 4*traceStackSize) * traceBytesPerNumber]byte
+	buf := traceFlush(0).ptr()
 	for _, stk := range tab.tab {
-		for ; stk != nil; stk = stk.link {
-			maxSize := 1 + (3+stk.n)*traceBytesPerNumber
-			if cap(buf.buf)-len(buf.buf) < maxSize {
-				buf = traceFlush(buf)
-			}
-			// Form the event in the temp buffer, we need to know the actual length.
+		stk := stk.ptr()
+		for ; stk != nil; stk = stk.link.ptr() {
 			tmpbuf := tmp[:0]
 			tmpbuf = traceAppend(tmpbuf, uint64(stk.id))
 			tmpbuf = traceAppend(tmpbuf, uint64(stk.n))
 			for _, pc := range stk.stack() {
+				var frame traceFrame
+				frame, buf = traceFrameForPC(buf, frames, pc)
 				tmpbuf = traceAppend(tmpbuf, uint64(pc))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.funcID))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.fileID))
+				tmpbuf = traceAppend(tmpbuf, uint64(frame.line))
 			}
 			// Now copy to the buffer.
-			data := buf.buf
-			data = append(data, traceEvStack|3<<traceArgCountShift)
-			data = traceAppend(data, uint64(len(tmpbuf)))
-			data = append(data, tmpbuf...)
-			buf.buf = data
+			size := 1 + traceBytesPerNumber + len(tmpbuf)
+			if len(buf.arr)-buf.pos < size {
+				buf = traceFlush(traceBufPtrOf(buf)).ptr()
+			}
+			buf.byte(traceEvStack | 3<<traceArgCountShift)
+			buf.varint(uint64(len(tmpbuf)))
+			buf.pos += copy(buf.arr[buf.pos:], tmpbuf)
 		}
 	}
 
 	lock(&trace.lock)
-	traceFullQueue(buf)
+	traceFullQueue(traceBufPtrOf(buf))
 	unlock(&trace.lock)
 
 	tab.mem.drop()
 	*tab = traceStackTable{}
 }
 
+type traceFrame struct {
+	funcID uint64
+	fileID uint64
+	line   uint64
+}
+
+func traceFrameForPC(buf *traceBuf, frames map[uintptr]traceFrame, pc uintptr) (traceFrame, *traceBuf) {
+	if frame, ok := frames[pc]; ok {
+		return frame, buf
+	}
+
+	var frame traceFrame
+	f := findfunc(pc)
+	if f == nil {
+		frames[pc] = frame
+		return frame, buf
+	}
+
+	fn := funcname(f)
+	const maxLen = 1 << 10
+	if len(fn) > maxLen {
+		fn = fn[len(fn)-maxLen:]
+	}
+	frame.funcID, buf = traceString(buf, fn)
+	file, line := funcline(f, pc-sys.PCQuantum)
+	frame.line = uint64(line)
+	if len(file) > maxLen {
+		file = file[len(file)-maxLen:]
+	}
+	frame.fileID, buf = traceString(buf, file)
+	return frame, buf
+}
+
 // traceAlloc is a non-thread-safe region allocator.
 // It holds a linked list of traceAllocBlock.
 type traceAlloc struct {
-	head *traceAllocBlock
+	head traceAllocBlockPtr
 	off  uintptr
 }
 
 // traceAllocBlock is a block in traceAlloc.
+//
+// traceAllocBlock is allocated from non-GC'd memory, so it must not
+// contain heap pointers. Writes to pointers to traceAllocBlocks do
+// not need write barriers.
 type traceAllocBlock struct {
-	next *traceAllocBlock
-	data [64<<10 - ptrSize]byte
+	next traceAllocBlockPtr
+	data [64<<10 - sys.PtrSize]byte
 }
 
+type traceAllocBlockPtr uintptr
+
+func (p traceAllocBlockPtr) ptr() *traceAllocBlock   { return (*traceAllocBlock)(unsafe.Pointer(p)) }
+func (p *traceAllocBlockPtr) set(x *traceAllocBlock) { *p = traceAllocBlockPtr(unsafe.Pointer(x)) }
+
 // alloc allocates n-byte block.
 func (a *traceAlloc) alloc(n uintptr) unsafe.Pointer {
-	n = round(n, ptrSize)
-	if a.head == nil || a.off+n > uintptr(len(a.head.data)) {
-		if n > uintptr(len(a.head.data)) {
+	n = round(n, sys.PtrSize)
+	if a.head == 0 || a.off+n > uintptr(len(a.head.ptr().data)) {
+		if n > uintptr(len(a.head.ptr().data)) {
 			throw("trace: alloc too large")
 		}
 		block := (*traceAllocBlock)(sysAlloc(unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys))
 		if block == nil {
 			throw("trace: out of memory")
 		}
-		block.next = a.head
-		a.head = block
+		block.next.set(a.head.ptr())
+		a.head.set(block)
 		a.off = 0
 	}
-	p := &a.head.data[a.off]
+	p := &a.head.ptr().data[a.off]
 	a.off += n
 	return unsafe.Pointer(p)
 }
 
 // drop frees all previously allocated memory and resets the allocator.
 func (a *traceAlloc) drop() {
-	for a.head != nil {
-		block := a.head
-		a.head = block.next
+	for a.head != 0 {
+		block := a.head.ptr()
+		a.head.set(block.next.ptr())
 		sysFree(unsafe.Pointer(block), unsafe.Sizeof(traceAllocBlock{}), &memstats.other_sys)
 	}
 }
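
The traceAlloc/traceAllocBlock pair above is a textbook region (arena) allocator: each allocation bumps an offset into the current 64KB block, a fresh block is chained in when the current one fills, and drop releases every block at once. A minimal sketch of the same idea in ordinary heap Go (a hypothetical helper for illustration only; the runtime version must use sysAlloc and uintptr links so the blocks stay invisible to the GC):

	// region is a bump allocator: alloc carves from the last block,
	// drop frees everything in one step.
	type region struct {
		blocks [][]byte
		off    int
	}

	func (r *region) alloc(n int) []byte {
		const blockSize = 64 << 10
		if n > blockSize {
			panic("region: alloc too large")
		}
		if len(r.blocks) == 0 || r.off+n > blockSize {
			r.blocks = append(r.blocks, make([]byte, blockSize))
			r.off = 0
		}
		b := r.blocks[len(r.blocks)-1][r.off : r.off+n]
		r.off += n
		return b
	}

	func (r *region) drop() { r.blocks = nil }
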
@@ -773,7 +872,8 @@
 }
 
 func traceGCStart() {
-	traceEvent(traceEvGCStart, 4)
+	traceEvent(traceEvGCStart, 3, trace.seqGC)
+	trace.seqGC++
 }
 
 func traceGCDone() {
@@ -797,11 +897,23 @@
 }
 
 func traceGoCreate(newg *g, pc uintptr) {
-	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(pc))
+	newg.traceseq = 0
+	newg.tracelastp = getg().m.p
+	// +PCQuantum because traceFrameForPC expects return PCs and subtracts PCQuantum.
+	id := trace.stackTab.put([]uintptr{pc + sys.PCQuantum})
+	traceEvent(traceEvGoCreate, 2, uint64(newg.goid), uint64(id))
 }
 
 func traceGoStart() {
-	traceEvent(traceEvGoStart, -1, uint64(getg().m.curg.goid))
+	_g_ := getg().m.curg
+	_p_ := _g_.m.p
+	_g_.traceseq++
+	if _g_.tracelastp == _p_ {
+		traceEvent(traceEvGoStartLocal, -1, uint64(_g_.goid))
+	} else {
+		_g_.tracelastp = _p_
+		traceEvent(traceEvGoStart, -1, uint64(_g_.goid), _g_.traceseq)
+	}
 }
 
 func traceGoEnd() {
@@ -809,10 +921,14 @@
 }
 
 func traceGoSched() {
+	_g_ := getg()
+	_g_.tracelastp = _g_.m.p
 	traceEvent(traceEvGoSched, 1)
 }
 
 func traceGoPreempt() {
+	_g_ := getg()
+	_g_.tracelastp = _g_.m.p
 	traceEvent(traceEvGoPreempt, 1)
 }
 
@@ -824,19 +940,37 @@
 }
 
 func traceGoUnpark(gp *g, skip int) {
-	traceEvent(traceEvGoUnblock, skip, uint64(gp.goid))
+	_p_ := getg().m.p
+	gp.traceseq++
+	if gp.tracelastp == _p_ {
+		traceEvent(traceEvGoUnblockLocal, skip, uint64(gp.goid))
+	} else {
+		gp.tracelastp = _p_
+		traceEvent(traceEvGoUnblock, skip, uint64(gp.goid), gp.traceseq)
+	}
 }
 
 func traceGoSysCall() {
-	traceEvent(traceEvGoSysCall, 4)
+	traceEvent(traceEvGoSysCall, 1)
 }
 
-func traceGoSysExit(seq uint64, ts int64) {
-	if int64(seq)-int64(trace.seqStart) < 0 {
-		// The timestamp was obtained during a previous tracing session, ignore.
-		return
+func traceGoSysExit(ts int64) {
+	if ts != 0 && ts < trace.ticksStart {
+		// There is a race between the code that initializes sysexitticks
+		// (in exitsyscall, which runs without a P, and therefore is not
+		// stopped with the rest of the world) and the code that initializes
+		// a new trace. The recorded sysexitticks must therefore be treated
+		// as "best effort". If they are valid for this trace, then great,
+		// use them for greater accuracy. But if they're not valid for this
+		// trace, assume that the trace was started after the actual syscall
+		// exit (but before we actually managed to start the goroutine,
+		// aka right now), and assign a fresh time stamp to keep the log consistent.
+		ts = 0
 	}
-	traceEvent(traceEvGoSysExit, -1, uint64(getg().m.curg.goid), seq, uint64(ts)/traceTickDiv)
+	_g_ := getg().m.curg
+	_g_.traceseq++
+	_g_.tracelastp = _g_.m.p
+	traceEvent(traceEvGoSysExit, -1, uint64(_g_.goid), _g_.traceseq, uint64(ts)/traceTickDiv)
 }
 
 func traceGoSysBlock(pp *p) {
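
The new GoStartLocal/GoUnblockLocal variants above shave an argument off the common case: when a goroutine starts or is unblocked on the same P that last ran it (tracelastp), the sequence number can be inferred by the parser and is omitted. Event arguments are base-128 varints (the format traceAppend writes elsewhere in this file), so every omitted argument saves one to ten bytes per event. A sketch of that encoding, restated for reference:

	// varint appends v in little-endian base-128 form: seven payload
	// bits per byte, with the high bit set on every byte but the last.
	func varint(buf []byte, v uint64) []byte {
		for ; v >= 0x80; v >>= 7 {
			buf = append(buf, 0x80|byte(v))
		}
		return append(buf, byte(v))
	}
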
diff --git a/src/runtime/trace/trace_stack_test.go b/src/runtime/trace/trace_stack_test.go
index 3fe1747..52a71bf 100644
--- a/src/runtime/trace/trace_stack_test.go
+++ b/src/runtime/trace/trace_stack_test.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -125,14 +125,7 @@
 	<-pipeReadDone
 
 	Stop()
-	events, _, err := parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
-	err = trace.Symbolize(events, os.Args[0])
-	if err != nil {
-		t.Fatalf("failed to symbolize trace: %v", err)
-	}
+	events, _ := parseTrace(t, buf)
 
 	// Now check that the stacks are correct.
 	type frame struct {
@@ -144,107 +137,110 @@
 		Stk  []frame
 	}
 	want := []eventDesc{
-		eventDesc{trace.EvGCStart, []frame{
-			frame{"runtime.GC", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 106},
-			frame{"testing.tRunner", 0},
+		{trace.EvGCStart, []frame{
+			{"runtime.GC", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 106},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoSched, []frame{
-			frame{"runtime/trace_test.TestTraceSymbolize", 107},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoStart, []frame{
+			{"runtime/trace_test.TestTraceSymbolize.func1", 37},
 		}},
-		eventDesc{trace.EvGoCreate, []frame{
-			frame{"runtime/trace_test.TestTraceSymbolize", 39},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoSched, []frame{
+			{"runtime/trace_test.TestTraceSymbolize", 107},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoStop, []frame{
-			frame{"runtime.block", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func1", 38},
+		{trace.EvGoCreate, []frame{
+			{"runtime/trace_test.TestTraceSymbolize", 39},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoStop, []frame{
-			frame{"runtime.chansend1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func2", 42},
+		{trace.EvGoStop, []frame{
+			{"runtime.block", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func1", 38},
 		}},
-		eventDesc{trace.EvGoStop, []frame{
-			frame{"runtime.chanrecv1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func3", 46},
+		{trace.EvGoStop, []frame{
+			{"runtime.chansend1", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func2", 42},
 		}},
-		eventDesc{trace.EvGoBlockRecv, []frame{
-			frame{"runtime.chanrecv1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func4", 50},
+		{trace.EvGoStop, []frame{
+			{"runtime.chanrecv1", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func3", 46},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"runtime.chansend1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 109},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockRecv, []frame{
+			{"runtime.chanrecv1", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func4", 50},
 		}},
-		eventDesc{trace.EvGoBlockSend, []frame{
-			frame{"runtime.chansend1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func5", 54},
+		{trace.EvGoUnblock, []frame{
+			{"runtime.chansend1", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 109},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"runtime.chanrecv1", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 110},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockSend, []frame{
+			{"runtime.chansend1", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func5", 54},
 		}},
-		eventDesc{trace.EvGoBlockSelect, []frame{
-			frame{"runtime.selectgo", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func6", 59},
+		{trace.EvGoUnblock, []frame{
+			{"runtime.chanrecv1", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 110},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"runtime.selectgo", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 111},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockSelect, []frame{
+			{"runtime.selectgo", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func6", 59},
 		}},
-		eventDesc{trace.EvGoBlockSync, []frame{
-			frame{"sync.(*Mutex).Lock", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func7", 67},
+		{trace.EvGoUnblock, []frame{
+			{"runtime.selectgo", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 111},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"sync.(*Mutex).Unlock", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 115},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockSync, []frame{
+			{"sync.(*Mutex).Lock", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func7", 67},
 		}},
-		eventDesc{trace.EvGoBlockSync, []frame{
-			frame{"sync.(*WaitGroup).Wait", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func8", 73},
+		{trace.EvGoUnblock, []frame{
+			{"sync.(*Mutex).Unlock", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 115},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"sync.(*WaitGroup).Add", 0},
-			frame{"sync.(*WaitGroup).Done", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 116},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockSync, []frame{
+			{"sync.(*WaitGroup).Wait", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func8", 73},
 		}},
-		eventDesc{trace.EvGoBlockCond, []frame{
-			frame{"sync.(*Cond).Wait", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize.func9", 78},
+		{trace.EvGoUnblock, []frame{
+			{"sync.(*WaitGroup).Add", 0},
+			{"sync.(*WaitGroup).Done", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 116},
+			{"testing.tRunner", 0},
 		}},
-		eventDesc{trace.EvGoUnblock, []frame{
-			frame{"sync.(*Cond).Signal", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 117},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoBlockCond, []frame{
+			{"sync.(*Cond).Wait", 0},
+			{"runtime/trace_test.TestTraceSymbolize.func9", 78},
 		}},
-		eventDesc{trace.EvGoSleep, []frame{
-			frame{"time.Sleep", 0},
-			frame{"runtime/trace_test.TestTraceSymbolize", 108},
-			frame{"testing.tRunner", 0},
+		{trace.EvGoUnblock, []frame{
+			{"sync.(*Cond).Signal", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 117},
+			{"testing.tRunner", 0},
+		}},
+		{trace.EvGoSleep, []frame{
+			{"time.Sleep", 0},
+			{"runtime/trace_test.TestTraceSymbolize", 108},
+			{"testing.tRunner", 0},
 		}},
 	}
 	// Stacks for the following events are OS-dependent due to OS-specific code in net package.
 	if runtime.GOOS != "windows" && runtime.GOOS != "plan9" {
 		want = append(want, []eventDesc{
-			eventDesc{trace.EvGoBlockNet, []frame{
-				frame{"net.(*netFD).accept", 0},
-				frame{"net.(*TCPListener).AcceptTCP", 0},
-				frame{"net.(*TCPListener).Accept", 0},
-				frame{"runtime/trace_test.TestTraceSymbolize.func10", 86},
+			{trace.EvGoBlockNet, []frame{
+				{"net.(*netFD).accept", 0},
+				{"net.(*TCPListener).accept", 0},
+				{"net.(*TCPListener).Accept", 0},
+				{"runtime/trace_test.TestTraceSymbolize.func10", 86},
 			}},
-			eventDesc{trace.EvGoSysCall, []frame{
-				frame{"syscall.read", 0},
-				frame{"syscall.Read", 0},
-				frame{"os.(*File).read", 0},
-				frame{"os.(*File).Read", 0},
-				frame{"runtime/trace_test.TestTraceSymbolize.func11", 101},
+			{trace.EvGoSysCall, []frame{
+				{"syscall.read", 0},
+				{"syscall.Read", 0},
+				{"os.(*File).read", 0},
+				{"os.(*File).Read", 0},
+				{"runtime/trace_test.TestTraceSymbolize.func11", 101},
 			}},
 		}...)
 	}
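
Most of the churn in this test file is mechanical: inside a slice literal the element type of each composite literal can be elided, and the gofmt -s simplification rewrites the long form to the short one. Using the test's own types, the two spellings are identical:

	want := []eventDesc{
		eventDesc{trace.EvGCStart, nil}, // long form, as before
	}
	want = []eventDesc{
		{trace.EvGCStart, nil}, // short form, as rewritten above
	}
	_ = want
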
diff --git a/src/runtime/trace/trace_test.go b/src/runtime/trace/trace_test.go
index 0a8957f..5fad3fb 100644
--- a/src/runtime/trace/trace_test.go
+++ b/src/runtime/trace/trace_test.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
@@ -52,7 +52,7 @@
 		t.Fatalf("failed to start tracing: %v", err)
 	}
 	Stop()
-	_, err := trace.Parse(buf)
+	_, err := trace.Parse(buf, "")
 	if err == trace.ErrTimeOrder {
 		t.Skipf("skipping trace: %v", err)
 	}
@@ -61,13 +61,13 @@
 	}
 }
 
-func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc, error) {
-	events, err := trace.Parse(r)
+func parseTrace(t *testing.T, r io.Reader) ([]*trace.Event, map[uint64]*trace.GDesc) {
+	events, err := trace.Parse(r, "")
 	if err == trace.ErrTimeOrder {
 		t.Skipf("skipping trace: %v", err)
 	}
 	if err != nil {
-		return nil, nil, err
+		t.Fatalf("failed to parse trace: %v", err)
 	}
 	gs := trace.GoroutineStats(events)
 	for goid := range gs {
@@ -75,7 +75,31 @@
 		// But still check that RelatedGoroutines does not crash, hang, etc.
 		_ = trace.RelatedGoroutines(events, goid)
 	}
-	return events, gs, nil
+	return events, gs
+}
+
+func testBrokenTimestamps(t *testing.T, data []byte) {
+	// On some processors cputicks (used to generate trace timestamps)
+	// produces non-monotonic timestamps. It is important that the parser
+	// distinguishes logically inconsistent traces (e.g. missing, excessive
+	// or misordered events) from broken timestamps. The former is a bug
+	// in the tracer, the latter is a machine issue.
+	// So now that we have a consistent trace, test that (1) the parser does
+	// not return a logical error in the presence of broken timestamps
+	// and (2) broken timestamps are eventually detected and reported.
+	trace.BreakTimestampsForTesting = true
+	defer func() {
+		trace.BreakTimestampsForTesting = false
+	}()
+	for i := 0; i < 1e4; i++ {
+		_, err := trace.Parse(bytes.NewReader(data), "")
+		if err == trace.ErrTimeOrder {
+			return
+		}
+		if err != nil {
+			t.Fatalf("failed to parse trace: %v", err)
+		}
+	}
 }
 
 func TestTraceStress(t *testing.T) {
@@ -129,7 +153,13 @@
 
 	runtime.GC()
 	// Trigger GC from malloc.
-	for i := 0; i < 1e3; i++ {
+	n := int(1e3)
+	if runtime.GOOS == "openbsd" && runtime.GOARCH == "arm" {
+		// Reduce allocation to avoid running out of
+		// memory on the builder - see issue/12032.
+		n = 512
+	}
+	for i := 0; i < n; i++ {
 		_ = make([]byte, 1<<20)
 	}
 
@@ -203,10 +233,9 @@
 	runtime.GOMAXPROCS(procs)
 
 	Stop()
-	_, _, err = parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
+	trace := buf.Bytes()
+	parseTrace(t, buf)
+	testBrokenTimestamps(t, trace)
 }
 
 // Do a bunch of various stuff (timers, GC, network, etc) in a separate goroutine.
@@ -260,7 +289,13 @@
 
 		runtime.GC()
 		// Trigger GC from malloc.
-		for i := 0; i < 1e3; i++ {
+		n := int(1e3)
+		if runtime.GOOS == "openbsd" && runtime.GOARCH == "arm" {
+			// Reduce allocation to avoid running out of
+			// memory on the builder - see issue/12032.
+			n = 512
+		}
+		for i := 0; i < n; i++ {
 			_ = make([]byte, 1<<20)
 		}
 
@@ -341,9 +376,9 @@
 		}
 		time.Sleep(time.Millisecond)
 		Stop()
-		if _, _, err := parseTrace(t, buf); err != nil {
-			t.Fatalf("failed to parse trace: %v", err)
-		}
+		trace := buf.Bytes()
+		parseTrace(t, buf)
+		testBrokenTimestamps(t, trace)
 	}
 	<-outerDone
 }
@@ -401,10 +436,7 @@
 	done.Wait()
 
 	Stop()
-	events, _, err := parseTrace(t, buf)
-	if err != nil {
-		t.Fatalf("failed to parse trace: %v", err)
-	}
+	events, _ := parseTrace(t, buf)
 	// Check that (1) trace does not contain EvFutileWakeup events and
 	// (2) there are no consecutive EvGoBlock/EvGCStart/EvGoBlock events
 	// (we call runtime.Gosched between all operations, so these would be futile wakeups).
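
One subtlety in the updated tests: parseTrace now consumes its io.Reader and fails the test itself on error, so callers that also want the raw bytes snapshot them first with buf.Bytes() and hand the snapshot to testBrokenTimestamps, which re-parses it repeatedly with BreakTimestampsForTesting set. The pattern, in isolation (using the helpers defined above):

	data := buf.Bytes()           // snapshot first: parseTrace drains buf
	parseTrace(t, buf)            // skips on ErrTimeOrder, fails on other errors
	testBrokenTimestamps(t, data) // re-parse the same bytes with broken timestamps
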
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 2def359..80a5440 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -4,7 +4,11 @@
 
 package runtime
 
-import "unsafe"
+import (
+	"runtime/internal/atomic"
+	"runtime/internal/sys"
+	"unsafe"
+)
 
 // The code in this file implements stack trace walking for all architectures.
 // The most important fact about a given architecture is whether it uses a link register.
@@ -26,9 +30,10 @@
 // stores an 8-byte return PC onto the stack. To accommodate this, we use regSize
 // as the size of the architecture-pushed return PC.
 //
-// usesLR is defined below. ptrSize and regSize are defined in stubs.go.
+// usesLR is defined below in terms of sys.MinFrameSize, which is defined
+// in runtime/internal/sys. sys.PtrSize and sys.RegSize live there as well.
 
-const usesLR = GOARCH != "amd64" && GOARCH != "amd64p32" && GOARCH != "386"
+const usesLR = sys.MinFrameSize > 0
 
 var (
 	// initialized in tracebackinit
@@ -40,7 +45,6 @@
 	rt0_goPC             uintptr
 	sigpanicPC           uintptr
 	runfinqPC            uintptr
-	backgroundgcPC       uintptr
 	bgsweepPC            uintptr
 	forcegchelperPC      uintptr
 	timerprocPC          uintptr
@@ -68,7 +72,6 @@
 	rt0_goPC = funcPC(rt0_go)
 	sigpanicPC = funcPC(sigpanic)
 	runfinqPC = funcPC(runfinq)
-	backgroundgcPC = funcPC(backgroundgc)
 	bgsweepPC = funcPC(bgsweep)
 	forcegchelperPC = funcPC(forcegchelper)
 	timerprocPC = funcPC(timerproc)
@@ -96,7 +99,7 @@
 			frame.arglen = 0
 			frame.argmap = nil
 		} else {
-			frame.pc = uintptr(fn.fn)
+			frame.pc = fn.fn
 			f := findfunc(frame.pc)
 			if f == nil {
 				print("runtime: unknown pc in defer ", hex(frame.pc), "\n")
@@ -104,7 +107,7 @@
 			}
 			frame.fn = f
 			frame.argp = uintptr(deferArgs(d))
-			setArgInfo(&frame, f, true)
+			frame.arglen, frame.argmap = getArgInfo(&frame, f, true)
 		}
 		frame.continpc = frame.pc
 		if !callback((*stkframe)(noescape(unsafe.Pointer(&frame))), v) {
@@ -113,7 +116,7 @@
 	}
 }
 
-// Generic traceback.  Handles runtime stack prints (pcbuf == nil),
+// Generic traceback. Handles runtime stack prints (pcbuf == nil),
 // the runtime.Callers function (pcbuf != nil), as well as the garbage
 // collector (callback != nil).  A little clunky to merge these, but avoids
 // duplicating the code and all its subtlety.
@@ -138,11 +141,12 @@
 		// instead on the g0 stack.
 		throw("gentraceback cannot trace user goroutine on its own stack")
 	}
-	gotraceback := gotraceback(nil)
+	level, _, _ := gotraceback()
 
 	// Fix up returns to the stack barrier by fetching the
 	// original return PC from gp.stkbar.
-	stkbar := gp.stkbar[gp.stkbarPos:]
+	stkbarG := gp
+	stkbar := stkbarG.stkbar[stkbarG.stkbarPos:]
 
 	if pc0 == ^uintptr(0) && sp0 == ^uintptr(0) { // Signal to fetch saved values from gp.
 		if gp.syscallsp != 0 {
@@ -168,10 +172,11 @@
 		frame.lr = lr0
 	}
 	waspanic := false
+	cgoCtxt := gp.cgoCtxt
 	printing := pcbuf == nil && callback == nil
 	_defer := gp._defer
 
-	for _defer != nil && uintptr(_defer.sp) == _NoArgs {
+	for _defer != nil && _defer.sp == _NoArgs {
 		_defer = _defer.link
 	}
 
@@ -182,12 +187,40 @@
 			frame.pc = *(*uintptr)(unsafe.Pointer(frame.sp))
 			frame.lr = 0
 		} else {
-			frame.pc = uintptr(*(*uintreg)(unsafe.Pointer(frame.sp)))
-			frame.sp += regSize
+			frame.pc = uintptr(*(*sys.Uintreg)(unsafe.Pointer(frame.sp)))
+			frame.sp += sys.RegSize
 		}
 	}
 
 	f := findfunc(frame.pc)
+	if f != nil && f.entry == stackBarrierPC {
+		// We got caught in the middle of a stack barrier
+		// (presumably by a signal), so stkbar may be
+		// inconsistent with the barriers on the stack.
+		// Simulate the completion of the barrier.
+		//
+		// On x86, SP will be exactly one word above
+		// savedLRPtr. On LR machines, SP will be above
+		// savedLRPtr by some frame size.
+		var stkbarPos uintptr
+		if len(stkbar) > 0 && stkbar[0].savedLRPtr < sp0 {
+			// stackBarrier has not incremented stkbarPos.
+			stkbarPos = gp.stkbarPos
+		} else if gp.stkbarPos > 0 && gp.stkbar[gp.stkbarPos-1].savedLRPtr < sp0 {
+			// stackBarrier has incremented stkbarPos.
+			stkbarPos = gp.stkbarPos - 1
+		} else {
+			printlock()
+			print("runtime: failed to unwind through stackBarrier at SP ", hex(sp0), "; ")
+			gcPrintStkbars(gp, int(gp.stkbarPos))
+			print("\n")
+			throw("inconsistent state in stackBarrier")
+		}
+
+		frame.pc = gp.stkbar[stkbarPos].savedLRVal
+		stkbar = gp.stkbar[stkbarPos+1:]
+		f = findfunc(frame.pc)
+	}
 	if f == nil {
 		if callback != nil {
 			print("runtime: unknown pc ", hex(frame.pc), "\n")
@@ -197,6 +230,8 @@
 	}
 	frame.fn = f
 
+	var cache pcvalueCache
+
 	n := 0
 	for n < max {
 		// Typically:
@@ -206,6 +241,11 @@
 		//	stk is the stack containing sp.
 		//	The caller's program counter is lr, unless lr is zero, in which case it is *(uintptr*)sp.
 		f = frame.fn
+		if f.pcsp == 0 {
+			// No frame information, must be external function, like race support.
+			// See golang.org/issue/13568.
+			break
+		}
 
 		// Found an actual function.
 		// Derive frame pointer and link register.
@@ -216,12 +256,15 @@
 			sp := frame.sp
 			if flags&_TraceJumpStack != 0 && f.entry == systemstackPC && gp == g.m.g0 && gp.m.curg != nil {
 				sp = gp.m.curg.sched.sp
-				stkbar = gp.m.curg.stkbar[gp.m.curg.stkbarPos:]
+				frame.sp = sp
+				stkbarG = gp.m.curg
+				stkbar = stkbarG.stkbar[stkbarG.stkbarPos:]
+				cgoCtxt = gp.m.curg.cgoCtxt
 			}
-			frame.fp = sp + uintptr(funcspdelta(f, frame.pc))
+			frame.fp = sp + uintptr(funcspdelta(f, frame.pc, &cache))
 			if !usesLR {
 				// On x86, call instruction pushes return PC before entering new function.
-				frame.fp += regSize
+				frame.fp += sys.RegSize
 			}
 		}
 		var flr *_func
@@ -248,15 +291,15 @@
 				}
 			} else {
 				if frame.lr == 0 {
-					lrPtr = frame.fp - regSize
-					frame.lr = uintptr(*(*uintreg)(unsafe.Pointer(lrPtr)))
+					lrPtr = frame.fp - sys.RegSize
+					frame.lr = uintptr(*(*sys.Uintreg)(unsafe.Pointer(lrPtr)))
 				}
 			}
 			if frame.lr == stackBarrierPC {
 				// Recover original PC.
-				if stkbar[0].savedLRPtr != lrPtr {
+				if len(stkbar) == 0 || stkbar[0].savedLRPtr != lrPtr {
 					print("found next stack barrier at ", hex(lrPtr), "; expected ")
-					gcPrintStkbars(stkbar)
+					gcPrintStkbars(stkbarG, len(stkbarG.stkbar)-len(stkbar))
 					print("\n")
 					throw("missed stack barrier")
 				}
@@ -279,13 +322,13 @@
 		frame.varp = frame.fp
 		if !usesLR {
 			// On x86, call instruction pushes return PC before entering new function.
-			frame.varp -= regSize
+			frame.varp -= sys.RegSize
 		}
 
 		// If framepointer_enabled and there's a frame, then
 		// there's a saved bp here.
 		if framepointer_enabled && GOARCH == "amd64" && frame.varp > frame.sp {
-			frame.varp -= regSize
+			frame.varp -= sys.RegSize
 		}
 
 		// Derive size of arguments.
@@ -295,11 +338,8 @@
 		// in package runtime and reflect, and for those we use call-specific
 		// metadata recorded by f's caller.
 		if callback != nil || printing {
-			frame.argp = frame.fp
-			if usesLR {
-				frame.argp += ptrSize
-			}
-			setArgInfo(&frame, f, callback != nil)
+			frame.argp = frame.fp + sys.MinFrameSize
+			frame.arglen, frame.argmap = getArgInfo(&frame, f, callback != nil)
 		}
 
 		// Determine frame's 'continuation PC', where it can continue.
@@ -349,9 +389,13 @@
 				if (n > 0 || flags&_TraceTrap == 0) && frame.pc > f.entry && !waspanic {
 					tracepc--
 				}
-				print(funcname(f), "(")
+				name := funcname(f)
+				if name == "runtime.gopanic" {
+					name = "panic"
+				}
+				print(name, "(")
 				argp := (*[100]uintptr)(unsafe.Pointer(frame.argp))
-				for i := uintptr(0); i < frame.arglen/ptrSize; i++ {
+				for i := uintptr(0); i < frame.arglen/sys.PtrSize; i++ {
 					if i >= 10 {
 						print(", ...")
 						break
@@ -367,7 +411,7 @@
 				if frame.pc > f.entry {
 					print(" +", hex(frame.pc-f.entry))
 				}
-				if g.m.throwing > 0 && gp == g.m.curg || gotraceback >= 2 {
+				if g.m.throwing > 0 && gp == g.m.curg || level >= 2 {
 					print(" fp=", hex(frame.fp), " sp=", hex(frame.sp))
 				}
 				print("\n")
@@ -377,6 +421,18 @@
 		n++
 
 	skipped:
+		if f.entry == cgocallback_gofuncPC && len(cgoCtxt) > 0 {
+			ctxt := cgoCtxt[len(cgoCtxt)-1]
+			cgoCtxt = cgoCtxt[:len(cgoCtxt)-1]
+
+			// skip only applies to Go frames.
+			// callback != nil is used only when we care
+			// solely about Go frames.
+			if skip == 0 && callback == nil {
+				n = tracebackCgoContext(pcbuf, printing, ctxt, n, max)
+			}
+		}
+
 		waspanic = f.entry == sigpanicPC
 
 		// Do not unwind past the bottom of the stack.
@@ -396,16 +452,16 @@
 		// before faking a call to sigpanic.
 		if usesLR && waspanic {
 			x := *(*uintptr)(unsafe.Pointer(frame.sp))
-			frame.sp += ptrSize
+			frame.sp += sys.MinFrameSize
 			if GOARCH == "arm64" {
 				// arm64 needs 16-byte aligned SP, always
-				frame.sp += ptrSize
+				frame.sp += sys.PtrSize
 			}
 			f = findfunc(frame.pc)
 			frame.fn = f
 			if f == nil {
 				frame.pc = x
-			} else if funcspdelta(f, frame.pc) == 0 {
+			} else if funcspdelta(f, frame.pc, &cache) == 0 {
 				frame.lr = x
 			}
 		}
@@ -476,34 +532,71 @@
 
 	if callback != nil && n < max && len(stkbar) > 0 {
 		print("runtime: g", gp.goid, ": leftover stack barriers ")
-		gcPrintStkbars(stkbar)
+		gcPrintStkbars(stkbarG, len(stkbarG.stkbar)-len(stkbar))
 		print("\n")
 		throw("traceback has leftover stack barriers")
 	}
 
+	if callback != nil && n < max && frame.sp != gp.stktopsp {
+		print("runtime: g", gp.goid, ": frame.sp=", hex(frame.sp), " top=", hex(gp.stktopsp), "\n")
+		print("\tstack=[", hex(gp.stack.lo), "-", hex(gp.stack.hi), "] n=", n, " max=", max, "\n")
+		throw("traceback did not unwind completely")
+	}
+
 	return n
 }
 
-func setArgInfo(frame *stkframe, f *_func, needArgMap bool) {
-	frame.arglen = uintptr(f.args)
+func getArgInfo(frame *stkframe, f *_func, needArgMap bool) (arglen uintptr, argmap *bitvector) {
+	arglen = uintptr(f.args)
 	if needArgMap && f.args == _ArgsSizeUnknown {
 		// Extract argument bitmaps for reflect stubs from the calls they made to reflect.
 		switch funcname(f) {
 		case "reflect.makeFuncStub", "reflect.methodValueCall":
-			arg0 := frame.sp
-			if usesLR {
-				arg0 += ptrSize
-			}
+			arg0 := frame.sp + sys.MinFrameSize
 			fn := *(**[2]uintptr)(unsafe.Pointer(arg0))
 			if fn[0] != f.entry {
 				print("runtime: confused by ", funcname(f), "\n")
 				throw("reflect mismatch")
 			}
 			bv := (*bitvector)(unsafe.Pointer(fn[1]))
-			frame.arglen = uintptr(bv.n * ptrSize)
-			frame.argmap = bv
+			arglen = uintptr(bv.n * sys.PtrSize)
+			argmap = bv
 		}
 	}
+	return
+}
+
+// tracebackCgoContext handles tracing back a cgo context value, from
+// the context argument to setCgoTraceback, for the gentraceback
+// function. It returns the new value of n.
+func tracebackCgoContext(pcbuf *uintptr, printing bool, ctxt uintptr, n, max int) int {
+	var cgoPCs [32]uintptr
+	cgoContextPCs(ctxt, cgoPCs[:])
+	var arg cgoSymbolizerArg
+	anySymbolized := false
+	for _, pc := range cgoPCs {
+		if pc == 0 || n >= max {
+			break
+		}
+		if pcbuf != nil {
+			(*[1 << 20]uintptr)(unsafe.Pointer(pcbuf))[n] = pc
+		}
+		if printing {
+			if cgoSymbolizer == nil {
+				print("non-Go function at pc=", hex(pc), "\n")
+			} else {
+				c := printOneCgoTraceback(pc, max-n, &arg)
+				n += c - 1 // +1 a few lines down
+				anySymbolized = true
+			}
+		}
+		n++
+	}
+	if anySymbolized {
+		arg.pc = 0
+		callCgoSymbolizer(&arg)
+	}
+	return n
 }
 
 func printcreatedby(gp *g) {
@@ -514,7 +607,7 @@
 		print("created by ", funcname(f), "\n")
 		tracepc := pc // back up to CALL instruction for funcline.
 		if pc > f.entry {
-			tracepc -= _PCQuantum
+			tracepc -= sys.PCQuantum
 		}
 		file, line := funcline(f, tracepc)
 		print("\t", file, ":", line)
@@ -540,6 +633,22 @@
 }
 
 func traceback1(pc, sp, lr uintptr, gp *g, flags uint) {
+	// If the goroutine is in cgo, and we have a cgo traceback, print that.
+	if iscgo && gp.m != nil && gp.m.ncgo > 0 && gp.syscallsp != 0 && gp.m.cgoCallers != nil && gp.m.cgoCallers[0] != 0 {
+		// Lock cgoCallers so that a signal handler won't
+		// change it, copy the array, reset it, unlock it.
+		// We are locked to the thread and are not running
+		// concurrently with a signal handler.
+		// We just have to stop a signal handler from interrupting
+		// in the middle of our copy.
+		atomic.Store(&gp.m.cgoCallersUse, 1)
+		cgoCallers := *gp.m.cgoCallers
+		gp.m.cgoCallers[0] = 0
+		atomic.Store(&gp.m.cgoCallersUse, 0)
+
+		printCgoTraceback(&cgoCallers)
+	}
+
 	var n int
 	if readgstatus(gp)&^_Gscan == _Gsyscall {
 		// Override registers if blocked in system call.
@@ -561,7 +670,7 @@
 
 func callers(skip int, pcbuf []uintptr) int {
 	sp := getcallersp(unsafe.Pointer(&skip))
-	pc := uintptr(getcallerpc(unsafe.Pointer(&skip)))
+	pc := getcallerpc(unsafe.Pointer(&skip))
 	gp := getg()
 	var n int
 	systemstack(func() {
@@ -579,17 +688,17 @@
 	if g.m.throwing > 0 && gp != nil && (gp == g.m.curg || gp == g.m.caughtsig.ptr()) {
 		return true
 	}
-	traceback := gotraceback(nil)
+	level, _, _ := gotraceback()
 	name := funcname(f)
 
-	// Special case: always show runtime.panic frame, so that we can
+	// Special case: always show runtime.gopanic frame, so that we can
 	// see where a panic started in the middle of a stack trace.
 	// See golang.org/issue/5832.
-	if name == "runtime.panic" {
+	if name == "runtime.gopanic" {
 		return true
 	}
 
-	return traceback > 1 || f != nil && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name))
+	return level > 1 || f != nil && contains(name, ".") && (!hasprefix(name, "runtime.") || isExportedRuntime(name))
 }
 
 // isExportedRuntime reports whether name is an exported runtime function.
@@ -606,45 +715,37 @@
 	_Gsyscall:   "syscall",
 	_Gwaiting:   "waiting",
 	_Gdead:      "dead",
-	_Genqueue:   "enqueue",
 	_Gcopystack: "copystack",
 }
 
-var gScanStatusStrings = [...]string{
-	0:          "scan",
-	_Grunnable: "scanrunnable",
-	_Grunning:  "scanrunning",
-	_Gsyscall:  "scansyscall",
-	_Gwaiting:  "scanwaiting",
-	_Gdead:     "scandead",
-	_Genqueue:  "scanenqueue",
-}
-
 func goroutineheader(gp *g) {
 	gpstatus := readgstatus(gp)
 
+	isScan := gpstatus&_Gscan != 0
+	gpstatus &^= _Gscan // drop the scan bit
+
 	// Basic string status
 	var status string
 	if 0 <= gpstatus && gpstatus < uint32(len(gStatusStrings)) {
 		status = gStatusStrings[gpstatus]
-	} else if gpstatus&_Gscan != 0 && 0 <= gpstatus&^_Gscan && gpstatus&^_Gscan < uint32(len(gStatusStrings)) {
-		status = gStatusStrings[gpstatus&^_Gscan]
 	} else {
 		status = "???"
 	}
 
 	// Override.
-	if (gpstatus == _Gwaiting || gpstatus == _Gscanwaiting) && gp.waitreason != "" {
+	if gpstatus == _Gwaiting && gp.waitreason != "" {
 		status = gp.waitreason
 	}
 
 	// approx time the G is blocked, in minutes
 	var waitfor int64
-	gpstatus &^= _Gscan // drop the scan bit
 	if (gpstatus == _Gwaiting || gpstatus == _Gsyscall) && gp.waitsince != 0 {
 		waitfor = (nanotime() - gp.waitsince) / 60e9
 	}
 	print("goroutine ", gp.goid, " [", status)
+	if isScan {
+		print(" (scan)")
+	}
 	if waitfor >= 1 {
 		print(", ", waitfor, " minutes")
 	}
@@ -655,7 +756,7 @@
 }
 
 func tracebackothers(me *g) {
-	level := gotraceback(nil)
+	level, _, _ := gotraceback()
 
 	// Show the current goroutine first, if we haven't already.
 	g := getg()
@@ -675,7 +776,7 @@
 		goroutineheader(gp)
 		// Note: gp.m == g.m occurs when tracebackothers is
 		// called from a signal handler initiated during a
-		// systemstack call.  The original G is still in the
+		// systemstack call. The original G is still in the
 		// running state, and we want to print its stack.
 		if gp.m != g.m && readgstatus(gp)&^_Gscan == _Grunning {
 			print("\tgoroutine running on other thread; stack unavailable\n")
@@ -703,9 +804,297 @@
 func isSystemGoroutine(gp *g) bool {
 	pc := gp.startpc
 	return pc == runfinqPC && !fingRunning ||
-		pc == backgroundgcPC ||
 		pc == bgsweepPC ||
 		pc == forcegchelperPC ||
 		pc == timerprocPC ||
 		pc == gcBgMarkWorkerPC
 }
+
+// SetCgoTraceback records three C functions to use to gather
+// traceback information from C code and to convert that traceback
+// information into symbolic information. These are used when printing
+// stack traces for a program that uses cgo.
+//
+// The traceback and context functions may be called from a signal
+// handler, and must therefore use only async-signal safe functions.
+// The symbolizer function may be called while the program is
+// crashing, and so must be cautious about using memory. None of the
+// functions may call back into Go.
+//
+// The context function will be called with a single argument, a
+// pointer to a struct:
+//
+//	struct {
+//		Context uintptr
+//	}
+//
+// In C syntax, this struct will be
+//
+//	struct {
+//		uintptr_t Context;
+//	};
+//
+// If the Context field is 0, the context function is being called to
+// record the current traceback context. It should record in the
+// Context field whatever information is needed about the current
+// point of execution to later produce a stack trace, probably the
+// stack pointer and PC. In this case the context function will be
+// called from C code.
+//
+// If the Context field is not 0, then it is a value returned by a
+// previous call to the context function. This case is called when the
+// context is no longer needed; that is, when the Go code is returning
+// to its C code caller. This permits the context function to release
+// any associated resources.
+//
+// While it would be correct for the context function to record a
+// complete stack trace whenever it is called, and simply copy that
+// out in the traceback function, in a typical program the context
+// function will be called many times without ever recording a
+// traceback for that context. Recording a complete stack trace in a
+// call to the context function is likely to be inefficient.
+//
+// The traceback function will be called with a single argument, a
+// pointer to a struct:
+//
+//	struct {
+//		Context    uintptr
+//		SigContext uintptr
+//		Buf        *uintptr
+//		Max        uintptr
+//	}
+//
+// In C syntax, this struct will be
+//
+//	struct {
+//		uintptr_t  Context;
+//		uintptr_t  SigContext;
+//		uintptr_t* Buf;
+//		uintptr_t  Max;
+//	};
+//
+// The Context field will be zero to gather a traceback from the
+// current program execution point. In this case, the traceback
+// function will be called from C code.
+//
+// Otherwise Context will be a value previously returned by a call to
+// the context function. The traceback function should gather a stack
+// trace from that saved point in the program execution. The traceback
+// function may be called from an execution thread other than the one
+// that recorded the context, but only when the context is known to be
+// valid and unchanging. The traceback function may also be called
+// deeper in the call stack on the same thread that recorded the
+// context. The traceback function may be called multiple times with
+// the same Context value; it will usually be appropriate to cache the
+// result, if possible, the first time this is called for a specific
+// context value.
+//
+// If the traceback function is called from a signal handler on a Unix
+// system, SigContext will be the signal context argument passed to
+// the signal handler (a C ucontext_t* cast to uintptr_t). This may be
+// used to start tracing at the point where the signal occurred. If
+// the traceback function is not called from a signal handler,
+// SigContext will be zero.
+//
+// Buf is where the traceback information should be stored. It should
+// be PC values, such that Buf[0] is the PC of the caller, Buf[1] is
+// the PC of that function's caller, and so on. Max is the maximum
+// number of entries to store. The function should store a zero to
+// indicate the top of the stack, or that the caller is on a different
+// stack, presumably a Go stack.
+//
+// Unlike runtime.Callers, the PC values returned should, when passed
+// to the symbolizer function, return the file/line of the call
+// instruction. No additional subtraction is required or appropriate.
+//
+// The symbolizer function will be called with a single argument, a
+// pointer to a struct:
+//
+//	struct {
+//		PC      uintptr // program counter to fetch information for
+//		File    *byte   // file name (NUL terminated)
+//		Lineno  uintptr // line number
+//		Func    *byte   // function name (NUL terminated)
+//		Entry   uintptr // function entry point
+//		More    uintptr // set non-zero if more info for this PC
+//		Data    uintptr // unused by runtime, available for function
+//	}
+//
+// In C syntax, this struct will be
+//
+//	struct {
+//		uintptr_t PC;
+//		char*     File;
+//		uintptr_t Lineno;
+//		char*     Func;
+//		uintptr_t Entry;
+//		uintptr_t More;
+//		uintptr_t Data;
+//	};
+//
+// The PC field will be a value returned by a call to the traceback
+// function.
+//
+// The first time the function is called for a particular traceback,
+// all the fields except PC will be 0. The function should fill in the
+// other fields if possible, setting them to 0/nil if the information
+// is not available. The Data field may be used to store any useful
+// information across calls. The More field should be set to non-zero
+// if there is more information for this PC, zero otherwise. If More
+// is set non-zero, the function will be called again with the same
+// PC, and may return different information (this is intended for use
+// with inlined functions). If More is zero, the function will be
+// called with the next PC value in the traceback. When the traceback
+// is complete, the function will be called once more with PC set to
+// zero; this may be used to free any information. Each call will
+// leave the fields of the struct set to the same values they had upon
+// return, except for the PC field when the More field is zero. The
+// function must not keep a copy of the struct pointer between calls.
+//
+// When calling SetCgoTraceback, the version argument is the version
+// number of the structs that the functions expect to receive.
+// Currently this must be zero.
+//
+// The symbolizer function may be nil, in which case the results of
+// the traceback function will be displayed as numbers. If the
+// traceback function is nil, the symbolizer function will never be
+// called. The context function may be nil, in which case the
+// traceback function will only be called with the context field set
+// to zero. If the context function is nil, then calls from Go to C
+// to Go will not show a traceback for the C portion of the call stack.
+//
+// SetCgoTraceback should be called only once, ideally from an init function.
+func SetCgoTraceback(version int, traceback, context, symbolizer unsafe.Pointer) {
+	if version != 0 {
+		panic("unsupported version")
+	}
+
+	if cgoTraceback != nil && cgoTraceback != traceback ||
+		cgoContext != nil && cgoContext != context ||
+		cgoSymbolizer != nil && cgoSymbolizer != symbolizer {
+		panic("call SetCgoTraceback only once")
+	}
+
+	cgoTraceback = traceback
+	cgoContext = context
+	cgoSymbolizer = symbolizer
+
+	// The context function is called when a C function calls a Go
+	// function. As such it is only called by C code in runtime/cgo.
+	if _cgo_set_context_function != nil {
+		cgocall(_cgo_set_context_function, context)
+	}
+}
+
+var cgoTraceback unsafe.Pointer
+var cgoContext unsafe.Pointer
+var cgoSymbolizer unsafe.Pointer
+
+// cgoTracebackArg is the type passed to cgoTraceback.
+type cgoTracebackArg struct {
+	context    uintptr
+	sigContext uintptr
+	buf        *uintptr
+	max        uintptr
+}
+
+// cgoContextArg is the type passed to the context function.
+type cgoContextArg struct {
+	context uintptr
+}
+
+// cgoSymbolizerArg is the type passed to cgoSymbolizer.
+type cgoSymbolizerArg struct {
+	pc       uintptr
+	file     *byte
+	lineno   uintptr
+	funcName *byte
+	entry    uintptr
+	more     uintptr
+	data     uintptr
+}
+
+// cgoTraceback prints a traceback of callers.
+func printCgoTraceback(callers *cgoCallers) {
+	if cgoSymbolizer == nil {
+		for _, c := range callers {
+			if c == 0 {
+				break
+			}
+			print("non-Go function at pc=", hex(c), "\n")
+		}
+		return
+	}
+
+	var arg cgoSymbolizerArg
+	for _, c := range callers {
+		if c == 0 {
+			break
+		}
+		printOneCgoTraceback(c, 0x7fffffff, &arg)
+	}
+	arg.pc = 0
+	callCgoSymbolizer(&arg)
+}
+
+// printOneCgoTraceback prints the traceback of a single cgo caller.
+// This can print more than one line because of inlining.
+// Returns the number of frames printed.
+func printOneCgoTraceback(pc uintptr, max int, arg *cgoSymbolizerArg) int {
+	c := 0
+	arg.pc = pc
+	for {
+		if c > max {
+			break
+		}
+		callCgoSymbolizer(arg)
+		if arg.funcName != nil {
+			// Note that we don't print any argument
+			// information here, not even parentheses.
+			// The symbolizer must add that if appropriate.
+			println(gostringnocopy(arg.funcName))
+		} else {
+			println("non-Go function")
+		}
+		print("\t")
+		if arg.file != nil {
+			print(gostringnocopy(arg.file), ":", arg.lineno, " ")
+		}
+		print("pc=", hex(pc), "\n")
+		c++
+		if arg.more == 0 {
+			break
+		}
+	}
+	return c
+}
+
+// callCgoSymbolizer calls the cgoSymbolizer function.
+func callCgoSymbolizer(arg *cgoSymbolizerArg) {
+	call := cgocall
+	if panicking > 0 || getg().m.curg != getg() {
+		// We do not want to call into the scheduler when panicking
+		// or when on the system stack.
+		call = asmcgocall
+	}
+	call(cgoSymbolizer, noescape(unsafe.Pointer(arg)))
+}
+
+// cgoContextPCs gets the PC values from a cgo traceback.
+func cgoContextPCs(ctxt uintptr, buf []uintptr) {
+	if cgoTraceback == nil {
+		return
+	}
+	call := cgocall
+	if panicking > 0 || getg().m.curg != getg() {
+		// We do not want to call into the scheduler when panicking
+		// or when on the system stack.
+		call = asmcgocall
+	}
+	arg := cgoTracebackArg{
+		context: ctxt,
+		buf:     (*uintptr)(noescape(unsafe.Pointer(&buf[0]))),
+		max:     uintptr(len(buf)),
+	}
+	call(cgoTraceback, noescape(unsafe.Pointer(&arg)))
+}
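
For readers wiring this up: a program registers the three C functions once, ideally from an init function, passing their addresses through cgo. A hedged sketch follows; the C function names and their bodies are hypothetical, but the struct layouts they must read and fill are exactly the ones documented in SetCgoTraceback above:

	/*
	// Hypothetical async-signal-safe C implementations. Each receives a
	// pointer to the matching struct described in the SetCgoTraceback doc.
	void myContext(void *p);
	void myTraceback(void *p);
	void mySymbolizer(void *p);
	*/
	import "C"

	import (
		"runtime"
		"unsafe"
	)

	func init() {
		// version must currently be 0; the symbolizer (or context) may be nil.
		runtime.SetCgoTraceback(0,
			unsafe.Pointer(C.myTraceback),
			unsafe.Pointer(C.myContext),
			unsafe.Pointer(C.mySymbolizer))
	}
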
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 45bdac8..5ef11a4 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -8,14 +8,28 @@
 
 import "unsafe"
 
-// Needs to be in sync with ../cmd/internal/ld/decodesym.go:/^func.commonsize,
-// ../cmd/internal/gc/reflect.go:/^func.dcommontype and
+// tflag is documented in reflect/type.go.
+//
+// tflag values must be kept in sync with copies in:
+//	cmd/compile/internal/gc/reflect.go
+//	cmd/link/internal/ld/decodesym.go
+//	reflect/type.go
+type tflag uint8
+
+const (
+	tflagUncommon  tflag = 1 << 0
+	tflagExtraStar tflag = 1 << 1
+	tflagNamed     tflag = 1 << 2
+)
+
+// Needs to be in sync with ../cmd/link/internal/ld/decodesym.go:/^func.commonsize,
+// ../cmd/compile/internal/gc/reflect.go:/^func.dcommontype and
 // ../reflect/type.go:/^type.rtype.
 type _type struct {
 	size       uintptr
 	ptrdata    uintptr // size of memory prefix holding all pointers
 	hash       uint32
-	_unused    uint8
+	tflag      tflag
 	align      uint8
 	fieldalign uint8
 	kind       uint8
@@ -23,37 +37,285 @@
 	// gcdata stores the GC type data for the garbage collector.
 	// If the KindGCProg bit is set in kind, gcdata is a GC program.
 	// Otherwise it is a ptrmask bitmap. See mbitmap.go for details.
-	gcdata  *byte
-	_string *string
-	x       *uncommontype
-	ptrto   *_type
-	zero    *byte // ptr to the zero value for this type
+	gcdata    *byte
+	str       nameOff
+	ptrToThis typeOff
 }
 
+func (t *_type) string() string {
+	s := t.nameOff(t.str).name()
+	if t.tflag&tflagExtraStar != 0 {
+		return s[1:]
+	}
+	return s
+}
+
+func (t *_type) uncommon() *uncommontype {
+	if t.tflag&tflagUncommon == 0 {
+		return nil
+	}
+	switch t.kind & kindMask {
+	case kindStruct:
+		type u struct {
+			structtype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindPtr:
+		type u struct {
+			ptrtype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindFunc:
+		type u struct {
+			functype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindSlice:
+		type u struct {
+			slicetype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindArray:
+		type u struct {
+			arraytype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindChan:
+		type u struct {
+			chantype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindMap:
+		type u struct {
+			maptype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	case kindInterface:
+		type u struct {
+			interfacetype
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	default:
+		type u struct {
+			_type
+			u uncommontype
+		}
+		return &(*u)(unsafe.Pointer(t)).u
+	}
+}
+
+func hasPrefix(s, prefix string) bool {
+	return len(s) >= len(prefix) && s[:len(prefix)] == prefix
+}
+
+func (t *_type) name() string {
+	if t.tflag&tflagNamed == 0 {
+		return ""
+	}
+	s := t.string()
+	i := len(s) - 1
+	for i >= 0 {
+		if s[i] == '.' {
+			break
+		}
+		i--
+	}
+	return s[i+1:]
+}
+
+// reflectOffs holds type offsets defined at run time by the reflect package.
+//
+// When a type is defined at run time, its *rtype data lives on the heap.
+// There is a wide range of possible addresses the heap may use, which
+// may not be representable as a 32-bit offset. Moreover the GC may
+// one day start moving heap memory, in which case there is no stable
+// offset that can be defined.
+//
+// To provide stable offsets, we pin *rtype objects in a global map
+// and treat the offset as an identifier. We use negative offsets that
+// do not overlap with any compile-time module offsets.
+//
+// Entries are created by reflect.addReflectOff.
+var reflectOffs struct {
+	lock mutex
+	next int32
+	m    map[int32]unsafe.Pointer
+	minv map[unsafe.Pointer]int32
+}
+
+func reflectOffsLock() {
+	lock(&reflectOffs.lock)
+	if raceenabled {
+		raceacquire(unsafe.Pointer(&reflectOffs.lock))
+	}
+}
+
+func reflectOffsUnlock() {
+	if raceenabled {
+		racerelease(unsafe.Pointer(&reflectOffs.lock))
+	}
+	unlock(&reflectOffs.lock)
+}
+
+func resolveNameOff(ptrInModule unsafe.Pointer, off nameOff) name {
+	if off == 0 {
+		return name{}
+	}
+	base := uintptr(ptrInModule)
+	for md := &firstmoduledata; md != nil; md = md.next {
+		if base >= md.types && base < md.etypes {
+			res := md.types + uintptr(off)
+			if res > md.etypes {
+				println("runtime: nameOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes))
+				throw("runtime: name offset out of range")
+			}
+			return name{(*byte)(unsafe.Pointer(res))}
+		}
+	}
+
+	// No module found. See if it is a run-time name.
+	reflectOffsLock()
+	res, found := reflectOffs.m[int32(off)]
+	reflectOffsUnlock()
+	if !found {
+		println("runtime: nameOff", hex(off), "base", hex(base), "not in ranges:")
+		for next := &firstmoduledata; next != nil; next = next.next {
+			println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+		}
+		throw("runtime: name offset base pointer out of range")
+	}
+	return name{(*byte)(res)}
+}
+
+func (t *_type) nameOff(off nameOff) name {
+	return resolveNameOff(unsafe.Pointer(t), off)
+}
+
+func (t *_type) typeOff(off typeOff) *_type {
+	if off == 0 {
+		return nil
+	}
+	base := uintptr(unsafe.Pointer(t))
+	var md *moduledata
+	for next := &firstmoduledata; next != nil; next = next.next {
+		if base >= next.types && base < next.etypes {
+			md = next
+			break
+		}
+	}
+	if md == nil {
+		reflectOffsLock()
+		res := reflectOffs.m[int32(off)]
+		reflectOffsUnlock()
+		if res == nil {
+			println("runtime: typeOff", hex(off), "base", hex(base), "not in ranges:")
+			for next := &firstmoduledata; next != nil; next = next.next {
+				println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+			}
+			throw("runtime: type offset base pointer out of range")
+		}
+		return (*_type)(res)
+	}
+	if t := md.typemap[off]; t != nil {
+		return t
+	}
+	res := md.types + uintptr(off)
+	if res > md.etypes {
+		println("runtime: typeOff", hex(off), "out of range", hex(md.types), "-", hex(md.etypes))
+		throw("runtime: type offset out of range")
+	}
+	return (*_type)(unsafe.Pointer(res))
+}
+
+func (t *_type) textOff(off textOff) unsafe.Pointer {
+	base := uintptr(unsafe.Pointer(t))
+	var md *moduledata
+	for next := &firstmoduledata; next != nil; next = next.next {
+		if base >= next.types && base < next.etypes {
+			md = next
+			break
+		}
+	}
+	if md == nil {
+		reflectOffsLock()
+		res := reflectOffs.m[int32(off)]
+		reflectOffsUnlock()
+		if res == nil {
+			println("runtime: textOff", hex(off), "base", hex(base), "not in ranges:")
+			for next := &firstmoduledata; next != nil; next = next.next {
+				println("\ttypes", hex(next.types), "etypes", hex(next.etypes))
+			}
+			throw("runtime: text offset base pointer out of range")
+		}
+		return res
+	}
+	res := md.text + uintptr(off)
+	if res > md.etext {
+		println("runtime: textOff", hex(off), "out of range", hex(md.text), "-", hex(md.etext))
+		throw("runtime: text offset out of range")
+	}
+	return unsafe.Pointer(res)
+}
+
+func (t *functype) in() []*_type {
+	// See funcType in reflect/type.go for details on data layout.
+	uadd := uintptr(unsafe.Sizeof(functype{}))
+	if t.typ.tflag&tflagUncommon != 0 {
+		uadd += unsafe.Sizeof(uncommontype{})
+	}
+	return (*[1 << 20]*_type)(add(unsafe.Pointer(t), uadd))[:t.inCount]
+}
+
+func (t *functype) out() []*_type {
+	// See funcType in reflect/type.go for details on data layout.
+	uadd := uintptr(unsafe.Sizeof(functype{}))
+	if t.typ.tflag&tflagUncommon != 0 {
+		uadd += unsafe.Sizeof(uncommontype{})
+	}
+	outCount := t.outCount & (1<<15 - 1)
+	return (*[1 << 20]*_type)(add(unsafe.Pointer(t), uadd))[t.inCount : t.inCount+outCount]
+}
+
+func (t *functype) dotdotdot() bool {
+	return t.outCount&(1<<15) != 0
+}
+
+type nameOff int32
+type typeOff int32
+type textOff int32
+
 type method struct {
-	name    *string
-	pkgpath *string
-	mtyp    *_type
-	typ     *_type
-	ifn     unsafe.Pointer
-	tfn     unsafe.Pointer
+	name nameOff
+	mtyp typeOff
+	ifn  textOff
+	tfn  textOff
 }
 
 type uncommontype struct {
-	name    *string
-	pkgpath *string
-	mhdr    []method
+	pkgpath nameOff
+	mcount  uint16 // number of methods
+	_       uint16 // unused
+	moff    uint32 // offset from this uncommontype to [mcount]method
+	_       uint32 // unused
 }
 
 type imethod struct {
-	name    *string
-	pkgpath *string
-	_type   *_type
+	name nameOff
+	ityp typeOff
 }
 
 type interfacetype struct {
-	typ  _type
-	mhdr []imethod
+	typ     _type
+	pkgpath name
+	mhdr    []imethod
 }
 
 type maptype struct {
@@ -68,6 +330,14 @@
 	indirectvalue bool   // store ptr to value instead of value itself
 	bucketsize    uint16 // size of bucket
 	reflexivekey  bool   // true if k==k for all keys
+	needkeyupdate bool   // true if we need to update key on an overwrite
+}
+
+type arraytype struct {
+	typ   _type
+	elem  *_type
+	slice *_type
+	len   uintptr
 }
 
 type chantype struct {
@@ -82,13 +352,281 @@
 }
 
 type functype struct {
-	typ       _type
-	dotdotdot bool
-	in        slice
-	out       slice
+	typ      _type
+	inCount  uint16
+	outCount uint16
 }
 
 type ptrtype struct {
 	typ  _type
 	elem *_type
 }
+
+type structfield struct {
+	name   name
+	typ    *_type
+	offset uintptr
+}
+
+type structtype struct {
+	typ     _type
+	pkgPath name
+	fields  []structfield
+}
+
+// name is an encoded type name with optional extra data.
+// See reflect/type.go for details.
+type name struct {
+	bytes *byte
+}
+
+func (n name) data(off int) *byte {
+	return (*byte)(add(unsafe.Pointer(n.bytes), uintptr(off)))
+}
+
+func (n name) isExported() bool {
+	return (*n.bytes)&(1<<0) != 0
+}
+
+func (n name) nameLen() int {
+	return int(uint16(*n.data(1))<<8 | uint16(*n.data(2)))
+}
+
+func (n name) tagLen() int {
+	if *n.data(0)&(1<<1) == 0 {
+		return 0
+	}
+	off := 3 + n.nameLen()
+	return int(uint16(*n.data(off))<<8 | uint16(*n.data(off + 1)))
+}
+
+func (n name) name() (s string) {
+	if n.bytes == nil {
+		return ""
+	}
+	nl := n.nameLen()
+	if nl == 0 {
+		return ""
+	}
+	hdr := (*stringStruct)(unsafe.Pointer(&s))
+	hdr.str = unsafe.Pointer(n.data(3))
+	hdr.len = nl
+	return s
+}
+
+func (n name) tag() (s string) {
+	tl := n.tagLen()
+	if tl == 0 {
+		return ""
+	}
+	nl := n.nameLen()
+	hdr := (*stringStruct)(unsafe.Pointer(&s))
+	hdr.str = unsafe.Pointer(n.data(3 + nl + 2))
+	hdr.len = tl
+	return s
+}
+
+func (n name) pkgPath() string {
+	if n.bytes == nil || *n.data(0)&(1<<2) == 0 {
+		return ""
+	}
+	off := 3 + n.nameLen()
+	if tl := n.tagLen(); tl > 0 {
+		off += 2 + tl
+	}
+	var nameOff nameOff
+	copy((*[4]byte)(unsafe.Pointer(&nameOff))[:], (*[4]byte)(unsafe.Pointer(n.data(off)))[:])
+	pkgPathName := resolveNameOff(unsafe.Pointer(n.bytes), nameOff)
+	return pkgPathName.name()
+}
+
+// typelinksinit scans the types from extra modules and builds the
+// moduledata typemap used to de-duplicate type pointers.
+func typelinksinit() {
+	if firstmoduledata.next == nil {
+		return
+	}
+	typehash := make(map[uint32][]*_type)
+
+	modules := []*moduledata{}
+	for md := &firstmoduledata; md != nil; md = md.next {
+		modules = append(modules, md)
+	}
+	prev, modules := modules[len(modules)-1], modules[:len(modules)-1]
+	for len(modules) > 0 {
+		// Collect types from the previous module into typehash.
+	collect:
+		for _, tl := range prev.typelinks {
+			var t *_type
+			if prev.typemap == nil {
+				t = (*_type)(unsafe.Pointer(prev.types + uintptr(tl)))
+			} else {
+				t = prev.typemap[typeOff(tl)]
+			}
+			// Add to typehash if not seen before.
+			tlist := typehash[t.hash]
+			for _, tcur := range tlist {
+				if tcur == t {
+					continue collect
+				}
+			}
+			typehash[t.hash] = append(tlist, t)
+		}
+
+		// If any of this module's typelinks match a type from a
+		// prior module, prefer that prior type by adding the offset
+		// to this module's typemap.
+		md := modules[len(modules)-1]
+		md.typemap = make(map[typeOff]*_type, len(md.typelinks))
+		for _, tl := range md.typelinks {
+			t := (*_type)(unsafe.Pointer(md.types + uintptr(tl)))
+			for _, candidate := range typehash[t.hash] {
+				if typesEqual(t, candidate) {
+					t = candidate
+					break
+				}
+			}
+			md.typemap[typeOff(tl)] = t
+		}
+
+		prev, modules = md, modules[:len(modules)-1]
+	}
+}
+
+// typesEqual reports whether two types are equal.
+//
+// Everywhere in the runtime and reflect packages, it is assumed that
+// there is exactly one *_type per Go type, so that pointer equality
+// can be used to test if types are equal. There is one place that
+// breaks this assumption: buildmode=shared. In this case a type can
+// appear as two different pieces of memory. This is hidden from the
+// runtime and reflect package by the per-module typemap built in
+// typelinksinit. It uses typesEqual to map types from later modules
+// back into earlier ones.
+//
+// Only typelinksinit needs this function.
+func typesEqual(t, v *_type) bool {
+	if t == v {
+		return true
+	}
+	kind := t.kind & kindMask
+	if kind != v.kind&kindMask {
+		return false
+	}
+	if t.string() != v.string() {
+		return false
+	}
+	ut := t.uncommon()
+	uv := v.uncommon()
+	if ut != nil || uv != nil {
+		if ut == nil || uv == nil {
+			return false
+		}
+		pkgpatht := t.nameOff(ut.pkgpath).name()
+		pkgpathv := v.nameOff(uv.pkgpath).name()
+		if pkgpatht != pkgpathv {
+			return false
+		}
+	}
+	if kindBool <= kind && kind <= kindComplex128 {
+		return true
+	}
+	switch kind {
+	case kindString, kindUnsafePointer:
+		return true
+	case kindArray:
+		at := (*arraytype)(unsafe.Pointer(t))
+		av := (*arraytype)(unsafe.Pointer(v))
+		return typesEqual(at.elem, av.elem) && at.len == av.len
+	case kindChan:
+		ct := (*chantype)(unsafe.Pointer(t))
+		cv := (*chantype)(unsafe.Pointer(v))
+		return ct.dir == cv.dir && typesEqual(ct.elem, cv.elem)
+	case kindFunc:
+		ft := (*functype)(unsafe.Pointer(t))
+		fv := (*functype)(unsafe.Pointer(v))
+		if ft.outCount != fv.outCount || ft.inCount != fv.inCount {
+			return false
+		}
+		tin, vin := ft.in(), fv.in()
+		for i := 0; i < len(tin); i++ {
+			if !typesEqual(tin[i], vin[i]) {
+				return false
+			}
+		}
+		tout, vout := ft.out(), fv.out()
+		for i := 0; i < len(tout); i++ {
+			if !typesEqual(tout[i], vout[i]) {
+				return false
+			}
+		}
+		return true
+	case kindInterface:
+		it := (*interfacetype)(unsafe.Pointer(t))
+		iv := (*interfacetype)(unsafe.Pointer(v))
+		if it.pkgpath.name() != iv.pkgpath.name() {
+			return false
+		}
+		if len(it.mhdr) != len(iv.mhdr) {
+			return false
+		}
+		for i := range it.mhdr {
+			tm := &it.mhdr[i]
+			vm := &iv.mhdr[i]
+			tname := it.typ.nameOff(tm.name)
+			vname := iv.typ.nameOff(vm.name)
+			if tname.name() != vname.name() {
+				return false
+			}
+			if tname.pkgPath() != vname.pkgPath() {
+				return false
+			}
+			if !typesEqual(it.typ.typeOff(tm.ityp), iv.typ.typeOff(vm.ityp)) {
+				return false
+			}
+		}
+		return true
+	case kindMap:
+		mt := (*maptype)(unsafe.Pointer(t))
+		mv := (*maptype)(unsafe.Pointer(v))
+		return typesEqual(mt.key, mv.key) && typesEqual(mt.elem, mv.elem)
+	case kindPtr:
+		pt := (*ptrtype)(unsafe.Pointer(t))
+		pv := (*ptrtype)(unsafe.Pointer(v))
+		return typesEqual(pt.elem, pv.elem)
+	case kindSlice:
+		st := (*slicetype)(unsafe.Pointer(t))
+		sv := (*slicetype)(unsafe.Pointer(v))
+		return typesEqual(st.elem, sv.elem)
+	case kindStruct:
+		st := (*structtype)(unsafe.Pointer(t))
+		sv := (*structtype)(unsafe.Pointer(v))
+		if len(st.fields) != len(sv.fields) {
+			return false
+		}
+		for i := range st.fields {
+			tf := &st.fields[i]
+			vf := &sv.fields[i]
+			if tf.name.name() != vf.name.name() {
+				return false
+			}
+			if tf.name.pkgPath() != vf.name.pkgPath() {
+				return false
+			}
+			if !typesEqual(tf.typ, vf.typ) {
+				return false
+			}
+			if tf.name.tag() != vf.name.tag() {
+				return false
+			}
+			if tf.offset != vf.offset {
+				return false
+			}
+		}
+		return true
+	default:
+		println("runtime: impossible type kind", kind)
+		throw("runtime: impossible type kind")
+		return false
+	}
+}
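
Note: once typelinksinit has run, a later module must consult its typemap
before using a typelink offset, so that pointer equality on *_type keeps
working across modules. A minimal sketch of that resolution step, mirroring
the prev.typemap branch above (resolveType is a hypothetical helper, not
part of this change):

	func resolveType(md *moduledata, tl int32) *_type {
		if md.typemap != nil {
			// Canonical pointer chosen by typelinksinit,
			// possibly pointing into an earlier module.
			return md.typemap[typeOff(tl)]
		}
		// First module: its types are canonical and used in place.
		return (*_type)(unsafe.Pointer(md.types + uintptr(tl)))
	}
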
diff --git a/src/runtime/typekind.go b/src/runtime/typekind.go
index b64ec44..abb2777 100644
--- a/src/runtime/typekind.go
+++ b/src/runtime/typekind.go
@@ -1,41 +1,41 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
 package runtime
 
 const (
-	kindBool          = _KindBool
-	kindInt           = _KindInt
-	kindInt8          = _KindInt8
-	kindInt16         = _KindInt16
-	kindInt32         = _KindInt32
-	kindInt64         = _KindInt64
-	kindUint          = _KindUint
-	kindUint8         = _KindUint8
-	kindUint16        = _KindUint16
-	kindUint32        = _KindUint32
-	kindUint64        = _KindUint64
-	kindUintptr       = _KindUintptr
-	kindFloat32       = _KindFloat32
-	kindFloat64       = _KindFloat64
-	kindComplex64     = _KindComplex64
-	kindComplex128    = _KindComplex128
-	kindArray         = _KindArray
-	kindChan          = _KindChan
-	kindFunc          = _KindFunc
-	kindInterface     = _KindInterface
-	kindMap           = _KindMap
-	kindPtr           = _KindPtr
-	kindSlice         = _KindSlice
-	kindString        = _KindString
-	kindStruct        = _KindStruct
-	kindUnsafePointer = _KindUnsafePointer
+	kindBool = 1 + iota
+	kindInt
+	kindInt8
+	kindInt16
+	kindInt32
+	kindInt64
+	kindUint
+	kindUint8
+	kindUint16
+	kindUint32
+	kindUint64
+	kindUintptr
+	kindFloat32
+	kindFloat64
+	kindComplex64
+	kindComplex128
+	kindArray
+	kindChan
+	kindFunc
+	kindInterface
+	kindMap
+	kindPtr
+	kindSlice
+	kindString
+	kindStruct
+	kindUnsafePointer
 
-	kindDirectIface = _KindDirectIface
-	kindGCProg      = _KindGCProg
-	kindNoPointers  = _KindNoPointers
-	kindMask        = _KindMask
+	kindDirectIface = 1 << 5
+	kindGCProg      = 1 << 6
+	kindNoPointers  = 1 << 7
+	kindMask        = (1 << 5) - 1
 )
 
 // isDirectIface reports whether t is stored directly in an interface value.
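
Note: with the _Kind* indirection gone, the constants encode the layout
directly: the low five bits hold the kind enum and the top three bits are
flags. A short sketch of the decoding, using only the constants defined
above (decodeKind is a hypothetical helper; the runtime does this inline):

	func decodeKind(k uint8) (kind uint8, direct, noScan bool) {
		kind = k & kindMask             // kindBool..kindUnsafePointer
		direct = k&kindDirectIface != 0 // value stored directly in interface word
		noScan = k&kindNoPointers != 0  // GC can skip scanning this type
		return
	}
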
diff --git a/src/runtime/typekind1.go b/src/runtime/typekind1.go
deleted file mode 100644
index 73028d6..0000000
--- a/src/runtime/typekind1.go
+++ /dev/null
@@ -1,39 +0,0 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package runtime
-
-const (
-	_KindBool = 1 + iota
-	_KindInt
-	_KindInt8
-	_KindInt16
-	_KindInt32
-	_KindInt64
-	_KindUint
-	_KindUint8
-	_KindUint16
-	_KindUint32
-	_KindUint64
-	_KindUintptr
-	_KindFloat32
-	_KindFloat64
-	_KindComplex64
-	_KindComplex128
-	_KindArray
-	_KindChan
-	_KindFunc
-	_KindInterface
-	_KindMap
-	_KindPtr
-	_KindSlice
-	_KindString
-	_KindStruct
-	_KindUnsafePointer
-
-	_KindDirectIface = 1 << 5
-	_KindGCProg      = 1 << 6 // Type.gc points to GC program
-	_KindNoPointers  = 1 << 7
-	_KindMask        = (1 << 5) - 1
-)
diff --git a/src/runtime/unaligned1.go b/src/runtime/unaligned1.go
index d3d6c70..754d63b 100644
--- a/src/runtime/unaligned1.go
+++ b/src/runtime/unaligned1.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build 386 amd64 amd64p32 arm64
+// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le s390x
 
 package runtime
 
diff --git a/src/runtime/unaligned2.go b/src/runtime/unaligned2.go
index 4fc7917..fed3cca 100644
--- a/src/runtime/unaligned2.go
+++ b/src/runtime/unaligned2.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build arm ppc64 ppc64le
+// +build arm mips64 mips64le
 
 package runtime
 
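
Note: the split between unaligned1.go and unaligned2.go is the
unaligned-load strategy. The targets in unaligned1.go (now also ppc64,
ppc64le, s390x) tolerate a misaligned dereference, while the
strict-alignment targets in unaligned2.go (now arm, mips64, mips64le)
assemble values byte by byte. Roughly, the two shapes of readUnaligned32,
shown as a sketch and assuming the little-endian byte assembly these
helpers use:

	// unaligned1.go targets: the hardware tolerates misaligned loads.
	func readUnaligned32(p unsafe.Pointer) uint32 {
		return *(*uint32)(p)
	}

	// unaligned2.go targets: byte-wise load.
	func readUnaligned32(p unsafe.Pointer) uint32 {
		q := (*[4]byte)(p)
		return uint32(q[0]) | uint32(q[1])<<8 | uint32(q[2])<<16 | uint32(q[3])<<24
	}
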
diff --git a/src/runtime/vdso_linux_amd64.go b/src/runtime/vdso_linux_amd64.go
index 2440015..8a970df 100644
--- a/src/runtime/vdso_linux_amd64.go
+++ b/src/runtime/vdso_linux_amd64.go
@@ -18,9 +18,7 @@
 // http://refspecs.linuxfoundation.org/LSB_3.2.0/LSB-Core-generic/LSB-Core-generic/symversion.html
 
 const (
-	_AT_RANDOM       = 25
 	_AT_SYSINFO_EHDR = 33
-	_AT_NULL         = 0 /* End of vector */
 
 	_PT_LOAD    = 1 /* Loadable program segment */
 	_PT_DYNAMIC = 2 /* Dynamic linking information */
@@ -260,7 +258,7 @@
 		def = (*elf64Verdef)(add(unsafe.Pointer(def), uintptr(def.vd_next)))
 	}
 
-	return -1 // can not match any version
+	return -1 // cannot match any version
 }
 
 func vdso_parse_symbols(info *vdso_info, version int32) {
@@ -291,37 +289,18 @@
 	}
 }
 
-func sysargs(argc int32, argv **byte) {
-	n := argc + 1
-
-	// skip envp to get to ELF auxiliary vector.
-	for argv_index(argv, n) != nil {
-		n++
-	}
-
-	// skip NULL separator
-	n++
-
-	// now argv+n is auxv
-	auxv := (*[1 << 32]elf64Auxv)(add(unsafe.Pointer(argv), uintptr(n)*ptrSize))
-
-	for i := 0; auxv[i].a_type != _AT_NULL; i++ {
-		av := &auxv[i]
-		switch av.a_type {
-		case _AT_SYSINFO_EHDR:
-			if av.a_val == 0 {
-				// Something went wrong
-				continue
-			}
-			var info vdso_info
-			// TODO(rsc): I don't understand why the compiler thinks info escapes
-			// when passed to the three functions below.
-			info1 := (*vdso_info)(noescape(unsafe.Pointer(&info)))
-			vdso_init_from_sysinfo_ehdr(info1, (*elf64Ehdr)(unsafe.Pointer(uintptr(av.a_val))))
-			vdso_parse_symbols(info1, vdso_find_version(info1, &linux26))
-
-		case _AT_RANDOM:
-			startupRandomData = (*[16]byte)(unsafe.Pointer(uintptr(av.a_val)))[:]
+func archauxv(tag, val uintptr) {
+	switch tag {
+	case _AT_SYSINFO_EHDR:
+		if val == 0 {
+			// Something went wrong
+			return
 		}
+		var info vdso_info
+		// TODO(rsc): I don't understand why the compiler thinks info escapes
+		// when passed to the three functions below.
+		info1 := (*vdso_info)(noescape(unsafe.Pointer(&info)))
+		vdso_init_from_sysinfo_ehdr(info1, (*elf64Ehdr)(unsafe.Pointer(val)))
+		vdso_parse_symbols(info1, vdso_find_version(info1, &linux26))
 	}
 }
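
Note: the auxv walk itself is not gone; it moves to generic startup code in
os_linux.go, which visits each tag/value pair once and hands it to the
per-arch archauxv hook defined above. A sketch of the assumed caller shape
(names taken from the removed code; the generic walker is not part of this
hunk):

	func sysauxv(auxv []uintptr) {
		for i := 0; auxv[i] != _AT_NULL; i += 2 {
			tag, val := auxv[i], auxv[i+1]
			switch tag {
			case _AT_RANDOM:
				// The kernel provides 16 random bytes.
				startupRandomData = (*[16]byte)(unsafe.Pointer(val))[:]
			}
			archauxv(tag, val)
		}
	}
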
diff --git a/src/runtime/vdso_none.go b/src/runtime/vdso_none.go
index 93bd91c..efae23f 100644
--- a/src/runtime/vdso_none.go
+++ b/src/runtime/vdso_none.go
@@ -1,10 +1,8 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-// +build !linux !amd64
-// +build !linux !386
-// +build !linux !arm
+// +build !linux
 
 package runtime
 
diff --git a/src/runtime/vlop_386.s b/src/runtime/vlop_386.s
index ce8e7d0..92232d5 100644
--- a/src/runtime/vlop_386.s
+++ b/src/runtime/vlop_386.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/vlop-386.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/vlop-386.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
diff --git a/src/runtime/vlop_arm.s b/src/runtime/vlop_arm.s
index ae1f582..338d9d5 100644
--- a/src/runtime/vlop_arm.s
+++ b/src/runtime/vlop_arm.s
@@ -1,7 +1,7 @@
 // Inferno's libkern/vlop-arm.s
 // http://code.google.com/p/inferno-os/source/browse/libkern/vlop-arm.s
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
diff --git a/src/runtime/vlop_arm_test.go b/src/runtime/vlop_arm_test.go
index 1a21119..85cea92 100644
--- a/src/runtime/vlop_arm_test.go
+++ b/src/runtime/vlop_arm_test.go
@@ -82,3 +82,47 @@
 		}
 	}
 }
+
+//go:noinline
+func armFloatWrite(a *[129]float64) {
+	// This used to miscompile on arm5.
+	// The offset is too big to fit in a load.
+	// So the code does:
+	//   ldr     r0, [sp, #8]
+	//   bl      6f690 <_sfloat>
+	//   ldr     fp, [pc, #32]   ; (address of 128.0)
+	//   vldr    d0, [fp]
+	//   ldr     fp, [pc, #28]   ; (1024)
+	//   add     fp, fp, r0
+	//   vstr    d0, [fp]
+	// The software floating-point emulator gives up on the add.
+	// This causes the store to not work.
+	// See issue 15440.
+	a[128] = 128.0
+}
+func TestArmFloatBigOffsetWrite(t *testing.T) {
+	var a [129]float64
+	for i := 0; i < 128; i++ {
+		a[i] = float64(i)
+	}
+	armFloatWrite(&a)
+	for i, x := range a {
+		if x != float64(i) {
+			t.Errorf("bad entry %d:%f\n", i, x)
+		}
+	}
+}
+
+//go:noinline
+func armFloatRead(a *[129]float64) float64 {
+	return a[128]
+}
+func TestArmFloatBigOffsetRead(t *testing.T) {
+	var a [129]float64
+	for i := 0; i < 129; i++ {
+		a[i] = float64(i)
+	}
+	if x := armFloatRead(&a); x != 128.0 {
+		t.Errorf("bad value %f\n", x)
+	}
+}
diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go
index 6370732..cd37828 100644
--- a/src/runtime/vlrt.go
+++ b/src/runtime/vlrt.go
@@ -1,7 +1,7 @@
 // Inferno's libkern/vlrt-arm.c
 // http://code.google.com/p/inferno-os/source/browse/libkern/vlrt-arm.c
 //
-//         Copyright © 1994-1999 Lucent Technologies Inc.  All rights reserved.
+//         Copyright © 1994-1999 Lucent Technologies Inc. All rights reserved.
 //         Revisions Copyright © 2000-2007 Vita Nuova Holdings Limited (www.vitanuova.com).  All rights reserved.
 //         Portions Copyright 2009 The Go Authors. All rights reserved.
 //
@@ -195,7 +195,6 @@
 	if GOARCH == "arm" {
 		// arm doesn't have a division instruction, so
 		// slowdodiv is the best that we can do.
-		// TODO: revisit for arm64.
 		return slowdodiv(n, d)
 	}
 
diff --git a/src/runtime/wbfat.go b/src/runtime/wbfat.go
deleted file mode 100644
index 8fe2cef..0000000
--- a/src/runtime/wbfat.go
+++ /dev/null
@@ -1,190 +0,0 @@
-// generated by wbfat_gen.go; use go generate
-
-package runtime
-
-//go:nosplit
-func writebarrierfat01(dst *[2]uintptr, _ uintptr, src [2]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-}
-
-//go:nosplit
-func writebarrierfat10(dst *[2]uintptr, _ uintptr, src [2]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-}
-
-//go:nosplit
-func writebarrierfat11(dst *[2]uintptr, _ uintptr, src [2]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-}
-
-//go:nosplit
-func writebarrierfat001(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-}
-
-//go:nosplit
-func writebarrierfat010(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-}
-
-//go:nosplit
-func writebarrierfat011(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-}
-
-//go:nosplit
-func writebarrierfat100(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	dst[2] = src[2]
-}
-
-//go:nosplit
-func writebarrierfat101(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-}
-
-//go:nosplit
-func writebarrierfat110(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-}
-
-//go:nosplit
-func writebarrierfat111(dst *[3]uintptr, _ uintptr, src [3]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-}
-
-//go:nosplit
-func writebarrierfat0001(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	dst[2] = src[2]
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat0010(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat0011(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat0100(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat0101(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat0110(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat0111(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	dst[0] = src[0]
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat1000(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	dst[2] = src[2]
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat1001(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	dst[2] = src[2]
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat1010(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat1011(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	dst[1] = src[1]
-	writebarrierptr(&dst[2], src[2])
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat1100(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat1101(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	dst[2] = src[2]
-	writebarrierptr(&dst[3], src[3])
-}
-
-//go:nosplit
-func writebarrierfat1110(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-	dst[3] = src[3]
-}
-
-//go:nosplit
-func writebarrierfat1111(dst *[4]uintptr, _ uintptr, src [4]uintptr) {
-	writebarrierptr(&dst[0], src[0])
-	writebarrierptr(&dst[1], src[1])
-	writebarrierptr(&dst[2], src[2])
-	writebarrierptr(&dst[3], src[3])
-}
diff --git a/src/runtime/wbfat_gen.go b/src/runtime/wbfat_gen.go
deleted file mode 100644
index 9482cfe..0000000
--- a/src/runtime/wbfat_gen.go
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build ignore
-
-package main
-
-import (
-	"flag"
-	"fmt"
-	"log"
-	"os"
-)
-
-func main() {
-	flag.Parse()
-	if flag.NArg() > 0 {
-		f, err := os.Create(flag.Arg(0))
-		if err != nil {
-			log.Fatal(err)
-		}
-		os.Stdout = f
-	}
-	fmt.Printf("// generated by wbfat_gen.go; use go generate\n\n")
-	fmt.Printf("package runtime\n")
-	for i := uint(2); i <= 4; i++ {
-		for j := 1; j < 1<<i; j++ {
-			fmt.Printf("\n//go:nosplit\n")
-			fmt.Printf("func writebarrierfat%0*b(dst *[%d]uintptr, _ uintptr, src [%d]uintptr) {\n", int(i), j, i, i)
-			for k := uint(0); k < i; k++ {
-				if j&(1<<(i-1-k)) != 0 {
-					fmt.Printf("\twritebarrierptr(&dst[%d], src[%d])\n", k, k)
-				} else {
-					fmt.Printf("\tdst[%d] = src[%d]\n", k, k)
-				}
-			}
-			fmt.Printf("}\n")
-		}
-	}
-}
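
Note: these generated fat write-barrier helpers are no longer needed
because multiword pointer writes now funnel through a single bulk path. As
a sketch of the replacement shape (assumed from the surrounding runtime;
the exact body is not part of this change):

	func typedmemmove(typ *_type, dst, src unsafe.Pointer) {
		memmove(dst, src, typ.size)
		if typ.kind&kindNoPointers != 0 {
			return
		}
		// Replay pointer slots via the heap bitmap instead of
		// per-layout writebarrierfatNNNN helpers.
		heapBitsBulkBarrier(uintptr(dst), typ.size)
	}
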
diff --git a/src/runtime/wincallback.go b/src/runtime/wincallback.go
index a16ad21..9f003ae 100644
--- a/src/runtime/wincallback.go
+++ b/src/runtime/wincallback.go
@@ -1,4 +1,4 @@
-// Copyright 2014 The Go Authors.  All rights reserved.
+// Copyright 2014 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
diff --git a/src/runtime/print1_write.go b/src/runtime/write_err.go
similarity index 100%
rename from src/runtime/print1_write.go
rename to src/runtime/write_err.go
diff --git a/src/runtime/print1_write_android.go b/src/runtime/write_err_android.go
similarity index 97%
rename from src/runtime/print1_write_android.go
rename to src/runtime/write_err_android.go
index 54d4826..4411a14 100644
--- a/src/runtime/print1_write_android.go
+++ b/src/runtime/write_err_android.go
@@ -116,7 +116,7 @@
 		exit(2)
 	}
 
-	errno := connect(uintptr(fd), unsafe.Pointer(&logdAddr), int32(unsafe.Sizeof(logdAddr)))
+	errno := connect(fd, unsafe.Pointer(&logdAddr), int32(unsafe.Sizeof(logdAddr)))
 	if errno < 0 {
 		msg := []byte("runtime: cannot connect to /dev/socket/logdw\x00")
 		write(2, unsafe.Pointer(&msg[0]), int32(len(msg)))
diff --git a/src/runtime/zgoarch_386.go b/src/runtime/zgoarch_386.go
deleted file mode 100644
index 79053f1..0000000
--- a/src/runtime/zgoarch_386.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `386`
-
-const goarch_386 = 1
-const goarch_amd64 = 0
-const goarch_amd64p32 = 0
-const goarch_arm = 0
-const goarch_arm64 = 0
-const goarch_ppc64 = 0
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_amd64.go b/src/runtime/zgoarch_amd64.go
deleted file mode 100644
index 70095f5..0000000
--- a/src/runtime/zgoarch_amd64.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `amd64`
-
-const goarch_386 = 0
-const goarch_amd64 = 1
-const goarch_amd64p32 = 0
-const goarch_arm = 0
-const goarch_arm64 = 0
-const goarch_ppc64 = 0
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_amd64p32.go b/src/runtime/zgoarch_amd64p32.go
deleted file mode 100644
index 9ac3f0b..0000000
--- a/src/runtime/zgoarch_amd64p32.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `amd64p32`
-
-const goarch_386 = 0
-const goarch_amd64 = 0
-const goarch_amd64p32 = 1
-const goarch_arm = 0
-const goarch_arm64 = 0
-const goarch_ppc64 = 0
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_arm.go b/src/runtime/zgoarch_arm.go
deleted file mode 100644
index c865dc0..0000000
--- a/src/runtime/zgoarch_arm.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `arm`
-
-const goarch_386 = 0
-const goarch_amd64 = 0
-const goarch_amd64p32 = 0
-const goarch_arm = 1
-const goarch_arm64 = 0
-const goarch_ppc64 = 0
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_arm64.go b/src/runtime/zgoarch_arm64.go
deleted file mode 100644
index cde5e9f..0000000
--- a/src/runtime/zgoarch_arm64.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `arm64`
-
-const goarch_386 = 0
-const goarch_amd64 = 0
-const goarch_amd64p32 = 0
-const goarch_arm = 0
-const goarch_arm64 = 1
-const goarch_ppc64 = 0
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_ppc64.go b/src/runtime/zgoarch_ppc64.go
deleted file mode 100644
index 13d87d9..0000000
--- a/src/runtime/zgoarch_ppc64.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `ppc64`
-
-const goarch_386 = 0
-const goarch_amd64 = 0
-const goarch_amd64p32 = 0
-const goarch_arm = 0
-const goarch_arm64 = 0
-const goarch_ppc64 = 1
-const goarch_ppc64le = 0
diff --git a/src/runtime/zgoarch_ppc64le.go b/src/runtime/zgoarch_ppc64le.go
deleted file mode 100644
index 5d088aa..0000000
--- a/src/runtime/zgoarch_ppc64le.go
+++ /dev/null
@@ -1,13 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoarch = `ppc64le`
-
-const goarch_386 = 0
-const goarch_amd64 = 0
-const goarch_amd64p32 = 0
-const goarch_arm = 0
-const goarch_arm64 = 0
-const goarch_ppc64 = 0
-const goarch_ppc64le = 1
diff --git a/src/runtime/zgoos_android.go b/src/runtime/zgoos_android.go
deleted file mode 100644
index 0590bd9..0000000
--- a/src/runtime/zgoos_android.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `android`
-
-const goos_android = 1
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_darwin.go b/src/runtime/zgoos_darwin.go
deleted file mode 100644
index c0a7cd6..0000000
--- a/src/runtime/zgoos_darwin.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `darwin`
-
-const goos_android = 0
-const goos_darwin = 1
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_dragonfly.go b/src/runtime/zgoos_dragonfly.go
deleted file mode 100644
index 008d6de..0000000
--- a/src/runtime/zgoos_dragonfly.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `dragonfly`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 1
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_freebsd.go b/src/runtime/zgoos_freebsd.go
deleted file mode 100644
index 2478940..0000000
--- a/src/runtime/zgoos_freebsd.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `freebsd`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 1
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_linux.go b/src/runtime/zgoos_linux.go
deleted file mode 100644
index c775ab5..0000000
--- a/src/runtime/zgoos_linux.go
+++ /dev/null
@@ -1,19 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-// +build !android
-
-package runtime
-
-const theGoos = `linux`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 1
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_nacl.go b/src/runtime/zgoos_nacl.go
deleted file mode 100644
index d9d88f4..0000000
--- a/src/runtime/zgoos_nacl.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `nacl`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 1
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_netbsd.go b/src/runtime/zgoos_netbsd.go
deleted file mode 100644
index ff2c5cb..0000000
--- a/src/runtime/zgoos_netbsd.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `netbsd`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 1
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_openbsd.go b/src/runtime/zgoos_openbsd.go
deleted file mode 100644
index b071dc6..0000000
--- a/src/runtime/zgoos_openbsd.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `openbsd`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 1
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_plan9.go b/src/runtime/zgoos_plan9.go
deleted file mode 100644
index 4306b0f..0000000
--- a/src/runtime/zgoos_plan9.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `plan9`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 1
-const goos_solaris = 0
-const goos_windows = 0
diff --git a/src/runtime/zgoos_solaris.go b/src/runtime/zgoos_solaris.go
deleted file mode 100644
index 10f9537..0000000
--- a/src/runtime/zgoos_solaris.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `solaris`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 1
-const goos_windows = 0
diff --git a/src/runtime/zgoos_windows.go b/src/runtime/zgoos_windows.go
deleted file mode 100644
index 56f5c58..0000000
--- a/src/runtime/zgoos_windows.go
+++ /dev/null
@@ -1,17 +0,0 @@
-// generated by gengoos.go using 'go generate'
-
-package runtime
-
-const theGoos = `windows`
-
-const goos_android = 0
-const goos_darwin = 0
-const goos_dragonfly = 0
-const goos_freebsd = 0
-const goos_linux = 0
-const goos_nacl = 0
-const goos_netbsd = 0
-const goos_openbsd = 0
-const goos_plan9 = 0
-const goos_solaris = 0
-const goos_windows = 1
diff --git a/src/runtime/zversion.go b/src/runtime/zversion.go
deleted file mode 100644
index 4086c8d..0000000
--- a/src/runtime/zversion.go
+++ /dev/null
@@ -1,9 +0,0 @@
-// auto generated by go tool dist
-
-package runtime
-
-const defaultGoroot = `./prebuilts/go/linux-x86`
-const theVersion = `go1.5.1`
-const goexperiment = ``
-const stackGuardMultiplier = 1
-var buildVersion = theVersion
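
Note: the version constants are not dropped outright; with go1.7 the
generator emits them into runtime/internal/sys instead. Assuming the same
output shape, the replacement file looks roughly like:

	// runtime/internal/sys/zversion.go (assumed; generated by go tool dist)
	package sys

	const DefaultGoroot = `./prebuilts/go/linux-x86`
	const TheVersion = `go1.7rc1`
	const Goexperiment = ``
	const StackGuardMultiplier = 1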