Update prebuilts to go1.8rc1 ab/3640477

Bug: 32982374
Test: m -j blueprint_tools
Change-Id: Ife36ed9cf5b2617ccc8fdf0fdd178d19378546cd
diff --git a/src/math/all_test.go b/src/math/all_test.go
index d9ea1fd..3d8cd72 100644
--- a/src/math/all_test.go
+++ b/src/math/all_test.go
@@ -1165,21 +1165,88 @@
 	{NaN(), 0},
 }
 
-var vfgammaSC = []float64{
-	Inf(-1),
-	-3,
-	Copysign(0, -1),
-	0,
-	Inf(1),
-	NaN(),
-}
-var gammaSC = []float64{
-	NaN(),
-	NaN(),
-	Inf(-1),
-	Inf(1),
-	Inf(1),
-	NaN(),
+var vfgamma = [][2]float64{
+	{Inf(1), Inf(1)},
+	{Inf(-1), NaN()},
+	{0, Inf(1)},
+	{Copysign(0, -1), Inf(-1)},
+	{NaN(), NaN()},
+	{-1, NaN()},
+	{-2, NaN()},
+	{-3, NaN()},
+	{-1e16, NaN()},
+	{-1e300, NaN()},
+	{1.7e308, Inf(1)},
+
+	// Test inputs inspired by Python test suite.
+	// Outputs computed at high precision by PARI/GP.
+	// If recomputing table entries, be careful to use
+	// high-precision (%.1000g) formatting of the float64 inputs.
+	// For example, -2.0000000000000004 is the float64 with exact value
+	// -2.00000000000000044408920985626161695, and
+	// gamma(-2.0000000000000004) = -1249999999999999.5386078562728167651513, while
+	// gamma(-2.00000000000000044408920985626161695) = -1125899906826907.2044875028130093136826.
+	// Thus the table lists -1.1258999068426235e+15 as the answer.
+	{0.5, 1.772453850905516},
+	{1.5, 0.886226925452758},
+	{2.5, 1.329340388179137},
+	{3.5, 3.3233509704478426},
+	{-0.5, -3.544907701811032},
+	{-1.5, 2.363271801207355},
+	{-2.5, -0.9453087204829419},
+	{-3.5, 0.2700882058522691},
+	{0.1, 9.51350769866873},
+	{0.01, 99.4325851191506},
+	{1e-08, 9.999999942278434e+07},
+	{1e-16, 1e+16},
+	{0.001, 999.4237724845955},
+	{1e-16, 1e+16},
+	{1e-308, 1e+308},
+	{5.6e-309, 1.7857142857142864e+308},
+	{5.5e-309, Inf(1)},
+	{1e-309, Inf(1)},
+	{1e-323, Inf(1)},
+	{5e-324, Inf(1)},
+	{-0.1, -10.686287021193193},
+	{-0.01, -100.58719796441078},
+	{-1e-08, -1.0000000057721567e+08},
+	{-1e-16, -1e+16},
+	{-0.001, -1000.5782056293586},
+	{-1e-16, -1e+16},
+	{-1e-308, -1e+308},
+	{-5.6e-309, -1.7857142857142864e+308},
+	{-5.5e-309, Inf(-1)},
+	{-1e-309, Inf(-1)},
+	{-1e-323, Inf(-1)},
+	{-5e-324, Inf(-1)},
+	{-0.9999999999999999, -9.007199254740992e+15},
+	{-1.0000000000000002, 4.5035996273704955e+15},
+	{-1.9999999999999998, 2.2517998136852485e+15},
+	{-2.0000000000000004, -1.1258999068426235e+15},
+	{-100.00000000000001, -7.540083334883109e-145},
+	{-99.99999999999999, 7.540083334884096e-145},
+	{17, 2.0922789888e+13},
+	{171, 7.257415615307999e+306},
+	{171.6, 1.5858969096672565e+308},
+	{171.624, 1.7942117599248104e+308},
+	{171.625, Inf(1)},
+	{172, Inf(1)},
+	{2000, Inf(1)},
+	{-100.5, -3.3536908198076787e-159},
+	{-160.5, -5.255546447007829e-286},
+	{-170.5, -3.3127395215386074e-308},
+	{-171.5, 1.9316265431712e-310},
+	{-176.5, -1.196e-321},
+	{-177.5, 5e-324},
+	{-178.5, Copysign(0, -1)},
+	{-179.5, 0},
+	{-201.0001, 0},
+	{-202.9999, Copysign(0, -1)},
+	{-1000.5, Copysign(0, -1)},
+	{-1.0000000003e+09, Copysign(0, -1)},
+	{-4.5035996273704955e+15, 0},
+	{-63.349078729022985, 4.177797167776188e-88},
+	{-127.45117632943295, 1.183111089623681e-214},
 }
 
 var vfhypotSC = [][2]float64{
@@ -1735,6 +1802,12 @@
 }
 
 func tolerance(a, b, e float64) bool {
+	// Multiplying by e here can underflow denormal values to zero.
+	// Check a==b so that at least if a and b are small and identical
+	// we say they match.
+	if a == b {
+		return true
+	}
 	d := a - b
 	if d < 0 {
 		d = -d
@@ -1974,7 +2047,7 @@
 
 func testExp(t *testing.T, Exp func(float64) float64, name string) {
 	for i := 0; i < len(vf); i++ {
-		if f := Exp(vf[i]); !close(exp[i], f) {
+		if f := Exp(vf[i]); !veryclose(exp[i], f) {
 			t.Errorf("%s(%g) = %g, want %g", name, vf[i], f, exp[i])
 		}
 	}
@@ -2147,9 +2220,18 @@
 			t.Errorf("Gamma(%g) = %g, want %g", vf[i], f, gamma[i])
 		}
 	}
-	for i := 0; i < len(vfgammaSC); i++ {
-		if f := Gamma(vfgammaSC[i]); !alike(gammaSC[i], f) {
-			t.Errorf("Gamma(%g) = %g, want %g", vfgammaSC[i], f, gammaSC[i])
+	for _, g := range vfgamma {
+		f := Gamma(g[0])
+		var ok bool
+		if IsNaN(g[1]) || IsInf(g[1], 0) || g[1] == 0 || f == 0 {
+			ok = alike(g[1], f)
+		} else if g[0] > -50 && g[0] <= 171 {
+			ok = veryclose(g[1], f)
+		} else {
+			ok = close(g[1], f)
+		}
+		if !ok {
+			t.Errorf("Gamma(%g) = %g, want %g", g[0], f, g[1])
 		}
 	}
 }
@@ -3000,14 +3082,6 @@
 
 var Global float64
 
-func BenchmarkSqrt(b *testing.B) {
-	x, y := 0.0, 10.0
-	for i := 0; i < b.N; i++ {
-		x += Sqrt(y)
-	}
-	Global = x
-}
-
 func BenchmarkSqrtIndirect(b *testing.B) {
 	x, y := 0.0, 10.0
 	f := Sqrt
@@ -3017,10 +3091,27 @@
 	Global = x
 }
 
-func BenchmarkSqrtGo(b *testing.B) {
-	x, y := 0.0, 10.0
+func BenchmarkSqrtLatency(b *testing.B) {
+	x := 10.0
 	for i := 0; i < b.N; i++ {
-		x += SqrtGo(y)
+		x = Sqrt(x)
+	}
+	Global = x
+}
+
+func BenchmarkSqrtIndirectLatency(b *testing.B) {
+	x := 10.0
+	f := Sqrt
+	for i := 0; i < b.N; i++ {
+		x = f(x)
+	}
+	Global = x
+}
+
+func BenchmarkSqrtGoLatency(b *testing.B) {
+	x := 10.0
+	for i := 0; i < b.N; i++ {
+		x = SqrtGo(x)
 	}
 	Global = x
 }
diff --git a/src/math/arith_s390x.go b/src/math/arith_s390x.go
new file mode 100644
index 0000000..892935a
--- /dev/null
+++ b/src/math/arith_s390x.go
@@ -0,0 +1,29 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math
+
+func log10TrampolineSetup(x float64) float64
+func log10Asm(x float64) float64
+
+func cosTrampolineSetup(x float64) float64
+func cosAsm(x float64) float64
+
+func coshTrampolineSetup(x float64) float64
+func coshAsm(x float64) float64
+
+func sinTrampolineSetup(x float64) float64
+func sinAsm(x float64) float64
+
+func sinhTrampolineSetup(x float64) float64
+func sinhAsm(x float64) float64
+
+func tanhTrampolineSetup(x float64) float64
+func tanhAsm(x float64) float64
+
+// hasVectorFacility reports whether the machine has the z/Architecture
+// vector facility installed and enabled.
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
diff --git a/src/math/arith_s390x_test.go b/src/math/arith_s390x_test.go
new file mode 100644
index 0000000..b4f3070
--- /dev/null
+++ b/src/math/arith_s390x_test.go
@@ -0,0 +1,144 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Tests whether the non-vector routines are working, even when the tests are run on a
+// vector-capable machine.
+package math_test
+
+import (
+	. "math"
+	"testing"
+)
+
+func TestCosNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		if f := CosNoVec(vf[i]); !veryclose(cos[i], f) {
+			t.Errorf("Cos(%g) = %g, want %g", vf[i], f, cos[i])
+		}
+	}
+	for i := 0; i < len(vfcosSC); i++ {
+		if f := CosNoVec(vfcosSC[i]); !alike(cosSC[i], f) {
+			t.Errorf("Cos(%g) = %g, want %g", vfcosSC[i], f, cosSC[i])
+		}
+	}
+}
+
+func TestCoshNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		if f := CoshNoVec(vf[i]); !close(cosh[i], f) {
+			t.Errorf("Cosh(%g) = %g, want %g", vf[i], f, cosh[i])
+		}
+	}
+	for i := 0; i < len(vfcoshSC); i++ {
+		if f := CoshNoVec(vfcoshSC[i]); !alike(coshSC[i], f) {
+			t.Errorf("Cosh(%g) = %g, want %g", vfcoshSC[i], f, coshSC[i])
+		}
+	}
+}
+func TestSinNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		if f := SinNoVec(vf[i]); !veryclose(sin[i], f) {
+			t.Errorf("Sin(%g) = %g, want %g", vf[i], f, sin[i])
+		}
+	}
+	for i := 0; i < len(vfsinSC); i++ {
+		if f := SinNoVec(vfsinSC[i]); !alike(sinSC[i], f) {
+			t.Errorf("Sin(%g) = %g, want %g", vfsinSC[i], f, sinSC[i])
+		}
+	}
+}
+
+func TestSinhNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		if f := SinhNoVec(vf[i]); !close(sinh[i], f) {
+			t.Errorf("Sinh(%g) = %g, want %g", vf[i], f, sinh[i])
+		}
+	}
+	for i := 0; i < len(vfsinhSC); i++ {
+		if f := SinhNoVec(vfsinhSC[i]); !alike(sinhSC[i], f) {
+			t.Errorf("Sinh(%g) = %g, want %g", vfsinhSC[i], f, sinhSC[i])
+		}
+	}
+}
+
+// Check that math functions of high angle values
+// return accurate results. [Since (vf[i] + large) - large != vf[i],
+// testing for Trig(vf[i] + large) == Trig(vf[i]), where large is
+// a multiple of 2*Pi, is misleading.]
+func TestLargeCosNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	large := float64(100000 * Pi)
+	for i := 0; i < len(vf); i++ {
+		f1 := cosLarge[i]
+		f2 := CosNoVec(vf[i] + large)
+		if !close(f1, f2) {
+			t.Errorf("Cos(%g) = %g, want %g", vf[i]+large, f2, f1)
+		}
+	}
+}
+
+func TestLargeSinNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	large := float64(100000 * Pi)
+	for i := 0; i < len(vf); i++ {
+		f1 := sinLarge[i]
+		f2 := SinNoVec(vf[i] + large)
+		if !close(f1, f2) {
+			t.Errorf("Sin(%g) = %g, want %g", vf[i]+large, f2, f1)
+		}
+	}
+}
+
+func TestTanhNovec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		if f := TanhNoVec(vf[i]); !veryclose(tanh[i], f) {
+			t.Errorf("Tanh(%g) = %g, want %g", vf[i], f, tanh[i])
+		}
+	}
+	for i := 0; i < len(vftanhSC); i++ {
+		if f := TanhNoVec(vftanhSC[i]); !alike(tanhSC[i], f) {
+			t.Errorf("Tanh(%g) = %g, want %g", vftanhSC[i], f, tanhSC[i])
+		}
+	}
+
+}
+
+func TestLog10Novec(t *testing.T) {
+	if !HasVX {
+		t.Skipf("no vector support")
+	}
+	for i := 0; i < len(vf); i++ {
+		a := Abs(vf[i])
+		if f := Log10NoVec(a); !veryclose(log10[i], f) {
+			t.Errorf("Log10(%g) = %g, want %g", a, f, log10[i])
+		}
+	}
+	if f := Log10NoVec(E); f != Log10E {
+		t.Errorf("Log10(%g) = %g, want %g", E, f, Log10E)
+	}
+	for i := 0; i < len(vflogSC); i++ {
+		if f := Log10NoVec(vflogSC[i]); !alike(logSC[i], f) {
+			t.Errorf("Log10(%g) = %g, want %g", vflogSC[i], f, logSC[i])
+		}
+	}
+}
diff --git a/src/math/big/arith_amd64.s b/src/math/big/arith_amd64.s
index b69a2c6..a7eba67 100644
--- a/src/math/big/arith_amd64.s
+++ b/src/math/big/arith_amd64.s
@@ -326,6 +326,41 @@
 	MOVQ r+56(FP), CX	// c = r
 	MOVQ z_len+8(FP), R11
 	MOVQ $0, BX		// i = 0
+	
+	CMPQ R11, $4
+	JL E5
+	
+U5:	// i+4 <= n
+	// regular loop body unrolled 4x
+	MOVQ (0*8)(R8)(BX*8), AX
+	MULQ R9
+	ADDQ CX, AX
+	ADCQ $0, DX
+	MOVQ AX, (0*8)(R10)(BX*8)
+	MOVQ DX, CX
+	MOVQ (1*8)(R8)(BX*8), AX
+	MULQ R9
+	ADDQ CX, AX
+	ADCQ $0, DX
+	MOVQ AX, (1*8)(R10)(BX*8)
+	MOVQ DX, CX
+	MOVQ (2*8)(R8)(BX*8), AX
+	MULQ R9
+	ADDQ CX, AX
+	ADCQ $0, DX
+	MOVQ AX, (2*8)(R10)(BX*8)
+	MOVQ DX, CX
+	MOVQ (3*8)(R8)(BX*8), AX
+	MULQ R9
+	ADDQ CX, AX
+	ADCQ $0, DX
+	MOVQ AX, (3*8)(R10)(BX*8)
+	MOVQ DX, CX
+	ADDQ $4, BX		// i += 4
+	
+	LEAQ 4(BX), DX
+	CMPQ DX, R11
+	JLE U5
 	JMP E5
 
 L5:	MOVQ (R8)(BX*8), AX
diff --git a/src/math/big/arith_decl_s390x.go b/src/math/big/arith_decl_s390x.go
new file mode 100644
index 0000000..0f11481
--- /dev/null
+++ b/src/math/big/arith_decl_s390x.go
@@ -0,0 +1,23 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go
+
+package big
+
+func addVV_check(z, x, y []Word) (c Word)
+func addVV_vec(z, x, y []Word) (c Word)
+func addVV_novec(z, x, y []Word) (c Word)
+func subVV_check(z, x, y []Word) (c Word)
+func subVV_vec(z, x, y []Word) (c Word)
+func subVV_novec(z, x, y []Word) (c Word)
+func addVW_check(z, x []Word, y Word) (c Word)
+func addVW_vec(z, x []Word, y Word) (c Word)
+func addVW_novec(z, x []Word, y Word) (c Word)
+func subVW_check(z, x []Word, y Word) (c Word)
+func subVW_vec(z, x []Word, y Word) (c Word)
+func subVW_novec(z, x []Word, y Word) (c Word)
+func hasVectorFacility() bool
+
+var hasVX = hasVectorFacility()
diff --git a/src/math/big/arith_mipsx.s b/src/math/big/arith_mipsx.s
new file mode 100644
index 0000000..ac23114
--- /dev/null
+++ b/src/math/big/arith_mipsx.s
@@ -0,0 +1,46 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,mips !math_big_pure_go,mipsle
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+TEXT ·mulWW(SB),NOSPLIT,$0
+	JMP	·mulWW_g(SB)
+
+TEXT ·divWW(SB),NOSPLIT,$0
+	JMP	·divWW_g(SB)
+
+TEXT ·addVV(SB),NOSPLIT,$0
+	JMP	·addVV_g(SB)
+
+TEXT ·subVV(SB),NOSPLIT,$0
+	JMP	·subVV_g(SB)
+
+TEXT ·addVW(SB),NOSPLIT,$0
+	JMP	·addVW_g(SB)
+
+TEXT ·subVW(SB),NOSPLIT,$0
+	JMP	·subVW_g(SB)
+
+TEXT ·shlVU(SB),NOSPLIT,$0
+	JMP	·shlVU_g(SB)
+
+TEXT ·shrVU(SB),NOSPLIT,$0
+	JMP	·shrVU_g(SB)
+
+TEXT ·mulAddVWW(SB),NOSPLIT,$0
+	JMP	·mulAddVWW_g(SB)
+
+TEXT ·addMulVVW(SB),NOSPLIT,$0
+	JMP	·addMulVVW_g(SB)
+
+TEXT ·divWVW(SB),NOSPLIT,$0
+	JMP	·divWVW_g(SB)
+
+TEXT ·bitLen(SB),NOSPLIT,$0
+	JMP	·bitLen_g(SB)
diff --git a/src/math/big/arith_ppc64.s b/src/math/big/arith_ppc64.s
new file mode 100644
index 0000000..47fe8f1
--- /dev/null
+++ b/src/math/big/arith_ppc64.s
@@ -0,0 +1,14 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,ppc64
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+TEXT ·divWW(SB), NOSPLIT, $0
+	BR ·divWW_g(SB)
+
diff --git a/src/math/big/arith_ppc64le.s b/src/math/big/arith_ppc64le.s
new file mode 100644
index 0000000..b78cdfe
--- /dev/null
+++ b/src/math/big/arith_ppc64le.s
@@ -0,0 +1,50 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !math_big_pure_go,ppc64le
+
+#include "textflag.h"
+
+// This file provides fast assembly versions for the elementary
+// arithmetic operations on vectors implemented in arith.go.
+
+// func divWW(x1, x0, y Word) (q, r Word)
+TEXT ·divWW(SB), NOSPLIT, $0
+	MOVD x1+0(FP), R4
+	MOVD x0+8(FP), R5
+	MOVD y+16(FP), R6
+
+	CMPU R4, R6
+	BGE  divbigger
+
+	// from the programmer's note in ch. 3 of the ISA manual, p.74
+	DIVDEU R6, R4, R3
+	DIVDU  R6, R5, R7
+	MULLD  R6, R3, R8
+	MULLD  R6, R7, R20
+	SUB    R20, R5, R10
+	ADD    R7, R3, R3
+	SUB    R8, R10, R4
+	CMPU   R4, R10
+	BLT    adjust
+	CMPU   R4, R6
+	BLT    end
+
+adjust:
+	MOVD $1, R21
+	ADD  R21, R3, R3
+	SUB  R6, R4, R4
+
+end:
+	MOVD R3, q+24(FP)
+	MOVD R4, r+32(FP)
+
+	RET
+
+divbigger:
+	MOVD $-1, R7
+	MOVD R7, q+24(FP)
+	MOVD R7, r+32(FP)
+	RET
+
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index d4d4171..89d1cbf 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -9,38 +9,178 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
-TEXT ·mulWW(SB),NOSPLIT,$0
-	BR ·mulWW_g(SB)
+// func mulWW(x, y Word) (z1, z0 Word)
+TEXT ·mulWW(SB), NOSPLIT, $0
+	MOVD   x+0(FP), R4
+	MOVD   y+8(FP), R5
+	MULHDU R4, R5, R6
+	MULLD  R4, R5, R7
+	MOVD   R6, z1+16(FP)
+	MOVD   R7, z0+24(FP)
+	RET
 
-TEXT ·divWW(SB),NOSPLIT,$0
-	BR ·divWW_g(SB)
-
-TEXT ·addVV(SB),NOSPLIT,$0
+TEXT ·addVV(SB), NOSPLIT, $0
 	BR ·addVV_g(SB)
 
-TEXT ·subVV(SB),NOSPLIT,$0
-	BR ·subVV_g(SB)
+// func subVV(z, x, y []Word) (c Word)
+// z[i] = x[i] - y[i] for all i, carrying
+TEXT ·subVV(SB), NOSPLIT, $0
+	MOVD z_len+8(FP), R7
+	MOVD x+24(FP), R8
+	MOVD y+48(FP), R9
+	MOVD z+0(FP), R10
 
-TEXT ·addVW(SB),NOSPLIT,$0
+	MOVD $0, R4  // c = 0
+	MOVD $0, R5  // i = 0
+	MOVD $1, R29 // work around lack of ADDI
+	MOVD $8, R28 // work around lack of scaled addressing
+
+	SUBC R0, R0  // clear CA
+	JMP  sublend
+
+// amd64 saves and restores CF, but I believe they only have to do that because all of
+// their math operations clobber it - we should just be able to recover it at the end.
+subloop:
+	MULLD R5, R28, R6
+	MOVD  (R8)(R6), R11 // x[i]
+	MOVD  (R9)(R6), R12 // y[i]
+
+	SUBE R12, R11, R15
+	MOVD R15, (R10)(R6)
+
+	ADD R29, R5 // i++
+
+sublend:
+	CMP R5, R7
+	BLT subloop
+
+	ADDZE R4
+	XOR   R29, R4
+	MOVD  R4, c+72(FP)
+	RET
+
+TEXT ·addVW(SB), NOSPLIT, $0
 	BR ·addVW_g(SB)
 
-TEXT ·subVW(SB),NOSPLIT,$0
+TEXT ·subVW(SB), NOSPLIT, $0
 	BR ·subVW_g(SB)
 
-TEXT ·shlVU(SB),NOSPLIT,$0
+TEXT ·shlVU(SB), NOSPLIT, $0
 	BR ·shlVU_g(SB)
 
-TEXT ·shrVU(SB),NOSPLIT,$0
+TEXT ·shrVU(SB), NOSPLIT, $0
 	BR ·shrVU_g(SB)
 
-TEXT ·mulAddVWW(SB),NOSPLIT,$0
-	BR ·mulAddVWW_g(SB)
+// func mulAddVWW(z, x []Word, y, r Word) (c Word)
+TEXT ·mulAddVWW(SB), NOSPLIT, $0
+	MOVD z+0(FP), R10
+	MOVD x+24(FP), R8
+	MOVD y+48(FP), R9
+	MOVD r+56(FP), R4     // c = r
+	MOVD z_len+8(FP), R11
+	MOVD $0, R3           // i = 0
+	MOVD $8, R18
+	MOVD $1, R19
 
-TEXT ·addMulVVW(SB),NOSPLIT,$0
-	BR ·addMulVVW_g(SB)
+	JMP e5
 
-TEXT ·divWVW(SB),NOSPLIT,$0
+l5:
+	MULLD  R18, R3, R5
+	MOVD   (R8)(R5), R20
+	MULLD  R9, R20, R6
+	MULHDU R9, R20, R7
+	ADDC   R4, R6
+	ADDZE  R7
+	MOVD   R6, (R10)(R5)
+	MOVD   R7, R4
+	ADD    R19, R3
+
+e5:
+	CMP R3, R11
+	BLT l5
+
+	MOVD R4, c+64(FP)
+	RET
+
+// func addMulVVW(z, x []Word, y Word) (c Word)
+TEXT ·addMulVVW(SB), NOSPLIT, $0
+	MOVD z+0(FP), R10
+	MOVD x+24(FP), R8
+	MOVD y+48(FP), R9
+	MOVD z_len+8(FP), R22
+
+	MOVD $0, R5   // i = 0
+	MOVD $0, R4   // c = 0
+	MOVD $8, R28
+	MOVD $-2, R23
+	AND  R22, R23 // mask the last bit of z.len
+	MOVD $2, R24
+	CMP  R23, R24
+	BGE  unrolled
+	JMP  end
+
+unrolled:
+	MOVD  $8, R19         // no (RA)(RB*8) on power
+	MULLD R5, R19
+	MOVD  (R10)(R19), R11 // R11 = z[i]
+	MOVD  (R8)(R19), R16  // R16 = x[i]
+	ADD   R28, R19, R25
+	MOVD  (R10)(R25), R17
+	MOVD  (R8)(R25), R18
+
+	MULLD  R9, R16, R12
+	MULHDU R9, R16, R14
+	MULLD  R9, R18, R6
+	MULHDU R9, R18, R7
+	ADDC   R4, R12
+	ADDZE  R14
+	ADDC   R11, R12        // z[i] = (x[i]*y) + z[i] + carry
+	ADDZE  R14             // carry = high order bits + add carry
+	MOVD   R12, (R10)(R19)
+	ADDC   R14, R6
+	ADDZE  R7
+	ADDC   R17, R6
+	ADDZE  R7
+	MOVD   R6, (R10)(R25)
+	MOVD   R7, R4
+
+	ADD R24, R5
+	CMP R5, R23
+	BLT unrolled
+	JMP end
+
+loop:
+	MOVD   $8, R19
+	MULLD  R5, R19
+	MOVD   (R10)(R19), R11
+	MOVD   (R8)(R19), R16
+	MULLD  R9, R16, R12
+	MULHDU R9, R16, R14
+	ADDC   R4, R12
+	ADDZE  R14
+	ADDC   R11, R12
+	ADDZE  R14
+	MOVD   R12, (R10)(R19)
+	MOVD   R14, R4
+
+	MOVD $1, R15
+	ADD  R15, R5
+
+end:
+	CMP R5, R22
+	BLT loop
+
+	MOVD R4, c+56(FP)
+	RET
+
+TEXT ·divWVW(SB), NOSPLIT, $0
 	BR ·divWVW_g(SB)
 
-TEXT ·bitLen(SB),NOSPLIT,$0
-	BR ·bitLen_g(SB)
+// func bitLen(x Word) int
+TEXT ·bitLen(SB), NOSPLIT, $0
+	MOVD   x+0(FP), R4
+	CNTLZD R4, R4
+	MOVD   $64, R5
+	SUB    R4, R5
+	MOVD   R5, n+8(FP)
+	RET
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s
index a691970..bddfd9e 100644
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@@ -9,93 +9,464 @@
 // This file provides fast assembly versions for the elementary
 // arithmetic operations on vectors implemented in arith.go.
 
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+        MOVD    $x-24(SP), R1
+        XC      $24, 0(R1), 0(R1) // clear the storage
+        MOVD    $2, R0            // R0 is the number of double words stored -1
+        WORD    $0xB2B01000       // STFLE 0(R1)
+        XOR     R0, R0            // reset the value of R0
+        MOVBZ   z-8(SP), R1
+        AND     $0x40, R1
+        BEQ     novector
+vectorinstalled:
+        // check if the vector instruction has been enabled
+        VLEIB   $0, $0xF, V16
+        VLGVB   $0, V16, R1
+        CMPBNE  R1, $0xF, novector
+        MOVB    $1, ret+0(FP) // have vx
+        RET
+novector:
+        MOVB    $0, ret+0(FP) // no vx
+        RET
+
 TEXT ·mulWW(SB),NOSPLIT,$0
-	MOVD x+0(FP), R3
-	MOVD y+8(FP), R4
-	MULHDU R3, R4
-	MOVD R10, z1+16(FP)
-	MOVD R11, z0+24(FP)
+	MOVD	x+0(FP), R3
+	MOVD	y+8(FP), R4
+	MULHDU	R3, R4
+	MOVD	R10, z1+16(FP)
+	MOVD	R11, z0+24(FP)
 	RET
 
 // func divWW(x1, x0, y Word) (q, r Word)
 TEXT ·divWW(SB),NOSPLIT,$0
-	MOVD  x1+0(FP), R10
-	MOVD  x0+8(FP), R11
-	MOVD  y+16(FP), R5
-	WORD  $0xb98700a5 // dlgr r10,r5
-	MOVD  R11, q+24(FP)
-	MOVD  R10, r+32(FP)
+	MOVD	x1+0(FP), R10
+	MOVD	x0+8(FP), R11
+	MOVD	y+16(FP), R5
+	WORD	$0xb98700a5 // dlgr r10,r5
+	MOVD	R11, q+24(FP)
+	MOVD	R10, r+32(FP)
 	RET
 
 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
 // func addVV(z, x, y []Word) (c Word)
-TEXT ·addVV(SB),NOSPLIT,$0
-	MOVD z_len+8(FP), R3
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R9
-	MOVD z+0(FP), R2
 
-	MOVD $0, R4		// c = 0
-	MOVD $0, R0		// make sure it's zero
-	MOVD $0, R10		// i = 0
+
+TEXT ·addVV(SB),NOSPLIT,$0
+	MOVD	addvectorfacility+0x00(SB),R1
+	BR	(R1)
+	
+TEXT ·addVV_check(SB),NOSPLIT, $0
+	MOVB	·hasVX(SB), R1
+	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD	$addvectorfacility+0x00(SB), R1
+	MOVD	$·addVV_novec(SB), R2
+	MOVD	R2, 0(R1)
+	//MOVD	$·addVV_novec(SB), 0(R1)
+	BR	·addVV_novec(SB)
+vectorimpl:
+	MOVD	$addvectorfacility+0x00(SB), R1
+	MOVD	$·addVV_vec(SB), R2
+	MOVD	R2, 0(R1)
+	//MOVD	$·addVV_vec(SB), 0(R1)
+	BR	·addVV_vec(SB)
+
+GLOBL addvectorfacility+0x00(SB), NOPTR, $8
+DATA addvectorfacility+0x00(SB)/8, $·addVV_check(SB)
+
+TEXT ·addVV_vec(SB),NOSPLIT,$0
+	MOVD	z_len+8(FP), R3
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R9
+	MOVD	z+0(FP), R2
+
+	MOVD	$0, R4		// c = 0
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R10		// i = 0
+
 
 	// s/JL/JMP/ below to disable the unrolled loop
-	SUB  $4, R3		// n -= 4
-	BLT v1			// if n < 0 goto v1
+	SUB	$4, R3
+	BLT	v1
+	SUB     $12, R3                 // n -= 16
+        BLT     A1                      // if n < 0 goto A1
+       
+	MOVD	R8, R5
+	MOVD	R9, R6
+	MOVD	R2, R7
+	// n >= 0
+	// regular loop body unrolled 16x
+	VZERO	V0			// c = 0
+UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
+	ADD	$64, R5
+	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
+	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
+
+
+	VLM	0(R6), V9, V12  	// 64-bytes into V9..V12
+	ADD	$64, R6
+	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
+	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
+
+	VACCCQ	V1, V9, V0, V25
+	VACQ	V1, V9, V0, V17
+	VACCCQ	V2, V10, V25, V26
+	VACQ	V2, V10, V25, V18
+
+
+	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
+	VLM	0(R6), V13, V14  	// 32-bytes into V13..V14
+	ADD	$32, R5
+	ADD	$32, R6
+
+	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
+	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
+	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
+	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
+
+	VACCCQ	V3, V11, V26, V27
+	VACQ	V3, V11, V26, V19
+	VACCCQ	V4, V12, V27, V28
+	VACQ	V4, V12, V27, V20
+
+	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
+	VLM	0(R6), V15, V16  	// 32-bytes into V15..V16
+	ADD	$32, R5
+	ADD	$32, R6
+
+	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
+	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
+	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
+	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
+
+	VACCCQ	V5, V13, V28, V29
+	VACQ	V5, V13, V28, V21
+	VACCCQ	V6, V14, V29, V30
+	VACQ	V6, V14, V29, V22
+
+	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
+	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
+	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
+	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
+
+	VACCCQ	V7, V15, V30, V31
+	VACQ	V7, V15, V30, V23
+	VACCCQ	V8, V16, V31, V0	//V0 has carry-over
+	VACQ	V8, V16, V31, V24
+
+	VPDI	$0x4,V17,V17,V17	// flip the doublewords to big-endian order
+	VPDI	$0x4,V18,V18,V18	// flip the doublewords to big-endian order
+	VPDI	$0x4,V19,V19,V19	// flip the doublewords to big-endian order
+	VPDI	$0x4,V20,V20,V20	// flip the doublewords to big-endian order
+	VPDI	$0x4,V21,V21,V21	// flip the doublewords to big-endian order
+	VPDI	$0x4,V22,V22,V22	// flip the doublewords to big-endian order
+	VPDI	$0x4,V23,V23,V23	// flip the doublewords to big-endian order
+	VPDI	$0x4,V24,V24,V24	// flip the doublewords to big-endian order
+	VSTM	V17, V24, 0(R7)  	// 128-bytes into z
+	ADD	$128, R7
+	ADD	$128, R10	// i += 16
+	SUB	$16,  R3	// n -= 16
+	BGE	UU1		// if n >= 0 goto UU1
+	VLGVG	$1, V0, R4	// put cf into R4
+	NEG	R4, R4		// save cf
+
+A1:	ADD	$12, R3		// n += 12 (n is now biased by -4, as U1 expects)
+
+
+	// s/JL/JMP/ below to disable the unrolled loop
+	BLT	v1		// if n < 0 goto v1
 
 U1:	// n >= 0
 	// regular loop body unrolled 4x
-	MOVD 0(R8)(R10*1), R5
-	MOVD 8(R8)(R10*1), R6
-	MOVD 16(R8)(R10*1), R7
-	MOVD 24(R8)(R10*1), R1
-	ADDC R4, R4		// restore CF
-	MOVD 0(R9)(R10*1), R11
-	ADDE R11, R5
-	MOVD 8(R9)(R10*1), R11
-	ADDE R11, R6
-	MOVD 16(R9)(R10*1), R11
-	ADDE R11, R7
-	MOVD 24(R9)(R10*1), R11
-	ADDE R11, R1
-	MOVD R0, R4
-	ADDE R4, R4		// save CF
-	NEG  R4, R4
-	MOVD R5, 0(R2)(R10*1)
-	MOVD R6, 8(R2)(R10*1)
-	MOVD R7, 16(R2)(R10*1)
-	MOVD R1, 24(R2)(R10*1)
+	MOVD	0(R8)(R10*1), R5
+	MOVD	8(R8)(R10*1), R6
+	MOVD	16(R8)(R10*1), R7
+	MOVD	24(R8)(R10*1), R1
+	ADDC	R4, R4		// restore CF
+	MOVD	0(R9)(R10*1), R11
+	ADDE	R11, R5
+	MOVD	8(R9)(R10*1), R11
+	ADDE	R11, R6
+	MOVD	16(R9)(R10*1), R11
+	ADDE	R11, R7
+	MOVD	24(R9)(R10*1), R11
+	ADDE	R11, R1
+	MOVD	R0, R4
+	ADDE	R4, R4		// save CF
+	NEG	R4, R4
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R6, 8(R2)(R10*1)
+	MOVD	R7, 16(R2)(R10*1)
+	MOVD	R1, 24(R2)(R10*1)
 
 
-	ADD  $32, R10		// i += 4
-	SUB  $4,  R3		// n -= 4
-	BGE  U1			// if n >= 0 goto U1
+	ADD	$32, R10	// i += 4
+	SUB	$4,  R3		// n -= 4
+	BGE	U1		// if n >= 0 goto U1
 
-v1:	ADD  $4, R3		// n += 4
-	BLE E1			// if n <= 0 goto E1
+v1:	ADD	$4, R3		// n += 4
+	BLE	E1		// if n <= 0 goto E1
 
 L1:	// n > 0
-	ADDC R4, R4		// restore CF
-	MOVD 0(R8)(R10*1), R5
-	MOVD 0(R9)(R10*1), R11
-	ADDE R11, R5
-	MOVD R5, 0(R2)(R10*1)
-	MOVD R0, R4
-	ADDE R4, R4		// save CF
-	NEG  R4, R4
+	ADDC	R4, R4		// restore CF
+	MOVD	0(R8)(R10*1), R5
+	MOVD	0(R9)(R10*1), R11
+	ADDE	R11, R5
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R0, R4
+	ADDE	R4, R4		// save CF
+	NEG 	R4, R4
 
-	ADD  $8, R10		// i++
-	SUB  $1, R3		// n--
-	BGT L1			// if n > 0 goto L1
+	ADD	$8, R10		// i++
+	SUB	$1, R3		// n--
+	BGT	L1		// if n > 0 goto L1
 
-E1:	NEG  R4, R4
-	MOVD R4, c+72(FP)	// return c
+E1:	NEG	R4, R4
+	MOVD	R4, c+72(FP)	// return c
 	RET
 
+TEXT ·addVV_novec(SB),NOSPLIT,$0
+novec:
+	MOVD	z_len+8(FP), R3
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R9
+	MOVD	z+0(FP), R2
+
+	MOVD	$0, R4		// c = 0
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R10		// i = 0
+
+	// s/JL/JMP/ below to disable the unrolled loop
+	SUB	$4, R3		// n -= 4
+	BLT	v1n		// if n < 0 goto v1n
+U1n:	// n >= 0
+	// regular loop body unrolled 4x
+	MOVD	0(R8)(R10*1), R5
+	MOVD	8(R8)(R10*1), R6
+	MOVD	16(R8)(R10*1), R7
+	MOVD	24(R8)(R10*1), R1
+	ADDC	R4, R4		// restore CF
+	MOVD	0(R9)(R10*1), R11
+	ADDE	R11, R5
+	MOVD	8(R9)(R10*1), R11
+	ADDE	R11, R6
+	MOVD	16(R9)(R10*1), R11
+	ADDE	R11, R7
+	MOVD	24(R9)(R10*1), R11
+	ADDE	R11, R1
+	MOVD	R0, R4
+	ADDE	R4, R4		// save CF
+	NEG	R4, R4
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R6, 8(R2)(R10*1)
+	MOVD	R7, 16(R2)(R10*1)
+	MOVD	R1, 24(R2)(R10*1)
+
+
+	ADD	$32, R10	// i += 4
+	SUB	$4,  R3		// n -= 4
+	BGE	U1n		// if n >= 0 goto U1n
+
+v1n:	ADD	$4, R3		// n += 4
+	BLE	E1n		// if n <= 0 goto E1n
+
+L1n:	// n > 0
+	ADDC	R4, R4		// restore CF
+	MOVD	0(R8)(R10*1), R5
+	MOVD	0(R9)(R10*1), R11
+	ADDE	R11, R5
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R0, R4
+	ADDE	R4, R4		// save CF
+	NEG 	R4, R4
+
+	ADD	$8, R10		// i++
+	SUB	$1, R3		// n--
+	BGT L1n			// if n > 0 goto L1n
+
+E1n:	NEG	R4, R4
+	MOVD	R4, c+72(FP)	// return c
+	RET
+
+
+TEXT ·subVV(SB),NOSPLIT,$0
+	MOVD	subvectorfacility+0x00(SB),R1
+	BR	(R1)
+	
+TEXT ·subVV_check(SB),NOSPLIT,$0
+	MOVB	·hasVX(SB), R1
+	CMPBEQ	R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD	$subvectorfacility+0x00(SB), R1
+	MOVD	$·subVV_novec(SB), R2
+	MOVD	R2, 0(R1)
+	//MOVD	$·subVV_novec(SB), 0(R1)
+	BR	·subVV_novec(SB)
+vectorimpl:
+	MOVD	$subvectorfacility+0x00(SB), R1
+	MOVD    $·subVV_vec(SB), R2
+        MOVD    R2, 0(R1)
+	//MOVD	$·subVV_vec(SB), 0(R1)
+	BR	·subVV_vec(SB)
+
+GLOBL subvectorfacility+0x00(SB), NOPTR, $8
+DATA subvectorfacility+0x00(SB)/8, $·subVV_check(SB)
+
 // DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
 // func subVV(z, x, y []Word) (c Word)
 // (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
-TEXT ·subVV(SB),NOSPLIT,$0
+TEXT ·subVV_vec(SB),NOSPLIT,$0
+	MOVD	z_len+8(FP), R3
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R9
+	MOVD	z+0(FP), R2
+	MOVD	$0, R4		// c = 0
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R10		// i = 0
+	
+	// s/JL/JMP/ below to disable the unrolled loop
+	SUB	$4, R3		// n -= 4
+	BLT	v1		// if n < 0 goto v1
+	SUB     $12, R3         // n -= 16
+        BLT     A1              // if n < 0 goto A1
+
+	MOVD	R8, R5
+	MOVD	R9, R6
+	MOVD	R2, R7
+
+	// n >= 0
+	// regular loop body unrolled 16x
+	VZERO	V0		// cf = 0
+	MOVD	$1, R4		// for 390 subtraction cf starts as 1 (no borrow)
+	VLVGG	$1, R4, V0	//put carry into V0
+
+UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
+	ADD	$64, R5
+	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
+	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
+
+
+	VLM	0(R6), V9, V12  	// 64-bytes into V9..V12
+	ADD	$64, R6
+	VPDI	$0x4,V9,V9,V9		// flip the doublewords to big-endian order
+	VPDI	$0x4,V10,V10,V10	// flip the doublewords to big-endian order
+
+	VSBCBIQ	V1, V9, V0, V25
+	VSBIQ	V1, V9, V0, V17
+	VSBCBIQ	V2, V10, V25, V26
+	VSBIQ	V2, V10, V25, V18
+
+
+	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
+	VLM	0(R6), V13, V14  	// 32-bytes into V13..V14
+	ADD	$32, R5
+	ADD	$32, R6
+
+	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
+	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
+	VPDI	$0x4,V11,V11,V11	// flip the doublewords to big-endian order
+	VPDI	$0x4,V12,V12,V12	// flip the doublewords to big-endian order
+
+	VSBCBIQ	V3, V11, V26, V27
+	VSBIQ	V3, V11, V26, V19
+	VSBCBIQ	V4, V12, V27, V28
+	VSBIQ	V4, V12, V27, V20
+
+	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
+	VLM	0(R6), V15, V16  	// 32-bytes into V15..V16
+	ADD	$32, R5
+	ADD	$32, R6
+
+	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
+	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
+	VPDI	$0x4,V13,V13,V13	// flip the doublewords to big-endian order
+	VPDI	$0x4,V14,V14,V14	// flip the doublewords to big-endian order
+
+	VSBCBIQ	V5, V13, V28, V29
+	VSBIQ	V5, V13, V28, V21
+	VSBCBIQ	V6, V14, V29, V30
+	VSBIQ	V6, V14, V29, V22
+
+	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
+	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
+	VPDI	$0x4,V15,V15,V15	// flip the doublewords to big-endian order
+	VPDI	$0x4,V16,V16,V16	// flip the doublewords to big-endian order
+
+	VSBCBIQ	V7, V15, V30, V31
+	VSBIQ	V7, V15, V30, V23
+	VSBCBIQ	V8, V16, V31, V0	//V0 has carry-over
+	VSBIQ	V8, V16, V31, V24
+
+	VPDI	$0x4,V17,V17,V17	// flip the doublewords to big-endian order
+	VPDI	$0x4,V18,V18,V18	// flip the doublewords to big-endian order
+	VPDI	$0x4,V19,V19,V19	// flip the doublewords to big-endian order
+	VPDI	$0x4,V20,V20,V20	// flip the doublewords to big-endian order
+	VPDI	$0x4,V21,V21,V21	// flip the doublewords to big-endian order
+	VPDI	$0x4,V22,V22,V22	// flip the doublewords to big-endian order
+	VPDI	$0x4,V23,V23,V23	// flip the doublewords to big-endian order
+	VPDI	$0x4,V24,V24,V24	// flip the doublewords to big-endian order
+	VSTM	V17, V24, 0(R7)   // 128-bytes into z
+	ADD	$128, R7
+	ADD	$128, R10	// i += 16
+	SUB	$16,  R3	// n -= 16
+	BGE	UU1		// if n >= 0 goto UU1
+	VLGVG	$1, V0, R4	// put cf into R4
+	SUB	$1, R4		// save cf
+
+A1:	ADD	$12, R3		// n += 12 (n is now biased by -4, as U1 expects)
+	BLT	v1		// if n < 0 goto v1
+	
+U1:	// n >= 0
+	// regular loop body unrolled 4x
+	MOVD	0(R8)(R10*1), R5
+	MOVD	8(R8)(R10*1), R6
+	MOVD	16(R8)(R10*1), R7
+	MOVD	24(R8)(R10*1), R1
+	MOVD	R0, R11
+	SUBC	R4, R11		// restore CF
+	MOVD	0(R9)(R10*1), R11
+	SUBE	R11, R5
+	MOVD	8(R9)(R10*1), R11
+	SUBE	R11, R6
+	MOVD	16(R9)(R10*1), R11
+	SUBE	R11, R7
+	MOVD	24(R9)(R10*1), R11
+	SUBE	R11, R1
+	MOVD	R0, R4
+	SUBE	R4, R4		// save CF
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R6, 8(R2)(R10*1)
+	MOVD	R7, 16(R2)(R10*1)
+	MOVD	R1, 24(R2)(R10*1)
+
+	ADD	$32, R10	// i += 4
+	SUB	$4,  R3		// n -= 4
+	BGE	U1		// if n >= 0 goto U1
+
+v1:	ADD	$4, R3		// n += 4
+	BLE	E1		// if n <= 0 goto E1
+
+L1:	// n > 0
+	MOVD	R0, R11
+	SUBC	R4, R11		// restore CF
+	MOVD	0(R8)(R10*1), R5
+	MOVD	0(R9)(R10*1), R11
+	SUBE	R11, R5
+	MOVD	R5, 0(R2)(R10*1)
+	MOVD	R0, R4
+	SUBE	R4, R4		// save CF
+
+	ADD	$8, R10		// i++
+	SUB	$1, R3		// n--
+	BGT	L1		// if n > 0 goto L1
+
+E1:	NEG	R4, R4
+	MOVD	R4, c+72(FP)	// return c
+	RET
+
+
+// DI = R3, CX = R4, SI = r10, r8 = r8, r9=r9, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0) + use R11
+// func subVV_novec(z, x, y []Word) (c Word)
+// (same as addVV except for SUBC/SUBE instead of ADDC/ADDE and label names)
+TEXT ·subVV_novec(SB),NOSPLIT,$0
 	MOVD z_len+8(FP), R3
 	MOVD x+24(FP), R8
 	MOVD y+48(FP), R9
@@ -158,9 +529,163 @@
 	MOVD R4, c+72(FP)	// return c
 	RET
 
-
-// func addVW(z, x []Word, y Word) (c Word)
 TEXT ·addVW(SB),NOSPLIT,$0
+	MOVD	addwvectorfacility+0x00(SB),R1	// R1 = routine currently stored in addwvectorfacility (initially addVW_check)
+	BR	(R1)				// jump (not call) to it; the argument frame is left untouched
+	
+TEXT ·addVW_check(SB),NOSPLIT,$0	// resolver: picks the addVW implementation from hasVX and caches the choice
+	MOVB	·hasVX(SB), R1
+	CMPBEQ	R1, $1, vectorimpl      // hasVX == 1: vector facility supported
+	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = &addwvectorfacility
+	MOVD    $·addVW_novec(SB), R2
+        MOVD    R2, 0(R1)			// addwvectorfacility = addVW_novec, so later addVW calls bypass this check
+	// (the two instructions above stand in for: MOVD $·addVW_novec(SB), 0(R1))
+	BR	·addVW_novec(SB)		// finish the current call in the scalar routine
+vectorimpl:
+	MOVD	$addwvectorfacility+0x00(SB), R1	// R1 = &addwvectorfacility
+	MOVD    $·addVW_vec(SB), R2
+        MOVD    R2, 0(R1)			// addwvectorfacility = addVW_vec
+	// (the two instructions above stand in for: MOVD $·addVW_vec(SB), 0(R1))
+	BR	·addVW_vec(SB)			// finish the current call in the vector routine
+
+GLOBL addwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the address addVW jumps to
+DATA addwvectorfacility+0x00(SB)/8, $·addVW_check(SB)	// initial value: the resolver above
+
+
+// func addVW_vec(z, x []Word, y Word) (c Word): z = x + y, returns the carry; 16 words per vector pass, then 4x and 1x scalar tails
+TEXT ·addVW_vec(SB),NOSPLIT,$0
+	MOVD	z_len+8(FP), R3
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R4	// c = y
+	MOVD	z+0(FP), R2
+
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R10		// i = 0 (byte offset used by the scalar loops)
+	MOVD	R8, R5		// R5 = read cursor into x for the vector loop
+	MOVD	R2, R7		// R7 = write cursor into z for the vector loop
+
+	// s/BLT/BR/ below to disable the unrolled loops
+	SUB	$4, R3			// n -= 4
+	BLT	v10			// if n < 0 goto v10
+	SUB	$12, R3			// n -= 12 (16 subtracted in total)
+	BLT	A10			// fewer than 16 words: skip the vector loop
+
+	// n >= 0
+	// regular loop body unrolled 16x
+
+	VZERO	V0			// prepare V0 to be final carry register
+	VZERO	V9			// to ensure upper half is zero
+	VLVGG	$1, R4, V9		// V9 doubleword element 1 = y
+UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
+	ADD	$64, R5
+	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
+	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
+
+
+	VACCCQ	V1, V9, V0, V25		// V25 = carry out of V1 + V9 + carry-in V0
+	VACQ	V1, V9, V0, V17		// V17 = V1 + V9 + carry-in V0
+	VZERO	V9			// y is added only once; from here on only the carry chain is propagated
+	VACCCQ	V2, V9, V25, V26
+	VACQ	V2, V9, V25, V18
+
+
+	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
+	ADD	$32, R5
+
+	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
+	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
+
+	VACCCQ	V3, V9, V26, V27
+	VACQ	V3, V9, V26, V19
+	VACCCQ	V4, V9, V27, V28
+	VACQ	V4, V9, V27, V20
+
+	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
+	ADD	$32, R5
+
+	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
+	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
+
+	VACCCQ	V5, V9, V28, V29
+	VACQ	V5, V9, V28, V21
+	VACCCQ	V6, V9, V29, V30
+	VACQ	V6, V9, V29, V22
+
+	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
+	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
+
+	VACCCQ	V7, V9, V30, V31
+	VACQ	V7, V9, V30, V23
+	VACCCQ	V8, V9, V31, V0	//V0 has carry-over
+	VACQ	V8, V9, V31, V24
+
+	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
+	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
+	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
+	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
+	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
+	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
+	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
+	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
+	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
+	ADD	$128, R7
+	ADD	$128, R10		// i += 16
+	SUB	$16,  R3		// n -= 16
+	BGE	UU1		// if n >= 0 goto UU1
+	VLGVG	$1, V0, R4	// put cf into R4 in case we branch to v10
+
+A10:	ADD	$12, R3		// n += 12 (undoes the SUB $12 above; n is again "remaining - 4")
+
+
+	// s/BLT/BR/ below to disable the 4x unrolled loop
+
+	BLT	v10		// if n < 0 goto v10
+
+
+U4:	// n >= 0
+	// regular loop body unrolled 4x
+	MOVD 0(R8)(R10*1), R5
+	MOVD 8(R8)(R10*1), R6
+	MOVD 16(R8)(R10*1), R7
+	MOVD 24(R8)(R10*1), R1
+	ADDC R4, R5		// x[i] + c, sets the carry
+	ADDE R0, R6		// propagate the carry through the next three words
+	ADDE R0, R7
+	ADDE R0, R1
+	ADDE R0, R0		// R0 = 0 + 0 + carry
+	MOVD R0, R4		// save CF
+	SUB  R0, R0		// restore R0 = 0
+	MOVD R5, 0(R2)(R10*1)
+	MOVD R6, 8(R2)(R10*1)
+	MOVD R7, 16(R2)(R10*1)
+	MOVD R1, 24(R2)(R10*1)
+
+	ADD $32, R10		// i += 4 words (32 bytes)
+	SUB $4, R3		// n -= 4
+	BGE U4			// if n >= 0 goto U4
+
+v10:	ADD $4, R3		// n += 4
+	BLE E10			// if n <= 0 goto E10
+
+
+L4:	// n > 0
+	MOVD	0(R8)(R10*1), R5
+	ADDC	R4, R5		// x[i] + c, sets the carry
+	ADDE	R0, R0		// R0 = 0 + 0 + carry
+	MOVD	R0, R4		// save CF
+	SUB 	R0, R0		// restore R0 = 0
+	MOVD	R5, 0(R2)(R10*1)
+
+	ADD	$8, R10		// i++ (8 bytes)
+	SUB	$1, R3		// n--
+	BGT	L4		// if n > 0 goto L4
+
+E10:	MOVD	R4, c+56(FP)	// return c
+
+	RET
+
+
+TEXT ·addVW_novec(SB),NOSPLIT,$0
 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
 	MOVD z_len+8(FP), R3
 	MOVD x+24(FP), R8
@@ -214,10 +739,166 @@
 
 	RET
 
+TEXT ·subVW(SB),NOSPLIT,$0
+	MOVD	subwvectorfacility+0x00(SB),R1	// R1 = routine currently stored in subwvectorfacility (initially subVW_check)
+	BR	(R1)				// jump (not call) to it; the argument frame is left untouched
+	
+TEXT ·subVW_check(SB),NOSPLIT,$0	// resolver: picks the subVW implementation from hasVX and caches the choice
+	MOVB	·hasVX(SB), R1
+	CMPBEQ	R1, $1, vectorimpl      // hasVX == 1: vector facility supported
+	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = &subwvectorfacility
+	MOVD    $·subVW_novec(SB), R2
+        MOVD    R2, 0(R1)			// subwvectorfacility = subVW_novec, so later subVW calls bypass this check
+	// (the two instructions above stand in for: MOVD $·subVW_novec(SB), 0(R1))
+	BR	·subVW_novec(SB)		// finish the current call in the scalar routine
+vectorimpl:
+	MOVD	$subwvectorfacility+0x00(SB), R1	// R1 = &subwvectorfacility
+	MOVD    $·subVW_vec(SB), R2
+        MOVD    R2, 0(R1)			// subwvectorfacility = subVW_vec
+	// (the two instructions above stand in for: MOVD $·subVW_vec(SB), 0(R1))
+	BR	·subVW_vec(SB)			// finish the current call in the vector routine
+
+GLOBL subwvectorfacility+0x00(SB), NOPTR, $8	// 8-byte slot holding the address subVW jumps to
+DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)	// initial value: the resolver above
+
+// func subVW_vec(z, x []Word, y Word) (c Word): z = x - y, returns the borrow; 16 words per vector pass, then 4x and 1x scalar tails
+TEXT ·subVW_vec(SB),NOSPLIT,$0
+	MOVD	z_len+8(FP), R3
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R4	// c = y
+	MOVD	z+0(FP), R2
+
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R10		// i = 0 (byte offset used by the scalar loops)
+	MOVD	R8, R5		// R5 = read cursor into x for the vector loop
+	MOVD	R2, R7		// R7 = write cursor into z for the vector loop
+
+	// s/BLT/BR/ below to disable the unrolled loops
+	SUB	$4, R3			// n -= 4
+	BLT	v11			// if n < 0 goto v11
+	SUB	$12, R3			// n -= 12 (16 subtracted in total)
+	BLT	A11			// fewer than 16 words: skip the vector loop
+
+	VZERO	V0
+	MOVD	$1, R6			// prepare V0 to be final carry register
+	VLVGG	$1, R6, V0		// borrow is initially "no borrow"
+	VZERO	V9			// to ensure upper half is zero
+	VLVGG	$1, R4, V9		// V9 doubleword element 1 = y
+
+	// n >= 0
+	// regular loop body unrolled 16x
+
+
+UU1:	VLM	0(R5), V1, V4		// 64-bytes into V1..V4
+	ADD	$64, R5
+	VPDI	$0x4,V1,V1,V1		// flip the doublewords to big-endian order
+	VPDI	$0x4,V2,V2,V2		// flip the doublewords to big-endian order
+
+
+	VSBCBIQ	V1, V9, V0, V25		// V25 = borrow indication of V1 - V9 with borrow-in V0
+	VSBIQ	V1, V9, V0, V17		// V17 = V1 - V9 with borrow-in V0
+	VZERO	V9			// y is subtracted only once; from here on only the borrow chain is propagated
+	VSBCBIQ	V2, V9, V25, V26
+	VSBIQ	V2, V9, V25, V18
+
+	VLM	0(R5), V5, V6		// 32-bytes into V5..V6
+	ADD	$32, R5
+
+	VPDI	$0x4,V3,V3,V3		// flip the doublewords to big-endian order
+	VPDI	$0x4,V4,V4,V4		// flip the doublewords to big-endian order
+
+
+	VSBCBIQ	V3, V9, V26, V27
+	VSBIQ	V3, V9, V26, V19
+	VSBCBIQ	V4, V9, V27, V28
+	VSBIQ	V4, V9, V27, V20
+
+	VLM	0(R5), V7, V8		// 32-bytes into V7..V8
+	ADD	$32, R5
+
+	VPDI	$0x4,V5,V5,V5		// flip the doublewords to big-endian order
+	VPDI	$0x4,V6,V6,V6		// flip the doublewords to big-endian order
+
+	VSBCBIQ	V5, V9, V28, V29
+	VSBIQ	V5, V9, V28, V21
+	VSBCBIQ	V6, V9, V29, V30
+	VSBIQ	V6, V9, V29, V22
+
+	VPDI	$0x4,V7,V7,V7		// flip the doublewords to big-endian order
+	VPDI	$0x4,V8,V8,V8		// flip the doublewords to big-endian order
+
+	VSBCBIQ	V7, V9, V30, V31
+	VSBIQ	V7, V9, V30, V23
+	VSBCBIQ	V8, V9, V31, V0	// V0 has carry-over
+	VSBIQ	V8, V9, V31, V24
+
+	VPDI	$0x4,V17,V17,V17	// flip the doublewords back to memory order
+	VPDI	$0x4,V18,V18,V18	// flip the doublewords back to memory order
+	VPDI	$0x4,V19,V19,V19	// flip the doublewords back to memory order
+	VPDI	$0x4,V20,V20,V20	// flip the doublewords back to memory order
+	VPDI	$0x4,V21,V21,V21	// flip the doublewords back to memory order
+	VPDI	$0x4,V22,V22,V22	// flip the doublewords back to memory order
+	VPDI	$0x4,V23,V23,V23	// flip the doublewords back to memory order
+	VPDI	$0x4,V24,V24,V24	// flip the doublewords back to memory order
+	VSTM	V17, V24, 0(R7)   	// 128-bytes into z
+	ADD	$128, R7
+	ADD	$128, R10		// i += 16
+	SUB	$16,  R3		// n -= 16
+	BGE	UU1			// if n >= 0 goto UU1
+	VLGVG	$1, V0, R4		// put cf into R4 in case we branch to v11
+	SUB	$1, R4			// save cf
+	NEG	R4, R4			// R4 = 1 - V0 element: the vector code uses 1 for "no borrow", the scalar code wants borrow as 0/1
+A11:	ADD	$12, R3			// n += 12 (undoes the SUB $12 above; n is again "remaining - 4")
+
+	BLT	v11			// if n < 0 goto v11
+
+	// n >= 0
+	// regular loop body unrolled 4x
+
+U4:	// n >= 0
+	// regular loop body unrolled 4x
+	MOVD 0(R8)(R10*1), R5
+	MOVD 8(R8)(R10*1), R6
+	MOVD 16(R8)(R10*1), R7
+	MOVD 24(R8)(R10*1), R1
+	SUBC R4, R5 //SLGR  -> SUBC
+	SUBE R0, R6 //SLBGR -> SUBE
+	SUBE R0, R7
+	SUBE R0, R1
+	SUBE R4, R4		// save CF
+	NEG  R4, R4		// negate the saved value so c is carried as a non-negative word
+	MOVD R5, 0(R2)(R10*1)
+	MOVD R6, 8(R2)(R10*1)
+	MOVD R7, 16(R2)(R10*1)
+	MOVD R1, 24(R2)(R10*1)
+
+	ADD $32, R10		// i += 4 words (32 bytes)
+	SUB $4, R3		// n -= 4
+	BGE U4			// if n >= 0 goto U4
+
+v11:	ADD $4, R3		// n += 4
+	BLE E11			// if n <= 0 goto E11
+
+L4:	// n > 0
+
+	MOVD	0(R8)(R10*1), R5
+	SUBC	R4, R5		// x[i] - c, sets the borrow
+	SUBE	R4, R4		// save CF
+	NEG	R4, R4		// negate the saved value so c is carried as a non-negative word
+	MOVD	R5, 0(R2)(R10*1)
+
+	ADD	$8, R10		// i++ (8 bytes)
+	SUB	$1, R3		// n--
+	BGT	L4		// if n > 0 goto L4
+
+E11:	MOVD	R4, c+56(FP)	// return c
+
+	RET
+
 //DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
 // func subVW(z, x []Word, y Word) (c Word)
 // (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
-TEXT ·subVW(SB),NOSPLIT,$0
+TEXT ·subVW_novec(SB),NOSPLIT,$0
 	MOVD z_len+8(FP), R3
 	MOVD x+24(FP), R8
 	MOVD y+48(FP), R4	// c = y
@@ -270,296 +951,299 @@
 
 // func shlVU(z, x []Word, s uint) (c Word)
 TEXT ·shlVU(SB),NOSPLIT,$0
-	MOVD z_len+8(FP), R5
-	SUB  $1, R5             // n--
-	BLT  X8b                // n < 0        (n <= 0)
+	MOVD	z_len+8(FP), R5
+	MOVD	$0, R0             // R0 is compared against and stored as zero below; make sure it is zero
+	SUB	$1, R5             // n--
+	BLT	X8b                // n < 0        (n <= 0)
 
 	// n > 0
-	MOVD s+48(FP), R4
-	CMPBEQ	R0, R4, Z80	       //handle 0 case beq
-	MOVD $64, R6
-	CMPBEQ  R6, R4, Z864	       //handle 64 case beq
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
-	SUB  R4, R6, R7
-	MOVD (R8)(R5*1), R10    // w1 = x[i-1]
-	SRD  R7, R10, R3
-	MOVD R3, c+56(FP)
+	MOVD	s+48(FP), R4
+	CMPBEQ	R0, R4, Z80	   // s == 0: plain copy, handled at Z80
+	MOVD	$64, R6
+	CMPBEQ	R6, R4, Z864	   // s == 64: whole-word move, handled at Z864
+	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the last word
+	SUB	R4, R6, R7         // R7 = 64 - s
+	MOVD	(R8)(R5*1), R10    // w1 = x[n-1]
+	SRD	R7, R10, R3        // c = x[n-1] >> (64 - s)
+	MOVD	R3, c+56(FP)
 
-	MOVD $0, R1             // i = 0
-	BR   E8
+	MOVD	$0, R1             // i = 0
+	BR	E8
 
 	// i < n-1
-L8:	MOVD R10, R3             // w = w1
-	MOVD -8(R8)(R5*1), R10   // w1 = x[i+1]
+L8:	MOVD	R10, R3             // w = w1
+	MOVD	-8(R8)(R5*1), R10   // w1 = x[i-1] (this loop walks from the top word down)
 
-	SLD  R4,  R3             // w<<s | w1>>ŝ
-	SRD  R7, R10, R6
-	OR   R6, R3
-	MOVD R3, (R2)(R5*1)      // z[i] = w<<s | w1>>ŝ
-	SUB  $8, R5              // i--
+	SLD	R4,  R3             // w<<s | w1>>ŝ
+	SRD	R7, R10, R6
+	OR 	R6, R3
+	MOVD	R3, (R2)(R5*1)      // z[i] = w<<s | w1>>ŝ
+	SUB	$8, R5              // i--
 
-E8:	CMPBGT R5, R0, L8        // i < n-1
+E8:	CMPBGT	R5, R0, L8	    // loop while i > 0
 
 	// i >= n-1
-X8a:	SLD  R4, R10             // w1<<s
-	MOVD R10, (R2)           // z[0] = w1<<s
+X8a:	SLD	R4, R10             // w1<<s
+	MOVD	R10, (R2)           // z[0] = w1<<s
 	RET
 
-X8b:	MOVD R0, c+56(FP)
+X8b:	MOVD	R0, c+56(FP)        // empty input: c = 0
 	RET
 
-Z80:	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
+Z80:	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the last word
 
-	MOVD (R8), R10
-	MOVD $0, R3
-	MOVD R3, c+56(FP)
+	MOVD	(R8), R10          // w1 = x[0]
+	MOVD	$0, R3
+	MOVD	R3, c+56(FP)       // s == 0: c = 0
 
-	MOVD $0, R1             // i = 0
-	BR   E8Z
+	MOVD	$0, R1             // i = 0
+	BR	E8Z
 
 	// i < n-1
-L8Z:	MOVD R10, R3
-	MOVD 8(R8)(R1*1), R10
+L8Z:	MOVD	R10, R3            // w = w1
+	MOVD	8(R8)(R1*1), R10   // w1 = x[i+1]
 
-	MOVD R3, (R2)(R1*1)
-	ADD  $8, R1
+	MOVD	R3, (R2)(R1*1)     // z[i] = x[i]
+	ADD 	$8, R1             // i++
 
-E8Z:	CMPBLT R1, R5, L8Z
+E8Z:	CMPBLT	R1, R5, L8Z        // loop while i < n-1
 
 	// i >= n-1
-	MOVD R10, (R2)(R5*1)
+	MOVD	R10, (R2)(R5*1)    // z[n-1] = x[n-1]
 	RET
 
-Z864:	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
-	MOVD (R8)(R5*1), R3     // w1 = x[n-1]
-	MOVD R3, c+56(FP)       // z[i] = x[n-1]
+Z864:	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5             // R5 = (n-1)*8, byte offset of the last word
+	MOVD	(R8)(R5*1), R3     // w1 = x[n-1]
+	MOVD	R3, c+56(FP)       // c = x[n-1]
 
-	BR   E864
+	BR	E864
 
 	// i < n-1
-L864:	MOVD -8(R8)(R5*1), R3
+L864:	MOVD	-8(R8)(R5*1), R3   // w = x[i-1]
 
-	MOVD R3, (R2)(R5*1)     // z[i] = x[n-1]
-	SUB  $8, R5             // i--
+	MOVD	R3, (R2)(R5*1)     // z[i] = x[i-1]
+	SUB	$8, R5             // i--
 
-E864:	CMPBGT R5, R0, L864     // i < n-1
+E864:	CMPBGT	R5, R0, L864       // loop while i > 0
 
-	MOVD R0, (R2)           // z[n-1] = 0
+	MOVD	R0, (R2)           // z[0] = 0
 	RET
 
 
 // CX = R4, r8 = r8, r10 = r2 , r11 = r5, DX = r3, AX = r10 , BX = R1 , 64-count = r7 (R0 set to 0) temp = R6
 // func shrVU(z, x []Word, s uint) (c Word)
 TEXT ·shrVU(SB),NOSPLIT,$0
-	MOVD z_len+8(FP), R5
-	SUB  $1, R5             // n--
-	BLT  X9b                // n < 0        (n <= 0)
+	MOVD	z_len+8(FP), R5
+	MOVD	$0, R0             // R0 is compared against and stored as zero below; make sure it is zero
+	SUB	$1, R5             // n--
+	BLT	X9b                // n < 0        (n <= 0)
 
 	// n > 0
-	MOVD s+48(FP), R4
-	CMPBEQ	R0, R4, ZB0	       //handle 0 case beq
-	MOVD $64, R6
-	CMPBEQ  R6, R4, ZB64	       //handle 64 case beq
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
-	SUB  R4, R6, R7
-	MOVD (R8), R10          // w1 = x[0]
-	SLD  R7, R10, R3
-	MOVD R3, c+56(FP)
+	MOVD	s+48(FP), R4
+	CMPBEQ	R0, R4, ZB0	// s == 0: plain copy, handled at ZB0
+	MOVD	$64, R6
+	CMPBEQ 	R6, R4, ZB64	// s == 64: whole-word move, handled at ZB64
+	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
+	SUB	R4, R6, R7	// R7 = 64 - s
+	MOVD	(R8), R10	// w1 = x[0]
+	SLD	R7, R10, R3	// c = x[0] << (64 - s)
+	MOVD	R3, c+56(FP)
 
-	MOVD $0, R1            // i = 0
-	BR   E9
+	MOVD	$0, R1		// i = 0
+	BR 	E9
 
 	// i < n-1
-L9:	MOVD R10, R3            // w = w1
-	MOVD 8(R8)(R1*1), R10   // w1 = x[i+1]
+L9:	MOVD	R10, R3		// w = w1
+	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
 
-	SRD  R4,  R3            // w>>s | w1<<s
-	SLD  R7, R10, R6
-	OR   R6, R3
-	MOVD R3, (R2)(R1*1)     // z[i] = w>>s | w1<<s
-	ADD  $8, R1             // i++
+	SRD	R4,  R3		// w>>s | w1<<(64-s)
+	SLD	R7, R10, R6
+	OR	R6, R3
+	MOVD	R3, (R2)(R1*1)	// z[i] = w>>s | w1<<(64-s)
+	ADD	$8, R1		// i++
 
-E9:	CMPBLT R1, R5, L9       // i < n-1
+E9:	CMPBLT	R1, R5, L9	// i < n-1
 
 	// i >= n-1
-X9a:	SRD  R4, R10            // w1>>s
-	MOVD R10, (R2)(R5*1)    // z[n-1] = w1>>s
+X9a:	SRD	R4, R10		// w1>>s
+	MOVD	R10, (R2)(R5*1)	// z[n-1] = w1>>s
 	RET
 
-X9b:	MOVD R0, c+56(FP)
+X9b:	MOVD	R0, c+56(FP)	// empty input: c = 0
 	RET
 
-ZB0:	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
+ZB0:	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
 
-	MOVD (R8), R10          // w1 = x[0]
-	MOVD $0, R3             // R10 << 64
-	MOVD R3, c+56(FP)
+	MOVD	(R8), R10	// w1 = x[0]
+	MOVD	$0, R3		// s == 0: c = 0
+	MOVD	R3, c+56(FP)
 
-	MOVD $0, R1             // i = 0
-	BR   E9Z
+	MOVD	$0, R1		// i = 0
+	BR	E9Z
 
 	// i < n-1
-L9Z:	MOVD R10, R3            // w = w1
-	MOVD 8(R8)(R1*1), R10   // w1 = x[i+1]
+L9Z:	MOVD	R10, R3		// w = w1
+	MOVD	8(R8)(R1*1), R10	// w1 = x[i+1]
 
-	MOVD R3, (R2)(R1*1)     // z[i] = w>>s | w1<<s
-	ADD  $8, R1             // i++
+	MOVD	R3, (R2)(R1*1)	// z[i] = x[i] (no shift)
+	ADD	$8, R1		// i++
 
-E9Z:	CMPBLT R1, R5, L9Z      // i < n-1
+E9Z:	CMPBLT	R1, R5, L9Z	// i < n-1
 
 	// i >= n-1
-	MOVD R10, (R2)(R5*1)    // z[n-1] = w1>>s
+	MOVD	R10, (R2)(R5*1)	// z[n-1] = x[n-1]
 	RET
 
-ZB64:	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	SLD  $3, R5             // n = n*8
-	MOVD (R8), R3          // w1 = x[0]
-	MOVD R3, c+56(FP)
+ZB64:	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	SLD	$3, R5		// R5 = (n-1)*8, byte offset of the last word
+	MOVD	(R8), R3	// w1 = x[0]
+	MOVD	R3, c+56(FP)	// c = x[0]
 
-	MOVD $0, R1            // i = 0
-	BR   E964
+	MOVD	$0, R1		// i = 0
+	BR	E964
 
 	// i < n-1
-L964:	MOVD 8(R8)(R1*1), R3   // w1 = x[i+1]
+L964:	MOVD	8(R8)(R1*1), R3	// w1 = x[i+1]
 
-	MOVD R3, (R2)(R1*1)     // z[i] = w>>s | w1<<s
-	ADD  $8, R1             // i++
+	MOVD	R3, (R2)(R1*1)	// z[i] = x[i+1]
+	ADD	$8, R1		// i++
 
-E964:	CMPBLT R1, R5, L964     // i < n-1
+E964:	CMPBLT	R1, R5, L964	// i < n-1
 
 	// i >= n-1
-	MOVD  $0, R10            // w1>>s
-	MOVD R10, (R2)(R5*1)    // z[n-1] = w1>>s
+	MOVD	$0, R10            // top word becomes zero
+	MOVD	R10, (R2)(R5*1)    // z[n-1] = 0
 	RET
 
 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, DX = r3, AX = r6 , BX = R1 , (R0 set to 0) + use R11 + use R7 for i
 // func mulAddVWW(z, x []Word, y, r Word) (c Word)
 TEXT ·mulAddVWW(SB),NOSPLIT,$0
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R9
-	MOVD r+56(FP), R4	// c = r
-	MOVD z_len+8(FP), R5
-	MOVD $0, R1		// i = 0
-	MOVD $0, R7		// i*8 = 0
-	MOVD $0, R0		// make sure it's zero
-	BR E5
+	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R9
+	MOVD	r+56(FP), R4	// c = r
+	MOVD	z_len+8(FP), R5
+	MOVD	$0, R1		// i*8 = 0 (byte offset; advanced by 8 in the loop)
+	MOVD	$0, R7		// i = 0 (word index; compared against n at E5)
+	MOVD	$0, R0		// make sure it's zero
+	BR	E5
 
-L5:	MOVD (R8)(R1*1), R6
-	MULHDU R9, R6
-	ADDC R4, R11 		//add to low order bits
-	ADDE R0, R6
-	MOVD R11, (R2)(R1*1)
-	MOVD R6, R4
-	ADD  $8, R1		// i*8 + 8
-	ADD  $1, R7		// i++
+L5:	MOVD	(R8)(R1*1), R6
+	MULHDU	R9, R6		// NOTE(review): the low product half evidently lands in R11 (used next) - confirm against the assembler's MULHDU expansion
+	ADDC	R4, R11 	//add to low order bits
+	ADDE	R0, R6		// fold the carry into the high half
+	MOVD	R11, (R2)(R1*1)	// z[i] = low half + c
+	MOVD	R6, R4		// c = high half
+	ADD	$8, R1		// i*8 + 8
+	ADD	$1, R7		// i++
 
-E5:	CMPBLT R7, R5, L5	// i < n
+E5:	CMPBLT	R7, R5, L5	// i < n
 
-	MOVD R4, c+64(FP)
+	MOVD	R4, c+64(FP)	// return c
 	RET
 
 // func addMulVVW(z, x []Word, y Word) (c Word)
 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1 , (R0 set to 0) + use R11 + use R7 for i
 TEXT ·addMulVVW(SB),NOSPLIT,$0
-	MOVD z+0(FP), R2
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R9
-	MOVD z_len+8(FP), R5
+	MOVD	z+0(FP), R2
+	MOVD	x+24(FP), R8
+	MOVD	y+48(FP), R9
+	MOVD	z_len+8(FP), R5
 
-	MOVD $0, R1		// i*8 = 0
-	MOVD $0, R7		// i = 0
-	MOVD $0, R0		// make sure it's zero
-	MOVD $0, R4		// c = 0
+	MOVD	$0, R1		// i*8 = 0
+	MOVD	$0, R7		// i = 0
+	MOVD	$0, R0		// make sure it's zero
+	MOVD	$0, R4		// c = 0
 
-	MOVD R5, R12
-	AND  $-2, R12
-	CMPBGE R5, $2, A6
-	BR   E6
+	MOVD	R5, R12
+	AND	$-2, R12	// R12 = n with the low bit cleared (even word count)
+	CMPBGE	R5, $2, A6	// at least two words: start in the 2x unrolled loop
+	BR	E6
 
-A6:	MOVD (R8)(R1*1), R6
-	MULHDU R9, R6
-	MOVD (R2)(R1*1), R10
-	ADDC R10, R11	//add to low order bits
-	ADDE R0, R6
-	ADDC R4, R11
-	ADDE R0, R6
-	MOVD R6, R4
-	MOVD R11, (R2)(R1*1)
+A6:	MOVD	(R8)(R1*1), R6
+	MULHDU	R9, R6
+	MOVD	(R2)(R1*1), R10
+	ADDC	R10, R11	//add to low order bits
+	ADDE	R0, R6
+	ADDC	R4, R11
+	ADDE	R0, R6
+	MOVD	R6, R4
+	MOVD	R11, (R2)(R1*1)
 
-	MOVD (8)(R8)(R1*1), R6
-	MULHDU R9, R6
-	MOVD (8)(R2)(R1*1), R10
-	ADDC R10, R11	//add to low order bits
-	ADDE R0, R6
-	ADDC R4, R11
-	ADDE R0, R6
-	MOVD R6, R4
-	MOVD R11, (8)(R2)(R1*1)
+	MOVD	(8)(R8)(R1*1), R6
+	MULHDU	R9, R6
+	MOVD	(8)(R2)(R1*1), R10
+	ADDC	R10, R11	//add to low order bits
+	ADDE	R0, R6
+	ADDC	R4, R11
+	ADDE	R0, R6
+	MOVD	R6, R4
+	MOVD	R11, (8)(R2)(R1*1)
 
-	ADD  $16, R1		// i*8 + 8
-	ADD  $2, R7		// i++
+	ADD	$16, R1		// i*8 + 16
+	ADD	$2, R7		// i += 2
 
-	CMPBLT R7, R12, A6
-	BR E6
+	CMPBLT	R7, R12, A6	// loop while i < R12 (the even part of n)
+	BR	E6
 
-L6:	MOVD (R8)(R1*1), R6
-	MULHDU R9, R6
-	MOVD (R2)(R1*1), R10
-	ADDC R10, R11	//add to low order bits
-	ADDE R0, R6
-	ADDC R4, R11
-	ADDE R0, R6
-	MOVD R6, R4
-	MOVD R11, (R2)(R1*1)
+L6:	MOVD	(R8)(R1*1), R6
+	MULHDU	R9, R6
+	MOVD	(R2)(R1*1), R10
+	ADDC	R10, R11	//add to low order bits
+	ADDE	R0, R6
+	ADDC	R4, R11
+	ADDE	R0, R6
+	MOVD	R6, R4
+	MOVD	R11, (R2)(R1*1)
 
-	ADD  $8, R1		// i*8 + 8
-	ADD  $1, R7		// i++
+	ADD	$8, R1		// i*8 + 8
+	ADD	$1, R7		// i++
 
-E6:	CMPBLT R7, R5, L6	// i < n
+E6:	CMPBLT	R7, R5, L6	// i < n
 
-	MOVD R4, c+56(FP)
+	MOVD	R4, c+56(FP)	// return c
 	RET
 
 // func divWVW(z []Word, xn Word, x []Word, y Word) (r Word)
 // CX = R4, r8 = r8, r9=r9, r10 = r2 , r11 = r5, AX = r11, DX = R6, r12=r12, BX = R1(*8) , (R0 set to 0) + use R11 + use R7 for i
 TEXT ·divWVW(SB),NOSPLIT,$0
-	MOVD z+0(FP), R2
-	MOVD xn+24(FP), R10	// r = xn
-	MOVD x+32(FP), R8
-	MOVD y+56(FP), R9
-	MOVD z_len+8(FP), R7	// i = z
-	SLD  $3, R7, R1		// i*8
-	MOVD $0, R0		// make sure it's zero
-	BR E7
+	MOVD	z+0(FP), R2
+	MOVD	xn+24(FP), R10	// r = xn
+	MOVD	x+32(FP), R8
+	MOVD	y+56(FP), R9
+	MOVD	z_len+8(FP), R7	// i = len(z)
+	SLD	$3, R7, R1		// i*8
+	MOVD	$0, R0		// make sure it's zero
+	BR	E7
 
-L7:	MOVD (R8)(R1*1), R11
-	WORD $0xB98700A9  //DLGR R10,R9
-	MOVD R11, (R2)(R1*1)
+L7:	MOVD	(R8)(R1*1), R11	// low half of the dividend = x[i]; the high half is r in R10
+	WORD	$0xB98700A9	//DLGR R10,R9
+	MOVD	R11, (R2)(R1*1)	// z[i] = R11 after the divide; R10 is carried on as r
 
-E7:	SUB  $1, R7		// i--
-	SUB  $8, R1
-	BGE L7			// i >= 0
+E7:	SUB	$1, R7		// i--
+	SUB	$8, R1		// byte offset follows i
+	BGE	L7		// i >= 0
 
-	MOVD R10, r+64(FP)
+	MOVD	R10, r+64(FP)	// return r
 	RET
 
 // func bitLen(x Word) (n int)
 TEXT ·bitLen(SB),NOSPLIT,$0
-	MOVD x+0(FP), R2
-	WORD $0xb9830022 // FLOGR R2,R2
-	MOVD $64, R3
-	SUB  R2, R3
-	MOVD R3, n+8(FP)
+	MOVD  x+0(FP), R2
+	FLOGR R2, R2 // find leftmost one: R2 = its bit position, i.e. the leading-zero count; clobbers R3
+	MOVD  $64, R3
+	SUB   R2, R3 // R3 = 64 - R2
+	MOVD  R3, n+8(FP) // return n
 	RET
+
diff --git a/src/math/big/arith_s390x_test.go b/src/math/big/arith_s390x_test.go
new file mode 100644
index 0000000..31a777e
--- /dev/null
+++ b/src/math/big/arith_s390x_test.go
@@ -0,0 +1,44 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build s390x,!math_big_pure_go
+
+package big
+
+import "testing"
+
+// TestFunVVnovec checks that the non-vector addVV/subVV routines work even when
+// the tests run on a vector-capable machine. It is a no-op when hasVX is false;
+// it reuses the sumVV table with the same argument permutations for each routine.
+func TestFunVVnovec(t *testing.T) {
+	if hasVX {
+		for _, a := range sumVV {
+			arg := a
+			testFunVV(t, "addVV_novec", addVV_novec, arg)
+
+			arg = argVV{a.z, a.y, a.x, a.c}
+			testFunVV(t, "addVV_novec symmetric", addVV_novec, arg)
+
+			arg = argVV{a.x, a.z, a.y, a.c}
+			testFunVV(t, "subVV_novec", subVV_novec, arg)
+
+			arg = argVV{a.y, a.z, a.x, a.c}
+			testFunVV(t, "subVV_novec symmetric", subVV_novec, arg)
+		}
+	}
+}
+
+// TestFunVWnovec does the same for the non-vector addVW/subVW routines, which
+// the addVW/subVW dispatchers stop selecting once hasVX is set.
+func TestFunVWnovec(t *testing.T) {
+	if hasVX {
+		for _, a := range sumVW {
+			arg := a
+			testFunVW(t, "addVW_novec", addVW_novec, arg)
+
+			arg = argVW{a.x, a.z, a.y, a.c}
+			testFunVW(t, "subVW_novec", subVW_novec, arg)
+		}
+	}
+}
diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go
index 75862b4..f2b3083 100644
--- a/src/math/big/arith_test.go
+++ b/src/math/big/arith_test.go
@@ -6,10 +6,14 @@
 
 import (
 	"fmt"
+	"internal/testenv"
 	"math/rand"
+	"strings"
 	"testing"
 )
 
+var isRaceBuilder = strings.HasSuffix(testenv.Builder(), "-race")
+
 type funWW func(x, y, c Word) (z1, z0 Word)
 type argWW struct {
 	x, y, c, z1, z0 Word
@@ -123,6 +127,9 @@
 
 func BenchmarkAddVV(b *testing.B) {
 	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
 		x := rndV(n)
 		y := rndV(n)
 		z := make([]Word, n)
@@ -233,6 +240,9 @@
 
 func BenchmarkAddVW(b *testing.B) {
 	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
 		x := rndV(n)
 		y := rndW()
 		z := make([]Word, n)
@@ -371,6 +381,9 @@
 
 func BenchmarkAddMulVVW(b *testing.B) {
 	for _, n := range benchSizes {
+		if isRaceBuilder && n > 1e3 {
+			continue
+		}
 		x := rndV(n)
 		y := rndW()
 		z := make([]Word, n)
diff --git a/src/math/big/decimal.go b/src/math/big/decimal.go
index 2c0c9da..2dfa032 100644
--- a/src/math/big/decimal.go
+++ b/src/math/big/decimal.go
@@ -125,11 +125,12 @@
 
 	// read a digit, write a digit
 	w := 0 // write index
+	mask := Word(1)<<s - 1
 	for r < len(x.mant) {
 		ch := Word(x.mant[r])
 		r++
 		d := n >> s
-		n -= d << s
+		n &= mask // n -= d << s
 		x.mant[w] = byte(d + '0')
 		w++
 		n = n*10 + ch - '0'
@@ -138,7 +139,7 @@
 	// write extra digits that still fit
 	for n > 0 && w < len(x.mant) {
 		d := n >> s
-		n -= d << s
+		n &= mask
 		x.mant[w] = byte(d + '0')
 		w++
 		n = n * 10
@@ -148,7 +149,7 @@
 	// append additional digits that didn't fit
 	for n > 0 {
 		d := n >> s
-		n -= d << s
+		n &= mask
 		x.mant = append(x.mant, byte(d+'0'))
 		n = n * 10
 	}
diff --git a/src/math/big/decimal_test.go b/src/math/big/decimal_test.go
index 15bdb18..424811e 100644
--- a/src/math/big/decimal_test.go
+++ b/src/math/big/decimal_test.go
@@ -4,7 +4,10 @@
 
 package big
 
-import "testing"
+import (
+	"fmt"
+	"testing"
+)
 
 func TestDecimalString(t *testing.T) {
 	for _, test := range []struct {
@@ -105,12 +108,27 @@
 	}
 }
 
+var sink string // package-level destination for benchmark results
+
 func BenchmarkDecimalConversion(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		for shift := -100; shift <= +100; shift++ {
 			var d decimal
 			d.init(natOne, shift)
-			d.String()
+			sink = d.String() // store the result; presumably keeps the call from being optimized away
 		}
 	}
 }
+
+func BenchmarkFloatString(b *testing.B) {
+	x := new(Float) // one Float reused for every precision
+	for _, prec := range []uint{1e2, 1e3, 1e4, 1e5} {
+		x.SetPrec(prec).SetRat(NewRat(1, 3)) // x = 1/3 at precision prec
+		b.Run(fmt.Sprintf("%v", prec), func(b *testing.B) { // sub-benchmark named after the precision
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				sink = x.String() // store into the package-level sink
+			}
+		})
+	}
+}
diff --git a/src/math/big/doc.go b/src/math/big/doc.go
index a3c2375..65ed019 100644
--- a/src/math/big/doc.go
+++ b/src/math/big/doc.go
@@ -31,7 +31,7 @@
 
 	var z1 Int
 	z1.SetUint64(123)                 // z1 := 123
-	z2 := new(Rat).SetFloat64(1.2)    // z2 := 6/5
+	z2 := new(Rat).SetFloat64(1.25)   // z2 := 5/4
 	z3 := new(Float).SetInt(z1)       // z3 := 123.0
 
 Setters, numeric operations and predicates are represented as methods of
diff --git a/src/math/big/example_test.go b/src/math/big/example_test.go
index ac79552..cfc7735 100644
--- a/src/math/big/example_test.go
+++ b/src/math/big/example_test.go
@@ -51,6 +51,19 @@
 	// Output: 18446744073709551617
 }
 
+func ExampleFloat_Scan() {
+	// The Scan method is rarely called directly; the fmt package recognizes
+	// *big.Float as an implementation of fmt.Scanner and calls it for us.
+	f := new(big.Float)
+	_, err := fmt.Sscan("1.19282e99", f) // f is filled in through its Scan method
+	if err != nil {
+		log.Println("error scanning value:", err)
+	} else {
+		fmt.Println(f)
+	}
+	// Output: 1.19282e+99
+}
+
 // This example demonstrates how to use big.Int to compute the smallest
 // Fibonacci number with 100 decimal digits and to test whether it is prime.
 func Example_fibonacci() {
diff --git a/src/math/big/float.go b/src/math/big/float.go
index 7a9c2b3..aabd7b4 100644
--- a/src/math/big/float.go
+++ b/src/math/big/float.go
@@ -1210,20 +1210,30 @@
 	ex := int64(x.exp) - int64(len(x.mant))*_W
 	ey := int64(y.exp) - int64(len(y.mant))*_W
 
+	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
+
 	// TODO(gri) having a combined add-and-shift primitive
 	//           could make this code significantly faster
 	switch {
 	case ex < ey:
-		// cannot re-use z.mant w/o testing for aliasing
-		t := nat(nil).shl(y.mant, uint(ey-ex))
-		z.mant = z.mant.add(x.mant, t)
+		if al {
+			t := nat(nil).shl(y.mant, uint(ey-ex))
+			z.mant = z.mant.add(x.mant, t)
+		} else {
+			z.mant = z.mant.shl(y.mant, uint(ey-ex))
+			z.mant = z.mant.add(x.mant, z.mant)
+		}
 	default:
 		// ex == ey, no shift needed
 		z.mant = z.mant.add(x.mant, y.mant)
 	case ex > ey:
-		// cannot re-use z.mant w/o testing for aliasing
-		t := nat(nil).shl(x.mant, uint(ex-ey))
-		z.mant = z.mant.add(t, y.mant)
+		if al {
+			t := nat(nil).shl(x.mant, uint(ex-ey))
+			z.mant = z.mant.add(t, y.mant)
+		} else {
+			z.mant = z.mant.shl(x.mant, uint(ex-ey))
+			z.mant = z.mant.add(z.mant, y.mant)
+		}
 		ex = ey
 	}
 	// len(z.mant) > 0
@@ -1247,18 +1257,28 @@
 	ex := int64(x.exp) - int64(len(x.mant))*_W
 	ey := int64(y.exp) - int64(len(y.mant))*_W
 
+	al := alias(z.mant, x.mant) || alias(z.mant, y.mant)
+
 	switch {
 	case ex < ey:
-		// cannot re-use z.mant w/o testing for aliasing
-		t := nat(nil).shl(y.mant, uint(ey-ex))
-		z.mant = t.sub(x.mant, t)
+		if al {
+			t := nat(nil).shl(y.mant, uint(ey-ex))
+			z.mant = t.sub(x.mant, t)
+		} else {
+			z.mant = z.mant.shl(y.mant, uint(ey-ex))
+			z.mant = z.mant.sub(x.mant, z.mant)
+		}
 	default:
 		// ex == ey, no shift needed
 		z.mant = z.mant.sub(x.mant, y.mant)
 	case ex > ey:
-		// cannot re-use z.mant w/o testing for aliasing
-		t := nat(nil).shl(x.mant, uint(ex-ey))
-		z.mant = t.sub(t, y.mant)
+		if al {
+			t := nat(nil).shl(x.mant, uint(ex-ey))
+			z.mant = t.sub(t, y.mant)
+		} else {
+			z.mant = z.mant.shl(x.mant, uint(ex-ey))
+			z.mant = z.mant.sub(z.mant, y.mant)
+		}
 		ex = ey
 	}
 
diff --git a/src/math/big/float_test.go b/src/math/big/float_test.go
index 464619b..7d4bd31 100644
--- a/src/math/big/float_test.go
+++ b/src/math/big/float_test.go
@@ -5,6 +5,7 @@
 package big
 
 import (
+	"flag"
 	"fmt"
 	"math"
 	"strconv"
@@ -1495,12 +1496,14 @@
 	}
 }
 
+var long = flag.Bool("long", false, "run very long tests")
+
 // TestFloatQuoSmoke tests all divisions x/y for values x, y in the range [-n, +n];
 // it serves as a smoke test for basic correctness of division.
 func TestFloatQuoSmoke(t *testing.T) {
-	n := 1000
-	if testing.Short() {
-		n = 10
+	n := 10
+	if *long {
+		n = 1000
 	}
 
 	const dprec = 3         // max. precision variation
@@ -1762,3 +1765,41 @@
 		}
 	}
 }
+
+func BenchmarkFloatAdd(b *testing.B) {
+	x := new(Float)
+	y := new(Float)
+	z := new(Float) // result; a separate Float from both operands
+
+	for _, prec := range []uint{10, 1e2, 1e3, 1e4, 1e5} {
+		x.SetPrec(prec).SetRat(NewRat(1, 3)) // x = 1/3 at precision prec
+		y.SetPrec(prec).SetRat(NewRat(1, 6)) // y = 1/6 at precision prec
+		z.SetPrec(prec)
+
+		b.Run(fmt.Sprintf("%v", prec), func(b *testing.B) { // sub-benchmark named after the precision
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				z.Add(x, y) // z is reused on every iteration
+			}
+		})
+	}
+}
+
+func BenchmarkFloatSub(b *testing.B) {
+	x := new(Float)
+	y := new(Float)
+	z := new(Float) // result; a separate Float from both operands
+
+	for _, prec := range []uint{10, 1e2, 1e3, 1e4, 1e5} {
+		x.SetPrec(prec).SetRat(NewRat(1, 3)) // x = 1/3 at precision prec
+		y.SetPrec(prec).SetRat(NewRat(1, 6)) // y = 1/6 at precision prec
+		z.SetPrec(prec)
+
+		b.Run(fmt.Sprintf("%v", prec), func(b *testing.B) { // sub-benchmark named after the precision
+			b.ReportAllocs()
+			for i := 0; i < b.N; i++ {
+				z.Sub(x, y) // z is reused on every iteration
+			}
+		})
+	}
+}
diff --git a/src/math/big/floatconv.go b/src/math/big/floatconv.go
index a884df6..95d1bf8 100644
--- a/src/math/big/floatconv.go
+++ b/src/math/big/floatconv.go
@@ -12,9 +12,13 @@
 	"strings"
 )
 
+var floatZero Float
+
 // SetString sets z to the value of s and returns z and a boolean indicating
 // success. s must be a floating-point number of the same format as accepted
-// by Parse, with base argument 0.
+// by Parse, with base argument 0. The entire string (not just a prefix) must
+// be valid for success. If the operation failed, the value of z is undefined
+// but the returned value is nil.
 func (z *Float) SetString(s string) (*Float, bool) {
 	if f, _, err := z.Parse(s, 0); err == nil {
 		return f, true
@@ -212,17 +216,18 @@
 //
 // It sets z to the (possibly rounded) value of the corresponding floating-
 // point value, and returns z, the actual base b, and an error err, if any.
+// The entire string (not just a prefix) must be consumed for success.
 // If z's precision is 0, it is changed to 64 before rounding takes effect.
 // The number must be of the form:
 //
 //	number   = [ sign ] [ prefix ] mantissa [ exponent ] | infinity .
 //	sign     = "+" | "-" .
-//      prefix   = "0" ( "x" | "X" | "b" | "B" ) .
+//	prefix   = "0" ( "x" | "X" | "b" | "B" ) .
 //	mantissa = digits | digits "." [ digits ] | "." digits .
 //	exponent = ( "E" | "e" | "p" ) [ sign ] digits .
 //	digits   = digit { digit } .
 //	digit    = "0" ... "9" | "a" ... "z" | "A" ... "Z" .
-//      infinity = [ sign ] ( "inf" | "Inf" ) .
+//	infinity = [ sign ] ( "inf" | "Inf" ) .
 //
 // The base argument must be 0, 2, 10, or 16. Providing an invalid base
 // argument will lead to a run-time panic.
@@ -273,3 +278,16 @@
 func ParseFloat(s string, base int, prec uint, mode RoundingMode) (f *Float, b int, err error) {
 	return new(Float).SetPrec(prec).SetMode(mode).Parse(s, base)
 }
+
+var _ fmt.Scanner = &floatZero // *Float must implement fmt.Scanner
+
+// Scan is a support routine for fmt.Scanner; it sets z to the value of
+// the scanned number. It accepts formats whose verbs are supported by
+// fmt.Scan for floating point values, which are:
+// 'b' (binary), 'e', 'E', 'f', 'F', 'g' and 'G'.
+// Scan doesn't handle ±Inf.
+func (z *Float) Scan(s fmt.ScanState, ch rune) error {
+	s.SkipSpace() // skip leading white space before the number
+	_, _, err := z.scan(byteReader{s}, 0) // ch is not consulted here; 0 is presumably the base, as in Parse - TODO confirm
+	return err
+}
diff --git a/src/math/big/floatconv_test.go b/src/math/big/floatconv_test.go
index b2a1ab0..edcb2eb 100644
--- a/src/math/big/floatconv_test.go
+++ b/src/math/big/floatconv_test.go
@@ -5,6 +5,7 @@
 package big
 
 import (
+	"bytes"
 	"fmt"
 	"math"
 	"strconv"
@@ -665,3 +666,54 @@
 		}
 	}
 }
+
+func TestFloatScan(t *testing.T) {
+	var floatScanTests = []struct {
+		input     string
+		format    string
+		output    string
+		remaining int
+		wantErr   bool
+	}{
+		0: {"10.0", "%f", "10", 0, false},
+		1: {"23.98+2.0", "%v", "23.98", 4, false},
+		2: {"-1+1", "%v", "-1", 2, false},
+		3: {" 00000", "%v", "0", 0, false},
+		4: {"-123456p-78", "%b", "-4.084816388e-19", 0, false},
+		5: {"+123", "%b", "123", 0, false},
+		6: {"-1.234e+56", "%e", "-1.234e+56", 0, false},
+		7: {"-1.234E-56", "%E", "-1.234e-56", 0, false},
+		8: {"-1.234e+567", "%g", "-1.234e+567", 0, false},
+		9: {"+1234567891011.234", "%G", "1.234567891e+12", 0, false},
+
+		// Scan doesn't handle ±Inf.
+		10: {"Inf", "%v", "", 3, true},
+		11: {"-Inf", "%v", "", 3, true},
+		12: {"-Inf", "%v", "", 3, true},
+	}
+
+	var buf bytes.Buffer
+	for i, test := range floatScanTests {
+		x := new(Float)
+		buf.Reset()
+		buf.WriteString(test.input)
+		_, err := fmt.Fscanf(&buf, test.format, x)
+		if test.wantErr {
+			if err == nil {
+				t.Errorf("#%d want non-nil err", i)
+			}
+			continue
+		}
+
+		if err != nil {
+			t.Errorf("#%d error: %s", i, err)
+		}
+
+		if x.String() != test.output {
+			t.Errorf("#%d got %s; want %s", i, x.String(), test.output)
+		}
+		if buf.Len() != test.remaining {
+			t.Errorf("#%d got %d bytes remaining; want %d", i, buf.Len(), test.remaining)
+		}
+	}
+}
diff --git a/src/math/big/floatexample_test.go b/src/math/big/floatexample_test.go
index fb799d5..0c6668c 100644
--- a/src/math/big/floatexample_test.go
+++ b/src/math/big/floatexample_test.go
@@ -11,7 +11,7 @@
 )
 
 func ExampleFloat_Add() {
-	// Operating on numbers of different precision.
+	// Operate on numbers of different precision.
 	var x, y, z big.Float
 	x.SetInt64(1000)          // x is automatically set to 64bit precision
 	y.SetFloat64(2.718281828) // y is automatically set to 53bit precision
@@ -26,8 +26,8 @@
 	// z = 1002.718282 (0x.faadf854p+10, prec = 32, acc = Below)
 }
 
-func Example_Shift() {
-	// Implementing Float "shift" by modifying the (binary) exponents directly.
+func ExampleFloat_shift() {
+	// Implement Float "shift" by modifying the (binary) exponents directly.
 	for s := -5; s <= 5; s++ {
 		x := big.NewFloat(0.5)
 		x.SetMantExp(x, x.MantExp(nil)+s) // shift x by s
diff --git a/src/math/big/floatmarsh.go b/src/math/big/floatmarsh.go
index 3725d4b..d1c1dab 100644
--- a/src/math/big/floatmarsh.go
+++ b/src/math/big/floatmarsh.go
@@ -16,7 +16,7 @@
 
 // GobEncode implements the gob.GobEncoder interface.
 // The Float value and all its attributes (precision,
-// rounding mode, accuracy) are marshalled.
+// rounding mode, accuracy) are marshaled.
 func (x *Float) GobEncode() ([]byte, error) {
 	if x == nil {
 		return nil, nil
diff --git a/src/math/big/ftoa.go b/src/math/big/ftoa.go
index 57b16e1..d2a8588 100644
--- a/src/math/big/ftoa.go
+++ b/src/math/big/ftoa.go
@@ -376,6 +376,8 @@
 	return y
 }
 
+var _ fmt.Formatter = &floatZero // *Float must implement fmt.Formatter
+
 // Format implements fmt.Formatter. It accepts all the regular
 // formats for floating-point numbers ('b', 'e', 'E', 'f', 'F',
 // 'g', 'G') as well as 'p' and 'v'. See (*Float).Text for the
diff --git a/src/math/big/gcd_test.go b/src/math/big/gcd_test.go
index a929bf5..3cca2ec 100644
--- a/src/math/big/gcd_test.go
+++ b/src/math/big/gcd_test.go
@@ -20,6 +20,9 @@
 }
 
 func runGCD(b *testing.B, aSize, bSize uint) {
+	if isRaceBuilder && (aSize > 1000 || bSize > 1000) {
+		b.Skip("skipping on race builder")
+	}
 	b.Run("WithoutXY", func(b *testing.B) {
 		runGCDExt(b, aSize, bSize, false)
 	})
diff --git a/src/math/big/int.go b/src/math/big/int.go
index f2a75d1..1d8dabc 100644
--- a/src/math/big/int.go
+++ b/src/math/big/int.go
@@ -361,7 +361,8 @@
 }
 
 // SetString sets z to the value of s, interpreted in the given base,
-// and returns z and a boolean indicating success. If SetString fails,
+// and returns z and a boolean indicating success. The entire string
+// (not just a prefix) must be valid for success. If SetString fails,
 // the value of z is undefined but the returned value is nil.
 //
 // The base argument must be 0 or a value between 2 and MaxBase. If the base
@@ -371,12 +372,11 @@
 //
 func (z *Int) SetString(s string, base int) (*Int, bool) {
 	r := strings.NewReader(s)
-	_, _, err := z.scan(r, base)
-	if err != nil {
+	if _, _, err := z.scan(r, base); err != nil {
 		return nil, false
 	}
-	_, err = r.ReadByte()
-	if err != io.EOF {
+	// entire string must have been consumed
+	if _, err := r.ReadByte(); err != io.EOF {
 		return nil, false
 	}
 	return z, true // err == io.EOF => scan consumed all of s
@@ -404,8 +404,11 @@
 
 // Exp sets z = x**y mod |m| (i.e. the sign of m is ignored), and returns z.
 // If y <= 0, the result is 1 mod |m|; if m == nil or m == 0, z = x**y.
-// See Knuth, volume 2, section 4.6.3.
+//
+// Modular exponentiation of inputs of a particular size is not a
+// cryptographically constant-time operation.
 func (z *Int) Exp(x, y, m *Int) *Int {
+	// See Knuth, volume 2, section 4.6.3.
 	var yWords nat
 	if !y.neg {
 		yWords = y.abs
@@ -550,19 +553,6 @@
 	return z.Lsh(u, k)
 }
 
-// ProbablyPrime performs n Miller-Rabin tests to check whether x is prime.
-// If x is prime, it returns true.
-// If x is not prime, it returns false with probability at least 1 - ¼ⁿ.
-//
-// It is not suitable for judging primes that an adversary may have crafted
-// to fool this test.
-func (x *Int) ProbablyPrime(n int) bool {
-	if n <= 0 {
-		panic("non-positive n for ProbablyPrime")
-	}
-	return !x.neg && x.abs.probablyPrime(n)
-}
-
 // Rand sets z to a pseudo-random number in [0, n) and returns z.
 func (z *Int) Rand(rnd *rand.Rand, n *Int) *Int {
 	z.neg = false
@@ -577,6 +567,11 @@
 // ModInverse sets z to the multiplicative inverse of g in the ring ℤ/nℤ
 // and returns z. If g and n are not relatively prime, the result is undefined.
 func (z *Int) ModInverse(g, n *Int) *Int {
+	if g.neg {
+		// GCD expects parameters a and b to be > 0.
+		var g2 Int
+		g = g2.Mod(g, n)
+	}
 	var d Int
 	d.GCD(z, nil, g, n)
 	// x and y are such that g*x + n*y = d. Since g and n are
@@ -932,3 +927,14 @@
 	z.neg = true // z cannot be zero if x is positive
 	return z
 }
+
+// Sqrt sets z to ⌊√x⌋, the largest integer such that z² ≤ x, and returns z.
+// It panics if x is negative.
+func (z *Int) Sqrt(x *Int) *Int {
+	if x.neg {
+		panic("square root of negative number")
+	}
+	z.neg = false
+	z.abs = z.abs.sqrt(x.abs)
+	return z
+}
diff --git a/src/math/big/int_test.go b/src/math/big/int_test.go
index 45a3765..b8e0778 100644
--- a/src/math/big/int_test.go
+++ b/src/math/big/int_test.go
@@ -9,6 +9,7 @@
 	"encoding/hex"
 	"fmt"
 	"math/rand"
+	"strings"
 	"testing"
 	"testing/quick"
 )
@@ -478,6 +479,18 @@
 	}
 }
 
+func BenchmarkQuoRem(b *testing.B) {
+	x, _ := new(Int).SetString("153980389784927331788354528594524332344709972855165340650588877572729725338415474372475094155672066328274535240275856844648695200875763869073572078279316458648124537905600131008790701752441155668003033945258023841165089852359980273279085783159654751552359397986180318708491098942831252291841441726305535546071", 0)
+	y, _ := new(Int).SetString("7746362281539803897849273317883545285945243323447099728551653406505888775727297253384154743724750941556720663282745352402758568446486952008757638690735720782793164586481245379056001310087907017524411556680030339452580238411650898523599802732790857831596547515523593979861803187084910989428312522918414417263055355460715745539358014631136245887418412633787074173796862711588221766398229333338511838891484974940633857861775630560092874987828057333663969469797013996401149696897591265769095952887917296740109742927689053276850469671231961384715398038978492733178835452859452433234470997285516534065058887757272972533841547437247509415567206632827453524027585684464869520087576386907357207827931645864812453790560013100879070175244115566800303394525802384116508985235998027327908578315965475155235939798618031870849109894283125229184144172630553554607112725169432413343763989564437170644270643461665184965150423819594083121075825", 0)
+	q := new(Int)
+	r := new(Int)
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		q.QuoRem(y, x, r)
+	}
+}
+
 var bitLenTests = []struct {
 	in  string
 	out int
@@ -572,6 +585,19 @@
 	{"0xffffffffffffffff00000001", "0xffffffffffffffff00000001", "0xffffffffffffffff00000001", "0"},
 	{"0xffffffffffffffffffffffff00000001", "0xffffffffffffffffffffffff00000001", "0xffffffffffffffffffffffff00000001", "0"},
 	{"0xffffffffffffffffffffffffffffffff00000001", "0xffffffffffffffffffffffffffffffff00000001", "0xffffffffffffffffffffffffffffffff00000001", "0"},
+
+	{
+		"2",
+		"0xB08FFB20760FFED58FADA86DFEF71AD72AA0FA763219618FE022C197E54708BB1191C66470250FCE8879487507CEE41381CA4D932F81C2B3F1AB20B539D50DCD",
+		"0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF73", // odd
+		"0x6AADD3E3E424D5B713FCAA8D8945B1E055166132038C57BBD2D51C833F0C5EA2007A2324CE514F8E8C2F008A2F36F44005A4039CB55830986F734C93DAF0EB4BAB54A6A8C7081864F44346E9BC6F0A3EB9F2C0146A00C6A05187D0C101E1F2D038CDB70CB5E9E05A2D188AB6CBB46286624D4415E7D4DBFAD3BCC6009D915C406EED38F468B940F41E6BEDC0430DD78E6F19A7DA3A27498A4181E24D738B0072D8F6ADB8C9809A5B033A09785814FD9919F6EF9F83EEA519BEC593855C4C10CBEEC582D4AE0792158823B0275E6AEC35242740468FAF3D5C60FD1E376362B6322F78B7ED0CA1C5BBCD2B49734A56C0967A1D01A100932C837B91D592CE08ABFF",
+	},
+	{
+		"2",
+		"0xB08FFB20760FFED58FADA86DFEF71AD72AA0FA763219618FE022C197E54708BB1191C66470250FCE8879487507CEE41381CA4D932F81C2B3F1AB20B539D50DCD",
+		"0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF72", // even
+		"0x7858794B5897C29F4ED0B40913416AB6C48588484E6A45F2ED3E26C941D878E923575AAC434EE2750E6439A6976F9BB4D64CEDB2A53CE8D04DD48CADCDF8E46F22747C6B81C6CEA86C0D873FBF7CEF262BAAC43A522BD7F32F3CDAC52B9337C77B3DCFB3DB3EDD80476331E82F4B1DF8EFDC1220C92656DFC9197BDC1877804E28D928A2A284B8DED506CBA304435C9D0133C246C98A7D890D1DE60CBC53A024361DA83A9B8775019083D22AC6820ED7C3C68F8E801DD4EC779EE0A05C6EB682EF9840D285B838369BA7E148FA27691D524FAEAF7C6ECE2A4B99A294B9F2C241857B5B90CC8BFFCFCF18DFA7D676131D5CD3855A5A3E8EBFA0CDFADB4D198B4A",
+	},
 }
 
 func TestExp(t *testing.T) {
@@ -614,6 +640,26 @@
 	}
 }
 
+func BenchmarkExp(b *testing.B) {
+	x, _ := new(Int).SetString("11001289118363089646017359372117963499250546375269047542777928006103246876688756735760905680604646624353196869572752623285140408755420374049317646428185270079555372763503115646054602867593662923894140940837479507194934267532831694565516466765025434902348314525627418515646588160955862839022051353653052947073136084780742729727874803457643848197499548297570026926927502505634297079527299004267769780768565695459945235586892627059178884998772989397505061206395455591503771677500931269477503508150175717121828518985901959919560700853226255420793148986854391552859459511723547532575574664944815966793196961286234040892865", 0)
+	y, _ := new(Int).SetString("0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF72", 0)
+	n, _ := new(Int).SetString("0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF73", 0)
+	out := new(Int)
+	for i := 0; i < b.N; i++ {
+		out.Exp(x, y, n)
+	}
+}
+
+func BenchmarkExp2(b *testing.B) {
+	x, _ := new(Int).SetString("2", 0)
+	y, _ := new(Int).SetString("0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF72", 0)
+	n, _ := new(Int).SetString("0xAC6BDB41324A9A9BF166DE5E1389582FAF72B6651987EE07FC3192943DB56050A37329CBB4A099ED8193E0757767A13DD52312AB4B03310DCD7F48A9DA04FD50E8083969EDB767B0CF6095179A163AB3661A05FBD5FAAAE82918A9962F0B93B855F97993EC975EEAA80D740ADBF4FF747359D041D5C33EA71D281E446B14773BCA97B43A23FB801676BD207A436C6481F1D2B9078717461A5B9D32E688F87748544523B524B0D57D5EA77A2775D2ECFA032CFBDBF52FB3786160279004E57AE6AF874E7303CE53299CCC041C7BC308D82A5698F3A8D0C38271AE35F8E9DBFBB694B5C803D89F7AE435DE236D525F54759B65E372FCD68EF20FA7111F9E4AFF73", 0)
+	out := new(Int)
+	for i := 0; i < b.N; i++ {
+		out.Exp(x, y, n)
+	}
+}
+
 func checkGcd(aBytes, bBytes []byte) bool {
 	x := new(Int)
 	y := new(Int)
@@ -715,85 +761,6 @@
 	}
 }
 
-var primes = []string{
-	"2",
-	"3",
-	"5",
-	"7",
-	"11",
-
-	"13756265695458089029",
-	"13496181268022124907",
-	"10953742525620032441",
-	"17908251027575790097",
-
-	// https://golang.org/issue/638
-	"18699199384836356663",
-
-	"98920366548084643601728869055592650835572950932266967461790948584315647051443",
-	"94560208308847015747498523884063394671606671904944666360068158221458669711639",
-
-	// http://primes.utm.edu/lists/small/small3.html
-	"449417999055441493994709297093108513015373787049558499205492347871729927573118262811508386655998299074566974373711472560655026288668094291699357843464363003144674940345912431129144354948751003607115263071543163",
-	"230975859993204150666423538988557839555560243929065415434980904258310530753006723857139742334640122533598517597674807096648905501653461687601339782814316124971547968912893214002992086353183070342498989426570593",
-	"5521712099665906221540423207019333379125265462121169655563495403888449493493629943498064604536961775110765377745550377067893607246020694972959780839151452457728855382113555867743022746090187341871655890805971735385789993",
-	"203956878356401977405765866929034577280193993314348263094772646453283062722701277632936616063144088173312372882677123879538709400158306567338328279154499698366071906766440037074217117805690872792848149112022286332144876183376326512083574821647933992961249917319836219304274280243803104015000563790123",
-
-	// ECC primes: http://tools.ietf.org/html/draft-ladd-safecurves-02
-	"3618502788666131106986593281521497120414687020801267626233049500247285301239",                                                                                  // Curve1174: 2^251-9
-	"57896044618658097711785492504343953926634992332820282019728792003956564819949",                                                                                 // Curve25519: 2^255-19
-	"9850501549098619803069760025035903451269934817616361666987073351061430442874302652853566563721228910201656997576599",                                           // E-382: 2^382-105
-	"42307582002575910332922579714097346549017899709713998034217522897561970639123926132812109468141778230245837569601494931472367",                                 // Curve41417: 2^414-17
-	"6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151", // E-521: 2^521-1
-}
-
-var composites = []string{
-	"0",
-	"1",
-	"21284175091214687912771199898307297748211672914763848041968395774954376176754",
-	"6084766654921918907427900243509372380954290099172559290432744450051395395951",
-	"84594350493221918389213352992032324280367711247940675652888030554255915464401",
-	"82793403787388584738507275144194252681",
-}
-
-func TestProbablyPrime(t *testing.T) {
-	nreps := 20
-	if testing.Short() {
-		nreps = 1
-	}
-	for i, s := range primes {
-		p, _ := new(Int).SetString(s, 10)
-		if !p.ProbablyPrime(nreps) {
-			t.Errorf("#%d prime found to be non-prime (%s)", i, s)
-		}
-	}
-
-	for i, s := range composites {
-		c, _ := new(Int).SetString(s, 10)
-		if c.ProbablyPrime(nreps) {
-			t.Errorf("#%d composite found to be prime (%s)", i, s)
-		}
-		if testing.Short() {
-			break
-		}
-	}
-
-	// check that ProbablyPrime panics if n <= 0
-	c := NewInt(11) // a prime
-	for _, n := range []int{-1, 0, 1} {
-		func() {
-			defer func() {
-				if n <= 0 && recover() == nil {
-					t.Fatalf("expected panic from ProbablyPrime(%d)", n)
-				}
-			}()
-			if !c.ProbablyPrime(n) {
-				t.Fatalf("%v should be a prime", c)
-			}
-		}()
-	}
-}
-
 type intShiftTest struct {
 	in    string
 	shift uint
@@ -1229,6 +1196,9 @@
 }
 
 func BenchmarkModSqrt5430_Tonelli(b *testing.B) {
+	if isRaceBuilder {
+		b.Skip("skipping on race builder")
+	}
 	p := tri(5430)
 	x := new(Int).SetUint64(2)
 	for i := 0; i < b.N; i++ {
@@ -1238,6 +1208,9 @@
 }
 
 func BenchmarkModSqrt5430_3Mod4(b *testing.B) {
+	if isRaceBuilder {
+		b.Skip("skipping on race builder")
+	}
 	p := tri(5430)
 	x := new(Int).SetUint64(2)
 	for i := 0; i < b.N; i++ {
@@ -1303,6 +1276,7 @@
 }{
 	{"1234567", "458948883992"},
 	{"239487239847", "2410312426921032588552076022197566074856950548502459942654116941958108831682612228890093858261341614673227141477904012196503648957050582631942730706805009223062734745341073406696246014589361659774041027169249453200378729434170325843778659198143763193776859869524088940195577346119843545301547043747207749969763750084308926339295559968882457872412993810129130294592999947926365264059284647209730384947211681434464714438488520940127459844288859336526896320919633919"},
+	{"-10", "13"}, // issue #16984
 }
 
 func TestModInverse(t *testing.T) {
@@ -1480,3 +1454,44 @@
 	n := NewInt(10)
 	n.Rand(rand.New(rand.NewSource(9)), n)
 }
+
+func TestSqrt(t *testing.T) {
+	root := 0
+	r := new(Int)
+	for i := 0; i < 10000; i++ {
+		if (root+1)*(root+1) <= i {
+			root++
+		}
+		n := NewInt(int64(i))
+		r.SetInt64(-2)
+		r.Sqrt(n)
+		if r.Cmp(NewInt(int64(root))) != 0 {
+			t.Errorf("Sqrt(%v) = %v, want %v", n, r, root)
+		}
+	}
+
+	for i := 0; i < 1000; i += 10 {
+		n, _ := new(Int).SetString("1"+strings.Repeat("0", i), 10)
+		r := new(Int).Sqrt(n)
+		root, _ := new(Int).SetString("1"+strings.Repeat("0", i/2), 10)
+		if r.Cmp(root) != 0 {
+			t.Errorf("Sqrt(1e%d) = %v, want 1e%d", i, r, i/2)
+		}
+	}
+
+	// Test aliasing.
+	r.SetInt64(100)
+	r.Sqrt(r)
+	if r.Int64() != 10 {
+		t.Errorf("Sqrt(100) = %v, want 10 (aliased output)", r.Int64())
+	}
+}
+
+func BenchmarkSqrt(b *testing.B) {
+	n, _ := new(Int).SetString("1"+strings.Repeat("0", 1001), 10)
+	b.ResetTimer()
+	t := new(Int)
+	for i := 0; i < b.N; i++ {
+		t.Sqrt(n)
+	}
+}
diff --git a/src/math/big/intconv.go b/src/math/big/intconv.go
index daf674a..91a62ce 100644
--- a/src/math/big/intconv.go
+++ b/src/math/big/intconv.go
@@ -52,6 +52,8 @@
 	}
 }
 
+var _ fmt.Formatter = intOne // *Int must implement fmt.Formatter
+
 // Format implements fmt.Formatter. It accepts the formats
 // 'b' (binary), 'o' (octal), 'd' (decimal), 'x' (lowercase
 // hexadecimal), and 'X' (uppercase hexadecimal).
@@ -223,6 +225,8 @@
 	return r.UnreadRune()
 }
 
+var _ fmt.Scanner = intOne // *Int must implement fmt.Scanner
+
 // Scan is a support routine for fmt.Scanner; it sets z to the value of
 // the scanned number. It accepts the formats 'b' (binary), 'o' (octal),
 // 'd' (decimal), 'x' (lowercase hexadecimal), and 'X' (uppercase hexadecimal).
diff --git a/src/math/big/intmarsh.go b/src/math/big/intmarsh.go
index 4ff57b6..ee1e414 100644
--- a/src/math/big/intmarsh.go
+++ b/src/math/big/intmarsh.go
@@ -59,7 +59,7 @@
 	return nil
 }
 
-// The JSON marshallers are only here for API backward compatibility
+// The JSON marshalers are only here for API backward compatibility
 // (programs that explicitly look for these two methods). JSON works
 // fine with the TextMarshaler only.
 
@@ -70,5 +70,9 @@
 
 // UnmarshalJSON implements the json.Unmarshaler interface.
 func (z *Int) UnmarshalJSON(text []byte) error {
+	// Ignore null, like in the main JSON package.
+	if string(text) == "null" {
+		return nil
+	}
 	return z.UnmarshalText(text)
 }
diff --git a/src/math/big/nat.go b/src/math/big/nat.go
index 2e65d2a..9b1a626 100644
--- a/src/math/big/nat.go
+++ b/src/math/big/nat.go
@@ -542,16 +542,21 @@
 	return
 }
 
-// getNat returns a nat of len n. The contents may not be zero.
-func getNat(n int) nat {
-	var z nat
+// getNat returns a *nat of len n. The contents are not guaranteed to be zeroed.
+// The pool holds *nat to avoid allocation when converting to interface{}.
+func getNat(n int) *nat {
+	var z *nat
 	if v := natPool.Get(); v != nil {
-		z = v.(nat)
+		z = v.(*nat)
 	}
-	return z.make(n)
+	if z == nil {
+		z = new(nat)
+	}
+	*z = z.make(n)
+	return z
 }
 
-func putNat(x nat) {
+func putNat(x *nat) {
 	natPool.Put(x)
 }
 
@@ -575,7 +580,8 @@
 	}
 	q = z.make(m + 1)
 
-	qhatv := getNat(n + 1)
+	qhatvp := getNat(n + 1)
+	qhatv := *qhatvp
 	if alias(u, uIn) || alias(u, v) {
 		u = nil // u is an alias for uIn or v - cannot reuse
 	}
@@ -583,36 +589,40 @@
 	u.clear() // TODO(gri) no need to clear if we allocated a new u
 
 	// D1.
-	var v1 nat
+	var v1p *nat
 	shift := nlz(v[n-1])
 	if shift > 0 {
 		// do not modify v, it may be used by another goroutine simultaneously
-		v1 = getNat(n)
+		v1p = getNat(n)
+		v1 := *v1p
 		shlVU(v1, v, shift)
 		v = v1
 	}
 	u[len(uIn)] = shlVU(u[0:len(uIn)], uIn, shift)
 
 	// D2.
+	vn1 := v[n-1]
 	for j := m; j >= 0; j-- {
 		// D3.
 		qhat := Word(_M)
-		if u[j+n] != v[n-1] {
+		if ujn := u[j+n]; ujn != vn1 {
 			var rhat Word
-			qhat, rhat = divWW(u[j+n], u[j+n-1], v[n-1])
+			qhat, rhat = divWW(ujn, u[j+n-1], vn1)
 
 			// x1 | x2 = q̂v_{n-2}
-			x1, x2 := mulWW(qhat, v[n-2])
+			vn2 := v[n-2]
+			x1, x2 := mulWW(qhat, vn2)
 			// test if q̂v_{n-2} > br̂ + u_{j+n-2}
-			for greaterThan(x1, x2, rhat, u[j+n-2]) {
+			ujn2 := u[j+n-2]
+			for greaterThan(x1, x2, rhat, ujn2) {
 				qhat--
 				prevRhat := rhat
-				rhat += v[n-1]
+				rhat += vn1
 				// v[n-1] >= 0, so this tests for overflow.
 				if rhat < prevRhat {
 					break
 				}
-				x1, x2 = mulWW(qhat, v[n-2])
+				x1, x2 = mulWW(qhat, vn2)
 			}
 		}
 
@@ -628,10 +638,10 @@
 
 		q[j] = qhat
 	}
-	if v1 != nil {
-		putNat(v1)
+	if v1p != nil {
+		putNat(v1p)
 	}
-	putNat(qhatv)
+	putNat(qhatvp)
 
 	q = q.norm()
 	shrVU(u, u, shift)
@@ -650,14 +660,14 @@
 
 const deBruijn32 = 0x077CB531
 
-var deBruijn32Lookup = []byte{
+var deBruijn32Lookup = [...]byte{
 	0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
 	31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9,
 }
 
 const deBruijn64 = 0x03f79d71b4ca8b09
 
-var deBruijn64Lookup = []byte{
+var deBruijn64Lookup = [...]byte{
 	0, 1, 56, 2, 57, 49, 28, 3, 61, 58, 42, 50, 38, 29, 17, 4,
 	62, 47, 59, 36, 45, 43, 51, 22, 53, 39, 33, 30, 24, 18, 12, 5,
 	63, 55, 48, 27, 60, 41, 37, 16, 46, 35, 44, 21, 52, 32, 23, 11,
@@ -950,7 +960,7 @@
 	// (x^2...x^15) but then reduces the number of multiply-reduces by a
 	// third. Even for a 32-bit exponent, this reduces the number of
 	// operations. Uses Montgomery method for odd moduli.
-	if len(x) > 1 && len(y) > 1 && len(m) > 0 {
+	if x.cmp(natOne) > 0 && len(y) > 1 && len(m) > 0 {
 		if m[0]&1 == 1 {
 			return z.expNNMontgomery(x, y, m)
 		}
@@ -1169,96 +1179,6 @@
 	return zz.norm()
 }
 
-// probablyPrime performs n Miller-Rabin tests to check whether x is prime.
-// If x is prime, it returns true.
-// If x is not prime, it returns false with probability at least 1 - ¼ⁿ.
-//
-// It is not suitable for judging primes that an adversary may have crafted
-// to fool this test.
-func (n nat) probablyPrime(reps int) bool {
-	if len(n) == 0 {
-		return false
-	}
-
-	if len(n) == 1 {
-		if n[0] < 2 {
-			return false
-		}
-
-		if n[0]%2 == 0 {
-			return n[0] == 2
-		}
-
-		// We have to exclude these cases because we reject all
-		// multiples of these numbers below.
-		switch n[0] {
-		case 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53:
-			return true
-		}
-	}
-
-	if n[0]&1 == 0 {
-		return false // n is even
-	}
-
-	const primesProduct32 = 0xC0CFD797         // Π {p ∈ primes, 2 < p <= 29}
-	const primesProduct64 = 0xE221F97C30E94E1D // Π {p ∈ primes, 2 < p <= 53}
-
-	var r Word
-	switch _W {
-	case 32:
-		r = n.modW(primesProduct32)
-	case 64:
-		r = n.modW(primesProduct64 & _M)
-	default:
-		panic("Unknown word size")
-	}
-
-	if r%3 == 0 || r%5 == 0 || r%7 == 0 || r%11 == 0 ||
-		r%13 == 0 || r%17 == 0 || r%19 == 0 || r%23 == 0 || r%29 == 0 {
-		return false
-	}
-
-	if _W == 64 && (r%31 == 0 || r%37 == 0 || r%41 == 0 ||
-		r%43 == 0 || r%47 == 0 || r%53 == 0) {
-		return false
-	}
-
-	nm1 := nat(nil).sub(n, natOne)
-	// determine q, k such that nm1 = q << k
-	k := nm1.trailingZeroBits()
-	q := nat(nil).shr(nm1, k)
-
-	nm3 := nat(nil).sub(nm1, natTwo)
-	rand := rand.New(rand.NewSource(int64(n[0])))
-
-	var x, y, quotient nat
-	nm3Len := nm3.bitLen()
-
-NextRandom:
-	for i := 0; i < reps; i++ {
-		x = x.random(rand, nm3, nm3Len)
-		x = x.add(x, natTwo)
-		y = y.expNN(x, q, n)
-		if y.cmp(natOne) == 0 || y.cmp(nm1) == 0 {
-			continue
-		}
-		for j := uint(1); j < k; j++ {
-			y = y.mul(y, y)
-			quotient, y = quotient.div(y, y, n)
-			if y.cmp(nm1) == 0 {
-				continue NextRandom
-			}
-			if y.cmp(natOne) == 0 {
-				return false
-			}
-		}
-		return false
-	}
-
-	return true
-}
-
 // bytes writes the value of z into buf using big-endian encoding.
 // len(buf) must be >= len(z)*_S. The value of z is encoded in the
 // slice buf[i:]. The number i of unused bytes at the beginning of
@@ -1303,3 +1223,37 @@
 
 	return z.norm()
 }
+
+// sqrt sets z = ⌊√x⌋
+func (z nat) sqrt(x nat) nat {
+	if x.cmp(natOne) <= 0 {
+		return z.set(x)
+	}
+	if alias(z, x) {
+		z = nil
+	}
+
+	// Start with value known to be too large and repeat "z = ⌊(z + ⌊x/z⌋)/2⌋" until it stops getting smaller.
+	// See Brent and Zimmermann, Modern Computer Arithmetic, Algorithm 1.13 (SqrtInt).
+	// https://members.loria.fr/PZimmermann/mca/pub226.html
+	// If x is one less than a perfect square, the sequence oscillates between the correct z and z+1;
+	// otherwise it converges to the correct z and stays there.
+	var z1, z2 nat
+	z1 = z
+	z1 = z1.setUint64(1)
+	z1 = z1.shl(z1, uint(x.bitLen()/2+1)) // must be ≥ √x
+	for n := 0; ; n++ {
+		z2, _ = z2.div(nil, x, z1)
+		z2 = z2.add(z2, z1)
+		z2 = z2.shr(z2, 1)
+		if z2.cmp(z1) >= 0 {
+			// z1 is answer.
+			// Figure out whether z1 or z2 is currently aliased to z by looking at loop count.
+			if n&1 == 0 {
+				return z1
+			}
+			return z.set(z1)
+		}
+		z1, z2 = z2, z1
+	}
+}
diff --git a/src/math/big/natconv_test.go b/src/math/big/natconv_test.go
index 79901d1..bdb60e6 100644
--- a/src/math/big/natconv_test.go
+++ b/src/math/big/natconv_test.go
@@ -278,6 +278,9 @@
 	const x = 10
 	for _, base := range []int{2, 8, 10, 16} {
 		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			if isRaceBuilder && y > 1000 {
+				continue
+			}
 			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
 				b.StopTimer()
 				var z nat
@@ -301,6 +304,9 @@
 	const x = 10
 	for _, base := range []int{2, 8, 10, 16} {
 		for _, y := range []Word{10, 100, 1000, 10000, 100000} {
+			if isRaceBuilder && y > 1000 {
+				continue
+			}
 			b.Run(fmt.Sprintf("%d/Base%d", y, base), func(b *testing.B) {
 				b.StopTimer()
 				var z nat
diff --git a/src/math/big/prime.go b/src/math/big/prime.go
new file mode 100644
index 0000000..3e9690e
--- /dev/null
+++ b/src/math/big/prime.go
@@ -0,0 +1,320 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package big
+
+import "math/rand"
+
+// ProbablyPrime reports whether x is probably prime,
+// applying the Miller-Rabin test with n pseudorandomly chosen bases
+// as well as a Baillie-PSW test.
+//
+// If x is prime, ProbablyPrime returns true.
+// If x is chosen randomly and not prime, ProbablyPrime probably returns false.
+// The probability of returning true for a randomly chosen non-prime is at most ¼ⁿ.
+//
+// ProbablyPrime is 100% accurate for inputs less than 2⁶⁴.
+// See Menezes et al., Handbook of Applied Cryptography, 1997, pp. 145-149,
+// and FIPS 186-4 Appendix F for further discussion of the error probabilities.
+//
+// ProbablyPrime is not suitable for judging primes that an adversary may
+// have crafted to fool the test.
+//
+// As of Go 1.8, ProbablyPrime(0) is allowed and applies only a Baillie-PSW test.
+// Before Go 1.8, ProbablyPrime applied only the Miller-Rabin tests, and ProbablyPrime(0) panicked.
+func (x *Int) ProbablyPrime(n int) bool {
+	// Note regarding the doc comment above:
+	// It would be more precise to say that the Baillie-PSW test uses the
+	// extra strong Lucas test as its Lucas test, but since no one knows
+	// how to tell any of the Lucas tests apart inside a Baillie-PSW test
+	// (they all work equally well empirically), that detail need not be
+	// documented or implicitly guaranteed.
+	// The comment does avoid saying "the" Baillie-PSW test
+	// because of this general ambiguity.
+
+	if n < 0 {
+		panic("negative n for ProbablyPrime")
+	}
+	if x.neg || len(x.abs) == 0 {
+		return false // negative numbers and zero are not prime
+	}
+
+	// primeBitMask records the primes < 64.
+	const primeBitMask uint64 = 1<<2 | 1<<3 | 1<<5 | 1<<7 |
+		1<<11 | 1<<13 | 1<<17 | 1<<19 | 1<<23 | 1<<29 | 1<<31 |
+		1<<37 | 1<<41 | 1<<43 | 1<<47 | 1<<53 | 1<<59 | 1<<61
+
+	w := x.abs[0]
+	if len(x.abs) == 1 && w < 64 {
+		return primeBitMask&(1<<w) != 0 // x < 64: look the answer up in the mask
+	}
+
+	if w&1 == 0 {
+		return false // x is even
+	}
+
+	const primesA = 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23 * 37
+	const primesB = 29 * 31 * 41 * 43 * 47 * 53
+
+	var rA, rB uint32
+	switch _W {
+	case 32:
+		rA = uint32(x.abs.modW(primesA))
+		rB = uint32(x.abs.modW(primesB))
+	case 64:
+		r := x.abs.modW((primesA * primesB) & _M)
+		rA = uint32(r % primesA)
+		rB = uint32(r % primesB)
+	default:
+		panic("math/big: invalid word size")
+	}
+
+	if rA%3 == 0 || rA%5 == 0 || rA%7 == 0 || rA%11 == 0 || rA%13 == 0 || rA%17 == 0 || rA%19 == 0 || rA%23 == 0 || rA%37 == 0 ||
+		rB%29 == 0 || rB%31 == 0 || rB%41 == 0 || rB%43 == 0 || rB%47 == 0 || rB%53 == 0 {
+		return false // x ≥ 64 here, so a prime factor ≤ 53 makes it composite
+	}
+
+	return x.abs.probablyPrimeMillerRabin(n+1, true) && x.abs.probablyPrimeLucas()
+}
+
+// probablyPrimeMillerRabin reports whether n passes reps rounds of the
+// Miller-Rabin primality test, using pseudo-randomly chosen bases.
+// If force2 is true, one of the rounds is forced to use base 2.
+// See Handbook of Applied Cryptography, p. 139, Algorithm 4.24.
+// The number n is known to be non-zero.
+func (n nat) probablyPrimeMillerRabin(reps int, force2 bool) bool {
+	nm1 := nat(nil).sub(n, natOne)
+	// determine q, k such that nm1 = q << k
+	k := nm1.trailingZeroBits()
+	q := nat(nil).shr(nm1, k)
+
+	nm3 := nat(nil).sub(nm1, natTwo)
+	rand := rand.New(rand.NewSource(int64(n[0]))) // seeded from n itself, so the bases are the same on every call for a given n
+
+	var x, y, quotient nat
+	nm3Len := nm3.bitLen()
+
+NextRandom:
+	for i := 0; i < reps; i++ {
+		if i == reps-1 && force2 {
+			x = x.set(natTwo) // the last round is the forced base-2 round
+		} else {
+			x = x.random(rand, nm3, nm3Len)
+			x = x.add(x, natTwo)
+		}
+		y = y.expNN(x, q, n)
+		if y.cmp(natOne) == 0 || y.cmp(nm1) == 0 {
+			continue // x is not a witness; try the next base
+		}
+		for j := uint(1); j < k; j++ {
+			y = y.mul(y, y)
+			quotient, y = quotient.div(y, y, n)
+			if y.cmp(nm1) == 0 {
+				continue NextRandom
+			}
+			if y.cmp(natOne) == 0 {
+				return false // the previous y was a square root of 1 other than ±1, so n is composite
+			}
+		}
+		return false // y never reached n-1, so base x proves n composite
+	}
+
+	return true
+}
+
+// probablyPrimeLucas reports whether n passes the "almost extra strong" Lucas probable prime test,
+// using Baillie-OEIS parameter selection. This corresponds to "AESLPSP" on Jacobsen's tables (link below).
+// The combination of this test and a Miller-Rabin/Fermat test with base 2 gives a Baillie-PSW test.
+//
+// References:
+//
+// Baillie and Wagstaff, "Lucas Pseudoprimes", Mathematics of Computation 35(152),
+// October 1980, pp. 1391-1417, especially page 1401.
+// http://www.ams.org/journals/mcom/1980-35-152/S0025-5718-1980-0583518-6/S0025-5718-1980-0583518-6.pdf
+//
+// Grantham, "Frobenius Pseudoprimes", Mathematics of Computation 70(234),
+// March 2000, pp. 873-891.
+// http://www.ams.org/journals/mcom/2001-70-234/S0025-5718-00-01197-2/S0025-5718-00-01197-2.pdf
+//
+// Baillie, "Extra strong Lucas pseudoprimes", OEIS A217719, https://oeis.org/A217719.
+//
+// Jacobsen, "Pseudoprime Statistics, Tables, and Data", http://ntheory.org/pseudoprimes.html.
+//
+// Nicely, "The Baillie-PSW Primality Test", http://www.trnicely.net/misc/bpsw.html.
+// (Note that Nicely's definition of the "extra strong" test gives the wrong Jacobi condition,
+// as pointed out by Jacobsen.)
+//
+// Crandall and Pomerance, Prime Numbers: A Computational Perspective, 2nd ed.
+// Springer, 2005.
+func (n nat) probablyPrimeLucas() bool {
+	// Discard 0, 1.
+	if len(n) == 0 || n.cmp(natOne) == 0 {
+		return false
+	}
+	// Two is the only even prime.
+	// Already checked by caller, but here to allow testing in isolation.
+	if n[0]&1 == 0 {
+		return n.cmp(natTwo) == 0
+	}
+
+	// Baillie-OEIS "method C" for choosing D, P, Q,
+	// as in https://oeis.org/A217719/a217719.txt:
+	// try increasing P ≥ 3 such that D = P² - 4 (so Q = 1)
+	// until Jacobi(D, n) = -1.
+	// The search is expected to succeed for non-square n after just a few trials.
+	// After more than expected failures, check whether n is square
+	// (which would cause Jacobi(D, n) = 1 for all D not dividing n).
+	p := Word(3)
+	d := nat{1}
+	t1 := nat(nil) // temp
+	intD := &Int{abs: d}
+	intN := &Int{abs: n}
+	for ; ; p++ {
+		if p > 10000 {
+			// This is widely believed to be impossible.
+			// If we get a report, we'll want the exact number n.
+			panic("math/big: internal error: cannot find (D/n) = -1 for " + intN.String())
+		}
+		d[0] = p*p - 4
+		j := Jacobi(intD, intN)
+		if j == -1 {
+			break
+		}
+		if j == 0 {
+			// d = p²-4 = (p-2)(p+2).
+			// If (d/n) == 0 then d shares a prime factor with n.
+			// Since the loop proceeds in increasing p and starts with p-2==1,
+			// the shared prime factor must be p+2.
+			// If p+2 == n, then n is prime; otherwise p+2 is a proper factor of n.
+			return len(n) == 1 && n[0] == p+2
+		}
+		if p == 40 {
+			// We'll never find (d/n) = -1 if n is a square.
+			// If n is a non-square we expect to find a d in just a few attempts on average.
+			// After 40 attempts, take a moment to check if n is indeed a square.
+			t1 = t1.sqrt(n)
+			t1 = t1.mul(t1, t1)
+			if t1.cmp(n) == 0 {
+				return false
+			}
+		}
+	}
+
+	// Grantham definition of "extra strong Lucas pseudoprime", after Thm 2.3 on p. 876
+	// (D, P, Q above have become Δ, b, 1):
+	//
+	// Let U_n = U_n(b, 1), V_n = V_n(b, 1), and Δ = b²-4.
+	// An extra strong Lucas pseudoprime to base b is a composite n = 2^r s + Jacobi(Δ, n),
+	// where s is odd and gcd(n, 2*Δ) = 1, such that either (i) U_s ≡ 0 mod n and V_s ≡ ±2 mod n,
+	// or (ii) V_{2^t s} ≡ 0 mod n for some 0 ≤ t < r-1.
+	//
+	// We know gcd(n, Δ) = 1 or else we'd have found Jacobi(d, n) == 0 above.
+	// We know gcd(n, 2) = 1 because n is odd.
+	//
+	// Arrange s = (n - Jacobi(Δ, n)) / 2^r = (n+1) / 2^r.
+	s := nat(nil).add(n, natOne)
+	r := int(s.trailingZeroBits())
+	s = s.shr(s, uint(r))
+	nm2 := nat(nil).sub(n, natTwo) // n-2
+
+	// We apply the "almost extra strong" test, which checks the above conditions
+	// except for U_s ≡ 0 mod n, which allows us to avoid computing any U_k values.
+	// Jacobsen points out that maybe we should just do the full extra strong test:
+	// "It is also possible to recover U_n using Crandall and Pomerance equation 3.13:
+	// U_n = D^-1 (2V_{n+1} - PV_n) allowing us to run the full extra-strong test
+	// at the cost of a single modular inversion. This computation is easy and fast in GMP,
+	// so we can get the full extra-strong test at essentially the same performance as the
+	// almost extra strong test."
+
+	// Compute Lucas sequence V_s(b, 1), where:
+	//
+	//	V(0) = 2
+	//	V(1) = P
+	//	V(k) = P V(k-1) - Q V(k-2).
+	//
+	// (Remember that due to method C above, P = b, Q = 1.)
+	//
+	// In general V(k) = α^k + β^k, where α and β are roots of x² - Px + Q.
+	// Crandall and Pomerance (p.147) observe that for 0 ≤ j ≤ k,
+	//
+	//	V(j+k) = V(j)V(k) - V(k-j).
+	//
+	// So in particular, to quickly double the subscript:
+	//
+	//	V(2k) = V(k)² - 2
+	//	V(2k+1) = V(k) V(k+1) - P
+	//
+	// We can therefore start with k=0 and build up to k=s in log₂(s) steps.
+	natP := nat(nil).setWord(p)
+	vk := nat(nil).setWord(2)
+	vk1 := nat(nil).setWord(p)
+	t2 := nat(nil) // temp
+	for i := int(s.bitLen()); i >= 0; i-- {
+		if s.bit(uint(i)) != 0 {
+			// k' = 2k+1
+			// V(k') = V(2k+1) = V(k) V(k+1) - P.
+			t1 = t1.mul(vk, vk1)
+			t1 = t1.add(t1, n)
+			t1 = t1.sub(t1, natP)
+			t2, vk = t2.div(vk, t1, n)
+			// V(k'+1) = V(2k+2) = V(k+1)² - 2.
+			t1 = t1.mul(vk1, vk1)
+			t1 = t1.add(t1, nm2)
+			t2, vk1 = t2.div(vk1, t1, n)
+		} else {
+			// k' = 2k
+			// V(k'+1) = V(2k+1) = V(k) V(k+1) - P.
+			t1 = t1.mul(vk, vk1)
+			t1 = t1.add(t1, n)
+			t1 = t1.sub(t1, natP)
+			t2, vk1 = t2.div(vk1, t1, n)
+			// V(k') = V(2k) = V(k)² - 2
+			t1 = t1.mul(vk, vk)
+			t1 = t1.add(t1, nm2)
+			t2, vk = t2.div(vk, t1, n)
+		}
+	}
+
+	// Now k=s, so vk = V(s). Check V(s) ≡ ±2 (mod n).
+	if vk.cmp(natTwo) == 0 || vk.cmp(nm2) == 0 {
+		// Check U(s) ≡ 0.
+		// As suggested by Jacobsen, apply Crandall and Pomerance equation 3.13:
+		//
+		//	U(k) = D⁻¹ (2 V(k+1) - P V(k))
+		//
+		// Since we are checking for U(k) == 0 it suffices to check 2 V(k+1) == P V(k) mod n,
+		// or P V(k) - 2 V(k+1) == 0 mod n.
+		t1 := t1.mul(vk, natP)
+		t2 := t2.shl(vk1, 1)
+		if t1.cmp(t2) < 0 {
+			t1, t2 = t2, t1
+		}
+		t1 = t1.sub(t1, t2)
+		t3 := vk1 // steal vk1, no longer needed below
+		vk1 = nil
+		_ = vk1
+		t2, t3 = t2.div(t3, t1, n)
+		if len(t3) == 0 {
+			return true
+		}
+	}
+
+	// Check V(2^t s) ≡ 0 mod n for some 0 ≤ t < r-1.
+	for t := 0; t < r-1; t++ {
+		if len(vk) == 0 { // vk == 0
+			return true
+		}
+		// Optimization: V(k) = 2 is a fixed point for V(k') = V(k)² - 2,
+		// so if V(k) = 2, we can stop: we will never find a future V(k) == 0.
+		if len(vk) == 1 && vk[0] == 2 { // vk == 2
+			return false
+		}
+		// k' = 2k
+		// V(k') = V(2k) = V(k)² - 2; add n-2 (as in the loop above) so that vk == 1 cannot underflow.
+		t1 = t1.mul(vk, vk)
+		t1 = t1.add(t1, nm2)
+		t2, vk = t2.div(vk, t1, n)
+	}
+	return false
+}
diff --git a/src/math/big/prime_test.go b/src/math/big/prime_test.go
new file mode 100644
index 0000000..a2d3d18
--- /dev/null
+++ b/src/math/big/prime_test.go
@@ -0,0 +1,214 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package big
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+	"unicode"
+)
+
+var primes = []string{
+	"2",
+	"3",
+	"5",
+	"7",
+	"11",
+
+	"13756265695458089029",
+	"13496181268022124907",
+	"10953742525620032441",
+	"17908251027575790097",
+
+	// https://golang.org/issue/638
+	"18699199384836356663",
+
+	"98920366548084643601728869055592650835572950932266967461790948584315647051443",
+	"94560208308847015747498523884063394671606671904944666360068158221458669711639",
+
+	// http://primes.utm.edu/lists/small/small3.html
+	"449417999055441493994709297093108513015373787049558499205492347871729927573118262811508386655998299074566974373711472560655026288668094291699357843464363003144674940345912431129144354948751003607115263071543163",
+	"230975859993204150666423538988557839555560243929065415434980904258310530753006723857139742334640122533598517597674807096648905501653461687601339782814316124971547968912893214002992086353183070342498989426570593",
+	"5521712099665906221540423207019333379125265462121169655563495403888449493493629943498064604536961775110765377745550377067893607246020694972959780839151452457728855382113555867743022746090187341871655890805971735385789993",
+	"203956878356401977405765866929034577280193993314348263094772646453283062722701277632936616063144088173312372882677123879538709400158306567338328279154499698366071906766440037074217117805690872792848149112022286332144876183376326512083574821647933992961249917319836219304274280243803104015000563790123",
+
+	// ECC primes: http://tools.ietf.org/html/draft-ladd-safecurves-02
+	"3618502788666131106986593281521497120414687020801267626233049500247285301239",                                                                                  // Curve1174: 2^251-9
+	"57896044618658097711785492504343953926634992332820282019728792003956564819949",                                                                                 // Curve25519: 2^255-19
+	"9850501549098619803069760025035903451269934817616361666987073351061430442874302652853566563721228910201656997576599",                                           // E-382: 2^382-105
+	"42307582002575910332922579714097346549017899709713998034217522897561970639123926132812109468141778230245837569601494931472367",                                 // Curve41417: 2^414-17
+	"6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151", // E-521: 2^521-1
+}
+
+var composites = []string{
+	"0",
+	"1",
+	"21284175091214687912771199898307297748211672914763848041968395774954376176754",
+	"6084766654921918907427900243509372380954290099172559290432744450051395395951",
+	"84594350493221918389213352992032324280367711247940675652888030554255915464401",
+	"82793403787388584738507275144194252681",
+
+	// Arnault, "Rabin-Miller Primality Test: Composite Numbers Which Pass It",
+	// Mathematics of Computation, 64(209) (January 1995), pp. 335-361.
+	"1195068768795265792518361315725116351898245581", // strong pseudoprime to prime bases 2 through 29
+	// strong pseudoprime to all prime bases up to 200
+	`
+     80383745745363949125707961434194210813883768828755814583748891752229
+      74273765333652186502336163960045457915042023603208766569966760987284
+       0439654082329287387918508691668573282677617710293896977394701670823
+        0428687109997439976544144845341155872450633409279022275296229414984
+         2306881685404326457534018329786111298960644845216191652872597534901`,
+
+	// Extra-strong Lucas pseudoprimes. https://oeis.org/A217719
+	"989",
+	"3239",
+	"5777",
+	"10877",
+	"27971",
+	"29681",
+	"30739",
+	"31631",
+	"39059",
+	"72389",
+	"73919",
+	"75077",
+	"100127",
+	"113573",
+	"125249",
+	"137549",
+	"137801",
+	"153931",
+	"155819",
+	"161027",
+	"162133",
+	"189419",
+	"218321",
+	"231703",
+	"249331",
+	"370229",
+	"429479",
+	"430127",
+	"459191",
+	"473891",
+	"480689",
+	"600059",
+	"621781",
+	"632249",
+	"635627",
+
+	"3673744903",
+	"3281593591",
+	"2385076987",
+	"2738053141",
+	"2009621503",
+	"1502682721",
+	"255866131",
+	"117987841",
+	"587861",
+
+	"6368689",
+	"8725753",
+	"80579735209",
+	"105919633",
+}
+
+func cutSpace(r rune) rune {
+	if unicode.IsSpace(r) {
+		return -1
+	}
+	return r
+}
+
+func TestProbablyPrime(t *testing.T) {
+	nreps := 20
+	if testing.Short() {
+		nreps = 3
+	}
+	for i, s := range primes {
+		p, _ := new(Int).SetString(s, 10)
+		if !p.ProbablyPrime(nreps) || !p.ProbablyPrime(1) || !p.ProbablyPrime(0) {
+			t.Errorf("#%d prime found to be non-prime (%s)", i, s)
+		}
+	}
+
+	for i, s := range composites {
+		s = strings.Map(cutSpace, s) // strip the whitespace of the multi-line raw string entries
+		c, _ := new(Int).SetString(s, 10)
+		if c.ProbablyPrime(nreps) || c.ProbablyPrime(1) || c.ProbablyPrime(0) {
+			t.Errorf("#%d composite found to be prime (%s)", i, s)
+		}
+	}
+
+	// check that ProbablyPrime panics if n < 0 and reports a prime for n == 0 and n == 1
+	c := NewInt(11) // a prime
+	for _, n := range []int{-1, 0, 1} {
+		func() {
+			defer func() {
+				if n < 0 && recover() == nil {
+					t.Fatalf("expected panic from ProbablyPrime(%d)", n)
+				}
+			}()
+			if !c.ProbablyPrime(n) {
+				t.Fatalf("%v should be a prime", c)
+			}
+		}()
+	}
+}
+
+func BenchmarkProbablyPrime(b *testing.B) {
+	p, _ := new(Int).SetString("203956878356401977405765866929034577280193993314348263094772646453283062722701277632936616063144088173312372882677123879538709400158306567338328279154499698366071906766440037074217117805690872792848149112022286332144876183376326512083574821647933992961249917319836219304274280243803104015000563790123", 10)
+	for _, n := range []int{0, 1, 5, 10, 20} {
+		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
+			for i := 0; i < b.N; i++ {
+				p.ProbablyPrime(n)
+			}
+		})
+	}
+
+	b.Run("Lucas", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			p.abs.probablyPrimeLucas()
+		}
+	})
+	b.Run("MillerRabinBase2", func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			p.abs.probablyPrimeMillerRabin(1, true)
+		}
+	})
+}
+
+func TestMillerRabinPseudoprimes(t *testing.T) {
+	testPseudoprimes(t, "probablyPrimeMillerRabin",
+		func(n nat) bool { return n.probablyPrimeMillerRabin(1, true) && !n.probablyPrimeLucas() },
+		// https://oeis.org/A001262
+		[]int{2047, 3277, 4033, 4681, 8321, 15841, 29341, 42799, 49141, 52633, 65281, 74665, 80581, 85489, 88357, 90751})
+}
+
+func TestLucasPseudoprimes(t *testing.T) {
+	testPseudoprimes(t, "probablyPrimeLucas",
+		func(n nat) bool { return n.probablyPrimeLucas() && !n.probablyPrimeMillerRabin(1, true) },
+		// https://oeis.org/A217719
+		[]int{989, 3239, 5777, 10877, 27971, 29681, 30739, 31631, 39059, 72389, 73919, 75077})
+}
+
+func testPseudoprimes(t *testing.T, name string, cond func(nat) bool, want []int) {
+	n := nat{1}
+	for i := 3; i < 100000; i += 2 { // every odd i below 100000; want must be in increasing order
+		n[0] = Word(i)
+		pseudo := cond(n)
+		if pseudo && (len(want) == 0 || i != want[0]) { // cond held for an i that is not the next expected value
+			t.Errorf("%s(%v, base=2) = true, want false", name, i)
+		} else if !pseudo && len(want) >= 1 && i == want[0] { // cond failed for the next expected value
+			t.Errorf("%s(%v, base=2) = false, want true", name, i)
+		}
+		if len(want) > 0 && i == want[0] {
+			want = want[1:] // i has been checked; move on to the next expected value
+		}
+	}
+	if len(want) > 0 {
+		t.Fatalf("forgot to test %v", want)
+	}
+}
diff --git a/src/math/big/rat_test.go b/src/math/big/rat_test.go
index 3a06fca..afda686 100644
--- a/src/math/big/rat_test.go
+++ b/src/math/big/rat_test.go
@@ -382,9 +382,9 @@
 		9,
 		11,
 	}
-	var winc, einc = uint64(1), 1 // soak test (~1.5s on x86-64)
-	if testing.Short() {
-		winc, einc = 5, 15 // quick test (~60ms on x86-64)
+	var winc, einc = uint64(5), 15 // quick test (~60ms on x86-64)
+	if *long {
+		winc, einc = uint64(1), 1 // soak test (~1.5s on x86-64)
 	}
 
 	for _, sign := range "+-" {
@@ -430,9 +430,9 @@
 		9,
 		11,
 	}
-	var winc, einc = uint64(1), 1 // soak test (~75s on x86-64)
-	if testing.Short() {
-		winc, einc = 10, 500 // quick test (~12ms on x86-64)
+	var winc, einc = uint64(10), 500 // quick test (~12ms on x86-64)
+	if *long {
+		winc, einc = uint64(1), 1 // soak test (~75s on x86-64)
 	}
 
 	for _, sign := range "+-" {
diff --git a/src/math/big/ratconv.go b/src/math/big/ratconv.go
index ef2b675..a6a401c 100644
--- a/src/math/big/ratconv.go
+++ b/src/math/big/ratconv.go
@@ -18,6 +18,9 @@
 	return strings.ContainsRune("+-/0123456789.eE", ch)
 }
 
+var ratZero Rat
+var _ fmt.Scanner = &ratZero // *Rat must implement fmt.Scanner
+
 // Scan is a support routine for fmt.Scanner. It accepts the formats
 // 'e', 'E', 'f', 'F', 'g', 'G', and 'v'. All formats are equivalent.
 func (z *Rat) Scan(s fmt.ScanState, ch rune) error {
@@ -36,8 +39,9 @@
 
 // SetString sets z to the value of s and returns z and a boolean indicating
 // success. s can be given as a fraction "a/b" or as a floating-point number
-// optionally followed by an exponent. If the operation failed, the value of
-// z is undefined but the returned value is nil.
+// optionally followed by an exponent. The entire string (not just a prefix)
+// must be valid for success. If the operation failed, the value of z is
+// undefined but the returned value is nil.
 func (z *Rat) SetString(s string) (*Rat, bool) {
 	if len(s) == 0 {
 		return nil, false
@@ -49,9 +53,13 @@
 		if _, ok := z.a.SetString(s[:sep], 0); !ok {
 			return nil, false
 		}
-		s = s[sep+1:]
+		r := strings.NewReader(s[sep+1:])
 		var err error
-		if z.b.abs, _, _, err = z.b.abs.scan(strings.NewReader(s), 0, false); err != nil {
+		if z.b.abs, _, _, err = z.b.abs.scan(r, 0, false); err != nil {
+			return nil, false
+		}
+		// entire string must have been consumed
+		if _, err = r.ReadByte(); err != io.EOF {
 			return nil, false
 		}
 		if len(z.b.abs) == 0 {
diff --git a/src/math/big/ratconv_test.go b/src/math/big/ratconv_test.go
index 35ad6cc..56ac8d7 100644
--- a/src/math/big/ratconv_test.go
+++ b/src/math/big/ratconv_test.go
@@ -50,6 +50,10 @@
 	{"204211327800791583.81095", "4084226556015831676219/20000", true},
 	{"0e9999999999", "0", true}, // issue #16176
 	{in: "1/0"},
+	{in: "4/3/2"}, // issue 17001
+	{in: "4/3/"},
+	{in: "4/3."},
+	{in: "4/"},
 }
 
 // These are not supported by fmt.Fscanf.
@@ -59,6 +63,7 @@
 	{"-010.", "-10", true},
 	{"0x10/0x20", "1/2", true},
 	{"0b1000/3", "8/3", true},
+	{in: "4/3x"},
 	// TODO(gri) add more tests
 }
 
@@ -139,7 +144,7 @@
 }
 
 // Test inputs to Rat.SetString. The prefix "long:" causes the test
-// to be skipped in --test.short mode.  (The threshold is about 500us.)
+// to be skipped except in -long mode.  (The threshold is about 500us.)
 var float64inputs = []string{
 	// Constants plundered from strconv/testfp.txt.
 
@@ -345,7 +350,7 @@
 func TestFloat32SpecialCases(t *testing.T) {
 	for _, input := range float64inputs {
 		if strings.HasPrefix(input, "long:") {
-			if testing.Short() {
+			if !*long {
 				continue
 			}
 			input = input[len("long:"):]
@@ -401,7 +406,7 @@
 func TestFloat64SpecialCases(t *testing.T) {
 	for _, input := range float64inputs {
 		if strings.HasPrefix(input, "long:") {
-			if testing.Short() {
+			if !*long {
 				continue
 			}
 			input = input[len("long:"):]
diff --git a/src/math/cmplx/cmath_test.go b/src/math/cmplx/cmath_test.go
index d904be8..7a5c485 100644
--- a/src/math/cmplx/cmath_test.go
+++ b/src/math/cmplx/cmath_test.go
@@ -759,6 +759,14 @@
 	}
 }
 
+// See issue 17577
+func TestInfiniteLoopIntanSeries(t *testing.T) {
+	want := Inf()
+	if got := Cot(0); got != want {
+		t.Errorf("Cot(0): got %g, want %g", got, want)
+	}
+}
+
 func BenchmarkAbs(b *testing.B) {
 	for i := 0; i < b.N; i++ {
 		Abs(complex(2.5, 3.5))
diff --git a/src/math/cmplx/example_test.go b/src/math/cmplx/example_test.go
new file mode 100644
index 0000000..f0ed963
--- /dev/null
+++ b/src/math/cmplx/example_test.go
@@ -0,0 +1,28 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cmplx_test
+
+import (
+	"fmt"
+	"math"
+	"math/cmplx"
+)
+
+func ExampleAbs() {
+	fmt.Printf("%.1f", cmplx.Abs(3+4i))
+	// Output: 5.0
+}
+
+// ExampleExp computes Euler's identity.
+func ExampleExp() {
+	fmt.Printf("%.1f", cmplx.Exp(1i*math.Pi)+1)
+	// Output: (0.0+0.0i)
+}
+
+func ExamplePolar() {
+	r, theta := cmplx.Polar(2i)
+	fmt.Printf("r: %.1f, θ: %.1f*π", r, theta/math.Pi)
+	// Output: r: 2.0, θ: 0.5*π
+}
diff --git a/src/math/cmplx/tan.go b/src/math/cmplx/tan.go
index 9485315..2990552 100644
--- a/src/math/cmplx/tan.go
+++ b/src/math/cmplx/tan.go
@@ -120,9 +120,9 @@
 	rn := 0.0
 	d := 0.0
 	for {
-		rn += 1
+		rn++
 		f *= rn
-		rn += 1
+		rn++
 		f *= rn
 		x2 *= x
 		y2 *= y
@@ -130,16 +130,18 @@
 		t /= f
 		d += t
 
-		rn += 1
+		rn++
 		f *= rn
-		rn += 1
+		rn++
 		f *= rn
 		x2 *= x
 		y2 *= y
 		t = y2 - x2
 		t /= f
 		d += t
-		if math.Abs(t/d) <= MACHEP {
+		if !(math.Abs(t/d) > MACHEP) {
+			// Caution: Use ! and > instead of <= for correct behavior if t/d is NaN.
+			// See issue 17577.
 			break
 		}
 	}
diff --git a/src/math/cosh_s390x.s b/src/math/cosh_s390x.s
new file mode 100644
index 0000000..d061bd0
--- /dev/null
+++ b/src/math/cosh_s390x.s
@@ -0,0 +1,227 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Constants
+DATA coshrodataL23<>+0(SB)/8, $0.231904681384629956E-16
+DATA coshrodataL23<>+8(SB)/8, $0.693147180559945286E+00
+DATA coshrodataL23<>+16(SB)/8, $0.144269504088896339E+01
+DATA coshrodataL23<>+24(SB)/8, $704.E0
+GLOBL coshrodataL23<>+0(SB), RODATA, $32
+DATA coshxinf<>+0(SB)/8, $0x7FF0000000000000
+GLOBL coshxinf<>+0(SB), RODATA, $8
+DATA coshxlim1<>+0(SB)/8, $800.E0
+GLOBL coshxlim1<>+0(SB), RODATA, $8
+DATA coshxaddhy<>+0(SB)/8, $0xc2f0000100003fdf
+GLOBL coshxaddhy<>+0(SB), RODATA, $8
+DATA coshx4ff<>+0(SB)/8, $0x4ff0000000000000
+GLOBL coshx4ff<>+0(SB), RODATA, $8
+DATA coshe1<>+0(SB)/8, $0x3ff000000000000a
+GLOBL coshe1<>+0(SB), RODATA, $8
+
+// Log multiplier table
+DATA coshtab<>+0(SB)/8, $0.442737824274138381E-01
+DATA coshtab<>+8(SB)/8, $0.263602189790660309E-01
+DATA coshtab<>+16(SB)/8, $0.122565642281703586E-01
+DATA coshtab<>+24(SB)/8, $0.143757052860721398E-02
+DATA coshtab<>+32(SB)/8, $-.651375034121276075E-02
+DATA coshtab<>+40(SB)/8, $-.119317678849450159E-01
+DATA coshtab<>+48(SB)/8, $-.150868749549871069E-01
+DATA coshtab<>+56(SB)/8, $-.161992609578469234E-01
+DATA coshtab<>+64(SB)/8, $-.154492360403337917E-01
+DATA coshtab<>+72(SB)/8, $-.129850717389178721E-01
+DATA coshtab<>+80(SB)/8, $-.892902649276657891E-02
+DATA coshtab<>+88(SB)/8, $-.338202636596794887E-02
+DATA coshtab<>+96(SB)/8, $0.357266307045684762E-02
+DATA coshtab<>+104(SB)/8, $0.118665304327406698E-01
+DATA coshtab<>+112(SB)/8, $0.214434994118118914E-01
+DATA coshtab<>+120(SB)/8, $0.322580645161290314E-01
+GLOBL coshtab<>+0(SB), RODATA, $128
+
+// Minimax polynomial approximations
+DATA coshe2<>+0(SB)/8, $0.500000000000004237e+00
+GLOBL coshe2<>+0(SB), RODATA, $8
+DATA coshe3<>+0(SB)/8, $0.166666666630345592e+00
+GLOBL coshe3<>+0(SB), RODATA, $8
+DATA coshe4<>+0(SB)/8, $0.416666664838056960e-01
+GLOBL coshe4<>+0(SB), RODATA, $8
+DATA coshe5<>+0(SB)/8, $0.833349307718286047e-02
+GLOBL coshe5<>+0(SB), RODATA, $8
+DATA coshe6<>+0(SB)/8, $0.138926439368309441e-02
+GLOBL coshe6<>+0(SB), RODATA, $8
+
+// Cosh returns the hyperbolic cosine of x.
+//
+// Special cases are:
+//      Cosh(±0) = 1
+//      Cosh(±Inf) = +Inf
+//      Cosh(NaN) = NaN
+// The algorithm used is minimax polynomial approximation
+// with coefficients determined with a Remez exchange algorithm.
+
+TEXT ·coshAsm(SB),NOSPLIT,$0-16
+	FMOVD   x+0(FP), F0
+	MOVD    $coshrodataL23<>+0(SB), R9
+	WORD    $0xB3120000     //ltdbr %f0,%f0
+	MOVD    $0x4086000000000000, R2
+	MOVD    $0x4086000000000000, R3
+	BLTU    L19
+	FMOVD   F0, F4
+L2:
+	WORD    $0xED409018     //cdb %f4,.L24-.L23(%r9)
+	BYTE    $0x00
+	BYTE    $0x19
+	BGE     L14     //jnl   .L14
+	BVS     L14
+	WFCEDBS V4, V4, V2
+	BEQ     L20
+L1:
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L14:
+	WFCEDBS V4, V4, V2
+	BVS     L1
+	MOVD    $coshxlim1<>+0(SB), R1
+	FMOVD   0(R1), F2
+	WFCHEDBS        V4, V2, V2
+	BEQ     L21
+	MOVD    $coshxaddhy<>+0(SB), R1
+	FMOVD   coshrodataL23<>+16(SB), F5
+	FMOVD   0(R1), F2
+	WFMSDB  V0, V5, V2, V5
+	FMOVD   coshrodataL23<>+8(SB), F3
+	FADD    F5, F2
+	MOVD    $coshe6<>+0(SB), R1
+	WFMSDB  V2, V3, V0, V3
+	FMOVD   0(R1), F6
+	WFMDB   V3, V3, V1
+	MOVD    $coshe4<>+0(SB), R1
+	FMOVD   coshrodataL23<>+0(SB), F7
+	WFMADB  V2, V7, V3, V2
+	FMOVD   0(R1), F3
+	MOVD    $coshe5<>+0(SB), R1
+	WFMADB  V1, V6, V3, V6
+	FMOVD   0(R1), F7
+	MOVD    $coshe3<>+0(SB), R1
+	FMOVD   0(R1), F3
+	WFMADB  V1, V7, V3, V7
+	FNEG    F2, F3
+	WORD    $0xB3CD0015     //lgdr %r1,%f5
+	MOVD    $coshe2<>+0(SB), R3
+	WFCEDBS V4, V0, V0
+	FMOVD   0(R3), F5
+	MOVD    $coshe1<>+0(SB), R3
+	WFMADB  V1, V6, V5, V6
+	FMOVD   0(R3), F5
+	WORD    $0xEC21000F     //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WFMADB  V1, V7, V5, V1
+	BVS     L22
+	WORD    $0xEC4139BC     //risbg %r4,%r1,57,128+60,3
+	BYTE    $0x03
+	BYTE    $0x55
+	MOVD    $coshtab<>+0(SB), R3
+	WFMADB  V3, V6, V1, V6
+	WORD    $0x68043000     //ld    %f0,0(%r4,%r3)
+	FMSUB   F0, F3, F2, F2
+	WORD    $0xA71AF000     //ahi   %r1,-4096
+	WFMADB  V2, V6, V0, V6
+L17:
+	WORD    $0xEC21000F     //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0xB3C10022     //ldgr %f2,%r2
+	FMADD   F2, F6, F2, F2
+	MOVD    $coshx4ff<>+0(SB), R1
+	FMOVD   0(R1), F0
+	FMUL    F2, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L19:
+	FNEG    F0, F4
+	BR      L2
+L20:
+	MOVD    $coshxaddhy<>+0(SB), R1
+	FMOVD   coshrodataL23<>+16(SB), F3
+	FMOVD   0(R1), F2
+	WFMSDB  V0, V3, V2, V3
+	FMOVD   coshrodataL23<>+8(SB), F4
+	FADD    F3, F2
+	MOVD    $coshe6<>+0(SB), R1
+	FMSUB   F4, F2, F0, F0
+	FMOVD   0(R1), F6
+	WFMDB   V0, V0, V1
+	MOVD    $coshe4<>+0(SB), R1
+	FMOVD   0(R1), F4
+	MOVD    $coshe5<>+0(SB), R1
+	FMOVD   coshrodataL23<>+0(SB), F5
+	WFMADB  V1, V6, V4, V6
+	FMADD   F5, F2, F0, F0
+	FMOVD   0(R1), F2
+	MOVD    $coshe3<>+0(SB), R1
+	FMOVD   0(R1), F4
+	WFMADB  V1, V2, V4, V2
+	MOVD    $coshe2<>+0(SB), R1
+	FMOVD   0(R1), F5
+	FNEG    F0, F4
+	WFMADB  V1, V6, V5, V6
+	MOVD    $coshe1<>+0(SB), R1
+	FMOVD   0(R1), F5
+	WFMADB  V1, V2, V5, V1
+	WORD    $0xB3CD0013     //lgdr  %r1,%f3
+	MOVD    $coshtab<>+0(SB), R5
+	WFMADB  V4, V6, V1, V3
+	WORD    $0xEC4139BC     //risbg %r4,%r1,57,128+60,3
+	BYTE    $0x03
+	BYTE    $0x55
+	WFMSDB  V4, V6, V1, V6
+	WORD    $0x68145000     //ld %f1,0(%r4,%r5)
+	WFMSDB  V4, V1, V0, V2
+	WORD    $0xA7487FBE     //lhi %r4,32702
+	FMADD   F3, F2, F1, F1
+	SUBW    R1, R4
+	WORD    $0xECC439BC     //risbg %r12,%r4,57,128+60,3
+	BYTE    $0x03
+	BYTE    $0x55
+	WORD    $0x682C5000     //ld %f2,0(%r12,%r5)
+	FMSUB   F2, F4, F0, F0
+	WORD    $0xEC21000F     //risbgn %r2,%r1,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WFMADB  V0, V6, V2, V6
+	WORD    $0xEC34000F     //risbgn %r3,%r4,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0xB3C10022     //ldgr %f2,%r2
+	WORD    $0xB3C10003     //ldgr %f0,%r3
+	FMADD   F2, F1, F2, F2
+	FMADD   F0, F6, F0, F0
+	FADD    F2, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L22:
+	WORD    $0xA7387FBE     //lhi %r3,32702
+	MOVD    $coshtab<>+0(SB), R4
+	SUBW    R1, R3
+	WFMSDB  V3, V6, V1, V6
+	WORD    $0xEC3339BC     //risbg %r3,%r3,57,128+60,3
+	BYTE    $0x03
+	BYTE    $0x55
+	WORD    $0x68034000     //ld %f0,0(%r3,%r4)
+	FMSUB   F0, F3, F2, F2
+	WORD    $0xA7386FBE     //lhi %r3,28606
+	WFMADB  V2, V6, V0, V6
+	SUBW    R1, R3, R1
+	BR      L17
+L21:
+	MOVD    $coshxinf<>+0(SB), R1
+	FMOVD   0(R1), F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
diff --git a/src/math/dim_arm64.s b/src/math/dim_arm64.s
new file mode 100644
index 0000000..4b6b592
--- /dev/null
+++ b/src/math/dim_arm64.s
@@ -0,0 +1,78 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+#define PosInf 0x7FF0000000000000
+#define NaN    0x7FF8000000000001
+#define NegInf 0xFFF0000000000000
+
+// func Dim(x, y float64) float64
+TEXT ·Dim(SB),NOSPLIT,$0
+	// (+Inf, +Inf) special case
+	MOVD	$PosInf, R0
+	MOVD	x+0(FP), R1
+	MOVD	y+8(FP), R2
+	CMP	R0, R1
+	BNE	dim2
+	CMP	R0, R2
+	BEQ	bothInf
+dim2:	// (-Inf, -Inf) special case
+	MOVD	$NegInf, R0
+	CMP	R0, R1
+	BNE	dim3
+	CMP	R0, R2
+	BEQ	bothInf
+dim3:	// normal case
+	FMOVD	R1, F0
+	FMOVD	R2, F1
+	FMOVD	$0.0, F2
+	FSUBD	F1, F0
+	FMAXD	F0, F2, F0
+	FMOVD	F0, ret+16(FP)
+	RET
+bothInf:
+	MOVD	$NaN, R0
+	MOVD	R0, ret+16(FP)
+	RET
+
+// func Max(x, y float64) float64
+TEXT ·Max(SB),NOSPLIT,$0
+	// +Inf special cases
+	MOVD	$PosInf, R0
+	MOVD	x+0(FP), R1
+	CMP	R0, R1
+	BEQ	isPosInf
+	MOVD	y+8(FP), R2
+	CMP	R0, R2
+	BEQ	isPosInf
+	// normal case
+	FMOVD	R1, F0
+	FMOVD	R2, F1
+	FMAXD	F0, F1, F0
+	FMOVD	F0, ret+16(FP)
+	RET
+isPosInf: // return +Inf
+	MOVD	R0, ret+16(FP)
+	RET
+
+// func Min(x, y float64) float64
+TEXT ·Min(SB),NOSPLIT,$0
+	// -Inf special cases
+	MOVD	$NegInf, R0
+	MOVD	x+0(FP), R1
+	CMP	R0, R1
+	BEQ	isNegInf
+	MOVD	y+8(FP), R2
+	CMP	R0, R2
+	BEQ	isNegInf
+	// normal case
+	FMOVD	R1, F0
+	FMOVD	R2, F1
+	FMIND	F0, F1, F0
+	FMOVD	F0, ret+16(FP)
+	RET
+isNegInf: // return -Inf
+	MOVD	R0, ret+16(FP)
+	RET
diff --git a/src/math/exp_386.s b/src/math/exp_386.s
index 18a92ef..9d63295 100644
--- a/src/math/exp_386.s
+++ b/src/math/exp_386.s
@@ -6,36 +6,6 @@
 
 // func Exp(x float64) float64
 TEXT ·Exp(SB),NOSPLIT,$0
-// test bits for not-finite
-	MOVL    x_hi+4(FP), AX
-	ANDL    $0x7ff00000, AX
-	CMPL    AX, $0x7ff00000
-	JEQ     not_finite
-	FLDL2E                // F0=log2(e)
-	FMULD   x+0(FP), F0   // F0=x*log2(e)
-	FMOVD   F0, F1        // F0=x*log2(e), F1=x*log2(e)
-	FRNDINT               // F0=int(x*log2(e)), F1=x*log2(e)
-	FSUBD   F0, F1        // F0=int(x*log2(e)), F1=x*log2(e)-int(x*log2(e))
-	FXCHD   F0, F1        // F0=x*log2(e)-int(x*log2(e)), F1=int(x*log2(e))
-	F2XM1                 // F0=2**(x*log2(e)-int(x*log2(e)))-1, F1=int(x*log2(e))
-	FLD1                  // F0=1, F1=2**(x*log2(e)-int(x*log2(e)))-1, F2=int(x*log2(e))
-	FADDDP  F0, F1        // F0=2**(x*log2(e)-int(x*log2(e))), F1=int(x*log2(e))
-	FSCALE                // F0=e**x, F1=int(x*log2(e))
-	FMOVDP  F0, F1        // F0=e**x
-	FMOVDP  F0, ret+8(FP)
-	RET
-not_finite:
-// test bits for -Inf
-	MOVL    x_hi+4(FP), BX
-	MOVL    x_lo+0(FP), CX
-	CMPL    BX, $0xfff00000
-	JNE     not_neginf
-	CMPL    CX, $0
-	JNE     not_neginf
-	FLDZ                  // F0=0
-	FMOVDP  F0, ret+8(FP)
-	RET
-not_neginf:
-	MOVL    CX, ret_lo+8(FP)
-	MOVL    BX, ret_hi+12(FP)
-	RET
+	// Used to use 387 assembly (FLDL2E+F2XM1) here,
+	// but it was both slower and less accurate than the portable Go code.
+	JMP ·exp(SB)
diff --git a/src/math/expm1.go b/src/math/expm1.go
index 8ce67e5..7dd75a8 100644
--- a/src/math/expm1.go
+++ b/src/math/expm1.go
@@ -229,7 +229,7 @@
 		}
 		t := Float64frombits(uint64(0x3ff-k) << 52) // 2**-k
 		y := x - (e + t)
-		y += 1
+		y++
 		y = Float64frombits(Float64bits(y) + uint64(k)<<52) // add k to y's exponent
 		return y
 	}
diff --git a/src/math/export_s390x_test.go b/src/math/export_s390x_test.go
new file mode 100644
index 0000000..3fdbd86
--- /dev/null
+++ b/src/math/export_s390x_test.go
@@ -0,0 +1,14 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package math
+
+// Export internal functions and the hasVX variable for testing.
+var Log10NoVec = log10
+var CosNoVec = cos
+var CoshNoVec = cosh
+var SinNoVec = sin
+var SinhNoVec = sinh
+var TanhNoVec = tanh
+var HasVX = hasVX
diff --git a/src/math/floor_arm64.s b/src/math/floor_arm64.s
new file mode 100644
index 0000000..6d240d4
--- /dev/null
+++ b/src/math/floor_arm64.s
@@ -0,0 +1,26 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func Floor(x float64) float64
+TEXT ·Floor(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FRINTMD	F0, F0	// round to integral value, toward -Inf
+	FMOVD	F0, ret+8(FP)
+	RET
+
+// func Ceil(x float64) float64
+TEXT ·Ceil(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FRINTPD	F0, F0	// round to integral value, toward +Inf
+	FMOVD	F0, ret+8(FP)
+	RET
+
+// func Trunc(x float64) float64
+TEXT ·Trunc(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FRINTZD	F0, F0	// round to integral value, toward zero
+	FMOVD	F0, ret+8(FP)
+	RET
diff --git a/src/math/floor_ppc64x.s b/src/math/floor_ppc64x.s
new file mode 100644
index 0000000..2ab011d
--- /dev/null
+++ b/src/math/floor_ppc64x.s
@@ -0,0 +1,25 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ppc64 ppc64le
+
+#include "textflag.h"
+
+TEXT ·Floor(SB),NOSPLIT,$0	// func Floor(x float64) float64
+	FMOVD   x+0(FP), F0
+	FRIM	F0, F0	// round to integer, toward -Inf
+	FMOVD   F0, ret+8(FP)
+	RET
+
+TEXT ·Ceil(SB),NOSPLIT,$0	// func Ceil(x float64) float64
+	FMOVD   x+0(FP), F0
+	FRIP    F0, F0	// round to integer, toward +Inf
+	FMOVD	F0, ret+8(FP)
+	RET
+
+TEXT ·Trunc(SB),NOSPLIT,$0	// func Trunc(x float64) float64
+	FMOVD   x+0(FP), F0
+	FRIZ    F0, F0	// round to integer, toward zero
+	FMOVD   F0, ret+8(FP)
+	RET
diff --git a/src/math/floor_s390x.s b/src/math/floor_s390x.s
new file mode 100644
index 0000000..896e79b
--- /dev/null
+++ b/src/math/floor_s390x.s
@@ -0,0 +1,26 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func Floor(x float64) float64
+TEXT ·Floor(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FIDBR	$7, F0, F0	// rounding mode 7: toward -Inf
+	FMOVD	F0, ret+8(FP)
+	RET
+
+// func Ceil(x float64) float64
+TEXT ·Ceil(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FIDBR	$6, F0, F0	// rounding mode 6: toward +Inf
+	FMOVD	F0, ret+8(FP)
+	RET
+
+// func Trunc(x float64) float64
+TEXT ·Trunc(SB),NOSPLIT,$0
+	FMOVD	x+0(FP), F0
+	FIDBR	$5, F0, F0	// rounding mode 5: toward zero
+	FMOVD	F0, ret+8(FP)
+	RET
diff --git a/src/math/gamma.go b/src/math/gamma.go
index 841ec11..cc9e869 100644
--- a/src/math/gamma.go
+++ b/src/math/gamma.go
@@ -91,23 +91,31 @@
 }
 
 // Gamma function computed by Stirling's formula.
-// The polynomial is valid for 33 <= x <= 172.
-func stirling(x float64) float64 {
+// The pair of results must be multiplied together to get the actual answer.
+// The multiplication is left to the caller so that, if careful, the caller can avoid
+// infinity for 172 <= x <= 180.
+// The polynomial is valid for 33 <= x <= 172; larger values are only used
+// in reciprocal and produce denormalized floats. The lower precision there
+// masks any imprecision in the polynomial.
+func stirling(x float64) (float64, float64) {
+	if x > 200 {
+		return Inf(1), 1
+	}
 	const (
 		SqrtTwoPi   = 2.506628274631000502417
 		MaxStirling = 143.01608
 	)
 	w := 1 / x
 	w = 1 + w*((((_gamS[0]*w+_gamS[1])*w+_gamS[2])*w+_gamS[3])*w+_gamS[4])
-	y := Exp(x)
+	y1 := Exp(x)
+	y2 := 1.0
 	if x > MaxStirling { // avoid Pow() overflow
 		v := Pow(x, 0.5*x-0.25)
-		y = v * (v / y)
+		y1, y2 = v, v/y1
 	} else {
-		y = Pow(x, x-0.5) / y
+		y1 = Pow(x, x-0.5) / y1
 	}
-	y = SqrtTwoPi * y * w
-	return y
+	return y1, SqrtTwoPi * w * y2
 }
 
 // Gamma returns the Gamma function of x.
@@ -125,22 +133,26 @@
 	switch {
 	case isNegInt(x) || IsInf(x, -1) || IsNaN(x):
 		return NaN()
+	case IsInf(x, 1):
+		return Inf(1)
 	case x == 0:
 		if Signbit(x) {
 			return Inf(-1)
 		}
 		return Inf(1)
-	case x < -170.5674972726612 || x > 171.61447887182298:
-		return Inf(1)
 	}
 	q := Abs(x)
 	p := Floor(q)
 	if q > 33 {
 		if x >= 0 {
-			return stirling(x)
+			y1, y2 := stirling(x)
+			return y1 * y2
 		}
+		// Note: x is negative but (checked above) not a negative integer,
+		// so x must be small enough to be in range for conversion to int64.
+		// If |x| were >= 2⁶³ it would have to be an integer.
 		signgam := 1
-		if ip := int(p); ip&1 == 0 {
+		if ip := int64(p); ip&1 == 0 {
 			signgam = -1
 		}
 		z := q - p
@@ -152,7 +164,14 @@
 		if z == 0 {
 			return Inf(signgam)
 		}
-		z = Pi / (Abs(z) * stirling(q))
+		sq1, sq2 := stirling(q)
+		absz := Abs(z)
+		d := absz * sq1 * sq2
+		if IsInf(d, 0) {
+			z = Pi / absz / sq1 / sq2
+		} else {
+			z = Pi / d
+		}
 		return float64(signgam) * z
 	}
 
diff --git a/src/math/j0.go b/src/math/j0.go
index cbef7aa..fe26791 100644
--- a/src/math/j0.go
+++ b/src/math/j0.go
@@ -305,20 +305,20 @@
 }
 
 func pzero(x float64) float64 {
-	var p [6]float64
-	var q [5]float64
+	var p *[6]float64
+	var q *[5]float64
 	if x >= 8 {
-		p = p0R8
-		q = p0S8
+		p = &p0R8
+		q = &p0S8
 	} else if x >= 4.5454 {
-		p = p0R5
-		q = p0S5
+		p = &p0R5
+		q = &p0S5
 	} else if x >= 2.8571 {
-		p = p0R3
-		q = p0S3
+		p = &p0R3
+		q = &p0S3
 	} else if x >= 2 {
-		p = p0R2
-		q = p0S2
+		p = &p0R2
+		q = &p0S2
 	}
 	z := 1 / (x * x)
 	r := p[0] + z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))))
@@ -408,19 +408,19 @@
 }
 
 func qzero(x float64) float64 {
-	var p, q [6]float64
+	var p, q *[6]float64
 	if x >= 8 {
-		p = q0R8
-		q = q0S8
+		p = &q0R8
+		q = &q0S8
 	} else if x >= 4.5454 {
-		p = q0R5
-		q = q0S5
+		p = &q0R5
+		q = &q0S5
 	} else if x >= 2.8571 {
-		p = q0R3
-		q = q0S3
+		p = &q0R3
+		q = &q0S3
 	} else if x >= 2 {
-		p = q0R2
-		q = q0S2
+		p = &q0R2
+		q = &q0S2
 	}
 	z := 1 / (x * x)
 	r := p[0] + z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))))
diff --git a/src/math/j1.go b/src/math/j1.go
index d359d90..f1adcb6 100644
--- a/src/math/j1.go
+++ b/src/math/j1.go
@@ -298,20 +298,20 @@
 }
 
 func pone(x float64) float64 {
-	var p [6]float64
-	var q [5]float64
+	var p *[6]float64
+	var q *[5]float64
 	if x >= 8 {
-		p = p1R8
-		q = p1S8
+		p = &p1R8
+		q = &p1S8
 	} else if x >= 4.5454 {
-		p = p1R5
-		q = p1S5
+		p = &p1R5
+		q = &p1S5
 	} else if x >= 2.8571 {
-		p = p1R3
-		q = p1S3
+		p = &p1R3
+		q = &p1S3
 	} else if x >= 2 {
-		p = p1R2
-		q = p1S2
+		p = &p1R2
+		q = &p1S2
 	}
 	z := 1 / (x * x)
 	r := p[0] + z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))))
@@ -401,19 +401,19 @@
 }
 
 func qone(x float64) float64 {
-	var p, q [6]float64
+	var p, q *[6]float64
 	if x >= 8 {
-		p = q1R8
-		q = q1S8
+		p = &q1R8
+		q = &q1S8
 	} else if x >= 4.5454 {
-		p = q1R5
-		q = q1S5
+		p = &q1R5
+		q = &q1S5
 	} else if x >= 2.8571 {
-		p = q1R3
-		q = q1S3
+		p = &q1R3
+		q = &q1S3
 	} else if x >= 2 {
-		p = q1R2
-		q = q1S2
+		p = &q1R2
+		q = &q1S2
 	}
 	z := 1 / (x * x)
 	r := p[0] + z*(p[1]+z*(p[2]+z*(p[3]+z*(p[4]+z*p[5]))))
diff --git a/src/math/jn.go b/src/math/jn.go
index 721112f..3422782 100644
--- a/src/math/jn.go
+++ b/src/math/jn.go
@@ -174,7 +174,7 @@
 			q1 := w*z - 1
 			k := 1
 			for q1 < 1e9 {
-				k += 1
+				k++
 				z += h
 				q0, q1 = q1, z*q1-q0
 			}
diff --git a/src/math/log10_s390x.s b/src/math/log10_s390x.s
new file mode 100644
index 0000000..460bcd9
--- /dev/null
+++ b/src/math/log10_s390x.s
@@ -0,0 +1,170 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Minimax polynomial coefficients and other constants
+DATA log10rodataL19<>+0(SB)/8, $0.000000000000000000E+00
+DATA log10rodataL19<>+8(SB)/8, $-1.0
+DATA log10rodataL19<>+16(SB)/8, $0x7FF8000000000000   //+NaN
+DATA log10rodataL19<>+24(SB)/8, $.15375570329280596749
+DATA log10rodataL19<>+32(SB)/8, $.60171950900703668594E+04
+DATA log10rodataL19<>+40(SB)/8, $-1.9578460454940795898
+DATA log10rodataL19<>+48(SB)/8, $0.78962633073318517310E-01
+DATA log10rodataL19<>+56(SB)/8, $-.71784211884836937993E-02
+DATA log10rodataL19<>+64(SB)/8, $0.87011165920689940661E-03
+DATA log10rodataL19<>+72(SB)/8, $-.11865158981621437541E-03
+DATA log10rodataL19<>+80(SB)/8, $0.17258413403018680410E-04
+DATA log10rodataL19<>+88(SB)/8, $0.40752932047883484315E-06
+DATA log10rodataL19<>+96(SB)/8, $-.26149194688832680410E-05
+DATA log10rodataL19<>+104(SB)/8, $0.92453396963875026759E-08
+DATA log10rodataL19<>+112(SB)/8, $-.64572084905921579630E-07
+DATA log10rodataL19<>+120(SB)/8, $-5.5
+DATA log10rodataL19<>+128(SB)/8, $18446744073709551616.   // 2**64
+GLOBL log10rodataL19<>+0(SB), RODATA, $136   // 17 entries of 8 bytes
+
+// Table of log10 correction terms
+DATA log10tab2074<>+0(SB)/8, $0.254164497922885069E-01
+DATA log10tab2074<>+8(SB)/8, $0.179018857989381839E-01
+DATA log10tab2074<>+16(SB)/8, $0.118926768029048674E-01
+DATA log10tab2074<>+24(SB)/8, $0.722595568238080033E-02
+DATA log10tab2074<>+32(SB)/8, $0.376393570022739135E-02
+DATA log10tab2074<>+40(SB)/8, $0.138901135928814326E-02
+DATA log10tab2074<>+48(SB)/8, $0
+DATA log10tab2074<>+56(SB)/8, $-0.490780466387818203E-03
+DATA log10tab2074<>+64(SB)/8, $-0.159811431402137571E-03
+DATA log10tab2074<>+72(SB)/8, $0.925796337165100494E-03
+DATA log10tab2074<>+80(SB)/8, $0.270683176738357035E-02
+DATA log10tab2074<>+88(SB)/8, $0.513079030821304758E-02
+DATA log10tab2074<>+96(SB)/8, $0.815089785397996303E-02
+DATA log10tab2074<>+104(SB)/8, $0.117253060262419215E-01
+DATA log10tab2074<>+112(SB)/8, $0.158164239345343963E-01
+DATA log10tab2074<>+120(SB)/8, $0.203903595489229786E-01
+GLOBL log10tab2074<>+0(SB), RODATA, $128   // 16 entries of 8 bytes
+
+// Log10 returns the decimal logarithm of the argument.
+//
+// Special cases are:
+//      Log10(+Inf) = +Inf
+//      Log10(0) = -Inf
+//      Log10(x < 0) = NaN
+//      Log10(NaN) = NaN
+// The algorithm used is minimax polynomial approximation
+// with coefficients determined with a Remez exchange algorithm.
+
+TEXT ·log10Asm(SB),NOSPLIT,$8-16
+	FMOVD   x+0(FP), F0	// F0 = x
+	MOVD    $log10rodataL19<>+0(SB), R9	// R9 = base address of the constant table
+	FMOVD   F0, x-8(SP)	// spill x so its bits can be read back with integer loads
+	WORD    $0xC0298006     //iilf %r2,2147909631
+	BYTE    $0x7F
+	BYTE    $0xFF
+	WORD    $0x5840F008     //l %r4, 8(%r15)
+	SUBW    R4, R2, R3
+	WORD    $0xEC5320AF     //risbg %r5,%r3,32,128+47,0
+	BYTE    $0x00
+	BYTE    $0x55
+	MOVH    $0x0, R1
+	WORD    $0xEC15001F     //risbgn %r1,%r5,64-64+0,64-64+0+32-1,64-0-32
+	BYTE    $0x20
+	BYTE    $0x59
+	WORD    $0xC0590016     //iilf %r5,1507327
+	BYTE    $0xFF
+	BYTE    $0xFF
+	MOVW    R4, R10
+	MOVW    R5, R11
+	CMPBLE  R10, R11, L2
+	WORD    $0xC0297FEF     //iilf %r2,2146435071
+	BYTE    $0xFF
+	BYTE    $0xFF
+	MOVW    R4, R10
+	MOVW    R2, R11
+	CMPBLE  R10, R11, L16
+L3:	// neither range check matched: F0 still holds x and is returned as is
+L1:	// common exit: return F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L2:
+	WORD    $0xB3120000     //ltdbr %f0,%f0
+	BLEU    L13
+	WORD    $0xED009080     //mdb %f0,.L20-.L19(%r9)
+	BYTE    $0x00
+	BYTE    $0x1C
+	FMOVD   F0, x-8(SP)
+	WORD    $0x5B20F008     //s %r2, 8(%r15)
+	WORD    $0xEC3239BC     //risbg %r3,%r2,57,128+60,64-13
+	BYTE    $0x33
+	BYTE    $0x55
+	ANDW    $0xFFFF0000, R2
+	WORD    $0xEC12001F     //risbgn %r1,%r2,64-64+0,64-64+0+32-1,64-0-32
+	BYTE    $0x20
+	BYTE    $0x59
+	ADDW    $0x4000000, R2
+	BLEU    L17
+L8:
+	SRW     $8, R2, R2
+	ORW     $0x45000000, R2
+L4:	// polynomial evaluation shared by the L2 and L16 paths
+	FMOVD   log10rodataL19<>+120(SB), F2
+	WORD    $0xB3C10041     //ldgr  %f4,%r1
+	WFMADB  V4, V0, V2, V0
+	FMOVD   log10rodataL19<>+112(SB), F4
+	FMOVD   log10rodataL19<>+104(SB), F6
+	WFMADB  V0, V6, V4, V6
+	FMOVD   log10rodataL19<>+96(SB), F4
+	FMOVD   log10rodataL19<>+88(SB), F1
+	WFMADB  V0, V1, V4, V1
+	WFMDB   V0, V0, V4
+	FMOVD   log10rodataL19<>+80(SB), F2
+	WFMADB  V6, V4, V1, V6
+	FMOVD   log10rodataL19<>+72(SB), F1
+	WFMADB  V0, V2, V1, V2
+	FMOVD   log10rodataL19<>+64(SB), F1
+	WORD    $0xEC3339BC     //risbg %r3,%r3,57,128+60,0
+	BYTE    $0x00
+	BYTE    $0x55
+	WFMADB  V4, V6, V2, V6
+	FMOVD   log10rodataL19<>+56(SB), F2
+	WFMADB  V0, V1, V2, V1
+	VLVGF   $0, R2, V2
+	WFMADB  V4, V6, V1, V4
+	LDEBR   F2, F2
+	FMOVD   log10rodataL19<>+48(SB), F6
+	WFMADB  V0, V4, V6, V4
+	FMOVD   log10rodataL19<>+40(SB), F1
+	FMOVD   log10rodataL19<>+32(SB), F6
+	MOVD    $log10tab2074<>+0(SB), R1	// R1 = base address of the correction table
+	WFMADB  V2, V1, V6, V2
+	WORD    $0x68331000     //ld %f3,0(%r3,%r1)
+	WFMADB  V0, V4, V3, V0
+	FMOVD   log10rodataL19<>+24(SB), F4
+	FMADD   F4, F2, F0, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L16:
+	WORD    $0xEC2328B7     //risbg %r2,%r3,40,128+55,64-8
+	BYTE    $0x38
+	BYTE    $0x55
+	WORD    $0xEC3339BC     //risbg %r3,%r3,57,128+60,64-13
+	BYTE    $0x33
+	BYTE    $0x55
+	ORW     $0x45000000, R2
+	BR      L4
+L13:	// reached from L2 when the ltdbr test reports x <= 0 or unordered
+	BGE     L18     //jnl .L18
+	BVS     L18
+	FMOVD   log10rodataL19<>+16(SB), F0	// NaN constant
+	BR      L1
+L17:
+	SRAW    $1, R2, R2
+	SUBW    $0x40000000, R2
+	BR      L8
+L18:
+	FMOVD   log10rodataL19<>+8(SB), F0	// F0 = -1.0
+	WORD    $0xED009000     //ddb %f0,.L36-.L19(%r9)
+	BYTE    $0x00
+	BYTE    $0x1D
+	BR      L1
diff --git a/src/math/log1p.go b/src/math/log1p.go
index d1bddfb..b128a16 100644
--- a/src/math/log1p.go
+++ b/src/math/log1p.go
@@ -167,7 +167,7 @@
 		if iu < 0x0006a09e667f3bcd { // mantissa of Sqrt(2)
 			u = Float64frombits(iu | 0x3ff0000000000000) // normalize u
 		} else {
-			k += 1
+			k++
 			u = Float64frombits(iu | 0x3fe0000000000000) // normalize u/2
 			iu = (0x0010000000000000 - iu) >> 2
 		}
@@ -179,10 +179,9 @@
 		if f == 0 {
 			if k == 0 {
 				return 0
-			} else {
-				c += float64(k) * Ln2Lo
-				return float64(k)*Ln2Hi + c
 			}
+			c += float64(k) * Ln2Lo
+			return float64(k)*Ln2Hi + c
 		}
 		R = hfsq * (1.0 - 0.66666666666666666*f) // avoid division
 		if k == 0 {
diff --git a/src/math/modf_386.s b/src/math/modf_386.s
index d9b1eeb..e916073 100644
--- a/src/math/modf_386.s
+++ b/src/math/modf_386.s
@@ -7,16 +7,16 @@
 // func Modf(f float64) (int float64, frac float64)
 TEXT ·Modf(SB),NOSPLIT,$0
 	// special case for f == -0.0
-	MOVL f+4(FP), DX	// high word
-	MOVL f+0(FP), AX	// low word
+	MOVL f_hi+4(FP), DX	// high word
+	MOVL f_lo+0(FP), AX	// low word
 	CMPL DX, $(1<<31)	// beginning of -0.0
 	JNE notNegativeZero
 	CMPL AX, $0			// could be denormalized
 	JNE notNegativeZero
-	MOVL AX, int+8(FP)
-	MOVL DX, int+12(FP)
-	MOVL AX, frac+16(FP)
-	MOVL DX, frac+20(FP)
+	MOVL AX, int_lo+8(FP)
+	MOVL DX, int_hi+12(FP)
+	MOVL AX, frac_lo+16(FP)
+	MOVL DX, frac_hi+20(FP)
 	RET
 notNegativeZero:
 	FMOVD   f+0(FP), F0  // F0=f
diff --git a/src/math/modf_arm64.s b/src/math/modf_arm64.s
new file mode 100644
index 0000000..7c70ef3
--- /dev/null
+++ b/src/math/modf_arm64.s
@@ -0,0 +1,18 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// func Modf(f float64) (int float64, frac float64)
+TEXT ·Modf(SB),NOSPLIT,$0
+	MOVD	f+0(FP), R0	// R0 = raw bits of f
+	FMOVD	R0, F0
+	FRINTZD	F0, F1	// F1 = f rounded toward zero (the integer part)
+	FMOVD	F1, int+8(FP)
+	FSUBD	F1, F0	// F0 = f - int (the fractional part)
+	FMOVD	F0, R1
+	AND	$(1<<63), R0	// keep only the sign bit of f
+	ORR	R0, R1 // must have same sign
+	MOVD	R1, frac+16(FP)
+	RET
diff --git a/src/math/rand/gen_cooked.go b/src/math/rand/gen_cooked.go
new file mode 100644
index 0000000..567b7a8
--- /dev/null
+++ b/src/math/rand/gen_cooked.go
@@ -0,0 +1,89 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+// This program computes the value of rng_cooked in rng.go,
+// which is used for seeding all instances of rand.Source.
+// A 64-bit and a 63-bit version of the array are printed to
+// the standard output.
+
+package main
+
+import "fmt"
+
+const (
+	length = 607           // size of the rngVec state array
+	tap    = 273           // rngFeed starts at length-tap (see srand)
+	mask   = (1 << 63) - 1 // selects the low 63 bits
+	a      = 48271         // multiplier in seedrand
+	m      = (1 << 31) - 1 // modulus in seedrand
+	q      = 44488         // m / a
+	r      = 3399          // m % a
+)
+
+var (
+	rngVec          [length]int64 // generator state
+	rngTap, rngFeed int           // indices into rngVec, stepped by vrand
+)
+
+func seedrand(x int32) int32 {
+	hi := x / q
+	lo := x % q
+	x = a*lo - r*hi // a*x mod m, split via q = m/a and r = m%a so int32 does not overflow
+	if x < 0 {
+		x += m // bring the result back into [0, m)
+	}
+	return x
+}
+
+func srand(seed int32) {
+	rngTap = 0
+	rngFeed = length - tap
+	seed %= m // reduce the seed into (-m, m)
+	if seed < 0 {
+		seed += m
+	} else if seed == 0 {
+		seed = 89482311 // seedrand(0) is 0, so substitute a fixed nonzero seed
+	}
+	x := seed
+	for i := -20; i < length; i++ { // the first 20 iterations only advance x
+		x = seedrand(x)
+		if i >= 0 {
+			var u int64
+			u = int64(x) << 20
+			x = seedrand(x)
+			u ^= int64(x) << 10
+			x = seedrand(x)
+			u ^= int64(x)
+			rngVec[i] = u // three successive seedrand outputs combined into one word
+		}
+	}
+}
+
+func vrand() int64 {
+	rngTap--
+	if rngTap < 0 {
+		rngTap += length // wrap around
+	}
+	rngFeed--
+	if rngFeed < 0 {
+		rngFeed += length // wrap around
+	}
+	x := (rngVec[rngFeed] + rngVec[rngTap]) // int64 addition; overflow wraps
+	rngVec[rngFeed] = x // feed the sum back into the state
+	return x
+}
+
+func main() {
+	srand(1)
+	for i := uint64(0); i < 7.8e12; i++ { // results are discarded; only the final rngVec state is used
+		vrand()
+	}
+	fmt.Printf("rngVec after 7.8e12 calls to vrand:\n%#v\n", rngVec)
+	for i := range rngVec {
+		rngVec[i] &= mask // clear the top bit of every word
+	}
+	fmt.Printf("lower 63bit of rngVec after 7.8e12 calls to vrand:\n%#v\n", rngVec)
+}
diff --git a/src/math/rand/race_test.go b/src/math/rand/race_test.go
index 48f6c29..186c716 100644
--- a/src/math/rand/race_test.go
+++ b/src/math/rand/race_test.go
@@ -33,6 +33,7 @@
 				seed += int64(Int63n(Int63()))
 				seed += int64(NormFloat64())
 				seed += int64(Uint32())
+				seed += int64(Uint64())
 				for _, p := range Perm(10) {
 					seed += int64(p)
 				}
diff --git a/src/math/rand/rand.go b/src/math/rand/rand.go
index dd8d43c..9fe1cbd 100644
--- a/src/math/rand/rand.go
+++ b/src/math/rand/rand.go
@@ -23,7 +23,20 @@
 	Seed(seed int64)
 }
 
+// A Source64 is a Source that can also generate
+// uniformly-distributed pseudo-random uint64 values in
+// the range [0, 1<<64) directly.
+// If a Rand r's underlying Source s implements Source64,
+// then r.Uint64 returns the result of one call to s.Uint64
+// instead of making two calls to s.Int63.
+type Source64 interface {
+	Source
+	Uint64() uint64
+}
+
 // NewSource returns a new pseudo-random Source seeded with the given value.
+// Unlike the default Source used by top-level functions, this source is not
+// safe for concurrent use by multiple goroutines.
 func NewSource(seed int64) Source {
 	var rng rngSource
 	rng.Seed(seed)
@@ -33,6 +46,7 @@
 // A Rand is a source of random numbers.
 type Rand struct {
 	src Source
+	s64 Source64 // non-nil if src is source64
 
 	// readVal contains remainder of 63-bit integer used for bytes
 	// generation during most recent Read call.
@@ -46,7 +60,10 @@
 
 // New returns a new Rand that uses random values from src
 // to generate other random values.
-func New(src Source) *Rand { return &Rand{src: src} }
+func New(src Source) *Rand {
+	s64, _ := src.(Source64)
+	return &Rand{src: src, s64: s64}
+}
 
 // Seed uses the provided seed value to initialize the generator to a deterministic state.
 // Seed should not be called concurrently with any other Rand method.
@@ -66,6 +83,14 @@
 // Uint32 returns a pseudo-random 32-bit value as a uint32.
 func (r *Rand) Uint32() uint32 { return uint32(r.Int63() >> 31) }
 
+// Uint64 returns a pseudo-random 64-bit value as a uint64.
+func (r *Rand) Uint64() uint64 {
+	if r.s64 != nil {
+		return r.s64.Uint64()
+	}
+	return uint64(r.Int63())>>31 | uint64(r.Int63())<<32
+}
+
 // Int31 returns a non-negative pseudo-random 31-bit integer as an int32.
 func (r *Rand) Int31() int32 { return int32(r.Int63() >> 32) }
 
@@ -207,7 +232,7 @@
  * Top-level convenience functions
  */
 
-var globalRand = New(&lockedSource{src: NewSource(1)})
+var globalRand = New(&lockedSource{src: NewSource(1).(Source64)})
 
 // Seed uses the provided seed value to initialize the default Source to a
 // deterministic state. If Seed is not called, the generator behaves as
@@ -224,6 +249,10 @@
 // from the default Source.
 func Uint32() uint32 { return globalRand.Uint32() }
 
+// Uint64 returns a pseudo-random 64-bit value as a uint64
+// from the default Source.
+func Uint64() uint64 { return globalRand.Uint64() }
+
 // Int31 returns a non-negative pseudo-random 31-bit integer as an int32
 // from the default Source.
 func Int31() int32 { return globalRand.Int31() }
@@ -286,7 +315,7 @@
 
 type lockedSource struct {
 	lk  sync.Mutex
-	src Source
+	src Source64
 }
 
 func (r *lockedSource) Int63() (n int64) {
@@ -296,6 +325,13 @@
 	return
 }
 
+func (r *lockedSource) Uint64() (n uint64) {
+	r.lk.Lock()
+	n = r.src.Uint64()
+	r.lk.Unlock()
+	return
+}
+
 func (r *lockedSource) Seed(seed int64) {
 	r.lk.Lock()
 	r.src.Seed(seed)
diff --git a/src/math/rand/rand_test.go b/src/math/rand/rand_test.go
index 6f31279..bf509e0 100644
--- a/src/math/rand/rand_test.go
+++ b/src/math/rand/rand_test.go
@@ -328,13 +328,26 @@
 	}
 }
 
+func hasSlowFloatingPoint() bool {
+	switch runtime.GOARCH {
+	case "arm":
+		return os.Getenv("GOARM") == "5"
+	case "mips", "mipsle", "mips64", "mips64le":
+		// Be conservative and assume that all mips boards
+		// have emulated floating point.
+		// TODO: detect what it actually has.
+		return true
+	}
+	return false
+}
+
 func TestFloat32(t *testing.T) {
 	// For issue 6721, the problem came after 7533753 calls, so check 10e6.
 	num := int(10e6)
 	// But do the full amount only on builders (not locally).
 	// But ARM5 floating point emulation is slow (Issue 10749), so
 	// do less for that builder:
-	if testing.Short() && (testenv.Builder() == "" || runtime.GOARCH == "arm" && os.Getenv("GOARM") == "5") {
+	if testing.Short() && (testenv.Builder() == "" || hasSlowFloatingPoint()) {
 		num /= 100 // 1.72 seconds instead of 172 seconds
 	}
 
diff --git a/src/math/rand/regress_test.go b/src/math/rand/regress_test.go
index 4dd965c..e31e6c5 100644
--- a/src/math/rand/regress_test.go
+++ b/src/math/rand/regress_test.go
@@ -381,4 +381,24 @@
 	uint32(75079301),                                                   // Uint32()
 	uint32(3380456901),                                                 // Uint32()
 	uint32(3433369789),                                                 // Uint32()
+	uint64(8717895732742165505),                                        // Uint64()
+	uint64(2259404117704393152),                                        // Uint64()
+	uint64(6050128673802995827),                                        // Uint64()
+	uint64(9724605487393973602),                                        // Uint64()
+	uint64(12613765599614152010),                                       // Uint64()
+	uint64(11893357769247901871),                                       // Uint64()
+	uint64(1774932891286980153),                                        // Uint64()
+	uint64(15267744271532198264),                                       // Uint64()
+	uint64(17498302081433670737),                                       // Uint64()
+	uint64(1543572285742637646),                                        // Uint64()
+	uint64(11885104867954719224),                                       // Uint64()
+	uint64(17548432336275752516),                                       // Uint64()
+	uint64(7837839688282259259),                                        // Uint64()
+	uint64(2518412263346885298),                                        // Uint64()
+	uint64(5617773211005988520),                                        // Uint64()
+	uint64(11562935753659892057),                                       // Uint64()
+	uint64(16368296284793757383),                                       // Uint64()
+	uint64(161231572858529631),                                         // Uint64()
+	uint64(16482847956365694147),                                       // Uint64()
+	uint64(16596477517051940556),                                       // Uint64()
 }
diff --git a/src/math/rand/rng.go b/src/math/rand/rng.go
index 947c49f..f922417 100644
--- a/src/math/rand/rng.go
+++ b/src/math/rand/rng.go
@@ -23,161 +23,159 @@
 )
 
 var (
-	// cooked random numbers
-	// the state of the rng
-	// after 780e10 iterations
+	// Used for seeding. See gen_cooked.go for details.
 	rng_cooked [_LEN]int64 = [...]int64{
-		5041579894721019882, 4646389086726545243, 1395769623340756751, 5333664234075297259,
-		2875692520355975054, 9033628115061424579, 7143218595135194537, 4812947590706362721,
-		7937252194349799378, 5307299880338848416, 8209348851763925077, 2115741599318814044,
-		4593015457530856296, 8140875735541888011, 3319429241265089026, 8619815648190321034,
-		1727074043483619500, 113108499721038619, 4569519971459345583, 5062833859075314731,
-		2387618771259064424, 2716131344356686112, 6559392774825876886, 7650093201692370310,
-		7684323884043752161, 257867835996031390, 6593456519409015164, 271327514973697897,
-		2789386447340118284, 1065192797246149621, 3344507881999356393, 4459797941780066633,
-		7465081662728599889, 1014950805555097187, 4449440729345990775, 3481109366438502643,
+		-4181792142133755926, -4576982950128230565, 1395769623340756751, 5333664234075297259,
+		-6347679516498800754, 9033628115061424579, 7143218595135194537, 4812947590706362721,
+		7937252194349799378, 5307299880338848416, 8209348851763925077, -7107630437535961764,
+		4593015457530856296, 8140875735541888011, -5903942795589686782, -603556388664454774,
+		-7496297993371156308, 113108499721038619, 4569519971459345583, -4160538177779461077,
+		-6835753265595711384, -6507240692498089696, 6559392774825876886, 7650093201692370310,
+		7684323884043752161, -8965504200858744418, -2629915517445760644, 271327514973697897,
+		-6433985589514657524, 1065192797246149621, 3344507881999356393, -4763574095074709175,
+		7465081662728599889, 1014950805555097187, -4773931307508785033, -5742262670416273165,
 		2418672789110888383, 5796562887576294778, 4484266064449540171, 3738982361971787048,
-		4523597184512354423, 10530508058128498, 8633833783282346118, 2625309929628791628,
-		8660405965245884302, 10162832508971942, 6540714680961817391, 7031802312784620857,
-		6240911277345944669, 831864355460801054, 8004434137542152891, 2116287251661052151,
+		-4699774852342421385, 10530508058128498, -589538253572429690, -6598062107225984180,
+		8660405965245884302, 10162832508971942, -2682657355892958417, 7031802312784620857,
+		6240911277345944669, 831864355460801054, -1218937899312622917, 2116287251661052151,
 		2202309800992166967, 9161020366945053561, 4069299552407763864, 4936383537992622449,
-		457351505131524928, 342195045928179354, 2847771682816600509, 2068020115986376518,
-		4368649989588021065, 887231587095185257, 5563591506886576496, 6816225200251950296,
-		5616972787034086048, 8471809303394836566, 1686575021641186857, 4045484338074262002,
-		4244156215201778923, 7848217333783577387, 5632136521049761902, 833283142057835272,
-		9029726508369077193, 3243583134664087292, 4316371101804477087, 8937849979965997980,
-		6446940406810434101, 1679342092332374735, 6050638460742422078, 6993520719509581582,
-		7640877852514293609, 5881353426285907985, 812786550756860885, 4541845584483343330,
-		2725470216277009086, 4980675660146853729, 5210769080603236061, 8894283318990530821,
-		6326442804750084282, 1495812843684243920, 7069751578799128019, 7370257291860230865,
-		6756929275356942261, 4706794511633873654, 7824520467827898663, 8549875090542453214,
-		33650829478596156, 1328918435751322643, 7297902601803624459, 1011190183918857495,
-		2238025036817854944, 5147159997473910359, 896512091560522982, 2659470849286379941,
-		6097729358393448602, 1731725986304753684, 4106255841983812711, 8327155210721535508,
-		8477511620686074402, 5803876044675762232, 8435417780860221662, 5988852856651071244,
-		4715837297103951910, 7566171971264485114, 505808562678895611, 5070098180695063370,
-		842110666775871513, 572156825025677802, 1791881013492340891, 3393267094866038768,
-		3778721850472236509, 2352769483186201278, 1292459583847367458, 8897907043675088419,
-		5781809037144163536, 2733958794029492513, 5092019688680754699, 8996124554772526841,
-		4234737173186232084, 5027558287275472836, 4635198586344772304, 8687338893267139351,
-		5907508150730407386, 784756255473944452, 972392927514829904, 5422057694808175112,
-		5158420642969283891, 9048531678558643225, 2407211146698877100, 7583282216521099569,
-		3940796514530962282, 3341174631045206375, 3095313889586102949, 7405321895688238710,
-		5832080132947175283, 7890064875145919662, 8184139210799583195, 1149859861409226130,
-		1464597243840211302, 4641648007187991873, 3516491885471466898, 956288521791657692,
+		457351505131524928, -8881176990926596454, -6375600354038175299, -7155351920868399290,
+		4368649989588021065, 887231587095185257, -3659780529968199312, -2407146836602825512,
+		5616972787034086048, -751562733459939242, 1686575021641186857, -5177887698780513806,
+		-4979215821652996885, -1375154703071198421, 5632136521049761902, -8390088894796940536,
+		-193645528485698615, -5979788902190688516, -4907000935050298721, -285522056888777828,
+		-2776431630044341707, 1679342092332374735, 6050638460742422078, -2229851317345194226,
+		-1582494184340482199, 5881353426285907985, 812786550756860885, 4541845584483343330,
+		-6497901820577766722, 4980675660146853729, -4012602956251539747, -329088717864244987,
+		-2896929232104691526, 1495812843684243920, -2153620458055647789, 7370257291860230865,
+		-2466442761497833547, 4706794511633873654, -1398851569026877145, 8549875090542453214,
+		-9189721207376179652, -7894453601103453165, 7297902601803624459, 1011190183918857495,
+		-6985347000036920864, 5147159997473910359, -8326859945294252826, 2659470849286379941,
+		6097729358393448602, -7491646050550022124, -5117116194870963097, -896216826133240300,
+		-745860416168701406, 5803876044675762232, -787954255994554146, -3234519180203704564,
+		-4507534739750823898, -1657200065590290694, 505808562678895611, -4153273856159712438,
+		-8381261370078904295, 572156825025677802, 1791881013492340891, 3393267094866038768,
+		-5444650186382539299, 2352769483186201278, -7930912453007408350, -325464993179687389,
+		-3441562999710612272, -6489413242825283295, 5092019688680754699, -227247482082248967,
+		4234737173186232084, 5027558287275472836, 4635198586344772304, -536033143587636457,
+		5907508150730407386, -8438615781380831356, 972392927514829904, -3801314342046600696,
+		-4064951393885491917, -174840358296132583, 2407211146698877100, -1640089820333676239,
+		3940796514530962282, -5882197405809569433, 3095313889586102949, -1818050141166537098,
+		5832080132947175283, 7890064875145919662, 8184139210799583195, -8073512175445549678,
+		-7758774793014564506, -4581724029666783935, 3516491885471466898, -8267083515063118116,
 		6657089965014657519, 5220884358887979358, 1796677326474620641, 5340761970648932916,
 		1147977171614181568, 5066037465548252321, 2574765911837859848, 1085848279845204775,
-		3350107529868390359, 6116438694366558490, 2107701075971293812, 1803294065921269267,
-		2469478054175558874, 7368243281019965984, 3791908367843677526, 185046971116456637,
-		2257095756513439648, 7217693971077460129, 909049953079504259, 7196649268545224266,
-		5637660345400869599, 3955544945427965183, 8057528650917418961, 4139268440301127643,
-		6621926588513568059, 1373361136802681441, 6527366231383600011, 3507654575162700890,
-		9202058512774729859, 1954818376891585542, 6640380907130175705, 8299563319178235687,
-		3901867355218954373, 7046310742295574065, 6847195391333990232, 1572638100518868053,
-		8850422670118399721, 3631909142291992901, 5158881091950831288, 2882958317343121593,
-		4763258931815816403, 6280052734341785344, 4243789408204964850, 2043464728020827976,
-		6545300466022085465, 4562580375758598164, 5495451168795427352, 1738312861590151095,
-		553004618757816492, 6895160632757959823, 8233623922264685171, 7139506338801360852,
-		8550891222387991669, 5535668688139305547, 2430933853350256242, 5401941257863201076,
-		8159640039107728799, 6157493831600770366, 7632066283658143750, 6308328381617103346,
+		-5873264506986385449, 6116438694366558490, 2107701075971293812, -7420077970933506541,
+		2469478054175558874, -1855128755834809824, -5431463669011098282, -9038325065738319171,
+		-6966276280341336160, 7217693971077460129, -8314322083775271549, 7196649268545224266,
+		-3585711691453906209, -5267827091426810625, 8057528650917418961, -5084103596553648165,
+		-2601445448341207749, -7850010900052094367, 6527366231383600011, 3507654575162700890,
+		9202058512774729859, 1954818376891585542, -2582991129724600103, 8299563319178235687,
+		-5321504681635821435, 7046310742295574065, -2376176645520785576, -7650733936335907755,
+		8850422670118399721, 3631909142291992901, 5158881091950831288, -6340413719511654215,
+		4763258931815816403, 6280052734341785344, -4979582628649810958, 2043464728020827976,
+		-2678071570832690343, 4562580375758598164, 5495451168795427352, -7485059175264624713,
+		553004618757816492, 6895160632757959823, -989748114590090637, 7139506338801360852,
+		-672480814466784139, 5535668688139305547, 2430933853350256242, -3821430778991574732,
+		-1063731997747047009, -3065878205254005442, 7632066283658143750, 6308328381617103346,
 		3681878764086140361, 3289686137190109749, 6587997200611086848, 244714774258135476,
-		4079788377417136100, 8090302575944624335, 2945117363431356361, 864324395848741045,
-		3009039260312620700, 8430027460082534031, 401084700045993341, 7254622446438694921,
-		4707864159563588614, 5640248530963493951, 5982507712689997893, 3315098242282210105,
-		5503847578771918426, 3941971367175193882, 8118566580304798074, 3839261274019871296,
-		7062410411742090847, 741381002980207668, 6027994129690250817, 2497829994150063930,
-		6251390334426228834, 1368930247903518833, 8809096399316380241, 6492004350391900708,
-		2462145737463489636, 404828418920299174, 4153026434231690595, 261785715255475940,
-		5464715384600071357, 592710404378763017, 6764129236657751224, 8513655718539357449,
-		5820343663801914208, 385298524683789911, 5224135003438199467, 6303131641338802145,
-		7150122561309371392, 368107899140673753, 3115186834558311558, 2915636353584281051,
+		-5143583659437639708, 8090302575944624335, 2945117363431356361, -8359047641006034763,
+		3009039260312620700, -793344576772241777, 401084700045993341, -1968749590416080887,
+		4707864159563588614, -3583123505891281857, -3240864324164777915, -5908273794572565703,
+		-3719524458082857382, -5281400669679581926, 8118566580304798074, 3839261274019871296,
+		7062410411742090847, -8481991033874568140, 6027994129690250817, -6725542042704711878,
+		-2971981702428546974, -7854441788951256975, 8809096399316380241, 6492004350391900708,
+		2462145737463489636, -8818543617934476634, -5070345602623085213, -8961586321599299868,
+		-3758656652254704451, -8630661632476012791, 6764129236657751224, -709716318315418359,
+		-3403028373052861600, -8838073512170985897, -3999237033416576341, -2920240395515973663,
+		-2073249475545404416, 368107899140673753, -6108185202296464250, -6307735683270494757,
 		4782583894627718279, 6718292300699989587, 8387085186914375220, 3387513132024756289,
-		4654329375432538231, 8930667561363381602, 5374373436876319273, 7623042350483453954,
-		7725442901813263321, 9186225467561587250, 4091027289597503355, 2357631606492579800,
-		2530936820058611833, 1636551876240043639, 5564664674334965799, 1452244145334316253,
-		2061642381019690829, 1279580266495294036, 9108481583171221009, 6023278686734049809,
-		5007630032676973346, 2153168792952589781, 6720334534964750538, 6041546491134794105,
-		3433922409283786309, 2285479922797300912, 3110614940896576130, 6366559590722842893,
-		5418791419666136509, 7163298419643543757, 4891138053923696990, 580618510277907015,
-		1684034065251686769, 4429514767357295841, 330346578555450005, 1119637995812174675,
-		7177515271653460134, 4589042248470800257, 7693288629059004563, 143607045258444228,
-		246994305896273627, 866417324803099287, 6473547110565816071, 3092379936208876896,
-		2058427839513754051, 5133784708526867938, 8785882556301281247, 6149332666841167611,
-		8585842181454472135, 6137678347805511274, 2070447184436970006, 5708223427705576541,
-		5999657892458244504, 4358391411789012426, 325123008708389849, 6837621693887290924,
-		4843721905315627004, 6010651222149276415, 5398352198963874652, 4602025990114250980,
-		1044646352569048800, 9106614159853161675, 829256115228593269, 4919284369102997000,
-		2681532557646850893, 3681559472488511871, 5307999518958214035, 6334130388442829274,
-		2658708232916537604, 1163313865052186287, 581945337509520675, 3648778920718647903,
-		4423673246306544414, 1620799783996955743, 220828013409515943, 8150384699999389761,
-		4287360518296753003, 4590000184845883843, 5513660857261085186, 6964829100392774275,
-		478991688350776035, 8746140185685648781, 228500091334420247, 1356187007457302238,
-		3019253992034194581, 3152601605678500003, 430152752706002213, 5559581553696971176,
-		4916432985369275664, 663574931734554391, 3420773838927732076, 2868348622579915573,
-		1999319134044418520, 3328689518636282723, 2587672709781371173, 1517255313529399333,
-		3092343956317362483, 3662252519007064108, 972445599196498113, 7664865435875959367,
-		1708913533482282562, 6917817162668868494, 3217629022545312900, 2570043027221707107,
-		8739788839543624613, 2488075924621352812, 4694002395387436668, 4559628481798514356,
+		4654329375432538231, -292704475491394206, -3848998599978456535, 7623042350483453954,
+		7725442901813263321, 9186225467561587250, -5132344747257272453, -6865740430362196008,
+		2530936820058611833, 1636551876240043639, -3658707362519810009, 1452244145334316253,
+		-7161729655835084979, -7943791770359481772, 9108481583171221009, -3200093350120725999,
+		5007630032676973346, 2153168792952589781, 6720334534964750538, -3181825545719981703,
+		3433922409283786309, 2285479922797300912, 3110614940896576130, -2856812446131932915,
+		-3804580617188639299, 7163298419643543757, 4891138053923696990, 580618510277907015,
+		1684034065251686769, 4429514767357295841, -8893025458299325803, -8103734041042601133,
+		7177515271653460134, 4589042248470800257, -1530083407795771245, 143607045258444228,
+		246994305896273627, -8356954712051676521, 6473547110565816071, 3092379936208876896,
+		2058427839513754051, -4089587328327907870, 8785882556301281247, -3074039370013608197,
+		-637529855400303673, 6137678347805511274, -7152924852417805802, 5708223427705576541,
+		-3223714144396531304, 4358391411789012426, 325123008708389849, 6837621693887290924,
+		4843721905315627004, -3212720814705499393, -3825019837890901156, 4602025990114250980,
+		1044646352569048800, 9106614159853161675, -8394115921626182539, -4304087667751778808,
+		2681532557646850893, 3681559472488511871, -3915372517896561773, -2889241648411946534,
+		-6564663803938238204, -8060058171802589521, 581945337509520675, 3648778920718647903,
+		-4799698790548231394, -7602572252857820065, 220828013409515943, -1072987336855386047,
+		4287360518296753003, -4633371852008891965, 5513660857261085186, -2258542936462001533,
+		-8744380348503999773, 8746140185685648781, 228500091334420247, 1356187007457302238,
+		3019253992034194581, 3152601605678500003, -8793219284148773595, 5559581553696971176,
+		4916432985369275664, -8559797105120221417, -5802598197927043732, 2868348622579915573,
+		-7224052902810357288, -5894682518218493085, 2587672709781371173, -7706116723325376475,
+		3092343956317362483, -5561119517847711700, 972445599196498113, -1558506600978816441,
+		1708913533482282562, -2305554874185907314, -6005743014309462908, -6653329009633068701,
+		-483583197311151195, 2488075924621352812, -4529369641467339140, -4663743555056261452,
 		2997203966153298104, 1282559373026354493, 240113143146674385, 8665713329246516443,
-		628141331766346752, 4571950817186770476, 1472811188152235408, 7596648026010355826,
-		6091219417754424743, 7834161864828164065, 7103445518877254909, 4390861237357459201,
-		4442653864240571734, 8903482404847331368, 622261699494173647, 6037261250297213248,
-		504404948065709118, 7275215526217113061, 1011176780856001400, 2194750105623461063,
-		2623071828615234808, 5157313728073836108, 3738405111966602044, 2539767524076729570,
-		2467284396349269342, 5256026990536851868, 7841086888628396109, 6640857538655893162,
-		1202087339038317498, 2113514992440715978, 7534350895342931403, 4925284734898484745,
-		5145623771477493805, 8225140880134972332, 2719520354384050532, 9132346697815513771,
-		4332154495710163773, 7137789594094346916, 6994721091344268833, 6667228574869048934,
-		655440045726677499, 59934747298466858, 6124974028078036405, 8957774780655365418,
-		2332206071942466437, 1701056712286369627, 3154897383618636503, 1637766181387607527,
-		2460521277767576533, 197309393502684135, 643677854385267315, 2543179307861934850,
-		4350769010207485119, 4754652089410667672, 2015595502641514512, 7999059458976458608,
-		4287946071480840813, 8362686366770308971, 6486469209321732151, 3617727845841796026,
-		7554353525834302244, 4450022655153542367, 1605195740213535749, 5327014565305508387,
-		4626575813550328320, 2692222020597705149, 241045573717249868, 5098046974627094010,
-		7916882295460730264, 884817090297530579, 5329160409530630596, 7790979528857726136,
-		4955070238059373407, 4918537275422674302, 3008076183950404629, 3007769226071157901,
-		2470346235617803020, 8928702772696731736, 7856187920214445904, 4474874585391974885,
-		7900176660600710914, 2140571127916226672, 2425445057265199971, 2486055153341847830,
-		4186670094382025798, 1883939007446035042, 8808666044074867985, 3734134241178479257,
-		4065968871360089196, 6953124200385847784, 1305686814738899057, 1637739099014457647,
-		3656125660947993209, 3966759634633167020, 3106378204088556331, 6328899822778449810,
-		4565385105440252958, 1979884289539493806, 2331793186920865425, 3783206694208922581,
-		8464961209802336085, 2843963751609577687, 3030678195484896323, 4793717574095772604,
+		628141331766346752, -4651421219668005332, -7750560848702540400, 7596648026010355826,
+		-3132152619100351065, 7834161864828164065, 7103445518877254909, 4390861237357459201,
+		-4780718172614204074, -319889632007444440, 622261699494173647, -3186110786557562560,
+		-8718967088789066690, -1948156510637662747, -8212195255998774408, -7028621931231314745,
+		2623071828615234808, -4066058308780939700, -5484966924888173764, -6683604512778046238,
+		-6756087640505506466, 5256026990536851868, 7841086888628396109, 6640857538655893162,
+		-8021284697816458310, -7109857044414059830, -1689021141511844405, -4298087301956291063,
+		-4077748265377282003, -998231156719803476, 2719520354384050532, 9132346697815513771,
+		4332154495710163773, -2085582442760428892, 6994721091344268833, -2556143461985726874,
+		-8567931991128098309, 59934747298466858, -3098398008776739403, -265597256199410390,
+		2332206071942466437, -7522315324568406181, 3154897383618636503, -7585605855467168281,
+		-6762850759087199275, 197309393502684135, -8579694182469508493, 2543179307861934850,
+		4350769010207485119, -4468719947444108136, -7207776534213261296, -1224312577878317200,
+		4287946071480840813, 8362686366770308971, 6486469209321732151, -5605644191012979782,
+		-1669018511020473564, 4450022655153542367, -7618176296641240059, -3896357471549267421,
+		-4596796223304447488, -6531150016257070659, -8982326463137525940, -4125325062227681798,
+		-1306489741394045544, -8338554946557245229, 5329160409530630596, 7790979528857726136,
+		4955070238059373407, -4304834761432101506, -6215295852904371179, 3007769226071157901,
+		-6753025801236972788, 8928702772696731736, 7856187920214445904, -4748497451462800923,
+		7900176660600710914, -7082800908938549136, -6797926979589575837, -6737316883512927978,
+		4186670094382025798, 1883939007446035042, -414705992779907823, 3734134241178479257,
+		4065968871360089196, 6953124200385847784, -7917685222115876751, -7585632937840318161,
+		-5567246375906782599, -5256612402221608788, 3106378204088556331, -2894472214076325998,
+		4565385105440252958, 1979884289539493806, -6891578849933910383, 3783206694208922581,
+		8464961209802336085, 2843963751609577687, 3030678195484896323, -4429654462759003204,
 		4459239494808162889, 402587895800087237, 8057891408711167515, 4541888170938985079,
-		1042662272908816815, 5557303057122568958, 2647678726283249984, 2144477441549833761,
-		5806352215355387087, 7117771003473903623, 5916597177708541638, 462597715452321361,
-		8833658097025758785, 5970273481425315300, 563813119381731307, 2768349550652697015,
-		1598828206250873866, 5206393647403558110, 6235043485709261823, 3152217402014639496,
-		8469693267274066490, 125672920241807416, 5311079624024060938, 6663754932310491587,
-		8736848295048751716, 4488039774992061878, 5923302823487327109, 140891791083103236,
-		7414942793393574290, 7990420780896957397, 4317817392807076702, 3625184369705367340,
-		2740722765288122703, 5743100009702758344, 5997898640509039159, 8854493341352484163,
-		5242208035432907801, 701338899890987198, 7609280429197514109, 3020985755112334161,
-		6651322707055512866, 2635195723621160615, 5144520864246028816, 1035086515727829828,
-		1567242097116389047, 8172389260191636581, 6337820351429292273, 2163012566996458925,
-		2743190902890262681, 1906367633221323427, 6011544915663598137, 5932255307352610768,
-		2241128460406315459, 895504896216695588, 3094483003111372717, 4583857460292963101,
-		9079887171656594975, 8839289181930711403, 5762740387243057873, 4225072055348026230,
-		1838220598389033063, 3801620336801580414, 8823526620080073856, 1776617605585100335,
-		7899055018877642622, 5421679761463003041, 5521102963086275121, 4248279443559365898,
-		8735487530905098534, 1760527091573692978, 7142485049657745894, 8222656872927218123,
-		4969531564923704323, 3394475942196872480, 6424174453260338141, 359248545074932887,
-		3273651282831730598, 6797106199797138596, 3030918217665093212, 145600834617314036,
-		6036575856065626233, 740416251634527158, 7080427635449935582, 6951781370868335478,
-		399922722363687927, 294902314447253185, 7844950936339178523, 880320858634709042,
-		6192655680808675579, 411604686384710388, 9026808440365124461, 6440783557497587732,
+		1042662272908816815, -3666068979732206850, 2647678726283249984, 2144477441549833761,
+		-3417019821499388721, -2105601033380872185, 5916597177708541638, -8760774321402454447,
+		8833658097025758785, 5970273481425315300, 563813119381731307, -6455022486202078793,
+		1598828206250873866, -4016978389451217698, -2988328551145513985, -6071154634840136312,
+		8469693267274066490, 125672920241807416, -3912292412830714870, -2559617104544284221,
+		-486523741806024092, -4735332261862713930, 5923302823487327109, -9082480245771672572,
+		-1808429243461201518, 7990420780896957397, 4317817392807076702, 3625184369705367340,
+		-6482649271566653105, -3480272027152017464, -3225473396345736649, -368878695502291645,
+		-3981164001421868007, -8522033136963788610, 7609280429197514109, 3020985755112334161,
+		-2572049329799262942, 2635195723621160615, 5144520864246028816, -8188285521126945980,
+		1567242097116389047, 8172389260191636581, -2885551685425483535, -7060359469858316883,
+		-6480181133964513127, -7317004403633452381, 6011544915663598137, 5932255307352610768,
+		2241128460406315459, -8327867140638080220, 3094483003111372717, 4583857460292963101,
+		9079887171656594975, -384082854924064405, -3460631649611717935, 4225072055348026230,
+		-7385151438465742745, 3801620336801580414, -399845416774701952, -7446754431269675473,
+		7899055018877642622, 5421679761463003041, 5521102963086275121, -4975092593295409910,
+		8735487530905098534, -7462844945281082830, -2080886987197029914, -1000715163927557685,
+		-4253840471931071485, -5828896094657903328, 6424174453260338141, 359248545074932887,
+		-5949720754023045210, -2426265837057637212, 3030918217665093212, -9077771202237461772,
+		-3186796180789149575, 740416251634527158, -2142944401404840226, 6951781370868335478,
+		399922722363687927, -8928469722407522623, -1378421100515597285, -8343051178220066766,
+		-3030716356046100229, -8811767350470065420, 9026808440365124461, 6440783557497587732,
 		4615674634722404292, 539897290441580544, 2096238225866883852, 8751955639408182687,
-		1907224908052289603, 7381039757301768559, 6157238513393239656, 7749994231914157575,
+		-7316147128802486205, 7381039757301768559, 6157238513393239656, -1473377804940618233,
 		8629571604380892756, 5280433031239081479, 7101611890139813254, 2479018537985767835,
-		7169176924412769570, 7942066497793203302, 1357759729055557688, 2278447439451174845,
-		3625338785743880657, 6477479539006708521, 8976185375579272206, 5511371554711836120,
+		7169176924412769570, -1281305539061572506, -7865612307799218120, 2278447439451174845,
+		3625338785743880657, 6477479539006708521, 8976185375579272206, -3712000482142939688,
 		1326024180520890843, 7537449876596048829, 5464680203499696154, 3189671183162196045,
-		6346751753565857109, 241159987320630307, 3095793449658682053, 8978332846736310159,
-		2902794662273147216, 7208698530190629697, 7276901792339343736, 1732385229314443140,
-		4133292154170828382, 2918308698224194548, 1519461397937144458, 5293934712616591764,
-		4922828954023452664, 2879211533496425641, 5896236396443472108, 8465043815351752425,
-		7329020396871624740, 8915471717014488588, 2944902635677463047, 7052079073493465134,
+		6346751753565857109, -8982212049534145501, -6127578587196093755, -245039190118465649,
+		-6320577374581628592, 7208698530190629697, 7276901792339343736, -7490986807540332668,
+		4133292154170828382, 2918308698224194548, -7703910638917631350, -3929437324238184044,
+		-4300543082831323144, -6344160503358350167, 5896236396443472108, -758328221503023383,
+		-1894351639983151068, -307900319840287220, -6278469401177312761, -2171292963361310674,
 		8382142935188824023, 9103922860780351547, 4152330101494654406,
 	}
 )
@@ -223,13 +221,18 @@
 			x = seedrand(x)
 			u ^= int64(x)
 			u ^= rng_cooked[i]
-			rng.vec[i] = u & _MASK
+			rng.vec[i] = u
 		}
 	}
 }
 
 // Int63 returns a non-negative pseudo-random 63-bit integer as an int64.
 func (rng *rngSource) Int63() int64 {
+	return int64(rng.Uint64() & _MASK)
+}
+
+	// Uint64 returns a pseudo-random 64-bit value as a uint64.
+func (rng *rngSource) Uint64() uint64 {
 	rng.tap--
 	if rng.tap < 0 {
 		rng.tap += _LEN
@@ -240,7 +243,7 @@
 		rng.feed += _LEN
 	}
 
-	x := (rng.vec[rng.feed] + rng.vec[rng.tap]) & _MASK
+	x := rng.vec[rng.feed] + rng.vec[rng.tap]
 	rng.vec[rng.feed] = x
-	return x
+	return uint64(x)
 }
diff --git a/src/math/sin.go b/src/math/sin.go
index ed85f21..7a75a5f 100644
--- a/src/math/sin.go
+++ b/src/math/sin.go
@@ -140,8 +140,8 @@
 
 	// map zeros to origin
 	if j&1 == 1 {
-		j += 1
-		y += 1
+		j++
+		y++
 	}
 	j &= 7 // octant modulo 2Pi radians (360 degrees)
 	if j > 3 {
@@ -200,8 +200,8 @@
 
 	// map zeros to origin
 	if j&1 == 1 {
-		j += 1
-		y += 1
+		j++
+		y++
 	}
 	j &= 7 // octant modulo 2Pi radians (360 degrees)
 	// reflect in x axis
diff --git a/src/math/sin_s390x.s b/src/math/sin_s390x.s
new file mode 100644
index 0000000..5dc823c
--- /dev/null
+++ b/src/math/sin_s390x.s
@@ -0,0 +1,356 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Various constants
+DATA sincosxnan<>+0(SB)/8, $0x7ff8000000000000 // bit pattern of a quiet NaN
+GLOBL sincosxnan<>+0(SB), RODATA, $8
+DATA sincosxlim<>+0(SB)/8, $0x432921fb54442d19 // about 2^50*Pi (~3.5e15); |x| is compared against this limit
+GLOBL sincosxlim<>+0(SB), RODATA, $8
+DATA sincosxadd<>+0(SB)/8, $0xc338000000000000 // -1.5*2^52; NOTE(review): looks like a round-to-integer bias constant - confirm
+GLOBL sincosxadd<>+0(SB), RODATA, $8
+DATA sincosxpi2l<>+0(SB)/8, $0.108285667392191389e-31 // low part of a three-way split of Pi/2
+GLOBL sincosxpi2l<>+0(SB), RODATA, $8
+DATA sincosxpi2m<>+0(SB)/8, $0.612323399573676480e-16 // middle part of the Pi/2 split
+GLOBL sincosxpi2m<>+0(SB), RODATA, $8
+DATA sincosxpi2h<>+0(SB)/8, $0.157079632679489656e+01 // high part of the Pi/2 split
+GLOBL sincosxpi2h<>+0(SB), RODATA, $8
+DATA sincosrpi2<>+0(SB)/8, $0.636619772367581341e+00 // 2/Pi
+GLOBL sincosrpi2<>+0(SB), RODATA, $8
+
+// Minimax polynomial coefficients (sincosc0-7: cosine-style series, sincoss0-7: sine-style series)
+DATA sincosc0<>+0(SB)/8, $0.100000000000000000E+01
+GLOBL sincosc0<>+0(SB), RODATA, $8
+DATA sincosc1<>+0(SB)/8, $-.499999999999999833E+00
+GLOBL sincosc1<>+0(SB), RODATA, $8
+DATA sincosc2<>+0(SB)/8, $0.416666666666625843E-01
+GLOBL sincosc2<>+0(SB), RODATA, $8
+DATA sincosc3<>+0(SB)/8, $-.138888888885498984E-02
+GLOBL sincosc3<>+0(SB), RODATA, $8
+DATA sincosc4<>+0(SB)/8, $0.248015871681607202E-04
+GLOBL sincosc4<>+0(SB), RODATA, $8
+DATA sincosc5<>+0(SB)/8, $-.275572911309937875E-06
+GLOBL sincosc5<>+0(SB), RODATA, $8
+DATA sincosc6<>+0(SB)/8, $0.208735047247632818E-08
+GLOBL sincosc6<>+0(SB), RODATA, $8
+DATA sincosc7<>+0(SB)/8, $-.112753632738365317E-10
+GLOBL sincosc7<>+0(SB), RODATA, $8
+DATA sincoss0<>+0(SB)/8, $0.100000000000000000E+01
+GLOBL sincoss0<>+0(SB), RODATA, $8
+DATA sincoss1<>+0(SB)/8, $-.166666666666666657E+00
+GLOBL sincoss1<>+0(SB), RODATA, $8
+DATA sincoss2<>+0(SB)/8, $0.833333333333309209E-02
+GLOBL sincoss2<>+0(SB), RODATA, $8
+DATA sincoss3<>+0(SB)/8, $-.198412698410701448E-03
+GLOBL sincoss3<>+0(SB), RODATA, $8
+DATA sincoss4<>+0(SB)/8, $0.275573191453906794E-05
+GLOBL sincoss4<>+0(SB), RODATA, $8
+DATA sincoss5<>+0(SB)/8, $-.250520918387633290E-07
+GLOBL sincoss5<>+0(SB), RODATA, $8
+DATA sincoss6<>+0(SB)/8, $0.160571285514715856E-09
+GLOBL sincoss6<>+0(SB), RODATA, $8
+DATA sincoss7<>+0(SB)/8, $-.753213484933210972E-12
+GLOBL sincoss7<>+0(SB), RODATA, $8
+
+// Sin returns the sine of the radian argument x.
+//
+// Special cases are:
+//      Sin(±0) = ±0
+//      Sin(±Inf) = NaN
+//      Sin(NaN) = NaN
+// The algorithm used is minimax polynomial approximation
+// with coefficients determined with a Remez exchange algorithm.
+
+TEXT ·sinAsm(SB),NOSPLIT,$0-16
+	FMOVD   x+0(FP), F0             // F0 = x
+	//special case Sin(±0) = ±0
+	FMOVD   $(0.0), F1
+	FCMPU   F0, F1
+	BEQ     sinIsZero               // x == ±0: return x itself so the sign of zero is kept
+	WORD    $0xB3120000     //ltdbr %f0,%f0
+	BLTU    L17                     // negative (or unordered) x: L17 sets F5 = -x
+	FMOVD   F0, F5                  // otherwise F5 = x
+L2:
+	MOVD    $sincoss7<>+0(SB), R1
+	FMOVD   0(R1), F4
+	MOVD    $sincoss6<>+0(SB), R1
+	FMOVD   0(R1), F1
+	MOVD    $sincoss5<>+0(SB), R1
+	VLEG    $0, 0(R1), V18
+	MOVD    $sincoss4<>+0(SB), R1
+	FMOVD   0(R1), F6
+	MOVD    $sincoss2<>+0(SB), R1
+	VLEG    $0, 0(R1), V16
+	MOVD    $sincoss3<>+0(SB), R1
+	FMOVD   0(R1), F7
+	MOVD    $sincoss1<>+0(SB), R1
+	FMOVD   0(R1), F3
+	MOVD    $sincoss0<>+0(SB), R1
+	FMOVD   0(R1), F2
+	WFCHDBS V2, V5, V2              // compares sincoss0 (1.0) with F5; NOTE(review): L18 looks like the |x| < 1 path - confirm
+	BEQ     L18
+	MOVD    $sincosrpi2<>+0(SB), R1
+	FMOVD   0(R1), F3
+	MOVD    $sincosxadd<>+0(SB), R1
+	FMOVD   0(R1), F2
+	WFMSDB  V0, V3, V2, V3
+	FMOVD   0(R1), F6
+	FADD    F3, F6
+	MOVD    $sincosxpi2h<>+0(SB), R1
+	FMOVD   0(R1), F2
+	FMSUB   F2, F6, F0, F0
+	MOVD    $sincosxpi2m<>+0(SB), R1
+	FMOVD   0(R1), F4
+	FMADD   F4, F6, F0, F0
+	MOVD    $sincosxpi2l<>+0(SB), R1
+	WFMDB   V0, V0, V1
+	FMOVD   0(R1), F7
+	WFMDB   V1, V1, V2
+	WORD    $0xB3CD0013     //lgdr  %r1,%f3
+	MOVD    $sincosxlim<>+0(SB), R2
+	WORD    $0xA7110001     //tmll  %r1,1
+	BEQ     L6                      // bit 0 of R1 clear: L6 uses the sincoss* coefficients, else the sincosc* ones below
+	FMOVD   0(R2), F0
+	WFCHDBS V0, V5, V0
+	BNE     L14                     // F5 is not below sincosxlim (also ±Inf/NaN): use the fallback at L14
+	MOVD    $sincosc7<>+0(SB), R2
+	FMOVD   0(R2), F0
+	MOVD    $sincosc6<>+0(SB), R2
+	FMOVD   0(R2), F4
+	MOVD    $sincosc5<>+0(SB), R2
+	WFMADB  V1, V0, V4, V0
+	FMOVD   0(R2), F6
+	MOVD    $sincosc4<>+0(SB), R2
+	WFMADB  V1, V0, V6, V0
+	FMOVD   0(R2), F4
+	MOVD    $sincosc2<>+0(SB), R2
+	FMOVD   0(R2), F6
+	WFMADB  V2, V4, V6, V4
+	MOVD    $sincosc3<>+0(SB), R2
+	FMOVD   0(R2), F3
+	MOVD    $sincosc1<>+0(SB), R2
+	WFMADB  V2, V0, V3, V0
+	FMOVD   0(R2), F6
+	WFMADB  V1, V4, V6, V4
+	WORD    $0xA7110002     //tmll  %r1,2
+	WFMADB  V2, V0, V4, V0
+	MOVD    $sincosc0<>+0(SB), R1
+	FMOVD   0(R1), F2
+	WFMADB  V1, V0, V2, V0
+	BNE     L15                     // bit 1 of R1 set (tmll above): L15 negates the result
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L6:
+	FMOVD   0(R2), F4
+	WFCHDBS V4, V5, V4
+	BNE     L14                     // F5 is not below sincosxlim (also ±Inf/NaN): use the fallback at L14
+	MOVD    $sincoss7<>+0(SB), R2
+	FMOVD   0(R2), F4
+	MOVD    $sincoss6<>+0(SB), R2
+	FMOVD   0(R2), F3
+	MOVD    $sincoss5<>+0(SB), R2
+	WFMADB  V1, V4, V3, V4
+	WFMADB  V6, V7, V0, V6
+	FMOVD   0(R2), F0
+	MOVD    $sincoss4<>+0(SB), R2
+	FMADD   F4, F1, F0, F0
+	FMOVD   0(R2), F3
+	MOVD    $sincoss2<>+0(SB), R2
+	FMOVD   0(R2), F4
+	MOVD    $sincoss3<>+0(SB), R2
+	WFMADB  V2, V3, V4, V3
+	FMOVD   0(R2), F4
+	MOVD    $sincoss1<>+0(SB), R2
+	WFMADB  V2, V0, V4, V0
+	FMOVD   0(R2), F4
+	WFMADB  V1, V3, V4, V3
+	FNEG    F6, F4
+	WFMADB  V2, V0, V3, V2
+	WFMDB   V4, V1, V0
+	WORD    $0xA7110002     //tmll  %r1,2
+	WFMSDB  V0, V2, V6, V0
+	BNE     L15                     // bit 1 of R1 set: L15 negates the result
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L14:
+	// |x| is not below sincosxlim, or x is ±Inf/NaN. The reduction above is
+	// not valid here, so hand the call to the portable Go implementation
+	// instead of returning NaN for every large finite argument.
+	BR      ·sin(SB)
+
+L18:
+	WFMDB   V0, V0, V2
+	WFMADB  V2, V4, V1, V4
+	WFMDB   V2, V2, V1
+	WFMADB  V2, V4, V18, V4
+	WFMADB  V1, V6, V16, V6
+	WFMADB  V1, V4, V7, V4
+	WFMADB  V2, V6, V3, V6
+	FMUL    F0, F2
+	WFMADB  V1, V4, V6, V4
+	FMADD   F4, F2, F0, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L17:
+	FNEG    F0, F5                  // F5 = -x
+	BR      L2
+L15:
+	FNEG    F0, F0                  // flip the sign of the computed result
+	FMOVD   F0, ret+8(FP)
+	RET
+
+
+sinIsZero:
+	FMOVD   F0, ret+8(FP)           // return x (±0) unchanged
+	RET
+
+// Cos returns the cosine of the radian argument.
+//
+// Special cases are:
+//      Cos(±Inf) = NaN
+//      Cos(NaN) = NaN
+// The algorithm used is minimax polynomial approximation
+// with coefficients determined with a Remez exchange algorithm.
+
+TEXT ·cosAsm(SB),NOSPLIT,$0-16
+	FMOVD   x+0(FP), F0             // F0 = x
+	WORD    $0xB3120000     //ltdbr %f0,%f0
+	BLTU    L35                     // negative (or unordered) x: L35 sets F1 = -x
+	FMOVD   F0, F1                  // otherwise F1 = x
+L21:
+	MOVD    $sincosc7<>+0(SB), R1
+	FMOVD   0(R1), F4
+	MOVD    $sincosc6<>+0(SB), R1
+	VLEG    $0, 0(R1), V20
+	MOVD    $sincosc5<>+0(SB), R1
+	VLEG    $0, 0(R1), V18
+	MOVD    $sincosc4<>+0(SB), R1
+	FMOVD   0(R1), F6
+	MOVD    $sincosc2<>+0(SB), R1
+	VLEG    $0, 0(R1), V16
+	MOVD    $sincosc3<>+0(SB), R1
+	FMOVD   0(R1), F7
+	MOVD    $sincosc1<>+0(SB), R1
+	FMOVD   0(R1), F5
+	MOVD    $sincosrpi2<>+0(SB), R1
+	FMOVD   0(R1), F2
+	MOVD    $sincosxadd<>+0(SB), R1
+	FMOVD   0(R1), F3
+	MOVD    $sincoss0<>+0(SB), R1
+	WFMSDB  V0, V2, V3, V2
+	FMOVD   0(R1), F3
+	WFCHDBS V3, V1, V3              // compares sincoss0 (1.0) with F1; NOTE(review): L36 looks like the |x| < 1 path - confirm
+	WORD    $0xB3CD0012     //lgdr %r1,%f2
+	BEQ     L36
+	MOVD    $sincosxadd<>+0(SB), R2
+	FMOVD   0(R2), F4
+	FADD    F2, F4
+	MOVD    $sincosxpi2h<>+0(SB), R2
+	FMOVD   0(R2), F2
+	WFMSDB  V4, V2, V0, V2
+	MOVD    $sincosxpi2m<>+0(SB), R2
+	FMOVD   0(R2), F0
+	WFMADB  V4, V0, V2, V0
+	MOVD    $sincosxpi2l<>+0(SB), R2
+	WFMDB   V0, V0, V2
+	FMOVD   0(R2), F5
+	WFMDB   V2, V2, V6
+	MOVD    $sincosxlim<>+0(SB), R2
+	WORD    $0xA7110001     //tmll %r1,1
+	BNE     L25                     // bit 0 of R1 set: L25 uses the sincoss* coefficients, else the sincosc* ones below
+	FMOVD   0(R2), F0
+	WFCHDBS V0, V1, V0
+	BNE     L33                     // F1 is not below sincosxlim (also ±Inf/NaN): use the fallback at L33
+	MOVD    $sincosc7<>+0(SB), R2
+	FMOVD   0(R2), F0
+	MOVD    $sincosc6<>+0(SB), R2
+	FMOVD   0(R2), F4
+	MOVD    $sincosc5<>+0(SB), R2
+	WFMADB  V2, V0, V4, V0
+	FMOVD   0(R2), F1
+	MOVD    $sincosc4<>+0(SB), R2
+	WFMADB  V2, V0, V1, V0
+	FMOVD   0(R2), F4
+	MOVD    $sincosc2<>+0(SB), R2
+	FMOVD   0(R2), F1
+	WFMADB  V6, V4, V1, V4
+	MOVD    $sincosc3<>+0(SB), R2
+	FMOVD   0(R2), F3
+	MOVD    $sincosc1<>+0(SB), R2
+	WFMADB  V6, V0, V3, V0
+	FMOVD   0(R2), F1
+	WFMADB  V2, V4, V1, V4
+	WORD    $0xA7110002     //tmll %r1,2
+	WFMADB  V6, V0, V4, V0
+	MOVD    $sincosc0<>+0(SB), R1
+	FMOVD   0(R1), F4
+	WFMADB  V2, V0, V4, V0
+	BNE     L34                     // bit 1 of R1 set (tmll above): L34 negates the result
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L25:
+	FMOVD   0(R2), F3
+	WFCHDBS V3, V1, V1
+	BNE     L33                     // F1 is not below sincosxlim (also ±Inf/NaN): use the fallback at L33
+	MOVD    $sincoss7<>+0(SB), R2
+	FMOVD   0(R2), F1
+	MOVD    $sincoss6<>+0(SB), R2
+	FMOVD   0(R2), F3
+	MOVD    $sincoss5<>+0(SB), R2
+	WFMADB  V2, V1, V3, V1
+	FMOVD   0(R2), F3
+	MOVD    $sincoss4<>+0(SB), R2
+	WFMADB  V2, V1, V3, V1
+	FMOVD   0(R2), F3
+	MOVD    $sincoss2<>+0(SB), R2
+	FMOVD   0(R2), F7
+	WFMADB  V6, V3, V7, V3
+	MOVD    $sincoss3<>+0(SB), R2
+	FMADD   F5, F4, F0, F0
+	FMOVD   0(R2), F4
+	MOVD    $sincoss1<>+0(SB), R2
+	FMADD   F1, F6, F4, F4
+	FMOVD   0(R2), F1
+	FMADD   F3, F2, F1, F1
+	FMUL    F0, F2
+	WFMADB  V6, V4, V1, V6
+	WORD    $0xA7110002     //tmll  %r1,2
+	FMADD   F6, F2, F0, F0
+	BNE     L34                     // bit 1 of R1 set: L34 negates the result
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L33:
+	// |x| is not below sincosxlim, or x is ±Inf/NaN. The reduction above is
+	// not valid here, so hand the call to the portable Go implementation
+	// instead of returning NaN for every large finite argument.
+	BR      ·cos(SB)
+
+L36:
+	FMUL    F0, F0
+	MOVD    $sincosc0<>+0(SB), R1
+	WFMDB   V0, V0, V1
+	WFMADB  V0, V4, V20, V4
+	WFMADB  V1, V6, V16, V6
+	WFMADB  V0, V4, V18, V4
+	WFMADB  V0, V6, V5, V6
+	WFMADB  V1, V4, V7, V4
+	FMOVD   0(R1), F2
+	WFMADB  V1, V4, V6, V4
+	WFMADB  V0, V4, V2, V0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L35:
+	FNEG    F0, F1                  // F1 = -x
+	BR      L21
+L34:
+	FNEG    F0, F0                  // flip the sign of the computed result
+	FMOVD   F0, ret+8(FP)
+	RET
diff --git a/src/math/sincos.go b/src/math/sincos.go
index 7180303..6e663d0 100644
--- a/src/math/sincos.go
+++ b/src/math/sincos.go
@@ -40,8 +40,8 @@
 	y := float64(j)      // integer part of x/(Pi/4), as float
 
 	if j&1 == 1 { // map zeros to origin
-		j += 1
-		y += 1
+		j++
+		y++
 	}
 	j &= 7     // octant modulo 2Pi radians (360 degrees)
 	if j > 3 { // reflect in x axis
diff --git a/src/math/sinh.go b/src/math/sinh.go
index 139b911..2bdd7b1 100644
--- a/src/math/sinh.go
+++ b/src/math/sinh.go
@@ -22,7 +22,9 @@
 //	Sinh(±0) = ±0
 //	Sinh(±Inf) = ±Inf
 //	Sinh(NaN) = NaN
-func Sinh(x float64) float64 {
+func Sinh(x float64) float64
+
+func sinh(x float64) float64 {
 	// The coefficients are #2029 from Hart & Cheney. (20.36D)
 	const (
 		P0 = -0.6307673640497716991184787251e+6
@@ -66,7 +68,9 @@
 //	Cosh(±0) = 1
 //	Cosh(±Inf) = +Inf
 //	Cosh(NaN) = NaN
-func Cosh(x float64) float64 {
+func Cosh(x float64) float64
+
+func cosh(x float64) float64 {
 	if x < 0 {
 		x = -x
 	}
diff --git a/src/math/sinh_s390x.s b/src/math/sinh_s390x.s
new file mode 100644
index 0000000..e492415
--- /dev/null
+++ b/src/math/sinh_s390x.s
@@ -0,0 +1,261 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+
+#include "textflag.h"
+
+// Constants
+DATA sinhrodataL21<>+0(SB)/8, $0.231904681384629956E-16 // low-order remainder of ln 2 beyond the value at +8
+DATA sinhrodataL21<>+8(SB)/8, $0.693147180559945286E+00 // ln 2
+DATA sinhrodataL21<>+16(SB)/8, $704.E0 // threshold that |x| is compared against with cdb
+GLOBL sinhrodataL21<>+0(SB), RODATA, $24
+DATA sinhrlog2<>+0(SB)/8, $0x3ff7154760000000 // about 1.4427, i.e. 1/ln 2 with the low mantissa bits cleared
+GLOBL sinhrlog2<>+0(SB), RODATA, $8
+DATA sinhxinf<>+0(SB)/8, $0x7ff0000000000000 // bit pattern of +Inf
+GLOBL sinhxinf<>+0(SB), RODATA, $8
+DATA sinhxinit<>+0(SB)/8, $0x3ffb504f333f9de6 // about 1.7071 (1 + sqrt(2)/2)
+GLOBL sinhxinit<>+0(SB), RODATA, $8
+DATA sinhxlim1<>+0(SB)/8, $800.E0 // second, larger threshold for |x|
+GLOBL sinhxlim1<>+0(SB), RODATA, $8
+DATA sinhxadd<>+0(SB)/8, $0xc3200001610007fb
+GLOBL sinhxadd<>+0(SB), RODATA, $8
+DATA sinhx4ff<>+0(SB)/8, $0x4ff0000000000000 // 2^256
+GLOBL sinhx4ff<>+0(SB), RODATA, $8
+
+// Minimax polynomial approximations
+DATA sinhe0<>+0(SB)/8, $0.11715728752538099300E+01
+GLOBL sinhe0<>+0(SB), RODATA, $8
+DATA sinhe1<>+0(SB)/8, $0.11715728752538099300E+01
+GLOBL sinhe1<>+0(SB), RODATA, $8
+DATA sinhe2<>+0(SB)/8, $0.58578643762688526692E+00
+GLOBL sinhe2<>+0(SB), RODATA, $8
+DATA sinhe3<>+0(SB)/8, $0.19526214587563004497E+00
+GLOBL sinhe3<>+0(SB), RODATA, $8
+DATA sinhe4<>+0(SB)/8, $0.48815536475176217404E-01
+GLOBL sinhe4<>+0(SB), RODATA, $8
+DATA sinhe5<>+0(SB)/8, $0.97631072948627397816E-02
+GLOBL sinhe5<>+0(SB), RODATA, $8
+DATA sinhe6<>+0(SB)/8, $0.16271839297756073153E-02
+GLOBL sinhe6<>+0(SB), RODATA, $8
+DATA sinhe7<>+0(SB)/8, $0.23245485387271142509E-03
+GLOBL sinhe7<>+0(SB), RODATA, $8
+DATA sinhe8<>+0(SB)/8, $0.29080955860869629131E-04
+GLOBL sinhe8<>+0(SB), RODATA, $8
+DATA sinhe9<>+0(SB)/8, $0.32311267157667725278E-05
+GLOBL sinhe9<>+0(SB), RODATA, $8
+
+// Sinh returns the hyperbolic sine of the argument.
+//
+// Special cases are:
+//      Sinh(±0) = ±0
+//      Sinh(±Inf) = ±Inf
+//      Sinh(NaN) = NaN
+// The algorithm used is minimax polynomial approximation
+// with coefficients determined with a Remez exchange algorithm.
+
+TEXT ·sinhAsm(SB),NOSPLIT,$0-16
+	FMOVD   x+0(FP), F0             // F0 = x
+	//special case Sinh(±0) = ±0
+	FMOVD   $(0.0), F1
+	FCMPU   F0, F1
+	BEQ     sinhIsZero              // x == ±0: return x itself so the sign of zero is kept
+	//special case Sinh(±Inf) = ±Inf
+	FMOVD   $1.797693134862315708145274237317043567981e+308, F1
+	FCMPU   F1, F0
+	BLEU    sinhIsInf               // largest finite float64 <= x (BLEU also fires on unordered): return x unchanged
+	FMOVD   $-1.797693134862315708145274237317043567981e+308, F1
+	FCMPU   F1, F0
+	BGT             sinhIsInf       // -MaxFloat64 > x, i.e. x is -Inf: return x unchanged
+
+	MOVD    $sinhrodataL21<>+0(SB), R5
+	WORD    $0xB3120000     //ltdbr %f0,%f0
+	MOVD    sinhxinit<>+0(SB), R1
+	FMOVD   F0, F4                  // F4 keeps a copy of the original x
+	MOVD    R1, R3
+	BLTU    L19                     // negative (or unordered) x: L19 sets F2 = -x
+	FMOVD   F0, F2                  // otherwise F2 = x
+L2:
+	WORD    $0xED205010     //cdb %f2,.L22-.L21(%r5)
+	BYTE    $0x00
+	BYTE    $0x19
+	BGE     L15     //jnl   .L15
+	BVS     L15
+	WFCEDBS V2, V2, V0
+	BEQ     L20
+L12:
+	FMOVD   F4, F0                  // result is the original argument, unchanged
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L15:
+	WFCEDBS V2, V2, V0
+	BVS     L12
+	MOVD    $sinhxlim1<>+0(SB), R2
+	FMOVD   0(R2), F0
+	WFCHDBS V0, V2, V0
+	BEQ     L6
+	WFCHEDBS        V4, V2, V6
+	MOVD    $sinhxinf<>+0(SB), R1
+	FMOVD   0(R1), F0               // F0 = +Inf
+	BNE     LEXITTAGsinh
+	WFCHDBS V2, V4, V2
+	BNE     L16                     // L16 returns the +Inf in F0; otherwise fall through and return -Inf
+	FNEG    F0, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L19:
+	FNEG    F0, F2                  // F2 = -x
+	BR      L2
+L6:
+	MOVD    $sinhxadd<>+0(SB), R2
+	FMOVD   0(R2), F0
+	MOVD    sinhrlog2<>+0(SB), R2
+	WORD    $0xB3C10062     //ldgr  %f6,%r2
+	WFMSDB  V4, V6, V0, V16
+	FMOVD   sinhrodataL21<>+8(SB), F6
+	WFADB   V0, V16, V0
+	FMOVD   sinhrodataL21<>+0(SB), F3
+	WFMSDB  V0, V6, V4, V6
+	MOVD    $sinhe9<>+0(SB), R2
+	WFMADB  V0, V3, V6, V0
+	FMOVD   0(R2), F1
+	MOVD    $sinhe7<>+0(SB), R2
+	WFMDB   V0, V0, V6
+	FMOVD   0(R2), F5
+	MOVD    $sinhe8<>+0(SB), R2
+	FMOVD   0(R2), F3
+	MOVD    $sinhe6<>+0(SB), R2
+	WFMADB  V6, V1, V5, V1
+	FMOVD   0(R2), F5
+	MOVD    $sinhe5<>+0(SB), R2
+	FMOVD   0(R2), F7
+	MOVD    $sinhe3<>+0(SB), R2
+	WFMADB  V6, V3, V5, V3
+	FMOVD   0(R2), F5
+	MOVD    $sinhe4<>+0(SB), R2
+	WFMADB  V6, V7, V5, V7
+	FMOVD   0(R2), F5
+	MOVD    $sinhe2<>+0(SB), R2
+	VLEG    $0, 0(R2), V20
+	WFMDB   V6, V6, V18
+	WFMADB  V6, V5, V20, V5
+	WFMADB  V1, V18, V7, V1
+	FNEG    F0, F0
+	WFMADB  V3, V18, V5, V3
+	MOVD    $sinhe1<>+0(SB), R3
+	WFCEDBS V2, V4, V2
+	FMOVD   0(R3), F5
+	MOVD    $sinhe0<>+0(SB), R3
+	WFMADB  V6, V1, V5, V1
+	FMOVD   0(R3), F5
+	VLGVG   $0, V16, R2
+	WFMADB  V6, V3, V5, V6
+	RLL     $3, R2, R2
+	WORD    $0xEC12000F     //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	BEQ     L9
+	WFMSDB  V0, V1, V6, V0
+	MOVD    $sinhx4ff<>+0(SB), R3
+	FNEG    F0, F0
+	FMOVD   0(R3), F2
+	FMUL    F2, F0
+	ANDW    $0xFFFF, R2
+	WORD    $0xA53FEFB6     //llill %r3,61366
+	SUBW    R2, R3, R2
+	WORD    $0xEC12000F     //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0xB3C10021     //ldgr %f2,%r1
+	FMUL    F2, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L20:
+	MOVD    $sinhxadd<>+0(SB), R2
+	FMOVD   0(R2), F2
+	MOVD    sinhrlog2<>+0(SB), R2
+	WORD    $0xB3C10002     //ldgr  %f0,%r2
+	WFMSDB  V4, V0, V2, V6
+	FMOVD   sinhrodataL21<>+8(SB), F0
+	FADD    F6, F2
+	MOVD    $sinhe9<>+0(SB), R2
+	FMSUB   F0, F2, F4, F4
+	FMOVD   0(R2), F1
+	FMOVD   sinhrodataL21<>+0(SB), F3
+	MOVD    $sinhe7<>+0(SB), R2
+	FMADD   F3, F2, F4, F4
+	FMOVD   0(R2), F0
+	MOVD    $sinhe8<>+0(SB), R2
+	WFMDB   V4, V4, V2
+	FMOVD   0(R2), F3
+	MOVD    $sinhe6<>+0(SB), R2
+	FMOVD   0(R2), F5
+	WORD    $0xB3CD0026     //lgdr %r2,%f6
+	RLL     $3, R2, R2
+	WORD    $0xEC12000F     //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WFMADB  V2, V1, V0, V1
+	WORD    $0xB3C10001     //ldgr  %f0,%r1
+	MOVD    $sinhe5<>+0(SB), R1
+	WFMADB  V2, V3, V5, V3
+	FMOVD   0(R1), F5
+	MOVD    $sinhe3<>+0(SB), R1
+	FMOVD   0(R1), F6
+	WFMDB   V2, V2, V7
+	WFMADB  V2, V5, V6, V5
+	WORD    $0xA7487FB6     //lhi %r4,32694
+	FNEG    F4, F4
+	ANDW    $0xFFFF, R2
+	SUBW    R2, R4, R2
+	WORD    $0xEC32000F     //risbgn %r3,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0xB3C10063     //ldgr  %f6,%r3
+	WFADB   V0, V6, V16
+	MOVD    $sinhe4<>+0(SB), R1
+	WFMADB  V1, V7, V5, V1
+	WFMDB   V4, V16, V4
+	FMOVD   0(R1), F5
+	MOVD    $sinhe2<>+0(SB), R1
+	VLEG    $0, 0(R1), V16
+	MOVD    $sinhe1<>+0(SB), R1
+	WFMADB  V2, V5, V16, V5
+	VLEG    $0, 0(R1), V16
+	WFMADB  V3, V7, V5, V3
+	WFMADB  V2, V1, V16, V1
+	FSUB    F6, F0
+	FMUL    F1, F4
+	MOVD    $sinhe0<>+0(SB), R1
+	FMOVD   0(R1), F6
+	WFMADB  V2, V3, V6, V2
+	WFMADB  V0, V2, V4, V0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L9:
+	WFMADB  V0, V1, V6, V0
+	MOVD    $sinhx4ff<>+0(SB), R3
+	FMOVD   0(R3), F2
+	FMUL    F2, F0
+	WORD    $0xA72AF000     //ahi   %r2,-4096
+	WORD    $0xEC12000F     //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0xB3C10021     //ldgr %f2,%r1
+	FMUL    F2, F0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L16:
+	FMOVD   F0, ret+8(FP)           // F0 holds the +Inf loaded from sinhxinf
+	RET
+
+LEXITTAGsinh:
+sinhIsInf:
+sinhIsZero:
+	FMOVD   F0, ret+8(FP)           // return F0: x itself for sinhIsZero/sinhIsInf, +Inf from sinhxinf for LEXITTAGsinh
+	RET
diff --git a/src/math/sinh_stub.s b/src/math/sinh_stub.s
new file mode 100644
index 0000000..4caaa0c
--- /dev/null
+++ b/src/math/sinh_stub.s
@@ -0,0 +1,17 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32 arm
+
+#include "textflag.h"
+
+TEXT ·Sinh(SB),NOSPLIT,$0
+	JMP ·sinh(SB)
+
+TEXT ·Cosh(SB),NOSPLIT,$0
+	JMP ·cosh(SB)
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	JMP ·tanh(SB)
+
diff --git a/src/math/sqrt_amd64.s b/src/math/sqrt_amd64.s
index f8d825d..1102903 100644
--- a/src/math/sqrt_amd64.s
+++ b/src/math/sqrt_amd64.s
@@ -5,7 +5,8 @@
 #include "textflag.h"
 
 // func Sqrt(x float64) float64
-TEXT ·Sqrt(SB),NOSPLIT,$0
+TEXT ·Sqrt(SB), NOSPLIT, $0
+	XORPS  X0, X0 // break dependency
 	SQRTSD x+0(FP), X0
-	MOVSD X0, ret+8(FP)
+	MOVSD  X0, ret+8(FP)
 	RET
diff --git a/src/math/sqrt_mipsx.s b/src/math/sqrt_mipsx.s
new file mode 100644
index 0000000..1b27d49
--- /dev/null
+++ b/src/math/sqrt_mipsx.s
@@ -0,0 +1,14 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "textflag.h"
+
+// func Sqrt(x float64) float64
+TEXT ·Sqrt(SB),NOSPLIT,$0
+	MOVD	x+0(FP), F0	// F0 = x
+	SQRTD	F0, F0	// F0 = square root of F0, computed by the SQRTD instruction
+	MOVD	F0, ret+8(FP)	// store the result
+	RET
diff --git a/src/math/stubs_arm64.s b/src/math/stubs_arm64.s
index 04de911..d8c9aa8 100644
--- a/src/math/stubs_arm64.s
+++ b/src/math/stubs_arm64.s
@@ -18,33 +18,18 @@
 TEXT ·Atan(SB),NOSPLIT,$0
 	B ·atan(SB)
 
-TEXT ·Dim(SB),NOSPLIT,$0
-	B ·dim(SB)
-
-TEXT ·Min(SB),NOSPLIT,$0
-	B ·min(SB)
-
-TEXT ·Max(SB),NOSPLIT,$0
-	B ·max(SB)
-
 TEXT ·Exp2(SB),NOSPLIT,$0
 	B ·exp2(SB)
 
+TEXT ·Cosh(SB),NOSPLIT,$0
+	B ·cosh(SB)
+
 TEXT ·Expm1(SB),NOSPLIT,$0
 	B ·expm1(SB)
 
 TEXT ·Exp(SB),NOSPLIT,$0
 	B ·exp(SB)
 
-TEXT ·Floor(SB),NOSPLIT,$0
-	B ·floor(SB)
-
-TEXT ·Ceil(SB),NOSPLIT,$0
-	B ·ceil(SB)
-
-TEXT ·Trunc(SB),NOSPLIT,$0
-	B ·trunc(SB)
-
 TEXT ·Frexp(SB),NOSPLIT,$0
 	B ·frexp(SB)
 
@@ -66,9 +51,6 @@
 TEXT ·Log(SB),NOSPLIT,$0
 	B ·log(SB)
 
-TEXT ·Modf(SB),NOSPLIT,$0
-	B ·modf(SB)
-
 TEXT ·Mod(SB),NOSPLIT,$0
 	B ·mod(SB)
 
@@ -81,8 +63,14 @@
 TEXT ·Sin(SB),NOSPLIT,$0
 	B ·sin(SB)
 
+TEXT ·Sinh(SB),NOSPLIT,$0
+	B ·sinh(SB)
+
 TEXT ·Cos(SB),NOSPLIT,$0
 	B ·cos(SB)
 
 TEXT ·Tan(SB),NOSPLIT,$0
 	B ·tan(SB)
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	B ·tanh(SB)
diff --git a/src/math/stubs_mips64x.s b/src/math/stubs_mips64x.s
index 97e6e4c..21df5cc 100644
--- a/src/math/stubs_mips64x.s
+++ b/src/math/stubs_mips64x.s
@@ -81,11 +81,20 @@
 TEXT ·Sin(SB),NOSPLIT,$0
 	JMP ·sin(SB)
 
+TEXT ·Sinh(SB),NOSPLIT,$0
+	JMP ·sinh(SB)
+
 TEXT ·Cos(SB),NOSPLIT,$0
 	JMP ·cos(SB)
 
+TEXT ·Cosh(SB),NOSPLIT,$0
+	JMP ·cosh(SB)
+
 TEXT ·Sqrt(SB),NOSPLIT,$0
 	JMP ·sqrt(SB)
 
 TEXT ·Tan(SB),NOSPLIT,$0
 	JMP ·tan(SB)
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	JMP ·tanh(SB)
diff --git a/src/math/stubs_mipsx.s b/src/math/stubs_mipsx.s
new file mode 100644
index 0000000..b869768
--- /dev/null
+++ b/src/math/stubs_mipsx.s
@@ -0,0 +1,98 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build mips mipsle
+
+#include "textflag.h"
+
+TEXT ·Asin(SB),NOSPLIT,$0
+	JMP	·asin(SB)
+
+TEXT ·Acos(SB),NOSPLIT,$0
+	JMP	·acos(SB)
+
+TEXT ·Atan2(SB),NOSPLIT,$0
+	JMP	·atan2(SB)
+
+TEXT ·Atan(SB),NOSPLIT,$0
+	JMP	·atan(SB)
+
+TEXT ·Dim(SB),NOSPLIT,$0
+	JMP	·dim(SB)
+
+TEXT ·Min(SB),NOSPLIT,$0
+	JMP	·min(SB)
+
+TEXT ·Max(SB),NOSPLIT,$0
+	JMP	·max(SB)
+
+TEXT ·Exp2(SB),NOSPLIT,$0
+	JMP	·exp2(SB)
+
+TEXT ·Expm1(SB),NOSPLIT,$0
+	JMP	·expm1(SB)
+
+TEXT ·Exp(SB),NOSPLIT,$0
+	JMP	·exp(SB)
+
+TEXT ·Floor(SB),NOSPLIT,$0
+	JMP	·floor(SB)
+
+TEXT ·Ceil(SB),NOSPLIT,$0
+	JMP	·ceil(SB)
+
+TEXT ·Trunc(SB),NOSPLIT,$0
+	JMP	·trunc(SB)
+
+TEXT ·Frexp(SB),NOSPLIT,$0
+	JMP	·frexp(SB)
+
+TEXT ·Hypot(SB),NOSPLIT,$0
+	JMP	·hypot(SB)
+
+TEXT ·Ldexp(SB),NOSPLIT,$0
+	JMP	·ldexp(SB)
+
+TEXT ·Log10(SB),NOSPLIT,$0
+	JMP	·log10(SB)
+
+TEXT ·Log2(SB),NOSPLIT,$0
+	JMP	·log2(SB)
+
+TEXT ·Log1p(SB),NOSPLIT,$0
+	JMP	·log1p(SB)
+
+TEXT ·Log(SB),NOSPLIT,$0
+	JMP	·log(SB)
+
+TEXT ·Modf(SB),NOSPLIT,$0
+	JMP	·modf(SB)
+
+TEXT ·Mod(SB),NOSPLIT,$0
+	JMP	·mod(SB)
+
+TEXT ·Remainder(SB),NOSPLIT,$0
+	JMP	·remainder(SB)
+
+TEXT ·Sincos(SB),NOSPLIT,$0
+	JMP	·sincos(SB)
+
+TEXT ·Sin(SB),NOSPLIT,$0
+	JMP	·sin(SB)
+
+TEXT ·Sinh(SB),NOSPLIT,$0
+	JMP	·sinh(SB)
+
+TEXT ·Cos(SB),NOSPLIT,$0
+	JMP	·cos(SB)
+
+TEXT ·Cosh(SB),NOSPLIT,$0
+	JMP	·cosh(SB)
+
+TEXT ·Tan(SB),NOSPLIT,$0
+	JMP	·tan(SB)
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	JMP	·tanh(SB)
+
diff --git a/src/math/stubs_ppc64x.s b/src/math/stubs_ppc64x.s
index a57357e..b622016 100644
--- a/src/math/stubs_ppc64x.s
+++ b/src/math/stubs_ppc64x.s
@@ -36,15 +36,6 @@
 TEXT ·Exp(SB),NOSPLIT,$0
 	BR ·exp(SB)
 
-TEXT ·Floor(SB),NOSPLIT,$0
-	BR ·floor(SB)
-
-TEXT ·Ceil(SB),NOSPLIT,$0
-	BR ·ceil(SB)
-
-TEXT ·Trunc(SB),NOSPLIT,$0
-	BR ·trunc(SB)
-
 TEXT ·Frexp(SB),NOSPLIT,$0
 	BR ·frexp(SB)
 
@@ -81,8 +72,17 @@
 TEXT ·Sin(SB),NOSPLIT,$0
 	BR ·sin(SB)
 
+TEXT ·Sinh(SB),NOSPLIT,$0
+	BR ·sinh(SB)
+
 TEXT ·Cos(SB),NOSPLIT,$0
 	BR ·cos(SB)
 
+TEXT ·Cosh(SB),NOSPLIT,$0
+	BR ·cosh(SB)
+
 TEXT ·Tan(SB),NOSPLIT,$0
 	BR ·tan(SB)
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	BR ·tanh(SB)
diff --git a/src/math/stubs_s390x.s b/src/math/stubs_s390x.s
index 7686844..8da55c5 100644
--- a/src/math/stubs_s390x.s
+++ b/src/math/stubs_s390x.s
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-#include "../runtime/textflag.h"
+#include "textflag.h"
 
 TEXT ·Asin(SB),NOSPLIT,$0
 	BR ·asin(SB)
@@ -25,15 +25,6 @@
 TEXT ·Exp(SB),NOSPLIT,$0
 	BR ·exp(SB)
 
-TEXT ·Floor(SB),NOSPLIT,$0
-	BR ·floor(SB)
-
-TEXT ·Ceil(SB),NOSPLIT,$0
-	BR ·ceil(SB)
-
-TEXT ·Trunc(SB),NOSPLIT,$0
-	BR ·trunc(SB)
-
 TEXT ·Frexp(SB),NOSPLIT,$0
 	BR ·frexp(SB)
 
@@ -43,9 +34,6 @@
 TEXT ·Ldexp(SB),NOSPLIT,$0
 	BR ·ldexp(SB)
 
-TEXT ·Log10(SB),NOSPLIT,$0
-	BR ·log10(SB)
-
 TEXT ·Log2(SB),NOSPLIT,$0
 	BR ·log2(SB)
 
@@ -67,11 +55,154 @@
 TEXT ·Sincos(SB),NOSPLIT,$0
 	BR ·sincos(SB)
 
-TEXT ·Sin(SB),NOSPLIT,$0
-	BR ·sin(SB)
-
-TEXT ·Cos(SB),NOSPLIT,$0
-	BR ·cos(SB)
-
 TEXT ·Tan(SB),NOSPLIT,$0
 	BR ·tan(SB)
+
+// hasVectorFacility reports whether the vector facility is installed and its instructions are usable.
+TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1
+	MOVD    $x-24(SP), R1     // R1 = address of a 24-byte scratch buffer on the stack
+	XC      $24, 0(R1), 0(R1) // clear the storage
+	MOVD    $2, R0            // R0 is the number of double words stored -1
+	WORD    $0xB2B01000       // STFLE 0(R1)
+	XOR     R0, R0            // reset the value of R0
+	MOVBZ   z-8(SP), R1       // load byte 16 of the buffer STFLE filled in
+	AND     $0x40, R1         // NOTE(review): mask 0x40 in byte 16 is facility bit 129, presumably the vector facility - confirm
+	BEQ     novector
+vectorinstalled:
+	// check if the vector instruction has been enabled
+	VLEIB   $0, $0xF, V16     // write 0xF into byte 0 of V16 ...
+	VLGVB   $0, V16, R1       // ... and read it back through a general register
+	CMPBNE  R1, $0xF, novector
+	MOVB    $1, ret+0(FP) // have vx
+	RET
+novector:
+	MOVB    $0, ret+0(FP) // no vx
+	RET
+
+TEXT ·Log10(SB),NOSPLIT,$0
+	MOVD    log10vectorfacility+0x00(SB),R1 // R1 = code address currently stored in log10vectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·log10TrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $log10vectorfacility+0x00(SB), R1
+	MOVD    $·log10(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go log10
+	BR      ·log10(SB)
+vectorimpl:
+	MOVD    $log10vectorfacility+0x00(SB), R1
+	MOVD    $·log10Asm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to log10Asm
+	BR      ·log10Asm(SB)
+
+GLOBL log10vectorfacility+0x00(SB), NOPTR, $8
+DATA log10vectorfacility+0x00(SB)/8, $·log10TrampolineSetup(SB) // initial target: the setup routine above
+
+
+TEXT ·Cos(SB),NOSPLIT,$0
+	MOVD    cosvectorfacility+0x00(SB),R1 // R1 = code address currently stored in cosvectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·cosTrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $cosvectorfacility+0x00(SB), R1
+	MOVD    $·cos(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go cos
+	BR      ·cos(SB)
+vectorimpl:
+	MOVD    $cosvectorfacility+0x00(SB), R1
+	MOVD    $·cosAsm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to cosAsm
+	BR      ·cosAsm(SB)
+
+GLOBL cosvectorfacility+0x00(SB), NOPTR, $8
+DATA cosvectorfacility+0x00(SB)/8, $·cosTrampolineSetup(SB) // initial target: the setup routine above
+
+
+TEXT ·Cosh(SB),NOSPLIT,$0
+	MOVD    coshvectorfacility+0x00(SB),R1 // R1 = code address currently stored in coshvectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·coshTrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $coshvectorfacility+0x00(SB), R1
+	MOVD    $·cosh(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go cosh
+	BR      ·cosh(SB)
+vectorimpl:
+	MOVD    $coshvectorfacility+0x00(SB), R1
+	MOVD    $·coshAsm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to coshAsm
+	BR      ·coshAsm(SB)
+
+GLOBL coshvectorfacility+0x00(SB), NOPTR, $8
+DATA coshvectorfacility+0x00(SB)/8, $·coshTrampolineSetup(SB) // initial target: the setup routine above
+
+
+TEXT ·Sin(SB),NOSPLIT,$0
+	MOVD    sinvectorfacility+0x00(SB),R1 // R1 = code address currently stored in sinvectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·sinTrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $sinvectorfacility+0x00(SB), R1
+	MOVD    $·sin(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go sin
+	BR      ·sin(SB)
+vectorimpl:
+	MOVD    $sinvectorfacility+0x00(SB), R1
+	MOVD    $·sinAsm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to sinAsm
+	BR      ·sinAsm(SB)
+
+GLOBL sinvectorfacility+0x00(SB), NOPTR, $8
+DATA sinvectorfacility+0x00(SB)/8, $·sinTrampolineSetup(SB) // initial target: the setup routine above
+
+
+TEXT ·Sinh(SB),NOSPLIT,$0
+	MOVD    sinhvectorfacility+0x00(SB),R1 // R1 = code address currently stored in sinhvectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·sinhTrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $sinhvectorfacility+0x00(SB), R1
+	MOVD    $·sinh(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go sinh
+	BR      ·sinh(SB)
+vectorimpl:
+	MOVD    $sinhvectorfacility+0x00(SB), R1
+	MOVD    $·sinhAsm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to sinhAsm
+	BR      ·sinhAsm(SB)
+
+GLOBL sinhvectorfacility+0x00(SB), NOPTR, $8
+DATA sinhvectorfacility+0x00(SB)/8, $·sinhTrampolineSetup(SB) // initial target: the setup routine above
+
+
+
+TEXT ·Tanh(SB),NOSPLIT,$0
+	MOVD    tanhvectorfacility+0x00(SB),R1 // R1 = code address currently stored in tanhvectorfacility
+	BR      (R1)                    // jump to it; arguments are left untouched
+
+TEXT ·tanhTrampolineSetup(SB),NOSPLIT, $0
+	MOVB    ·hasVX(SB), R1          // ·hasVX is defined outside this file
+	CMPBEQ  R1, $1, vectorimpl      // vectorfacility = 1, vector supported
+	MOVD    $tanhvectorfacility+0x00(SB), R1
+	MOVD    $·tanh(SB), R2
+	MOVD    R2, 0(R1)               // no vector support: later calls go straight to the Go tanh
+	BR      ·tanh(SB)
+vectorimpl:
+	MOVD    $tanhvectorfacility+0x00(SB), R1
+	MOVD    $·tanhAsm(SB), R2
+	MOVD    R2, 0(R1)               // vector support: later calls go straight to tanhAsm
+	BR      ·tanhAsm(SB)
+
+GLOBL tanhvectorfacility+0x00(SB), NOPTR, $8
+DATA tanhvectorfacility+0x00(SB)/8, $·tanhTrampolineSetup(SB) // initial target: the setup routine above
+
+
diff --git a/src/math/tan.go b/src/math/tan.go
index 285eff1..aa2fb37 100644
--- a/src/math/tan.go
+++ b/src/math/tan.go
@@ -108,8 +108,8 @@
 
 	/* map zeros and singularities to origin */
 	if j&1 == 1 {
-		j += 1
-		y += 1
+		j++
+		y++
 	}
 
 	z := ((x - y*PI4A) - y*PI4B) - y*PI4C
diff --git a/src/math/tanh.go b/src/math/tanh.go
index cf0ffa1..eaa0e4c 100644
--- a/src/math/tanh.go
+++ b/src/math/tanh.go
@@ -71,7 +71,9 @@
 //	Tanh(±0) = ±0
 //	Tanh(±Inf) = ±1
 //	Tanh(NaN) = NaN
-func Tanh(x float64) float64 {
+func Tanh(x float64) float64
+
+func tanh(x float64) float64 {
 	const MAXLOG = 8.8029691931113054295988e+01 // log(2**127)
 	z := Abs(x)
 	switch {
diff --git a/src/math/tanh_s390x.s b/src/math/tanh_s390x.s
new file mode 100644
index 0000000..1b76c14
--- /dev/null
+++ b/src/math/tanh_s390x.s
@@ -0,0 +1,173 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// Constants and minimax polynomial coefficients, read by byte offset in tanhAsm
+DATA tanhrodataL18<>+0(SB)/8, $-1.0
+DATA tanhrodataL18<>+8(SB)/8, $-2.0
+DATA tanhrodataL18<>+16(SB)/8, $1.0	// result loaded at L9; also the operand of the hand-encoded adb/sdb (displacement 0x10)
+DATA tanhrodataL18<>+24(SB)/8, $2.0
+DATA tanhrodataL18<>+32(SB)/8, $0.20000000000000011868E+01
+DATA tanhrodataL18<>+40(SB)/8, $0.13333333333333341256E+01
+DATA tanhrodataL18<>+48(SB)/8, $0.26666666663549111502E+00
+DATA tanhrodataL18<>+56(SB)/8, $0.66666666658721844678E+00
+DATA tanhrodataL18<>+64(SB)/8, $0.88890217768964374821E-01
+DATA tanhrodataL18<>+72(SB)/8, $0.25397199429103821138E-01
+DATA tanhrodataL18<>+80(SB)/8, $-.346573590279972643E+00	// approximately -ln(2)/2
+DATA tanhrodataL18<>+88(SB)/8, $20.E0	// operand of the hand-encoded cdb in tanhAsm (displacement 0x58)
+GLOBL tanhrodataL18<>+0(SB), RODATA, $96	// 12 float64 entries, read-only
+
+// Constants used by tanhAsm
+DATA tanhrlog2<>+0(SB)/8, $0x4007154760000000	// float64 bit pattern, approximately 2/ln(2) with the low mantissa bits zero
+GLOBL tanhrlog2<>+0(SB), RODATA, $8
+DATA tanhxadd<>+0(SB)/8, $0xc2f0000100003ff0	// raw 64-bit pattern; tanhAsm loads it into F2 at L2
+GLOBL tanhxadd<>+0(SB), RODATA, $8
+DATA tanhxmone<>+0(SB)/8, $-1.0	// result stored by tanhAsm's path for arguments that fail the "< 20.0" test and are negative
+GLOBL tanhxmone<>+0(SB), RODATA, $8
+DATA tanhxzero<>+0(SB)/8, $0	// zero operand for the WFCHDBS sign tests in tanhAsm
+GLOBL tanhxzero<>+0(SB), RODATA, $8
+
+// Table of 16 float64 values; tanhAsm loads one entry into F5 via ld %f5,0(%r4,%r3)
+DATA tanhtab<>+0(SB)/8, $0.000000000000000000E+00
+DATA tanhtab<>+8(SB)/8, $-.171540871271399150E-01
+DATA tanhtab<>+16(SB)/8, $-.306597931864376363E-01
+DATA tanhtab<>+24(SB)/8, $-.410200970469965021E-01
+DATA tanhtab<>+32(SB)/8, $-.486343079978231466E-01
+DATA tanhtab<>+40(SB)/8, $-.538226193725835820E-01
+DATA tanhtab<>+48(SB)/8, $-.568439602538111520E-01
+DATA tanhtab<>+56(SB)/8, $-.579091847395528847E-01
+DATA tanhtab<>+64(SB)/8, $-.571909584179366341E-01
+DATA tanhtab<>+72(SB)/8, $-.548312665987204407E-01
+DATA tanhtab<>+80(SB)/8, $-.509471843643441085E-01
+DATA tanhtab<>+88(SB)/8, $-.456353588448863359E-01
+DATA tanhtab<>+96(SB)/8, $-.389755254243262365E-01
+DATA tanhtab<>+104(SB)/8, $-.310332908285244231E-01
+DATA tanhtab<>+112(SB)/8, $-.218623539150173528E-01
+DATA tanhtab<>+120(SB)/8, $-.115062908917949451E-01
+GLOBL tanhtab<>+0(SB), RODATA, $128	// 16 entries * 8 bytes, read-only
+
+// tanhAsm returns the hyperbolic tangent of the argument.
+//
+// Special cases are:
+//      Tanh(±0) = ±0
+//      Tanh(±Inf) = ±1
+//      Tanh(NaN) = NaN
+// The algorithm used is minimax polynomial approximation using a table of
+// polynomial coefficients determined with a Remez exchange algorithm.
+
+TEXT ·tanhAsm(SB),NOSPLIT,$0-16	// no local frame; 16 bytes of args: x at 0(FP), ret at 8(FP)
+	FMOVD   x+0(FP), F0	// F0 = x
+	// special case: Tanh(±0) = ±0
+	FMOVD   $(0.0), F1
+	FCMPU   F0, F1
+	BEQ     tanhIsZero	// x compares equal to 0: return x itself, keeping its sign
+	MOVD    $tanhrodataL18<>+0(SB), R5	// R5 = table base used by the hand-encoded cdb/adb/sdb below
+	WORD    $0xB3120000     //ltdbr %f0,%f0 (sets the condition code from x)
+	MOVD    $0x4034000000000000, R1	// bit pattern of float64 20.0
+	BLTU    L15	// x < 0 (or unordered): L15 puts -x in F1
+	FMOVD   F0, F1	// otherwise F1 = x
+L2:
+	MOVD    $tanhxadd<>+0(SB), R2
+	FMOVD   0(R2), F2	// F2 = tanhxadd
+	MOVD    tanhrlog2<>+0(SB), R2	// R2 = tanhrlog2 bit pattern (loaded by value, not address)
+	WORD    $0xB3C10042     //ldgr %f4,%r2
+	WFMSDB  V0, V4, V2, V4
+	MOVD    $tanhtab<>+0(SB), R3	// R3 = base of tanhtab
+	WORD    $0xB3CD0024     //lgdr %r2,%f4
+	WORD    $0xEC4239BC     //risbg %r4,%r2,57,128+60,3
+	BYTE    $0x03
+	BYTE    $0x55
+	WORD    $0xED105058     //cdb %f1,.L19-.L18(%r5) (displacement 0x58 = 88: the 20.0 entry)
+	BYTE    $0x00
+	BYTE    $0x19
+	WORD    $0xEC12000F     //risbgn %r1,%r2,64-64+0,64-64+0+16-1,64-0-16
+	BYTE    $0x30
+	BYTE    $0x59
+	WORD    $0x68543000     //ld %f5,0(%r4,%r3) (F5 = tanhtab entry at byte offset R4)
+	WORD    $0xB3C10061     //ldgr %f6,%r1
+	BLT     L3	// F1 < 20.0 (result of the cdb above): evaluate the approximation at L3
+	MOVD    $tanhxzero<>+0(SB), R1
+	FMOVD   0(R1), F2	// F2 = 0
+	WFCHDBS V0, V2, V4
+	BEQ     L9	// presumably taken when x > 0: return 1.0 — confirm WFCHDBS condition-code mapping
+	WFCHDBS V2, V0, V2
+	BNE     L1	// presumably taken when x is neither > 0 nor < 0 (NaN): return x unchanged
+	MOVD    $tanhxmone<>+0(SB), R1
+	FMOVD   0(R1), F0	// F0 = -1.0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L3:	// path for F1 < 20.0
+	FADD    F4, F2
+	FMOVD   tanhrodataL18<>+80(SB), F4	// approximately -ln(2)/2
+	FMADD   F4, F2, F0, F0
+	FMOVD   tanhrodataL18<>+72(SB), F1
+	WFMDB   V0, V0, V3
+	FMOVD   tanhrodataL18<>+64(SB), F2
+	WFMADB  V0, V1, V2, V1
+	FMOVD   tanhrodataL18<>+56(SB), F4
+	FMOVD   tanhrodataL18<>+48(SB), F2
+	WFMADB  V1, V3, V4, V1
+	FMOVD   tanhrodataL18<>+40(SB), F4
+	WFMADB  V3, V2, V4, V2
+	FMOVD   tanhrodataL18<>+32(SB), F4
+	WORD    $0xB9270022     //lhr %r2,%r2
+	WFMADB  V3, V1, V4, V1
+	FMOVD   tanhrodataL18<>+24(SB), F4	// F4 = 2.0
+	WFMADB  V3, V2, V4, V3
+	WFMADB  V0, V5, V0, V2
+	WFMADB  V0, V1, V3, V0
+	WORD    $0xA7183ECF     //lhi %r1,16079
+	WFMADB  V0, V2, V5, V2
+	FMUL    F6, F2
+	MOVW    R2, R10
+	MOVW    R1, R11
+	CMPBLE  R10, R11, L16	// R2 <= 16079: finish at L16
+	FMOVD   F6, F0
+	WORD    $0xED005010     //adb %f0,.L28-.L18(%r5) (displacement 0x10 = 16: adds the 1.0 entry)
+	BYTE    $0x00
+	BYTE    $0x1A
+	WORD    $0xA7184330     //lhi %r1,17200
+	FADD    F2, F0
+	MOVW    R2, R10
+	MOVW    R1, R11
+	CMPBGT  R10, R11, L17	// R2 > 17200: finish at L17
+	WORD    $0xED605010     //sdb %f6,.L28-.L18(%r5) (displacement 0x10 = 16: subtracts the 1.0 entry)
+	BYTE    $0x00
+	BYTE    $0x1B
+	FADD    F6, F2
+	WFDDB   V0, V2, V0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L9:
+	FMOVD   tanhrodataL18<>+16(SB), F0	// F0 = 1.0
+L1:
+	FMOVD   F0, ret+8(FP)	// store F0 as the result
+	RET
+
+L15:
+	FNEG    F0, F1	// F1 = -x
+	BR      L2
+L16:
+	FADD    F6, F2
+	FMOVD   tanhrodataL18<>+8(SB), F0	// F0 = -2.0
+	FMADD   F4, F2, F0, F0
+	FMOVD   tanhrodataL18<>+0(SB), F4	// F4 = -1.0
+	FNEG    F0, F0
+	WFMADB  V0, V2, V4, V0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+L17:
+	WFDDB   V0, V4, V0
+	FMOVD   tanhrodataL18<>+16(SB), F2	// F2 = 1.0
+	WFSDB   V0, V2, V0
+	FMOVD   F0, ret+8(FP)
+	RET
+
+tanhIsZero:      //return ±0 (F0 still holds the original x)
+	FMOVD   F0, ret+8(FP)
+	RET