Implement a meaningful Go port of libcap

This version of the Go package libcap/cap works well
enough to be used by others. Unfortunately, to use it
we need to apply something like the included patch
(contrib/go.patch) to the build sources for the Go
runtime and syscall packages.

I'll be trying to get these changes accepted by the Go team
in parallel.

Signed-off-by: Andrew G. Morgan <[email protected]>
diff --git a/Make.Rules b/Make.Rules
index b9a59b8..b685ab2 100644
--- a/Make.Rules
+++ b/Make.Rules
@@ -74,7 +74,7 @@
 GOLANG := $(shell if [ -n "$(shell go version 2>/dev/null)" ]; then echo yes ; else echo no ; fi)
 ifeq ($(GOLANG),yes)
 GOROOT := $(shell go env GOROOT)
-GOCGO := $(shell if [ "$(shell go env CGO_ENABLED)" == 1 ]; then echo yes ; else echo no ; fi)
+GOCGO := $(shell if [ "$(shell go env CGO_ENABLED)" = 1 ]; then echo yes ; else echo no ; fi)
 GOOSARCH := $(shell go env GOHOSTOS)_$(shell go env GOHOSTARCH)
 endif
 
diff --git a/cap/.gitignore b/cap/.gitignore
index e7946b9..1c780ed 100644
--- a/cap/.gitignore
+++ b/cap/.gitignore
@@ -1 +1,2 @@
 names.go
+syscalls.go
diff --git a/cap/cap.go b/cap/cap.go
index b5a72fb..0aa939c 100644
--- a/cap/cap.go
+++ b/cap/cap.go
@@ -52,12 +52,6 @@
 )
 
 var (
-	// callKernel variables overridable for testing purposes.
-	callKernel  = syscall.Syscall
-	callKernel6 = syscall.Syscall6
-
-	// OS environment provides these.
-
 	// starUp protects setting of the following values: magic,
 	// words, maxValues.
 	startUp sync.Once
@@ -80,25 +74,51 @@
 	pid   int32
 }
 
-// capcall provides a pointer etc wrapper for the system calls
-// associated with getcap and setcap.
-func capcall(call uintptr, h *header, d []data) error {
+// caprcall provides a pointer etc wrapper for the system calls
+// associated with getcap.
+func caprcall(call uintptr, h *header, d []data) error {
 	x := uintptr(0)
 	if d != nil {
 		x = uintptr(unsafe.Pointer(&d[0]))
 	}
-	_, _, err := callKernel(call, uintptr(unsafe.Pointer(h)), x, 0)
+	_, _, err := callRKernel(call, uintptr(unsafe.Pointer(h)), x, 0)
 	if err != 0 {
 		return err
 	}
 	return nil
 }
 
-// prctlcall provides a wrapper for the prctl systemcalls. There is a
-// limited number of arguments needed and the caller should use 0 for
-// those not needed.
-func prctlcall(prVal, v1, v2 uintptr) (int, error) {
-	r, _, err := callKernel6(syscall.SYS_PRCTL, prVal, v1, v2, 0, 0, 0)
+// capwcall provides a pointer etc wrapper for the system calls
+// associated with setcap.
+func capwcall(call uintptr, h *header, d []data) error {
+	x := uintptr(0)
+	if d != nil {
+		x = uintptr(unsafe.Pointer(&d[0]))
+	}
+	_, _, err := callWKernel(call, uintptr(unsafe.Pointer(h)), x, 0)
+	if err != 0 {
+		return err
+	}
+	return nil
+}
+
+// prctlrcall provides a wrapper for the prctl system calls that only
+// read kernel state. There is a limited number of arguments needed
+// and the caller should use 0 for those not needed.
+func prctlrcall(prVal, v1, v2 uintptr) (int, error) {
+	r, _, err := callRKernel6(syscall.SYS_PRCTL, prVal, v1, v2, 0, 0, 0)
+	if err != 0 {
+		return int(r), err
+	}
+	return int(r), nil
+}
+
+// prctlwcall provides a wrapper for the prctl system calls that
+// write/modify kernel state (where available, these will use the
+// POSIX semantics fixup system calls). There is a limited number of
+// arguments needed and the caller should use 0 for those not needed.
+func prctlwcall(prVal, v1, v2 uintptr) (int, error) {
+	r, _, err := callWKernel6(syscall.SYS_PRCTL, prVal, v1, v2, 0, 0, 0)
 	if err != 0 {
 		return int(r), err
 	}
@@ -111,7 +131,7 @@
 	h := &header{
 		magic: kv3,
 	}
-	capcall(syscall.SYS_CAPGET, h, nil)
+	caprcall(syscall.SYS_CAPGET, h, nil)
 	magic = h.magic
 	switch magic {
 	case kv1:
@@ -274,7 +294,7 @@
 // id; pid=0 is an alias for current.
 func GetPID(pid int) (*Set, error) {
 	v := NewSet()
-	if err := capcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
+	if err := caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
 		return nil, err
 	}
 	return v, nil
@@ -298,7 +318,7 @@
 	if c == nil {
 		return ErrBadSet
 	}
-	return capcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
+	return capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
 }
 
 // defines from uapi/linux/prctl.h
@@ -311,7 +331,7 @@
 // the local bounding set. On systems where the bounding set Value is
 // not present, this function returns an error.
 func GetBound(val Value) (bool, error) {
-	v, err := prctlcall(PR_CAPBSET_READ, uintptr(val), 0)
+	v, err := prctlrcall(PR_CAPBSET_READ, uintptr(val), 0)
 	if err != nil {
 		return false, err
 	}
@@ -328,7 +348,7 @@
 // ill-defined state.
 func DropBound(val ...Value) error {
 	for _, v := range val {
-		if _, err := prctlcall(PR_CAPBSET_DROP, uintptr(v), 0); err != nil {
+		if _, err := prctlwcall(PR_CAPBSET_DROP, uintptr(v), 0); err != nil {
 			return err
 		}
 	}
@@ -349,7 +369,7 @@
 // the local ambient set. On systems where the ambient set Value is
 // not present, this function returns an error.
 func GetAmbient(val Value) (bool, error) {
-	r, err := prctlcall(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, uintptr(val))
+	r, err := prctlrcall(PR_CAP_AMBIENT, PR_CAP_AMBIENT_IS_SET, uintptr(val))
 	return r > 0, err
 }
 
@@ -364,7 +384,7 @@
 		dir = PR_CAP_AMBIENT_RAISE
 	}
 	for _, v := range val {
-		_, err := prctlcall(PR_CAP_AMBIENT, dir, uintptr(v))
+		_, err := prctlwcall(PR_CAP_AMBIENT, dir, uintptr(v))
 		if err != nil {
 			return err
 		}
@@ -374,6 +394,6 @@
 
 // ResetAmbient attempts to fully clear the Ambient set.
 func ResetAmbient() error {
-	_, err := prctlcall(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0)
+	_, err := prctlwcall(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0)
 	return err
 }
diff --git a/cap/cap_test.go b/cap/cap_test.go
index a0151cc..e836fde 100644
--- a/cap/cap_test.go
+++ b/cap/cap_test.go
@@ -20,7 +20,7 @@
 		{"", "", ErrBadText},
 		{"=", "=", nil},
 		{"= cap_chown+iep cap_chown-i", "= cap_chown+ep", nil},
-		{"= cap_setfcap,cap_chown+iep cap_chown-i", "= cap_setfcap+epi cap_chown+ep", nil},
+		{"= cap_setfcap,cap_chown+iep cap_chown-i", "= cap_setfcap+eip cap_chown+ep", nil},
 	}
 	for i, v := range vs {
 		c, err := FromText(v.from)
diff --git a/cap/file.go b/cap/file.go
index 57144af..189ca61 100644
--- a/cap/file.go
+++ b/cap/file.go
@@ -122,7 +122,7 @@
 func GetFd(file *os.File) (*Set, error) {
 	var raw3 vfs_caps_3
 	d := make([]byte, binary.Size(raw3))
-	sz, _, oErr := callKernel6(syscall.SYS_FGETXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0)
+	sz, _, oErr := callRKernel6(syscall.SYS_FGETXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0)
 	var err error
 	if oErr != 0 {
 		err = oErr
@@ -138,7 +138,7 @@
 	}
 	var raw3 vfs_caps_3
 	d := make([]byte, binary.Size(raw3))
-	sz, _, oErr := callKernel6(syscall.SYS_GETXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0)
+	sz, _, oErr := callRKernel6(syscall.SYS_GETXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0)
 	if oErr != 0 {
 		err = oErr
 	}
@@ -210,7 +210,7 @@
 // capabilities, by calling with c = nil.
 func (c *Set) SetFd(file *os.File) error {
 	if c == nil {
-		if _, _, err := callKernel6(syscall.SYS_FREMOVEXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), 0, 0, 0, 0); err != 0 {
+		if _, _, err := callRKernel6(syscall.SYS_FREMOVEXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), 0, 0, 0, 0); err != 0 {
 			return err
 		}
 		return nil
@@ -221,7 +221,7 @@
 	if err != nil {
 		return err
 	}
-	if _, _, err := callKernel6(syscall.SYS_FSETXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0); err != 0 {
+	if _, _, err := callRKernel6(syscall.SYS_FSETXATTR, uintptr(file.Fd()), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0); err != 0 {
 		return err
 	}
 	return nil
@@ -249,7 +249,7 @@
 		return err
 	}
 	if c == nil {
-		if _, _, err := callKernel6(syscall.SYS_REMOVEXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), 0, 0, 0, 0); err != 0 {
+		if _, _, err := callRKernel6(syscall.SYS_REMOVEXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), 0, 0, 0, 0); err != 0 {
 			return err
 		}
 		return nil
@@ -260,7 +260,7 @@
 	if err != nil {
 		return err
 	}
-	if _, _, err := callKernel6(syscall.SYS_SETXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0); err != 0 {
+	if _, _, err := callRKernel6(syscall.SYS_SETXATTR, uintptr(unsafe.Pointer(p)), uintptr(unsafe.Pointer(xattrNameCaps)), uintptr(unsafe.Pointer(&d[0])), uintptr(len(d)), 0, 0); err != 0 {
 		return err
 	}
 	return nil
diff --git a/cap/text.go b/cap/text.go
index 0af6290..b6117e5 100644
--- a/cap/text.go
+++ b/cap/text.go
@@ -45,7 +45,7 @@
 	iBin      = (1 << Inheritable)
 )
 
-var combos = []string{"", "e", "p", "ep", "i", "ei", "pi", "epi"}
+var combos = []string{"", "e", "p", "ep", "i", "ei", "ip", "eip"}
 
 // histo generates a histogram of flag state combinations.
 func (c *Set) histo(m uint, bins []int, patterns []uint, from, limit Value) uint {
diff --git a/contrib/go.patch b/contrib/go.patch
new file mode 100644
index 0000000..bb8a1dd
--- /dev/null
+++ b/contrib/go.patch
@@ -0,0 +1,328 @@
+From dcf7596bf3ac7170352c162af4bd61292b720f24 Mon Sep 17 00:00:00 2001
+From: "Andrew G. Morgan" <[email protected]>
+Date: Sat, 16 Nov 2019 08:46:24 -0800
+Subject: [PATCH] POSIX semantics support for some Linux syscalls
+
+This change adds two new methods for invoking system calls
+under Linux: syscall.PosixSyscall() and syscall.PosixSyscall6().
+
+These system call wrappers ensure that all OSThreads mirror
+a common system call. The wrappers serialize execution of the
+runtime to ensure no race conditions where any Go code observes
+a non-atomic OS state change. As such, the syscalls have
+higher runtime overhead than regular system calls, and only
+need to be used where such thread (or 'm' in the parlance
+of the runtime sources) consistency is required.
+
+The new support is used to enable syscall.Setuid() and
+syscall.Setgid() support under Linux.
+
+Extensive discussion of the background issue addressed in this
+patch can be found here:
+
+   https://github.com/golang/go/issues/1435
+
+Signed-off-by: Andrew G. Morgan <[email protected]>
+---
+ src/runtime/proc.go          |  93 +++++++++++++++++++++++++++++++++++++-
+ src/runtime/runtime2.go      |   3 ++
+ src/syscall/syscall_linux.go | 105 ++++++++++++++++++++++++++++++++++++++++---
+ 3 files changed, 193 insertions(+), 8 deletions(-)
+
+diff --git a/src/runtime/proc.go b/src/runtime/proc.go
+index 56e9530ab6..aa96e84385 100644
+--- a/src/runtime/proc.go
++++ b/src/runtime/proc.go
+@@ -877,6 +877,68 @@ func startTheWorld() {
+ 	getg().m.preemptoff = ""
+ }
+ 
++// doPosixSyscall serializes Go execution and executes a specified
++// syscall on all m's.
++//go:linkname doPosixSyscall syscall.doPosixSyscall
++func doPosixSyscall(fn func(bool) bool) {
++	if fn == nil {
++		return
++	}
++	stopTheWorld("doposixsyscall")
++	if x := fn(true); x {
++		var n int32
++		_g_ := getg()
++		tid := _g_.m.procid
++		for tm := allm; tm != nil; tm = tm.alllink {
++			if tm.procid == tid || tm.procid == 0 {
++				continue
++			}
++			n++
++			lock(&tm.mfixlock)
++			tm.mfixupfn = fn
++			if tm.park.key == 0 {
++				// Because tm.mfixupfn is set, this
++				// will cause the wakeup to be short
++				// lived (once the mutex is
++				// unlocked). The next real wakeup
++				// will occur after startTheWorld() is
++				// called.
++				notewakeup(&tm.park)
++			}
++			unlock(&tm.mfixlock)
++		}
++		for {
++			done := true
++			for tm := allm; tm != nil; tm = tm.alllink {
++				if tm.procid == tid {
++					continue
++				}
++				lock(&tm.mfixlock)
++				done = done && (tm.mfixupfn == nil)
++				unlock(&tm.mfixlock)
++			}
++			if done {
++				break
++			}
++			// If needed, force sysmon and/or newmHandoff to wake up.
++			lock(&sched.lock)
++			if atomic.Load(&sched.sysmonwait) != 0 {
++				atomic.Store(&sched.sysmonwait, 0)
++				notewakeup(&sched.sysmonnote)
++			}
++			unlock(&sched.lock)
++			lock(&newmHandoff.lock)
++			if newmHandoff.waiting {
++				newmHandoff.waiting = false
++				notewakeup(&newmHandoff.wake)
++			}
++			unlock(&newmHandoff.lock)
++			usleep(293)
++		}
++	}
++	startTheWorld()
++}
++
+ // stopTheWorldGC has the same effect as stopTheWorld, but blocks
+ // until the GC is not running. It also blocks a GC from starting
+ // until startTheWorldGC is called.
+@@ -1796,6 +1858,21 @@ func startTemplateThread() {
+ 	newm(templateThread, nil)
+ }
+ 
++// mDidFixup runs any outstanding fixup function for the running m.
++//go:nosplit
++//go:nowritebarrierrec
++func mDidFixup() bool {
++	_g_ := getg()
++	lock(&_g_.m.mfixlock)
++	fn := _g_.m.mfixupfn
++	if fn != nil {
++		atomic.Storeuintptr((*uintptr)(unsafe.Pointer(&_g_.m.mfixupfn)), 0)
++		fn(false)
++	}
++	unlock(&_g_.m.mfixlock)
++	return fn != nil
++}
++
+ // templateThread is a thread in a known-good state that exists solely
+ // to start new threads in known-good states when the calling thread
+ // may not be in a good state.
+@@ -1832,6 +1909,7 @@ func templateThread() {
+ 		noteclear(&newmHandoff.wake)
+ 		unlock(&newmHandoff.lock)
+ 		notesleep(&newmHandoff.wake)
++		mDidFixup()
+ 	}
+ }
+ 
+@@ -1853,8 +1931,14 @@ func stopm() {
+ 	lock(&sched.lock)
+ 	mput(_g_.m)
+ 	unlock(&sched.lock)
+-	notesleep(&_g_.m.park)
+-	noteclear(&_g_.m.park)
++	// Loop only if we are woken up to perform a mfixupfn call.
++	for {
++		notesleep(&_g_.m.park)
++		noteclear(&_g_.m.park)
++		if !mDidFixup() {
++			break
++		}
++	}
+ 	acquirep(_g_.m.nextp.ptr())
+ 	_g_.m.nextp = 0
+ }
+@@ -4464,6 +4548,7 @@ func sysmon() {
+ 	lasttrace := int64(0)
+ 	idle := 0 // how many cycles in succession we had not wokeup somebody
+ 	delay := uint32(0)
++
+ 	for {
+ 		if idle == 0 { // start with 20us sleep...
+ 			delay = 20
+@@ -4474,6 +4559,7 @@ func sysmon() {
+ 			delay = 10 * 1000
+ 		}
+ 		usleep(delay)
++		mDidFixup()
+ 		now := nanotime()
+ 		next := timeSleepUntil()
+ 		if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
+@@ -4493,6 +4579,7 @@ func sysmon() {
+ 						osRelax(true)
+ 					}
+ 					notetsleep(&sched.sysmonnote, sleep)
++					mDidFixup()
+ 					if shouldRelax {
+ 						osRelax(false)
+ 					}
+@@ -4529,12 +4616,14 @@ func sysmon() {
+ 				incidlelocked(1)
+ 			}
+ 		}
++		mDidFixup()
+ 		if next < now {
+ 			// There are timers that should have already run,
+ 			// perhaps because there is an unpreemptible P.
+ 			// Try to start an M to run them.
+ 			startm(nil, false)
+ 		}
++
+ 		// retake P's blocked in syscalls
+ 		// and preempt long running G's
+ 		if retake(now) != 0 {
+diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
+index fe1147e247..b10d29ab39 100644
+--- a/src/runtime/runtime2.go
++++ b/src/runtime/runtime2.go
+@@ -525,6 +525,9 @@ type m struct {
+ 	thread        uintptr // thread handle
+ 	freelink      *m      // on sched.freem
+ 
++	mfixlock mutex           // lock to protect mfixupfn
++	mfixupfn func(bool) bool // used to synchronize OS related m state (credentials etc)
++
+ 	// these are here because they are too large to be on the stack
+ 	// of low-level NOSPLIT functions.
+ 	libcall   libcall
+diff --git a/src/syscall/syscall_linux.go b/src/syscall/syscall_linux.go
+index 2eba033d7c..39a918f6ff 100644
+--- a/src/syscall/syscall_linux.go
++++ b/src/syscall/syscall_linux.go
+@@ -943,17 +943,110 @@ func Getpgrp() (pid int) {
+ //sysnb	Setsid() (pid int, err error)
+ //sysnb	Settimeofday(tv *Timeval) (err error)
+ 
+-// issue 1435.
+-// On linux Setuid and Setgid only affects the current thread, not the process.
+-// This does not match what most callers expect so we must return an error
+-// here rather than letting the caller think that the call succeeded.
++// posixCaller holds the input and output state for performing a
++// posixSyscall that needs to synchronize all OS thread state. Linux
++// does not always support this natively, so we have to
++// manipulate the runtime to fix things up.
++type posixCaller struct {
++	// arguments
++	trap, a1, a2, a3, a4, a5, a6 uintptr
++
++	// return values (only set by 0th invocation)
++	r1, r2 uintptr
++
++	// err is the error code
++	err Errno
++}
++
++// doSyscall is a callback for executing a syscall on the current m
++// (OS thread).
++//go:nosplit
++func (pc *posixCaller) doSyscall(initial bool) bool {
++	r1, r2, err := RawSyscall(pc.trap, pc.a1, pc.a2, pc.a3)
++	if initial {
++		pc.r1 = r1
++		pc.r2 = r2
++		pc.err = err
++	}
++	return err == 0
++}
++
++// doSyscall6 is a callback for executing a syscall6 on the current m
++// (OS thread).
++//go:nosplit
++func (pc *posixCaller) doSyscall6(initial bool) bool {
++	r1, r2, err := RawSyscall6(pc.trap, pc.a1, pc.a2, pc.a3, pc.a4, pc.a5, pc.a6)
++	if initial {
++		pc.r1 = r1
++		pc.r2 = r2
++		pc.err = err
++	}
++	return err == 0
++}
++
++// PosixSyscall performs a syscall with POSIX semantics - namely it
++// serializes the runtime and performs the syscall once for each OS
++// thread of the Go runtime.  The return values and error status are
++// from the first invocation.  If this first invocation fails, no more
++// attempts are made.
++func PosixSyscall(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err Errno) {
++	pc := &posixCaller{
++		trap: trap,
++		a1:   a1,
++		a2:   a2,
++		a3:   a3,
++	}
++	doPosixSyscall(pc.doSyscall)
++	r1 = pc.r1
++	r2 = pc.r2
++	err = pc.err
++	return
++}
++
++// PosixSyscall6 performs a syscall6 with POSIX semantics - namely it
++// serializes the runtime and performs the syscall once for each OS
++// thread of the Go runtime.  The return values and error status are
++// from the first invocation.  If this first invocation fails, no more
++// attempts are made.
++func PosixSyscall6(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err Errno) {
++	pc := &posixCaller{
++		trap: trap,
++		a1:   a1,
++		a2:   a2,
++		a3:   a3,
++		a4:   a4,
++		a5:   a5,
++		a6:   a6,
++	}
++	doPosixSyscall(pc.doSyscall6)
++	r1 = pc.r1
++	r2 = pc.r2
++	err = pc.err
++	return
++}
++
++// issue 1435.  On linux the raw system calls Setuid, Setgid etc only
++// affect the current thread, not the process (and all its peer
++// threads). So, to match what most callers expect, we use the
++// runtime.doPosixSyscall wrapper for the following syscalls.
++//
++// Provided by runtime.doPosixSyscall which serializes the world and
++// invokes the fn on each OS thread (what the runtime refers to as
++// m's). Once this function returns, all threads are in sync.
++func doPosixSyscall(fn func(bool) bool)
+ 
+ func Setuid(uid int) (err error) {
+-	return EOPNOTSUPP
++	if _, _, e1 := PosixSyscall(sys_SETUID, uintptr(uid), 0, 0); e1 != 0 {
++		err = errnoErr(e1)
++	}
++	return
+ }
+ 
+ func Setgid(gid int) (err error) {
+-	return EOPNOTSUPP
++	if _, _, e1 := PosixSyscall(sys_SETGID, uintptr(gid), 0, 0); e1 != 0 {
++		err = errnoErr(e1)
++	}
++	return
+ }
+ 
+ //sys	Setpriority(which int, who int, prio int) (err error)
+-- 
+2.11.0
+
diff --git a/go/Makefile b/go/Makefile
index 1381f3d..0791768 100644
--- a/go/Makefile
+++ b/go/Makefile
@@ -22,8 +22,11 @@
 src/libcap/cap/names.go: ../libcap/cap_names.h src/libcap/cap  mknames.go
 	go run mknames.go --header=$< | gofmt > $@ || rm -f $@
 
+src/libcap/cap/syscalls.go: src/libcap/cap ./syscalls.sh
+	./syscalls.sh > $@
+
 GOPACKAGE=pkg/$(GOOSARCH)/libcap/cap.a
-$(GOPACKAGE): src/libcap/cap/names.go src/libcap/cap/cap.go src/libcap/cap/text.go
+$(GOPACKAGE): src/libcap/cap/syscalls.go src/libcap/cap/names.go src/libcap/cap/cap.go src/libcap/cap/text.go
 	echo testing Go package
 	GOPATH=$(realpath .) go test libcap/cap
 	echo building $(GOPACKAGE)
@@ -37,6 +40,7 @@
 	GOPATH=$(realpath .) go build $<
 
 clean:
-	GOPATH=$(realpath .) go clean -x -i libcap/cap || exit 0
-	rm -f *.o *.so mknames web compare-cap *~ ../cap/*~ ../cap/names.go
+	GOPATH=$(realpath .) go clean -x -i libcap/cap 2> /dev/null || exit 0
+	rm -f *.o *.so mknames web compare-cap *~
+	rm -f ../cap/*~ ../cap/names.go ../cap/syscalls.go
 	rm -fr pkg src
diff --git a/go/syscalls.sh b/go/syscalls.sh
new file mode 100755
index 0000000..4966742
--- /dev/null
+++ b/go/syscalls.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+cat <<EOF
+package cap
+
+import "syscall"
+
+// callKernel variables overridable for testing purposes.
+EOF
+
+if [ -n "$(go doc syscall 2>/dev/null|grep PosixSyscall)" ]; then
+    cat <<EOF
+// (Go build tree contains PosixSyscall support.)
+var callWKernel  = syscall.PosixSyscall
+var callWKernel6 = syscall.PosixSyscall6
+var callRKernel  = syscall.RawSyscall
+var callRKernel6 = syscall.RawSyscall6
+EOF
+else
+    cat <<EOF
+// (Go build tree does not contain PosixSyscall support.)
+var callWKernel  = syscall.RawSyscall
+var callWKernel6 = syscall.RawSyscall6
+var callRKernel  = syscall.RawSyscall
+var callRKernel6 = syscall.RawSyscall6
+EOF
+fi
diff --git a/go/web.go b/go/web.go
index 82b363c..a1ab937 100644
--- a/go/web.go
+++ b/go/web.go
@@ -1,11 +1,24 @@
 // Progam web provides an example of a webserver using capabilities to
 // bind to a privileged port.
 //
-// While this program serves as a demonstration of how to use
-// libcap/cap to achieve this, it currently reveals how problematic
-// the Go runtime is for actually dropping all privilege. For now, the
-// runtime can only raise and lower effective capabilities in critical
-// sections with any reliability: it cannot drop privilege.
+// This program will not work reliably without the equivalent of
+// the Go runtime patch that adds POSIX semantics wrappers around
+// the system calls that change kernel state. A patch for the Go
+// compiler/runtime to add this support is available here [2019-11-16]:
+//
+// https://git.kernel.org/pub/scm/libs/libcap/libcap.git/tree/contrib/go.patch
+//
+// To set this up, compile and empower this binary as follows (package
+// libcap/cap should be installed):
+//
+//   go build web.go
+//   sudo setcap cap_net_bind_service=p web
+//   ./web --port=80
+//
+// Make requests using wget and observe the log of web (try --debug as
+// a web command line flag too):
+//
+//   wget -o/dev/null -O/dev/stdout localhost:80
 package main
 
 import (
@@ -21,23 +34,12 @@
 
 var (
 	port     = flag.Int("port", 0, "port to listen on")
-	debug    = flag.Bool("debug", false, "enable to observe the go runtime os thread state confusion")
 	skipPriv = flag.Bool("skip", false, "skip raising the effective capability - will fail for low ports")
 )
 
 // ensureNotEUID aborts the program if it is running setuid something,
-// since it can't be forced to get euid to match uid etc.  Go's
-// runtime model is fragile with respect fully dropping capabilities,
-// or other forms of privilege, so we need to collapse the runtime to
-// a single os process. Until such time as Go supports some sort of
-// "serialize execution and run this on all hardware threads before
-// resuming" functionality, dropping capabilities and euid vs uid
-// kinds of discrepencies cannot be secured for all hardware threads
-// of the running program.
-//
-// Read more about this here:
-//
-//     https://github.com/golang/go/issues/1435 .
+// or is being invoked by root.  Either case means the preparer has
+// not set up the program correctly.
 func ensureNotEUID() {
 	euid := syscall.Geteuid()
 	uid := syscall.Getuid()
@@ -46,81 +48,57 @@
 	if uid != euid || gid != egid {
 		log.Fatalf("go runtime unable to resolve differing uids:(%d vs %d), gids(%d vs %d)", uid, euid, gid, egid)
 	}
+	if uid == 0 {
+		log.Fatalf("go runtime is running as root - cheating")
+	}
 }
 
 // listen creates a listener by raising effective privilege only to
-// bind to address and then lowering that effective privilege. To set
-// this up, compile and empower this binary as follows (package
-// libcap/cap should be installed):
-//
-//   go build web.go
-//   sudo setcap cap_net_bind_service=p web
-//   ./web --port=80
-//
-// Make requests using wget and observe the log of web (try --debug as
-// a web command line flag too):
-//
-//   wget -o/dev/null -O/dev/stdout localhost:80
+// bind to address and then lowering that effective privilege.
 func listen(network, address string) (net.Listener, error) {
-	runtime.LockOSThread()
-	defer runtime.UnlockOSThread()
-
-	// The intention of the following code is as follows.
-	// Collapse down the number of hardware threads to one so we
-	// can drop privilege and only then up them again. (This does
-	// not seem to do that by killing the surplas threads. You can
-	// run --debug and try "pstree -p ; getpcap <list of pids>" to
-	// get a sense of what is going on.)
-	count := runtime.GOMAXPROCS(1)
-	defer runtime.GOMAXPROCS(count)
-	log.Printf("max proc count = %d", count)
-
-	ensureNotEUID()
-
-	c := cap.GetProc()
-	orig, err := c.Dup()
-	if err != nil {
-		return nil, fmt.Errorf("failed to dup cap.Set: %v", err)
+	if *skipPriv {
+		return net.Listen(network, address)
 	}
-	if *debug {
-		defer func() {
-			if err := cap.NewSet().SetProc(); err != nil {
-				panic(fmt.Errorf("unable to drop all privilege: %v", err))
-			}
-			return
-		}()
-	} else {
-		defer func() {
-			if err := orig.SetProc(); err != nil {
-				panic(fmt.Errorf("unable to lower privilege (%q): %v", orig, err))
-			}
-		}()
+
+	orig := cap.GetProc()
+	defer orig.SetProc() // restore original caps on exit.
+
+	c, err := orig.Dup()
+	if err != nil {
+		return nil, fmt.Errorf("failed to dup caps: %v", err)
 	}
 
 	if on, _ := c.GetFlag(cap.Permitted, cap.NET_BIND_SERVICE); !on {
 		return nil, fmt.Errorf("insufficient privilege to bind to low ports - want %q, have %q", cap.NET_BIND_SERVICE, c)
 	}
-	if !*skipPriv {
-		if err := c.SetFlag(cap.Effective, true, cap.NET_BIND_SERVICE); err != nil {
-			return nil, fmt.Errorf("unable to set capability: %v", err)
-		}
+
+	if err := c.SetFlag(cap.Effective, true, cap.NET_BIND_SERVICE); err != nil {
+		return nil, fmt.Errorf("unable to set capability: %v", err)
 	}
+
 	if err := c.SetProc(); err != nil {
 		return nil, fmt.Errorf("unable to raise capabilities %q: %v", c, err)
 	}
-
 	return net.Listen(network, address)
 }
 
 // Handler is used to abstract the ServeHTTP function.
 type Handler struct{}
 
-// ServeHTTP says hello from a single Go hardware thread and reveals its capabilities.
+// ServeHTTP says hello from a single Go hardware thread and reveals
+// its capabilities.
 func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	runtime.LockOSThread()
+	// Get some numbers consistent with the current execution, so
+	// the returned web page demonstrates that the code execution
+	// is bouncing around on different kernel thread ids.
 	p := syscall.Getpid()
+	t := syscall.Gettid()
 	c := cap.GetProc()
-	log.Printf("Saying hello from proc: %d, caps=%q", p, c)
-	fmt.Fprintf(w, "Hello from proc: %d, caps=%q\n", p, c)
+	runtime.UnlockOSThread()
+
+	log.Printf("Saying hello from proc: %d->%d, caps=%q", p, t, c)
+	fmt.Fprintf(w, "Hello from proc: %d->%d, caps=%q\n", p, t, c)
 }
 
 func main() {
@@ -130,12 +108,20 @@
 		log.Fatal("please supply --port value")
 	}
 
+	ensureNotEUID()
+
 	ls, err := listen("tcp", fmt.Sprintf(":%d", *port))
 	if err != nil {
 		log.Fatalf("aborting: %v", err)
 	}
 	defer ls.Close()
 
+	if !*skipPriv {
+		if err := cap.NewSet().SetProc(); err != nil {
+			panic(fmt.Errorf("unable to drop all privilege: %v", err))
+		}
+	}
+
 	if err := http.Serve(ls, &Handler{}); err != nil {
 		log.Fatalf("server failed: %v", err)
 	}