| // Package cap provides all the Linux Capabilities userspace library API |
| // bindings in native Go. |
| // |
| // Capabilities are a feature of the Linux kernel that allow fine |
| // grain permissions to perform privileged operations. Privileged |
| // operations are required to do irregular system level operations |
| // from code. You can read more about how Capabilities are intended to |
| // work here: |
| // |
| // https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf |
| // |
| // This package supports native Go bindings for all the features |
| // described in that paper as well as supporting subsequent changes to |
| // the kernel for other styles of inheritable Capability. |
| // |
| // Some simple things you can do with this package are: |
| // |
| // // Read and display the capabilities of the running process |
| // c := cap.GetProc() |
| // log.Printf("this process has these caps: %q", c) |
| // |
| // // Drop any privilege a process might have (including for root, |
| // // but note root 'owns' a lot of system files so a cap-limited |
| // // root can still do considerable damage to a running system). |
| // old := cap.GetProc() |
| // empty := cap.NewSet() |
| // if err := empty.SetProc(); err != nil { |
| // log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err) |
| // } |
| // now := cap.GetProc() |
| // if cap.Differs(now.Compare(empty)) { |
| // log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty) |
| // } |
| // |
| // See https://sites.google.com/site/fullycapable/ for recent updates, |
| // some more complete walk-through examples of ways of using |
| // 'cap.Set's etc and information on how to file bugs. |
| // |
| // For CGo linked binaries, behind the scenes, the package |
| // "kernel.org/pub/linux/libs/security/libcap/psx" is used to perform |
| // POSIX semantics system calls that manipulate thread state |
| // uniformly over the whole Go (and CGo linked) process runtime. |
| // |
| // Note, if the Go runtime syscall interface contains the Linux |
| // variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see |
| // https://github.com/golang/go/issues/1435 for its history) then |
| // the "psx" package will use that to invoke Capability setting system |
| // calls in pure Go binaries. In such an enhanced Go runtime, to force |
| // this behavior, use the CGO_ENABLED=0 environment variable. |
| // |
| // |
| // Copyright (c) 2019-21 Andrew G. Morgan <[email protected]> |
| // |
| // The cap and psx packages are licensed with a (you choose) BSD |
| // 3-clause or GPL2. See LICENSE file for details. |
| package cap // import "kernel.org/pub/linux/libs/security/libcap/cap" |
| |
| import ( |
| "errors" |
| "sort" |
| "sync" |
| "syscall" |
| "unsafe" |
| ) |
| |
// Value identifies a single capability (or permission) bit.
type Value uint

// Flag selects one of the three Value dimensions held in a Set. It
// is also used by the (*IAB).Fill() method when changing the Bounding
// and Ambient Vectors.
type Flag uint

// Effective, Permitted and Inheritable are the three Flags of Values
// held in a Set.
const (
	Effective Flag = iota
	Permitted
	Inheritable
)

// String returns the conventional one letter abbreviation of a Flag:
// "e", "p" or "i". Any other Flag value is rendered as "<Error>".
func (f Flag) String() string {
	// The table is indexed by Flag value, so its length bounds the
	// set of Flags that have an abbreviation.
	abbrev := [...]string{
		Effective:   "e",
		Permitted:   "p",
		Inheritable: "i",
	}
	if f < Flag(len(abbrev)) {
		return abbrev[f]
	}
	return "<Error>"
}
| |
| // data holds a 32-bit slice of the compressed bitmaps of capability |
| // sets as understood by the kernel. |
| type data [Inheritable + 1]uint32 |
| |
| // Set is an opaque capabilities container for a set of system |
| // capbilities. It holds individually addressable capability Value's |
| // for the three capability Flag's. See GetFlag() and SetFlag() for |
| // how to adjust them individually, and Clear() and ClearFlag() for |
| // how to do bulk operations. |
| // |
| // For admin tasks associated with managing namespace specific file |
| // capabilities, Set can also support a namespace-root-UID value which |
| // defaults to zero. See GetNSOwner() and SetNSOwner(). |
| type Set struct { |
| // mu protects all other members of a Set. |
| mu sync.RWMutex |
| |
| // flat holds Flag Value bitmaps for all capabilities |
| // associated with this Set. |
| flat []data |
| |
| // Linux specific |
| nsRoot int |
| } |
| |
// Magic values the kernel uses to identify the revisions of its
// capability ABI.
const (
	// kv1 is the first iteration of process capabilities (32 bits).
	kv1 = 0x19980330

	// kv2 is the first iteration of process and file capabilities
	// (64 bits) - deprecated.
	kv2 = 0x20071026

	// kv3 is the most recently supported process and file
	// capabilities (64 bits).
	kv3 = 0x20080522
)
| |
// startUp protects setting of the following values: magic, words,
// maxValues.
var startUp sync.Once

// magic holds the preferred magic number for the kernel ABI.
var magic uint32

// words holds the number of uint32's associated with each capability
// Flag for this session.
var words int

// maxValues holds the number of bit values that are named by the
// running kernel. This is generally expected to match ValueCount
// which is autogenerated at packaging time (cInit falls back to
// NamedCount when probing the kernel finds no values).
var maxValues uint

// header carries the kernel ABI magic number and the target process
// ID that accompany the capability get/set system calls.
type header struct {
	magic uint32
	pid   int32
}
| |
// scwMu is used to fully serialize the write system calls. Note, this
// is generally not necessary, but in the case of Launch we get into a
// situation where the launching thread is temporarily allowed to
// deviate from the kernel state of the rest of the runtime and
// allowing other threads to perform w* syscalls will potentially
// interfere with the launching process.
var scwMu sync.Mutex

// syscaller is a type for abstracting syscalls. The r* variants are
// for reading state, and can be parallelized, the w* variants need to
// be serialized so all OS threads can share state. The digit in each
// field name is the number of arguments passed after the trap number.
type syscaller struct {
	r3, w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	r6, w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
}
| |
| // caprcall provides a pointer etc wrapper for the system calls |
| // associated with getcap. |
| //go:uintptrescapes |
| func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error { |
| x := uintptr(0) |
| if d != nil { |
| x = uintptr(unsafe.Pointer(&d[0])) |
| } |
| _, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0) |
| if err != 0 { |
| return err |
| } |
| return nil |
| } |
| |
| // capwcall provides a pointer etc wrapper for the system calls |
| // associated with setcap. |
| //go:uintptrescapes |
| func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error { |
| x := uintptr(0) |
| if d != nil { |
| x = uintptr(unsafe.Pointer(&d[0])) |
| } |
| _, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0) |
| if err != 0 { |
| return err |
| } |
| return nil |
| } |
| |
| // prctlrcall provides a wrapper for the prctl systemcalls that only |
| // read kernel state. There is a limited number of arguments needed |
| // and the caller should use 0 for those not needed. |
| func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) { |
| r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2) |
| if err != 0 { |
| return int(r), err |
| } |
| return int(r), nil |
| } |
| |
| // prctlrcall6 provides a wrapper for the prctl systemcalls that only |
| // read kernel state and require 6 arguments - ambient cap API, I'm |
| // looking at you. There is a limited number of arguments needed and |
| // the caller should use 0 for those not needed. |
| func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { |
| r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) |
| if err != 0 { |
| return int(r), err |
| } |
| return int(r), nil |
| } |
| |
| // prctlwcall provides a wrapper for the prctl systemcalls that |
| // write/modify kernel state. Where available, these will use the |
| // POSIX semantics fixup system calls. There is a limited number of |
| // arguments needed and the caller should use 0 for those not needed. |
| func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) { |
| r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2) |
| if err != 0 { |
| return int(r), err |
| } |
| return int(r), nil |
| } |
| |
| // prctlwcall6 provides a wrapper for the prctl systemcalls that |
| // write/modify kernel state and require 6 arguments - ambient cap |
| // API, I'm looking at you. (Where available, these will use the POSIX |
| // semantics fixup system calls). There is a limited number of |
| // arguments needed and the caller should use 0 for those not needed. |
| func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) { |
| r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5) |
| if err != 0 { |
| return int(r), err |
| } |
| return int(r), nil |
| } |
| |
| // cInit perfoms the lazy identification of the capability vintage of |
| // the running system. |
| func (sc *syscaller) cInit() { |
| h := &header{ |
| magic: kv3, |
| } |
| sc.caprcall(syscall.SYS_CAPGET, h, nil) |
| magic = h.magic |
| switch magic { |
| case kv1: |
| words = 1 |
| case kv2, kv3: |
| words = 2 |
| default: |
| // Fall back to a known good version. |
| magic = kv3 |
| words = 2 |
| } |
| // Use the bounding set to evaluate which capabilities exist. |
| maxValues = uint(sort.Search(32*words, func(n int) bool { |
| _, err := GetBound(Value(n)) |
| return err != nil |
| })) |
| if maxValues == 0 { |
| // Fall back to using the largest value defined at build time. |
| maxValues = NamedCount |
| } |
| } |
| |
| // MaxBits returns the number of kernel-named capabilities discovered |
| // at runtime in the current system. |
| func MaxBits() Value { |
| startUp.Do(multisc.cInit) |
| return Value(maxValues) |
| } |
| |
| // NewSet returns an empty capability set. |
| func NewSet() *Set { |
| startUp.Do(multisc.cInit) |
| return &Set{ |
| flat: make([]data, words), |
| } |
| } |
| |
// ErrBadSet is the error returned when a nil *Set is supplied, or
// when the request made of the Set is invalid in some way.
var ErrBadSet = errors.New("bad capability set")
| |
| // Dup returns a copy of the specified capability set. |
| func (c *Set) Dup() (*Set, error) { |
| if c == nil || len(c.flat) == 0 { |
| return nil, ErrBadSet |
| } |
| n := NewSet() |
| c.mu.RLock() |
| defer c.mu.RUnlock() |
| copy(n.flat, c.flat) |
| n.nsRoot = c.nsRoot |
| return n, nil |
| } |
| |
| // GetPID returns the capability set associated with the target process |
| // id; pid=0 is an alias for current. |
| func GetPID(pid int) (*Set, error) { |
| v := NewSet() |
| if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil { |
| return nil, err |
| } |
| return v, nil |
| } |
| |
| // GetProc returns the capability Set of the current process. If the |
| // kernel is unable to determine the Set associated with the current |
| // process, the function panic()s. |
| func GetProc() *Set { |
| c, err := GetPID(0) |
| if err != nil { |
| panic(err) |
| } |
| return c |
| } |
| |
| func (sc *syscaller) setProc(c *Set) error { |
| if c == nil || len(c.flat) == 0 { |
| return ErrBadSet |
| } |
| return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat) |
| } |
| |
| // SetProc attempts to set the capability Set of the current |
| // process. The kernel will perform permission checks and an error |
| // will be returned if the attempt fails. Should the attempt fail |
| // no process capabilities will have been modified. |
| func (c *Set) SetProc() error { |
| scwMu.Lock() |
| defer scwMu.Unlock() |
| return multisc.setProc(c) |
| } |
| |
// prctl() options for the bounding set, as defined in
// uapi/linux/prctl.h.
const (
	// prCapBSetRead queries a bounding set Value.
	prCapBSetRead = 23

	// prCapBSetDrop drops a bounding set Value.
	prCapBSetDrop = 24
)
| |
| // GetBound determines if a specific capability is currently part of |
| // the local bounding set. On systems where the bounding set Value is |
| // not present, this function returns an error. |
| func GetBound(val Value) (bool, error) { |
| v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0) |
| if err != nil { |
| return false, err |
| } |
| return v > 0, nil |
| } |
| |
| //go:uintptrescapes |
| func (sc *syscaller) dropBound(val ...Value) error { |
| for _, v := range val { |
| if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // DropBound attempts to suppress bounding set Values. The kernel will |
| // never allow a bounding set Value bit to be raised once successfully |
| // dropped. However, dropping requires the current process is |
| // sufficiently capable (usually via cap.SETPCAP being raised in the |
| // Effective flag of the process' Set). Note, the drops are performed |
| // in order and if one bounding value cannot be dropped, the function |
| // returns immediately with an error which may leave the system in an |
| // ill-defined state. The caller can determine where things went wrong |
| // using GetBound(). |
| func DropBound(val ...Value) error { |
| scwMu.Lock() |
| defer scwMu.Unlock() |
| return multisc.dropBound(val...) |
| } |
| |
// prCapAmbient is the prctl() option for the ambient set, as defined
// in uapi/linux/prctl.h.
const prCapAmbient = 47

// Sub-commands passed as the second prctl() argument together with
// prCapAmbient, as defined in uapi/linux/prctl.h.
const (
	prCapAmbientIsSet    = 1
	prCapAmbientRaise    = 2
	prCapAmbientLower    = 3
	prCapAmbientClearAll = 4
)
| |
| // GetAmbient determines if a specific capability is currently part of |
| // the local ambient set. On systems where the ambient set Value is |
| // not present, this function returns an error. |
| func GetAmbient(val Value) (bool, error) { |
| r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0) |
| return r > 0, err |
| } |
| |
| //go:uintptrescapes |
| func (sc *syscaller) setAmbient(enable bool, val ...Value) error { |
| dir := uintptr(prCapAmbientLower) |
| if enable { |
| dir = prCapAmbientRaise |
| } |
| for _, v := range val { |
| _, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0) |
| if err != nil { |
| return err |
| } |
| } |
| return nil |
| } |
| |
| // SetAmbient attempts to set a specific Value bit to the state, |
| // enable. This function will return an error if insufficient |
| // permission is available to perform this task. The settings are |
| // performed in order and the function returns immediately an error is |
| // detected. Use GetAmbient() to unravel where things went |
| // wrong. Note, the cap package manages an abstraction IAB that |
| // captures all three inheritable vectors in a single type. Consider |
| // using that. |
| func SetAmbient(enable bool, val ...Value) error { |
| scwMu.Lock() |
| defer scwMu.Unlock() |
| return multisc.setAmbient(enable, val...) |
| } |
| |
| func (sc *syscaller) resetAmbient() error { |
| var v bool |
| var err error |
| |
| for c := Value(0); !v; c++ { |
| if v, err = GetAmbient(c); err != nil { |
| // no non-zero values found. |
| return nil |
| } |
| } |
| _, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0) |
| return err |
| } |
| |
| // ResetAmbient attempts to ensure the Ambient set is fully |
| // cleared. It works by first reading the set and if it finds any bits |
| // raised it will attempt a reset. The test before attempting a reset |
| // behavior is a workaround for situations where the Ambient API is |
| // locked, but a reset is not actually needed. No Ambient bit not |
| // already raised in both the Permitted and Inheritable Set is allowed |
| // to be raised by the kernel. |
| func ResetAmbient() error { |
| scwMu.Lock() |
| defer scwMu.Unlock() |
| return multisc.resetAmbient() |
| } |