1// Package cap provides all the Linux Capabilities userspace library API
2// bindings in native Go.
3//
4// Capabilities are a feature of the Linux kernel that allow fine
5// grain permissions to perform privileged operations. Privileged
6// operations are required to do irregular system level operations
7// from code. You can read more about how Capabilities are intended to
8// work here:
9//
10//   https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/33528.pdf
11//
12// This package supports native Go bindings for all the features
13// described in that paper as well as supporting subsequent changes to
14// the kernel for other styles of inheritable Capability.
15//
16// Some simple things you can do with this package are:
17//
18//   // Read and display the capabilities of the running process
19//   c := cap.GetProc()
//   log.Printf("this process has these caps: %q", c)
21//
22//   // Drop any privilege a process might have (including for root,
23//   // but note root 'owns' a lot of system files so a cap-limited
24//   // root can still do considerable damage to a running system).
25//   old := cap.GetProc()
26//   empty := cap.NewSet()
27//   if err := empty.SetProc(); err != nil {
28//       log.Fatalf("failed to drop privilege: %q -> %q: %v", old, empty, err)
29//   }
30//   now := cap.GetProc()
31//   if cap.Differs(now.Compare(empty)) {
32//       log.Fatalf("failed to fully drop privilege: have=%q, wanted=%q", now, empty)
33//   }
34//
35// See https://sites.google.com/site/fullycapable/ for recent updates,
36// some more complete walk-through examples of ways of using
37// 'cap.Set's etc and information on how to file bugs.
38//
39// For CGo linked binaries, behind the scenes, the package
40// "kernel.org/pub/linux/libs/security/libcap/psx" is used to perform
41// POSIX semantics system calls that manipulate thread state
42// uniformly over the whole Go (and CGo linked) process runtime.
43//
44// Note, if the Go runtime syscall interface contains the Linux
45// variant syscall.AllThreadsSyscall() API (it debuted in go1.16 see
46// https://github.com/golang/go/issues/1435 for its history) then
47// the "psx" package will use that to invoke Capability setting system
48// calls in pure Go binaries. In such an enhanced Go runtime, to force
49// this behavior, use the CGO_ENABLED=0 environment variable.
50//
51//
52// Copyright (c) 2019-21 Andrew G. Morgan <morgan@kernel.org>
53//
54// The cap and psx packages are licensed with a (you choose) BSD
55// 3-clause or GPL2. See LICENSE file for details.
56package cap // import "kernel.org/pub/linux/libs/security/libcap/cap"
57
58import (
59	"errors"
60	"sort"
61	"sync"
62	"syscall"
63	"unsafe"
64)
65
// Value is the type of a single capability (or permission) bit. It
// is the argument type accepted by the per-capability accessors in
// this package, such as GetBound() and GetAmbient().
type Value uint
68
// Flag selects one of the three Value dimensions stored in a Set. The
// (*IAB).Fill() method also takes a Flag when it changes the Bounding
// and Ambient Vectors.
type Flag uint

// The three Flags of Values held in a Set: Effective, Permitted and
// Inheritable. Their numeric values are 0, 1 and 2 respectively.
const (
	Effective Flag = iota
	Permitted
	Inheritable
)

// String returns the conventional single letter abbreviation of a
// Flag: "e" for Effective, "p" for Permitted and "i" for
// Inheritable. Any other Flag value yields "<Error>".
func (f Flag) String() string {
	// Table indexed by Flag value; anything beyond it is unknown.
	abbrev := [...]string{
		Effective:   "e",
		Permitted:   "p",
		Inheritable: "i",
	}
	if f >= Flag(len(abbrev)) {
		return "<Error>"
	}
	return abbrev[f]
}
96
// data holds a 32-bit slice of the compressed bitmaps of capability
// sets as understood by the kernel. It contains one uint32 for each
// of the three Flag values (Effective, Permitted, Inheritable).
type data [Inheritable + 1]uint32
100
// Set is an opaque capabilities container for a set of system
// capabilities. It holds individually addressable capability Value's
// for the three capability Flag's. See GetFlag() and SetFlag() for
// how to adjust them individually, and Clear() and ClearFlag() for
// how to do bulk operations.
//
// For admin tasks associated with managing namespace specific file
// capabilities, Set can also support a namespace-root-UID value which
// defaults to zero. See GetNSOwner() and SetNSOwner().
//
// A Set embeds a mutex, so it should be handled by pointer; use
// Dup() to obtain an independent copy.
type Set struct {
	// mu protects all other members of a Set.
	mu sync.RWMutex

	// flat holds Flag Value bitmaps for all capabilities
	// associated with this Set. NewSet() allocates one data entry
	// per kernel word (see the package level words variable).
	flat []data

	// Linux specific: the namespace-root-UID value associated
	// with this Set (zero unless explicitly set).
	nsRoot int
}
121
// Various known kernel magic values. One of these is placed in the
// magic field of the header passed to the capget/capset system calls;
// cInit() uses the value left in the header by a probing call to
// decide how many 32-bit words make up each capability vector.
const (
	kv1 = 0x19980330 // First iteration of process capabilities (32 bits).
	kv2 = 0x20071026 // First iteration of process and file capabilities (64 bits) - deprecated.
	kv3 = 0x20080522 // Most recently supported process and file capabilities (64 bits).
)
128
var (
	// startUp protects setting of the following values: magic,
	// words, maxValues. All three are assigned by cInit(), which
	// is invoked through startUp.Do().
	startUp sync.Once

	// magic holds the preferred magic number for the kernel ABI.
	magic uint32

	// words holds the number of uint32's associated with each
	// capability Flag for this session (1 for the kv1 ABI, 2
	// otherwise).
	words int

	// maxValues holds the number of bit values that are named by
	// the running kernel. This is generally expected to match
	// ValueCount which is autogenerated at packaging time.
	maxValues uint
)
146
// header is the structure passed by pointer as the first argument of
// the capget/capset system calls (see caprcall and capwcall).
type header struct {
	// magic is the kernel ABI version (one of kv1, kv2, kv3).
	magic uint32
	// pid identifies the target process; GetPID() documents zero
	// as an alias for the current process.
	pid int32
}
151
// scwMu is used to fully serialize the write system calls. Note, this
// is generally not necessary, but in the case of Launch we get into a
// situation where the launching thread is temporarily allowed to
// deviate from the kernel state of the rest of the runtime and
// allowing other threads to perform w* syscalls will potentially
// interfere with the launching process.
var scwMu sync.Mutex
159
// syscaller is a type for abstracting syscalls. The r* variants are
// for reading state, and can be parallelized, the w* variants need to
// be serialized so all OS threads can share state.
//
// The 3 and 6 suffixes give the number of arguments passed after the
// trap (system call number). Each function returns two raw result
// words and a syscall.Errno that is zero on success.
type syscaller struct {
	r3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w3 func(trap, a1, a2, a3 uintptr) (r1, r2 uintptr, err syscall.Errno)
	r6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
	w6 func(trap, a1, a2, a3, a4, a5, a6 uintptr) (r1, r2 uintptr, err syscall.Errno)
}
169
170// caprcall provides a pointer etc wrapper for the system calls
171// associated with getcap.
172//go:uintptrescapes
173func (sc *syscaller) caprcall(call uintptr, h *header, d []data) error {
174	x := uintptr(0)
175	if d != nil {
176		x = uintptr(unsafe.Pointer(&d[0]))
177	}
178	_, _, err := sc.r3(call, uintptr(unsafe.Pointer(h)), x, 0)
179	if err != 0 {
180		return err
181	}
182	return nil
183}
184
185// capwcall provides a pointer etc wrapper for the system calls
186// associated with setcap.
187//go:uintptrescapes
188func (sc *syscaller) capwcall(call uintptr, h *header, d []data) error {
189	x := uintptr(0)
190	if d != nil {
191		x = uintptr(unsafe.Pointer(&d[0]))
192	}
193	_, _, err := sc.w3(call, uintptr(unsafe.Pointer(h)), x, 0)
194	if err != 0 {
195		return err
196	}
197	return nil
198}
199
200// prctlrcall provides a wrapper for the prctl systemcalls that only
201// read kernel state. There is a limited number of arguments needed
202// and the caller should use 0 for those not needed.
203func (sc *syscaller) prctlrcall(prVal, v1, v2 uintptr) (int, error) {
204	r, _, err := sc.r3(syscall.SYS_PRCTL, prVal, v1, v2)
205	if err != 0 {
206		return int(r), err
207	}
208	return int(r), nil
209}
210
211// prctlrcall6 provides a wrapper for the prctl systemcalls that only
212// read kernel state and require 6 arguments - ambient cap API, I'm
213// looking at you. There is a limited number of arguments needed and
214// the caller should use 0 for those not needed.
215func (sc *syscaller) prctlrcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
216	r, _, err := sc.r6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
217	if err != 0 {
218		return int(r), err
219	}
220	return int(r), nil
221}
222
223// prctlwcall provides a wrapper for the prctl systemcalls that
224// write/modify kernel state. Where available, these will use the
225// POSIX semantics fixup system calls. There is a limited number of
226// arguments needed and the caller should use 0 for those not needed.
227func (sc *syscaller) prctlwcall(prVal, v1, v2 uintptr) (int, error) {
228	r, _, err := sc.w3(syscall.SYS_PRCTL, prVal, v1, v2)
229	if err != 0 {
230		return int(r), err
231	}
232	return int(r), nil
233}
234
235// prctlwcall6 provides a wrapper for the prctl systemcalls that
236// write/modify kernel state and require 6 arguments - ambient cap
237// API, I'm looking at you. (Where available, these will use the POSIX
238// semantics fixup system calls). There is a limited number of
239// arguments needed and the caller should use 0 for those not needed.
240func (sc *syscaller) prctlwcall6(prVal, v1, v2, v3, v4, v5 uintptr) (int, error) {
241	r, _, err := sc.w6(syscall.SYS_PRCTL, prVal, v1, v2, v3, v4, v5)
242	if err != 0 {
243		return int(r), err
244	}
245	return int(r), nil
246}
247
248// cInit perfoms the lazy identification of the capability vintage of
249// the running system.
250func (sc *syscaller) cInit() {
251	h := &header{
252		magic: kv3,
253	}
254	sc.caprcall(syscall.SYS_CAPGET, h, nil)
255	magic = h.magic
256	switch magic {
257	case kv1:
258		words = 1
259	case kv2, kv3:
260		words = 2
261	default:
262		// Fall back to a known good version.
263		magic = kv3
264		words = 2
265	}
266	// Use the bounding set to evaluate which capabilities exist.
267	maxValues = uint(sort.Search(32*words, func(n int) bool {
268		_, err := GetBound(Value(n))
269		return err != nil
270	}))
271	if maxValues == 0 {
272		// Fall back to using the largest value defined at build time.
273		maxValues = NamedCount
274	}
275}
276
277// MaxBits returns the number of kernel-named capabilities discovered
278// at runtime in the current system.
279func MaxBits() Value {
280	startUp.Do(multisc.cInit)
281	return Value(maxValues)
282}
283
284// NewSet returns an empty capability set.
285func NewSet() *Set {
286	startUp.Do(multisc.cInit)
287	return &Set{
288		flat: make([]data, words),
289	}
290}
291
// ErrBadSet indicates a nil pointer was used for a *Set, or the
// request of the Set is invalid in some way. It is also returned for
// a Set that holds no capability data (for example a zero value Set
// that was not created with NewSet()).
var ErrBadSet = errors.New("bad capability set")
295
296// Dup returns a copy of the specified capability set.
297func (c *Set) Dup() (*Set, error) {
298	if c == nil || len(c.flat) == 0 {
299		return nil, ErrBadSet
300	}
301	n := NewSet()
302	c.mu.RLock()
303	defer c.mu.RUnlock()
304	copy(n.flat, c.flat)
305	n.nsRoot = c.nsRoot
306	return n, nil
307}
308
309// GetPID returns the capability set associated with the target process
310// id; pid=0 is an alias for current.
311func GetPID(pid int) (*Set, error) {
312	v := NewSet()
313	if err := multisc.caprcall(syscall.SYS_CAPGET, &header{magic: magic, pid: int32(pid)}, v.flat); err != nil {
314		return nil, err
315	}
316	return v, nil
317}
318
319// GetProc returns the capability Set of the current process. If the
320// kernel is unable to determine the Set associated with the current
321// process, the function panic()s.
322func GetProc() *Set {
323	c, err := GetPID(0)
324	if err != nil {
325		panic(err)
326	}
327	return c
328}
329
330func (sc *syscaller) setProc(c *Set) error {
331	if c == nil || len(c.flat) == 0 {
332		return ErrBadSet
333	}
334	return sc.capwcall(syscall.SYS_CAPSET, &header{magic: magic}, c.flat)
335}
336
337// SetProc attempts to set the capability Set of the current
338// process. The kernel will perform permission checks and an error
339// will be returned if the attempt fails. Should the attempt fail
340// no process capabilities will have been modified.
341func (c *Set) SetProc() error {
342	scwMu.Lock()
343	defer scwMu.Unlock()
344	return multisc.setProc(c)
345}
346
// defines from uapi/linux/prctl.h: the prctl operations used by
// GetBound() and DropBound() to read and drop bounding set Values.
const (
	prCapBSetRead = 23
	prCapBSetDrop = 24
)
352
353// GetBound determines if a specific capability is currently part of
354// the local bounding set. On systems where the bounding set Value is
355// not present, this function returns an error.
356func GetBound(val Value) (bool, error) {
357	v, err := multisc.prctlrcall(prCapBSetRead, uintptr(val), 0)
358	if err != nil {
359		return false, err
360	}
361	return v > 0, nil
362}
363
364//go:uintptrescapes
365func (sc *syscaller) dropBound(val ...Value) error {
366	for _, v := range val {
367		if _, err := sc.prctlwcall(prCapBSetDrop, uintptr(v), 0); err != nil {
368			return err
369		}
370	}
371	return nil
372}
373
374// DropBound attempts to suppress bounding set Values. The kernel will
375// never allow a bounding set Value bit to be raised once successfully
376// dropped. However, dropping requires the current process is
377// sufficiently capable (usually via cap.SETPCAP being raised in the
378// Effective flag of the process' Set). Note, the drops are performed
379// in order and if one bounding value cannot be dropped, the function
380// returns immediately with an error which may leave the system in an
381// ill-defined state. The caller can determine where things went wrong
382// using GetBound().
383func DropBound(val ...Value) error {
384	scwMu.Lock()
385	defer scwMu.Unlock()
386	return multisc.dropBound(val...)
387}
388
// defines from uapi/linux/prctl.h: prCapAmbient is the prctl
// operation for the ambient set, and the prCapAmbient* values that
// follow are the sub-commands passed as its first argument.
const (
	prCapAmbient = 47

	prCapAmbientIsSet    = 1
	prCapAmbientRaise    = 2
	prCapAmbientLower    = 3
	prCapAmbientClearAll = 4
)
398
399// GetAmbient determines if a specific capability is currently part of
400// the local ambient set. On systems where the ambient set Value is
401// not present, this function returns an error.
402func GetAmbient(val Value) (bool, error) {
403	r, err := multisc.prctlrcall6(prCapAmbient, prCapAmbientIsSet, uintptr(val), 0, 0, 0)
404	return r > 0, err
405}
406
407//go:uintptrescapes
408func (sc *syscaller) setAmbient(enable bool, val ...Value) error {
409	dir := uintptr(prCapAmbientLower)
410	if enable {
411		dir = prCapAmbientRaise
412	}
413	for _, v := range val {
414		_, err := sc.prctlwcall6(prCapAmbient, dir, uintptr(v), 0, 0, 0)
415		if err != nil {
416			return err
417		}
418	}
419	return nil
420}
421
422// SetAmbient attempts to set a specific Value bit to the state,
423// enable. This function will return an error if insufficient
424// permission is available to perform this task. The settings are
425// performed in order and the function returns immediately an error is
426// detected. Use GetAmbient() to unravel where things went
427// wrong. Note, the cap package manages an abstraction IAB that
428// captures all three inheritable vectors in a single type. Consider
429// using that.
430func SetAmbient(enable bool, val ...Value) error {
431	scwMu.Lock()
432	defer scwMu.Unlock()
433	return multisc.setAmbient(enable, val...)
434}
435
436func (sc *syscaller) resetAmbient() error {
437	var v bool
438	var err error
439
440	for c := Value(0); !v; c++ {
441		if v, err = GetAmbient(c); err != nil {
442			// no non-zero values found.
443			return nil
444		}
445	}
446	_, err = sc.prctlwcall6(prCapAmbient, prCapAmbientClearAll, 0, 0, 0, 0)
447	return err
448}
449
450// ResetAmbient attempts to ensure the Ambient set is fully
451// cleared. It works by first reading the set and if it finds any bits
452// raised it will attempt a reset. The test before attempting a reset
453// behavior is a workaround for situations where the Ambient API is
454// locked, but a reset is not actually needed. No Ambient bit not
455// already raised in both the Permitted and Inheritable Set is allowed
456// to be raised by the kernel.
457func ResetAmbient() error {
458	scwMu.Lock()
459	defer scwMu.Unlock()
460	return multisc.resetAmbient()
461}
462