blob: bb60904d646172895ee8b2f6ad2a87a7f8032c74 [file] [log] [blame]
Mike Frysinger4c331892022-09-13 05:17:08 -04001/* Copyright 2012 The ChromiumOS Authors
Elly Jonescd7a9042011-07-22 13:56:51 -04002 * Use of this source code is governed by a BSD-style license that can be
Will Drewry32ac9f52011-08-18 21:36:27 -05003 * found in the LICENSE file.
4 */
Elly Jonescd7a9042011-07-22 13:56:51 -04005
6#define _BSD_SOURCE
Arthur Gautier7a569072016-04-23 17:25:20 +00007#define _DEFAULT_SOURCE
Elly Jonescd7a9042011-07-22 13:56:51 -04008#define _GNU_SOURCE
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07009
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080010#include <asm/unistd.h>
Allen Webbc7182682021-04-16 09:44:53 -050011#include <assert.h>
Luis Hector Chavez43ff0802016-10-07 12:21:07 -070012#include <dirent.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040013#include <errno.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070014#include <fcntl.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040015#include <grp.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040016#include <linux/capability.h>
Luis Hector Chavezc3e17722018-10-16 20:43:12 -070017#include <linux/filter.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040018#include <sched.h>
19#include <signal.h>
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -070020#include <stdbool.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080021#include <stddef.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040022#include <stdio.h>
23#include <stdlib.h>
24#include <string.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040025#include <sys/capability.h>
26#include <sys/mount.h>
Will Drewryf89aef52011-09-16 16:48:57 -050027#include <sys/param.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040028#include <sys/prctl.h>
Dylan Reid0f72ef42017-06-06 15:42:49 -070029#include <sys/resource.h>
Allen Webbc7182682021-04-16 09:44:53 -050030#include <sys/select.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070031#include <sys/stat.h>
Mike Frysinger33ffef32017-01-13 19:53:19 -050032#include <sys/sysmacros.h>
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -070033#include <sys/types.h>
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080034#include <sys/user.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040035#include <sys/wait.h>
Luis Hector Chavezfb449ab2016-10-14 09:49:22 -070036#include <syscall.h>
Elly Jonescd7a9042011-07-22 13:56:51 -040037#include <unistd.h>
38
Ben Scarlatod7e6e682022-06-30 03:27:30 +000039#include "landlock_util.h"
Elly Jonescd7a9042011-07-22 13:56:51 -040040#include "libminijail-private.h"
Allen Webb7ae41c22021-09-16 10:23:37 -050041#include "libminijail.h"
Elly Jonescd7a9042011-07-22 13:56:51 -040042
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -070043#include "signal_handler.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080044#include "syscall_filter.h"
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -040045#include "syscall_wrapper.h"
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -040046#include "system.h"
Jorge Lucangeli Obesa6b034d2012-08-07 15:29:20 -070047#include "util.h"
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -080048
/* Fallback for toolchains whose linux/prctl.h does not provide this yet. */
#ifndef PR_ALT_SYSCALL
#define PR_ALT_SYSCALL 0x43724f53
#endif

/* The cgroup namespace clone flag may be absent from older linux-headers. */
#ifndef CLONE_NEWCGROUP
#define CLONE_NEWCGROUP 0x02000000
#endif

/* Capacity of minijail.cgroups[]; Linux supports 10 different controllers. */
#define MAX_CGROUPS 10

/* Capacity of minijail.rlimits[]; Linux currently supports 15 of them. */
#define MAX_RLIMITS 32

/* Capacity of minijail.preserved_fds[]. */
#define MAX_PRESERVED_FDS 128U

/* Keyctl commands. */
#define KEYCTL_JOIN_SESSION_KEYRING 1

/*
 * Userspace counterpart of the kernel's MNT_USER_SETTABLE_MASK: the set of
 * mount flags that MS_REMOUNT is able to modify.
 */
#define MS_USER_SETTABLE_MASK                                                  \
	(MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_NOATIME | MS_NODIRATIME |       \
	 MS_RELATIME | MS_RDONLY)

/*
 * TODO(b/235960683): Drop this after CrOS upgrades to glibc >= 2.34
 * because MS_NOSYMFOLLOW will be defined in sys/mount.h.
 */
#ifndef MS_NOSYMFOLLOW
/* Added locally in kernels 4.x+. */
#define MS_NOSYMFOLLOW 256
#endif
84
/* One resource limit stored in minijail.rlimits[]. */
struct minijail_rlimit {
	int type;   /* Which limit this entry describes. */
	rlim_t cur; /* Current limit value. */
	rlim_t max; /* Maximum limit value. */
};
90
/*
 * Node of the singly linked mount list (minijail.mounts_head/mounts_tail).
 * The string members are heap-owned: free_mounts_list() frees each of them.
 */
struct mountpoint {
	char *src;
	char *dest;
	char *type;
	char *data;
	int has_data;
	unsigned long flags;     /* MS_* mount flags. */
	struct mountpoint *next; /* Next entry, or NULL at the tail. */
};
100
/*
 * Node of the singly linked remount list (minijail.remounts_head/
 * remounts_tail). |mount_name| is heap-owned and is released by
 * free_remounts_list().
 */
struct minijail_remount {
	unsigned long remount_mode;
	char *mount_name;
	struct minijail_remount *next; /* Next entry, or NULL at the tail. */
};
106
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -0700107struct hook {
108 minijail_hook_t hook;
109 void *payload;
110 minijail_hook_event_t event;
111 struct hook *next;
112};
113
/*
 * Node of the singly linked filesystem-restriction list
 * (minijail.fs_rules_head/fs_rules_tail); see add_fs_restriction_path().
 */
struct fs_rule {
	char *path;              /* strdup()'d copy of the restricted path. */
	uint64_t landlock_flags; /* Landlock flags to apply to |path|. */
	struct fs_rule *next;    /* Next entry, or NULL at the tail. */
};
119
/* A parent/child file descriptor pair stored in minijail.preserved_fds[]. */
struct preserved_fd {
	int parent_fd;
	int child_fd;
};
124
Will Drewryf89aef52011-09-16 16:48:57 -0500125struct minijail {
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700126 /*
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700127 * WARNING: if you add a flag here you need to make sure it's
128 * accounted for in minijail_pre{enter|exec}() below.
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700129 */
Elly Jonese1749eb2011-10-07 13:54:59 -0400130 struct {
Christian Blichmann42e52e92022-08-29 10:01:20 +0200131 bool uid : 1;
132 bool gid : 1;
133 bool inherit_suppl_gids : 1;
134 bool set_suppl_gids : 1;
135 bool keep_suppl_gids : 1;
136 bool use_caps : 1;
137 bool capbset_drop : 1;
138 bool set_ambient_caps : 1;
139 bool vfs : 1;
140 bool enter_vfs : 1;
141 bool pids : 1;
142 bool ipc : 1;
143 bool uts : 1;
144 bool net : 1;
145 bool enter_net : 1;
146 bool ns_cgroups : 1;
147 bool userns : 1;
148 bool disable_setgroups : 1;
149 bool seccomp : 1;
150 bool remount_proc_ro : 1;
151 bool no_new_privs : 1;
152 bool seccomp_filter : 1;
153 bool seccomp_filter_tsync : 1;
154 bool seccomp_filter_logging : 1;
155 bool seccomp_filter_allow_speculation : 1;
156 bool chroot : 1;
157 bool pivot_root : 1;
158 bool mount_dev : 1;
159 bool mount_tmp : 1;
160 bool do_init : 1;
161 bool run_as_init : 1;
162 bool pid_file : 1;
163 bool cgroups : 1;
164 bool alt_syscall : 1;
165 bool reset_signal_mask : 1;
166 bool reset_signal_handlers : 1;
167 bool close_open_fds : 1;
168 bool new_session_keyring : 1;
169 bool forward_signals : 1;
170 bool setsid : 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400171 } flags;
172 uid_t uid;
173 gid_t gid;
174 gid_t usergid;
175 char *user;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800176 size_t suppl_gid_count;
177 gid_t *suppl_gid_list;
Elly Jonese1749eb2011-10-07 13:54:59 -0400178 uint64_t caps;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800179 uint64_t cap_bset;
Elly Jonese1749eb2011-10-07 13:54:59 -0400180 pid_t initpid;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700181 int mountns_fd;
Dylan Reid1102f5a2015-09-15 11:52:20 -0700182 int netns_fd;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400183 char *chrootdir;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800184 char *pid_file_path;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800185 char *uidmap;
186 char *gidmap;
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400187 char *hostname;
Luis Hector Chavez9acba452018-10-11 10:13:25 -0700188 char *preload_path;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800189 size_t filter_len;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -0800190 struct sock_fprog *filter_prog;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800191 char *alt_syscall_table;
Dylan Reid648b2202015-10-23 00:50:00 -0700192 struct mountpoint *mounts_head;
193 struct mountpoint *mounts_tail;
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -0800194 size_t mounts_count;
Mike Frysinger785b1c32018-02-23 15:47:24 -0500195 unsigned long remount_mode;
Nicole Anderson-Au835f7172021-01-13 21:18:13 +0000196 struct minijail_remount *remounts_head;
197 struct minijail_remount *remounts_tail;
Martin Pelikánab9eb442017-01-25 11:53:58 +1100198 size_t tmpfs_size;
Ben Scarlatof6102622022-09-05 19:31:42 +0000199 bool using_minimalistic_mountns;
Ben Scarlatoee82b492022-08-09 18:33:25 +0000200 struct fs_rule *fs_rules_head;
201 struct fs_rule *fs_rules_tail;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800202 char *cgroups[MAX_CGROUPS];
203 size_t cgroup_count;
Dylan Reid0f72ef42017-06-06 15:42:49 -0700204 struct minijail_rlimit rlimits[MAX_RLIMITS];
205 size_t rlimit_count;
Luis Hector Chavezec0a2c12017-06-29 20:29:57 -0700206 uint64_t securebits_skip_mask;
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -0700207 struct hook *hooks_head;
208 struct hook *hooks_tail;
Luis Hector Chavez1617f632017-08-01 18:32:30 -0700209 struct preserved_fd preserved_fds[MAX_PRESERVED_FDS];
210 size_t preserved_fd_count;
Allen Webb77383c72021-10-15 10:34:24 -0700211 char *seccomp_policy_path;
Will Drewryf89aef52011-09-16 16:48:57 -0500212};
213
Luis Hector Chavez64730af2017-09-13 13:18:59 -0700214static void run_hooks_or_die(const struct minijail *j,
215 minijail_hook_event_t event);
216
Adrian Ratiu8ef61252021-06-08 03:46:24 +0300217static bool seccomp_is_logging_allowed(const struct minijail *j)
218{
219 return seccomp_default_ret_log() || j->flags.seccomp_filter_logging;
220}
221
Mike Frysingerac08a682017-10-10 02:04:50 -0400222static void free_mounts_list(struct minijail *j)
223{
224 while (j->mounts_head) {
225 struct mountpoint *m = j->mounts_head;
226 j->mounts_head = j->mounts_head->next;
227 free(m->data);
228 free(m->type);
229 free(m->dest);
230 free(m->src);
231 free(m);
232 }
233 // No need to clear mounts_head as we know it's NULL after the loop.
234 j->mounts_tail = NULL;
235}
236
Nicole Anderson-Au835f7172021-01-13 21:18:13 +0000237static void free_remounts_list(struct minijail *j)
238{
239 while (j->remounts_head) {
240 struct minijail_remount *m = j->remounts_head;
241 j->remounts_head = j->remounts_head->next;
242 free(m->mount_name);
243 free(m);
244 }
245 // No need to clear remounts_head as we know it's NULL after the loop.
246 j->remounts_tail = NULL;
247}
248
/*
 * Writes all |n| bytes of |buf| to |fd|, resuming after short writes and
 * retrying when write(2) is interrupted (EINTR).
 * Returns 0 once everything has been written, or -errno for the first
 * failure other than EINTR.
 */
static int write_exactly(int fd, const void *buf, size_t n)
{
	const char *cursor = buf;
	size_t remaining = n;

	while (remaining != 0) {
		const ssize_t rc = write(fd, cursor, remaining);

		if (rc >= 0) {
			cursor += rc;
			remaining -= rc;
			continue;
		}
		/* Interrupted before any data was written: just try again. */
		if (errno != EINTR)
			return -errno;
	}

	return 0;
}
271
/*
 * Closes the descriptor stored in *pfd unless it is already -1, then leaves
 * *pfd set to -1. The result of close(2) is not checked.
 */
static void close_and_reset(int *pfd)
{
	if (*pfd == -1)
		return;

	close(*pfd);
	*pfd = -1;
}
279
François Degros664eba72019-11-05 13:18:24 +1100280/*
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700281 * Strip out flags meant for the parent.
282 * We keep things that are not inherited across execve(2) (e.g. capabilities),
283 * or are easier to set after execve(2) (e.g. seccomp filters).
284 */
285void minijail_preenter(struct minijail *j)
286{
287 j->flags.vfs = 0;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700288 j->flags.enter_vfs = 0;
Luis Hector Chavez83a44892018-10-12 08:56:20 -0700289 j->flags.ns_cgroups = 0;
290 j->flags.net = 0;
291 j->flags.uts = 0;
Dylan Reid791f5772015-09-14 20:02:42 -0700292 j->flags.remount_proc_ro = 0;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700293 j->flags.pids = 0;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800294 j->flags.do_init = 0;
Luis Hector Chavezac981fc2017-09-18 15:52:38 -0700295 j->flags.run_as_init = 0;
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800296 j->flags.pid_file = 0;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800297 j->flags.cgroups = 0;
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -0400298 j->flags.forward_signals = 0;
Xiyuan Xia9b41e652019-05-23 11:03:04 -0700299 j->flags.setsid = 0;
Mike Frysinger785b1c32018-02-23 15:47:24 -0500300 j->remount_mode = 0;
Nicole Anderson-Au835f7172021-01-13 21:18:13 +0000301 free_remounts_list(j);
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700302}
303
Ben Scarlatoee82b492022-08-09 18:33:25 +0000304/* Adds a rule for a given path to apply once minijail is entered. */
305int add_fs_restriction_path(struct minijail *j,
306 const char *path,
307 uint64_t landlock_flags)
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000308{
Ben Scarlatoee82b492022-08-09 18:33:25 +0000309 struct fs_rule *r = calloc(1, sizeof(*r));
310 if (!r)
311 return -ENOMEM;
312 r->path = strdup(path);
313 r->landlock_flags = landlock_flags;
314
315 if (j->fs_rules_tail) {
316 j->fs_rules_tail->next = r;
317 j->fs_rules_tail = r;
318 } else {
319 j->fs_rules_head = r;
320 j->fs_rules_tail = r;
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000321 }
Ben Scarlatoee82b492022-08-09 18:33:25 +0000322
323 return 0;
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000324}
325
Ben Scarlatof6102622022-09-05 19:31:42 +0000326bool mount_has_bind_flag(struct mountpoint *m) {
327 return !!(m->flags & MS_BIND);
328}
329
330bool mount_has_readonly_flag(struct mountpoint *m) {
331 return !!(m->flags & MS_RDONLY);
332}
333
Ben Scarlato4345afb2022-09-19 21:54:25 +0000334bool mount_events_allowed(struct mountpoint *m) {
335 return !!(m->flags & MS_SHARED) || !!(m->flags & MS_SLAVE);
336}
337
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700338/*
339 * Strip out flags meant for the child.
340 * We keep things that are inherited across execve(2).
341 */
342void minijail_preexec(struct minijail *j)
343{
344 int vfs = j->flags.vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700345 int enter_vfs = j->flags.enter_vfs;
Luis Hector Chavez83a44892018-10-12 08:56:20 -0700346 int ns_cgroups = j->flags.ns_cgroups;
347 int net = j->flags.net;
348 int uts = j->flags.uts;
Dylan Reid791f5772015-09-14 20:02:42 -0700349 int remount_proc_ro = j->flags.remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800350 int userns = j->flags.userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700351 if (j->user)
352 free(j->user);
353 j->user = NULL;
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -0800354 if (j->suppl_gid_list)
355 free(j->suppl_gid_list);
356 j->suppl_gid_list = NULL;
Luis Hector Chavez9acba452018-10-11 10:13:25 -0700357 if (j->preload_path)
358 free(j->preload_path);
359 j->preload_path = NULL;
Mike Frysingerac08a682017-10-10 02:04:50 -0400360 free_mounts_list(j);
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700361 memset(&j->flags, 0, sizeof(j->flags));
362 /* Now restore anything we meant to keep. */
363 j->flags.vfs = vfs;
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700364 j->flags.enter_vfs = enter_vfs;
Luis Hector Chavez83a44892018-10-12 08:56:20 -0700365 j->flags.ns_cgroups = ns_cgroups;
366 j->flags.net = net;
367 j->flags.uts = uts;
Dylan Reid791f5772015-09-14 20:02:42 -0700368 j->flags.remount_proc_ro = remount_proc_ro;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800369 j->flags.userns = userns;
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -0700370 /* Note, |pids| will already have been used before this call. */
371}
372
373/* Minijail API. */
374
Will Drewry6ac91122011-10-21 16:38:58 -0500375struct minijail API *minijail_new(void)
Elly Jonese1749eb2011-10-07 13:54:59 -0400376{
Mike Frysinger785b1c32018-02-23 15:47:24 -0500377 struct minijail *j = calloc(1, sizeof(struct minijail));
Mike Frysinger1036cd82020-08-28 00:15:59 -0400378 if (j) {
379 j->remount_mode = MS_PRIVATE;
Ben Scarlatof6102622022-09-05 19:31:42 +0000380 j->using_minimalistic_mountns = false;
Mike Frysinger1036cd82020-08-28 00:15:59 -0400381 }
Mike Frysinger785b1c32018-02-23 15:47:24 -0500382 return j;
Elly Jonescd7a9042011-07-22 13:56:51 -0400383}
384
Will Drewry6ac91122011-10-21 16:38:58 -0500385void API minijail_change_uid(struct minijail *j, uid_t uid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400386{
387 if (uid == 0)
388 die("useless change to uid 0");
389 j->uid = uid;
390 j->flags.uid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400391}
392
Will Drewry6ac91122011-10-21 16:38:58 -0500393void API minijail_change_gid(struct minijail *j, gid_t gid)
Elly Jonese1749eb2011-10-07 13:54:59 -0400394{
395 if (gid == 0)
396 die("useless change to gid 0");
397 j->gid = gid;
398 j->flags.gid = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400399}
400
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800401void API minijail_set_supplementary_gids(struct minijail *j, size_t size,
402 const gid_t *list)
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800403{
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800404 size_t i;
405
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -0500406 if (j->flags.inherit_suppl_gids)
407 die("cannot inherit *and* set supplementary groups");
408 if (j->flags.keep_suppl_gids)
409 die("cannot keep *and* set supplementary groups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800410
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800411 if (size == 0) {
412 /* Clear supplementary groups. */
413 j->suppl_gid_list = NULL;
414 j->suppl_gid_count = 0;
Lutz Justen13807cb2017-01-03 17:11:55 +0100415 j->flags.set_suppl_gids = 1;
Jorge Lucangeli Obesbc67f442016-01-08 14:43:45 -0800416 return;
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800417 }
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800418
419 /* Copy the gid_t array. */
420 j->suppl_gid_list = calloc(size, sizeof(gid_t));
421 if (!j->suppl_gid_list) {
Jorge Lucangeli Obesfd5fc562016-01-08 10:29:27 -0800422 die("failed to allocate internal supplementary group array");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800423 }
Jorge Lucangeli Obes06940be2015-12-04 18:09:21 -0800424 for (i = 0; i < size; i++) {
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800425 j->suppl_gid_list[i] = list[i];
426 }
427 j->suppl_gid_count = size;
Lutz Justen13807cb2017-01-03 17:11:55 +0100428 j->flags.set_suppl_gids = 1;
429}
430
Allen Webb7ae41c22021-09-16 10:23:37 -0500431void API minijail_keep_supplementary_gids(struct minijail *j)
432{
Lutz Justen13807cb2017-01-03 17:11:55 +0100433 j->flags.keep_suppl_gids = 1;
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -0800434}
435
Will Drewry6ac91122011-10-21 16:38:58 -0500436int API minijail_change_user(struct minijail *j, const char *user)
Elly Jonese1749eb2011-10-07 13:54:59 -0400437{
Luis Hector Chavez71323552017-09-05 09:17:22 -0700438 uid_t uid;
439 gid_t gid;
440 int rc = lookup_user(user, &uid, &gid);
441 if (rc)
442 return rc;
443 minijail_change_uid(j, uid);
Elly Jonese1749eb2011-10-07 13:54:59 -0400444 j->user = strdup(user);
445 if (!j->user)
446 return -ENOMEM;
Luis Hector Chavez71323552017-09-05 09:17:22 -0700447 j->usergid = gid;
Elly Jonese1749eb2011-10-07 13:54:59 -0400448 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400449}
450
Will Drewry6ac91122011-10-21 16:38:58 -0500451int API minijail_change_group(struct minijail *j, const char *group)
Elly Jonese1749eb2011-10-07 13:54:59 -0400452{
Luis Hector Chavez71323552017-09-05 09:17:22 -0700453 gid_t gid;
454 int rc = lookup_group(group, &gid);
455 if (rc)
456 return rc;
457 minijail_change_gid(j, gid);
Elly Jonese1749eb2011-10-07 13:54:59 -0400458 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -0400459}
460
Will Drewry6ac91122011-10-21 16:38:58 -0500461void API minijail_use_seccomp(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400462{
463 j->flags.seccomp = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400464}
465
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -0700466void API minijail_no_new_privs(struct minijail *j)
467{
468 j->flags.no_new_privs = 1;
469}
470
Will Drewry6ac91122011-10-21 16:38:58 -0500471void API minijail_use_seccomp_filter(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400472{
473 j->flags.seccomp_filter = 1;
Will Drewry32ac9f52011-08-18 21:36:27 -0500474}
475
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400476void API minijail_set_seccomp_filter_tsync(struct minijail *j)
477{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400478 if (j->filter_len > 0 && j->filter_prog != NULL) {
479 die("minijail_set_seccomp_filter_tsync() must be called "
480 "before minijail_parse_seccomp_filters()");
481 }
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -0400482
Adrian Ratiu8ef61252021-06-08 03:46:24 +0300483 if (seccomp_is_logging_allowed(j) && !seccomp_ret_log_available()) {
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -0400484 /*
485 * If SECCOMP_RET_LOG is not available, we don't want to use
486 * SECCOMP_RET_TRAP to both kill the entire process and report
487 * failing syscalls, since it will be brittle. Just bail.
488 */
Mike Frysinger52f6ada2019-06-26 16:59:36 -0400489 die("SECCOMP_RET_LOG not available, cannot use logging with "
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -0400490 "thread sync at the same time");
491 }
492
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -0400493 j->flags.seccomp_filter_tsync = 1;
494}
495
Anand K Mistry31adc6c2020-11-26 11:39:46 +1100496void API minijail_set_seccomp_filter_allow_speculation(struct minijail *j)
497{
498 if (j->filter_len > 0 && j->filter_prog != NULL) {
499 die("minijail_set_seccomp_filter_allow_speculation() must be "
500 "called before minijail_parse_seccomp_filters()");
501 }
502
503 j->flags.seccomp_filter_allow_speculation = 1;
504}
505
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700506void API minijail_log_seccomp_filter_failures(struct minijail *j)
507{
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -0400508 if (j->filter_len > 0 && j->filter_prog != NULL) {
509 die("minijail_log_seccomp_filter_failures() must be called "
510 "before minijail_parse_seccomp_filters()");
511 }
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -0400512
513 if (j->flags.seccomp_filter_tsync && !seccomp_ret_log_available()) {
514 /*
515 * If SECCOMP_RET_LOG is not available, we don't want to use
516 * SECCOMP_RET_TRAP to both kill the entire process and report
517 * failing syscalls, since it will be brittle. Just bail.
518 */
Allen Webb7ae41c22021-09-16 10:23:37 -0500519 die("SECCOMP_RET_LOG not available, cannot use thread sync "
520 "with logging at the same time");
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -0400521 }
522
523 if (debug_logging_allowed()) {
524 j->flags.seccomp_filter_logging = 1;
525 } else {
526 warn("non-debug build: ignoring request to enable seccomp "
527 "logging");
528 }
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -0700529}
530
Ben Scarlatof6102622022-09-05 19:31:42 +0000531void API minijail_set_using_minimalistic_mountns(struct minijail *j)
532{
533 j->using_minimalistic_mountns = true;
534}
535
536void API minijail_add_minimalistic_mountns_fs_rules(struct minijail *j)
537{
538 struct mountpoint *m = j->mounts_head;
539 bool landlock_enabled_by_profile = false;
540 if (!j->using_minimalistic_mountns)
541 return;
542
Ben Scarlatof6102622022-09-05 19:31:42 +0000543 /* Apply Landlock rules. */
Ben Scarlatof6102622022-09-05 19:31:42 +0000544 while (m) {
545 landlock_enabled_by_profile = true;
546 minijail_add_fs_restriction_rx(j, m->dest);
Ben Scarlato4345afb2022-09-19 21:54:25 +0000547 /* Allow rw if mounted as writable, or mount flags allow mount events.*/
548 if (!mount_has_readonly_flag(m) || mount_events_allowed(m))
Ben Scarlatof6102622022-09-05 19:31:42 +0000549 minijail_add_fs_restriction_rw(j, m->dest);
550 m = m->next;
551 }
552 if (landlock_enabled_by_profile) {
553 minijail_enable_default_fs_restrictions(j);
554 minijail_add_fs_restriction_edit(j, "/dev");
555 minijail_add_fs_restriction_ro(j, "/proc");
556 if (j->flags.vfs)
557 minijail_add_fs_restriction_rw(j, "/tmp");
558 }
559}
560
561void API minijail_enable_default_fs_restrictions(struct minijail *j)
562{
563 // Common library locations.
564 minijail_add_fs_restriction_rx(j, "/lib");
565 minijail_add_fs_restriction_rx(j, "/lib64");
566 minijail_add_fs_restriction_rx(j, "/usr/lib");
567 minijail_add_fs_restriction_rx(j, "/usr/lib64");
568 // Common locations for services invoking Minijail.
569 minijail_add_fs_restriction_rx(j, "/bin");
570 minijail_add_fs_restriction_rx(j, "/sbin");
571 minijail_add_fs_restriction_rx(j, "/usr/sbin");
572 minijail_add_fs_restriction_rx(j, "/usr/bin");
573}
574
Will Drewry6ac91122011-10-21 16:38:58 -0500575void API minijail_use_caps(struct minijail *j, uint64_t capmask)
Elly Jonese1749eb2011-10-07 13:54:59 -0400576{
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800577 /*
578 * 'minijail_use_caps' configures a runtime-capabilities-only
579 * environment, including a bounding set matching the thread's runtime
580 * (permitted|inheritable|effective) sets.
581 * Therefore, it will override any existing bounding set configurations
582 * since the latter would allow gaining extra runtime capabilities from
583 * file capabilities.
584 */
585 if (j->flags.capbset_drop) {
586 warn("overriding bounding set configuration");
587 j->cap_bset = 0;
588 j->flags.capbset_drop = 0;
589 }
Elly Jonese1749eb2011-10-07 13:54:59 -0400590 j->caps = capmask;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800591 j->flags.use_caps = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400592}
593
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800594void API minijail_capbset_drop(struct minijail *j, uint64_t capmask)
595{
596 if (j->flags.use_caps) {
597 /*
598 * 'minijail_use_caps' will have already configured a capability
599 * bounding set matching the (permitted|inheritable|effective)
600 * sets. Abort if the user tries to configure a separate
601 * bounding set. 'minijail_capbset_drop' and 'minijail_use_caps'
602 * are mutually exclusive.
603 */
604 die("runtime capabilities already configured, can't drop "
605 "bounding set separately");
606 }
607 j->cap_bset = capmask;
608 j->flags.capbset_drop = 1;
609}
610
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -0400611void API minijail_set_ambient_caps(struct minijail *j)
612{
613 j->flags.set_ambient_caps = 1;
614}
615
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -0800616void API minijail_reset_signal_mask(struct minijail *j)
617{
Peter Qiu2860c462015-12-16 15:13:06 -0800618 j->flags.reset_signal_mask = 1;
619}
620
Luis Hector Chaveza27118a2018-04-04 08:18:01 -0700621void API minijail_reset_signal_handlers(struct minijail *j)
622{
623 j->flags.reset_signal_handlers = 1;
624}
625
Will Drewry6ac91122011-10-21 16:38:58 -0500626void API minijail_namespace_vfs(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400627{
628 j->flags.vfs = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400629}
630
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700631void API minijail_namespace_enter_vfs(struct minijail *j, const char *ns_path)
632{
Mike Frysinger902a4492018-12-27 05:22:56 -0500633 /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */
634 int ns_fd = open(ns_path, O_RDONLY);
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -0700635 if (ns_fd < 0) {
636 pdie("failed to open namespace '%s'", ns_path);
637 }
638 j->mountns_fd = ns_fd;
639 j->flags.enter_vfs = 1;
640}
641
Chirantan Ekbote866bb3a2017-02-07 12:26:42 -0800642void API minijail_new_session_keyring(struct minijail *j)
643{
644 j->flags.new_session_keyring = 1;
645}
646
Luis Hector Chavezec0a2c12017-06-29 20:29:57 -0700647void API minijail_skip_setting_securebits(struct minijail *j,
648 uint64_t securebits_skip_mask)
649{
650 j->securebits_skip_mask = securebits_skip_mask;
651}
652
Mike Frysinger785b1c32018-02-23 15:47:24 -0500653void API minijail_remount_mode(struct minijail *j, unsigned long mode)
654{
655 j->remount_mode = mode;
656}
657
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800658void API minijail_skip_remount_private(struct minijail *j)
659{
Mike Frysinger785b1c32018-02-23 15:47:24 -0500660 j->remount_mode = 0;
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -0800661}
662
Will Drewry6ac91122011-10-21 16:38:58 -0500663void API minijail_namespace_pids(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400664{
Elly Jonese58176c2012-01-23 11:46:17 -0500665 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700666 j->flags.remount_proc_ro = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -0400667 j->flags.pids = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800668 j->flags.do_init = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400669}
670
Jorge Lucangeli Obes2fa96d12019-02-05 10:51:57 -0500671void API minijail_namespace_pids_rw_proc(struct minijail *j)
672{
673 j->flags.vfs = 1;
674 j->flags.pids = 1;
675 j->flags.do_init = 1;
676}
677
Dylan Reidf7942472015-11-18 17:55:26 -0800678void API minijail_namespace_ipc(struct minijail *j)
679{
680 j->flags.ipc = 1;
681}
682
Mike Frysingerb9a7b162017-05-30 15:25:49 -0400683void API minijail_namespace_uts(struct minijail *j)
684{
685 j->flags.uts = 1;
686}
687
688int API minijail_namespace_set_hostname(struct minijail *j, const char *name)
689{
690 if (j->hostname)
691 return -EINVAL;
692 minijail_namespace_uts(j);
693 j->hostname = strdup(name);
694 if (!j->hostname)
695 return -ENOMEM;
696 return 0;
697}
698
Elly Fong-Jones6c086302013-03-20 17:15:28 -0400699void API minijail_namespace_net(struct minijail *j)
700{
701 j->flags.net = 1;
702}
703
Dylan Reid1102f5a2015-09-15 11:52:20 -0700704void API minijail_namespace_enter_net(struct minijail *j, const char *ns_path)
705{
Mike Frysinger902a4492018-12-27 05:22:56 -0500706 /* Note: Do not use O_CLOEXEC here. We'll close it after we use it. */
707 int ns_fd = open(ns_path, O_RDONLY);
Dylan Reid1102f5a2015-09-15 11:52:20 -0700708 if (ns_fd < 0) {
709 pdie("failed to open namespace '%s'", ns_path);
710 }
711 j->netns_fd = ns_fd;
712 j->flags.enter_net = 1;
713}
714
Dylan Reid4cbc2a52016-06-17 19:06:07 -0700715void API minijail_namespace_cgroups(struct minijail *j)
716{
717 j->flags.ns_cgroups = 1;
718}
719
Luis Hector Chavez43ff0802016-10-07 12:21:07 -0700720void API minijail_close_open_fds(struct minijail *j)
721{
722 j->flags.close_open_fds = 1;
723}
724
Dylan Reid791f5772015-09-14 20:02:42 -0700725void API minijail_remount_proc_readonly(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400726{
727 j->flags.vfs = 1;
Dylan Reid791f5772015-09-14 20:02:42 -0700728 j->flags.remount_proc_ro = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400729}
730
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800731void API minijail_namespace_user(struct minijail *j)
732{
733 j->flags.userns = 1;
734}
735
Jorge Lucangeli Obes200299c2016-09-23 15:21:57 -0400736void API minijail_namespace_user_disable_setgroups(struct minijail *j)
737{
738 j->flags.disable_setgroups = 1;
739}
740
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800741int API minijail_uidmap(struct minijail *j, const char *uidmap)
742{
743 j->uidmap = strdup(uidmap);
744 if (!j->uidmap)
745 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800746 char *ch;
747 for (ch = j->uidmap; *ch; ch++) {
748 if (*ch == ',')
749 *ch = '\n';
750 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800751 return 0;
752}
753
754int API minijail_gidmap(struct minijail *j, const char *gidmap)
755{
756 j->gidmap = strdup(gidmap);
757 if (!j->gidmap)
758 return -ENOMEM;
Yu-Hsi Chiang1912c5b2015-08-31 18:59:49 +0800759 char *ch;
760 for (ch = j->gidmap; *ch; ch++) {
761 if (*ch == ',')
762 *ch = '\n';
763 }
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +0800764 return 0;
765}
766
Will Drewry6ac91122011-10-21 16:38:58 -0500767void API minijail_inherit_usergroups(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -0400768{
Lutz Justen13807cb2017-01-03 17:11:55 +0100769 j->flags.inherit_suppl_gids = 1;
Elly Jonescd7a9042011-07-22 13:56:51 -0400770}
771
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800772void API minijail_run_as_init(struct minijail *j)
773{
774 /*
775 * Since the jailed program will become 'init' in the new PID namespace,
776 * Minijail does not need to fork an 'init' process.
777 */
Luis Hector Chavezac981fc2017-09-18 15:52:38 -0700778 j->flags.run_as_init = 1;
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +0800779}
780
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700781int API minijail_enter_chroot(struct minijail *j, const char *dir)
782{
Elly Jones51a5b6c2011-10-12 19:09:26 -0400783 if (j->chrootdir)
784 return -EINVAL;
785 j->chrootdir = strdup(dir);
786 if (!j->chrootdir)
787 return -ENOMEM;
788 j->flags.chroot = 1;
789 return 0;
790}
791
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +0800792int API minijail_enter_pivot_root(struct minijail *j, const char *dir)
793{
794 if (j->chrootdir)
795 return -EINVAL;
796 j->chrootdir = strdup(dir);
797 if (!j->chrootdir)
798 return -ENOMEM;
799 j->flags.pivot_root = 1;
800 return 0;
801}
802
Dylan Reida14e08d2015-10-22 21:05:29 -0700803char API *minijail_get_original_path(struct minijail *j,
804 const char *path_inside_chroot)
805{
Dylan Reid648b2202015-10-23 00:50:00 -0700806 struct mountpoint *b;
Dylan Reida14e08d2015-10-22 21:05:29 -0700807
Dylan Reid648b2202015-10-23 00:50:00 -0700808 b = j->mounts_head;
Dylan Reida14e08d2015-10-22 21:05:29 -0700809 while (b) {
810 /*
811 * If |path_inside_chroot| is the exact destination of a
Dylan Reid648b2202015-10-23 00:50:00 -0700812 * mount, then the original path is exactly the source of
813 * the mount.
Dylan Reida14e08d2015-10-22 21:05:29 -0700814 * for example: "-b /some/path/exe,/chroot/path/exe"
Dylan Reid648b2202015-10-23 00:50:00 -0700815 * mount source = /some/path/exe, mount dest =
816 * /chroot/path/exe Then when getting the original path of
817 * "/chroot/path/exe", the source of that mount,
818 * "/some/path/exe" is what should be returned.
Dylan Reida14e08d2015-10-22 21:05:29 -0700819 */
Mike Frysinger22dc3522022-07-07 19:24:13 -0400820 if (streq(b->dest, path_inside_chroot))
Dylan Reida14e08d2015-10-22 21:05:29 -0700821 return strdup(b->src);
822
823 /*
824 * If |path_inside_chroot| is within the destination path of a
Dylan Reid648b2202015-10-23 00:50:00 -0700825 * mount, take the suffix of the chroot path relative to the
826 * mount destination path, and append it to the mount source
827 * path.
Dylan Reida14e08d2015-10-22 21:05:29 -0700828 */
829 if (!strncmp(b->dest, path_inside_chroot, strlen(b->dest))) {
830 const char *relative_path =
Allen Webb7ae41c22021-09-16 10:23:37 -0500831 path_inside_chroot + strlen(b->dest);
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400832 return path_join(b->src, relative_path);
Dylan Reida14e08d2015-10-22 21:05:29 -0700833 }
834 b = b->next;
835 }
836
837 /* If there is a chroot path, append |path_inside_chroot| to that. */
838 if (j->chrootdir)
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -0400839 return path_join(j->chrootdir, path_inside_chroot);
Dylan Reida14e08d2015-10-22 21:05:29 -0700840
841 /* No chroot, so the path outside is the same as it is inside. */
842 return strdup(path_inside_chroot);
Dylan Reid08946cc2015-09-16 19:10:57 -0700843}
844
Mike Frysinger33ffef32017-01-13 19:53:19 -0500845void API minijail_mount_dev(struct minijail *j)
846{
847 j->flags.mount_dev = 1;
848}
849
Lee Campbell11af0622014-05-22 12:36:04 -0700850void API minijail_mount_tmp(struct minijail *j)
851{
Martin Pelikánab9eb442017-01-25 11:53:58 +1100852 minijail_mount_tmp_size(j, 64 * 1024 * 1024);
853}
854
855void API minijail_mount_tmp_size(struct minijail *j, size_t size)
856{
857 j->tmpfs_size = size;
Lee Campbell11af0622014-05-22 12:36:04 -0700858 j->flags.mount_tmp = 1;
859}
860
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +0800861int API minijail_write_pid_file(struct minijail *j, const char *path)
862{
863 j->pid_file_path = strdup(path);
864 if (!j->pid_file_path)
865 return -ENOMEM;
866 j->flags.pid_file = 1;
867 return 0;
868}
869
Dylan Reid605ce7f2016-01-19 19:21:00 -0800870int API minijail_add_to_cgroup(struct minijail *j, const char *path)
871{
872 if (j->cgroup_count >= MAX_CGROUPS)
873 return -ENOMEM;
874 j->cgroups[j->cgroup_count] = strdup(path);
875 if (!j->cgroups[j->cgroup_count])
876 return -ENOMEM;
877 j->cgroup_count++;
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -0800878 j->flags.cgroups = 1;
Dylan Reid605ce7f2016-01-19 19:21:00 -0800879 return 0;
880}
881
Luis Hector Chavez7058a2d2018-01-29 08:41:34 -0800882int API minijail_rlimit(struct minijail *j, int type, rlim_t cur, rlim_t max)
Dylan Reid0f72ef42017-06-06 15:42:49 -0700883{
884 size_t i;
885
886 if (j->rlimit_count >= MAX_RLIMITS)
887 return -ENOMEM;
888 /* It's an error if the caller sets the same rlimit multiple times. */
889 for (i = 0; i < j->rlimit_count; i++) {
890 if (j->rlimits[i].type == type)
891 return -EEXIST;
892 }
893
894 j->rlimits[j->rlimit_count].type = type;
895 j->rlimits[j->rlimit_count].cur = cur;
896 j->rlimits[j->rlimit_count].max = max;
897 j->rlimit_count++;
898 return 0;
899}
900
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -0400901int API minijail_forward_signals(struct minijail *j)
902{
903 j->flags.forward_signals = 1;
904 return 0;
905}
906
Allen Webb7ae41c22021-09-16 10:23:37 -0500907int API minijail_create_session(struct minijail *j)
908{
Xiyuan Xia9b41e652019-05-23 11:03:04 -0700909 j->flags.setsid = 1;
910 return 0;
911}
912
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000913int API minijail_add_fs_restriction_rx(struct minijail *j, const char *path)
914{
Ben Scarlatoee82b492022-08-09 18:33:25 +0000915 return !add_fs_restriction_path(j, path,
916 ACCESS_FS_ROUGHLY_READ_EXECUTE);
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000917}
918
919int API minijail_add_fs_restriction_ro(struct minijail *j, const char *path)
920{
Ben Scarlatoee82b492022-08-09 18:33:25 +0000921 return !add_fs_restriction_path(j, path, ACCESS_FS_ROUGHLY_READ);
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000922}
923
924int API minijail_add_fs_restriction_rw(struct minijail *j, const char *path)
925{
Ben Scarlatoee82b492022-08-09 18:33:25 +0000926 return !add_fs_restriction_path(j, path,
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000927 ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_BASIC_WRITE);
928}
929
930int API minijail_add_fs_restriction_advanced_rw(struct minijail *j,
931 const char *path)
932{
Ben Scarlatoee82b492022-08-09 18:33:25 +0000933 return !add_fs_restriction_path(j, path,
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000934 ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_FULL_WRITE);
935}
936
Ben Scarlatof6102622022-09-05 19:31:42 +0000937int API minijail_add_fs_restriction_edit(struct minijail *j,
938 const char *path)
939{
940 return !add_fs_restriction_path(j, path,
941 ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_EDIT);
942}
943
Jorge Lucangeli Obesa8eef8b2022-07-20 19:20:06 -0400944static bool is_valid_bind_path(const char *path)
945{
946 if (!block_symlinks_in_bindmount_paths()) {
947 return true;
948 }
949
950 /*
951 * tokenize() will modify both the |prefixes| pointer and the contents
952 * of the string, so:
953 * -Copy |BINDMOUNT_ALLOWED_PREFIXES| since it lives in .rodata.
954 * -Save the original pointer for free()ing.
955 */
956 char *prefixes = strdup(BINDMOUNT_ALLOWED_PREFIXES);
957 attribute_cleanup_str char *orig_prefixes = prefixes;
958 (void)orig_prefixes;
959
960 char *prefix = NULL;
961 bool found_prefix = false;
962 if (!is_canonical_path(path)) {
963 while ((prefix = tokenize(&prefixes, ",")) != NULL) {
964 if (path_is_parent(prefix, path)) {
965 found_prefix = true;
966 break;
967 }
968 }
969 if (!found_prefix) {
970 /*
971 * If the path does not include one of the allowed
972 * prefixes, fail.
973 */
974 warn("path '%s' is not a canonical path", path);
975 return false;
976 }
977 }
978 return true;
979}
Ben Scarlatod7e6e682022-06-30 03:27:30 +0000980
Dylan Reid81e23972016-05-18 14:06:35 -0700981int API minijail_mount_with_data(struct minijail *j, const char *src,
982 const char *dest, const char *type,
983 unsigned long flags, const char *data)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -0700984{
Dylan Reid648b2202015-10-23 00:50:00 -0700985 struct mountpoint *m;
Elly Jones51a5b6c2011-10-12 19:09:26 -0400986
987 if (*dest != '/')
988 return -EINVAL;
Dylan Reid648b2202015-10-23 00:50:00 -0700989 m = calloc(1, sizeof(*m));
990 if (!m)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400991 return -ENOMEM;
Dylan Reid648b2202015-10-23 00:50:00 -0700992 m->dest = strdup(dest);
993 if (!m->dest)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400994 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700995 m->src = strdup(src);
996 if (!m->src)
Elly Jones51a5b6c2011-10-12 19:09:26 -0400997 goto error;
Dylan Reid648b2202015-10-23 00:50:00 -0700998 m->type = strdup(type);
999 if (!m->type)
1000 goto error;
Mike Frysingerb7803c82018-08-23 15:43:15 -04001001
1002 if (!data || !data[0]) {
1003 /*
1004 * Set up secure defaults for certain filesystems. Adding this
1005 * fs-specific logic here kind of sucks, but considering how
1006 * people use these in practice, it's probably OK. If they want
1007 * the kernel defaults, they can pass data="" instead of NULL.
1008 */
Mike Frysinger22dc3522022-07-07 19:24:13 -04001009 if (streq(type, "tmpfs")) {
Mike Frysingerb7803c82018-08-23 15:43:15 -04001010 /* tmpfs defaults to mode=1777 and size=50%. */
1011 data = "mode=0755,size=10M";
1012 }
1013 }
Dylan Reid81e23972016-05-18 14:06:35 -07001014 if (data) {
1015 m->data = strdup(data);
1016 if (!m->data)
1017 goto error;
1018 m->has_data = 1;
1019 }
Mike Frysingercb8674d2018-08-12 00:53:35 -04001020
1021 /* If they don't specify any flags, default to secure ones. */
1022 if (flags == 0)
1023 flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Dylan Reid648b2202015-10-23 00:50:00 -07001024 m->flags = flags;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001025
Elly Jonesdd3e8512012-01-23 15:13:38 -05001026 /*
Jorge Lucangeli Obes0a0514c2020-01-03 11:18:32 -05001027 * Unless asked to enter an existing namespace, force vfs namespacing
1028 * so the mounts don't leak out into the containing vfs namespace.
1029 * If Minijail is being asked to enter the root vfs namespace this will
1030 * leak mounts, but it's unlikely that the user would ask to do that by
1031 * mistake.
Elly Jones51a5b6c2011-10-12 19:09:26 -04001032 */
Jorge Lucangeli Obes0a0514c2020-01-03 11:18:32 -05001033 if (!j->flags.enter_vfs)
1034 minijail_namespace_vfs(j);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001035
Dylan Reid648b2202015-10-23 00:50:00 -07001036 if (j->mounts_tail)
1037 j->mounts_tail->next = m;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001038 else
Dylan Reid648b2202015-10-23 00:50:00 -07001039 j->mounts_head = m;
1040 j->mounts_tail = m;
1041 j->mounts_count++;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001042
1043 return 0;
1044
1045error:
Dylan Reid81e23972016-05-18 14:06:35 -07001046 free(m->type);
Dylan Reid648b2202015-10-23 00:50:00 -07001047 free(m->src);
1048 free(m->dest);
1049 free(m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001050 return -ENOMEM;
1051}
1052
Dylan Reid81e23972016-05-18 14:06:35 -07001053int API minijail_mount(struct minijail *j, const char *src, const char *dest,
1054 const char *type, unsigned long flags)
1055{
1056 return minijail_mount_with_data(j, src, dest, type, flags, NULL);
1057}
1058
Dylan Reid648b2202015-10-23 00:50:00 -07001059int API minijail_bind(struct minijail *j, const char *src, const char *dest,
1060 int writeable)
1061{
1062 unsigned long flags = MS_BIND;
1063
Jorge Lucangeli Obes84d8d052022-08-19 21:47:30 +00001064 /*
1065 * Check for symlinks in bind-mount source paths to warn the user early.
1066 * Minijail will perform one final check immediately before the mount()
1067 * call.
1068 */
Jorge Lucangeli Obesa8eef8b2022-07-20 19:20:06 -04001069 if (!is_valid_bind_path(src)) {
1070 warn("src '%s' is not a valid bind mount path", src);
1071 return -ELOOP;
1072 }
1073
Jorge Lucangeli Obes84d8d052022-08-19 21:47:30 +00001074 /*
1075 * Symlinks in |dest| are blocked by the ChromiumOS LSM:
1076 * <kernel>/security/chromiumos/lsm.c#77
1077 */
Jorge Lucangeli Obesa8eef8b2022-07-20 19:20:06 -04001078
Dylan Reid648b2202015-10-23 00:50:00 -07001079 if (!writeable)
1080 flags |= MS_RDONLY;
1081
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001082 /*
1083 * |type| is ignored for bind mounts, use it to signal that this mount
1084 * came from minijail_bind().
1085 * TODO(b/238362528): Implement a better way to signal this.
1086 */
1087 return minijail_mount(j, src, dest, "minijail_bind", flags);
Dylan Reid648b2202015-10-23 00:50:00 -07001088}
1089
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00001090int API minijail_add_remount(struct minijail *j, const char *mount_name,
1091 unsigned long remount_mode)
1092{
1093 struct minijail_remount *m;
1094
1095 if (*mount_name != '/')
1096 return -EINVAL;
1097 m = calloc(1, sizeof(*m));
1098 if (!m)
1099 return -ENOMEM;
1100 m->mount_name = strdup(mount_name);
1101 if (!m->mount_name) {
1102 free(m);
1103 return -ENOMEM;
1104 }
1105
1106 m->remount_mode = remount_mode;
1107
1108 if (j->remounts_tail)
1109 j->remounts_tail->next = m;
1110 else
1111 j->remounts_head = m;
1112 j->remounts_tail = m;
1113
1114 return 0;
1115}
1116
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07001117int API minijail_add_hook(struct minijail *j, minijail_hook_t hook,
1118 void *payload, minijail_hook_event_t event)
1119{
1120 struct hook *c;
1121
1122 if (hook == NULL)
1123 return -EINVAL;
1124 if (event >= MINIJAIL_HOOK_EVENT_MAX)
1125 return -EINVAL;
1126 c = calloc(1, sizeof(*c));
1127 if (!c)
1128 return -ENOMEM;
1129
1130 c->hook = hook;
1131 c->payload = payload;
1132 c->event = event;
1133
1134 if (j->hooks_tail)
1135 j->hooks_tail->next = c;
1136 else
1137 j->hooks_head = c;
1138 j->hooks_tail = c;
1139
1140 return 0;
1141}
1142
Luis Hector Chavez1617f632017-08-01 18:32:30 -07001143int API minijail_preserve_fd(struct minijail *j, int parent_fd, int child_fd)
1144{
1145 if (parent_fd < 0 || child_fd < 0)
1146 return -EINVAL;
1147 if (j->preserved_fd_count >= MAX_PRESERVED_FDS)
1148 return -ENOMEM;
1149 j->preserved_fds[j->preserved_fd_count].parent_fd = parent_fd;
1150 j->preserved_fds[j->preserved_fd_count].child_fd = child_fd;
1151 j->preserved_fd_count++;
1152 return 0;
1153}
1154
Luis Hector Chavez9acba452018-10-11 10:13:25 -07001155int API minijail_set_preload_path(struct minijail *j, const char *preload_path)
1156{
1157 if (j->preload_path)
1158 return -EINVAL;
1159 j->preload_path = strdup(preload_path);
1160 if (!j->preload_path)
1161 return -ENOMEM;
1162 return 0;
1163}
1164
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001165static void clear_seccomp_options(struct minijail *j)
1166{
1167 j->flags.seccomp_filter = 0;
1168 j->flags.seccomp_filter_tsync = 0;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04001169 j->flags.seccomp_filter_logging = 0;
Anand K Mistry31adc6c2020-11-26 11:39:46 +11001170 j->flags.seccomp_filter_allow_speculation = 0;
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001171 j->filter_len = 0;
1172 j->filter_prog = NULL;
1173 j->flags.no_new_privs = 0;
Allen Webb77383c72021-10-15 10:34:24 -07001174 if (j->seccomp_policy_path) {
1175 free(j->seccomp_policy_path);
1176 }
1177 j->seccomp_policy_path = NULL;
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001178}
1179
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001180static int seccomp_should_use_filters(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001181{
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001182 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL) == -1) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001183 /*
1184 * |errno| will be set to EINVAL when seccomp has not been
1185 * compiled into the kernel. On certain platforms and kernel
1186 * versions this is not a fatal failure. In that case, and only
1187 * in that case, disable seccomp and skip loading the filters.
1188 */
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04001189 if ((errno == EINVAL) && seccomp_can_softfail()) {
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001190 warn("not loading seccomp filters, seccomp filter not "
1191 "supported");
1192 clear_seccomp_options(j);
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001193 return 0;
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001194 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001195 /*
1196 * If |errno| != EINVAL or seccomp_can_softfail() is false,
1197 * we can proceed. Worst case scenario minijail_enter() will
1198 * abort() if seccomp fails.
1199 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07001200 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001201 if (j->flags.seccomp_filter_tsync) {
1202 /* Are the seccomp(2) syscall and the TSYNC option supported? */
1203 if (sys_seccomp(SECCOMP_SET_MODE_FILTER,
1204 SECCOMP_FILTER_FLAG_TSYNC, NULL) == -1) {
1205 int saved_errno = errno;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04001206 if (saved_errno == ENOSYS && seccomp_can_softfail()) {
1207 warn("seccomp(2) syscall not supported");
1208 clear_seccomp_options(j);
1209 return 0;
1210 } else if (saved_errno == EINVAL &&
1211 seccomp_can_softfail()) {
1212 warn(
1213 "seccomp filter thread sync not supported");
1214 clear_seccomp_options(j);
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04001215 return 0;
1216 }
1217 /*
1218 * Similar logic here. If seccomp_can_softfail() is
1219 * false, or |errno| != ENOSYS, or |errno| != EINVAL,
1220 * we can proceed. Worst case scenario minijail_enter()
1221 * will abort() if seccomp or TSYNC fail.
1222 */
1223 }
1224 }
Anand K Mistry31adc6c2020-11-26 11:39:46 +11001225 if (j->flags.seccomp_filter_allow_speculation) {
1226 /* Is the SPEC_ALLOW flag supported? */
Luis Héctor Chávez01b628c2021-01-03 05:46:57 -08001227 if (!seccomp_filter_flags_available(
1228 SECCOMP_FILTER_FLAG_SPEC_ALLOW)) {
Anand K Mistry31adc6c2020-11-26 11:39:46 +11001229 warn("allowing speculative execution on seccomp "
1230 "processes not supported");
1231 j->flags.seccomp_filter_allow_speculation = 0;
1232 }
1233 }
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001234 return 1;
1235}
1236
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001237static int set_seccomp_filters_internal(struct minijail *j,
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001238 const struct sock_fprog *filter,
1239 bool owned)
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001240{
1241 struct sock_fprog *fprog;
1242
1243 if (owned) {
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001244 /*
1245 * If |owned| is true, it's OK to cast away the const-ness since
1246 * we'll own the pointer going forward.
1247 */
1248 fprog = (struct sock_fprog *)filter;
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001249 } else {
1250 fprog = malloc(sizeof(struct sock_fprog));
1251 if (!fprog)
1252 return -ENOMEM;
1253 fprog->len = filter->len;
1254 fprog->filter = malloc(sizeof(struct sock_filter) * fprog->len);
1255 if (!fprog->filter) {
1256 free(fprog);
1257 return -ENOMEM;
1258 }
1259 memcpy(fprog->filter, filter->filter,
1260 sizeof(struct sock_filter) * fprog->len);
1261 }
1262
1263 if (j->filter_prog) {
1264 free(j->filter_prog->filter);
1265 free(j->filter_prog);
1266 }
1267
1268 j->filter_len = fprog->len;
1269 j->filter_prog = fprog;
1270 return 0;
1271}
1272
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001273static int parse_seccomp_filters(struct minijail *j, const char *filename,
1274 FILE *policy_file)
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001275{
1276 struct sock_fprog *fprog = malloc(sizeof(struct sock_fprog));
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001277 if (!fprog)
1278 return -ENOMEM;
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04001279
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001280 struct filter_options filteropts;
1281
1282 /*
1283 * Figure out filter options.
1284 * Allow logging?
1285 */
1286 filteropts.allow_logging =
Adrian Ratiu8ef61252021-06-08 03:46:24 +03001287 debug_logging_allowed() && seccomp_is_logging_allowed(j);
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001288
1289 /* What to do on a blocked system call? */
1290 if (filteropts.allow_logging) {
1291 if (seccomp_ret_log_available())
1292 filteropts.action = ACTION_RET_LOG;
1293 else
1294 filteropts.action = ACTION_RET_TRAP;
1295 } else {
Jorge Lucangeli Obesd23ad7922020-10-13 10:26:40 -04001296 if (j->flags.seccomp_filter_tsync) {
1297 if (seccomp_ret_kill_process_available()) {
1298 filteropts.action = ACTION_RET_KILL_PROCESS;
1299 } else {
1300 filteropts.action = ACTION_RET_TRAP;
1301 }
1302 } else {
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001303 filteropts.action = ACTION_RET_KILL;
Jorge Lucangeli Obesd23ad7922020-10-13 10:26:40 -04001304 }
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001305 }
1306
1307 /*
1308 * If SECCOMP_RET_LOG is not available, need to allow extra syscalls
1309 * for logging.
1310 */
1311 filteropts.allow_syscalls_for_logging =
1312 filteropts.allow_logging && !seccomp_ret_log_available();
1313
Nicole Anderson-Aubcc8cfd2020-11-10 20:33:27 +00001314 /* Whether to fail on duplicate syscalls. */
1315 filteropts.allow_duplicate_syscalls = allow_duplicate_syscalls();
1316
Jorge Lucangeli Obese1a86892019-06-10 16:17:03 -04001317 if (compile_filter(filename, policy_file, fprog, &filteropts)) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001318 free(fprog);
1319 return -1;
1320 }
1321
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001322 return set_seccomp_filters_internal(j, fprog, true /* owned */);
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001323}
1324
1325void API minijail_parse_seccomp_filters(struct minijail *j, const char *path)
1326{
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001327 if (!seccomp_should_use_filters(j))
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001328 return;
1329
Mike Frysingerdebdf5d2021-06-21 09:52:06 -04001330 attribute_cleanup_fp FILE *file = fopen(path, "re");
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001331 if (!file) {
Jorge Lucangeli Obes224e4272012-08-02 14:31:39 -07001332 pdie("failed to open seccomp filter file '%s'", path);
Elly Jonese1749eb2011-10-07 13:54:59 -04001333 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001334
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001335 if (parse_seccomp_filters(j, path, file) != 0) {
Jorge Lucangeli Obesbda833c2012-07-31 16:25:56 -07001336 die("failed to compile seccomp filter BPF program in '%s'",
1337 path);
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001338 }
Allen Webb77383c72021-10-15 10:34:24 -07001339 if (j->seccomp_policy_path) {
1340 free(j->seccomp_policy_path);
1341 }
1342 j->seccomp_policy_path = strdup(path);
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001343}
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001344
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001345void API minijail_parse_seccomp_filters_from_fd(struct minijail *j, int fd)
1346{
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001347 char *fd_path, *path;
Mike Frysingerdebdf5d2021-06-21 09:52:06 -04001348 attribute_cleanup_fp FILE *file = NULL;
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001349
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001350 if (!seccomp_should_use_filters(j))
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001351 return;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001352
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001353 file = fdopen(fd, "r");
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001354 if (!file) {
1355 pdie("failed to associate stream with fd %d", fd);
1356 }
1357
Luis Hector Chavez7624e712017-08-28 19:30:59 -07001358 if (asprintf(&fd_path, "/proc/self/fd/%d", fd) == -1)
1359 pdie("failed to create path for fd %d", fd);
1360 path = realpath(fd_path, NULL);
1361 if (path == NULL)
1362 pwarn("failed to get path of fd %d", fd);
1363 free(fd_path);
1364
1365 if (parse_seccomp_filters(j, path ? path : "<fd>", file) != 0) {
Jorge Lucangeli Obes4d4b3be2016-08-16 16:58:14 -04001366 die("failed to compile seccomp filter BPF program from fd %d",
1367 fd);
1368 }
Allen Webb77383c72021-10-15 10:34:24 -07001369 if (j->seccomp_policy_path) {
1370 free(j->seccomp_policy_path);
1371 }
1372 j->seccomp_policy_path = path;
Will Drewry32ac9f52011-08-18 21:36:27 -05001373}
1374
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001375void API minijail_set_seccomp_filters(struct minijail *j,
1376 const struct sock_fprog *filter)
1377{
1378 if (!seccomp_should_use_filters(j))
1379 return;
1380
Adrian Ratiu8ef61252021-06-08 03:46:24 +03001381 if (seccomp_is_logging_allowed(j)) {
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04001382 die("minijail_log_seccomp_filter_failures() is incompatible "
1383 "with minijail_set_seccomp_filters()");
1384 }
1385
1386 /*
1387 * set_seccomp_filters_internal() can only fail with ENOMEM.
1388 * Furthermore, since we won't own the incoming filter, it will not be
1389 * modified.
1390 */
1391 if (set_seccomp_filters_internal(j, filter, false /* owned */) < 0) {
1392 die("failed to set seccomp filter");
1393 }
1394}
1395
Andrew Brestickereac28942015-11-11 16:04:46 -08001396int API minijail_use_alt_syscall(struct minijail *j, const char *table)
1397{
1398 j->alt_syscall_table = strdup(table);
1399 if (!j->alt_syscall_table)
1400 return -ENOMEM;
1401 j->flags.alt_syscall = 1;
1402 return 0;
1403}
1404
/* Cursor used while serializing into a (possibly too small) buffer. */
struct marshal_state {
	/* Bytes still free in the output buffer. */
	size_t available;
	/* Bytes requested so far, whether or not they fit. */
	size_t total;
	/* Position in the output buffer where the next write goes. */
	char *buf;
};
1410
Mike Frysinger0a27ab02020-09-04 16:18:12 -04001411static void marshal_state_init(struct marshal_state *state, char *buf,
1412 size_t available)
Elly Jonese1749eb2011-10-07 13:54:59 -04001413{
1414 state->available = available;
1415 state->buf = buf;
1416 state->total = 0;
Will Drewryf89aef52011-09-16 16:48:57 -05001417}
1418
Mike Frysinger0a27ab02020-09-04 16:18:12 -04001419static void marshal_append(struct marshal_state *state, const void *src,
1420 size_t length)
Elly Jonese1749eb2011-10-07 13:54:59 -04001421{
1422 size_t copy_len = MIN(state->available, length);
Will Drewryf89aef52011-09-16 16:48:57 -05001423
Elly Jonese1749eb2011-10-07 13:54:59 -04001424 /* Up to |available| will be written. */
1425 if (copy_len) {
1426 memcpy(state->buf, src, copy_len);
1427 state->buf += copy_len;
1428 state->available -= copy_len;
1429 }
1430 /* |total| will contain the expected length. */
1431 state->total += length;
Will Drewryf89aef52011-09-16 16:48:57 -05001432}
1433
/* Appends the string |src| to |state|, including its terminating NUL. */
static void marshal_append_string(struct marshal_state *state, const char *src)
{
	const size_t with_nul = strlen(src) + 1;

	marshal_append(state, src, with_nul);
}
1438
Mike Frysinger0a27ab02020-09-04 16:18:12 -04001439static void marshal_mount(struct marshal_state *state,
1440 const struct mountpoint *m)
Dylan Reid81e23972016-05-18 14:06:35 -07001441{
1442 marshal_append(state, m->src, strlen(m->src) + 1);
1443 marshal_append(state, m->dest, strlen(m->dest) + 1);
1444 marshal_append(state, m->type, strlen(m->type) + 1);
1445 marshal_append(state, (char *)&m->has_data, sizeof(m->has_data));
1446 if (m->has_data)
1447 marshal_append(state, m->data, strlen(m->data) + 1);
1448 marshal_append(state, (char *)&m->flags, sizeof(m->flags));
1449}
1450
Mike Frysinger0a27ab02020-09-04 16:18:12 -04001451static void minijail_marshal_helper(struct marshal_state *state,
1452 const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001453{
Dylan Reid648b2202015-10-23 00:50:00 -07001454 struct mountpoint *m = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -08001455 size_t i;
1456
Elly Jonese1749eb2011-10-07 13:54:59 -04001457 marshal_append(state, (char *)j, sizeof(*j));
1458 if (j->user)
Mike Frysinger5f9e3002020-09-04 16:20:36 -04001459 marshal_append_string(state, j->user);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001460 if (j->suppl_gid_list) {
1461 marshal_append(state, j->suppl_gid_list,
1462 j->suppl_gid_count * sizeof(gid_t));
1463 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001464 if (j->chrootdir)
Mike Frysinger5f9e3002020-09-04 16:20:36 -04001465 marshal_append_string(state, j->chrootdir);
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001466 if (j->hostname)
Mike Frysinger5f9e3002020-09-04 16:20:36 -04001467 marshal_append_string(state, j->hostname);
Andrew Brestickereac28942015-11-11 16:04:46 -08001468 if (j->alt_syscall_table) {
1469 marshal_append(state, j->alt_syscall_table,
1470 strlen(j->alt_syscall_table) + 1);
1471 }
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001472 if (j->flags.seccomp_filter && j->filter_prog) {
1473 struct sock_fprog *fp = j->filter_prog;
1474 marshal_append(state, (char *)fp->filter,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08001475 fp->len * sizeof(struct sock_filter));
Elly Jonese1749eb2011-10-07 13:54:59 -04001476 }
Dylan Reid648b2202015-10-23 00:50:00 -07001477 for (m = j->mounts_head; m; m = m->next) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04001478 marshal_mount(state, m);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001479 }
Dylan Reid605ce7f2016-01-19 19:21:00 -08001480 for (i = 0; i < j->cgroup_count; ++i)
Mike Frysinger5f9e3002020-09-04 16:20:36 -04001481 marshal_append_string(state, j->cgroups[i]);
Allen Webb77383c72021-10-15 10:34:24 -07001482 if (j->seccomp_policy_path)
1483 marshal_append_string(state, j->seccomp_policy_path);
Will Drewryf89aef52011-09-16 16:48:57 -05001484}
1485
Will Drewry6ac91122011-10-21 16:38:58 -05001486size_t API minijail_size(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04001487{
1488 struct marshal_state state;
1489 marshal_state_init(&state, NULL, 0);
1490 minijail_marshal_helper(&state, j);
1491 return state.total;
Will Drewry2ddaad02011-09-16 11:36:08 -05001492}
1493
Elly Jonese1749eb2011-10-07 13:54:59 -04001494int minijail_marshal(const struct minijail *j, char *buf, size_t available)
1495{
1496 struct marshal_state state;
1497 marshal_state_init(&state, buf, available);
1498 minijail_marshal_helper(&state, j);
1499 return (state.total > available);
Will Drewry2ddaad02011-09-16 11:36:08 -05001500}
1501
Elly Jonese1749eb2011-10-07 13:54:59 -04001502int minijail_unmarshal(struct minijail *j, char *serialized, size_t length)
1503{
Jorge Lucangeli Obesc2ba9f52015-12-01 07:58:10 -08001504 size_t i;
1505 size_t count;
Will Drewrybee7ba72011-10-21 20:47:01 -05001506 int ret = -EINVAL;
1507
Elly Jonese1749eb2011-10-07 13:54:59 -04001508 if (length < sizeof(*j))
Will Drewrybee7ba72011-10-21 20:47:01 -05001509 goto out;
Elly Jonese1749eb2011-10-07 13:54:59 -04001510 memcpy((void *)j, serialized, sizeof(*j));
1511 serialized += sizeof(*j);
1512 length -= sizeof(*j);
Will Drewryf89aef52011-09-16 16:48:57 -05001513
Will Drewrybee7ba72011-10-21 20:47:01 -05001514 /* Potentially stale pointers not used as signals. */
Luis Hector Chavez9acba452018-10-11 10:13:25 -07001515 j->preload_path = NULL;
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -04001516 j->pid_file_path = NULL;
1517 j->uidmap = NULL;
1518 j->gidmap = NULL;
Dylan Reid648b2202015-10-23 00:50:00 -07001519 j->mounts_head = NULL;
1520 j->mounts_tail = NULL;
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00001521 j->remounts_head = NULL;
1522 j->remounts_tail = NULL;
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001523 j->filter_prog = NULL;
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07001524 j->hooks_head = NULL;
1525 j->hooks_tail = NULL;
Ben Scarlatoee82b492022-08-09 18:33:25 +00001526 j->fs_rules_head = NULL;
1527 j->fs_rules_tail = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001528
Allen Webb7ae41c22021-09-16 10:23:37 -05001529 if (j->user) { /* stale pointer */
Elly Jones51a5b6c2011-10-12 19:09:26 -04001530 char *user = consumestr(&serialized, &length);
1531 if (!user)
Will Drewrybee7ba72011-10-21 20:47:01 -05001532 goto clear_pointers;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001533 j->user = strdup(user);
Will Drewrybee7ba72011-10-21 20:47:01 -05001534 if (!j->user)
1535 goto clear_pointers;
Elly Jonese1749eb2011-10-07 13:54:59 -04001536 }
Will Drewryf89aef52011-09-16 16:48:57 -05001537
Allen Webb7ae41c22021-09-16 10:23:37 -05001538 if (j->suppl_gid_list) { /* stale pointer */
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001539 if (j->suppl_gid_count > NGROUPS_MAX) {
1540 goto bad_gid_list;
1541 }
1542 size_t gid_list_size = j->suppl_gid_count * sizeof(gid_t);
1543 void *gid_list_bytes =
1544 consumebytes(gid_list_size, &serialized, &length);
1545 if (!gid_list_bytes)
1546 goto bad_gid_list;
1547
1548 j->suppl_gid_list = calloc(j->suppl_gid_count, sizeof(gid_t));
1549 if (!j->suppl_gid_list)
1550 goto bad_gid_list;
1551
1552 memcpy(j->suppl_gid_list, gid_list_bytes, gid_list_size);
1553 }
1554
Allen Webb7ae41c22021-09-16 10:23:37 -05001555 if (j->chrootdir) { /* stale pointer */
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001556 char *chrootdir = consumestr(&serialized, &length);
1557 if (!chrootdir)
Will Drewrybee7ba72011-10-21 20:47:01 -05001558 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001559 j->chrootdir = strdup(chrootdir);
Will Drewrybee7ba72011-10-21 20:47:01 -05001560 if (!j->chrootdir)
1561 goto bad_chrootdir;
Elly Jonesa8d1e1b2011-10-21 15:38:00 -04001562 }
1563
Allen Webb7ae41c22021-09-16 10:23:37 -05001564 if (j->hostname) { /* stale pointer */
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001565 char *hostname = consumestr(&serialized, &length);
1566 if (!hostname)
1567 goto bad_hostname;
1568 j->hostname = strdup(hostname);
1569 if (!j->hostname)
1570 goto bad_hostname;
1571 }
1572
Allen Webb7ae41c22021-09-16 10:23:37 -05001573 if (j->alt_syscall_table) { /* stale pointer */
Andrew Brestickereac28942015-11-11 16:04:46 -08001574 char *alt_syscall_table = consumestr(&serialized, &length);
1575 if (!alt_syscall_table)
1576 goto bad_syscall_table;
1577 j->alt_syscall_table = strdup(alt_syscall_table);
1578 if (!j->alt_syscall_table)
1579 goto bad_syscall_table;
1580 }
1581
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001582 if (j->flags.seccomp_filter && j->filter_len > 0) {
1583 size_t ninstrs = j->filter_len;
1584 if (ninstrs > (SIZE_MAX / sizeof(struct sock_filter)) ||
1585 ninstrs > USHRT_MAX)
1586 goto bad_filters;
1587
1588 size_t program_len = ninstrs * sizeof(struct sock_filter);
1589 void *program = consumebytes(program_len, &serialized, &length);
1590 if (!program)
1591 goto bad_filters;
1592
1593 j->filter_prog = malloc(sizeof(struct sock_fprog));
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001594 if (!j->filter_prog)
1595 goto bad_filters;
1596
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001597 j->filter_prog->len = ninstrs;
1598 j->filter_prog->filter = malloc(program_len);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001599 if (!j->filter_prog->filter)
1600 goto bad_filter_prog_instrs;
1601
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001602 memcpy(j->filter_prog->filter, program, program_len);
Elly Jonese1749eb2011-10-07 13:54:59 -04001603 }
Elly Jones51a5b6c2011-10-12 19:09:26 -04001604
Dylan Reid648b2202015-10-23 00:50:00 -07001605 count = j->mounts_count;
1606 j->mounts_count = 0;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001607 for (i = 0; i < count; ++i) {
Dylan Reid648b2202015-10-23 00:50:00 -07001608 unsigned long *flags;
Dylan Reid81e23972016-05-18 14:06:35 -07001609 int *has_data;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001610 const char *dest;
Dylan Reid648b2202015-10-23 00:50:00 -07001611 const char *type;
Dylan Reid81e23972016-05-18 14:06:35 -07001612 const char *data = NULL;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001613 const char *src = consumestr(&serialized, &length);
1614 if (!src)
Dylan Reid648b2202015-10-23 00:50:00 -07001615 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001616 dest = consumestr(&serialized, &length);
1617 if (!dest)
Dylan Reid648b2202015-10-23 00:50:00 -07001618 goto bad_mounts;
1619 type = consumestr(&serialized, &length);
1620 if (!type)
1621 goto bad_mounts;
Allen Webb7ae41c22021-09-16 10:23:37 -05001622 has_data =
1623 consumebytes(sizeof(*has_data), &serialized, &length);
Dylan Reid81e23972016-05-18 14:06:35 -07001624 if (!has_data)
1625 goto bad_mounts;
1626 if (*has_data) {
1627 data = consumestr(&serialized, &length);
1628 if (!data)
1629 goto bad_mounts;
1630 }
Dylan Reid648b2202015-10-23 00:50:00 -07001631 flags = consumebytes(sizeof(*flags), &serialized, &length);
1632 if (!flags)
1633 goto bad_mounts;
Dylan Reid81e23972016-05-18 14:06:35 -07001634 if (minijail_mount_with_data(j, src, dest, type, *flags, data))
Dylan Reid648b2202015-10-23 00:50:00 -07001635 goto bad_mounts;
Elly Jones51a5b6c2011-10-12 19:09:26 -04001636 }
1637
Dylan Reid605ce7f2016-01-19 19:21:00 -08001638 count = j->cgroup_count;
1639 j->cgroup_count = 0;
1640 for (i = 0; i < count; ++i) {
1641 char *cgroup = consumestr(&serialized, &length);
1642 if (!cgroup)
1643 goto bad_cgroups;
1644 j->cgroups[i] = strdup(cgroup);
1645 if (!j->cgroups[i])
1646 goto bad_cgroups;
1647 ++j->cgroup_count;
1648 }
1649
Allen Webb77383c72021-10-15 10:34:24 -07001650 if (j->seccomp_policy_path) { /* stale pointer */
1651 char *seccomp_policy_path = consumestr(&serialized, &length);
1652 if (!seccomp_policy_path)
1653 goto bad_cgroups;
1654 j->seccomp_policy_path = strdup(seccomp_policy_path);
1655 if (!j->seccomp_policy_path)
1656 goto bad_cgroups;
1657 }
1658
Elly Jonese1749eb2011-10-07 13:54:59 -04001659 return 0;
Will Drewrybee7ba72011-10-21 20:47:01 -05001660
Allen Webb77383c72021-10-15 10:34:24 -07001661 /*
1662 * If more is added after j->seccomp_policy_path, then this is needed:
1663 * if (j->seccomp_policy_path)
1664 * free(j->seccomp_policy_path);
1665 */
1666
Dylan Reid605ce7f2016-01-19 19:21:00 -08001667bad_cgroups:
Mike Frysingerac08a682017-10-10 02:04:50 -04001668 free_mounts_list(j);
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00001669 free_remounts_list(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08001670 for (i = 0; i < j->cgroup_count; ++i)
1671 free(j->cgroups[i]);
Dylan Reid648b2202015-10-23 00:50:00 -07001672bad_mounts:
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07001673 if (j->filter_prog && j->filter_prog->filter)
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08001674 free(j->filter_prog->filter);
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001675bad_filter_prog_instrs:
1676 if (j->filter_prog)
1677 free(j->filter_prog);
Will Drewrybee7ba72011-10-21 20:47:01 -05001678bad_filters:
Andrew Brestickereac28942015-11-11 16:04:46 -08001679 if (j->alt_syscall_table)
1680 free(j->alt_syscall_table);
1681bad_syscall_table:
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001682 if (j->hostname)
1683 free(j->hostname);
1684bad_hostname:
Stéphane Lesimple3cf37e12022-01-10 14:24:51 +01001685 if (j->chrootdir)
1686 free(j->chrootdir);
1687bad_chrootdir:
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001688 if (j->suppl_gid_list)
1689 free(j->suppl_gid_list);
1690bad_gid_list:
Will Drewrybee7ba72011-10-21 20:47:01 -05001691 if (j->user)
1692 free(j->user);
1693clear_pointers:
1694 j->user = NULL;
Jorge Lucangeli Obesde02a5b2015-12-11 15:28:52 -08001695 j->suppl_gid_list = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001696 j->chrootdir = NULL;
Mike Frysingerb9a7b162017-05-30 15:25:49 -04001697 j->hostname = NULL;
Andrew Brestickereac28942015-11-11 16:04:46 -08001698 j->alt_syscall_table = NULL;
Dylan Reid605ce7f2016-01-19 19:21:00 -08001699 j->cgroup_count = 0;
Allen Webb77383c72021-10-15 10:34:24 -07001700 j->seccomp_policy_path = NULL;
Will Drewrybee7ba72011-10-21 20:47:01 -05001701out:
1702 return ret;
Will Drewry2ddaad02011-09-16 11:36:08 -05001703}
1704
/* Describes one device node to create in the minimal /dev. */
struct dev_spec {
	/* Node name relative to the /dev directory. */
	const char *name;
	/* File type and permission bits handed to mknodat(2). */
	mode_t mode;
	/* Device numbers combined with makedev(). */
	dev_t major;
	dev_t minor;
};
1710
Allen Webb7ae41c22021-09-16 10:23:37 -05001711// clang-format off
Mike Frysinger33ffef32017-01-13 19:53:19 -05001712static const struct dev_spec device_nodes[] = {
Allen Webb7ae41c22021-09-16 10:23:37 -05001713 {
1714"null",
1715 S_IFCHR | 0666, 1, 3,
1716 },
1717 {
1718 "zero",
1719 S_IFCHR | 0666, 1, 5,
1720 },
1721 {
1722 "full",
1723 S_IFCHR | 0666, 1, 7,
1724 },
1725 {
1726 "urandom",
1727 S_IFCHR | 0444, 1, 9,
1728 },
1729 {
1730 "tty",
1731 S_IFCHR | 0666, 5, 0,
1732 },
Mike Frysinger33ffef32017-01-13 19:53:19 -05001733};
Allen Webb7ae41c22021-09-16 10:23:37 -05001734// clang-format on
Mike Frysinger33ffef32017-01-13 19:53:19 -05001735
/* Describes one symlink to create in the minimal /dev. */
struct dev_sym_spec {
	/* Name of the link inside /dev. */
	const char *source;
	/* Target the link points at. */
	const char *dest;
};
1739
1740static const struct dev_sym_spec device_symlinks[] = {
Allen Webb7ae41c22021-09-16 10:23:37 -05001741 {
1742 "ptmx",
1743 "pts/ptmx",
1744 },
1745 {
1746 "fd",
1747 "/proc/self/fd",
1748 },
1749 {
1750 "stdin",
1751 "fd/0",
1752 },
1753 {
1754 "stdout",
1755 "fd/1",
1756 },
1757 {
1758 "stderr",
1759 "fd/2",
1760 },
Mike Frysinger33ffef32017-01-13 19:53:19 -05001761};
1762
/*
 * Tear down the temporary /dev staging path: lazily detach whatever is
 * mounted on |dev_path|, remove the directory so empty tempdirs are not
 * leaked on error, and release the string itself. |dev_path| must not be
 * used after this call. Failures of the individual steps are ignored.
 */
static void mount_dev_cleanup(char *dev_path)
{
	(void)umount2(dev_path, MNT_DETACH);
	(void)rmdir(dev_path);

	free(dev_path);
}
1773
1774/*
1775 * Set up the pseudo /dev path at the temporary location.
1776 * See mount_dev_finalize for more details.
1777 */
1778static int mount_dev(char **dev_path_ret)
1779{
1780 int ret;
Mike Frysinger94cff172021-07-16 02:59:04 -04001781 attribute_cleanup_fd int dev_fd = -1;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001782 size_t i;
1783 mode_t mask;
1784 char *dev_path;
1785
1786 /*
1787 * Create a temp path for the /dev init. We'll relocate this to the
1788 * final location later on in the startup process.
1789 */
1790 dev_path = *dev_path_ret = strdup("/tmp/minijail.dev.XXXXXX");
1791 if (dev_path == NULL || mkdtemp(dev_path) == NULL)
1792 pdie("could not create temp path for /dev");
1793
1794 /* Set up the empty /dev mount point first. */
Allen Webb7ae41c22021-09-16 10:23:37 -05001795 ret = mount("minijail-devfs", dev_path, "tmpfs", MS_NOEXEC | MS_NOSUID,
1796 "size=5M,mode=755");
Mike Frysinger33ffef32017-01-13 19:53:19 -05001797 if (ret) {
1798 rmdir(dev_path);
1799 return ret;
1800 }
1801
1802 /* We want to set the mode directly from the spec. */
1803 mask = umask(0);
1804
1805 /* Get a handle to the temp dev path for *at funcs below. */
Allen Webb7ae41c22021-09-16 10:23:37 -05001806 dev_fd = open(dev_path, O_DIRECTORY | O_PATH | O_CLOEXEC);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001807 if (dev_fd < 0) {
1808 ret = 1;
1809 goto done;
1810 }
1811
1812 /* Create all the nodes in /dev. */
1813 for (i = 0; i < ARRAY_SIZE(device_nodes); ++i) {
1814 const struct dev_spec *ds = &device_nodes[i];
1815 ret = mknodat(dev_fd, ds->name, ds->mode,
Allen Webb7ae41c22021-09-16 10:23:37 -05001816 makedev(ds->major, ds->minor));
Mike Frysinger33ffef32017-01-13 19:53:19 -05001817 if (ret)
1818 goto done;
1819 }
1820
1821 /* Create all the symlinks in /dev. */
1822 for (i = 0; i < ARRAY_SIZE(device_symlinks); ++i) {
1823 const struct dev_sym_spec *ds = &device_symlinks[i];
1824 ret = symlinkat(ds->dest, dev_fd, ds->source);
1825 if (ret)
1826 goto done;
1827 }
1828
Mike Frysinger604cc7b2020-12-29 18:18:56 -05001829 /* Create empty dir for glibc shared mem APIs. */
1830 ret = mkdirat(dev_fd, "shm", 01777);
1831 if (ret)
1832 goto done;
1833
Mike Frysinger33ffef32017-01-13 19:53:19 -05001834 /* Restore old mask. */
Allen Webb7ae41c22021-09-16 10:23:37 -05001835done:
Mike Frysinger33ffef32017-01-13 19:53:19 -05001836 umask(mask);
1837
1838 if (ret)
1839 mount_dev_cleanup(dev_path);
1840
1841 return ret;
1842}
1843
1844/*
1845 * Relocate the temporary /dev mount to its final /dev place.
1846 * We have to do this two step process so people can bind mount extra
1847 * /dev paths like /dev/log.
1848 */
1849static int mount_dev_finalize(const struct minijail *j, char *dev_path)
1850{
1851 int ret = -1;
1852 char *dest = NULL;
1853
1854 /* Unmount the /dev mount if possible. */
1855 if (umount2("/dev", MNT_DETACH))
1856 goto done;
1857
Allen Webb7ae41c22021-09-16 10:23:37 -05001858 if (asprintf(&dest, "%s/dev", j->chrootdir ?: "") < 0)
Mike Frysinger33ffef32017-01-13 19:53:19 -05001859 goto done;
1860
1861 if (mount(dev_path, dest, NULL, MS_MOVE, NULL))
1862 goto done;
1863
1864 ret = 0;
Allen Webb7ae41c22021-09-16 10:23:37 -05001865done:
Mike Frysinger33ffef32017-01-13 19:53:19 -05001866 free(dest);
1867 mount_dev_cleanup(dev_path);
1868
1869 return ret;
1870}
1871
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08001872/*
1873 * mount_one: Applies mounts from @m for @j, recursing as needed.
Dylan Reid648b2202015-10-23 00:50:00 -07001874 * @j Minijail these mounts are for
1875 * @m Head of list of mounts
Elly Jones51a5b6c2011-10-12 19:09:26 -04001876 *
1877 * Returns 0 for success.
1878 */
Mike Frysinger33ffef32017-01-13 19:53:19 -05001879static int mount_one(const struct minijail *j, struct mountpoint *m,
1880 const char *dev_path)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001881{
Dylan Reid648b2202015-10-23 00:50:00 -07001882 int ret;
1883 char *dest;
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001884 bool do_remount = false;
Ben Scarlatof6102622022-09-05 19:31:42 +00001885 bool has_bind_flag = mount_has_bind_flag(m);
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001886 bool has_remount_flag = !!(m->flags & MS_REMOUNT);
Luis Hector Chavez0bacbf82018-07-10 20:06:55 -07001887 unsigned long original_mnt_flags = 0;
Dylan Reid648b2202015-10-23 00:50:00 -07001888
Jorge Lucangeli Obes7654c6e2019-09-09 10:45:38 -04001889 /* We assume |dest| has a leading "/". */
Mike Frysinger33ffef32017-01-13 19:53:19 -05001890 if (dev_path && strncmp("/dev/", m->dest, 5) == 0) {
Jorge Lucangeli Obes9299cae2019-08-23 11:28:39 -04001891 /*
Jorge Lucangeli Obes7654c6e2019-09-09 10:45:38 -04001892 * Since the temp path is rooted at /dev, skip that dest part.
Jorge Lucangeli Obes9299cae2019-08-23 11:28:39 -04001893 */
Mike Frysinger33ffef32017-01-13 19:53:19 -05001894 if (asprintf(&dest, "%s%s", dev_path, m->dest + 4) < 0)
1895 return -ENOMEM;
1896 } else {
Mike Frysingerac08a682017-10-10 02:04:50 -04001897 if (asprintf(&dest, "%s%s", j->chrootdir ?: "", m->dest) < 0)
Mike Frysinger33ffef32017-01-13 19:53:19 -05001898 return -ENOMEM;
1899 }
Dylan Reid648b2202015-10-23 00:50:00 -07001900
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001901 ret = setup_mount_destination(m->src, dest, j->uid, j->gid,
1902 has_bind_flag);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001903 if (ret) {
François Degrosa42182d2020-04-29 00:41:52 +10001904 warn("cannot create mount target '%s'", dest);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001905 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001906 }
Dylan Reideec77962016-06-30 19:35:10 -07001907
Dylan Reid648b2202015-10-23 00:50:00 -07001908 /*
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001909 * Remount bind mounts that:
1910 * - Come from the minijail_bind() API, and
1911 * - Add the 'ro' flag
1912 * since 'bind' and other flags can't both be specified in the same
1913 * mount(2) call.
1914 * Callers using minijail_mount() to perform bind mounts are expected to
1915 * know what they're doing and call minijail_mount() with MS_REMOUNT as
1916 * needed.
1917 * Therefore, if the caller is asking for a remount (using MS_REMOUNT),
1918 * there is no need to do an extra remount here.
Dylan Reid648b2202015-10-23 00:50:00 -07001919 */
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001920 if (has_bind_flag && strcmp(m->type, "minijail_bind") == 0 &&
1921 !has_remount_flag) {
Luis Hector Chavez0bacbf82018-07-10 20:06:55 -07001922 /*
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001923 * Grab the mount flags of the source. These are used to figure
1924 * out whether the bind mount needs to be remounted read-only.
Luis Hector Chavez0bacbf82018-07-10 20:06:55 -07001925 */
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001926 if (get_mount_flags(m->src, &original_mnt_flags)) {
1927 warn("cannot get mount flags for '%s'", m->src);
1928 goto error;
1929 }
1930
1931 if ((m->flags & MS_RDONLY) !=
1932 (original_mnt_flags & MS_RDONLY)) {
1933 do_remount = 1;
Jorge Lucangeli Obes1bcdccd2022-06-24 13:53:22 -04001934 /*
1935 * Restrict the mount flags to those that are
1936 * user-settable in a MS_REMOUNT request, but excluding
1937 * MS_RDONLY. The user-requested mount flags will
1938 * dictate whether the remount will have that flag or
1939 * not.
1940 */
1941 original_mnt_flags &=
1942 (MS_USER_SETTABLE_MASK & ~MS_RDONLY);
Jorge Lucangeli Obes1bcdccd2022-06-24 13:53:22 -04001943 }
Elly Jonesa1059632011-12-15 15:17:07 -05001944 }
Dylan Reid648b2202015-10-23 00:50:00 -07001945
Jorge Lucangeli Obes84d8d052022-08-19 21:47:30 +00001946 /*
1947 * Do a final check for symlinks in |m->src|.
1948 * |m->src| will only contain a valid path when purely bind-mounting
1949 * (but not when remounting a bind mount).
1950 *
1951 * Short of having a version of mount(2) that can take fd's, this is the
1952 * smallest we can make the TOCTOU window.
1953 */
1954 if (has_bind_flag && !has_remount_flag && !is_valid_bind_path(m->src)) {
1955 warn("src '%s' is not a valid bind mount path", m->src);
1956 goto error;
1957 }
1958
Dylan Reid81e23972016-05-18 14:06:35 -07001959 ret = mount(m->src, dest, m->type, m->flags, m->data);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001960 if (ret) {
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001961 pwarn("cannot mount '%s' as '%s' with flags %#lx", m->src, dest,
1962 m->flags);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001963 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001964 }
Dylan Reid648b2202015-10-23 00:50:00 -07001965
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04001966 /* Remount *after* the initial mount. */
1967 if (do_remount) {
Luis Hector Chavez0bacbf82018-07-10 20:06:55 -07001968 ret =
1969 mount(m->src, dest, NULL,
1970 m->flags | original_mnt_flags | MS_REMOUNT, m->data);
Mike Frysinger33ffef32017-01-13 19:53:19 -05001971 if (ret) {
François Degrosa42182d2020-04-29 00:41:52 +10001972 pwarn(
1973 "cannot bind-remount '%s' as '%s' with flags %#lx",
1974 m->src, dest,
1975 m->flags | original_mnt_flags | MS_REMOUNT);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001976 goto error;
Mike Frysinger33ffef32017-01-13 19:53:19 -05001977 }
Dylan Reid648b2202015-10-23 00:50:00 -07001978 }
1979
Elly Jones51a5b6c2011-10-12 19:09:26 -04001980 free(dest);
Dylan Reid648b2202015-10-23 00:50:00 -07001981 if (m->next)
Mike Frysinger33ffef32017-01-13 19:53:19 -05001982 return mount_one(j, m->next, dev_path);
Luis Hector Chavez8c3acbc2017-10-24 16:45:00 -07001983 return 0;
1984
1985error:
1986 free(dest);
Elly Jones51a5b6c2011-10-12 19:09:26 -04001987 return ret;
1988}
1989
Mike Frysingerac08a682017-10-10 02:04:50 -04001990static void process_mounts_or_die(const struct minijail *j)
Jorge Lucangeli Obesc8b21e12014-06-13 14:26:16 -07001991{
Mike Frysingerac08a682017-10-10 02:04:50 -04001992 /*
1993 * We have to mount /dev first in case there are bind mounts from
1994 * the original /dev into the new unique tmpfs one.
1995 */
1996 char *dev_path = NULL;
1997 if (j->flags.mount_dev && mount_dev(&dev_path))
1998 pdie("mount_dev failed");
Dylan Reid648b2202015-10-23 00:50:00 -07001999
Mike Frysingerac08a682017-10-10 02:04:50 -04002000 if (j->mounts_head && mount_one(j, j->mounts_head, dev_path)) {
Jorge Lucangeli Obes537a1c92022-07-07 18:55:54 -04002001 warn("mount_one failed with /dev at '%s'", dev_path);
2002
François Degrosa42182d2020-04-29 00:41:52 +10002003 if (dev_path)
Mike Frysingerac08a682017-10-10 02:04:50 -04002004 mount_dev_cleanup(dev_path);
François Degrosa42182d2020-04-29 00:41:52 +10002005
2006 _exit(MINIJAIL_ERR_MOUNT);
Mike Frysingerac08a682017-10-10 02:04:50 -04002007 }
Mike Frysinger33ffef32017-01-13 19:53:19 -05002008
2009 /*
Mike Frysingerac08a682017-10-10 02:04:50 -04002010 * Once all bind mounts have been processed, move the temp dev to
2011 * its final /dev home.
Mike Frysinger33ffef32017-01-13 19:53:19 -05002012 */
2013 if (j->flags.mount_dev && mount_dev_finalize(j, dev_path))
Mike Frysingerac08a682017-10-10 02:04:50 -04002014 pdie("mount_dev_finalize failed");
2015}
Elly Jones51a5b6c2011-10-12 19:09:26 -04002016
Mike Frysingerac08a682017-10-10 02:04:50 -04002017static int enter_chroot(const struct minijail *j)
2018{
Luis Hector Chavez64730af2017-09-13 13:18:59 -07002019 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
2020
Elly Jones51a5b6c2011-10-12 19:09:26 -04002021 if (chroot(j->chrootdir))
2022 return -errno;
2023
2024 if (chdir("/"))
2025 return -errno;
2026
2027 return 0;
2028}
2029
Mike Frysingerac08a682017-10-10 02:04:50 -04002030static int enter_pivot_root(const struct minijail *j)
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002031{
Mike Frysinger94cff172021-07-16 02:59:04 -04002032 attribute_cleanup_fd int oldroot = -1;
2033 attribute_cleanup_fd int newroot = -1;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002034
Luis Hector Chavez64730af2017-09-13 13:18:59 -07002035 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_CHROOT);
2036
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08002037 /*
2038 * Keep the fd for both old and new root.
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07002039 * It will be used in fchdir(2) later.
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08002040 */
Ricky Zhoubce609d2016-03-02 21:47:56 -08002041 oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002042 if (oldroot < 0)
2043 pdie("failed to open / for fchdir");
Ricky Zhoubce609d2016-03-02 21:47:56 -08002044 newroot = open(j->chrootdir, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002045 if (newroot < 0)
2046 pdie("failed to open %s for fchdir", j->chrootdir);
2047
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08002048 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07002049 * To ensure j->chrootdir is the root of a filesystem,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08002050 * do a self bind mount.
2051 */
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002052 if (mount(j->chrootdir, j->chrootdir, "bind", MS_BIND | MS_REC, ""))
2053 pdie("failed to bind mount '%s'", j->chrootdir);
2054 if (chdir(j->chrootdir))
2055 return -errno;
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002056 if (syscall(SYS_pivot_root, ".", "."))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002057 pdie("pivot_root");
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002058
2059 /*
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07002060 * Now the old root is mounted on top of the new root. Use fchdir(2) to
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002061 * change to the old root and unmount it.
2062 */
2063 if (fchdir(oldroot))
2064 pdie("failed to fchdir to old /");
Hidehiko Abe097b7192016-03-16 18:00:36 +09002065
2066 /*
Mike Frysinger785b1c32018-02-23 15:47:24 -05002067 * If skip_remount_private was enabled for minijail_enter(),
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07002068 * there could be a shared mount point under |oldroot|. In that case,
2069 * mounts under this shared mount point will be unmounted below, and
2070 * this unmounting will propagate to the original mount namespace
2071 * (because the mount point is shared). To prevent this unexpected
2072 * unmounting, remove these mounts from their peer groups by recursively
2073 * remounting them as MS_PRIVATE.
Hidehiko Abe097b7192016-03-16 18:00:36 +09002074 */
2075 if (mount(NULL, ".", NULL, MS_REC | MS_PRIVATE, NULL))
Jorge Lucangeli Obes6b0de9b2016-03-16 22:41:34 -07002076 pdie("failed to mount(/, private) before umount(/)");
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002077 /* The old root might be busy, so use lazy unmount. */
Yu-Hsi Chiange0a530e2015-09-08 18:49:49 +08002078 if (umount2(".", MNT_DETACH))
2079 pdie("umount(/)");
2080 /* Change back to the new root. */
2081 if (fchdir(newroot))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002082 return -errno;
2083 if (chroot("/"))
2084 return -errno;
Jorge Lucangeli Obes46a55092015-10-12 15:31:59 -07002085 /* Set correct CWD for getcwd(3). */
2086 if (chdir("/"))
2087 return -errno;
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002088
2089 return 0;
2090}
2091
Martin Pelikánab9eb442017-01-25 11:53:58 +11002092static int mount_tmp(const struct minijail *j)
Lee Campbell11af0622014-05-22 12:36:04 -07002093{
Martin Pelikánab9eb442017-01-25 11:53:58 +11002094 const char fmt[] = "size=%zu,mode=1777";
2095 /* Count for the user storing ULLONG_MAX literally + extra space. */
2096 char data[sizeof(fmt) + sizeof("18446744073709551615ULL")];
2097 int ret;
2098
2099 ret = snprintf(data, sizeof(data), fmt, j->tmpfs_size);
2100
2101 if (ret <= 0)
2102 pdie("tmpfs size spec error");
2103 else if ((size_t)ret >= sizeof(data))
2104 pdie("tmpfs size spec too large");
Ryan Borzellob12f5672022-08-19 22:48:06 +00002105
2106 unsigned long flags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
2107
2108 if (block_symlinks_in_noninit_mountns_tmp()) {
2109 flags |= MS_NOSYMFOLLOW;
2110 }
2111
2112 return mount("none", "/tmp", "tmpfs", flags, data);
Lee Campbell11af0622014-05-22 12:36:04 -07002113}
2114
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002115static int remount_proc_readonly(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002116{
2117 const char *kProcPath = "/proc";
2118 const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
Elly Jonesdd3e8512012-01-23 15:13:38 -05002119 /*
2120 * Right now, we're holding a reference to our parent's old mount of
Elly Jonese1749eb2011-10-07 13:54:59 -04002121 * /proc in our namespace, which means using MS_REMOUNT here would
2122 * mutate our parent's mount as well, even though we're in a VFS
Jorge Lucangeli Obesdf7fab12016-06-01 17:15:31 -07002123 * namespace (!). Instead, remove their mount from our namespace lazily
2124 * (MNT_DETACH) and make our own.
Jorge Lucangeli Obes320c4fc2020-12-10 10:38:30 -05002125 *
2126 * However, we skip this in the user namespace case because it will
2127 * invariably fail. Every mount namespace is "owned" by the
2128 * user namespace of the process that creates it. Mount namespace A is
2129 * "less privileged" than mount namespace B if A is created off of B,
2130 * and B is owned by a different user namespace.
2131 * When a less privileged mount namespace is created, the mounts used to
2132 * initialize it (coming from the more privileged mount namespace) come
2133 * as a unit, and are locked together. This means that code running in
2134 * the new mount (and user) namespace cannot piecemeal unmount
2135 * individual mounts inherited from a more privileged mount namespace.
2136 * See https://man7.org/linux/man-pages/man7/mount_namespaces.7.html,
2137 * "Restrictions on mount namespaces" for details.
2138 *
2139 * This happens in our use case because we first enter a new user
2140 * namespace (on clone(2)) and then we unshare(2) a new mount namespace,
2141 * which means the new mount namespace is less privileged than its
2142 * parent mount namespace. This would also happen if we entered a new
2143 * mount namespace on clone(2), since the user namespace is created
2144 * first.
2145 * In all other non-user-namespace cases the new mount namespace is
2146 * similarly privileged as the parent mount namespace so unmounting a
2147 * single mount is allowed.
2148 *
2149 * We still remount /proc as read-only in the user namespace case
2150 * because while a process with CAP_SYS_ADMIN in the new user namespace
2151 * can unmount the RO mount and get at the RW mount, an attacker with
2152 * access only to a write primitive will not be able to modify /proc.
Elly Jonese1749eb2011-10-07 13:54:59 -04002153 */
Jorge Lucangeli Obes320c4fc2020-12-10 10:38:30 -05002154 if (!j->flags.userns && umount2(kProcPath, MNT_DETACH))
2155 return -errno;
Mike Frysinger3ba81572017-01-17 23:33:28 -05002156 if (mount("proc", kProcPath, "proc", kSafeFlags | MS_RDONLY, ""))
Elly Jonese1749eb2011-10-07 13:54:59 -04002157 return -errno;
2158 return 0;
Elly Jonescd7a9042011-07-22 13:56:51 -04002159}
2160
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002161static void kill_child_and_die(const struct minijail *j, const char *msg)
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002162{
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002163 kill(j->initpid, SIGKILL);
2164 die("%s", msg);
Dylan Reid605ce7f2016-01-19 19:21:00 -08002165}
2166
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002167static void write_pid_file_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08002168{
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07002169 if (write_pid_to_path(j->initpid, j->pid_file_path))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002170 kill_child_and_die(j, "failed to write pid file");
Dylan Reid605ce7f2016-01-19 19:21:00 -08002171}
2172
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002173static void add_to_cgroups_or_die(const struct minijail *j)
Dylan Reid605ce7f2016-01-19 19:21:00 -08002174{
2175 size_t i;
2176
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002177 for (i = 0; i < j->cgroup_count; ++i) {
Keshav Santhanamdb6dab42016-08-10 16:33:34 -07002178 if (write_pid_to_path(j->initpid, j->cgroups[i]))
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002179 kill_child_and_die(j, "failed to add to cgroups");
2180 }
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08002181}
2182
Dylan Reid0f72ef42017-06-06 15:42:49 -07002183static void set_rlimits_or_die(const struct minijail *j)
2184{
2185 size_t i;
2186
2187 for (i = 0; i < j->rlimit_count; ++i) {
2188 struct rlimit limit;
2189 limit.rlim_cur = j->rlimits[i].cur;
2190 limit.rlim_max = j->rlimits[i].max;
2191 if (prlimit(j->initpid, j->rlimits[i].type, &limit, NULL))
2192 kill_child_and_die(j, "failed to set rlimit");
2193 }
2194}
2195
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002196static void write_ugid_maps_or_die(const struct minijail *j)
2197{
2198 if (j->uidmap && write_proc_file(j->initpid, j->uidmap, "uid_map") != 0)
2199 kill_child_and_die(j, "failed to write uid_map");
Mike Frysinger6b190c02017-01-04 17:18:42 -05002200 if (j->gidmap && j->flags.disable_setgroups) {
Jorge Lucangeli Obes93418062019-09-27 10:59:45 -04002201 /*
2202 * Older kernels might not have the /proc/<pid>/setgroups files.
2203 */
Mike Frysinger6b190c02017-01-04 17:18:42 -05002204 int ret = write_proc_file(j->initpid, "deny", "setgroups");
Mike Frysingereea841b2017-01-13 18:11:57 -05002205 if (ret != 0) {
Mike Frysinger6b190c02017-01-04 17:18:42 -05002206 if (ret == -ENOENT) {
Allen Webb7ae41c22021-09-16 10:23:37 -05002207 /*
2208 * See
2209 * http://man7.org/linux/man-pages/man7/user_namespaces.7.html.
2210 */
Mike Frysinger6b190c02017-01-04 17:18:42 -05002211 warn("could not disable setgroups(2)");
2212 } else
Jorge Lucangeli Obes93418062019-09-27 10:59:45 -04002213 kill_child_and_die(
2214 j, "failed to disable setgroups(2)");
Mike Frysinger6b190c02017-01-04 17:18:42 -05002215 }
2216 }
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002217 if (j->gidmap && write_proc_file(j->initpid, j->gidmap, "gid_map") != 0)
2218 kill_child_and_die(j, "failed to write gid_map");
2219}
2220
2221static void enter_user_namespace(const struct minijail *j)
2222{
Luis Hector Chavez71323552017-09-05 09:17:22 -07002223 int uid = j->flags.uid ? j->uid : 0;
2224 int gid = j->flags.gid ? j->gid : 0;
2225 if (j->gidmap && setresgid(gid, gid, gid)) {
2226 pdie("user_namespaces: setresgid(%d, %d, %d) failed", gid, gid,
2227 gid);
2228 }
2229 if (j->uidmap && setresuid(uid, uid, uid)) {
2230 pdie("user_namespaces: setresuid(%d, %d, %d) failed", uid, uid,
2231 uid);
2232 }
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002233}
2234
/*
 * parent_setup_complete: Closes both ends of the synchronization pipe
 * |pipe_fds| (read end first, then write end) via close_and_reset().
 */
static void parent_setup_complete(int *pipe_fds)
{
	for (size_t end = 0; end < 2; ++end)
		close_and_reset(&pipe_fds[end]);
}
2240
/*
 * wait_for_parent_setup: Called by the child process to wait for any
 * further parent-side setup to complete before continuing.
 *
 * |pipe_fds| is the synchronization pipe shared with the parent. The child
 * closes its copy of the write end and then blocks in read(2) until it sees
 * end-of-file. Receiving data or a read error (other than an interrupted
 * call, which is retried) is fatal.
 */
static void wait_for_parent_setup(int *pipe_fds)
{
	char buf;
	ssize_t nread;

	/*
	 * Drop our copy of the write end first: read(2) only reports EOF once
	 * every write end of the pipe has been closed.
	 */
	close_and_reset(&pipe_fds[1]);

	/*
	 * Wait for parent to complete setup and close the pipe. A signal
	 * handler running in this process can interrupt the blocking read;
	 * that is not a synchronization failure, so retry on EINTR.
	 */
	do {
		nread = read(pipe_fds[0], &buf, 1);
	} while (nread < 0 && errno == EINTR);

	if (nread != 0)
		die("failed to sync with parent");
	close_and_reset(&pipe_fds[0]);
}
2256
2257static void drop_ugid(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002258{
Lutz Justen13807cb2017-01-03 17:11:55 +01002259 if (j->flags.inherit_suppl_gids + j->flags.keep_suppl_gids +
Allen Webb7ae41c22021-09-16 10:23:37 -05002260 j->flags.set_suppl_gids >
2261 1) {
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -05002262 die("can only do one of inherit, keep, or set supplementary "
2263 "groups");
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08002264 }
2265
Lutz Justen13807cb2017-01-03 17:11:55 +01002266 if (j->flags.inherit_suppl_gids) {
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002267 if (initgroups(j->user, j->usergid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002268 pdie("initgroups(%s, %d) failed", j->user, j->usergid);
Lutz Justen13807cb2017-01-03 17:11:55 +01002269 } else if (j->flags.set_suppl_gids) {
2270 if (setgroups(j->suppl_gid_count, j->suppl_gid_list))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002271 pdie("setgroups(suppl_gids) failed");
Luis Hector Chavez71323552017-09-05 09:17:22 -07002272 } else if (!j->flags.keep_suppl_gids && !j->flags.disable_setgroups) {
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08002273 /*
Jorge Lucangeli Obesd16ac492015-12-03 14:44:35 -08002274 * Only attempt to clear supplementary groups if we are changing
Luis Hector Chavez71323552017-09-05 09:17:22 -07002275 * users or groups, and if the caller did not request to disable
2276 * setgroups (used when entering a user namespace as a
2277 * non-privileged user).
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08002278 */
Jorge Lucangeli Obes24499562016-12-01 11:59:27 -05002279 if ((j->flags.uid || j->flags.gid) && setgroups(0, NULL))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002280 pdie("setgroups(0, NULL) failed");
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002281 }
2282
2283 if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002284 pdie("setresgid(%d, %d, %d) failed", j->gid, j->gid, j->gid);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002285
2286 if (j->flags.uid && setresuid(j->uid, j->uid, j->uid))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002287 pdie("setresuid(%d, %d, %d) failed", j->uid, j->uid, j->uid);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002288}
2289
/*
 * drop_capbset: Removes from the capability bounding set every capability in
 * the range [0, |last_valid_cap|] whose bit is not set in |keep_mask|.
 * A failing prctl(PR_CAPBSET_DROP) is fatal.
 */
static void drop_capbset(uint64_t keep_mask, unsigned int last_valid_cap)
{
	const size_t mask_bits = sizeof(keep_mask) * 8;

	for (unsigned int cap = 0; cap < mask_bits && cap <= last_valid_cap;
	     ++cap) {
		const bool keep = ((keep_mask >> cap) & 1) != 0;

		if (!keep && prctl(PR_CAPBSET_DROP, cap))
			pdie("could not drop capability from bounding set");
	}
}
2301
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002302static void drop_caps(const struct minijail *j, unsigned int last_valid_cap)
Elly Jonese1749eb2011-10-07 13:54:59 -04002303{
Jorge Lucangeli Obes7ea269e2016-02-26 22:07:09 -08002304 if (!j->flags.use_caps)
2305 return;
2306
Elly Jonese1749eb2011-10-07 13:54:59 -04002307 cap_t caps = cap_get_proc();
Kees Cook323878a2013-02-05 15:35:24 -08002308 cap_value_t flag[1];
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04002309 const size_t ncaps = sizeof(j->caps) * 8;
Kees Cooke5609ac2013-02-06 14:12:41 -08002310 const uint64_t one = 1;
Elly Jonese1749eb2011-10-07 13:54:59 -04002311 unsigned int i;
2312 if (!caps)
2313 die("can't get process caps");
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04002314 if (cap_clear(caps))
2315 die("can't clear caps");
2316
2317 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
Kees Cook323878a2013-02-05 15:35:24 -08002318 /* Keep CAP_SETPCAP for dropping bounding set bits. */
Kees Cooke5609ac2013-02-06 14:12:41 -08002319 if (i != CAP_SETPCAP && !(j->caps & (one << i)))
Elly Jonese1749eb2011-10-07 13:54:59 -04002320 continue;
Kees Cook323878a2013-02-05 15:35:24 -08002321 flag[0] = i;
2322 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04002323 die("can't add effective cap");
Kees Cook323878a2013-02-05 15:35:24 -08002324 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04002325 die("can't add permitted cap");
Kees Cook323878a2013-02-05 15:35:24 -08002326 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_SET))
Elly Jonese1749eb2011-10-07 13:54:59 -04002327 die("can't add inheritable cap");
2328 }
2329 if (cap_set_proc(caps))
Kees Cook323878a2013-02-05 15:35:24 -08002330 die("can't apply initial cleaned capset");
2331
2332 /*
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04002333 * Instead of dropping the bounding set first, do it here in case
Kees Cook323878a2013-02-05 15:35:24 -08002334 * the caller had a more permissive bounding set which could
2335 * have been used above to raise a capability that wasn't already
2336 * present. This requires CAP_SETPCAP, so we raised/kept it above.
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04002337 *
2338 * However, if we're asked to skip setting *and* locking the
2339 * SECURE_NOROOT securebit, also skip dropping the bounding set.
2340 * If the caller wants to regain all capabilities when executing a
2341 * set-user-ID-root program, allow them to do so. The default behavior
2342 * (i.e. the behavior without |securebits_skip_mask| set) will still put
2343 * the jailed process tree in a capabilities-only environment.
2344 *
2345 * We check the negated skip mask for SECURE_NOROOT and
2346 * SECURE_NOROOT_LOCKED. If the bits are set in the negated mask they
2347 * will *not* be skipped in lock_securebits(), and therefore we should
2348 * drop the bounding set.
Kees Cook323878a2013-02-05 15:35:24 -08002349 */
Jorge Lucangeli Obes54234212018-04-26 11:52:15 -04002350 if (secure_noroot_set_and_locked(~j->securebits_skip_mask)) {
2351 drop_capbset(j->caps, last_valid_cap);
2352 } else {
2353 warn("SECURE_NOROOT not set, not dropping bounding set");
2354 }
Kees Cook323878a2013-02-05 15:35:24 -08002355
2356 /* If CAP_SETPCAP wasn't specifically requested, now we remove it. */
Kees Cooke5609ac2013-02-06 14:12:41 -08002357 if ((j->caps & (one << CAP_SETPCAP)) == 0) {
Kees Cook323878a2013-02-05 15:35:24 -08002358 flag[0] = CAP_SETPCAP;
2359 if (cap_set_flag(caps, CAP_EFFECTIVE, 1, flag, CAP_CLEAR))
2360 die("can't clear effective cap");
2361 if (cap_set_flag(caps, CAP_PERMITTED, 1, flag, CAP_CLEAR))
2362 die("can't clear permitted cap");
2363 if (cap_set_flag(caps, CAP_INHERITABLE, 1, flag, CAP_CLEAR))
2364 die("can't clear inheritable cap");
2365 }
2366
2367 if (cap_set_proc(caps))
2368 die("can't apply final cleaned capset");
2369
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04002370 /*
2371 * If ambient capabilities are supported, clear all capabilities first,
2372 * then raise the requested ones.
2373 */
2374 if (j->flags.set_ambient_caps) {
2375 if (!cap_ambient_supported()) {
2376 pdie("ambient capabilities not supported");
2377 }
Jorge Lucangeli Obesf6058c32017-04-26 10:26:59 -04002378 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_CLEAR_ALL, 0, 0, 0) !=
2379 0) {
Jorge Lucangeli Obesa6eb21a2017-04-20 10:44:00 -04002380 pdie("can't clear ambient capabilities");
2381 }
2382
2383 for (i = 0; i < ncaps && i <= last_valid_cap; ++i) {
2384 if (!(j->caps & (one << i)))
2385 continue;
2386
2387 if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE, i, 0,
2388 0) != 0) {
2389 pdie("prctl(PR_CAP_AMBIENT, "
2390 "PR_CAP_AMBIENT_RAISE, %u) failed",
2391 i);
2392 }
2393 }
2394 }
2395
Kees Cook323878a2013-02-05 15:35:24 -08002396 cap_free(caps);
Elly Jonescd7a9042011-07-22 13:56:51 -04002397}
2398
Ben Scarlatoee82b492022-08-09 18:33:25 +00002399/* Creates a ruleset for current inodes then calls landlock_restrict_self(). */
Ben Scarlato585baa12022-07-22 06:04:20 +00002400static void apply_landlock_restrictions(const struct minijail *j)
2401{
Ben Scarlatoee82b492022-08-09 18:33:25 +00002402 struct fs_rule *r;
2403 attribute_cleanup_fd int ruleset_fd = -1;
2404
2405 r = j->fs_rules_head;
2406 while (r) {
2407 if (ruleset_fd < 0) {
2408 struct minijail_landlock_ruleset_attr ruleset_attr = {
2409 .handled_access_fs = HANDLED_ACCESS_TYPES
2410 };
2411 ruleset_fd = landlock_create_ruleset(
2412 &ruleset_attr, sizeof(ruleset_attr), 0);
2413 if (ruleset_fd < 0) {
2414 const int err = errno;
2415 pwarn("Failed to create a ruleset");
2416 switch (err) {
2417 case ENOSYS:
2418 pwarn("Landlock is not supported by the current kernel");
2419 break;
2420 case EOPNOTSUPP:
2421 pwarn("Landlock is currently disabled by kernel config");
2422 break;
2423 }
2424 return;
2425 }
2426 }
2427 populate_ruleset_internal(r->path, ruleset_fd, r->landlock_flags);
2428 r = r->next;
2429 }
2430
2431 if (ruleset_fd >= 0) {
2432 if (landlock_restrict_self(ruleset_fd, 0)) {
Ben Scarlato585baa12022-07-22 06:04:20 +00002433 pdie("Failed to enforce ruleset");
2434 }
2435 }
2436}
2437
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04002438static void set_seccomp_filter(const struct minijail *j)
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002439{
2440 /*
2441 * Set no_new_privs. See </kernel/seccomp.c> and </kernel/sys.c>
2442 * in the kernel source tree for an explanation of the parameters.
2443 */
2444 if (j->flags.no_new_privs) {
2445 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
2446 pdie("prctl(PR_SET_NO_NEW_PRIVS)");
2447 }
2448
2449 /*
Jorge Lucangeli Obes2413f372016-04-06 18:43:10 -07002450 * Code running with ASan
2451 * (https://github.com/google/sanitizers/wiki/AddressSanitizer)
2452 * will make system calls not included in the syscall filter policy,
2453 * which will likely crash the program. Skip setting seccomp filter in
2454 * that case.
2455 * 'running_with_asan()' has no inputs and is completely defined at
2456 * build time, so this cannot be used by an attacker to skip setting
2457 * seccomp filter.
2458 */
Evgenii Stepanov3d98f3c2018-08-23 15:06:50 -07002459 if (j->flags.seccomp_filter && running_with_asan()) {
Evgenii Stepanov825828c2018-07-27 11:57:07 -07002460 warn("running with (HW)ASan, not setting seccomp filter");
Jorge Lucangeli Obes2413f372016-04-06 18:43:10 -07002461 return;
2462 }
2463
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04002464 if (j->flags.seccomp_filter) {
Adrian Ratiu8ef61252021-06-08 03:46:24 +03002465 if (seccomp_is_logging_allowed(j)) {
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04002466 warn("logging seccomp filter failures");
Jorge Lucangeli Obes32201f82019-06-12 14:45:06 -04002467 if (!seccomp_ret_log_available()) {
2468 /*
2469 * If SECCOMP_RET_LOG is not available,
2470 * install the SIGSYS handler first.
2471 */
2472 if (install_sigsys_handler())
2473 pdie(
2474 "failed to install SIGSYS handler");
2475 }
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04002476 } else if (j->flags.seccomp_filter_tsync) {
2477 /*
2478 * If setting thread sync,
2479 * reset the SIGSYS signal handler so that
2480 * the entire thread group is killed.
2481 */
2482 if (signal(SIGSYS, SIG_DFL) == SIG_ERR)
2483 pdie("failed to reset SIGSYS disposition");
Jorge Lucangeli Obes713f6fb2016-10-03 13:03:25 -04002484 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002485 }
2486
2487 /*
2488 * Install the syscall filter.
2489 */
2490 if (j->flags.seccomp_filter) {
Anand K Mistry31adc6c2020-11-26 11:39:46 +11002491 if (j->flags.seccomp_filter_tsync ||
2492 j->flags.seccomp_filter_allow_speculation) {
2493 int filter_flags =
2494 (j->flags.seccomp_filter_tsync
2495 ? SECCOMP_FILTER_FLAG_TSYNC
2496 : 0) |
2497 (j->flags.seccomp_filter_allow_speculation
2498 ? SECCOMP_FILTER_FLAG_SPEC_ALLOW
2499 : 0);
2500 if (sys_seccomp(SECCOMP_SET_MODE_FILTER, filter_flags,
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04002501 j->filter_prog)) {
2502 pdie("seccomp(tsync) failed");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002503 }
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04002504 } else {
2505 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
2506 j->filter_prog)) {
2507 pdie("prctl(seccomp_filter) failed");
2508 }
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002509 }
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002510 }
2511}
2512
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002513static pid_t forward_pid = -1;
2514
Allen Webb7ae41c22021-09-16 10:23:37 -05002515static void forward_signal(int sig, siginfo_t *siginfo attribute_unused,
Mike Frysingerd9ef07c2018-05-30 16:51:36 -04002516 void *void_context attribute_unused)
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002517{
2518 if (forward_pid != -1) {
Mike Frysinger33d051a2018-05-30 16:41:10 -04002519 kill(forward_pid, sig);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002520 }
2521}
2522
2523static void install_signal_handlers(void)
2524{
2525 struct sigaction act;
2526
2527 memset(&act, 0, sizeof(act));
2528 act.sa_sigaction = &forward_signal;
2529 act.sa_flags = SA_SIGINFO | SA_RESTART;
2530
2531 /* Handle all signals, except SIGCHLD. */
Mike Frysinger33d051a2018-05-30 16:41:10 -04002532 for (int sig = 1; sig < NSIG; sig++) {
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002533 /*
2534 * We don't care if we get EINVAL: that just means that we
2535 * can't handle this signal, so let's skip it and continue.
2536 */
Mike Frysinger33d051a2018-05-30 16:41:10 -04002537 sigaction(sig, &act, NULL);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002538 }
2539 /* Reset SIGCHLD's handler. */
2540 signal(SIGCHLD, SIG_DFL);
2541
2542 /* Handle real-time signals. */
Mike Frysinger33d051a2018-05-30 16:41:10 -04002543 for (int sig = SIGRTMIN; sig <= SIGRTMAX; sig++) {
2544 sigaction(sig, &act, NULL);
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04002545 }
2546}
2547
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002548static const char *lookup_hook_name(minijail_hook_event_t event)
2549{
2550 switch (event) {
2551 case MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS:
2552 return "pre-drop-caps";
2553 case MINIJAIL_HOOK_EVENT_PRE_EXECVE:
2554 return "pre-execve";
Luis Hector Chavez64730af2017-09-13 13:18:59 -07002555 case MINIJAIL_HOOK_EVENT_PRE_CHROOT:
2556 return "pre-chroot";
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002557 case MINIJAIL_HOOK_EVENT_MAX:
2558 /*
2559 * Adding this in favor of a default case to force the
2560 * compiler to error out if a new enum value is added.
2561 */
2562 break;
2563 }
2564 return "unknown";
2565}
2566
2567static void run_hooks_or_die(const struct minijail *j,
2568 minijail_hook_event_t event)
2569{
2570 int rc;
2571 int hook_index = 0;
2572 for (struct hook *c = j->hooks_head; c; c = c->next) {
2573 if (c->event != event)
2574 continue;
2575 rc = c->hook(c->payload);
2576 if (rc != 0) {
2577 errno = -rc;
2578 pdie("%s hook (index %d) failed",
2579 lookup_hook_name(event), hook_index);
2580 }
2581 /* Only increase the index within the same hook event type. */
2582 ++hook_index;
2583 }
2584}
2585
Will Drewry6ac91122011-10-21 16:38:58 -05002586void API minijail_enter(const struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002587{
Dylan Reidf682d472015-09-17 21:39:07 -07002588 /*
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08002589 * If we're dropping caps, get the last valid cap from /proc now,
2590 * since /proc can be unmounted before drop_caps() is called.
Dylan Reidf682d472015-09-17 21:39:07 -07002591 */
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08002592 unsigned int last_valid_cap = 0;
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08002593 if (j->flags.capbset_drop || j->flags.use_caps)
Jorge Lucangeli Obes43e29b32015-12-08 21:07:14 -08002594 last_valid_cap = get_last_valid_cap();
Dylan Reidf682d472015-09-17 21:39:07 -07002595
Elly Jonese1749eb2011-10-07 13:54:59 -04002596 if (j->flags.pids)
2597 die("tried to enter a pid-namespaced jail;"
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07002598 " try minijail_run()?");
Elly Jonescd7a9042011-07-22 13:56:51 -04002599
Lutz Justen13807cb2017-01-03 17:11:55 +01002600 if (j->flags.inherit_suppl_gids && !j->user)
Jorge Lucangeli Obes34543192017-01-11 16:07:57 -05002601 die("cannot inherit supplementary groups without setting a "
2602 "username");
Elly Jonescd7a9042011-07-22 13:56:51 -04002603
Elly Jonesdd3e8512012-01-23 15:13:38 -05002604 /*
2605 * We can't recover from failures if we've dropped privileges partially,
Elly Jonese1749eb2011-10-07 13:54:59 -04002606 * so we don't even try. If any of our operations fail, we abort() the
2607 * entire process.
2608 */
Mike Frysinger902a4492018-12-27 05:22:56 -05002609 if (j->flags.enter_vfs) {
2610 if (setns(j->mountns_fd, CLONE_NEWNS))
2611 pdie("setns(CLONE_NEWNS) failed");
2612 close(j->mountns_fd);
2613 }
Jorge Lucangeli Obes1563b5b2014-07-10 07:01:53 -07002614
Jorge Lucangeli Obes805be392015-10-12 15:55:59 -07002615 if (j->flags.vfs) {
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08002616 if (unshare(CLONE_NEWNS))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002617 pdie("unshare(CLONE_NEWNS) failed");
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08002618 /*
Mike Frysinger785b1c32018-02-23 15:47:24 -05002619 * By default, remount all filesystems as private, unless
Jorge Lucangeli Obes93418062019-09-27 10:59:45 -04002620 * - Passed a specific remount mode, in which case remount with
2621 * that,
2622 * - Asked not to remount at all, in which case skip the
2623 * mount(2) call.
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08002624 * https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt
2625 */
Mike Frysinger785b1c32018-02-23 15:47:24 -05002626 if (j->remount_mode) {
Jorge Lucangeli Obes93418062019-09-27 10:59:45 -04002627 if (mount(NULL, "/", NULL, MS_REC | j->remount_mode,
2628 NULL))
Jorge Lucangeli Obes9e1ac372020-01-23 14:36:50 -05002629 pdie("mount(NULL, /, NULL, "
2630 "MS_REC | j->remount_mode, NULL) failed");
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00002631
2632 struct minijail_remount *temp = j->remounts_head;
2633 while (temp) {
Nicole Anderson-Aue119bbb2021-02-04 23:12:12 +00002634 if (temp->remount_mode < j->remount_mode)
2635 die("cannot remount %s as stricter "
2636 "than the root dir",
2637 temp->mount_name);
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00002638 if (mount(NULL, temp->mount_name, NULL,
2639 MS_REC | temp->remount_mode, NULL))
2640 pdie("mount(NULL, %s, NULL, "
Allen Webb7ae41c22021-09-16 10:23:37 -05002641 "MS_REC | temp->remount_mode, "
2642 "NULL) failed",
2643 temp->mount_name);
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00002644 temp = temp->next;
2645 }
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08002646 }
Jorge Lucangeli Obesf7a38682015-12-04 15:43:30 -08002647 }
Elly Fong-Jones6c086302013-03-20 17:15:28 -04002648
Dylan Reidf7942472015-11-18 17:55:26 -08002649 if (j->flags.ipc && unshare(CLONE_NEWIPC)) {
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002650 pdie("unshare(CLONE_NEWIPC) failed");
Dylan Reidf7942472015-11-18 17:55:26 -08002651 }
2652
Mike Frysingerb9a7b162017-05-30 15:25:49 -04002653 if (j->flags.uts) {
2654 if (unshare(CLONE_NEWUTS))
2655 pdie("unshare(CLONE_NEWUTS) failed");
2656
Allen Webb7ae41c22021-09-16 10:23:37 -05002657 if (j->hostname &&
2658 sethostname(j->hostname, strlen(j->hostname)))
Mike Frysingerb9a7b162017-05-30 15:25:49 -04002659 pdie("sethostname(%s) failed", j->hostname);
2660 }
2661
Dylan Reid1102f5a2015-09-15 11:52:20 -07002662 if (j->flags.enter_net) {
2663 if (setns(j->netns_fd, CLONE_NEWNET))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002664 pdie("setns(CLONE_NEWNET) failed");
Mike Frysinger902a4492018-12-27 05:22:56 -05002665 close(j->netns_fd);
Mike Frysinger7559dfe2016-11-15 18:58:39 -05002666 } else if (j->flags.net) {
2667 if (unshare(CLONE_NEWNET))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002668 pdie("unshare(CLONE_NEWNET) failed");
2669 config_net_loopback();
Dylan Reid1102f5a2015-09-15 11:52:20 -07002670 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002671
Dylan Reid4cbc2a52016-06-17 19:06:07 -07002672 if (j->flags.ns_cgroups && unshare(CLONE_NEWCGROUP))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002673 pdie("unshare(CLONE_NEWCGROUP) failed");
Dylan Reid4cbc2a52016-06-17 19:06:07 -07002674
Chirantan Ekbote866bb3a2017-02-07 12:26:42 -08002675 if (j->flags.new_session_keyring) {
2676 if (syscall(SYS_keyctl, KEYCTL_JOIN_SESSION_KEYRING, NULL) < 0)
2677 pdie("keyctl(KEYCTL_JOIN_SESSION_KEYRING) failed");
2678 }
2679
Mike Frysingerac08a682017-10-10 02:04:50 -04002680 /* We have to process all the mounts before we chroot/pivot_root. */
2681 process_mounts_or_die(j);
Elly Jones51a5b6c2011-10-12 19:09:26 -04002682
Mike Frysingerac08a682017-10-10 02:04:50 -04002683 if (j->flags.chroot && enter_chroot(j))
Mike Frysinger33ffef32017-01-13 19:53:19 -05002684 pdie("chroot");
Mike Frysinger33ffef32017-01-13 19:53:19 -05002685
Mike Frysingerac08a682017-10-10 02:04:50 -04002686 if (j->flags.pivot_root && enter_pivot_root(j))
Yu-Hsi Chiang64d65a72015-08-13 17:43:27 +08002687 pdie("pivot_root");
2688
Martin Pelikánab9eb442017-01-25 11:53:58 +11002689 if (j->flags.mount_tmp && mount_tmp(j))
Lee Campbell11af0622014-05-22 12:36:04 -07002690 pdie("mount_tmp");
2691
Dylan Reid791f5772015-09-14 20:02:42 -07002692 if (j->flags.remount_proc_ro && remount_proc_readonly(j))
Elly Jonese1749eb2011-10-07 13:54:59 -04002693 pdie("remount");
Elly Jonescd7a9042011-07-22 13:56:51 -04002694
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07002695 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_DROP_CAPS);
2696
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08002697 /*
2698 * If we're only dropping capabilities from the bounding set, but not
2699 * from the thread's (permitted|inheritable|effective) sets, do it now.
2700 */
2701 if (j->flags.capbset_drop) {
2702 drop_capbset(j->cap_bset, last_valid_cap);
2703 }
2704
Luis Hector Chavez89cbc322018-08-06 11:31:15 -07002705 /*
Mattias Nissler48b5ff12018-10-11 15:31:41 +02002706 * POSIX capabilities are a bit tricky. We must set SECBIT_KEEP_CAPS
2707 * before drop_ugid() below as the latter would otherwise drop all
2708 * capabilities.
Luis Hector Chavez89cbc322018-08-06 11:31:15 -07002709 */
Jorge Lucangeli Obesf9fcdbe2016-02-19 15:04:09 -08002710 if (j->flags.use_caps) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002711 /*
Mattias Nissler48b5ff12018-10-11 15:31:41 +02002712 * When using ambient capabilities, CAP_SET{GID,UID} can be
2713 * inherited across execve(2), so SECBIT_KEEP_CAPS is not
2714 * strictly needed.
Elly Jonese1749eb2011-10-07 13:54:59 -04002715 */
Mattias Nissler48b5ff12018-10-11 15:31:41 +02002716 bool require_keep_caps = !j->flags.set_ambient_caps;
2717 if (lock_securebits(j->securebits_skip_mask,
2718 require_keep_caps) < 0) {
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -04002719 pdie("locking securebits failed");
Jorge Lucangeli Obesf783b522016-03-14 14:34:10 -07002720 }
Elly Jonese1749eb2011-10-07 13:54:59 -04002721 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002722
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07002723 if (j->flags.no_new_privs) {
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002724 /*
2725 * If we're setting no_new_privs, we can drop privileges
2726 * before setting seccomp filter. This way filter policies
2727 * don't need to allow privilege-dropping syscalls.
2728 */
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002729 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002730 drop_caps(j, last_valid_cap);
Ben Scarlato585baa12022-07-22 06:04:20 +00002731
2732 // Landlock is applied as late as possible. If no_new_privs is
2733 // set, then it can be applied after dropping caps.
2734 apply_landlock_restrictions(j);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002735 set_seccomp_filter(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04002736 } else {
Ben Scarlato585baa12022-07-22 06:04:20 +00002737 apply_landlock_restrictions(j);
2738
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002739 /*
2740 * If we're not setting no_new_privs,
2741 * we need to set seccomp filter *before* dropping privileges.
2742 * WARNING: this means that filter policies *must* allow
2743 * setgroups()/setresgid()/setresuid() for dropping root and
2744 * capget()/capset()/prctl() for dropping caps.
2745 */
2746 set_seccomp_filter(j);
Jorge Lucangeli Obes6201cf52012-08-23 11:42:27 -07002747 drop_ugid(j);
Jorge Lucangeli Obesd8c82052016-02-25 16:00:32 -08002748 drop_caps(j, last_valid_cap);
Elly Jonese1749eb2011-10-07 13:54:59 -04002749 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002750
Elly Jonesdd3e8512012-01-23 15:13:38 -05002751 /*
Andrew Brestickereac28942015-11-11 16:04:46 -08002752 * Select the specified alternate syscall table. The table must not
2753 * block prctl(2) if we're using seccomp as well.
2754 */
2755 if (j->flags.alt_syscall) {
2756 if (prctl(PR_ALT_SYSCALL, 1, j->alt_syscall_table))
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002757 pdie("prctl(PR_ALT_SYSCALL) failed");
Andrew Brestickereac28942015-11-11 16:04:46 -08002758 }
2759
2760 /*
Elly Jonesdd3e8512012-01-23 15:13:38 -05002761 * seccomp has to come last since it cuts off all the other
Elly Jonese1749eb2011-10-07 13:54:59 -04002762 * privilege-dropping syscalls :)
2763 */
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002764 if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1)) {
Jorge Lucangeli Obes7b2e29c2016-08-04 12:21:03 -04002765 if ((errno == EINVAL) && seccomp_can_softfail()) {
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002766 warn("seccomp not supported");
2767 return;
2768 }
Jorge Lucangeli Obes457a5e32016-11-23 15:18:56 -05002769 pdie("prctl(PR_SET_SECCOMP) failed");
Utkarsh Sanghi0ef8a662014-08-18 15:50:11 -07002770 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002771}
2772
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002773/* TODO(wad): will visibility affect this variable? */
Elly Jonescd7a9042011-07-22 13:56:51 -04002774static int init_exitstatus = 0;
2775
Mike Frysinger0a27ab02020-09-04 16:18:12 -04002776static void init_term(int sig attribute_unused)
Elly Jonese1749eb2011-10-07 13:54:59 -04002777{
2778 _exit(init_exitstatus);
Elly Jonescd7a9042011-07-22 13:56:51 -04002779}
2780
Mike Frysinger0a27ab02020-09-04 16:18:12 -04002781static void init(pid_t rootpid)
Elly Jonese1749eb2011-10-07 13:54:59 -04002782{
2783 pid_t pid;
2784 int status;
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002785 /* So that we exit with the right status. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002786 signal(SIGTERM, init_term);
Jorge Lucangeli Obesdb0bc672016-08-03 10:45:21 -04002787 /* TODO(wad): self jail with seccomp filters here. */
Elly Jonese1749eb2011-10-07 13:54:59 -04002788 while ((pid = wait(&status)) > 0) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05002789 /*
2790 * This loop will only end when either there are no processes
Elly Jonese1749eb2011-10-07 13:54:59 -04002791 * left inside our pid namespace or we get a signal.
2792 */
2793 if (pid == rootpid)
2794 init_exitstatus = status;
2795 }
2796 if (!WIFEXITED(init_exitstatus))
2797 _exit(MINIJAIL_ERR_INIT);
2798 _exit(WEXITSTATUS(init_exitstatus));
Elly Jonescd7a9042011-07-22 13:56:51 -04002799}
2800
Will Drewry6ac91122011-10-21 16:38:58 -05002801int API minijail_from_fd(int fd, struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04002802{
2803 size_t sz = 0;
2804 size_t bytes = read(fd, &sz, sizeof(sz));
Mike Frysingere933fce2021-10-02 01:28:06 -04002805 attribute_cleanup_str char *buf = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04002806 int r;
2807 if (sizeof(sz) != bytes)
2808 return -EINVAL;
Allen Webb7ae41c22021-09-16 10:23:37 -05002809 if (sz > USHRT_MAX) /* arbitrary check */
Elly Jonese1749eb2011-10-07 13:54:59 -04002810 return -E2BIG;
2811 buf = malloc(sz);
2812 if (!buf)
2813 return -ENOMEM;
2814 bytes = read(fd, buf, sz);
Mike Frysingere933fce2021-10-02 01:28:06 -04002815 if (bytes != sz)
Elly Jonese1749eb2011-10-07 13:54:59 -04002816 return -EINVAL;
Elly Jonese1749eb2011-10-07 13:54:59 -04002817 r = minijail_unmarshal(j, buf, sz);
Elly Jonese1749eb2011-10-07 13:54:59 -04002818 return r;
Will Drewry2f54b6a2011-09-16 13:45:31 -05002819}
2820
Will Drewry6ac91122011-10-21 16:38:58 -05002821int API minijail_to_fd(struct minijail *j, int fd)
Elly Jonese1749eb2011-10-07 13:54:59 -04002822{
Elly Jonese1749eb2011-10-07 13:54:59 -04002823 size_t sz = minijail_size(j);
Elly Jonese1749eb2011-10-07 13:54:59 -04002824 if (!sz)
2825 return -EINVAL;
François Degros664eba72019-11-05 13:18:24 +11002826
Mike Frysingere933fce2021-10-02 01:28:06 -04002827 attribute_cleanup_str char *buf = malloc(sz);
François Degros664eba72019-11-05 13:18:24 +11002828 if (!buf)
2829 return -ENOMEM;
2830
2831 int err = minijail_marshal(j, buf, sz);
2832 if (err)
Mike Frysingere933fce2021-10-02 01:28:06 -04002833 return err;
François Degros664eba72019-11-05 13:18:24 +11002834
Elly Jonese1749eb2011-10-07 13:54:59 -04002835 /* Sends [size][minijail]. */
François Degros664eba72019-11-05 13:18:24 +11002836 err = write_exactly(fd, &sz, sizeof(sz));
2837 if (err)
Mike Frysingere933fce2021-10-02 01:28:06 -04002838 return err;
François Degros664eba72019-11-05 13:18:24 +11002839
Mike Frysingere933fce2021-10-02 01:28:06 -04002840 return write_exactly(fd, buf, sz);
Will Drewry2f54b6a2011-09-16 13:45:31 -05002841}
Elly Jonescd7a9042011-07-22 13:56:51 -04002842
Dylan Reid6dc224f2021-05-12 17:06:25 -07002843int API minijail_copy_jail(const struct minijail *from, struct minijail *out)
2844{
2845 size_t sz = minijail_size(from);
2846 if (!sz)
2847 return -EINVAL;
2848
Mike Frysingere933fce2021-10-02 01:28:06 -04002849 attribute_cleanup_str char *buf = malloc(sz);
Dylan Reid6dc224f2021-05-12 17:06:25 -07002850 if (!buf)
2851 return -ENOMEM;
2852
2853 int err = minijail_marshal(from, buf, sz);
2854 if (err)
Mike Frysingere933fce2021-10-02 01:28:06 -04002855 return err;
Dylan Reid6dc224f2021-05-12 17:06:25 -07002856
Mike Frysingere933fce2021-10-02 01:28:06 -04002857 return minijail_unmarshal(out, buf, sz);
Dylan Reid6dc224f2021-05-12 17:06:25 -07002858}
2859
Luis Hector Chavez9acba452018-10-11 10:13:25 -07002860static int setup_preload(const struct minijail *j attribute_unused,
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01002861 char ***child_env attribute_unused)
Elly Jonese1749eb2011-10-07 13:54:59 -04002862{
Daniel Erat5b7a3182015-08-19 16:06:22 -06002863#if defined(__ANDROID__)
Jorge Lucangeli Obes0b208772017-04-19 14:15:46 -04002864 /* Don't use LDPRELOAD on Android. */
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07002865 return 0;
2866#else
Luis Hector Chavez9acba452018-10-11 10:13:25 -07002867 const char *preload_path = j->preload_path ?: PRELOADPATH;
2868 char *newenv = NULL;
Luis Hector Chavezd1d24d22019-02-11 17:59:21 -08002869 int ret = 0;
Stéphane Lesimplef65da3a2022-01-11 11:44:47 +01002870 const char *oldenv = minijail_getenv(*child_env, kLdPreloadEnvVar);
Luis Hector Chavez9acba452018-10-11 10:13:25 -07002871
2872 if (!oldenv)
2873 oldenv = "";
Elly Jonescd7a9042011-07-22 13:56:51 -04002874
Elly Jonese1749eb2011-10-07 13:54:59 -04002875 /* Only insert a separating space if we have something to separate... */
Luis Hector Chavezd1d24d22019-02-11 17:59:21 -08002876 if (asprintf(&newenv, "%s%s%s", oldenv, oldenv[0] != '\0' ? " " : "",
2877 preload_path) < 0) {
2878 return -1;
Luis Hector Chavez9acba452018-10-11 10:13:25 -07002879 }
Elly Jonescd7a9042011-07-22 13:56:51 -04002880
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01002881 ret = minijail_setenv(child_env, kLdPreloadEnvVar, newenv, 1);
Luis Hector Chavezd1d24d22019-02-11 17:59:21 -08002882 free(newenv);
2883 return ret;
Jorge Lucangeli Obesa21c8fc2015-07-15 16:22:34 -07002884#endif
Elly Jonescd7a9042011-07-22 13:56:51 -04002885}
2886
Allen Webb77383c72021-10-15 10:34:24 -07002887/*
2888 * This is for logging purposes and does not change the enforced seccomp
2889 * filter.
2890 */
2891static int setup_seccomp_policy_path(const struct minijail *j,
2892 char ***child_env)
2893{
2894 return minijail_setenv(child_env, kSeccompPolicyPathEnvVar,
2895 j->seccomp_policy_path ? j->seccomp_policy_path
2896 : "NO-LABEL",
2897 1 /* overwrite */);
2898}
2899
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01002900static int setup_pipe(char ***child_env, int fds[2])
Elly Jonese1749eb2011-10-07 13:54:59 -04002901{
2902 int r = pipe(fds);
2903 char fd_buf[11];
2904 if (r)
2905 return r;
2906 r = snprintf(fd_buf, sizeof(fd_buf), "%d", fds[0]);
2907 if (r <= 0)
2908 return -EINVAL;
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01002909 return minijail_setenv(child_env, kFdEnvVar, fd_buf, 1);
Will Drewryf89aef52011-09-16 16:48:57 -05002910}
2911
/*
 * Close every file descriptor listed in /proc/self/fd except those named in
 * |inheritable_fds| (an array of |size| entries) and the descriptor backing
 * the directory stream used for the scan.
 *
 * Returns -1 if /proc/self/fd cannot be opened, 0 otherwise. Failures of
 * individual close() calls are not reported.
 */
static int close_open_fds(int *inheritable_fds, size_t size)
{
	static const char fd_dir_path[] = "/proc/self/fd";

	DIR *fd_dir = opendir(fd_dir_path);
	if (!fd_dir)
		return -1;

	const int scan_fd = dirfd(fd_dir);
	for (struct dirent *ent = readdir(fd_dir); ent != NULL;
	     ent = readdir(fd_dir)) {
		char *tail;
		const int fd = strtol(ent->d_name, &tail, 10);

		/* Ignore entries that are not plain numbers ("." and ".."). */
		if (*tail != '\0')
			continue;

		/* The directory stream's own fd must survive the scan. */
		if (fd == scan_fd)
			continue;

		/*
		 * We might have set up some pipes that we want to share with
		 * the parent process; those must stay open.
		 */
		bool keep = false;
		for (size_t k = 0; k < size && !keep; ++k)
			keep = (inheritable_fds[k] == fd);

		if (!keep)
			close(fd);
	}

	closedir(fd_dir);
	return 0;
}
2948
/*
 * Return non-zero if |fd| is already open.
 *
 * A descriptor is only reported as closed when fcntl(F_GETFD) fails with
 * EBADF; any other failure is treated as "open".
 */
static int fd_is_open(int fd)
{
	if (fcntl(fd, F_GETFD) != -1)
		return 1;

	return errno != EBADF;
}
2954
2955static_assert(FD_SETSIZE >= MAX_PRESERVED_FDS * 2 - 1,
2956 "If true, ensure_no_fd_conflict will always find an unused fd.");
2957
Allen Webb66417bd2021-07-16 15:07:24 -07002958/* If parent_fd will be used by a child fd, move it to an unused fd. */
Allen Webb7ae41c22021-09-16 10:23:37 -05002959static int ensure_no_fd_conflict(const fd_set *child_fds, int child_fd,
2960 int *parent_fd)
Allen Webbc7182682021-04-16 09:44:53 -05002961{
Allen Webb66417bd2021-07-16 15:07:24 -07002962 if (!FD_ISSET(*parent_fd, child_fds)) {
Allen Webbc7182682021-04-16 09:44:53 -05002963 return 0;
2964 }
2965
2966 /*
2967 * If no other parent_fd matches the child_fd then use it instead of a
2968 * temporary.
2969 */
Allen Webb66417bd2021-07-16 15:07:24 -07002970 int fd = child_fd;
2971 if (fd == -1 || fd_is_open(fd)) {
Allen Webbc7182682021-04-16 09:44:53 -05002972 fd = FD_SETSIZE - 1;
2973 while (FD_ISSET(fd, child_fds) || fd_is_open(fd)) {
2974 --fd;
2975 if (fd < 0) {
2976 die("failed to find an unused fd");
2977 }
2978 }
2979 }
2980
Allen Webb66417bd2021-07-16 15:07:24 -07002981 int ret = dup2(*parent_fd, fd);
Allen Webbc7182682021-04-16 09:44:53 -05002982 /*
2983 * warn() opens a file descriptor so it needs to happen after dup2 to
2984 * avoid unintended side effects. This can be avoided by reordering the
2985 * mapping requests so that the source fds with overlap are mapped
2986 * first (unless there are cycles).
2987 */
Allen Webb66417bd2021-07-16 15:07:24 -07002988 warn("mapped fd overlap: moving %d to %d", *parent_fd, fd);
Allen Webbc7182682021-04-16 09:44:53 -05002989 if (ret == -1) {
2990 return -1;
2991 }
2992
Allen Webb66417bd2021-07-16 15:07:24 -07002993 *parent_fd = fd;
Allen Webbc7182682021-04-16 09:44:53 -05002994 return 0;
2995}
2996
Allen Webb66417bd2021-07-16 15:07:24 -07002997/*
Allen Webb05af7762021-07-16 12:56:44 -05002998 * Populate child_fds_out with the set of file descriptors that will be replaced
2999 * by redirect_fds().
3000 *
3001 * NOTE: This creates temporaries for parent file descriptors that would
3002 * otherwise be overwritten during redirect_fds().
3003 */
Allen Webb7ae41c22021-09-16 10:23:37 -05003004static int get_child_fds(struct minijail *j, fd_set *child_fds_out)
3005{
Allen Webb05af7762021-07-16 12:56:44 -05003006 /* Relocate parent_fds that would be replaced by a child_fd. */
3007 for (size_t i = 0; i < j->preserved_fd_count; i++) {
3008 int child_fd = j->preserved_fds[i].child_fd;
3009 if (FD_ISSET(child_fd, child_fds_out)) {
3010 die("fd %d is mapped more than once", child_fd);
3011 }
3012
3013 int *parent_fd = &j->preserved_fds[i].parent_fd;
Allen Webb7ae41c22021-09-16 10:23:37 -05003014 if (ensure_no_fd_conflict(child_fds_out, child_fd, parent_fd) ==
3015 -1) {
Allen Webb05af7762021-07-16 12:56:44 -05003016 return -1;
3017 }
3018
3019 FD_SET(child_fd, child_fds_out);
3020 }
3021 return 0;
3022}
3023
/*
 * Structure holding resources and state created when running a minijail.
 *
 * Each *_fds member is a pipe(2)-style pair. avoid_pipe_conflicts() treats an
 * entry of -1 as "no descriptor", and minijail_free_run_state() closes and
 * resets every entry.
 */
struct minijail_run_state {
	/* Pid of the forked/cloned child, as seen by the parent. */
	pid_t child_pid;
	/* Pipe used to send the marshalled minijail to a preloaded child. */
	int pipe_fds[2];
	/* Pipes created for the child's stdin/stdout/stderr on request. */
	int stdin_fds[2];
	int stdout_fds[2];
	int stderr_fds[2];
	/* Pipe used to block the child until the parent finished its setup. */
	int child_sync_pipe_fds[2];
	/* Environment for the child; released with minijail_free_env(). */
	char **child_env;
};
3036
Allen Webb05af7762021-07-16 12:56:44 -05003037/*
3038 * Move pipe_fds if they conflict with a child_fd.
3039 */
3040static int avoid_pipe_conflicts(struct minijail_run_state *state,
Allen Webb7ae41c22021-09-16 10:23:37 -05003041 fd_set *child_fds_out)
3042{
Allen Webb66417bd2021-07-16 15:07:24 -07003043 int *pipe_fds[] = {
Allen Webb7ae41c22021-09-16 10:23:37 -05003044 state->pipe_fds, state->child_sync_pipe_fds, state->stdin_fds,
3045 state->stdout_fds, state->stderr_fds,
Allen Webb66417bd2021-07-16 15:07:24 -07003046 };
3047 for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) {
3048 if (pipe_fds[i][0] != -1 &&
Allen Webb7ae41c22021-09-16 10:23:37 -05003049 ensure_no_fd_conflict(child_fds_out, -1, &pipe_fds[i][0]) ==
3050 -1) {
Allen Webb66417bd2021-07-16 15:07:24 -07003051 return -1;
3052 }
3053 if (pipe_fds[i][1] != -1 &&
Allen Webb7ae41c22021-09-16 10:23:37 -05003054 ensure_no_fd_conflict(child_fds_out, -1, &pipe_fds[i][1]) ==
3055 -1) {
Allen Webb66417bd2021-07-16 15:07:24 -07003056 return -1;
3057 }
3058 }
Allen Webb05af7762021-07-16 12:56:44 -05003059 return 0;
3060}
Allen Webb66417bd2021-07-16 15:07:24 -07003061
Allen Webb05af7762021-07-16 12:56:44 -05003062/*
3063 * Redirect j->preserved_fds from the parent_fd to the child_fd.
3064 *
3065 * NOTE: This will clear FD_CLOEXEC since otherwise the child_fd would not be
3066 * inherited after the exec call.
3067 */
Allen Webb7ae41c22021-09-16 10:23:37 -05003068static int redirect_fds(struct minijail *j, fd_set *child_fds)
3069{
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003070 for (size_t i = 0; i < j->preserved_fd_count; i++) {
Luis Héctor Cháveza63407a2021-01-03 05:47:00 -08003071 if (j->preserved_fds[i].parent_fd ==
3072 j->preserved_fds[i].child_fd) {
Allen Webb1e925f72021-07-16 15:39:29 -05003073 // Clear CLOEXEC if it is set so the FD will be
3074 // inherited by the child.
3075 int flags =
3076 fcntl(j->preserved_fds[i].child_fd, F_GETFD);
3077 if (flags == -1 || (flags & FD_CLOEXEC) == 0) {
3078 continue;
3079 }
3080
3081 // Currently FD_CLOEXEC is cleared without being
3082 // restored. It may make sense to track when this
3083 // happens and restore FD_CLOEXEC in the child process.
3084 flags &= ~FD_CLOEXEC;
3085 if (fcntl(j->preserved_fds[i].child_fd, F_SETFD,
3086 flags) == -1) {
3087 pwarn("failed to clear CLOEXEC for %d",
3088 j->preserved_fds[i].parent_fd);
3089 }
Luis Héctor Cháveza63407a2021-01-03 05:47:00 -08003090 continue;
3091 }
Luis Hector Chavez1617f632017-08-01 18:32:30 -07003092 if (dup2(j->preserved_fds[i].parent_fd,
3093 j->preserved_fds[i].child_fd) == -1) {
3094 return -1;
3095 }
3096 }
Allen Webb66417bd2021-07-16 15:07:24 -07003097
Mattias Nissler1cf29fb2020-04-20 23:14:03 +02003098 /*
3099 * After all fds have been duped, we are now free to close all parent
3100 * fds that are *not* child fds.
3101 */
3102 for (size_t i = 0; i < j->preserved_fd_count; i++) {
Allen Webbc7182682021-04-16 09:44:53 -05003103 int parent_fd = j->preserved_fds[i].parent_fd;
Allen Webb05af7762021-07-16 12:56:44 -05003104 if (!FD_ISSET(parent_fd, child_fds)) {
Allen Webbc7182682021-04-16 09:44:53 -05003105 close(parent_fd);
Mattias Nissler1cf29fb2020-04-20 23:14:03 +02003106 }
Mattias Nissler1cf29fb2020-04-20 23:14:03 +02003107 }
Luis Hector Chavez1617f632017-08-01 18:32:30 -07003108 return 0;
3109}
3110
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003111static void minijail_free_run_state(struct minijail_run_state *state)
3112{
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003113 state->child_pid = -1;
3114
3115 int *fd_pairs[] = {state->pipe_fds, state->stdin_fds, state->stdout_fds,
3116 state->stderr_fds, state->child_sync_pipe_fds};
3117 for (size_t i = 0; i < ARRAY_SIZE(fd_pairs); ++i) {
3118 close_and_reset(&fd_pairs[i][0]);
3119 close_and_reset(&fd_pairs[i][1]);
3120 }
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003121
3122 minijail_free_env(state->child_env);
3123 state->child_env = NULL;
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003124}
3125
3126/* Set up stdin/stdout/stderr file descriptors in the child. */
3127static void setup_child_std_fds(struct minijail *j,
3128 struct minijail_run_state *state)
3129{
3130 struct {
3131 const char *name;
3132 int from;
3133 int to;
3134 } fd_map[] = {
3135 {"stdin", state->stdin_fds[0], STDIN_FILENO},
3136 {"stdout", state->stdout_fds[1], STDOUT_FILENO},
3137 {"stderr", state->stderr_fds[1], STDERR_FILENO},
3138 };
3139
3140 for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
Luis Héctor Cháveza63407a2021-01-03 05:47:00 -08003141 if (fd_map[i].from == -1 || fd_map[i].from == fd_map[i].to)
3142 continue;
3143 if (dup2(fd_map[i].from, fd_map[i].to) == -1)
3144 die("failed to set up %s pipe", fd_map[i].name);
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003145 }
3146
3147 /* Close temporary pipe file descriptors. */
3148 int *std_pipes[] = {state->stdin_fds, state->stdout_fds,
3149 state->stderr_fds};
3150 for (size_t i = 0; i < ARRAY_SIZE(std_pipes); ++i) {
3151 close_and_reset(&std_pipes[i][0]);
3152 close_and_reset(&std_pipes[i][1]);
3153 }
3154
3155 /*
3156 * If any of stdin, stdout, or stderr are TTYs, or setsid flag is
3157 * set, create a new session. This prevents the jailed process from
3158 * using the TIOCSTI ioctl to push characters into the parent process
3159 * terminal's input buffer, therefore escaping the jail.
3160 *
3161 * Since it has just forked, the child will not be a process group
3162 * leader, and this call to setsid() should always succeed.
3163 */
3164 if (j->flags.setsid || isatty(STDIN_FILENO) || isatty(STDOUT_FILENO) ||
3165 isatty(STDERR_FILENO)) {
3166 if (setsid() < 0) {
3167 pdie("setsid() failed");
3168 }
Allen Webbf486e262022-03-18 15:24:57 +00003169
3170 if (isatty(STDIN_FILENO)) {
3171 if (ioctl(STDIN_FILENO, TIOCSCTTY, 0) != 0) {
3172 pwarn("failed to set controlling terminal");
3173 }
3174 }
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003175 }
3176}
3177
/*
 * Structure that specifies how to start a minijail.
 *
 * filename - The program to exec in the child. Should be NULL if elf_fd is set.
 * elf_fd - A fd to be used with fexecve. Should be -1 if filename is set.
 *          NOTE: either filename or elf_fd is required if |exec_in_child| = 1;
 *          setting both at the same time is rejected with die().
 * argv - Arguments for the child program. Required if |exec_in_child| = 1.
 * envp - Environment for the child program. Available if |exec_in_child| = 1.
 *        When NULL, |environ| is used wherever the environment is copied.
 * use_preload - If true use LD_PRELOAD.
 * exec_in_child - If true, run |filename|. Otherwise, the child will return to
 *                 the caller.
 * pstdin_fd - Filled with stdin pipe if non-NULL.
 * pstdout_fd - Filled with stdout pipe if non-NULL.
 * pstderr_fd - Filled with stderr pipe if non-NULL.
 * pchild_pid - Filled with the pid of the child process if non-NULL.
 *
 * Members omitted from a designated initializer are zero/NULL, which is how
 * the minijail_run*() wrappers leave unused outputs unset.
 */
struct minijail_run_config {
	const char *filename;
	int elf_fd;
	char *const *argv;
	char *const *envp;
	int use_preload;
	int exec_in_child;
	int *pstdin_fd;
	int *pstdout_fd;
	int *pstderr_fd;
	pid_t *pchild_pid;
};
3206
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003207static int
3208minijail_run_config_internal(struct minijail *j,
3209 const struct minijail_run_config *config);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003210
Will Drewry6ac91122011-10-21 16:38:58 -05003211int API minijail_run(struct minijail *j, const char *filename,
3212 char *const argv[])
Elly Jonese1749eb2011-10-07 13:54:59 -04003213{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003214 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003215 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003216 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003217 .argv = argv,
3218 .envp = NULL,
3219 .use_preload = true,
3220 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003221 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003222 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07003223}
3224
Stéphane Lesimplef65da3a2022-01-11 11:44:47 +01003225int API minijail_run_env(struct minijail *j, const char *filename,
3226 char *const argv[], char *const envp[])
3227{
3228 struct minijail_run_config config = {
Allen Webbf486e262022-03-18 15:24:57 +00003229 .filename = filename,
3230 .elf_fd = -1,
3231 .argv = argv,
3232 .envp = envp,
3233 .use_preload = true,
3234 .exec_in_child = true,
Stéphane Lesimplef65da3a2022-01-11 11:44:47 +01003235 };
3236 return minijail_run_config_internal(j, &config);
3237}
3238
Jorge Lucangeli Obes9807d032012-04-17 13:36:00 -07003239int API minijail_run_pid(struct minijail *j, const char *filename,
3240 char *const argv[], pid_t *pchild_pid)
3241{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003242 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003243 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003244 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003245 .argv = argv,
3246 .envp = NULL,
3247 .use_preload = true,
3248 .exec_in_child = true,
3249 .pchild_pid = pchild_pid,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003250 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003251 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003252}
3253
3254int API minijail_run_pipe(struct minijail *j, const char *filename,
Jorge Lucangeli Obes6537a562012-09-05 10:39:40 -07003255 char *const argv[], int *pstdin_fd)
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003256{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003257 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003258 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003259 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003260 .argv = argv,
3261 .envp = NULL,
3262 .use_preload = true,
3263 .exec_in_child = true,
3264 .pstdin_fd = pstdin_fd,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003265 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003266 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003267}
3268
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08003269int API minijail_run_pid_pipes(struct minijail *j, const char *filename,
Jorge Lucangeli Obes4ae30cc2014-04-10 15:35:33 -07003270 char *const argv[], pid_t *pchild_pid,
3271 int *pstdin_fd, int *pstdout_fd, int *pstderr_fd)
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08003272{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003273 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003274 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003275 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003276 .argv = argv,
3277 .envp = NULL,
3278 .use_preload = true,
3279 .exec_in_child = true,
3280 .pstdin_fd = pstdin_fd,
3281 .pstdout_fd = pstdout_fd,
3282 .pstderr_fd = pstderr_fd,
3283 .pchild_pid = pchild_pid,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003284 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003285 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003286}
3287
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003288int API minijail_run_env_pid_pipes(struct minijail *j, const char *filename,
3289 char *const argv[], char *const envp[],
3290 pid_t *pchild_pid, int *pstdin_fd,
3291 int *pstdout_fd, int *pstderr_fd)
3292{
3293 struct minijail_run_config config = {
3294 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003295 .elf_fd = -1,
3296 .argv = argv,
3297 .envp = envp,
3298 .use_preload = true,
3299 .exec_in_child = true,
3300 .pstdin_fd = pstdin_fd,
3301 .pstdout_fd = pstdout_fd,
3302 .pstderr_fd = pstderr_fd,
3303 .pchild_pid = pchild_pid,
3304 };
3305 return minijail_run_config_internal(j, &config);
3306}
3307
3308int API minijail_run_fd_env_pid_pipes(struct minijail *j, int elf_fd,
3309 char *const argv[], char *const envp[],
3310 pid_t *pchild_pid, int *pstdin_fd,
3311 int *pstdout_fd, int *pstderr_fd)
3312{
3313 struct minijail_run_config config = {
3314 .filename = NULL,
3315 .elf_fd = elf_fd,
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003316 .argv = argv,
3317 .envp = envp,
3318 .use_preload = true,
3319 .exec_in_child = true,
3320 .pstdin_fd = pstdin_fd,
3321 .pstdout_fd = pstdout_fd,
3322 .pstderr_fd = pstderr_fd,
3323 .pchild_pid = pchild_pid,
3324 };
3325 return minijail_run_config_internal(j, &config);
3326}
3327
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003328int API minijail_run_no_preload(struct minijail *j, const char *filename,
3329 char *const argv[])
3330{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003331 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003332 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003333 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003334 .argv = argv,
3335 .envp = NULL,
3336 .use_preload = false,
3337 .exec_in_child = true,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003338 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003339 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003340}
3341
Samuel Tan63187f42015-10-16 13:01:53 -07003342int API minijail_run_pid_pipes_no_preload(struct minijail *j,
Jorge Lucangeli Obes43a6a862015-12-04 14:53:36 -08003343 const char *filename,
Allen Webb7ae41c22021-09-16 10:23:37 -05003344 char *const argv[], pid_t *pchild_pid,
3345 int *pstdin_fd, int *pstdout_fd,
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08003346 int *pstderr_fd)
3347{
Dylan Reidacfb8be2017-08-25 12:56:51 -07003348 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003349 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003350 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003351 .argv = argv,
3352 .envp = NULL,
3353 .use_preload = false,
3354 .exec_in_child = true,
3355 .pstdin_fd = pstdin_fd,
3356 .pstdout_fd = pstdout_fd,
3357 .pstderr_fd = pstderr_fd,
3358 .pchild_pid = pchild_pid,
Jorge Lucangeli Obesd2c951d2019-02-01 15:43:36 -05003359 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003360 return minijail_run_config_internal(j, &config);
Jorge Lucangeli Obesd2c951d2019-02-01 15:43:36 -05003361}
3362
3363int API minijail_run_env_pid_pipes_no_preload(struct minijail *j,
3364 const char *filename,
3365 char *const argv[],
3366 char *const envp[],
3367 pid_t *pchild_pid, int *pstdin_fd,
3368 int *pstdout_fd, int *pstderr_fd)
3369{
3370 struct minijail_run_config config = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003371 .filename = filename,
Allen Webb05af7762021-07-16 12:56:44 -05003372 .elf_fd = -1,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003373 .argv = argv,
3374 .envp = envp,
3375 .use_preload = false,
3376 .exec_in_child = true,
3377 .pstdin_fd = pstdin_fd,
3378 .pstdout_fd = pstdout_fd,
3379 .pstderr_fd = pstderr_fd,
3380 .pchild_pid = pchild_pid,
Dylan Reidacfb8be2017-08-25 12:56:51 -07003381 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003382 return minijail_run_config_internal(j, &config);
Samuel Tan63187f42015-10-16 13:01:53 -07003383}
3384
Dylan Reid0412dcc2017-08-24 11:33:15 -07003385pid_t API minijail_fork(struct minijail *j)
3386{
Allen Webb05af7762021-07-16 12:56:44 -05003387 struct minijail_run_config config = {
3388 .elf_fd = -1,
3389 };
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003390 return minijail_run_config_internal(j, &config);
Dylan Reid0412dcc2017-08-24 11:33:15 -07003391}
3392
Dylan Reid18c49c82017-08-25 14:52:27 -07003393static int minijail_run_internal(struct minijail *j,
3394 const struct minijail_run_config *config,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003395 struct minijail_run_state *state_out)
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003396{
Dylan Reidce5b55e2016-01-13 11:04:16 -08003397 int sync_child = 0;
Elly Jonese1749eb2011-10-07 13:54:59 -04003398 int ret;
Elly Jonesa05d7bb2012-06-14 14:09:27 -04003399 /* We need to remember this across the minijail_preexec() call. */
3400 int pid_namespace = j->flags.pids;
Luis Hector Chavezac981fc2017-09-18 15:52:38 -07003401 /*
3402 * Create an init process if we are entering a pid namespace, unless the
3403 * user has explicitly opted out by calling minijail_run_as_init().
3404 */
3405 int do_init = j->flags.do_init && !j->flags.run_as_init;
Dylan Reidacfb8be2017-08-25 12:56:51 -07003406 int use_preload = config->use_preload;
Ben Chan541c7e52011-08-26 14:55:53 -07003407
Allen Webb05af7762021-07-16 12:56:44 -05003408 if (config->filename != NULL && config->elf_fd != -1) {
3409 die("filename and elf_fd cannot be set at the same time");
3410 }
3411
Allen Webb77383c72021-10-15 10:34:24 -07003412 /*
3413 * Only copy the environment if we need to modify it. If this is done
3414 * unconditionally, it triggers odd behavior in the ARC container.
3415 */
3416 if (use_preload || j->seccomp_policy_path) {
3417 state_out->child_env =
3418 minijail_copy_env(config->envp ? config->envp : environ);
3419 if (!state_out->child_env)
3420 return ENOMEM;
3421 }
3422
3423 if (j->seccomp_policy_path &&
3424 setup_seccomp_policy_path(j, &state_out->child_env))
3425 return -EFAULT;
3426
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003427 if (use_preload) {
Dylan Reid0412dcc2017-08-24 11:33:15 -07003428 if (j->hooks_head != NULL)
3429 die("Minijail hooks are not supported with LD_PRELOAD");
3430 if (!config->exec_in_child)
3431 die("minijail_fork is not supported with LD_PRELOAD");
3432
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003433 /*
3434 * Before we fork(2) and execve(2) the child process, we need
3435 * to open a pipe(2) to send the minijail configuration over.
3436 */
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003437 if (setup_preload(j, &state_out->child_env) ||
3438 setup_pipe(&state_out->child_env, state_out->pipe_fds))
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003439 return -EFAULT;
Elly Jonese1749eb2011-10-07 13:54:59 -04003440 }
Will Drewryf89aef52011-09-16 16:48:57 -05003441
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003442 if (!use_preload) {
Luis Hector Chavezfe5fb8e2017-06-29 10:41:27 -07003443 if (j->flags.use_caps && j->caps != 0 &&
3444 !j->flags.set_ambient_caps) {
3445 die("non-empty, non-ambient capabilities are not "
3446 "supported without LD_PRELOAD");
3447 }
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003448 }
Will Drewry2f54b6a2011-09-16 13:45:31 -05003449
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003450 /* Create pipes for stdin/stdout/stderr as requested by caller. */
3451 struct {
3452 bool requested;
3453 int *pipe_fds;
3454 } pipe_fd_req[] = {
3455 {config->pstdin_fd != NULL, state_out->stdin_fds},
3456 {config->pstdout_fd != NULL, state_out->stdout_fds},
3457 {config->pstderr_fd != NULL, state_out->stderr_fds},
3458 };
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003459
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003460 for (size_t i = 0; i < ARRAY_SIZE(pipe_fd_req); ++i) {
3461 if (pipe_fd_req[i].requested &&
3462 pipe(pipe_fd_req[i].pipe_fds) == -1)
3463 return EFAULT;
Jorge Lucangeli Obes339a1132013-02-15 16:53:47 -08003464 }
3465
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003466 /*
François Degros664eba72019-11-05 13:18:24 +11003467 * If the parent process needs to configure the child's runtime
3468 * environment after forking, create a pipe(2) to block the child until
3469 * configuration is done.
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003470 */
Daniel Erat2d69add2019-04-23 20:58:53 -07003471 if (j->flags.forward_signals || j->flags.pid_file || j->flags.cgroups ||
3472 j->rlimit_count || j->flags.userns) {
Dylan Reidce5b55e2016-01-13 11:04:16 -08003473 sync_child = 1;
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003474 if (pipe(state_out->child_sync_pipe_fds))
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003475 return -EFAULT;
3476 }
3477
Jorge Lucangeli Obesd0a6e2f2015-11-24 14:21:21 -08003478 /*
3479 * Use sys_clone() if and only if we're creating a pid namespace.
Elly Jones761b7412012-06-13 15:49:52 -04003480 *
3481 * tl;dr: WARNING: do not mix pid namespaces and multithreading.
3482 *
3483 * In multithreaded programs, there are a bunch of locks inside libc,
3484 * some of which may be held by other threads at the time that we call
3485 * minijail_run_pid(). If we call fork(), glibc does its level best to
3486 * ensure that we hold all of these locks before it calls clone()
3487 * internally and drop them after clone() returns, but when we call
3488 * sys_clone(2) directly, all that gets bypassed and we end up with a
3489 * child address space where some of libc's important locks are held by
3490 * other threads (which did not get cloned, and hence will never release
3491 * those locks). This is okay so long as we call exec() immediately
3492 * after, but a bunch of seemingly-innocent libc functions like setenv()
3493 * take locks.
3494 *
3495 * Hence, only call sys_clone() if we need to, in order to get at pid
3496 * namespacing. If we follow this path, the child's address space might
3497 * have broken locks; you may only call functions that do not acquire
3498 * any locks.
3499 *
3500 * Unfortunately, fork() acquires every lock it can get its hands on, as
3501 * previously detailed, so this function is highly likely to deadlock
3502 * later on (see "deadlock here") if we're multithreaded.
3503 *
3504 * We might hack around this by having the clone()d child (init of the
3505 * pid namespace) return directly, rather than leaving the clone()d
3506 * process hanging around to be init for the new namespace (and having
Jorge Lucangeli Obesa521bee2016-03-03 13:47:57 -08003507 * its fork()ed child return in turn), but that process would be
3508 * crippled with its libc locks potentially broken. We might try
3509 * fork()ing in the parent before we clone() to ensure that we own all
3510 * the locks, but then we have to have the forked child hanging around
3511 * consuming resources (and possibly having file descriptors / shared
3512 * memory regions / etc attached). We'd need to keep the child around to
3513 * avoid having its children get reparented to init.
Elly Jones761b7412012-06-13 15:49:52 -04003514 *
3515 * TODO(ellyjones): figure out if the "forked child hanging around"
3516 * problem is fixable or not. It would be nice if we worked in this
3517 * case.
3518 */
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003519 pid_t child_pid;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003520 if (pid_namespace) {
François Degros94619842019-11-08 16:37:55 +11003521 unsigned long clone_flags = CLONE_NEWPID | SIGCHLD;
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003522 if (j->flags.userns)
3523 clone_flags |= CLONE_NEWUSER;
François Degros94619842019-11-08 16:37:55 +11003524
3525 child_pid = syscall(SYS_clone, clone_flags, NULL, 0L, 0L, 0L);
3526
3527 if (child_pid < 0) {
3528 if (errno == EPERM)
Allen Webb7ae41c22021-09-16 10:23:37 -05003529 pdie("clone(CLONE_NEWPID | ...) failed with "
3530 "EPERM; is this process missing "
3531 "CAP_SYS_ADMIN?");
François Degros94619842019-11-08 16:37:55 +11003532 pdie("clone(CLONE_NEWPID | ...) failed");
3533 }
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003534 } else {
Elly Jones761b7412012-06-13 15:49:52 -04003535 child_pid = fork();
3536
François Degros94619842019-11-08 16:37:55 +11003537 if (child_pid < 0)
3538 pdie("fork failed");
Elly Jonese1749eb2011-10-07 13:54:59 -04003539 }
Will Drewryf89aef52011-09-16 16:48:57 -05003540
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003541 state_out->child_pid = child_pid;
Elly Jonese1749eb2011-10-07 13:54:59 -04003542 if (child_pid) {
Elly Jonese1749eb2011-10-07 13:54:59 -04003543 j->initpid = child_pid;
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003544
Jorge Lucangeli Obesdba62092017-05-18 17:10:23 -04003545 if (j->flags.forward_signals) {
3546 forward_pid = child_pid;
3547 install_signal_handlers();
3548 }
3549
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08003550 if (j->flags.pid_file)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04003551 write_pid_file_or_die(j);
Yu-Hsi Chiang3cc05ea2015-08-11 11:23:17 +08003552
Jorge Lucangeli Obesb8a51382016-01-25 20:08:22 -08003553 if (j->flags.cgroups)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04003554 add_to_cgroups_or_die(j);
Dylan Reid605ce7f2016-01-19 19:21:00 -08003555
Dylan Reid0f72ef42017-06-06 15:42:49 -07003556 if (j->rlimit_count)
3557 set_rlimits_or_die(j);
3558
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003559 if (j->flags.userns)
Jorge Lucangeli Obesf205fff2016-08-06 09:06:21 -04003560 write_ugid_maps_or_die(j);
Dylan Reidce5b55e2016-01-13 11:04:16 -08003561
Mike Frysinger902a4492018-12-27 05:22:56 -05003562 if (j->flags.enter_vfs)
3563 close(j->mountns_fd);
3564
3565 if (j->flags.enter_net)
3566 close(j->netns_fd);
3567
Dylan Reidce5b55e2016-01-13 11:04:16 -08003568 if (sync_child)
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003569 parent_setup_complete(state_out->child_sync_pipe_fds);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003570
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003571 if (use_preload) {
François Degros664eba72019-11-05 13:18:24 +11003572 /*
3573 * Add SIGPIPE to the signal mask to avoid getting
3574 * killed if the child process finishes or closes its
3575 * end of the pipe prematurely.
3576 *
3577 * TODO(crbug.com/1022170): Use pthread_sigmask instead
3578 * of sigprocmask if Minijail is used in multithreaded
3579 * programs.
3580 */
3581 sigset_t to_block, to_restore;
3582 if (sigemptyset(&to_block) < 0)
3583 pdie("sigemptyset failed");
3584 if (sigaddset(&to_block, SIGPIPE) < 0)
3585 pdie("sigaddset failed");
3586 if (sigprocmask(SIG_BLOCK, &to_block, &to_restore) < 0)
3587 pdie("sigprocmask failed");
3588
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003589 /* Send marshalled minijail. */
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003590 close_and_reset(&state_out->pipe_fds[0]);
3591 ret = minijail_to_fd(j, state_out->pipe_fds[1]);
3592 close_and_reset(&state_out->pipe_fds[1]);
François Degros664eba72019-11-05 13:18:24 +11003593
3594 /* Accept any pending SIGPIPE. */
3595 while (true) {
3596 const struct timespec zero_time = {0, 0};
Allen Webb7ae41c22021-09-16 10:23:37 -05003597 const int sig =
3598 sigtimedwait(&to_block, NULL, &zero_time);
François Degros664eba72019-11-05 13:18:24 +11003599 if (sig < 0) {
3600 if (errno != EINTR)
3601 break;
3602 } else {
3603 if (sig != SIGPIPE)
Allen Webb7ae41c22021-09-16 10:23:37 -05003604 die("unexpected signal %d",
3605 sig);
François Degros664eba72019-11-05 13:18:24 +11003606 }
3607 }
3608
3609 /* Restore the signal mask to its original state. */
3610 if (sigprocmask(SIG_SETMASK, &to_restore, NULL) < 0)
3611 pdie("sigprocmask failed");
3612
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003613 if (ret) {
François Degros664eba72019-11-05 13:18:24 +11003614 warn("failed to send marshalled minijail: %s",
3615 strerror(-ret));
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003616 kill(j->initpid, SIGKILL);
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003617 }
Elly Jonese1749eb2011-10-07 13:54:59 -04003618 }
Jorge Lucangeli Obesdf4bd352012-08-29 19:12:28 -07003619
Elly Jonese1749eb2011-10-07 13:54:59 -04003620 return 0;
3621 }
Ben Chan541c7e52011-08-26 14:55:53 -07003622
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003623 /* Child process. */
Peter Qiu2860c462015-12-16 15:13:06 -08003624 if (j->flags.reset_signal_mask) {
3625 sigset_t signal_mask;
3626 if (sigemptyset(&signal_mask) != 0)
3627 pdie("sigemptyset failed");
3628 if (sigprocmask(SIG_SETMASK, &signal_mask, NULL) != 0)
3629 pdie("sigprocmask failed");
3630 }
3631
Luis Hector Chaveza27118a2018-04-04 08:18:01 -07003632 if (j->flags.reset_signal_handlers) {
3633 int signum;
3634 for (signum = 0; signum <= SIGRTMAX; signum++) {
3635 /*
3636 * Ignore EINVAL since some signal numbers in the range
3637 * might not be valid.
3638 */
3639 if (signal(signum, SIG_DFL) == SIG_ERR &&
3640 errno != EINVAL) {
3641 pdie("failed to reset signal %d disposition",
3642 signum);
3643 }
3644 }
3645 }
3646
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07003647 if (j->flags.close_open_fds) {
Allen Webb05af7762021-07-16 12:56:44 -05003648 const size_t kMaxInheritableFdsSize = 11 + MAX_PRESERVED_FDS;
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07003649 int inheritable_fds[kMaxInheritableFdsSize];
3650 size_t size = 0;
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003651
3652 int *pipe_fds[] = {
3653 state_out->pipe_fds, state_out->child_sync_pipe_fds,
3654 state_out->stdin_fds, state_out->stdout_fds,
3655 state_out->stderr_fds,
3656 };
3657
3658 for (size_t i = 0; i < ARRAY_SIZE(pipe_fds); ++i) {
3659 if (pipe_fds[i][0] != -1) {
3660 inheritable_fds[size++] = pipe_fds[i][0];
3661 }
3662 if (pipe_fds[i][1] != -1) {
3663 inheritable_fds[size++] = pipe_fds[i][1];
3664 }
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07003665 }
Jorge Lucangeli Obes2337f802019-07-18 14:46:03 -04003666
Jorge Lucangeli Obescf3bbea2019-07-24 09:06:40 -04003667 /*
3668 * Preserve namespace file descriptors over the close_open_fds()
3669 * call. These are closed in minijail_enter() so they won't leak
3670 * into the child process.
3671 */
Jorge Lucangeli Obes2337f802019-07-18 14:46:03 -04003672 if (j->flags.enter_vfs)
3673 minijail_preserve_fd(j, j->mountns_fd, j->mountns_fd);
3674 if (j->flags.enter_net)
3675 minijail_preserve_fd(j, j->netns_fd, j->netns_fd);
3676
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003677 for (size_t i = 0; i < j->preserved_fd_count; i++) {
Luis Hector Chavez1617f632017-08-01 18:32:30 -07003678 /*
3679 * Preserve all parent_fds. They will be dup2(2)-ed in
3680 * the child later.
3681 */
3682 inheritable_fds[size++] = j->preserved_fds[i].parent_fd;
3683 }
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07003684
Allen Webb05af7762021-07-16 12:56:44 -05003685 if (config->elf_fd > -1) {
3686 inheritable_fds[size++] = config->elf_fd;
3687 }
3688
Luis Hector Chavez43ff0802016-10-07 12:21:07 -07003689 if (close_open_fds(inheritable_fds, size) < 0)
3690 die("failed to close open file descriptors");
3691 }
3692
Allen Webb05af7762021-07-16 12:56:44 -05003693 /* The set of fds will be replaced. */
3694 fd_set child_fds;
3695 FD_ZERO(&child_fds);
3696 if (get_child_fds(j, &child_fds))
3697 die("failed to set up fd redirections");
3698
3699 if (avoid_pipe_conflicts(state_out, &child_fds))
3700 die("failed to redirect conflicting pipes");
3701
3702 /* The elf_fd needs to be mutable so use a stack copy from now on. */
3703 int elf_fd = config->elf_fd;
3704 if (elf_fd != -1 && ensure_no_fd_conflict(&child_fds, -1, &elf_fd))
3705 die("failed to redirect elf_fd");
3706
3707 if (redirect_fds(j, &child_fds))
Luis Hector Chavez1617f632017-08-01 18:32:30 -07003708 die("failed to set up fd redirections");
3709
Dylan Reidce5b55e2016-01-13 11:04:16 -08003710 if (sync_child)
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003711 wait_for_parent_setup(state_out->child_sync_pipe_fds);
Dylan Reidce5b55e2016-01-13 11:04:16 -08003712
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003713 if (j->flags.userns)
Dylan Reidce5b55e2016-01-13 11:04:16 -08003714 enter_user_namespace(j);
Yu-Hsi Chiang10e91232015-08-05 14:40:45 +08003715
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003716 setup_child_std_fds(j, state_out);
Jorge Lucangeli Obesaa235b92016-11-23 13:48:15 -05003717
Dylan Reid791f5772015-09-14 20:02:42 -07003718 /* If running an init program, let it decide when/how to mount /proc. */
3719 if (pid_namespace && !do_init)
3720 j->flags.remount_proc_ro = 0;
3721
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003722 if (use_preload) {
3723 /* Strip out flags that cannot be inherited across execve(2). */
3724 minijail_preexec(j);
3725 } else {
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04003726 /*
3727 * If not using LD_PRELOAD, do all jailing before execve(2).
3728 * Note that PID namespaces can only be entered on fork(2),
3729 * so that flag is still cleared.
3730 */
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003731 j->flags.pids = 0;
3732 }
Dylan Reid0412dcc2017-08-24 11:33:15 -07003733
3734 /*
3735 * Jail this process.
3736 * If forking, return.
3737 * If not, execve(2) the target.
3738 */
Elly Jonese1749eb2011-10-07 13:54:59 -04003739 minijail_enter(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04003740
Dylan Reid0412dcc2017-08-24 11:33:15 -07003741 if (config->exec_in_child && pid_namespace && do_init) {
Elly Jonesdd3e8512012-01-23 15:13:38 -05003742 /*
3743 * pid namespace: this process will become init inside the new
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08003744 * namespace. We don't want all programs we might exec to have
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003745 * to know how to be init. Normally (do_init == 1) we fork off
Yu-Hsi Chiang3e954ec2015-07-28 16:48:14 +08003746 * a child to actually run the program. If |do_init == 0|, we
3747 * let the program keep pid 1 and be init.
Elly Jones761b7412012-06-13 15:49:52 -04003748 *
3749 * If we're multithreaded, we'll probably deadlock here. See
3750 * WARNING above.
Elly Jonese1749eb2011-10-07 13:54:59 -04003751 */
3752 child_pid = fork();
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04003753 if (child_pid < 0) {
Elly Jonese1749eb2011-10-07 13:54:59 -04003754 _exit(child_pid);
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04003755 } else if (child_pid > 0) {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003756 minijail_free_run_state(state_out);
3757
Jorge Lucangeli Obes13650612016-09-02 11:27:29 -04003758 /*
3759 * Best effort. Don't bother checking the return value.
3760 */
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04003761 prctl(PR_SET_NAME, "minijail-init");
Allen Webb7ae41c22021-09-16 10:23:37 -05003762 init(child_pid); /* Never returns. */
Jorge Lucangeli Obes963eeec2016-08-10 16:02:43 -04003763 }
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003764 state_out->child_pid = child_pid;
Elly Jonese1749eb2011-10-07 13:54:59 -04003765 }
Elly Jonescd7a9042011-07-22 13:56:51 -04003766
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07003767 run_hooks_or_die(j, MINIJAIL_HOOK_EVENT_PRE_EXECVE);
3768
Dylan Reid0412dcc2017-08-24 11:33:15 -07003769 if (!config->exec_in_child)
3770 return 0;
3771
Ben Scarlatod7e6e682022-06-30 03:27:30 +00003772 /*
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003773 * We're going to execve(), so make sure any remaining resources are
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003774 * freed. Exceptions are:
3775 * 1. The child environment. No need to worry about freeing it since
3776 * execve reinitializes the heap anyways.
3777 * 2. The read side of the LD_PRELOAD pipe, which we need to hand down
3778 * into the target in which the preloaded code will read from it and
3779 * then close it.
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003780 */
3781 state_out->pipe_fds[0] = -1;
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003782 char *const *child_env = state_out->child_env;
3783 state_out->child_env = NULL;
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003784 minijail_free_run_state(state_out);
3785
3786 /*
Jorge Lucangeli Obes54714502015-09-30 10:08:45 -07003787 * If we aren't pid-namespaced, or the jailed program asked to be init:
Elly Jonese1749eb2011-10-07 13:54:59 -04003788 * calling process
3789 * -> execve()-ing process
3790 * If we are:
3791 * calling process
3792 * -> init()-ing process
3793 * -> execve()-ing process
3794 */
Allen Webb335b0bf2021-09-28 14:18:24 -07003795 if (!child_env)
3796 child_env = config->envp ? config->envp : environ;
Allen Webb05af7762021-07-16 12:56:44 -05003797 if (elf_fd > -1) {
3798 fexecve(elf_fd, config->argv, child_env);
3799 pwarn("fexecve(%d) failed", config->elf_fd);
3800 } else {
3801 execve(config->filename, config->argv, child_env);
3802 pwarn("execve(%s) failed", config->filename);
3803 }
François Degros08b10f72019-10-09 12:44:05 +11003804
Allen Webb7ae41c22021-09-16 10:23:37 -05003805 ret = (errno == ENOENT ? MINIJAIL_ERR_NO_COMMAND
3806 : MINIJAIL_ERR_NO_ACCESS);
Jorge Lucangeli Obesa2053902016-08-02 12:08:15 -04003807 _exit(ret);
Elly Jonescd7a9042011-07-22 13:56:51 -04003808}
3809
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003810static int
3811minijail_run_config_internal(struct minijail *j,
3812 const struct minijail_run_config *config)
3813{
3814 struct minijail_run_state state = {
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003815 .child_pid = -1,
3816 .pipe_fds = {-1, -1},
3817 .stdin_fds = {-1, -1},
3818 .stdout_fds = {-1, -1},
3819 .stderr_fds = {-1, -1},
3820 .child_sync_pipe_fds = {-1, -1},
Mattias Nisslerb35f2c12020-02-07 13:37:36 +01003821 .child_env = NULL,
Mattias Nissler6123e5a2020-02-11 13:38:03 +01003822 };
3823 int ret = minijail_run_internal(j, config, &state);
3824
3825 if (ret == 0) {
3826 if (config->pchild_pid)
3827 *config->pchild_pid = state.child_pid;
3828
3829 /* Grab stdin/stdout/stderr descriptors requested by caller. */
3830 struct {
3831 int *pfd;
3832 int *psrc;
3833 } fd_map[] = {
3834 {config->pstdin_fd, &state.stdin_fds[1]},
3835 {config->pstdout_fd, &state.stdout_fds[0]},
3836 {config->pstderr_fd, &state.stderr_fds[0]},
3837 };
3838
3839 for (size_t i = 0; i < ARRAY_SIZE(fd_map); ++i) {
3840 if (fd_map[i].pfd) {
3841 *fd_map[i].pfd = *fd_map[i].psrc;
3842 *fd_map[i].psrc = -1;
3843 }
3844 }
3845
3846 if (!config->exec_in_child)
3847 ret = state.child_pid;
3848 }
3849
3850 minijail_free_run_state(&state);
3851
3852 return ret;
3853}
3854
Victor Hsieh14036062021-04-30 15:40:36 -07003855static int minijail_wait_internal(struct minijail *j, int expected_signal)
Elly Jonese1749eb2011-10-07 13:54:59 -04003856{
François Degros08b10f72019-10-09 12:44:05 +11003857 if (j->initpid <= 0)
3858 return -ECHILD;
3859
Elly Jonese1749eb2011-10-07 13:54:59 -04003860 int st;
François Degros627deba2019-10-01 12:48:25 +10003861 while (true) {
3862 const int ret = waitpid(j->initpid, &st, 0);
3863 if (ret >= 0)
3864 break;
3865 if (errno != EINTR)
3866 return -errno;
3867 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08003868
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07003869 if (!WIFEXITED(st)) {
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07003870 int error_status = st;
Allen Webb77383c72021-10-15 10:34:24 -07003871 if (!WIFSIGNALED(st)) {
3872 return error_status;
3873 }
3874
3875 int signum = WTERMSIG(st);
3876 /*
3877 * We return MINIJAIL_ERR_JAIL if the process received
3878 * SIGSYS, which happens when a syscall is blocked by
3879 * seccomp filters.
3880 * If not, we do what bash(1) does:
3881 * $? = 128 + signum
3882 */
3883 if (signum == SIGSYS) {
3884 warn("child process %d had a policy violation (%s)",
3885 j->initpid,
3886 j->seccomp_policy_path ? j->seccomp_policy_path
3887 : "NO-LABEL");
3888 error_status = MINIJAIL_ERR_JAIL;
3889 } else {
Victor Hsieh14036062021-04-30 15:40:36 -07003890 if (signum != expected_signal) {
3891 warn("child process %d received signal %d",
3892 j->initpid, signum);
3893 }
Allen Webb77383c72021-10-15 10:34:24 -07003894 error_status = MINIJAIL_ERR_SIG_BASE + signum;
Jorge Lucangeli Obes18d1eba2014-04-18 13:58:20 -07003895 }
3896 return error_status;
Jorge Lucangeli Obesc2c9bcc2012-05-01 09:30:24 -07003897 }
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08003898
3899 int exit_status = WEXITSTATUS(st);
3900 if (exit_status != 0)
Allen Webb7ae41c22021-09-16 10:23:37 -05003901 info("child process %d exited with status %d", j->initpid,
3902 exit_status);
Jorge Lucangeli Obes1530b742012-12-11 14:08:09 -08003903
3904 return exit_status;
Elly Jonescd7a9042011-07-22 13:56:51 -04003905}
3906
Victor Hsieh14036062021-04-30 15:40:36 -07003907int API minijail_kill(struct minijail *j)
3908{
3909 if (j->initpid <= 0)
3910 return -ECHILD;
3911
3912 if (kill(j->initpid, SIGTERM))
3913 return -errno;
3914
3915 return minijail_wait_internal(j, SIGTERM);
3916}
3917
3918int API minijail_wait(struct minijail *j)
3919{
3920 return minijail_wait_internal(j, 0);
3921}
3922
Will Drewry6ac91122011-10-21 16:38:58 -05003923void API minijail_destroy(struct minijail *j)
Elly Jonese1749eb2011-10-07 13:54:59 -04003924{
Dylan Reid605ce7f2016-01-19 19:21:00 -08003925 size_t i;
3926
Luis Hector Chavezc3e17722018-10-16 20:43:12 -07003927 if (j->filter_prog) {
Jorge Lucangeli Obes524c0402012-01-17 11:30:23 -08003928 free(j->filter_prog->filter);
3929 free(j->filter_prog);
Elly Jonese1749eb2011-10-07 13:54:59 -04003930 }
Mike Frysingerac08a682017-10-10 02:04:50 -04003931 free_mounts_list(j);
Nicole Anderson-Au835f7172021-01-13 21:18:13 +00003932 free_remounts_list(j);
Luis Hector Chaveze0ba4ce2017-07-20 15:12:22 -07003933 while (j->hooks_head) {
3934 struct hook *c = j->hooks_head;
3935 j->hooks_head = c->next;
3936 free(c);
3937 }
3938 j->hooks_tail = NULL;
Ben Scarlatoee82b492022-08-09 18:33:25 +00003939 while (j->fs_rules_head) {
3940 struct fs_rule *r = j->fs_rules_head;
3941 j->fs_rules_head = r->next;
3942 free(r);
3943 }
3944 j->fs_rules_tail = NULL;
Elly Jonese1749eb2011-10-07 13:54:59 -04003945 if (j->user)
3946 free(j->user);
Jorge Lucangeli Obese81a52f2015-12-04 16:05:23 -08003947 if (j->suppl_gid_list)
3948 free(j->suppl_gid_list);
Will Drewrybee7ba72011-10-21 20:47:01 -05003949 if (j->chrootdir)
3950 free(j->chrootdir);
Jorge Lucangeli Obes3b2e6e42016-08-04 12:26:19 -04003951 if (j->pid_file_path)
3952 free(j->pid_file_path);
3953 if (j->uidmap)
3954 free(j->uidmap);
3955 if (j->gidmap)
3956 free(j->gidmap);
Mike Frysingerb9a7b162017-05-30 15:25:49 -04003957 if (j->hostname)
3958 free(j->hostname);
Luis Hector Chavez9acba452018-10-11 10:13:25 -07003959 if (j->preload_path)
3960 free(j->preload_path);
Andrew Brestickereac28942015-11-11 16:04:46 -08003961 if (j->alt_syscall_table)
3962 free(j->alt_syscall_table);
Dylan Reid605ce7f2016-01-19 19:21:00 -08003963 for (i = 0; i < j->cgroup_count; ++i)
3964 free(j->cgroups[i]);
Allen Webb77383c72021-10-15 10:34:24 -07003965 if (j->seccomp_policy_path)
3966 free(j->seccomp_policy_path);
Elly Jonese1749eb2011-10-07 13:54:59 -04003967 free(j);
Elly Jonescd7a9042011-07-22 13:56:51 -04003968}
Luis Hector Chavez114a9302017-09-05 20:36:58 -07003969
3970void API minijail_log_to_fd(int fd, int min_priority)
3971{
3972 init_logging(LOG_TO_FD, fd, min_priority);
3973}