libminijail,minijail0: add seccomp filter support
This change adds support for installing seccomp filters via libminijail
or by using minijail0 with an arch-specific filters file.
Support for LD_PRELOAD marshalling is still missing and will come in a new change.
BUG=chromium-os:19459
TEST=minijail0 -r -S dash-cat.policy -u chronos -- /bin/dash -c '/bin/cat /proc/self/seccomp_filter'
dash-cat.policy can be found in the bug.
built for arm-generic, tegra2_seaboard, and x86-alex. Tested on x86-alex as above and with -H.
Change-Id: I3cac97d1df62f70cd546763aeca8f52dd0aea09d
Reviewed-on: http://gerrit.chromium.org/gerrit/7773
Reviewed-by: Elly Jones <[email protected]>
Tested-by: Will Drewry <[email protected]>
diff --git a/Makefile b/Makefile
index 52fdcab..37e99d9 100644
--- a/Makefile
+++ b/Makefile
@@ -7,13 +7,58 @@
all : minijail0 libminijail.so libminijailpreload.so
-minijail0 : libminijail.o minijail0.c
+minijail0 : libsyscalls.gen.o libminijail.o minijail0.c
$(CC) $(CFLAGS) -o $@ $^ -lcap
-libminijail.so : libminijail.o
+libminijail.so : libminijail.o libsyscalls.gen.o
$(CC) $(CFLAGS) -shared -o $@ $^ -lcap
-libminijailpreload.so : libminijailpreload.c libminijail.o
+libminijailpreload.so : libminijailpreload.c libsyscalls.gen.o libminijail.o
$(CC) $(CFLAGS) -shared -o $@ $^ -ldl -lcap
libminijail.o : libminijail.c libminijail.h
+
+libsyscalls.gen.o : libsyscalls.gen.c libsyscalls.h
+
+# sed expression which extracts system calls that are
+# defined via asm/unistd.h. It converts them from:
+# #define __NR_read
+# to:
+# #ifdef __NR_read
+# { "read", __NR_read },
+# #endif
+# All other lines will not be emitted. The sed expression lives in its
+# own macro to allow clean line wrapping.
+define sed-multiline
+ 's/#define \(__NR_\)\([a-z0-9_]*\)$$/#ifdef \1\2\n\
+ { "\2", \1\2 },\n#endif/g p;'
+endef
+
+# Generates a C source file with a system call table made up of "name",
+# syscall_nr entries by including the build target <asm/unistd.h> and
+# emitting the list of defines. Use of the compiler is needed to
+# dereference the actual provider of syscall definitions.
+# E.g., asm/unistd_32.h or asm/unistd_64.h, etc.
+define gen_syscalls
+ (set -e; \
+ echo '/* GENERATED BY MAKEFILE */'; \
+ echo '#include <stddef.h>'; \
+ echo '#include <asm/unistd.h>'; \
+ echo '#include "libsyscalls.h"'; \
+ echo "const struct syscall_entry syscall_table[] = {"; \
+ echo "#include <asm/unistd.h>" | \
+ $(CC) $(CFLAGS) -dN - -E | sed -ne $(sed-multiline); \
+ echo " { NULL, -1 },"; \
+ echo "};" ) > $1
+endef
+
+# Only regenerate libsyscalls.gen.c if the Makefile or header changes.
+# NOTE! This will not detect if the file is not appropriate for the target.
+libsyscalls.gen.c : Makefile libsyscalls.h
+ @printf "Generating target-arch specific $@ . . . "
+ @$(call gen_syscalls,$@)
+ @printf "done.\n"
+
+clean :  # removes the outputs of `all` plus the generated syscall table
+	@rm -f libminijail.o libminijail.so libminijailpreload.so minijail0
+	@rm -f libsyscalls.gen.c libsyscalls.gen.o
diff --git a/libminijail-private.h b/libminijail-private.h
index 9912af1..923d06c 100644
--- a/libminijail-private.h
+++ b/libminijail-private.h
@@ -13,4 +13,6 @@
static const char *kCommandEnvVar = "__MINIJAIL_PRELOAD";
static const char *kLdPreloadEnvVar = "LD_PRELOAD";
+#define MINIJAIL_MAX_SECCOMP_FILTER_LINE 512
+
#endif /* !LIBMINIJAIL_PRIVATE_H */
diff --git a/libminijail.c b/libminijail.c
index 2afa236..08482e8 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -1,9 +1,11 @@
/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
#define _BSD_SOURCE
#define _GNU_SOURCE
+#include <ctype.h>
#include <errno.h>
#include <grp.h>
#include <inttypes.h>
@@ -24,8 +26,24 @@
#include <unistd.h>
#include "libminijail.h"
+#include "libsyscalls.h"
#include "libminijail-private.h"
+/* Until these are reliably available in linux/prctl.h */
+#ifndef PR_SET_SECCOMP_FILTER
+# define PR_SECCOMP_FILTER_SYSCALL 0
+# define PR_SECCOMP_FILTER_EVENT 1
+# define PR_GET_SECCOMP_FILTER 35
+# define PR_SET_SECCOMP_FILTER 36
+# define PR_CLEAR_SECCOMP_FILTER 37
+#endif
+
+struct seccomp_filter {  /* one node of a minijail's circular filter list */
+ int nr;  /* system call number the filter applies to */
+ char *filter;  /* heap-allocated filter string; freed by minijail_destroy() */
+ struct seccomp_filter *next, *prev;  /* ring links; a lone node points at itself */
+};
+
struct minijail {
struct {
int uid : 1;
@@ -37,6 +55,7 @@
int readonly : 1;
int usergroups : 1;
int ptrace : 1;
+ int seccomp_filter : 1;
} flags;
uid_t uid;
gid_t gid;
@@ -44,17 +63,19 @@
const char *user;
uint64_t caps;
pid_t initpid;
+ struct seccomp_filter *filters;
};
-static void pdie(const char *failed) {
- syslog(LOG_ERR, "libminijail: %s failed: %s", failed, strerror(errno));
- abort();
-}
+#define die(_msg, ...) do { \
+ syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
+ abort(); \
+} while (0)
-static void die(const char *failed) {
- syslog(LOG_ERR, "libminijail: %s", failed);
- abort();
-}
+#define pdie(_msg, ...) \
+ die(_msg ": %s", ## __VA_ARGS__, strerror(errno))
+
+#define warn(_msg, ...) \
+ syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
struct minijail *minijail_new(void) {
struct minijail *j = malloc(sizeof(*j));
@@ -117,6 +138,10 @@
j->flags.seccomp = 1;
}
+void minijail_use_seccomp_filter(struct minijail *j) {  /* opt in to filter mode */
+ j->flags.seccomp_filter = 1;  /* minijail_enter() then calls prctl(PR_SET_SECCOMP, 13) */
+}
+
void minijail_use_caps(struct minijail *j, uint64_t capmask) {
j->caps = capmask;
j->flags.caps = 1;
@@ -143,6 +168,104 @@
j->flags.ptrace = 1;
}
+int minijail_add_seccomp_filter(struct minijail *j, int nr,
+                                const char *filter) {
+  /* Appends a copy of |filter| (at most MINIJAIL_MAX_SECCOMP_FILTER_LINE bytes)
+   * for syscall |nr| to j's ring.  Returns 0, or -EINVAL / -ENOMEM. */
+  struct seccomp_filter *node, *head, *tail;
+
+  if (nr < 0 || !filter)
+    return -EINVAL;
+  node = malloc(sizeof(*node));
+  if (!node)
+    return -ENOMEM;
+  node->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
+  if (!node->filter) {
+    free(node);
+    return -ENOMEM;
+  }
+  node->nr = nr;
+
+  /* In an empty list the new node is its own neighbour on both sides. */
+  head = j->filters ? j->filters : node;
+  tail = j->filters ? j->filters->prev : node;
+  node->next = head;
+  node->prev = tail;
+  tail->next = node;
+  head->prev = node;
+  j->filters = head;
+  return 0;
+}
+
+int minijail_lookup_syscall(const char *name) {  /* -1 if unknown or NULL */
+  const struct syscall_entry *entry = syscall_table;
+  for (; name && entry->name && entry->nr >= 0; ++entry)  /* stop at sentinel */
+    if (!strcmp(entry->name, name))
+      return entry->nr;
+  return -1;
+}
+
+static char *strip(char *s) {  /* trims blanks in place; returns the new start */
+  char *end;
+  while (*s && isblank((unsigned char)*s))  /* <ctype.h> needs unsigned char */
+    s++;
+  end = s + strlen(s);  /* one past the last char, so "" cannot underflow */
+  while (end > s && (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
+    end--;
+  *end = '\0';
+  return s;
+}
+
+void minijail_parse_seccomp_filters(struct minijail *j, const char *path) {
+  FILE *file = fopen(path, "r");
+  char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
+  int count;  /* 1-based number of the line being parsed, for diagnostics */
+  if (!file)
+    pdie("failed to open seccomp filters file");
+
+  /* One rule per line; a malformed line aborts via die().  Accepted forms:
+   *   syscall_name_or_number<COLON><FILTER STRING>[\n|EOF]
+   *   #...comment...   (may itself contain a colon)
+   *   <empty line>
+   */
+  for (count = 1; fgets(line, sizeof(line), file); ++count) {
+    char *filter = line;
+    char *name = strsep(&filter, ":");
+    char *name_end = NULL;
+    int nr = -1;
+    int err;
+
+    if (!name)
+      die("invalid filter on line %d", count);
+    name = strip(name);
+
+    /* Allow comment lines, checked first so they need not contain a colon */
+    if (*name == '#')
+      continue;
+
+    if (!filter) {
+      if (strlen(name))
+        die("invalid filter on line %d", count);
+      /* Allow empty lines */
+      continue;
+    }
+    filter = strip(filter);
+
+    /* Take direct syscall numbers */
+    nr = strtol(name, &name_end, 0);
+    /* Or fail-over to using names; an empty name is not the number 0 */
+    if (name_end == name || *name_end != '\0')
+      nr = minijail_lookup_syscall(name);
+    if (nr < 0)
+      die("syscall '%s' unknown", name);
+
+    err = minijail_add_seccomp_filter(j, nr, filter);
+    if (err)  /* the callee returns -errno and does not set errno itself */
+      die("failed to add filter for syscall '%s': %s", name, strerror(-err));
+  }
+  fclose(file);
+}
+
static int remount_readonly(void) {
const char *kProcPath = "/proc";
const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
@@ -191,10 +314,59 @@
}
}
+static int setup_seccomp_filters(const struct minijail *j) {
+  const struct seccomp_filter *sf = j->filters;
+  int ret = 0;
+  int broaden = 0;  /* set while retrying the current filter as plain "1" */
+
+  /* Installs each queued filter via prctl(); aborts on failure, else returns 0. */
+  if (!sf)
+    return ret;
+
+  do {  /* walk the circular list once, starting at its head */
+    errno = 0;
+    ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
+                sf->nr, broaden ? "1" : sf->filter);
+    if (ret) {
+      switch (errno) {
+      case ENOSYS:
+        /* TODO(wad) make this a config option */
+        if (broaden)
+          die("CONFIG_SECCOMP_FILTER is not supported by your kernel");
+        warn("missing CONFIG_FTRACE_SYSCALLS; relaxing the filter for %d",
+             sf->nr);
+        broaden = 1;
+        continue;  /* goes to the loop test, which retries while broaden is set */
+      case E2BIG:
+        warn("seccomp filter too long: %d", sf->nr);
+        pdie("filter too long");
+      case ENOSPC:
+        pdie("too many seccomp filters");
+      case EPERM:
+        warn("syscall filter disallowed for %d", sf->nr);
+        pdie("failed to install seccomp filter");
+      case EINVAL:
+        warn("seccomp filter or call method is invalid. %d:'%s'",
+             sf->nr, sf->filter);  /* fallthrough */
+      default:
+        pdie("failed to install seccomp filter");
+      }
+    }
+    sf = sf->next;
+    broaden = 0;
+  } while (broaden || sf != j->filters);  /* broaden: retry even at the head */
+  return ret;
+}
+
void minijail_enter(const struct minijail *j) {
+ int ret;
if (j->flags.pids)
die("tried to enter a pid-namespaced jail; try minijail_run()?");
+ ret = setup_seccomp_filters(j);
+ if (j->flags.seccomp_filter && ret)
+ die("failed to configure seccomp filters");
+
if (j->flags.usergroups && !j->user)
die("usergroup inheritance without username");
@@ -217,10 +389,11 @@
pdie("prctl(PR_SET_SECUREBITS)");
}
- if (j->flags.usergroups && initgroups(j->user, j->usergid))
+ if (j->flags.usergroups && initgroups(j->user, j->usergid)) {
pdie("initgroups");
- else if (!j->flags.usergroups && setgroups(0, NULL))
+ } else if (!j->flags.usergroups && setgroups(0, NULL)) {
pdie("setgroups");
+ }
if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
pdie("setresgid");
@@ -233,6 +406,9 @@
/* seccomp has to come last since it cuts off all the other
* privilege-dropping syscalls :) */
+ if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
+ pdie("prctl(PR_SET_SECCOMP, 13)");
+
if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
pdie("prctl(PR_SET_SECCOMP)");
}
@@ -292,6 +468,10 @@
j->flags.ptrace = 0;
j->flags.seccomp = 0;
+ if (j->flags.seccomp_filter)
+ warn("TODO(wad) seccomp_filter is installed in the parent which "
+ "requires overly permissive rules for execve(2)ing.");
+
r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps);
if (!r) {
/* No commands generated, so no preload needed :) */
@@ -405,5 +585,15 @@
}
void minijail_destroy(struct minijail *j) {
+  struct seccomp_filter *f = j ? j->filters : NULL;  /* NULL j stays a no-op */
+  /* Break the ring at the tail so the walk below terminates */
+  if (f)
+    f->prev->next = NULL;
+  while (f) {
+    struct seccomp_filter *next = f->next;
+    free(f->filter);
+    free(f);
+    f = next;
+  }
free(j);
}
diff --git a/libminijail.h b/libminijail.h
index 0df119e..6d36b85 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -1,6 +1,7 @@
/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
/* The general pattern of use here:
* 1) Construct a minijail with minijail_new()
@@ -41,6 +42,10 @@
/* 'group' should be kept valid until minijail_destroy() */
int minijail_change_group(struct minijail *j, const char *group);
void minijail_use_seccomp(struct minijail *j);
+void minijail_use_seccomp_filter(struct minijail *j);
+void minijail_parse_seccomp_filters(struct minijail *j, const char *path);
+int minijail_add_seccomp_filter(struct minijail *j, int nr,
+ const char *filter);
void minijail_use_caps(struct minijail *j, uint64_t capmask);
void minijail_namespace_vfs(struct minijail *j);
void minijail_namespace_pids(struct minijail *j);
@@ -48,6 +53,13 @@
void minijail_inherit_usergroups(struct minijail *j);
void minijail_disable_ptrace(struct minijail *j);
+/* Exposes minijail's name-to-int mapping for system calls for the
+ * architecture it was built on. This is primarily exposed for
+ * minijail_add_seccomp_filter() and testing.
+ * Returns the system call number on success or -1 on failure.
+ */
+int minijail_lookup_syscall(const char *name);
+
/* Lock this process into the given minijail. Note that this procedure cannot fail,
* since there is no way to undo privilege-dropping; therefore, if any part of
* the privilege-drop fails, minijail_enter() will abort the entire process.
diff --git a/libsyscalls.h b/libsyscalls.h
new file mode 100644
index 0000000..d31ec8f
--- /dev/null
+++ b/libsyscalls.h
@@ -0,0 +1,15 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef MINIJAIL_LIBSYSCALLS_H_
+#define MINIJAIL_LIBSYSCALLS_H_
+#include <sys/types.h>
+
+struct syscall_entry {  /* one row of the generated syscall_table */
+ const char *name;  /* syscall name without the __NR_ prefix; NULL in the sentinel row */
+ int nr;  /* the matching __NR_<name> value; -1 in the sentinel row */
+};
+
+extern const struct syscall_entry syscall_table[];
+#endif /* MINIJAIL_LIBSYSCALLS_H_ */
diff --git a/minijail0.1 b/minijail0.1
index 15ceeca..6aedb81 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -28,6 +28,11 @@
\fB-h\fR
Print a help message.
.TP
+\fB-H\fR
+Print a help message detailing supported system call names for seccomp_filter.
+(Raw system call numbers may be used instead when minijail0 is out of sync with
+the host kernel or when 32/64-bit compatibility issues exist.)
+.TP
\fB-p\fR
Run inside a new PID namespace. This option will make it impossible for the
program to see or affect processes that are not its descendants.
@@ -40,7 +45,12 @@
.TP
\fB-s\fR
Enable seccomp(2) in mode 1, which restricts the child process to a very small
-set of system calls. Support for more elaborate syscall filtering is coming.
+set of system calls.
+.TP
+\fB-S <arch-specific seccomp_filter policy file>\fR
+Enable seccomp(2) in mode 13 which restricts the child process to a set of
+system calls defined in the policy file. Note that system calls often change
+names based on the architecture or mode. (uname -m is your friend.)
.TP
\fB-u <user>\fR
Change users to \fIuser\fR, which may be either a user name or a numeric user
@@ -68,4 +78,4 @@
Copyright \(co 2011 The Chromium OS Authors
License BSD-like.
.SH "SEE ALSO"
-\fBlibminijail.h\fR
+\fBlibminijail.h\fR, \fBminijail0\fR(5)
diff --git a/minijail0.5 b/minijail0.5
new file mode 100644
index 0000000..b9036b9
--- /dev/null
+++ b/minijail0.5
@@ -0,0 +1,85 @@
+.TH MINIJAIL0 "5" "July 2011" "Chromium OS" "File Formats"
+.SH NAME
+minijail0 \- sandbox a process
+.SH DESCRIPTION
+.PP
+Runs PROGRAM inside a sandbox. See \fBminijail0\fR(1) for details.
+.SH EXAMPLES
+
+Safely switch from root to nobody while dropping all capabilities and
+inheriting any groups from nobody:
+
+ # minijail0 -c 0 -G -u nobody /usr/bin/whoami
+ nobody
+
+Run in a PID and VFS namespace without superuser capabilities (but still
+as root) and with a private view of /proc:
+
+ # minijail0 -p -v -r -c 0 /bin/ps
+ PID TTY TIME CMD
+ 1 pts/0 00:00:00 minijail0
+ 2 pts/0 00:00:00 ps
+
+Running a process with a seccomp filter policy at reduced privileges:
+
+ # minijail0 -S /usr/share/minijail0/$(uname -m)/cat.policy -- \\
+ /bin/cat /proc/self/seccomp_filter
+ ...
+
+.SH SECCOMP_FILTER POLICY
+The policy file supplied to the \fB-S\fR argument supports the following syntax:
+
+ \fB<syscall_name>\fR:\fB<ftrace filter policy>\fR
+ \fB<syscall_number>\fR:\fB<ftrace filter policy>\fR
+ \fB<empty line>\fR
+ \fB# any single line comment\fR
+
+A policy that emulates seccomp(2) in mode 1 may look like:
+ read: 1
+ write: 1
+ sigreturn: 1
+ exit: 1
+
+The "1" acts as a wildcard and allows any use of the mentioned system
+call. More advanced filtering is possible if your kernel supports
+CONFIG_FTRACE_SYSCALLS. For example, we can allow a process to open any
+file read only and mmap PROT_READ only:
+
+ # open with O_LARGEFILE|O_RDONLY|O_NONBLOCK or some combination
+ open: flags == 32768 || flags == 0 || flags == 34816 || flags == 2048
+ mmap2: prot == 0x1
+ munmap: 1
+ close: 1
+
+The supported arguments may be found by reviewing the system call
+prototypes in the Linux kernel source code. Be aware that any
+non-numeric comparison may be subject to time-of-check-time-of-use
+attacks and cannot be considered safe.
+
+\fBexecve\fR may only be used when invoking with CAP_SYS_ADMIN privileges.
+
+.SH SECCOMP_FILTER POLICY WRITING
+
+Determining policy for seccomp_filter can be time consuming. System
+calls are often named in arch-specific, or legacy tainted, ways. E.g.,
+geteuid versus geteuid32. On process death due to a seccomp filter
+rule, the offending system call number will be supplied with a best
+guess of the ABI defined name. This information may be used to produce
+working baseline policies. However, if the process being contained has
+a fairly tight working domain, using \fBstrace -e raw=all <program>\fR
+can generate the list of system calls that are needed. Note that when
+using libminijail or minijail with preloading, supporting initial
+process setup calls will not be required. Be conservative.
+
+It's also possible to analyze the binary checking for all non-dead
+functions and determining if any of them issue system calls. There is
+no active implementation for this, but something like
+code.google.com/p/seccompsandbox is one possible runtime variant.
+
+.SH AUTHOR
+The Chromium OS Authors <[email protected]>
+.SH COPYRIGHT
+Copyright \(co 2011 The Chromium OS Authors
+License BSD-like.
+.SH "SEE ALSO"
+\fBminijail0\fR(1)
diff --git a/minijail0.c b/minijail0.c
index 654f332..9e2bece 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -1,6 +1,7 @@
/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
#include <stdio.h>
#include <stdlib.h>
@@ -8,6 +9,7 @@
#include <unistd.h>
#include "libminijail.h"
+#include "libsyscalls.h"
static void set_user(struct minijail *j, const char *arg) {
char *end = NULL;
@@ -49,23 +51,36 @@
}
static void usage(const char *progn) {
- printf("Usage: %s [-Ghprsv] [-c <caps>] [-g <group>] [-u <user>] <program> [args...]\n"
+ printf("Usage: %s [-Ghprsv] [-c <caps>] [-g <group>] [-S <file>] [-u <user>] "
+ "<program> [args...]\n"
" -c: restrict caps to <caps>\n"
" -G: inherit groups from uid\n"
" -g: change gid to <group>\n"
" -h: help (this message)\n"
+ " -H: seccomp filter help message\n"
" -p: use pid namespace\n"
" -r: remount filesystems readonly (implies -v)\n"
" -s: use seccomp\n"
+ " -S: set seccomp filters using <file>\n"
+ " E.g., -S /usr/share/blah/seccomp_filters.$(uname -m)\n"
" -u: change uid to <user>\n"
" -v: use vfs namespace\n", progn);
}
+static void seccomp_filter_usage(const char *progn) {  /* output of the -H flag */
+  const struct syscall_entry *sc;
+  printf("Usage: %s -S <policy.file> <program> [args...]\n\n", progn);
+  printf("System call names supported:\n");
+  for (sc = syscall_table; sc->name && sc->nr >= 0; sc++)  /* stop at sentinel */
+    printf(" %s [%d]\n", sc->name, sc->nr);
+  printf("\nSee minijail0(5) for example policies.\n");
+}
+
int main(int argc, char *argv[]) {
struct minijail *j = minijail_new();
int opt;
- while ((opt = getopt(argc, argv, "u:g:sc:vrGhp")) != -1) {
+ while ((opt = getopt(argc, argv, "u:g:sS:c:vrGhHp")) != -1) {
switch (opt) {
case 'u':
set_user(j, optarg);
@@ -76,6 +91,10 @@
case 's':
minijail_use_seccomp(j);
break;
+ case 'S':
+ minijail_parse_seccomp_filters(j, optarg);
+ minijail_use_seccomp_filter(j);
+ break;
case 'c':
use_caps(j, optarg);
break;
@@ -91,6 +110,9 @@
case 'p':
minijail_namespace_pids(j);
break;
+ case 'H':
+ seccomp_filter_usage(argv[0]);
+ exit(1);
default:
usage(argv[0]);
exit(1);