libminijail,minijail0: add seccomp filter support

This change adds support for installing seccomp filters via libminijail
or by using minijail0 with an arch-specific filters file.

Support for LD_PRELOAD marshalling is still missing and will come in a new change.

BUG=chromium-os:19459
TEST=minijail0 -r -S dash-cat.policy -u chronos -- /bin/dash -c '/bin/cat /proc/self/seccomp_filter'
dash-cat.policy can be found  in the bug.
built for arm-generic, tegra2_seaboard, and x86-alex.  Tested on x86-alex as above and with -H.

Change-Id: I3cac97d1df62f70cd546763aeca8f52dd0aea09d
Reviewed-on: http://gerrit.chromium.org/gerrit/7773
Reviewed-by: Elly Jones <[email protected]>
Tested-by: Will Drewry <[email protected]>
diff --git a/Makefile b/Makefile
index 52fdcab..37e99d9 100644
--- a/Makefile
+++ b/Makefile
@@ -7,13 +7,58 @@
 
 all : minijail0 libminijail.so libminijailpreload.so
 
-minijail0 : libminijail.o minijail0.c
+minijail0 : libsyscalls.gen.o libminijail.o minijail0.c
 	$(CC) $(CFLAGS) -o $@ $^ -lcap
 
-libminijail.so : libminijail.o
+libminijail.so : libminijail.o libsyscalls.gen.o
 	$(CC) $(CFLAGS) -shared -o $@ $^ -lcap
 
-libminijailpreload.so : libminijailpreload.c libminijail.o
+libminijailpreload.so : libminijailpreload.c libsyscalls.gen.o libminijail.o
 	$(CC) $(CFLAGS) -shared -o $@ $^ -ldl -lcap
 
 libminijail.o : libminijail.c libminijail.h
+
+libsyscalls.gen.o : libsyscalls.gen.c libsyscalls.h
+
+# sed expression which extracts system calls that are
+# defined via asm/unistd.h.  It converts them from:
+#  #define __NR_read
+# to:
+# #ifdef __NR_read
+#  { "read", __NR_read },
+# #endif
+# All other lines will not be emitted.  The sed expression lives in its
+# own macro to allow clean line wrapping.
+define sed-multiline
+	's/#define \(__NR_\)\([a-z0-9_]*\)$$/#ifdef \1\2\n\
+	 { "\2", \1\2 },\n#endif/g p;'
+endef
+
+# Generates a header file with a system call table made up of "name",
+# syscall_nr entries by including the build target <asm/unistd.h> and
+# emitting the list of defines.  Use of the compiler is needed to
+# dereference the actual provider of syscall definitions.
+#   E.g., asm/unistd_32.h or asm/unistd_64.h, etc.
+define gen_syscalls
+	(set -e; \
+	 echo '/* GENERATED BY MAKEFILE */'; \
+	 echo '#include <stddef.h>'; \
+	 echo '#include <asm/unistd.h>'; \
+	 echo '#include "libsyscalls.h"'; \
+	 echo "const struct syscall_entry syscall_table[] = {"; \
+	 echo "#include <asm/unistd.h>" | \
+	   $(CC) $(CFLAGS) -dN - -E | sed -ne $(sed-multiline); \
+	 echo "  { NULL, -1 },"; \
+	 echo "};" ) > $1
+endef
+
+# Only regenerate libsyscalls.gen.c if the Makefile or header changes.
+# NOTE! This will not detect if the file is not appropriate for the target.
+libsyscalls.gen.c : Makefile libsyscalls.h
+	@printf "Generating target-arch specific $@ . . . "
+	@$(call gen_syscalls,$@)
+	@printf "done.\n"
+
+clean :
+	@rm -f libminijail.o libminijail.so libminijailpreload.so minijail0
+	@rm -f libsyscalls.gen.c libsyscalls.gen.o
diff --git a/libminijail-private.h b/libminijail-private.h
index 9912af1..923d06c 100644
--- a/libminijail-private.h
+++ b/libminijail-private.h
@@ -13,4 +13,6 @@
 static const char *kCommandEnvVar = "__MINIJAIL_PRELOAD";
 static const char *kLdPreloadEnvVar = "LD_PRELOAD";
 
+#define MINIJAIL_MAX_SECCOMP_FILTER_LINE 512
+
 #endif /* !LIBMINIJAIL_PRIVATE_H */
diff --git a/libminijail.c b/libminijail.c
index 2afa236..08482e8 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -1,9 +1,11 @@
 /* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
  * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
 
 #define _BSD_SOURCE
 #define _GNU_SOURCE
+#include <ctype.h>
 #include <errno.h>
 #include <grp.h>
 #include <inttypes.h>
@@ -24,8 +26,24 @@
 #include <unistd.h>
 
 #include "libminijail.h"
+#include "libsyscalls.h"
 #include "libminijail-private.h"
 
+/* Until these are reliably available in linux/prctl.h */
+#ifndef PR_SET_SECCOMP_FILTER
+#  define PR_SECCOMP_FILTER_SYSCALL 0
+#  define PR_SECCOMP_FILTER_EVENT 1
+#  define PR_GET_SECCOMP_FILTER 35
+#  define PR_SET_SECCOMP_FILTER 36
+#  define PR_CLEAR_SECCOMP_FILTER 37
+#endif
+
+struct seccomp_filter {
+  int nr;        /* syscall number handed to prctl(PR_SET_SECCOMP_FILTER) */
+  char *filter;  /* heap-allocated filter string; freed by minijail_destroy() */
+  struct seccomp_filter *next, *prev;  /* links in a circular list */
+};
+
 struct minijail {
   struct {
     int uid : 1;
@@ -37,6 +55,7 @@
     int readonly : 1;
     int usergroups : 1;
     int ptrace : 1;
+    int seccomp_filter : 1;
   } flags;
   uid_t uid;
   gid_t gid;
@@ -44,17 +63,19 @@
   const char *user;
   uint64_t caps;
   pid_t initpid;
+  struct seccomp_filter *filters;
 };
 
-static void pdie(const char *failed) {
-  syslog(LOG_ERR, "libminijail: %s failed: %s", failed, strerror(errno));
-  abort();
-}
+#define die(_msg, ...) do { \
+  syslog(LOG_ERR, "libminijail: " _msg, ## __VA_ARGS__); \
+  abort(); \
+} while (0)
 
-static void die(const char *failed) {
-  syslog(LOG_ERR, "libminijail: %s", failed);
-  abort();
-}
+#define pdie(_msg, ...) \
+  die(_msg ": %s", ## __VA_ARGS__, strerror(errno))
+
+#define warn(_msg, ...) \
+  syslog(LOG_WARNING, "libminijail: " _msg, ## __VA_ARGS__)
 
 struct minijail *minijail_new(void) {
   struct minijail *j = malloc(sizeof(*j));
@@ -117,6 +138,10 @@
   j->flags.seccomp = 1;
 }
 
+void minijail_use_seccomp_filter(struct minijail *j) {
+  j->flags.seccomp_filter = 1;  /* read back by minijail_enter() */
+}
+
 void minijail_use_caps(struct minijail *j, uint64_t capmask) {
   j->caps = capmask;
   j->flags.caps = 1;
@@ -143,6 +168,104 @@
   j->flags.ptrace = 1;
 }
 
+/* Appends a copy of |filter| for syscall |nr| to the tail of j's circular
+ * filter list.  Returns 0, or -EINVAL / -ENOMEM without modifying the list.
+ */
+int minijail_add_seccomp_filter(struct minijail *j, int nr,
+                                const char *filter) {
+  struct seccomp_filter *entry, *head;
+  if (nr < 0 || !filter)
+    return -EINVAL;
+
+  entry = malloc(sizeof(*entry));
+  if (!entry)
+    return -ENOMEM;
+  entry->filter = strndup(filter, MINIJAIL_MAX_SECCOMP_FILTER_LINE);
+  if (!entry->filter) {
+    free(entry);
+    return -ENOMEM;
+  }
+  entry->nr = nr;
+
+  /* An empty list makes the new entry its own neighbour on both sides. */
+  head = j->filters ? j->filters : entry;
+  entry->next = head;
+  entry->prev = (head == entry) ? entry : head->prev;
+  entry->prev->next = entry;
+  head->prev = entry;
+  j->filters = head;
+  return 0;
+}
+
+int minijail_lookup_syscall(const char *name) {
+  size_t i;  /* scan stops at the table's terminating sentinel entry */
+  for (i = 0; syscall_table[i].name && syscall_table[i].nr >= 0; i++)
+    if (strcmp(syscall_table[i].name, name) == 0)
+      return syscall_table[i].nr;
+  return -1;
+}
+
+/* Trims leading blanks and trailing blanks/newlines in place; returns start. */
+static char *strip(char *s) {
+  while (*s && isblank((unsigned char)*s))
+    s++;
+  char *end = s + strlen(s);  /* one past the last char: "" cannot underflow */
+  while (end > s && (isblank((unsigned char)end[-1]) || end[-1] == '\n'))
+    end--;
+  *end = '\0';
+  return s;
+}
+
+/* Loads seccomp filters from |path| into |j|; dies on any malformed line.
+ * Format is simple:
+ * syscall_name<COLON><FILTER STRING>[\n|EOF]
+ * #...comment...
+ * <empty line>
+ */
+void minijail_parse_seccomp_filters(struct minijail *j, const char *path) {
+  FILE *file = fopen(path, "r");
+  char line[MINIJAIL_MAX_SECCOMP_FILTER_LINE];
+  int count = 0;
+  if (!file)
+    pdie("failed to open seccomp filters file");
+
+  while (fgets(line, sizeof(line), file)) {
+    char *filter = line;
+    char *name = strsep(&filter, ":");
+    char *name_end = NULL;
+    int nr = -1;
+
+    count++;  /* 1-based number of the line just read, for diagnostics */
+    if (!name)
+      die("invalid filter on line %d", count);
+
+    name = strip(name);
+    /* Allow comment lines, whether or not they contain a ':' */
+    if (*name == '#')
+      continue;
+
+    if (!filter) {
+      if (strlen(name))
+        die("invalid filter on line %d", count);
+      /* Allow empty lines */
+      continue;
+    }
+    filter = strip(filter);
+
+    /* Take direct syscall numbers */
+    nr = strtol(name, &name_end, 0);
+    /* Or fail-over to using names; an empty name is not the number 0 */
+    if (name_end == name || *name_end != '\0')
+      nr = minijail_lookup_syscall(name);
+    if (nr < 0)
+      die("syscall '%s' unknown", name);
+
+    if (minijail_add_seccomp_filter(j, nr, filter))
+      pdie("failed to add filter for syscall '%s'", name);
+  }
+  fclose(file);
+}
+
 static int remount_readonly(void) {
   const char *kProcPath = "/proc";
   const unsigned int kSafeFlags = MS_NODEV | MS_NOEXEC | MS_NOSUID;
@@ -191,10 +314,59 @@
   }
 }
 
+/* Installs every filter on j's list via prctl(PR_SET_SECCOMP_FILTER).
+ * Returns 0 on success; every failure path aborts via die()/pdie(). */
+static int setup_seccomp_filters(const struct minijail *j) {
+  const struct seccomp_filter *sf = j->filters;
+  int ret = 0;
+  int broaden = 0;
+  /* No filters installed isn't necessarily an error. */
+  if (!sf)
+    return ret;
+  do {
+    errno = 0;
+    ret = prctl(PR_SET_SECCOMP_FILTER, PR_SECCOMP_FILTER_SYSCALL,
+                    sf->nr, broaden ? "1" : sf->filter);
+    if (ret) {
+      switch (errno) {
+        case ENOSYS:
+          /* TODO(wad) make this a config option */
+          if (broaden)
+            die("CONFIG_SECCOMP_FILTER is not supported by your kernel");
+          warn("missing CONFIG_FTRACE_SYSCALLS; relaxing the filter for %d",
+               sf->nr);
+          broaden = 1;
+          continue;  /* retry this same syscall with the "1" wildcard */
+        case E2BIG:
+          warn("seccomp filter too long: %d", sf->nr);
+          pdie("filter too long");
+        case ENOSPC:
+          pdie("too many seccomp filters");
+        case EPERM:
+          warn("syscall filter disallowed for %d", sf->nr);
+          pdie("failed to install seccomp filter");
+        case EINVAL:
+          warn("seccomp filter or call method is invalid. %d:'%s'",
+               sf->nr, sf->filter);  /* falls through to the generic pdie */
+        default:
+          pdie("failed to install seccomp filter");
+      }
+    }
+    sf = sf->next;
+    broaden = 0;
+  } while (broaden || sf != j->filters);  /* a retry "continue" lands here */
+  return ret;
+}
+
 void minijail_enter(const struct minijail *j) {
+  int ret;
   if (j->flags.pids)
     die("tried to enter a pid-namespaced jail; try minijail_run()?");
 
+  ret = setup_seccomp_filters(j);
+  if (j->flags.seccomp_filter && ret)
+    die("failed to configure seccomp filters");
+
   if (j->flags.usergroups && !j->user)
     die("usergroup inheritance without username");
 
@@ -217,10 +389,11 @@
       pdie("prctl(PR_SET_SECUREBITS)");
   }
 
-  if (j->flags.usergroups && initgroups(j->user, j->usergid))
+  if (j->flags.usergroups && initgroups(j->user, j->usergid)) {
     pdie("initgroups");
-  else if (!j->flags.usergroups && setgroups(0, NULL))
+  } else if (!j->flags.usergroups && setgroups(0, NULL)) {
     pdie("setgroups");
+  }
 
   if (j->flags.gid && setresgid(j->gid, j->gid, j->gid))
     pdie("setresgid");
@@ -233,6 +406,9 @@
 
   /* seccomp has to come last since it cuts off all the other
    * privilege-dropping syscalls :) */
+  if (j->flags.seccomp_filter && prctl(PR_SET_SECCOMP, 13))
+        pdie("prctl(PR_SET_SECCOMP, 13)");
+
   if (j->flags.seccomp && prctl(PR_SET_SECCOMP, 1))
     pdie("prctl(PR_SET_SECCOMP)");
 }
@@ -292,6 +468,10 @@
   j->flags.ptrace = 0;
   j->flags.seccomp = 0;
 
+  if (j->flags.seccomp_filter)
+    warn("TODO(wad) seccomp_filter is installed in the parent which "
+         "requires overly permissive rules for execve(2)ing.");
+
   r = snprintf(envbuf, kEnvBufSize, "%s%s%s%s", setuid, ptrace, seccomp, caps);
   if (!r) {
     /* No commands generated, so no preload needed :) */
@@ -405,5 +585,15 @@
 }
 
 void minijail_destroy(struct minijail *j) {
+  struct seccomp_filter *cur = j->filters;
+  /* Break the ring at the tail so the walk below terminates. */
+  if (cur)
+    cur->prev->next = NULL;
+  while (cur) {
+    struct seccomp_filter *doomed = cur;
+    cur = cur->next;
+    free(doomed->filter);
+    free(doomed);
+  }
   free(j);
 }
diff --git a/libminijail.h b/libminijail.h
index 0df119e..6d36b85 100644
--- a/libminijail.h
+++ b/libminijail.h
@@ -1,6 +1,7 @@
 /* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
  * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
 
 /* The general pattern of use here:
  * 1) Construct a minijail with minijail_new()
@@ -41,6 +42,10 @@
 /* 'group' should be kept valid until minijail_destroy() */
 int minijail_change_group(struct minijail *j, const char *group);
 void minijail_use_seccomp(struct minijail *j);
+void minijail_use_seccomp_filter(struct minijail *j);
+void minijail_parse_seccomp_filters(struct minijail *j, const char *path);
+int minijail_add_seccomp_filter(struct minijail *j, int nr,
+                                const char *filter);
 void minijail_use_caps(struct minijail *j, uint64_t capmask);
 void minijail_namespace_vfs(struct minijail *j);
 void minijail_namespace_pids(struct minijail *j);
@@ -48,6 +53,13 @@
 void minijail_inherit_usergroups(struct minijail *j);
 void minijail_disable_ptrace(struct minijail *j);
 
+/* Exposes minijail's name-to-int mapping for system calls for the
+ * architecture it was built on.  This is primarily exposed for
+ * minijail_add_seccomp_filter() and testing.
+ * Returns the system call number on success or -1 on failure.
+ */
+int minijail_lookup_syscall(const char *name);
+
 /* Lock this process into the given minijail. Note that this procedure cannot fail,
  * since there is no way to undo privilege-dropping; therefore, if any part of
  * the privilege-drop fails, minijail_enter() will abort the entire process.
diff --git a/libsyscalls.h b/libsyscalls.h
new file mode 100644
index 0000000..d31ec8f
--- /dev/null
+++ b/libsyscalls.h
@@ -0,0 +1,15 @@
+/* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+#ifndef MINIJAIL_LIBSYSCALLS_H_
+#define MINIJAIL_LIBSYSCALLS_H_
+#include <sys/types.h>
+
+struct syscall_entry {
+  const char *name;  /* syscall name, i.e. the __NR_* define minus its prefix */
+  int nr;            /* value of that __NR_* define */
+};
+
+extern const struct syscall_entry syscall_table[];  /* ends with { NULL, -1 } */
+#endif  /* MINIJAIL_LIBSYSCALLS_H_ */
diff --git a/minijail0.1 b/minijail0.1
index 15ceeca..6aedb81 100644
--- a/minijail0.1
+++ b/minijail0.1
@@ -28,6 +28,11 @@
 \fB-h\fR
 Print a help message.
 .TP
+\fB-H\fR
+Print a help message detailing supported system call names for seccomp_filter.
+(Other direct numbers may be specified if minijail0 is not in sync with the
+ host kernel or something like 32/64-bit compatibility issues exist.)
+.TP
 \fB-p\fR
 Run inside a new PID namespace. This option will make it impossible for the
 program to see or affect processes that are not its descendants.
@@ -40,7 +45,12 @@
 .TP
 \fB-s\fR
 Enable seccomp(2) in mode 1, which restricts the child process to a very small
-set of system calls. Support for more elaborate syscall filtering is coming.
+set of system calls.
+.TP
+\fB-S <arch-specific seccomp_filter policy file>\fR
+Enable seccomp(2) in mode 13 which restricts the child process to a set of
+system calls defined in the policy file.  Note that system calls often change
+names based on the architecture or mode. (uname -m is your friend.)
 .TP
 \fB-u <user>\fR
 Change users to \fIuser\fR, which may be either a user name or a numeric user
@@ -68,4 +78,4 @@
 Copyright \(co 2011 The Chromium OS Authors
 License BSD-like.
 .SH "SEE ALSO"
-\fBlibminijail.h\fR
+\fBlibminijail.h\fR \fBminijail0(5)\fR
diff --git a/minijail0.5 b/minijail0.5
new file mode 100644
index 0000000..b9036b9
--- /dev/null
+++ b/minijail0.5
@@ -0,0 +1,85 @@
+.TH MINIJAIL0 "1" "July 2011" "Chromium OS" "User Commands"
+.SH NAME
+minijail0 \- sandbox a process
+.SH DESCRIPTION
+.PP
+Runs PROGRAM inside a sandbox. See minijail(1) for details.
+.SH EXAMPLES
+
+Safely switch from root to nobody while dropping all capabilities and
+inheriting any groups from nobody:
+
+  # minijail0 -c 0 -G -u nobody /usr/bin/whoami
+  nobody
+
+Run in a PID and VFS namespace without superuser capabilities (but still
+as root) and with a private view of /proc:
+
+  # minijail0 -p -v -r -c 0 /bin/ps
+    PID TTY           TIME CMD
+      1 pts/0     00:00:00 minijail0
+      2 pts/0     00:00:00 ps
+
+Running a process with a seccomp filter policy at reduced privileges:
+
+  # minijail0 -S /usr/share/minijail0/$(uname -m)/cat.policy -- \\
+              /bin/cat /proc/self/seccomp_filter
+  ...
+
+.SH SECCOMP_FILTER POLICY
+The policy file supplied to the \fB-S\fR argument supports the following syntax:
+
+  \fB<syscall_name>\fR:\fB<ftrace filter policy>\fR
+  \fB<syscall_number>\fR:\fB<ftrace filter policy>\fR
+  \fB<empty line>\fR
+  \fB# any single line comment\fR
+
+A policy that emulates seccomp(2) in mode 1 may look like:
+  read: 1
+  write: 1
+  sig_return: 1
+  exit: 1
+
+The "1" acts as a wildcard and allows any use of the mentioned system
+call.  More advanced filtering is possible if your kernel supports
+CONFIG_FTRACE_SYSCALLS.  For example, we can allow a process to open any
+file read only and mmap PROT_READ only:
+
+  # open with O_LARGEFILE|O_RDONLY|O_NONBLOCK or some combination
+  open: flags == 32768 || flags == 0 || flags == 34816 || flags == 2048
+  mmap2: prot == 0x0
+  munmap: 1
+  close: 1
+
+The supported arguments may be found by reviewing the system call
+prototypes in the Linux kernel source code.  Be aware that any
+non-numeric comparison may be subject to time-of-check-time-of-use
+attacks and cannot be considered safe.
+
+\fBexecve\fR may only be used when invoking with CAP_SYS_ADMIN privileges.
+
+.SH SECCOMP_FILTER POLICY WRITING
+
+Determining policy for seccomp_filter can be time consuming.  System
+calls are often named in arch-specific, or legacy tainted, ways.  E.g.,
+geteuid versus geteuid32.  On process death due to a seccomp filter
+rule, the offending system call number will be supplied with a best
+guess of the ABI defined name.  This information may be used to produce
+working baseline policies.  However, if the process being contained has
+a fairly tight working domain, using \fBstrace -e raw=all <program>\fR
+can generate the list of system calls that are needed. Note that when
+using libminijail or minijail with preloading, supporting initial
+process setup calls will not be required.  Be conservative.
+
+It's also possible to analyze the binary checking for all non-dead
+functions and determining if any of them issue system calls.  There is
+no active implementation for this, but something like
+code.google.com/p/seccompsandbox is one possible runtime variant.
+
+.SH AUTHOR
+The Chromium OS Authors <[email protected]>
+.SH COPYRIGHT
+Copyright \(co 2011 The Chromium OS Authors
+License BSD-like.
+.SH "SEE ALSO"
+\fBminijail\fR(1)
diff --git a/minijail0.c b/minijail0.c
index 654f332..9e2bece 100644
--- a/minijail0.c
+++ b/minijail0.c
@@ -1,6 +1,7 @@
 /* Copyright (c) 2011 The Chromium OS Authors. All rights reserved.
  * Use of this source code is governed by a BSD-style license that can be
- * found in the LICENSE file. */
+ * found in the LICENSE file.
+ */
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -8,6 +9,7 @@
 #include <unistd.h>
 
 #include "libminijail.h"
+#include "libsyscalls.h"
 
 static void set_user(struct minijail *j, const char *arg) {
   char *end = NULL;
@@ -49,23 +51,36 @@
 }
 
 static void usage(const char *progn) {
-  printf("Usage: %s [-Ghprsv] [-c <caps>] [-g <group>] [-u <user>] <program> [args...]\n"
+  printf("Usage: %s [-Ghprsv] [-c <caps>] [-g <group>] [-S <file>] [-u <user>] "
+         "<program> [args...]\n"
          "  -c: restrict caps to <caps>\n"
          "  -G: inherit groups from uid\n"
          "  -g: change gid to <group>\n"
          "  -h: help (this message)\n"
+         "  -H: seccomp filter help message\n"
          "  -p: use pid namespace\n"
          "  -r: remount filesystems readonly (implies -v)\n"
          "  -s: use seccomp\n"
+         "  -S: set seccomp filters using <file>\n"
+         "      E.g., -S /usr/share/blah/seccomp_filters.$(uname -m)\n"
          "  -u: change uid to <user>\n"
          "  -v: use vfs namespace\n", progn);
 }
 
+static void seccomp_filter_usage(const char *progn) {
+  size_t i = 0;  /* walks syscall_table up to its sentinel entry */
+  printf("Usage: %s -S <policy.file> <program> [args...]\n\n"
+         "System call names supported:\n", progn);
+  for (; syscall_table[i].name && syscall_table[i].nr >= 0; i++)
+    printf("  %s [%d]\n", syscall_table[i].name, syscall_table[i].nr);
+  printf("\nSee minijail0(5) for example policies.\n");
+}
+
 int main(int argc, char *argv[]) {
   struct minijail *j = minijail_new();
 
   int opt;
-  while ((opt = getopt(argc, argv, "u:g:sc:vrGhp")) != -1) {
+  while ((opt = getopt(argc, argv, "u:g:sS:c:vrGhHp")) != -1) {
     switch (opt) {
       case 'u':
         set_user(j, optarg);
@@ -76,6 +91,10 @@
       case 's':
         minijail_use_seccomp(j);
         break;
+      case 'S':
+        minijail_parse_seccomp_filters(j, optarg);
+        minijail_use_seccomp_filter(j);
+        break;
       case 'c':
         use_caps(j, optarg);
         break;
@@ -91,6 +110,9 @@
       case 'p':
         minijail_namespace_pids(j);
         break;
+      case 'H':
+        seccomp_filter_usage(argv[0]);
+        exit(1);
       default:
         usage(argv[0]);
         exit(1);