Block symlinks in bind mount source paths.

Minijail is sometimes asked to bind mount directories owned by
less-privileged users. These less-privileged users can manufacture
a "mount-anything-anywhere" primitive by replacing bind mount paths
with symlinks.

Prevent this by checking whether the bind mount source path is a
canonical path. Because this happens at minijail_bind() time, there is
still the risk of TOCTOU issues. A follow-up patch will re-check things
closer to the mount() call. Unfortunately mount() takes paths so it is
not possible to fully eliminate this race.

Because some files that processes might want to bind mount (like files
in /sys or /dev) can be symlinks, allow users to specify a set of
prefixes exempt from these restrictions. This can also help with
rolling this restriction out (exclude some prefixes, fix the callers,
then remove the exclusions).

A follow-up to this change will add checks on the destination path,
which need to happen after the path is created. Ideally, these would
happen as close as possible to the mount() call.

Bug: 219093918
Test: New unit tests.
Change-Id: Ia747e4318ed4eabf27c64e04e0dd71723735bae2
diff --git a/Android.bp b/Android.bp
index 2b571ae..e771f95 100644
--- a/Android.bp
+++ b/Android.bp
@@ -75,6 +75,7 @@
         "-DALLOW_DEBUG_LOGGING",
         "-DALLOW_DUPLICATE_SYSCALLS",
         "-DDEFAULT_PIVOT_ROOT=\"/var/empty\"",
+        "-DBINDMOUNT_ALLOWED_PREFIXES=\"\"",
         "-Wall",
         "-Werror",
     ],
diff --git a/Makefile b/Makefile
index 6728cdb..3e41b5d 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,18 @@
 endif
 endif
 
+# Prevent Minijail from following symlinks when performing bind mounts.
+# BINDMOUNT_ALLOWED_PREFIXES allows some flexibility. This is especially useful
+# for directories that are not normally modifiable by non-root users.
+# If a process can modify these directories, they probably don't need to mess
+# with Minijail bind mounts to gain root privileges.
+BINDMOUNT_ALLOWED_PREFIXES ?= /dev,/sys
+CPPFLAGS += -DBINDMOUNT_ALLOWED_PREFIXES='"$(BINDMOUNT_ALLOWED_PREFIXES)"'
+BLOCK_SYMLINKS_IN_BINDMOUNT_PATHS ?= no
+ifeq ($(BLOCK_SYMLINKS_IN_BINDMOUNT_PATHS),yes)
+CPPFLAGS += -DBLOCK_SYMLINKS_IN_BINDMOUNT_PATHS
+endif
+
 ifeq ($(USE_ASAN),yes)
 CPPFLAGS += -fsanitize=address -fno-omit-frame-pointer
 LDFLAGS += -fsanitize=address -fno-omit-frame-pointer
diff --git a/libminijail.c b/libminijail.c
index 54d8a74..34db672 100644
--- a/libminijail.c
+++ b/libminijail.c
@@ -873,6 +873,42 @@
 		ACCESS_FS_ROUGHLY_READ | ACCESS_FS_ROUGHLY_FULL_WRITE);
 }
 
+static bool is_valid_bind_path(const char *path)
+{
+	if (!block_symlinks_in_bindmount_paths()) {
+		return true;
+	}
+
+	/*
+	 * tokenize() will modify both the |prefixes| pointer and the contents
+	 * of the string, so:
+	 * -Copy |BINDMOUNT_ALLOWED_PREFIXES| since it lives in .rodata.
+	 * -Save the original pointer for free()ing.
+	 */
+	char *prefixes = strdup(BINDMOUNT_ALLOWED_PREFIXES);
+	attribute_cleanup_str char *orig_prefixes = prefixes;
+	(void)orig_prefixes;
+
+	char *prefix = NULL;
+	bool found_prefix = false;
+	if (!is_canonical_path(path)) {
+		while ((prefix = tokenize(&prefixes, ",")) != NULL) {
+			if (path_is_parent(prefix, path)) {
+				found_prefix = true;
+				break;
+			}
+		}
+		if (!found_prefix) {
+			/*
+			 * If the path does not include one of the allowed
+			 * prefixes, fail.
+			 */
+			warn("path '%s' is not a canonical path", path);
+			return false;
+		}
+	}
+	return true;
+}
 
 int API minijail_mount_with_data(struct minijail *j, const char *src,
 				 const char *dest, const char *type,
@@ -957,6 +993,13 @@
 {
 	unsigned long flags = MS_BIND;
 
+	if (!is_valid_bind_path(src)) {
+		warn("src '%s' is not a valid bind mount path", src);
+		return -ELOOP;
+	}
+
+	/* |dest| might not yet exist. */
+
 	if (!writeable)
 		flags |= MS_RDONLY;
 
diff --git a/libminijail_unittest.cc b/libminijail_unittest.cc
index a6abf1c..eea0053 100644
--- a/libminijail_unittest.cc
+++ b/libminijail_unittest.cc
@@ -1008,8 +1008,32 @@
   minijail_destroy(j);
 }
 
-// Test that bind mounting with a symlink works (but we're about to make this
-// fail).
+// Test that bind mounting onto a non-existing location works.
+TEST(Test, test_bind_mount_nonexistent_dest) {
+  TemporaryDir dir;
+  ASSERT_TRUE(dir.is_valid());
+
+  // minijail_bind() expects absolute paths, but TemporaryDir::path can return
+  // relative paths on Linux.
+  std::string path = dir.path;
+  if (!is_android()) {
+    std::string cwd(getcwd(NULL, 0));
+    path = cwd + "/" + path;
+  }
+
+  std::string path_src = path + "/src";
+  std::string path_dest = path + "/dest";
+
+  EXPECT_EQ(mkdir(path_src.c_str(), 0700), 0);
+
+  ScopedMinijail j(minijail_new());
+  int bind_res = minijail_bind(j.get(), path_src.c_str(), path_dest.c_str(),
+                               0 /*writable*/);
+  EXPECT_EQ(bind_res, 0);
+}
+
+// Test that bind mounting with a symlink behaves according to build-time
+// configuration.
 TEST(Test, test_bind_mount_symlink) {
   TemporaryDir dir;
   ASSERT_TRUE(dir.is_valid());
@@ -1033,7 +1057,11 @@
   ScopedMinijail j(minijail_new());
   int bind_res = minijail_bind(j.get(), path_sym.c_str(), path_dest.c_str(),
                                0 /*writable*/);
-  EXPECT_EQ(bind_res, 0);
+  if (block_symlinks_in_bindmount_paths()) {
+    EXPECT_NE(bind_res, 0);
+  } else {
+    EXPECT_EQ(bind_res, 0);
+  }
   EXPECT_EQ(unlink(path_sym.c_str()), 0);
 }
 
diff --git a/system.c b/system.c
index c8d257a..e260906 100644
--- a/system.c
+++ b/system.c
@@ -547,3 +547,9 @@
 	return sys_seccomp(SECCOMP_SET_MODE_FILTER, flags, NULL) != -1 ||
 	       errno != EINVAL;
 }
+
+bool is_canonical_path(const char *path)
+{
+	attribute_cleanup_str char *rp = realpath(path, NULL);
+	return rp != NULL ? streq(path, rp) : false;
+}
diff --git a/system.h b/system.h
index e3afb5f..fb351b4 100644
--- a/system.h
+++ b/system.h
@@ -63,6 +63,15 @@
 int seccomp_ret_kill_process_available(void);
 bool seccomp_filter_flags_available(unsigned int flags);
 
+/*
+ * is_canonical_path: checks whether @path is a canonical path.
+ * This means:
+ * -Absolute.
+ * -No symlinks.
+ * -No /./, /../, or extra '/'.
+ */
+bool is_canonical_path(const char *path);
+
 #ifdef __cplusplus
 }; /* extern "C" */
 #endif
diff --git a/system_unittest.cc b/system_unittest.cc
index 19b9099..e50a6e6 100644
--- a/system_unittest.cc
+++ b/system_unittest.cc
@@ -247,3 +247,14 @@
   seccomp_ret_log_available();
   seccomp_ret_kill_process_available();
 }
+
+TEST(is_canonical_path, basic) {
+  EXPECT_FALSE(is_canonical_path("/proc/self"));
+  EXPECT_FALSE(is_canonical_path("relative"));
+  EXPECT_FALSE(is_canonical_path("/proc/./1"));
+  EXPECT_FALSE(is_canonical_path("/proc/../proc/1"));
+
+  EXPECT_TRUE(is_canonical_path("/"));
+  EXPECT_TRUE(is_canonical_path("/proc"));
+  EXPECT_TRUE(is_canonical_path("/proc/1"));
+}
diff --git a/util.c b/util.c
index d55ec3a..a2bbd0a 100644
--- a/util.c
+++ b/util.c
@@ -449,6 +449,22 @@
 		   : path;
 }
 
+bool path_is_parent(const char *parent, const char *child)
+{
+	/*
+	 * -Make sure |child| starts with |parent|.
+	 * -Make sure that if |child| is longer than |parent|, either:
+	 * --the last character in |parent| is a path separator, or
+	 * --the character immediately following |parent| in |child| is a path
+	 *  separator.
+	 */
+	size_t parent_len = strlen(parent);
+	return strncmp(parent, child, parent_len) == 0 &&
+	       (strlen(child) > parent_len ? (parent[parent_len - 1] == '/' ||
+					      child[parent_len] == '/')
+					   : false);
+}
+
 void *consumebytes(size_t length, char **buf, size_t *buflength)
 {
 	char *p = *buf;
diff --git a/util.h b/util.h
index 3698263..4c65bc2 100644
--- a/util.h
+++ b/util.h
@@ -221,6 +221,15 @@
 #endif
 }
 
+static inline bool block_symlinks_in_bindmount_paths(void)
+{
+#if defined(BLOCK_SYMLINKS_IN_BINDMOUNT_PATHS)
+	return true;
+#else
+	return false;
+#endif
+}
+
 static inline size_t get_num_syscalls(void)
 {
 	return syscall_table_size;
@@ -259,6 +268,13 @@
 char *path_join(const char *external_path, const char *internal_path);
 
 /*
+ * path_is_parent: checks whether @parent is a parent of @child.
+ * Note: this function does not evaluate '.' or '..' nor does it resolve
+ * symlinks.
+ */
+bool path_is_parent(const char *parent, const char *child);
+
+/*
  * consumebytes: consumes @length bytes from a buffer @buf of length @buflength
  * @length    Number of bytes to consume
  * @buf       Buffer to consume from
diff --git a/util_unittest.cc b/util_unittest.cc
index b9e6dfc..354dbc4 100644
--- a/util_unittest.cc
+++ b/util_unittest.cc
@@ -402,6 +402,20 @@
   free(path);
 }
 
+TEST(path_is_parent, simple) {
+  EXPECT_TRUE(path_is_parent("/dev", "/dev/rtc"));
+  EXPECT_TRUE(path_is_parent("/dev/", "/dev/rtc"));
+  EXPECT_TRUE(path_is_parent("/sys", "/sys/power"));
+  EXPECT_TRUE(path_is_parent("/sys/power", "/sys/power/something"));
+  EXPECT_TRUE(path_is_parent("/sys", "/sys/sys/power"));
+
+  EXPECT_FALSE(path_is_parent("/dev", ""));
+  EXPECT_FALSE(path_is_parent("/dev", "/sys"));
+  EXPECT_FALSE(path_is_parent("/dev", "dev"));
+  EXPECT_FALSE(path_is_parent("/dev", "/sys/dev"));
+  EXPECT_FALSE(path_is_parent("/dev", "/device"));
+}
+
 TEST(getmultiline, basic) {
   std::string config =
            "\n"