fs/coredump: prevent fsuid=0 dumps into user-controlled directories

This commit fixes the following security hole affecting systems where
all of the following conditions are fulfilled:

 - The fs.suid_dumpable sysctl is set to 2.
 - The kernel.core_pattern sysctl's value starts with "/". (Systems
   where kernel.core_pattern starts with "|/" are not affected.)
 - Unprivileged user namespace creation is permitted. (This is
   true on Linux >=3.8, but some distributions disallow it by
   default using a distro patch.)

Under these conditions, if a program executes under secure exec rules,
causing it to run with the SUID_DUMP_ROOT flag, then unshares its user
namespace, changes its root directory and crashes, the coredump will be
written using fsuid=0 and a path derived from kernel.core_pattern - but
this path is interpreted relative to the root directory of the process,
allowing the attacker to control where a coredump will be written with
root privileges.

To fix the security issue, always interpret core_pattern for dumps that
are written under SUID_DUMP_ROOT relative to the root directory of init.

Signed-off-by: Jann Horn <[email protected]>
Acked-by: Kees Cook <[email protected]>
Cc: Al Viro <[email protected]>
Cc: "Eric W. Biederman" <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
diff --git a/fs/coredump.c b/fs/coredump.c
index 9ea87e9..47c32c3 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,9 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
 #include <linux/timekeeping.h>
 
 #include <asm/uaccess.h>
@@ -649,6 +652,8 @@
 		}
 	} else {
 		struct inode *inode;
+		int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW |
+				 O_LARGEFILE | O_EXCL;
 
 		if (cprm.limit < binfmt->min_coredump)
 			goto fail_unlock;
@@ -687,10 +692,27 @@
 		 * what matters is that at least one of the two processes
 		 * writes its coredump successfully, not which one.
 		 */
-		cprm.file = filp_open(cn.corename,
-				 O_CREAT | 2 | O_NOFOLLOW |
-				 O_LARGEFILE | O_EXCL,
-				 0600);
+		if (need_suid_safe) {
+			/*
+			 * Using user namespaces, normal user tasks can change
+			 * their current->fs->root to point to arbitrary
+			 * directories. Since the intention of the "only dump
+			 * with a fully qualified path" rule is to control where
+			 * coredumps may be placed using root privileges,
+			 * current->fs->root must not be used. Instead, use the
+			 * root directory of init_task.
+			 */
+			struct path root;
+
+			task_lock(&init_task);
+			get_fs_root(init_task.fs, &root);
+			task_unlock(&init_task);
+			cprm.file = file_open_root(root.dentry, root.mnt,
+				cn.corename, open_flags, 0600);
+			path_put(&root);
+		} else {
+			cprm.file = filp_open(cn.corename, open_flags, 0600);
+		}
 		if (IS_ERR(cprm.file))
 			goto fail_unlock;
 
diff --git a/fs/fhandle.c b/fs/fhandle.c
index d59712d..ca3c3dd 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -228,7 +228,7 @@
 		path_put(&path);
 		return fd;
 	}
-	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	file = file_open_root(path.dentry, path.mnt, "", open_flag, 0);
 	if (IS_ERR(file)) {
 		put_unused_fd(fd);
 		retval =  PTR_ERR(file);
diff --git a/fs/open.c b/fs/open.c
index 55bdc75..17cb6b1 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -992,14 +992,12 @@
 EXPORT_SYMBOL(filp_open);
 
 struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
-			    const char *filename, int flags)
+			    const char *filename, int flags, umode_t mode)
 {
 	struct open_flags op;
-	int err = build_open_flags(flags, 0, &op);
+	int err = build_open_flags(flags, mode, &op);
 	if (err)
 		return ERR_PTR(err);
-	if (flags & O_CREAT)
-		return ERR_PTR(-EINVAL);
 	return do_file_open_root(dentry, mnt, filename, &op);
 }
 EXPORT_SYMBOL(file_open_root);