From 6e4eab577a0cae15b3da9b888cff16fe57981b3e Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Tue, 24 May 2016 09:29:01 -0500
Subject: fs: Add user namespace member to struct super_block

Start marking filesystems with a user namespace owner, s_user_ns.  In
this change this is only used for permission checks of who may mount a
filesystem.  Ultimately s_user_ns will be used for translating ids and
checking capabilities for filesystems mounted from user namespaces.

The default policy for setting s_user_ns is implemented in sget(),
which arranges for s_user_ns to be set to current_user_ns() and to
ensure that the mounter of the filesystem has CAP_SYS_ADMIN in that
user_ns.

The guts of sget are split out into another function sget_userns().
The function sget_userns calls alloc_super with the specified user
namespace or it verifies the existing superblock that was found
has the expected user namespace, and fails with EBUSY when it is not.
This failing prevents users with the wrong privileges mounting a
filesystem.

The reason for the split of sget_userns from sget is that in some
cases such as mount_ns and kernfs_mount_ns a different policy for
permission checking of mounts and setting s_user_ns is necessary, and
the existence of sget_userns() allows those policies to be
implemented.

The helper mount_ns is expected to be used for filesystems such as
proc and mqueuefs which present per namespace information.  The
function mount_ns is modified to call sget_userns instead of sget to
ensure the user namespace owner of the namespace whose information is
presented by the filesystem is used on the superblock.

For sysfs and cgroup the appropriate permission checks are already in
place, and kernfs_mount_ns is modified to call sget_userns so that
the init_user_ns is the only user namespace used.

For the cgroup filesystem cgroup namespace mounts are bind mounts of a
subset of the full cgroup filesystem and as such s_user_ns must be the
same for all of them as there is only a single superblock.

Mounts of sysfs that vary based on the network namespace could in principle
change s_user_ns but it keeps the analysis and implementation of kernfs
simpler if that is not supported, and at present there appear to be no
benefits from supporting a different s_user_ns on any sysfs mount.

Getting the details of setting s_user_ns correct has been
a long process.  Thanks to Pavel Tikhorirorv who spotted a leak
in sget_userns.  Thanks to Seth Forshee who has kept the work alive.

Thanks-to: Seth Forshee <seth.forshee@canonical.com>
Thanks-to: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
---
 fs/kernfs/mount.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs/kernfs')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 63534f5f9073..d90d574c15a2 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -241,7 +241,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 	info->root = root;
 	info->ns = ns;
 
-	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+			 &init_user_ns, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
 		kfree(info);
 	if (IS_ERR(sb))
-- 
cgit v1.2.3


From 29a517c232d21a717aecea29838aeb07131f6196 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 10 Jun 2016 13:03:05 -0500
Subject: kernfs: The cgroup filesystem also benefits from SB_I_NOEXEC

The cgroup filesystem is in the same boat as sysfs.  No one ever
permits executables of any kind on the cgroup filesystem, and there is
no reasonable future case to support executables in the future.

Therefore move the setting of SB_I_NOEXEC which makes the code proof
against future mistakes of accidentally creating executables from
sysfs to kernfs itself.  Making the code simpler and covering the
sysfs, cgroup, and cgroup2 filesystems.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/kernfs/mount.c | 2 ++
 fs/sysfs/mount.c  | 3 +--
 2 files changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs/kernfs')

diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index d90d574c15a2..1443df670260 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
+	/* Userspace would break if executables appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f31e36994dfb..20b8f82e115b 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	else if (new_sb)
-		/* Userspace would break if executables appear on sysfs */
-		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+		root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
 
 	return root;
 }
-- 
cgit v1.2.3


From a2982cc922c3068783eb9a1f77a5626a1ec36a1f Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 9 Jun 2016 15:34:02 -0500
Subject: vfs: Generalize filesystem nodev handling.

Introduce a function may_open_dev that tests MNT_NODEV and a new
superblock flab SB_I_NODEV.  Use this new function in all of the
places where MNT_NODEV was previously tested.

Add the new SB_I_NODEV s_iflag to proc, sysfs, and mqueuefs as those
filesystems should never support device nodes, and a simple superblock
flags makes that very hard to get wrong.  With SB_I_NODEV set if any
device nodes somehow manage to show up on on a filesystem those
device nodes will be unopenable.

Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
---
 fs/block_dev.c     | 2 +-
 fs/kernfs/mount.c  | 4 ++--
 fs/namei.c         | 8 +++++++-
 fs/proc/inode.c    | 4 ++--
 include/linux/fs.h | 2 ++
 ipc/mqueue.c       | 2 +-
 6 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs/kernfs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 71ccab1d22c6..30b8d568203a 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1857,7 +1857,7 @@ struct block_device *lookup_bdev(const char *pathname)
 	if (!S_ISBLK(inode->i_mode))
 		goto fail;
 	error = -EACCES;
-	if (path.mnt->mnt_flags & MNT_NODEV)
+	if (!may_open_dev(&path))
 		goto fail;
 	error = -ENOMEM;
 	bdev = bd_acquire(inode);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 1443df670260..b3d73ad52b22 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,8 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 	struct dentry *root;
 
 	info->sb = sb;
-	/* Userspace would break if executables appear on sysfs */
-	sb->s_iflags |= SB_I_NOEXEC;
+	/* Userspace would break if executables or devices appear on sysfs */
+	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = magic;
diff --git a/fs/namei.c b/fs/namei.c
index 6a82fb7e2127..757a32725d92 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2881,6 +2881,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 }
 EXPORT_SYMBOL(vfs_create);
 
+bool may_open_dev(const struct path *path)
+{
+	return !(path->mnt->mnt_flags & MNT_NODEV) &&
+		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
 static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
@@ -2899,7 +2905,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
 		break;
 	case S_IFBLK:
 	case S_IFCHR:
-		if (path->mnt->mnt_flags & MNT_NODEV)
+		if (!may_open_dev(path))
 			return -EACCES;
 		/*FALLTHRU*/
 	case S_IFIFO:
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index f4817efb25a6..a5b2c33745b7 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -466,8 +466,8 @@ int proc_fill_super(struct super_block *s, void *data, int silent)
 	if (!proc_parse_options(data, ns))
 		return -EINVAL;
 
-	/* User space would break if executables appear on proc */
-	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC;
+	/* User space would break if executables or devices appear on proc */
+	s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
 	s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
 	s->s_blocksize = 1024;
 	s->s_blocksize_bits = 10;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9eef64f23a75..e05983170d23 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1327,6 +1327,7 @@ struct mm_struct;
 /* sb->s_iflags */
 #define SB_I_CGROUPWB	0x00000001	/* cgroup-aware writeback enabled */
 #define SB_I_NOEXEC	0x00000002	/* Ignore executables on this fs */
+#define SB_I_NODEV	0x00000004	/* Ignore devices on this fs */
 
 /* sb->s_iflags to limit user namespace mounts */
 #define SB_I_USERNS_VISIBLE		0x00000010 /* fstype already mounted */
@@ -1602,6 +1603,7 @@ extern int vfs_whiteout(struct inode *, struct dentry *);
  */
 extern void inode_init_owner(struct inode *inode, const struct inode *dir,
 			umode_t mode);
+extern bool may_open_dev(const struct path *path);
 /*
  * VFS FS_IOC_FIEMAP helper definitions.
  */
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5bdd50de7d05..0b13ace266f2 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -307,7 +307,7 @@ static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode;
 	struct ipc_namespace *ns = sb->s_fs_info;
 
-	sb->s_iflags |= SB_I_NOEXEC;
+	sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
 	sb->s_blocksize = PAGE_SIZE;
 	sb->s_blocksize_bits = PAGE_SHIFT;
 	sb->s_magic = MQUEUE_MAGIC;
-- 
cgit v1.2.3