summaryrefslogtreecommitdiff
path: root/fs/namespace.c
diff options
context:
space:
mode:
authorChristian Brauner <christian.brauner@ubuntu.com>2021-01-21 14:19:54 +0100
committerChristian Brauner <christian.brauner@ubuntu.com>2021-01-24 14:43:45 +0100
commit9caccd41541a6f7d6279928d9f971f6642c361af (patch)
treec9fd1a3ffe04a19e0b44ca1007fba8a40f2c451e /fs/namespace.c
parent2a1867219c7b27f928e2545782b86daaf9ad50bd (diff)
downloadlwn-9caccd41541a6f7d6279928d9f971f6642c361af.tar.gz
lwn-9caccd41541a6f7d6279928d9f971f6642c361af.zip
fs: introduce MOUNT_ATTR_IDMAP
Introduce a new mount bind mount property to allow idmapping mounts. The MOUNT_ATTR_IDMAP flag can be set via the new mount_setattr() syscall together with a file descriptor referring to a user namespace. The user namespace referenced by the namespace file descriptor will be attached to the bind mount. All interactions with the filesystem going through that mount will be mapped according to the mapping specified in the user namespace attached to it. Using user namespaces to mark mounts means we can reuse all the existing infrastructure in the kernel that already exists to handle idmappings and can also use this for permission checking to allow unprivileged user to create idmapped mounts in the future. Idmapping a mount is decoupled from the caller's user and mount namespace. This means idmapped mounts can be created in the initial user namespace which is an important use-case for systemd-homed, portable usb-sticks between systems, sharing data between the initial user namespace and unprivileged containers, and other use-cases that have been brought up. For example, assume a home directory where all files are owned by uid and gid 1000 and the home directory is brought to a new laptop where the user has id 12345. The system administrator can simply create a mount of this home directory with a mapping of 1000:12345:1 and other mappings to indicate the ids should be kept. (With this it is e.g. also possible to create idmapped mounts on the host with an identity mapping 1:1:100000 where the root user is not mapped. A user with root access that e.g. has been pivot rooted into such a mount on the host will be not be able to execute, read, write, or create files as root.) Given that mapping a mount is decoupled from the caller's user namespace a sufficiently privileged process such as a container manager can set up an idmapped mount for the container and the container can simply pivot root to it. There's no need for the container to do anything. The mount will appear correctly mapped independent of the user namespace the container uses. This means we don't need to mark a mount as idmappable. In order to create an idmapped mount the caller must currently be privileged in the user namespace of the superblock the mount belongs to. Once a mount has been idmapped we don't allow it to change its mapping. This keeps permission checking and life-cycle management simple. Users wanting to change the idmapped can always create a new detached mount with a different idmapping. Link: https://lore.kernel.org/r/20210121131959.646623-36-christian.brauner@ubuntu.com Cc: Christoph Hellwig <hch@lst.de> Cc: David Howells <dhowells@redhat.com> Cc: Mauricio Vásquez Bernal <mauricio@kinvolk.io> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: linux-fsdevel@vger.kernel.org Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Christian Brauner <christian.brauner@ubuntu.com>
Diffstat (limited to 'fs/namespace.c')
-rw-r--r--fs/namespace.c122
1 files changed, 115 insertions, 7 deletions
diff --git a/fs/namespace.c b/fs/namespace.c
index 726a51c8c46e..062fe984a697 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -25,6 +25,7 @@
#include <linux/proc_ns.h>
#include <linux/magic.h>
#include <linux/memblock.h>
+#include <linux/proc_fs.h>
#include <linux/task_work.h>
#include <linux/sched/task.h>
#include <uapi/linux/mount.h>
@@ -79,6 +80,7 @@ struct mount_kattr {
unsigned int propagation;
unsigned int lookup_flags;
bool recurse;
+ struct user_namespace *mnt_userns;
};
/* /sys/fs */
@@ -3477,7 +3479,7 @@ out_type:
(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \
MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)
-#define MOUNT_SETATTR_VALID_FLAGS FSMOUNT_VALID_FLAGS
+#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP)
#define MOUNT_SETATTR_PROPAGATION_FLAGS \
(MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED)
@@ -3845,6 +3847,36 @@ static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt)
return flags;
}
+static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+ struct vfsmount *m = &mnt->mnt;
+
+ if (!kattr->mnt_userns)
+ return 0;
+
+ /*
+ * Once a mount has been idmapped we don't allow it to change its
+ * mapping. It makes things simpler and callers can just create
+ * another bind-mount they can idmap if they want to.
+ */
+ if (mnt_user_ns(m) != &init_user_ns)
+ return -EPERM;
+
+ /* The underlying filesystem doesn't support idmapped mounts yet. */
+ if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP))
+ return -EINVAL;
+
+ /* We're not controlling the superblock. */
+ if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ /* Mount has already been visible in the filesystem hierarchy. */
+ if (!is_anon_ns(mnt->mnt_ns))
+ return -EINVAL;
+
+ return 0;
+}
+
static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
struct mount *mnt, int *err)
{
@@ -3869,6 +3901,10 @@ static struct mount *mount_setattr_prepare(struct mount_kattr *kattr,
goto out;
}
+ *err = can_idmap_mount(kattr, m);
+ if (*err)
+ goto out;
+
last = m;
if ((kattr->attr_set & MNT_READONLY) &&
@@ -3883,6 +3919,18 @@ out:
return last;
}
+static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
+{
+ struct user_namespace *mnt_userns;
+
+ if (!kattr->mnt_userns)
+ return;
+
+ mnt_userns = get_user_ns(kattr->mnt_userns);
+ /* Pairs with smp_load_acquire() in mnt_user_ns(). */
+ smp_store_release(&mnt->mnt.mnt_userns, mnt_userns);
+}
+
static void mount_setattr_commit(struct mount_kattr *kattr,
struct mount *mnt, struct mount *last,
int err)
@@ -3893,6 +3941,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr,
if (!err) {
unsigned int flags;
+ do_idmap_mount(kattr, m);
flags = recalc_flags(kattr, m);
WRITE_ONCE(m->mnt.mnt_flags, flags);
}
@@ -3965,7 +4014,62 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
return err;
}
-static int build_mount_kattr(const struct mount_attr *attr,
+static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
+ struct mount_kattr *kattr, unsigned int flags)
+{
+ int err = 0;
+ struct ns_common *ns;
+ struct user_namespace *mnt_userns;
+ struct file *file;
+
+ if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
+ return 0;
+
+ /*
+ * We currently do not support clearing an idmapped mount. If this ever
+ * is a use-case we can revisit this but for now let's keep it simple
+ * and not allow it.
+ */
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP)
+ return -EINVAL;
+
+ if (attr->userns_fd > INT_MAX)
+ return -EINVAL;
+
+ file = fget(attr->userns_fd);
+ if (!file)
+ return -EBADF;
+
+ if (!proc_ns_file(file)) {
+ err = -EINVAL;
+ goto out_fput;
+ }
+
+ ns = get_proc_ns(file_inode(file));
+ if (ns->ops->type != CLONE_NEWUSER) {
+ err = -EINVAL;
+ goto out_fput;
+ }
+
+ /*
+ * The init_user_ns is used to indicate that a vfsmount is not idmapped.
+ * This is simpler than just having to treat NULL as unmapped. Users
+ * wanting to idmap a mount to init_user_ns can just use a namespace
+ * with an identity mapping.
+ */
+ mnt_userns = container_of(ns, struct user_namespace, ns);
+ if (mnt_userns == &init_user_ns) {
+ err = -EPERM;
+ goto out_fput;
+ }
+ kattr->mnt_userns = get_user_ns(mnt_userns);
+
+out_fput:
+ fput(file);
+ return err;
+}
+
+static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
@@ -3991,9 +4095,6 @@ static int build_mount_kattr(const struct mount_attr *attr,
if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS)
return -EINVAL;
- if (attr->userns_fd)
- return -EINVAL;
-
kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set);
kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr);
@@ -4032,7 +4133,13 @@ static int build_mount_kattr(const struct mount_attr *attr,
return -EINVAL;
}
- return 0;
+ return build_mount_idmapped(attr, usize, kattr, flags);
+}
+
+static void finish_mount_kattr(struct mount_kattr *kattr)
+{
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
}
SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
@@ -4070,7 +4177,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
attr.propagation == 0)
return 0;
- err = build_mount_kattr(&attr, &kattr, flags);
+ err = build_mount_kattr(&attr, usize, &kattr, flags);
if (err)
return err;
@@ -4079,6 +4186,7 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
err = do_mount_setattr(&target, &kattr);
+ finish_mount_kattr(&kattr);
path_put(&target);
return err;
}