Diffstat (limited to 'fs/namespace.c')
-rw-r--r--  fs/namespace.c | 1120
1 file changed, 836 insertions(+), 284 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c
index 23e81c2a1e3f..6100e5b962a6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,7 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
-#include <linux/nospec.h>
+#include <linux/pidfs.h>
#include "pnode.h"
#include "internal.h"
@@ -66,12 +66,12 @@ static int __init set_mphash_entries(char *str)
__setup("mphash_entries=", set_mphash_entries);
static u64 event;
-static DEFINE_IDA(mnt_id_ida);
+static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
/* Don't allow confusion with old 32bit mount ID */
#define MNT_UNIQUE_ID_OFFSET (1ULL << 31)
-static atomic64_t mnt_id_ctr = ATOMIC64_INIT(MNT_UNIQUE_ID_OFFSET);
+static u64 mnt_id_ctr = MNT_UNIQUE_ID_OFFSET;
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
@@ -79,15 +79,25 @@ static struct kmem_cache *mnt_cache __ro_after_init;
static DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */
-static DEFINE_RWLOCK(mnt_ns_tree_lock);
+static DEFINE_SEQLOCK(mnt_ns_tree_lock);
+
+#ifdef CONFIG_FSNOTIFY
+LIST_HEAD(notify_list); /* protected by namespace_sem */
+#endif
static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */
+static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */
+
+enum mount_kattr_flags_t {
+ MOUNT_KATTR_RECURSE = (1 << 0),
+ MOUNT_KATTR_IDMAP_REPLACE = (1 << 1),
+};
struct mount_kattr {
unsigned int attr_set;
unsigned int attr_clr;
unsigned int propagation;
unsigned int lookup_flags;
- bool recurse;
+ enum mount_kattr_flags_t kflags;
struct user_namespace *mnt_userns;
struct mnt_idmap *mnt_idmap;
};
@@ -106,17 +116,6 @@ EXPORT_SYMBOL_GPL(fs_kobj);
*/
__cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock);
-static int mnt_ns_cmp(u64 seq, const struct mnt_namespace *ns)
-{
- u64 seq_b = ns->seq;
-
- if (seq < seq_b)
- return -1;
- if (seq > seq_b)
- return 1;
- return 0;
-}
-
static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
{
if (!node)
@@ -124,68 +123,89 @@ static inline struct mnt_namespace *node_to_mnt_ns(const struct rb_node *node)
return rb_entry(node, struct mnt_namespace, mnt_ns_tree_node);
}
-static bool mnt_ns_less(struct rb_node *a, const struct rb_node *b)
+static int mnt_ns_cmp(struct rb_node *a, const struct rb_node *b)
{
struct mnt_namespace *ns_a = node_to_mnt_ns(a);
struct mnt_namespace *ns_b = node_to_mnt_ns(b);
u64 seq_a = ns_a->seq;
+ u64 seq_b = ns_b->seq;
- return mnt_ns_cmp(seq_a, ns_b) < 0;
+ if (seq_a < seq_b)
+ return -1;
+ if (seq_a > seq_b)
+ return 1;
+ return 0;
+}
+
+static inline void mnt_ns_tree_write_lock(void)
+{
+ write_seqlock(&mnt_ns_tree_lock);
+}
+
+static inline void mnt_ns_tree_write_unlock(void)
+{
+ write_sequnlock(&mnt_ns_tree_lock);
}
static void mnt_ns_tree_add(struct mnt_namespace *ns)
{
- guard(write_lock)(&mnt_ns_tree_lock);
- rb_add(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_less);
+ struct rb_node *node, *prev;
+
+ mnt_ns_tree_write_lock();
+ node = rb_find_add_rcu(&ns->mnt_ns_tree_node, &mnt_ns_tree, mnt_ns_cmp);
+ /*
+ * If there's no previous entry, simply add it after the
+ * head; otherwise add it after the previous entry.
+ */
+ prev = rb_prev(&ns->mnt_ns_tree_node);
+ if (!prev)
+ list_add_rcu(&ns->mnt_ns_list, &mnt_ns_list);
+ else
+ list_add_rcu(&ns->mnt_ns_list, &node_to_mnt_ns(prev)->mnt_ns_list);
+ mnt_ns_tree_write_unlock();
+
+ WARN_ON_ONCE(node);
}
static void mnt_ns_release(struct mnt_namespace *ns)
{
- lockdep_assert_not_held(&mnt_ns_tree_lock);
-
/* keep alive for {list,stat}mount() */
if (refcount_dec_and_test(&ns->passive)) {
+ fsnotify_mntns_delete(ns);
put_user_ns(ns->user_ns);
kfree(ns);
}
}
DEFINE_FREE(mnt_ns_release, struct mnt_namespace *, if (_T) mnt_ns_release(_T))
+static void mnt_ns_release_rcu(struct rcu_head *rcu)
+{
+ mnt_ns_release(container_of(rcu, struct mnt_namespace, mnt_ns_rcu));
+}
+
static void mnt_ns_tree_remove(struct mnt_namespace *ns)
{
/* remove from global mount namespace list */
if (!is_anon_ns(ns)) {
- guard(write_lock)(&mnt_ns_tree_lock);
+ mnt_ns_tree_write_lock();
rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree);
+ list_bidir_del_rcu(&ns->mnt_ns_list);
+ mnt_ns_tree_write_unlock();
}
- mnt_ns_release(ns);
+ call_rcu(&ns->mnt_ns_rcu, mnt_ns_release_rcu);
}
-/*
- * Returns the mount namespace which either has the specified id, or has the
- * next smallest id afer the specified one.
- */
-static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
+static int mnt_ns_find(const void *key, const struct rb_node *node)
{
- struct rb_node *node = mnt_ns_tree.rb_node;
- struct mnt_namespace *ret = NULL;
+ const u64 mnt_ns_id = *(u64 *)key;
+ const struct mnt_namespace *ns = node_to_mnt_ns(node);
- lockdep_assert_held(&mnt_ns_tree_lock);
-
- while (node) {
- struct mnt_namespace *n = node_to_mnt_ns(node);
-
- if (mnt_ns_id <= n->seq) {
- ret = node_to_mnt_ns(node);
- if (mnt_ns_id == n->seq)
- break;
- node = node->rb_left;
- } else {
- node = node->rb_right;
- }
- }
- return ret;
+ if (mnt_ns_id < ns->seq)
+ return -1;
+ if (mnt_ns_id > ns->seq)
+ return 1;
+ return 0;
}
/*
@@ -195,18 +215,37 @@ static struct mnt_namespace *mnt_ns_find_id_at(u64 mnt_ns_id)
* namespace the @namespace_sem must first be acquired. If the namespace has
* already shut down before acquiring @namespace_sem, {list,stat}mount() will
* see that the mount rbtree of the namespace is empty.
+ *
+ * Note the lookup is lockless, protected by a sequence counter. We only
+ * need to guard against false negatives as false positives aren't
+ * possible. So if we didn't find a mount namespace and the sequence
+ * counter has changed we need to retry. If the sequence counter is
+ * still the same we know the search actually failed.
*/
static struct mnt_namespace *lookup_mnt_ns(u64 mnt_ns_id)
{
- struct mnt_namespace *ns;
+ struct mnt_namespace *ns;
+ struct rb_node *node;
+ unsigned int seq;
+
+ guard(rcu)();
+ do {
+ seq = read_seqbegin(&mnt_ns_tree_lock);
+ node = rb_find_rcu(&mnt_ns_id, &mnt_ns_tree, mnt_ns_find);
+ if (node)
+ break;
+ } while (read_seqretry(&mnt_ns_tree_lock, seq));
- guard(read_lock)(&mnt_ns_tree_lock);
- ns = mnt_ns_find_id_at(mnt_ns_id);
- if (!ns || ns->seq != mnt_ns_id)
- return NULL;
+ if (!node)
+ return NULL;
- refcount_inc(&ns->passive);
- return ns;
+ /*
+ * The last reference count is put with RCU delay so we can
+ * unconditionally acquire a reference here.
+ */
+ ns = node_to_mnt_ns(node);
+ refcount_inc(&ns->passive);
+ return ns;
}
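
The comment above is the heart of the new lookup: only misses need the seqcount retry. A minimal userspace analogue of that retry-on-miss pattern, with C11 atomics standing in for the kernel's seqlock API (tree_seq and tree_find() are hypothetical names, not kernel interfaces):

#include <stdatomic.h>

extern _Atomic unsigned int tree_seq;            /* even: idle, odd: writer active */
extern const void *tree_find(unsigned long key); /* hypothetical lockless lookup */

static const void *lookup_retry_on_miss(unsigned long key)
{
	const void *node;
	unsigned int seq;

	do {
		/* read_seqbegin(): wait for writers to finish, snapshot counter */
		do {
			seq = atomic_load_explicit(&tree_seq, memory_order_acquire);
		} while (seq & 1);

		node = tree_find(key);
		if (node)
			break;	/* a hit can be trusted immediately */

		/* read_seqretry(): a miss only counts if no writer intervened */
	} while (atomic_load_explicit(&tree_seq, memory_order_acquire) != seq);

	return node;
}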
static inline void lock_mount_hash(void)
@@ -236,18 +275,19 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry)
static int mnt_alloc_id(struct mount *mnt)
{
- int res = ida_alloc(&mnt_id_ida, GFP_KERNEL);
+ int res;
- if (res < 0)
- return res;
- mnt->mnt_id = res;
- mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
- return 0;
+ xa_lock(&mnt_id_xa);
+ res = __xa_alloc(&mnt_id_xa, &mnt->mnt_id, mnt, XA_LIMIT(1, INT_MAX), GFP_KERNEL);
+ if (!res)
+ mnt->mnt_id_unique = ++mnt_id_ctr;
+ xa_unlock(&mnt_id_xa);
+ return res;
}
static void mnt_free_id(struct mount *mnt)
{
- ida_free(&mnt_id_ida, mnt->mnt_id);
+ xa_erase(&mnt_id_xa, mnt->mnt_id);
}
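
Beyond allocation, the xarray stores the mount itself as its entry (the IDA only handed out numbers), so the small reusable ID can be resolved back to a mount in a single lookup. A hedged sketch, in the style of fs/namespace.c, of the kind of helper this enables (hypothetical, not part of this patch):

static struct mount *lookup_mount_by_old_id(int id)
{
	/* xa_load() is RCU-safe and returns the entry stored by __xa_alloc() */
	return xa_load(&mnt_id_xa, id);
}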
/*
@@ -344,6 +384,7 @@ static struct mount *alloc_vfsmnt(const char *name)
INIT_HLIST_NODE(&mnt->mnt_mp_list);
INIT_LIST_HEAD(&mnt->mnt_umounting);
INIT_HLIST_HEAD(&mnt->mnt_stuck_children);
+ RB_CLEAR_NODE(&mnt->mnt_node);
mnt->mnt.mnt_idmap = &nop_mnt_idmap;
}
return mnt;
@@ -966,6 +1007,17 @@ static inline int check_mnt(struct mount *mnt)
return mnt->mnt_ns == current->nsproxy->mnt_ns;
}
+static inline bool check_anonymous_mnt(struct mount *mnt)
+{
+ u64 seq;
+
+ if (!is_anon_ns(mnt->mnt_ns))
+ return false;
+
+ seq = mnt->mnt_ns->seq_origin;
+ return !seq || (seq == current->nsproxy->mnt_ns->seq);
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1123,19 +1175,29 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
{
struct rb_node **link = &ns->mounts.rb_node;
struct rb_node *parent = NULL;
+ bool mnt_first_node = true, mnt_last_node = true;
- WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+ WARN_ON(mnt_ns_attached(mnt));
mnt->mnt_ns = ns;
while (*link) {
parent = *link;
- if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+ if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique) {
link = &parent->rb_left;
- else
+ mnt_last_node = false;
+ } else {
link = &parent->rb_right;
+ mnt_first_node = false;
+ }
}
+
+ if (mnt_last_node)
+ ns->mnt_last_node = &mnt->mnt_node;
+ if (mnt_first_node)
+ ns->mnt_first_node = &mnt->mnt_node;
rb_link_node(&mnt->mnt_node, parent, link);
rb_insert_color(&mnt->mnt_node, &ns->mounts);
- mnt->mnt.mnt_flags |= MNT_ONRB;
+
+ mnt_notify_add(mnt);
}
/*
@@ -1305,7 +1367,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
}
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1683,6 +1745,50 @@ int may_umount(struct vfsmount *mnt)
EXPORT_SYMBOL(may_umount);
+#ifdef CONFIG_FSNOTIFY
+static void mnt_notify(struct mount *p)
+{
+ if (!p->prev_ns && p->mnt_ns) {
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ } else if (p->prev_ns && !p->mnt_ns) {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ } else if (p->prev_ns == p->mnt_ns) {
+ fsnotify_mnt_move(p->mnt_ns, &p->mnt);
+ } else {
+ fsnotify_mnt_detach(p->prev_ns, &p->mnt);
+ fsnotify_mnt_attach(p->mnt_ns, &p->mnt);
+ }
+ p->prev_ns = p->mnt_ns;
+}
+
+static void notify_mnt_list(void)
+{
+ struct mount *m, *tmp;
+ /*
+ * Notify about mounts that were added/reparented/detached/remain
+ * connected after unmount.
+ */
+ list_for_each_entry_safe(m, tmp, &notify_list, to_notify) {
+ mnt_notify(m);
+ list_del_init(&m->to_notify);
+ }
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return !list_empty(&notify_list);
+}
+#else
+static void notify_mnt_list(void)
+{
+}
+
+static bool need_notify_mnt_list(void)
+{
+ return false;
+}
+#endif
+
static void namespace_unlock(void)
{
struct hlist_head head;
@@ -1693,7 +1799,18 @@ static void namespace_unlock(void)
hlist_move_list(&unmounted, &head);
list_splice_init(&ex_mountpoints, &list);
- up_write(&namespace_sem);
+ if (need_notify_mnt_list()) {
+ /*
+ * No point blocking out concurrent readers while notifications
+ * are sent. This will also allow statmount()/listmount() to run
+ * concurrently.
+ */
+ downgrade_write(&namespace_sem);
+ notify_mnt_list();
+ up_read(&namespace_sem);
+ } else {
+ up_write(&namespace_sem);
+ }
shrink_dentry_list(&list);
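
downgrade_write() turns the writer into a reader without ever releasing the rwsem, so no other writer can slip in between the tree mutation and the notifications, while statmount()/listmount() readers proceed concurrently. The shape of the pattern, as a hedged kernel-style sketch (hypothetical function and data):

#include <linux/rwsem.h>

static DECLARE_RWSEM(example_sem);

static void mutate_then_notify(void)
{
	down_write(&example_sem);
	/* ... changes readers must not observe half-done ... */
	downgrade_write(&example_sem);	/* atomically become a reader */
	/* ... slower notification work; readers now run concurrently ... */
	up_read(&example_sem);
}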
@@ -1763,7 +1880,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
- if (p->mnt.mnt_flags & MNT_ONRB)
+ if (mnt_ns_attached(p))
move_from_ns(p, &tmp_list);
else
list_move(&p->mnt_list, &tmp_list);
@@ -1806,6 +1923,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
change_mnt_propagation(p, MS_PRIVATE);
if (disconnect)
hlist_add_head(&p->mnt_umount, &unmounted);
+
+ /*
+ * At this point p->mnt_ns is NULL, notification will be queued
+ * only if
+ *
+ * - p->prev_ns is non-NULL *and*
+ * - p->prev_ns->n_fsnotify_marks is non-NULL
+ *
+ * This will preclude queuing the mount if this is a cleanup
+ * after a failed copy_tree() or destruction of an anonymous
+ * namespace, etc.
+ */
+ mnt_notify_add(p);
}
}
@@ -1912,16 +2042,14 @@ static int do_umount(struct mount *mnt, int flags)
event++;
if (flags & MNT_DETACH) {
- if (mnt->mnt.mnt_flags & MNT_ONRB ||
- !list_empty(&mnt->mnt_list))
+ if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (mnt->mnt.mnt_flags & MNT_ONRB ||
- !list_empty(&mnt->mnt_list))
+ if (mnt_ns_attached(mnt) || !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
@@ -1988,6 +2116,7 @@ static void warn_mandlock(void)
static int can_umount(const struct path *path, int flags)
{
struct mount *mnt = real_mount(path->mnt);
+ struct super_block *sb = path->dentry->d_sb;
if (!may_mount())
return -EPERM;
@@ -1997,7 +2126,7 @@ static int can_umount(const struct path *path, int flags)
return -EINVAL;
if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */
return -EINVAL;
- if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN))
+ if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN))
return -EPERM;
return 0;
}
@@ -2055,9 +2184,15 @@ SYSCALL_DEFINE1(oldumount, char __user *, name)
static bool is_mnt_ns_file(struct dentry *dentry)
{
+ struct ns_common *ns;
+
/* Is this a proxy for a mount namespace? */
- return dentry->d_op == &ns_dentry_operations &&
- dentry->d_fsdata == &mntns_operations;
+ if (dentry->d_op != &ns_dentry_operations)
+ return false;
+
+ ns = d_inode(dentry)->i_private;
+
+ return ns->ops == &mntns_operations;
}
struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
@@ -2065,30 +2200,34 @@ struct ns_common *from_mnt_ns(struct mnt_namespace *mnt)
return &mnt->ns;
}
-struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool previous)
+struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool previous)
{
- guard(read_lock)(&mnt_ns_tree_lock);
+ guard(rcu)();
+
for (;;) {
- struct rb_node *node;
+ struct list_head *list;
if (previous)
- node = rb_prev(&mntns->mnt_ns_tree_node);
+ list = rcu_dereference(list_bidir_prev_rcu(&mntns->mnt_ns_list));
else
- node = rb_next(&mntns->mnt_ns_tree_node);
- if (!node)
+ list = rcu_dereference(list_next_rcu(&mntns->mnt_ns_list));
+ if (list_is_head(list, &mnt_ns_list))
return ERR_PTR(-ENOENT);
- mntns = node_to_mnt_ns(node);
- node = &mntns->mnt_ns_tree_node;
+ mntns = list_entry_rcu(list, struct mnt_namespace, mnt_ns_list);
+ /*
+ * The last passive reference count is put with RCU
+ * delay so accessing the mount namespace is not just
+ * safe but all relevant members are still valid.
+ */
if (!ns_capable_noaudit(mntns->user_ns, CAP_SYS_ADMIN))
continue;
/*
- * Holding mnt_ns_tree_lock prevents the mount namespace from
- * being freed but it may well be on it's deathbed. We want an
- * active reference, not just a passive one here as we're
- * persisting the mount namespace.
+ * We need an active reference count as we're persisting
+ * the mount namespace and it might already be on its
+ * deathbed.
*/
if (!refcount_inc_not_zero(&mntns->ns.count))
continue;
@@ -2097,16 +2236,24 @@ struct mnt_namespace *__lookup_next_mnt_ns(struct mnt_namespace *mntns, bool pre
}
}
+struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry)
+{
+ if (!is_mnt_ns_file(dentry))
+ return NULL;
+
+ return to_mnt_ns(get_proc_ns(dentry->d_inode));
+}
+
static bool mnt_ns_loop(struct dentry *dentry)
{
/* Could bind mounting the mount namespace inode cause a
* mount namespace loop?
*/
- struct mnt_namespace *mnt_ns;
- if (!is_mnt_ns_file(dentry))
+ struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry);
+
+ if (!mnt_ns)
return false;
- mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode));
return current->nsproxy->mnt_ns->seq >= mnt_ns->seq;
}
@@ -2198,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path)
static void free_mnt_ns(struct mnt_namespace *);
static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool);
+static inline bool must_dissolve(struct mnt_namespace *mnt_ns)
+{
+ /*
+ * This mount belonged to an anonymous mount namespace
+ * but was moved to a non-anonymous mount namespace and
+ * then unmounted.
+ */
+ if (unlikely(!mnt_ns))
+ return false;
+
+ /*
+ * This mount belongs to a non-anonymous mount namespace
+ * and we know that such a mount can never transition to
+ * an anonymous mount namespace again.
+ */
+ if (!is_anon_ns(mnt_ns)) {
+ /*
+ * A detached mount either belongs to an anonymous mount
+ * namespace or a non-anonymous mount namespace. It
+ * should never belong to something purely internal.
+ */
+ VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL);
+ return false;
+ }
+
+ return true;
+}
+
void dissolve_on_fput(struct vfsmount *mnt)
{
struct mnt_namespace *ns;
- namespace_lock();
- lock_mount_hash();
- ns = real_mount(mnt)->mnt_ns;
- if (ns) {
- if (is_anon_ns(ns))
- umount_tree(real_mount(mnt), UMOUNT_CONNECTED);
- else
- ns = NULL;
+ struct mount *m = real_mount(mnt);
+
+ scoped_guard(rcu) {
+ if (!must_dissolve(READ_ONCE(m->mnt_ns)))
+ return;
}
- unlock_mount_hash();
- namespace_unlock();
- if (ns)
- free_mnt_ns(ns);
+
+ scoped_guard(rwsem_write, &namespace_sem) {
+ ns = m->mnt_ns;
+ if (!must_dissolve(ns))
+ return;
+
+ /*
+ * After must_dissolve() we know that this is a detached
+ * mount in an anonymous mount namespace.
+ *
+ * Now when mnt_has_parent() reports that this mount
+ * tree has a parent, we know that this anonymous mount
+ * tree has been moved to another anonymous mount
+ * namespace.
+ *
+ * So when closing this file we cannot unmount the mount
+ * tree. That will be done when the file referring to
+ * the root of the anonymous mount namespace is closed
+ * (it may already be closed, but then it will sync on
+ * @namespace_sem and wait for us to finish).
+ */
+ if (mnt_has_parent(m))
+ return;
+
+ lock_mount_hash();
+ umount_tree(m, UMOUNT_CONNECTED);
+ unlock_mount_hash();
+ }
+
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
+ free_mnt_ns(ns);
}
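
The rewrite leans on the cleanup helpers from <linux/cleanup.h>: scoped_guard() holds a lock for exactly one block and releases it on every exit path, including the early returns above. A hedged sketch of the idiom (hypothetical function; the guard classes are the real rcu and rwsem ones):

#include <linux/cleanup.h>
#include <linux/rcupdate.h>
#include <linux/rwsem.h>

static int example(struct rw_semaphore *sem, bool bail)
{
	scoped_guard(rcu) {
		/* rcu_read_lock() is held only for this block */
	}

	scoped_guard(rwsem_write, sem) {
		if (bail)
			return -EINVAL;	/* up_write() still runs */
	}

	return 0;
}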
void drop_collected_mounts(struct vfsmount *mnt)
@@ -2239,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
return false;
}
+/*
+ * Check that there aren't references to earlier/same mount namespaces in the
+ * specified subtree. Such references can act as pins for mount namespaces
+ * that aren't checked by the mount-cycle checking code, thereby allowing
+ * cycles to be made.
+ */
+static bool check_for_nsfs_mounts(struct mount *subtree)
+{
+ struct mount *p;
+ bool ret = false;
+
+ lock_mount_hash();
+ for (p = subtree; p; p = next_mnt(p, subtree))
+ if (mnt_ns_loop(p->mnt.mnt_root))
+ goto out;
+
+ ret = true;
+out:
+ unlock_mount_hash();
+ return ret;
+}
+
/**
* clone_private_mount - create a private clone of a path
* @path: path to clone
@@ -2247,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry)
* will not be attached anywhere in the namespace and will be private (i.e.
* changes to the originating mount won't be propagated into this).
*
+ * This assumes the caller has called or done the equivalent of may_mount().
+ *
* Release with mntput().
*/
struct vfsmount *clone_private_mount(const struct path *path)
@@ -2254,30 +2478,36 @@ struct vfsmount *clone_private_mount(const struct path *path)
struct mount *old_mnt = real_mount(path->mnt);
struct mount *new_mnt;
- down_read(&namespace_sem);
+ guard(rwsem_read)(&namespace_sem);
if (IS_MNT_UNBINDABLE(old_mnt))
- goto invalid;
+ return ERR_PTR(-EINVAL);
- if (!check_mnt(old_mnt))
- goto invalid;
+ if (mnt_has_parent(old_mnt)) {
+ if (!check_mnt(old_mnt))
+ return ERR_PTR(-EINVAL);
+ } else {
+ if (!is_mounted(&old_mnt->mnt))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure this isn't something purely kernel internal. */
+ if (!is_anon_ns(old_mnt->mnt_ns))
+ return ERR_PTR(-EINVAL);
+
+ /* Make sure we don't create mount namespace loops. */
+ if (!check_for_nsfs_mounts(old_mnt))
+ return ERR_PTR(-EINVAL);
+ }
if (has_locked_children(old_mnt, path->dentry))
- goto invalid;
+ return ERR_PTR(-EINVAL);
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE);
- up_read(&namespace_sem);
-
if (IS_ERR(new_mnt))
- return ERR_CAST(new_mnt);
+ return ERR_PTR(-EINVAL);
/* Longterm mount to be removed by kern_unmount*() */
new_mnt->mnt_ns = MNT_NS_INTERNAL;
-
return &new_mnt->mnt;
-
-invalid:
- up_read(&namespace_sem);
- return ERR_PTR(-EINVAL);
}
EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2376,6 +2606,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
enum mnt_tree_flags_t {
MNT_TREE_MOVE = BIT(0),
MNT_TREE_BENEATH = BIT(1),
+ MNT_TREE_PROPAGATION = BIT(2),
};
/**
@@ -2499,6 +2730,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
dest_mp = smp;
unhash_mnt(source_mnt);
attach_mnt(source_mnt, top_mnt, dest_mp, beneath);
+ mnt_notify_add(source_mnt);
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
@@ -2725,6 +2957,71 @@ static int do_change_type(struct path *path, int ms_flags)
return err;
}
+/* may_copy_tree() - check if a mount tree can be copied
+ * @path: path to the mount tree to be copied
+ *
+ * This helper checks if the caller may copy the mount tree starting
+ * from @path->mnt. The caller may copy the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller tries to copy an nsfs mount referring to a mount
+ * namespace, i.e., the caller is trying to copy a mount namespace
+ * entry from nsfs.
+ * (3) The caller tries to copy a pidfs mount referring to a pidfd.
+ * (4) The caller is trying to copy a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * This is not strictly necessary. The current semantics of the new
+ * mount api enforce that the caller must be located in the same
+ * mount namespace as the mount tree it interacts with. Using the
+ * origin sequence number preserves these semantics even for
+ * anonymous mount namespaces. However, one could envision extending
+ * the api to directly operate across mount namespaces if needed.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to copy the mount tree.
+ *
+ * Returns true if the mount tree can be copied, false otherwise.
+ */
+static inline bool may_copy_tree(struct path *path)
+{
+ struct mount *mnt = real_mount(path->mnt);
+ const struct dentry_operations *d_op;
+
+ if (check_mnt(mnt))
+ return true;
+
+ d_op = path->dentry->d_op;
+ if (d_op == &ns_dentry_operations)
+ return true;
+
+ if (d_op == &pidfs_dentry_operations)
+ return true;
+
+ if (!is_mounted(path->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+
static struct mount *__do_loopback(struct path *old_path, int recurse)
{
struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt);
@@ -2732,7 +3029,7 @@ static struct mount *__do_loopback(struct path *old_path, int recurse)
if (IS_MNT_UNBINDABLE(old))
return mnt;
- if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations)
+ if (!may_copy_tree(old_path))
return mnt;
if (!recurse && has_locked_children(old, old_path->dentry))
@@ -2800,15 +3097,30 @@ out:
static struct file *open_detached_copy(struct path *path, bool recursive)
{
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
- struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true);
+ struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns;
+ struct user_namespace *user_ns = mnt_ns->user_ns;
struct mount *mnt, *p;
struct file *file;
+ ns = alloc_mnt_ns(user_ns, true);
if (IS_ERR(ns))
return ERR_CAST(ns);
namespace_lock();
+
+ /*
+ * Record the sequence number of the source mount namespace.
+ * This needs to hold namespace_sem to ensure that the mount
+ * doesn't get attached.
+ */
+ if (is_mounted(path->mnt)) {
+ src_mnt_ns = real_mount(path->mnt)->mnt_ns;
+ if (is_anon_ns(src_mnt_ns))
+ ns->seq_origin = src_mnt_ns->seq_origin;
+ else
+ ns->seq_origin = src_mnt_ns->seq;
+ }
+
mnt = __do_loopback(path, recursive);
if (IS_ERR(mnt)) {
namespace_unlock();
@@ -2836,24 +3148,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
return file;
}
-SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags)
{
- struct file *file;
- struct path path;
+ int ret;
+ struct path path __free(path_put) = {};
int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
bool detached = flags & OPEN_TREE_CLONE;
- int error;
- int fd;
BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC);
if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE |
AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE |
OPEN_TREE_CLOEXEC))
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
if (flags & AT_NO_AUTOMOUNT)
lookup_flags &= ~LOOKUP_AUTOMOUNT;
@@ -2863,27 +3173,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl
lookup_flags |= LOOKUP_EMPTY;
if (detached && !may_mount())
- return -EPERM;
+ return ERR_PTR(-EPERM);
+
+ ret = user_path_at(dfd, filename, lookup_flags, &path);
+ if (unlikely(ret))
+ return ERR_PTR(ret);
+
+ if (detached)
+ return open_detached_copy(&path, flags & AT_RECURSIVE);
+
+ return dentry_open(&path, O_PATH, current_cred());
+}
+
+SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags)
+{
+ int fd;
+ struct file *file __free(fput) = NULL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
fd = get_unused_fd_flags(flags & O_CLOEXEC);
if (fd < 0)
return fd;
- error = user_path_at(dfd, filename, lookup_flags, &path);
- if (unlikely(error)) {
- file = ERR_PTR(error);
- } else {
- if (detached)
- file = open_detached_copy(&path, flags & AT_RECURSIVE);
- else
- file = dentry_open(&path, O_PATH, current_cred());
- path_put(&path);
- }
- if (IS_ERR(file)) {
- put_unused_fd(fd);
- return PTR_ERR(file);
- }
- fd_install(fd, file);
+ fd_install(fd, no_free_ptr(file));
return fd;
}
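
From userspace nothing changes; the classic pairing still applies. A hedged example (raw syscall(2) since libc wrappers may be absent, and AT_RECURSIVE needs reasonably current headers; /mnt/src and /mnt/dst are placeholders) that clones a subtree into a detached fd and attaches the copy elsewhere:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fd = syscall(SYS_open_tree, AT_FDCWD, "/mnt/src",
			 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
	if (fd < 0)
		return 1;

	/* MOVE_MOUNT_F_EMPTY_PATH: the source is the fd itself, "" as path */
	if (syscall(SYS_move_mount, fd, "", AT_FDCWD, "/mnt/dst",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0)
		return 1;

	close(fd);
	return 0;
}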
@@ -3070,28 +3385,6 @@ static inline int tree_contains_unbindable(struct mount *mnt)
return 0;
}
-/*
- * Check that there aren't references to earlier/same mount namespaces in the
- * specified subtree. Such references can act as pins for mount namespaces
- * that aren't checked by the mount-cycle checking code, thereby allowing
- * cycles to be made.
- */
-static bool check_for_nsfs_mounts(struct mount *subtree)
-{
- struct mount *p;
- bool ret = false;
-
- lock_mount_hash();
- for (p = subtree; p; p = next_mnt(p, subtree))
- if (mnt_ns_loop(p->mnt.mnt_root))
- goto out;
-
- ret = true;
-out:
- unlock_mount_hash();
- return ret;
-}
-
static int do_set_group(struct path *from_path, struct path *to_path)
{
struct mount *from, *to;
@@ -3267,8 +3560,56 @@ static int can_move_mount_beneath(const struct path *from,
return 0;
}
-static int do_move_mount(struct path *old_path, struct path *new_path,
- bool beneath)
+/* may_use_mount() - check if a mount tree can be used
+ * @mnt: vfsmount to be used
+ *
+ * This helper checks if the caller may use the mount tree starting
+ * from @mnt. The caller may use the mount tree under the
+ * following circumstances:
+ *
+ * (1) The caller is located in the mount namespace of the mount tree.
+ * This also implies that the mount does not belong to an anonymous
+ * mount namespace.
+ * (2) The caller is trying to use a mount tree that belongs to an
+ * anonymous mount namespace.
+ *
+ * For that to be safe, this helper enforces that the origin mount
+ * namespace the anonymous mount namespace was created from is the
+ * same as the caller's mount namespace by comparing the sequence
+ * numbers.
+ *
+ * The ownership of a non-anonymous mount namespace such as the
+ * caller's cannot change.
+ * => We know that the caller's mount namespace is stable.
+ *
+ * If the origin sequence number of the anonymous mount namespace is
+ * the same as the sequence number of the caller's mount namespace.
+ * => The owning namespaces are the same.
+ *
+ * ==> The earlier capability check on the owning namespace of the
+ * caller's mount namespace ensures that the caller has the
+ * ability to use the mount tree.
+ *
+ * Returns true if the mount tree can be used, false otherwise.
+ */
+static inline bool may_use_mount(struct mount *mnt)
+{
+ if (check_mnt(mnt))
+ return true;
+
+ /*
+ * Make sure that no one unmounted the target path or somehow
+ * managed to get their hands on something purely kernel
+ * internal.
+ */
+ if (!is_mounted(&mnt->mnt))
+ return false;
+
+ return check_anonymous_mnt(mnt);
+}
+
+static int do_move_mount(struct path *old_path,
+ struct path *new_path, enum mnt_tree_flags_t flags)
{
struct mnt_namespace *ns;
struct mount *p;
@@ -3276,8 +3617,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
struct mount *parent;
struct mountpoint *mp, *old_mp;
int err;
- bool attached;
- enum mnt_tree_flags_t flags = 0;
+ bool attached, beneath = flags & MNT_TREE_BENEATH;
mp = do_lock_mount(new_path, beneath);
if (IS_ERR(mp))
@@ -3293,8 +3633,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
ns = old->mnt_ns;
err = -EINVAL;
- /* The mountpoint must be in our namespace. */
- if (!check_mnt(p))
+ if (!may_use_mount(p))
goto out;
/* The thing moved must be mounted... */
@@ -3305,6 +3644,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (!(attached ? check_mnt(old) : is_anon_ns(ns)))
goto out;
+ if (is_anon_ns(ns)) {
+ /*
+ * Ending up with two files referring to the root of the
+ * same anonymous mount namespace would cause an error
+ * as this would mean trying to move the same mount
+ * twice into the mount tree which would be rejected
+ * later. But be explicit about it right here.
+ */
+ if (is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)
+ goto out;
+
+ /*
+ * If this is an anonymous mount tree ensure that mount
+ * propagation can detect mounts that were just
+ * propagated to the target mount tree so we don't
+ * propagate onto them.
+ */
+ ns->mntns_flags |= MNTNS_PROPAGATING;
+ } else if (is_anon_ns(p->mnt_ns)) {
+ /*
+ * Don't allow moving an attached mount tree to an
+ * anonymous mount tree.
+ */
+ goto out;
+ }
+
if (old->mnt.mnt_flags & MNT_LOCKED)
goto out;
@@ -3347,6 +3712,9 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
if (err)
goto out;
+ if (is_anon_ns(ns))
+ ns->mntns_flags &= ~MNTNS_PROPAGATING;
+
/* if the mount is moved, it should no longer be expire
* automatically */
list_del_init(&old->mnt_expire);
@@ -3355,10 +3723,13 @@ static int do_move_mount(struct path *old_path, struct path *new_path,
out:
unlock_mount(mp);
if (!err) {
- if (attached)
+ if (attached) {
mntput_no_expire(parent);
- else
+ } else {
+ /* Make sure we notice when we leak mounts. */
+ VFS_WARN_ON_ONCE(!mnt_ns_empty(ns));
free_mnt_ns(ns);
+ }
}
return err;
}
@@ -3375,7 +3746,7 @@ static int do_move_mount_old(struct path *path, const char *old_name)
if (err)
return err;
- err = do_move_mount(&old_path, path, false);
+ err = do_move_mount(&old_path, path, 0);
path_put(&old_path);
return err;
}
@@ -3835,7 +4206,7 @@ int path_mount(const char *dev_name, struct path *path,
data_page);
}
-long do_mount(const char *dev_name, const char __user *dir_name,
+int do_mount(const char *dev_name, const char __user *dir_name,
const char *type_page, unsigned long flags, void *data_page)
{
struct path path;
@@ -3905,6 +4276,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
+ INIT_LIST_HEAD(&new_ns->mnt_ns_list);
RB_CLEAR_NODE(&new_ns->mnt_ns_tree_node);
init_waitqueue_head(&new_ns->poll);
new_ns->user_ns = get_user_ns(user_ns);
@@ -3984,7 +4356,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
while (p->mnt.mnt_root != q->mnt.mnt_root)
p = next_mnt(skip_mnt_tree(p), old);
}
- mnt_ns_tree_add(new_ns);
namespace_unlock();
if (rootmnt)
@@ -3992,6 +4363,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
if (pwdmnt)
mntput(pwdmnt);
+ mnt_ns_tree_add(new_ns);
return new_ns;
}
@@ -4215,6 +4587,21 @@ err_unlock:
return ret;
}
+static inline int vfs_move_mount(struct path *from_path, struct path *to_path,
+ enum mnt_tree_flags_t mflags)
+{
+ int ret;
+
+ ret = security_move_mount(from_path, to_path);
+ if (ret)
+ return ret;
+
+ if (mflags & MNT_TREE_PROPAGATION)
+ return do_set_group(from_path, to_path);
+
+ return do_move_mount(from_path, to_path, mflags);
+}
+
/*
* Move a mount from one place to another. In combination with
* fsopen()/fsmount() this is used to install a new mount and in combination
@@ -4228,8 +4615,12 @@ SYSCALL_DEFINE5(move_mount,
int, to_dfd, const char __user *, to_pathname,
unsigned int, flags)
{
- struct path from_path, to_path;
- unsigned int lflags;
+ struct path to_path __free(path_put) = {};
+ struct path from_path __free(path_put) = {};
+ struct filename *to_name __free(putname) = NULL;
+ struct filename *from_name __free(putname) = NULL;
+ unsigned int lflags, uflags;
+ enum mnt_tree_flags_t mflags = 0;
int ret = 0;
if (!may_mount())
@@ -4242,43 +4633,53 @@ SYSCALL_DEFINE5(move_mount,
(MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP))
return -EINVAL;
- /* If someone gives a pathname, they aren't permitted to move
- * from an fd that requires unmount as we can't get at the flag
- * to clear it afterwards.
- */
+ if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION;
+ if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH;
+
lflags = 0;
if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
-
- ret = user_path_at(from_dfd, from_pathname, lflags, &from_path);
- if (ret < 0)
- return ret;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ from_name = getname_maybe_null(from_pathname, uflags);
+ if (IS_ERR(from_name))
+ return PTR_ERR(from_name);
lflags = 0;
if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW;
if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT;
- if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY;
+ uflags = 0;
+ if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH;
+ to_name = getname_maybe_null(to_pathname, uflags);
+ if (IS_ERR(to_name))
+ return PTR_ERR(to_name);
+
+ if (!to_name && to_dfd >= 0) {
+ CLASS(fd_raw, f_to)(to_dfd);
+ if (fd_empty(f_to))
+ return -EBADF;
+
+ to_path = fd_file(f_to)->f_path;
+ path_get(&to_path);
+ } else {
+ ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL);
+ if (ret)
+ return ret;
+ }
- ret = user_path_at(to_dfd, to_pathname, lflags, &to_path);
- if (ret < 0)
- goto out_from;
+ if (!from_name && from_dfd >= 0) {
+ CLASS(fd_raw, f_from)(from_dfd);
+ if (fd_empty(f_from))
+ return -EBADF;
- ret = security_move_mount(&from_path, &to_path);
- if (ret < 0)
- goto out_to;
+ return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags);
+ }
- if (flags & MOVE_MOUNT_SET_GROUP)
- ret = do_set_group(&from_path, &to_path);
- else
- ret = do_move_mount(&from_path, &to_path,
- (flags & MOVE_MOUNT_BENEATH));
+ ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL);
+ if (ret)
+ return ret;
-out_to:
- path_put(&to_path);
-out_from:
- path_put(&from_path);
- return ret;
+ return vfs_move_mount(&from_path, &to_path, mflags);
}
/*
@@ -4414,6 +4815,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
list_del_init(&new_mnt->mnt_expire);
put_mountpoint(root_mp);
unlock_mount_hash();
+ mnt_notify_add(root_mnt);
+ mnt_notify_add(new_mnt);
chroot_fs_refs(&root, &new);
error = 0;
out4:
@@ -4458,11 +4861,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
return -EINVAL;
/*
- * Once a mount has been idmapped we don't allow it to change its
- * mapping. It makes things simpler and callers can just create
- * another bind-mount they can idmap if they want to.
+ * We only allow a mount to change its idmapping if it has
+ * never been accessible to userspace.
*/
- if (is_idmapped_mnt(m))
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m))
return -EPERM;
/* The underlying filesystem doesn't support idmapped mounts yet. */
@@ -4522,7 +4924,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
break;
}
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
return 0;
}
@@ -4552,18 +4954,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt)
static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
{
+ struct mnt_idmap *old_idmap;
+
if (!kattr->mnt_idmap)
return;
- /*
- * Pairs with smp_load_acquire() in mnt_idmap().
- *
- * Since we only allow a mount to change the idmapping once and
- * verified this in can_idmap_mount() we know that the mount has
- * @nop_mnt_idmap attached to it. So there's no need to drop any
- * references.
- */
+ old_idmap = mnt_idmap(&mnt->mnt);
+
+ /* Pairs with smp_load_acquire() in mnt_idmap(). */
smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap));
+ mnt_idmap_put(old_idmap);
}
static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
@@ -4583,7 +4983,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt)
if (kattr->propagation)
change_mnt_propagation(m, kattr->propagation);
- if (!kattr->recurse)
+ if (!(kattr->kflags & MOUNT_KATTR_RECURSE))
break;
}
touch_mnt_namespace(mnt->mnt_ns);
@@ -4613,7 +5013,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
*/
namespace_lock();
if (kattr->propagation == MS_SHARED) {
- err = invent_group_ids(mnt, kattr->recurse);
+ err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE);
if (err) {
namespace_unlock();
return err;
@@ -4664,7 +5064,7 @@ out:
}
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
struct ns_common *ns;
struct user_namespace *mnt_userns;
@@ -4672,13 +5072,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
- /*
- * We currently do not support clearing an idmapped mount. If this ever
- * is a use-case we can revisit this but for now let's keep it simple
- * and not allow it.
- */
- if (attr->attr_clr & MOUNT_ATTR_IDMAP)
- return -EINVAL;
+ if (attr->attr_clr & MOUNT_ATTR_IDMAP) {
+ /*
+ * We can only remove an idmapping if it's never been
+ * exposed to userspace.
+ */
+ if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE))
+ return -EINVAL;
+
+ /*
+ * Removal of idmappings is equivalent to setting
+ * nop_mnt_idmap.
+ */
+ if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) {
+ kattr->mnt_idmap = &nop_mnt_idmap;
+ return 0;
+ }
+ }
if (attr->userns_fd > INT_MAX)
return -EINVAL;
@@ -4715,22 +5125,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
- struct mount_kattr *kattr, unsigned int flags)
+ struct mount_kattr *kattr)
{
- unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
-
- if (flags & AT_NO_AUTOMOUNT)
- lookup_flags &= ~LOOKUP_AUTOMOUNT;
- if (flags & AT_SYMLINK_NOFOLLOW)
- lookup_flags &= ~LOOKUP_FOLLOW;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
-
- *kattr = (struct mount_kattr) {
- .lookup_flags = lookup_flags,
- .recurse = !!(flags & AT_RECURSIVE),
- };
-
if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS)
return -EINVAL;
if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1)
@@ -4778,35 +5174,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
return -EINVAL;
}
- return build_mount_idmapped(attr, usize, kattr, flags);
+ return build_mount_idmapped(attr, usize, kattr);
}
static void finish_mount_kattr(struct mount_kattr *kattr)
{
- put_user_ns(kattr->mnt_userns);
- kattr->mnt_userns = NULL;
+ if (kattr->mnt_userns) {
+ put_user_ns(kattr->mnt_userns);
+ kattr->mnt_userns = NULL;
+ }
if (kattr->mnt_idmap)
mnt_idmap_put(kattr->mnt_idmap);
}
-SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
- unsigned int, flags, struct mount_attr __user *, uattr,
- size_t, usize)
+static int copy_mount_setattr(struct mount_attr __user *uattr, size_t usize,
+ struct mount_kattr *kattr)
{
- int err;
- struct path target;
+ int ret;
struct mount_attr attr;
- struct mount_kattr kattr;
BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0);
- if (flags & ~(AT_EMPTY_PATH |
- AT_RECURSIVE |
- AT_SYMLINK_NOFOLLOW |
- AT_NO_AUTOMOUNT))
- return -EINVAL;
-
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
if (unlikely(usize < MOUNT_ATTR_SIZE_VER0))
@@ -4815,9 +5204,9 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
if (!may_mount())
return -EPERM;
- err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
- if (err)
- return err;
+ ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize);
+ if (ret)
+ return ret;
/* Don't bother walking through the mounts if this is a nop. */
if (attr.attr_set == 0 &&
@@ -4825,7 +5214,39 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
attr.propagation == 0)
return 0;
- err = build_mount_kattr(&attr, usize, &kattr, flags);
+ return build_mount_kattr(&attr, usize, kattr);
+}
+
+SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
+ unsigned int, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ int err;
+ struct path target;
+ struct mount_kattr kattr;
+ unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW;
+
+ if (flags & ~(AT_EMPTY_PATH |
+ AT_RECURSIVE |
+ AT_SYMLINK_NOFOLLOW |
+ AT_NO_AUTOMOUNT))
+ return -EINVAL;
+
+ if (flags & AT_NO_AUTOMOUNT)
+ lookup_flags &= ~LOOKUP_AUTOMOUNT;
+ if (flags & AT_SYMLINK_NOFOLLOW)
+ lookup_flags &= ~LOOKUP_FOLLOW;
+ if (flags & AT_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
+
+ kattr = (struct mount_kattr) {
+ .lookup_flags = lookup_flags,
+ };
+
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ err = copy_mount_setattr(uattr, usize, &kattr);
if (err)
return err;
@@ -4838,6 +5259,47 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename,
+ unsigned, flags, struct mount_attr __user *, uattr,
+ size_t, usize)
+{
+ struct file __free(fput) *file = NULL;
+ int fd;
+
+ if (!uattr && usize)
+ return -EINVAL;
+
+ file = vfs_open_tree(dfd, filename, flags);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ if (uattr) {
+ int ret;
+ struct mount_kattr kattr = {};
+
+ kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE;
+ if (flags & AT_RECURSIVE)
+ kattr.kflags |= MOUNT_KATTR_RECURSE;
+
+ ret = copy_mount_setattr(uattr, usize, &kattr);
+ if (ret)
+ return ret;
+
+ ret = do_mount_setattr(&file->f_path, &kattr);
+ if (ret)
+ return ret;
+
+ finish_mount_kattr(&kattr);
+ }
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ fd_install(fd, no_free_ptr(file));
+ return fd;
+}
+
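
From userspace the new call folds open_tree(OPEN_TREE_CLONE) and mount_setattr() into one atomic step. A hedged sketch (assumes headers new enough to define __NR_open_tree_attr and struct mount_attr; setting up the user namespace fd is not shown):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Returns a detached, recursively idmapped copy of @path, or -1. */
static int open_idmapped_tree(const char *path, int userns_fd)
{
	struct mount_attr attr = {
		.attr_set  = MOUNT_ATTR_IDMAP,
		.userns_fd = userns_fd,
	};

	return syscall(__NR_open_tree_attr, AT_FDCWD, path,
		       OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE,
		       &attr, sizeof(attr));
}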
int show_path(struct seq_file *m, struct dentry *root)
{
if (root->d_sb->s_op->show_path)
@@ -4861,6 +5323,7 @@ struct kstatmount {
struct statmount __user *buf;
size_t bufsize;
struct vfsmount *mnt;
+ struct mnt_idmap *idmap;
u64 mask;
struct path root;
struct statmount sm;
@@ -5033,26 +5496,29 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
{
struct vfsmount *mnt = s->mnt;
struct super_block *sb = mnt->mnt_sb;
+ size_t start = seq->count;
int err;
- if (sb->s_op->show_options) {
- size_t start = seq->count;
+ err = security_sb_show_options(seq, sb);
+ if (err)
+ return err;
+ if (sb->s_op->show_options) {
err = sb->s_op->show_options(seq, mnt->mnt_root);
if (err)
return err;
+ }
- if (unlikely(seq_has_overflowed(seq)))
- return -EAGAIN;
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
- if (seq->count == start)
- return 0;
+ if (seq->count == start)
+ return 0;
- /* skip leading comma */
- memmove(seq->buf + start, seq->buf + start + 1,
- seq->count - start - 1);
- seq->count--;
- }
+ /* skip leading comma */
+ memmove(seq->buf + start, seq->buf + start + 1,
+ seq->count - start - 1);
+ seq->count--;
return 0;
}
@@ -5127,47 +5593,101 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, true);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_uidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_UIDMAP;
+ return 0;
+}
+
+static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+
+ ret = statmount_mnt_idmap(s->idmap, seq, false);
+ if (ret < 0)
+ return ret;
+
+ s->sm.mnt_gidmap_num = ret;
+ /*
+ * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid
+ * mappings. This allows userspace to distinguish between a
+ * non-idmapped mount and an idmapped mount where none of the
+ * individual mappings are valid in the caller's idmapping.
+ */
+ if (is_valid_mnt_idmap(s->idmap))
+ s->sm.mask |= STATMOUNT_MNT_GIDMAP;
+ return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
- u32 start = seq->count;
+ u32 start, *offp;
+
+ /* Reserve an empty string at the beginning for any unset offsets */
+ if (!seq->count)
+ seq_putc(seq, 0);
+
+ start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = start;
+ offp = &sm->fs_type;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = start;
+ offp = &sm->mnt_root;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = start;
+ offp = &sm->mnt_point;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = start;
+ offp = &sm->mnt_opts;
ret = statmount_mnt_opts(s, seq);
break;
case STATMOUNT_OPT_ARRAY:
- sm->opt_array = start;
+ offp = &sm->opt_array;
ret = statmount_opt_array(s, seq);
break;
case STATMOUNT_OPT_SEC_ARRAY:
- sm->opt_sec_array = start;
+ offp = &sm->opt_sec_array;
ret = statmount_opt_sec_array(s, seq);
break;
case STATMOUNT_FS_SUBTYPE:
- sm->fs_subtype = start;
+ offp = &sm->fs_subtype;
statmount_fs_subtype(s, seq);
break;
case STATMOUNT_SB_SOURCE:
- sm->sb_source = start;
+ offp = &sm->sb_source;
ret = statmount_sb_source(s, seq);
break;
+ case STATMOUNT_MNT_UIDMAP:
+ offp = &sm->mnt_uidmap;
+ ret = statmount_mnt_uidmap(s, seq);
+ break;
+ case STATMOUNT_MNT_GIDMAP:
+ offp = &sm->mnt_gidmap;
+ ret = statmount_mnt_gidmap(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
@@ -5193,6 +5713,7 @@ static int statmount_string(struct kstatmount *s, u64 flag)
seq->buf[seq->count++] = '\0';
sm->mask |= flag;
+ *offp = start;
return 0;
}
@@ -5242,7 +5763,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
* We have to find the first mount in our ns and use that, however it
* may not exist, so handle that properly.
*/
- if (RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_empty(ns))
return -ENOENT;
first = child = ns->root;
@@ -5259,6 +5780,21 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
return 0;
}
+/* This must be updated whenever a new flag is added */
+#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \
+ STATMOUNT_MNT_BASIC | \
+ STATMOUNT_PROPAGATE_FROM | \
+ STATMOUNT_MNT_ROOT | \
+ STATMOUNT_MNT_POINT | \
+ STATMOUNT_FS_TYPE | \
+ STATMOUNT_MNT_NS_ID | \
+ STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | \
+ STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | \
+ STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_SUPPORTED_MASK)
+
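
With STATMOUNT_SUPPORTED_MASK userspace can probe once which STATMOUNT_* bits the running kernel implements instead of guessing per flag. A hedged example (raw syscalls; assumes headers that define SYS_statmount and STATX_MNT_ID_UNIQUE):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/mount.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char buf[1 << 15];
	struct statmount *sm = (struct statmount *)buf;
	struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0 };
	struct statx stx;

	/* 64-bit unique mount ID of "/" via statx() */
	if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &stx) < 0)
		return 1;

	req.mnt_id = stx.stx_mnt_id;
	req.param = STATMOUNT_SUPPORTED_MASK;	/* requested fields */

	if (syscall(SYS_statmount, &req, sm, sizeof(buf), 0) < 0)
		return 1;

	if (sm->mask & STATMOUNT_SUPPORTED_MASK)
		printf("supported: %#llx\n", (unsigned long long)sm->supported_mask);
	return 0;
}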
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
@@ -5267,7 +5803,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
int err;
/* Has the namespace already been emptied? */
- if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts))
+ if (mnt_ns_id && mnt_ns_empty(ns))
return -ENOENT;
s->mnt = lookup_mnt_in_ns(mnt_id, ns);
@@ -5292,6 +5828,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
return err;
s->root = root;
+ s->idmap = mnt_idmap(s->mnt);
if (s->mask & STATMOUNT_SB_BASIC)
statmount_sb_basic(s);
@@ -5325,12 +5862,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_SB_SOURCE)
err = statmount_string(s, STATMOUNT_SB_SOURCE);
+ if (!err && s->mask & STATMOUNT_MNT_UIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_UIDMAP);
+
+ if (!err && s->mask & STATMOUNT_MNT_GIDMAP)
+ err = statmount_string(s, STATMOUNT_MNT_GIDMAP);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
+ if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) {
+ s->sm.mask |= STATMOUNT_SUPPORTED_MASK;
+ s->sm.supported_mask = STATMOUNT_SUPPORTED;
+ }
+
if (err)
return err;
+ /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */
+ WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask);
+
return 0;
}
@@ -5348,7 +5899,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
- STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \
+ STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
@@ -5525,9 +6077,9 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
if (!last_mnt_id) {
if (reverse)
- first = node_to_mount(rb_last(&ns->mounts));
+ first = node_to_mount(ns->mnt_last_node);
else
- first = node_to_mount(rb_first(&ns->mounts));
+ first = node_to_mount(ns->mnt_first_node);
} else {
if (reverse)
first = mnt_find_id_at_reverse(ns, last_mnt_id - 1);
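
The cached mnt_first_node/mnt_last_node pointers make the common entry into this loop (no continuation ID) O(1) instead of an rbtree descent. For reference, a hedged sketch of the userspace side paging through a namespace with listmount(), resuming from the last returned ID (raw syscall; LSMT_ROOT selects the whole namespace):

#define _GNU_SOURCE
#include <linux/mount.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	__u64 ids[256];
	struct mnt_id_req req = {
		.size   = MNT_ID_REQ_SIZE_VER0,
		.mnt_id = LSMT_ROOT,	/* list every mount in our namespace */
	};
	long n;

	while ((n = syscall(SYS_listmount, &req, ids, 256, 0)) > 0) {
		for (long i = 0; i < n; i++)
			printf("%llu\n", (unsigned long long)ids[i]);
		req.param = ids[n - 1];	/* continue after the last seen ID */
	}
	return n < 0 ? 1 : 0;
}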
@@ -5927,7 +6479,7 @@ const struct proc_ns_operations mntns_operations = {
};
#ifdef CONFIG_SYSCTL
-static struct ctl_table fs_namespace_sysctls[] = {
+static const struct ctl_table fs_namespace_sysctls[] = {
{
.procname = "mount-max",
.data = &sysctl_mount_max,