From 0e0e490f5d5ec2f91209b77a95f9c7185d97cfc6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 30 Apr 2026 15:42:43 -0400 Subject: VFS: use wait_var_event for waiting in d_alloc_parallel() Parallel lookup starts with a call of d_alloc_parallel(). That primitive either returns a matching hashed dentry or allocates a new one in the in-lookup state and returns it to the caller. Once the caller is done with lookup, it indicates so either by call of d_{splice_alias,add}() or by call of d_done_lookup(); at that point dentry leaves the in-lookup state. If d_alloc_parallel() finds a matching in-lookup dentry, it must wait for that dentry to leave the in-lookup state, one way or another. Currently by supplying wait_queue_head to d_alloc_parallel(). If d_alloc_parallel() creates a new in-lookup dentry, the address of that wait_queue_head is stored in ->d_wait of new dentry and stays there while it's in the in-lookup; subsequent d_alloc_parallel() will wait on the queue found in the matching in-lookup dentry. Transition out of in-lookup state wakes waiters on that queue (if any). That works, but the calling conventions are inconvenient - the caller must supply wait_queue_head and make sure that it survives at least until the new in-lookup dentry leaves the in-lookup state. That amounts to boilerplate in the d_alloc_parallel() callers that are followed by a call of d_lookup_done() in the same function; in cases like nfs asynchronous unlink it gets worse than that. This patch changes d_alloc_parallel() to use wake_up_var_locked() to wake up waiters, and wait_var_event_spinlock() to wait. dentry->d_lock is used for synchronisation as it is already held and the relevant times. That eliminates the need of caller-supplied wait_queue_head, simplifying the calling conventions. Better yet, we only need one bit of information stored in dentry itself: whether there are any waiters to be woken up, and that can be easily stored in ->d_flags; ->d_wait goes away. The reason we need that bit (DCACHE_LOOKUP_WAITERS) is that with wait_var machinery the queues are shared with all kinds of stuff and there's no way tell if any of the waiters have anything to do with our dentry; most of the time none of them will be relevant, so we need to avoid the pointless wakeups. Another benefit of the new scheme comes from the fact that wakeups have to be done outside of write-side critical areas of ->i_dir_seq; with the old scheme we need to carry the value picked from ->d_wait from __d_lookup_unhash() to the place where we actually wake the waiters up. Now we can just leave DCACHE_LOOKUP_WAITERS in ->d_flags until we get to doing wakeups - that's done within the same ->d_lock scope, so we are fine; new bit is accessed only under ->d_lock and it's seen only on dentries with DCACHE_PAR_LOOKUP in ->d_flags. __d_lookup_unhash() no longer needs to re-init ->d_lru. That was previously shared (in a union) with ->d_wait but ->d_wait is now gone so it no longer corrupts ->d_lru. Co-developed-by: Al Viro # saner handling of flags Signed-off-by: NeilBrown Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 6 +++ fs/afs/dir_silly.c | 4 +- fs/dcache.c | 83 +++++++++++++++++++---------------- fs/fuse/readdir.c | 3 +- fs/namei.c | 6 +-- fs/nfs/dir.c | 6 +-- fs/nfs/unlink.c | 3 +- fs/proc/base.c | 3 +- fs/proc/proc_sysctl.c | 3 +- fs/smb/client/readdir.c | 3 +- include/linux/dcache.h | 11 +++-- include/linux/nfs_xdr.h | 1 - 12 files changed, 67 insertions(+), 65 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index fdf074429cd3..36fecc7a3d97 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1385,3 +1385,9 @@ for_each_alias(dentry, inode) instead of hlist_for_each_entry; better yet, see if any of the exported primitives could be used instead of the entire loop. You still need to hold ->i_lock of the inode over either form of manual loop. + +--- + +**mandatory** + +d_alloc_parallel() no longer requires a waitqueue_head. diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index a748fd133faf..982bb6ec15f0 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -248,13 +248,11 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) struct dentry *alias; int ret; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); down_read(&dvnode->rmdir_lock); - alias = d_alloc_parallel(dentry->d_parent, &dentry->d_name, &wq); + alias = d_alloc_parallel(dentry->d_parent, &dentry->d_name); if (IS_ERR(alias)) { up_read(&dvnode->rmdir_lock); return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 2c61aeea41f4..0aff2c510beb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2250,8 +2250,7 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, return found; } if (d_in_lookup(dentry)) { - found = d_alloc_parallel(dentry->d_parent, name, - dentry->d_wait); + found = d_alloc_parallel(dentry->d_parent, name); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; @@ -2638,32 +2637,24 @@ static inline unsigned start_dir_add(struct inode *dir) } } -static inline void end_dir_add(struct inode *dir, unsigned int n, - wait_queue_head_t *d_wait) +static inline void end_dir_add(struct inode *dir, unsigned int n) { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); - if (wq_has_sleeper(d_wait)) - wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) { - if (d_in_lookup(dentry)) { - DECLARE_WAITQUEUE(wait, current); - add_wait_queue(dentry->d_wait, &wait); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(&dentry->d_lock); - schedule(); - spin_lock(&dentry->d_lock); - } while (d_in_lookup(dentry)); + if (likely(d_in_lookup(dentry))) { + dentry->d_flags |= DCACHE_LOOKUP_WAITERS; + wait_var_event_spinlock(&dentry->d_flags, + !d_in_lookup(dentry), + &dentry->d_lock); } } struct dentry *d_alloc_parallel(struct dentry *parent, - const struct qstr *name, - wait_queue_head_t *wq) + const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); @@ -2766,7 +2757,6 @@ retry: return dentry; } rcu_read_unlock(); - new->d_wait = wq; hlist_bl_add_head(&new->d_in_lookup_hash, b); hlist_bl_unlock(b); return new; @@ -2778,13 +2768,26 @@ mismatch: EXPORT_SYMBOL(d_alloc_parallel); /* - * - Unhash the dentry - * - Retrieve and clear the waitqueue head in dentry - * - Return the waitqueue head + * Move dentry from in-lookup state to busy-negative one. + * + * From now on d_in_lookup(dentry) will return false and dentry is gone from + * in-lookup hash. + * + * Anyone who had been waiting on it in d_alloc_parallel() is free to + * proceed after that. Note that waking such waiters up is left to + * the callers; PREEMPT_RT kernels can't have that wakeup done while + * in write-side critical area for ->i_dir_seq, so it's done by calling + * __d_wake_in_lookup_waiters() once it's safe to do so. + * + * Both __d_lookup_unhash() and __d_wake_in_lookup_waiters() should + * be called within the same ->d_lock scope. PAR_LOOKUP is cleared + * here, while LOOKUP_WAITERS (set by somebody finding dentry in + * the in-lookup hash and setting down to wait) is checked and cleared + * in __d_wake_in_lookup_waiters(). Both are gone by the end of + * ->d_lock scope. */ -static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) +static void __d_lookup_unhash(struct dentry *dentry) { - wait_queue_head_t *d_wait; struct hlist_bl_head *b; lockdep_assert_held(&dentry->d_lock); @@ -2793,18 +2796,23 @@ static wait_queue_head_t *__d_lookup_unhash(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_in_lookup_hash); - d_wait = dentry->d_wait; - dentry->d_wait = NULL; hlist_bl_unlock(b); dentry->waiters = NULL; - INIT_LIST_HEAD(&dentry->d_lru); - return d_wait; +} + +static inline void __d_wake_in_lookup_waiters(struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_LOOKUP_WAITERS) { + wake_up_var_locked(&dentry->d_flags, &dentry->d_lock); + dentry->d_flags &= ~DCACHE_LOOKUP_WAITERS; + } } void __d_lookup_unhash_wake(struct dentry *dentry) { spin_lock(&dentry->d_lock); - wake_up_all(__d_lookup_unhash(dentry)); + __d_lookup_unhash(dentry); + __d_wake_in_lookup_waiters(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(__d_lookup_unhash_wake); @@ -2814,14 +2822,13 @@ EXPORT_SYMBOL(__d_lookup_unhash_wake); static inline void __d_add(struct dentry *dentry, struct inode *inode, const struct dentry_operations *ops) { - wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); - d_wait = __d_lookup_unhash(dentry); + __d_lookup_unhash(dentry); } if (unlikely(ops)) d_set_d_op(dentry, ops); @@ -2834,8 +2841,10 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode, fsnotify_update_flags(dentry); } __d_rehash(dentry); - if (dir) - end_dir_add(dir, n, d_wait); + if (dir) { + end_dir_add(dir, n); + __d_wake_in_lookup_waiters(dentry); + } spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); @@ -2948,7 +2957,6 @@ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; - wait_queue_head_t *d_wait; struct inode *dir = NULL; unsigned n; @@ -2979,7 +2987,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target, if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); - d_wait = __d_lookup_unhash(target); + __d_lookup_unhash(target); } write_seqcount_begin(&dentry->d_seq); @@ -3018,9 +3026,10 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); - if (dir) - end_dir_add(dir, n, d_wait); - + if (dir) { + end_dir_add(dir, n); + __d_wake_in_lookup_waiters(target); + } if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index db5ae8ec1030..a2361f1d9905 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -164,7 +164,6 @@ static int fuse_direntplus_link(struct file *file, struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); int epoch; if (!o->nodeid) { @@ -201,7 +200,7 @@ static int fuse_direntplus_link(struct file *file, dentry = d_lookup(parent, &name); if (!dentry) { retry: - dentry = d_alloc_parallel(parent, &name, &wq); + dentry = d_alloc_parallel(parent, &name); if (IS_ERR(dentry)) return PTR_ERR(dentry); } diff --git a/fs/namei.c b/fs/namei.c index c7fac83c9a85..ebde3a35746c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1891,13 +1891,12 @@ static struct dentry *__lookup_slow(const struct qstr *name, { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) return ERR_PTR(-ENOENT); again: - dentry = d_alloc_parallel(dir, name, &wq); + dentry = d_alloc_parallel(dir, name); if (IS_ERR(dentry)) return dentry; if (unlikely(!d_in_lookup(dentry))) { @@ -4414,7 +4413,6 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); @@ -4423,7 +4421,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, dentry = d_lookup(dir, &nd->last); for (;;) { if (!dentry) { - dentry = d_alloc_parallel(dir, &nd->last, &wq); + dentry = d_alloc_parallel(dir, &nd->last); if (IS_ERR(dentry)) return dentry; } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e9ce1883288c..9580af999d70 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -726,7 +726,6 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; @@ -755,7 +754,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, dentry = d_lookup(parent, &filename); again: if (!dentry) { - dentry = d_alloc_parallel(parent, &filename, &wq); + dentry = d_alloc_parallel(parent, &filename); if (IS_ERR(dentry)) return; } @@ -2106,7 +2105,6 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; @@ -2162,7 +2160,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, d_drop(dentry); switched = true; dentry = d_alloc_parallel(dentry->d_parent, - &dentry->d_name, &wq); + &dentry->d_name); if (IS_ERR(dentry)) return PTR_ERR(dentry); if (unlikely(!d_in_lookup(dentry))) diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index df3ca4669df6..43ea897943c0 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -124,7 +124,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct inode *inode, struct nf struct dentry *alias; down_read_non_owner(&NFS_I(dir)->rmdir_sem); - alias = d_alloc_parallel(dentry->d_parent, &data->args.name, &data->wq); + alias = d_alloc_parallel(dentry->d_parent, &data->args.name); if (IS_ERR(alias)) { up_read_non_owner(&NFS_I(dir)->rmdir_sem); return 0; @@ -185,7 +185,6 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) data->cred = get_current_cred(); data->res.dir_attr = &data->dir_attr; - init_waitqueue_head(&data->wq); status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/proc/base.c b/fs/proc/base.c index d9acfa89c894..d55a4b603188 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2132,8 +2132,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, goto end_instantiate; if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - child = d_alloc_parallel(dir, &qname, &wq); + child = d_alloc_parallel(dir, &qname); if (IS_ERR(child)) goto end_instantiate; if (d_in_lookup(child)) { diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 49ab74e0bfde..04a382178c65 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -692,8 +692,7 @@ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - child = d_alloc_parallel(dir, &qname, &wq); + child = d_alloc_parallel(dir, &qname); if (IS_ERR(child)) return false; if (d_in_lookup(child)) { diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index e860fa08b5e3..1ff77f3d1de0 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -73,7 +73,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); bool posix = cifs_sb_master_tcon(cifs_sb)->posix_extensions; bool reparse_need_reval = false; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); int rc; cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -105,7 +104,7 @@ retry: (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)) return; - dentry = d_alloc_parallel(parent, name, &wq); + dentry = d_alloc_parallel(parent, name); } if (IS_ERR(dentry)) return; diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2577c05f84ec..97a887be150a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -116,10 +116,7 @@ struct dentry { * possible! */ - union { - struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ - }; + struct list_head d_lru; /* LRU list */ struct hlist_node d_sib; /* child of parent list */ struct hlist_head d_children; /* our children */ /* @@ -210,6 +207,9 @@ enum dentry_flags { DCACHE_REFERENCED = BIT(6), /* Recently used, don't discard. */ DCACHE_DONTCACHE = BIT(7), /* Purge from memory on final dput() */ DCACHE_CANT_MOUNT = BIT(8), + DCACHE_LOOKUP_WAITERS = BIT(9), /* A thread is waiting for + * PAR_LOOKUP to clear + */ DCACHE_SHRINK_LIST = BIT(10), DCACHE_OP_WEAK_REVALIDATE = BIT(11), /* @@ -256,8 +256,7 @@ extern void d_delete(struct dentry *); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); -extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); +extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); /* weird procfs mess; *NOT* exported */ extern struct dentry * d_splice_alias_ops(struct inode *, struct dentry *, diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index fcbd21b5685f..6aced49d5f00 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1743,7 +1743,6 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; -- cgit v1.2.3 From 4a810147a07535e63a7c1fbb90975012e8d57f60 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 27 Apr 2026 14:19:28 -0400 Subject: alloc_path_pseudo(): make sure we don't end up with NORCU dentries for directories A lot of places relies upon directories never having NORCU dentries; currently that property holds, but the proof is not straightforward and rather brittle. It's better to have that verified in the sole caller of d_alloc_pseudo(), so that any future bugs in that direction were caught early. That way we can be sure that * current directory of any process is not NORCU * root directory of any process is not NORCU * starting point of any LOOKUP_RCU pathwalk is not NORCU * dget_parent() can rely upon ->d_parent not being NORCU * d_walk() and is_subdir() can rely upon the same * alloc_file_pseudo() won't create multiple aliases for a directory without having to go through a convoluted audit. Signed-off-by: Al Viro --- fs/file_table.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/file_table.c b/fs/file_table.c index 16e52e7fc2ac..108ba09fb402 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -402,6 +402,8 @@ static struct file *alloc_file(const struct path *path, int flags, static inline int alloc_path_pseudo(const char *name, struct inode *inode, struct vfsmount *mnt, struct path *path) { + if (WARN_ON_ONCE(S_ISDIR(inode->i_mode))) + return -EINVAL; path->dentry = d_alloc_pseudo(mnt->mnt_sb, &QSTR(name)); if (!path->dentry) return -ENOMEM; -- cgit v1.2.3 From 1a967a7ec70ff951716e84739f79b6e167ac1e0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 May 2026 23:00:09 -0400 Subject: fix a race between d_find_any_alias() and final dput() of NORCU dentries Refcount of a NORCU dentry must not be incremented after having dropped to zero. Otherwise we might end up with the following race: CPU1: in fast_dput(d), rcu_read_lock(); CPU1: decrements refcount of d to 0 CPU1: notice that it's unhashed CPU2: grab a reference to d CPU2: dput(d), freeing d CPU1: ... looks like we need to evict d, let's grab ->d_lock, recheck the refcount, etc. and that spin_lock(&d->d_lock) ends up a UAF, despite still being in an RCU read-side critical area started back when the refcount had been positive. If not for DCACHE_NORCU in d->d_flags freeing would've been RCU-delayed, so we'd have grabbed ->d_lock, noticed the negative value stored into refcount by __dentry_kill(), dropped the locks and that would be it. For NORCU dentries freeing is _not_ delayed, though. Most of the non-counting references are excluded for NORCU dentries - they are not allowed to be hashed, they never get placed on LRU, they never get placed into anyone's list of children and while dput_to_list() might put them into a shrink list, nobody bumps refcount of something that had been reached that way. However, inode's list of aliases can be a problem - it does not contribute to dentry refcount (for obvious reasons) and we *do* have places that grab references to something found on that list - that's precisely what d_find_alias() is. In case of d_find_alias() we are safe - it skips unhashed aliases, so all NORCU ones are ignored there. d_find_any_alias() is *not* limited to hashed ones, though, and while it's usually called for directories (which never get NORCU dentries), there are callers that use it to get something for non-directories with no hashed aliases. Having d_find_any_alias() hit a NORCU dentry is not impossible - it can be easily arranged if you have CAP_DAC_READ_SEARCH (memfd_create() + mmap() + name_to_handle_at() for /proc/self/map_files/<...> + munmap() + open_by_handle_at() will do that, and adding a second memfd_create() for mount_fd makes it possible to do that without having memfd pinned). The race window is narrow, and it's probably not feasible on bare hardware, but... It's not hard to fix, fortunately: * separate __d_find_dir_alias() (== current __d_find_any_alias()) to be used for directory inodes. * provide dget_alias_ilocked() that would return false for NORCU dentries with zero refcount and return true incrementing refcount otherwise * make __d_find_any_alias() go over the list of aliases, using dget_alias_ilocked() and returning the alias it succeeds on (normally the first one). Any NORCU alias with zero refcount is going to be evicted by the thread that had dropped the final reference; this makes __d_find_any_alias() pretend it had lost the race with eviction. Signed-off-by: Al Viro --- fs/dcache.c | 21 ++++++++++++++++++--- include/linux/dcache.h | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 0aff2c510beb..fa12e18906b9 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1052,7 +1052,10 @@ repeat: } EXPORT_SYMBOL(dget_parent); -static struct dentry * __d_find_any_alias(struct inode *inode) +/* + * inode is a directory, inode->i_lock is held by the caller + */ +static struct dentry * __d_find_dir_alias(struct inode *inode) { struct dentry *alias; @@ -1063,6 +1066,18 @@ static struct dentry * __d_find_any_alias(struct inode *inode) return alias; } +static struct dentry * __d_find_any_alias(struct inode *inode) +{ + struct dentry *alias; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + for_each_alias(alias, inode) + if (dget_alias_ilocked(alias)) + return alias; + return NULL; +} + /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for @@ -1086,7 +1101,7 @@ static struct dentry *__d_find_alias(struct inode *inode) struct dentry *alias; if (S_ISDIR(inode->i_mode)) - return __d_find_any_alias(inode); + return __d_find_dir_alias(inode); for_each_alias(alias, inode) { spin_lock(&alias->d_lock); @@ -3150,7 +3165,7 @@ struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { - struct dentry *new = __d_find_any_alias(inode); + struct dentry *new = __d_find_dir_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 97a887be150a..a3409de3f490 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -365,6 +365,24 @@ static inline struct dentry *dget(struct dentry *dentry) return dentry; } +/* dentry->d_inode->i_lock must be held by caller */ +static inline bool dget_alias_ilocked(struct dentry *dentry) +{ + if (likely(!(READ_ONCE(dentry->d_flags) & DCACHE_NORCU))) { + lockref_get(&dentry->d_lockref); + return true; + } + // NORCU dentries with zero refcount MUST NOT be grabbed + spin_lock(&dentry->d_lock); + if (dentry->d_lockref.count > 0) { + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + return true; + } + spin_unlock(&dentry->d_lock); + return false; +} + extern struct dentry *dget_parent(struct dentry *dentry); /** -- cgit v1.2.3 From 87e801e1678342fc23b1eb92c0eecedf5dca79cb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 May 2026 00:32:43 -0400 Subject: find_acceptable_alias(): skip NORCU aliases with zero refcount similar to d_find_any_alias() situation Signed-off-by: Al Viro --- fs/exportfs/expfs.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index fbd45e7ae706..eafd99507afe 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -53,10 +53,10 @@ find_acceptable_alias(struct dentry *result, inode = result->d_inode; spin_lock(&inode->i_lock); for_each_alias(dentry, inode) { - dget(dentry); + if (!dget_alias_ilocked(dentry)) + continue; spin_unlock(&inode->i_lock); - if (toput) - dput(toput); + dput(toput); if (dentry != result && acceptable(context, dentry)) { dput(result); return dentry; @@ -66,8 +66,7 @@ find_acceptable_alias(struct dentry *result, } spin_unlock(&inode->i_lock); - if (toput) - dput(toput); + dput(toput); return NULL; } -- cgit v1.2.3 From f601ede8fd166f8f2c84db427b94f80f1b0cbab1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Apr 2026 14:17:52 -0400 Subject: select_collect(): ignore dentries on shrink lists if they have positive refcounts If all dentries we find have positive refcounts and some happen to be on shrink lists, there's no point trying to steal them in the select_collect2() phase - we won't be able to evict any of them. Busy on shrink lists is still busy... Signed-off-by: Al Viro --- fs/dcache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index fa12e18906b9..40031b806b73 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1589,9 +1589,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (dentry->d_flags & DCACHE_SHRINK_LIST) { - data->found++; - } else if (!dentry->d_lockref.count) { + if (!dentry->d_lockref.count) { to_shrink_list(dentry, &data->dispose); data->found++; } else if (dentry->d_lockref.count < 0) { -- cgit v1.2.3 From 20aa45a87c308552781bc33dfb89a933ba5e8f43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Apr 2026 14:35:38 -0400 Subject: make to_shrink_list() return whether it has moved dentry to list ... and make it check the refcount for being zero in addition to dentry not being on a shrink list already. Simplifies the callers... Signed-off-by: Al Viro --- fs/dcache.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 40031b806b73..7a0c8349b5a1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -988,14 +988,17 @@ void d_make_discardable(struct dentry *dentry) } EXPORT_SYMBOL(d_make_discardable); -static void to_shrink_list(struct dentry *dentry, struct list_head *list) +static bool to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { - if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { + if (likely(!dentry->d_lockref.count && + !(dentry->d_flags & DCACHE_SHRINK_LIST))) { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); d_shrink_add(dentry, list); + return true; } + return false; } void dput_to_list(struct dentry *dentry, struct list_head *list) @@ -1180,8 +1183,7 @@ struct dentry *d_find_alias_rcu(struct inode *inode) void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) { spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, dispose); + to_shrink_list(dentry, dispose); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_dispose_if_unused); @@ -1589,11 +1591,9 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (!dentry->d_lockref.count) { + if (dentry->d_lockref.count <= 0) { to_shrink_list(dentry, &data->dispose); data->found++; - } else if (dentry->d_lockref.count < 0) { - data->found++; } /* * We can return to the caller if we have found some (this @@ -1623,17 +1623,12 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - if (!dentry->d_lockref.count) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) { + if (dentry->d_lockref.count <= 0) { + if (!to_shrink_list(dentry, &data->dispose)) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } - to_shrink_list(dentry, &data->dispose); - } else if (dentry->d_lockref.count < 0) { - rcu_read_lock(); - data->victim = dentry; - return D_WALK_QUIT; } /* * We can return to the caller if we have found some (this -- cgit v1.2.3 From 67132b5e5de8a01493ddbb217e936415ca4e09af Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 12 Apr 2026 23:39:16 -0400 Subject: kill d_dispose_if_unused() Rename to_shrink_list() into __move_to_shrink_list(), document and export it. Switch d_dispose_if_unused() users to that and kill d_dispose_if_unused() itself. Signed-off-by: Al Viro --- Documentation/filesystems/porting.rst | 11 ++++++++ fs/dcache.c | 51 ++++++++++++++++++----------------- fs/fuse/dir.c | 2 +- include/linux/dcache.h | 2 +- 4 files changed, 40 insertions(+), 26 deletions(-) diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 36fecc7a3d97..003ab084ad48 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1391,3 +1391,14 @@ either form of manual loop. **mandatory** d_alloc_parallel() no longer requires a waitqueue_head. + +--- + +**mandatory** + +d_dispose_if_unused() is gone; use __move_to_shrink_list() if you really +need that functionality, but watch out for memory safety issues - just +as with d_dispose_if_unused() these are not trivial; with this variant +of API it's more explicit, since grabbing ->d_lock is caller-side, but +d_dispose_if_unused() had all the same issues. It's a low-level primitive; +use only if you have no alternative. diff --git a/fs/dcache.c b/fs/dcache.c index 7a0c8349b5a1..ea26bd2dd2b2 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -988,7 +988,24 @@ void d_make_discardable(struct dentry *dentry) } EXPORT_SYMBOL(d_make_discardable); -static bool to_shrink_list(struct dentry *dentry, struct list_head *list) +/** + * __move_to_shrink_list - try to place a dentry into a shrink list + * @dentry: dentry to try putting into shrink list + * @list: the list to put @dentry into. + * Returns: true @dentry had been placed into @list, false otherwise + * + * If @dentry is idle and not already include into a shrink list, move + * it into @list and return %true; otherwise do nothing and return %false. + * + * Caller must be holding @dentry->d_lock. There must have been no calls of + * dentry_free(@dentry) prior to the beginning of the RCU read-side critical + * area in which __move_to_shrink_list(@dentry, @list) is called. + * + * @list should be thread-private and eventually emptied by passing it to + * shrink_dentry_list(). + */ + +bool __move_to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (likely(!dentry->d_lockref.count && @@ -1000,6 +1017,7 @@ __must_hold(&dentry->d_lock) } return false; } +EXPORT_SYMBOL(__move_to_shrink_list); void dput_to_list(struct dentry *dentry, struct list_head *list) { @@ -1009,7 +1027,7 @@ void dput_to_list(struct dentry *dentry, struct list_head *list) return; } rcu_read_unlock(); - to_shrink_list(dentry, list); + __move_to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } @@ -1170,24 +1188,6 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } -/** - * d_dispose_if_unused - move unreferenced dentries to shrink list - * @dentry: dentry in question - * @dispose: head of shrink list - * - * If dentry has no external references, move it to shrink list. - * - * NOTE!!! The caller is responsible for preventing eviction of the dentry by - * holding dentry->d_inode->i_lock or equivalent. - */ -void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) -{ - spin_lock(&dentry->d_lock); - to_shrink_list(dentry, dispose); - spin_unlock(&dentry->d_lock); -} -EXPORT_SYMBOL(d_dispose_if_unused); - /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. @@ -1198,8 +1198,11 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - for_each_alias(dentry, inode) - d_dispose_if_unused(dentry, &dispose); + for_each_alias(dentry, inode) { + spin_lock(&dentry->d_lock); + __move_to_shrink_list(dentry, &dispose); + spin_unlock(&dentry->d_lock); + } spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1592,7 +1595,7 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) goto out; if (dentry->d_lockref.count <= 0) { - to_shrink_list(dentry, &data->dispose); + __move_to_shrink_list(dentry, &data->dispose); data->found++; } /* @@ -1624,7 +1627,7 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) goto out; if (dentry->d_lockref.count <= 0) { - if (!to_shrink_list(dentry, &data->dispose)) { + if (!__move_to_shrink_list(dentry, &data->dispose)) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b658b6baf72f..d8e8ea7280bc 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -177,8 +177,8 @@ static void fuse_dentry_tree_work(struct work_struct *work) spin_lock(&fd->dentry->d_lock); /* If dentry is still referenced, let next dput release it */ fd->dentry->d_flags |= DCACHE_OP_DELETE; + __move_to_shrink_list(fd->dentry, &dispose); spin_unlock(&fd->dentry->d_lock); - d_dispose_if_unused(fd->dentry, &dispose); if (need_resched()) { spin_unlock(&dentry_hash[i].lock); cond_resched(); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a3409de3f490..4b1ff99608e0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -280,7 +280,7 @@ extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); -extern void d_dispose_if_unused(struct dentry *, struct list_head *); +extern bool __move_to_shrink_list(struct dentry *, struct list_head *); extern void shrink_dentry_list(struct list_head *); extern struct dentry *d_find_alias_rcu(struct inode *); -- cgit v1.2.3 From b9c505cbf4acd6f9fa5ebe183f434b72ac3199f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 May 2026 02:49:20 -0400 Subject: d_prune_aliases(): make sure to skip NORCU aliases Either they are busy (in which case they won't be moved to shrink list anyway) or they have a zero refcount, in which case we really shouldn't mess with them - whoever had dropped the refcount to zero is on the way to evicting and freeing them. That way we are guaranteed that only the thread that has dropped refcount of NORCU dentry to zero might call lock_for_kill() and __dentry_kill() for those. Signed-off-by: Al Viro --- fs/dcache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index ea26bd2dd2b2..b5604b84bafb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1200,7 +1200,8 @@ void d_prune_aliases(struct inode *inode) spin_lock(&inode->i_lock); for_each_alias(dentry, inode) { spin_lock(&dentry->d_lock); - __move_to_shrink_list(dentry, &dispose); + if (likely(!(dentry->d_flags & DCACHE_NORCU))) + __move_to_shrink_list(dentry, &dispose); spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); -- cgit v1.2.3 From cb2ae3c99214013bde780a5e4c575588cf301eba Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 01:52:53 -0400 Subject: shrink_dentry_list(): start with removing from shrink list Currently we leave dentry on the list until we are done with lock_for_kill(). That guarantees that it won't have been even scheduled for removal until we remove it from the list and drop ->d_lock. We grab ->d_lock and rcu_read_lock() and call lock_for_kill(). There are four possible cases: 1) lock_for_kill() has succeeded; dentry and its inode (if any) are locked, dentry refcount is zero and we can remove it from shrink list and feed it to shrink_kill(). 2) lock_for_kill() fails since dentry has become busy. Nothing to do, rcu_read_unlock(), remove from shrink list, drop ->d_lock and move on. 3) lock_for_kill() fails since dentry is currently being killed - already entered __dentry_kill(), but hasn't reached dentry_unlist() yet. Nothing to do, we should just do rcu_read_unlock(), remove from shrink list so that whoever's executing __dentry_kill() would free it once they are done, drop ->d_lock and move on - same actions as in case (2). 4) lock_for_kill() fails since dentry has been killed (reached dentry_unlist(), DCACHE_DENTRY_KILLED set in ->d_flags). In that case whoever had been killing it had already seen it on our shrink list and skipped freeing it. At that point it's just a passive chunk of memory; rcu_read_unlock(), remove from the list, drop ->d_lock and use dentry_free() to schedule freeing. While that works, there's a simpler way to do it: * grab ->d_lock * remove dentry from our shrink list * if DCACHE_DENTRY_KILLED is already set, drop ->d_lock, call dentry_free() and move on. * otherwise grab rcu_read_lock() and call lock_for_free() * if lock_for_kill() succeeds, feed dentry to shrink_kill(), otherwise drop the locks and move on. The end result is equivalent to the old variant. The only difference arises if at the time we grab ->d_lock dentry had refcount 0 and lock_for_kill() had failed spin_trylock() and had to drop and regain ->d_lock. Otherwise nobody can observe at which point within the unbroken ->d_lock scope dentry had been removed from the shrink list - all accesses to ->d_lru are under ->d_lock. If ->d_lock had been dropped and regained, it is possible for another thread to feed that dentry to __dentry_kill(); if it doesn't get to dentry_unlist() before we regain ->d_lock, behaviour is still identical - it's case (3) and by the time __dentry_kill() would've gotten around to checking if the victim is on shrink list, it would've been already removed from ours. If __dentry_kill() from another thread *does* get to dentry_unlist(), in the old variant we would have __dentry_kill() leave calling dentry_free() to us and in the new one __dentry_kill() would've called dentry_free() itself. Since we are under rcu_read_lock(), we are guaranteed that actual freeing won't happen until we get around to rcu_read_unlock(). IOW, the new variant is still safe wrt UAF, if not for the same reason as the old one, and overall result is the same; the only difference is which threads ends up scheduling the actual freeing of dentry. Signed-off-by: Al Viro --- fs/dcache.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index b5604b84bafb..ffaaf48533e6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1228,19 +1228,19 @@ void shrink_dentry_list(struct list_head *list) dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); + d_shrink_del(dentry); + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + spin_unlock(&dentry->d_lock); + dentry_free(dentry); + continue; + } rcu_read_lock(); if (!lock_for_kill(dentry)) { - bool can_free; - rcu_read_unlock(); - d_shrink_del(dentry); - can_free = dentry->d_flags & DCACHE_DENTRY_KILLED; spin_unlock(&dentry->d_lock); - if (can_free) - dentry_free(dentry); - continue; + rcu_read_unlock(); + } else { + shrink_kill(dentry); } - d_shrink_del(dentry); - shrink_kill(dentry); } } EXPORT_SYMBOL(shrink_dentry_list); -- cgit v1.2.3 From 500590a897d9edadfffaed09f25f67ed7cd1813b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 03:06:39 -0400 Subject: fold lock_for_kill() into shrink_kill() Both callers have exact same shape. Signed-off-by: Al Viro --- fs/dcache.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index ffaaf48533e6..3325ca535bf6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1211,14 +1211,15 @@ EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { - do { + while (lock_for_kill(victim)) { rcu_read_unlock(); victim = __dentry_kill(victim); + if (!victim) + return; rcu_read_lock(); - } while (victim && lock_for_kill(victim)); + } + spin_unlock(&victim->d_lock); rcu_read_unlock(); - if (victim) - spin_unlock(&victim->d_lock); } void shrink_dentry_list(struct list_head *list) @@ -1235,12 +1236,7 @@ void shrink_dentry_list(struct list_head *list) continue; } rcu_read_lock(); - if (!lock_for_kill(dentry)) { - spin_unlock(&dentry->d_lock); - rcu_read_unlock(); - } else { - shrink_kill(dentry); - } + shrink_kill(dentry); } } EXPORT_SYMBOL(shrink_dentry_list); @@ -1688,12 +1684,7 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) wait_for_completion(&wait.completion); continue; } - if (!lock_for_kill(v)) { - spin_unlock(&v->d_lock); - rcu_read_unlock(); - } else { - shrink_kill(v); - } + shrink_kill(v); } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); -- cgit v1.2.3 From d60ac70f3f261529a52ac70b514a02651ebd5dc3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 03:14:19 -0400 Subject: fold lock_for_kill() and __dentry_kill() into common helper There are two callers of lock_for_kill() and both are followed by the same sequence of actions: * in case of failure, drop ->d_lock, do rcu_read_unlock() and go away * in case of success, do rcu_read_unlock() followed by passing dentry to __dentry_kill(); if the latter returns NULL, go away. All calls of __dentry_kill() are paired with lock_for_kill() now; let's turn that sequence into a new helper (dentry_kill()) and switch to using it. Signed-off-by: Al Viro --- fs/dcache.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 3325ca535bf6..1dfe37e57447 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -779,6 +779,17 @@ static bool lock_for_kill(struct dentry *dentry) return false; } +static struct dentry *dentry_kill(struct dentry *dentry) +{ + if (unlikely(!lock_for_kill(dentry))) { + spin_unlock(&dentry->d_lock); + rcu_read_unlock(); + return NULL; + } + rcu_read_unlock(); + return __dentry_kill(dentry); +} + /* * Decide if dentry is worth retaining. Usually this is called with dentry * locked; if not locked, we are more limited and might not be able to tell @@ -922,19 +933,13 @@ static void finish_dput(struct dentry *dentry) __releases(dentry->d_lock) __releases(RCU) { - while (lock_for_kill(dentry)) { - rcu_read_unlock(); - dentry = __dentry_kill(dentry); - if (!dentry) - return; + while ((dentry = dentry_kill(dentry)) != NULL) { if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } rcu_read_lock(); } - rcu_read_unlock(); - spin_unlock(&dentry->d_lock); } /* @@ -1211,15 +1216,8 @@ EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { - while (lock_for_kill(victim)) { - rcu_read_unlock(); - victim = __dentry_kill(victim); - if (!victim) - return; + while ((victim = dentry_kill(victim)) != NULL) rcu_read_lock(); - } - spin_unlock(&victim->d_lock); - rcu_read_unlock(); } void shrink_dentry_list(struct list_head *list) -- cgit v1.2.3 From 0954b4b558f7f582615b071b686f42a5a3ce2e53 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 03:24:28 -0400 Subject: simplify safety for lock_for_kill() slowpath rcu_read_lock() scopes in dentry eviction machinery are too wide and badly structured; we end up with too many of those, quite a few essentially identical. Worse, quite a few of the function involved are not neutral wrt that, making them harder to reason about. rcu_read_lock() scope is not the only thing establishing an RCU read-side critical area - spin_lock scope does the same and they can be mixed - the sequence rcu_read_lock() ... spin_lock() ... rcu_read_unlock() ... rcu_read_lock() ... spun_unlock() ... rcu_read_unlock() is an unbroken RCU read-side critical area. Use of that observation allows to simplify things. First of all, lock_for_kill() relies upon being in an unbroken RCU read-side critical area. It's always called with ->d_lock held, and normally returns without having ever dropped that spinlock. We would not need rcu_read_lock() at all, if not for the slow path - if trylock of inode->i_lock fails, we need to drop and retake ->d_lock. Having all calls of lock_for_kill() inside an rcu_read_lock() scope takes care of that, but to show that lock_for_kill() slow path is safe, we need to demonstrate such rcu_read_lock() scope for any call chain leading to lock_for_kill(). Which is not fun, seeing that there are 10 such scopes, with 5 distinct beginnings between them. Case 1: opens in dput() proceeds through fast_dput() grabbing ->d_lock, returning false into dput() and there a call of finish_dput() which calls dentry_kill(), which calls lock_for_kill(); ends in dentry_kill(), either right after lock_for_kill() success or right after dropping ->d_lock on lock_for_kill() failure. ->d_lock is held continuously all the way into lock_for_kill(). Case 2: opens in dentry_kill(), where we proceed to the same call of dentry_kill() as in case 1. ->d_lock is held since before the beginning of the scope and all the way into lock_for_kill(). Case 3: opens in select_collect2(), proceeds through the return to d_walk() and to shrink_dcache_tree() where we grab ->d_lock and proceed to call shrink_kill(), which calls dentry_kill(), then as in the previous scopes. Case 4: opens in shrink_dentry_list(), followed by call of shrink_kill(), then same as in case 3. ->d_lock is held since before the beginning of the scope and all the way into lock_for_kill(). Case 5: opens in shrink_kill(), where it's immediately followed by call of dentry_kill(), then same as in the previous scopes. ->d_lock is held since before the beginning of the scope all the way into lock_for_kill(). Note that in cases 2, 4 and 5 the slow path of lock_for_kill() is the only part of rcu_read_lock() scope that is not covered by spinlock scopes. In case 1 we have the area in fast_dput() as well and in case 3 - the return path from select_collect2() and chunk in shrink_dcache_tree() up to grabbing ->d_lock. Seeing that the reasons we need rcu_read_lock() in these additional areas are completely unrelated to lock_for_kill() slow path, the things get much more straightforward with * explicit rcu_read_lock() scope surrounding the area in slow path of lock_for_kill() where ->d_lock is not held * shrink_dentry_list() dropping rcu_read_lock() as soon as it has grabbed ->d_lock. * dput() dropping rcu_read_lock() just before calling finish_dput(). * rcu_read_lock() calls in finish_dput(), shrink_kill() and shrink_dentry_list() are removed, along with rcu_read_unlock() calls in dentry_kill(). RCU read-side critical areas are unchanged by that, safety of lock_for_kill() slow path is trivial to verify and a bunch of rcu_read_lock() scopes either gone or become easier to describe. Update the comments on locking conventions and memory safety considerations, including the NORCU case. Incidentally, all calls of fast_dput() are immediately preceded by rcu_read_lock() and followed by rcu_read_unlock() now, which will allow to simplify those on the next step... Signed-off-by: Al Viro --- fs/dcache.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 1dfe37e57447..9851d387349d 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -743,14 +743,29 @@ static struct dentry *__dentry_kill(struct dentry *dentry) } /* - * Lock a dentry for feeding it to __dentry_kill(). - * Called under rcu_read_lock() and dentry->d_lock; the former - * guarantees that nothing we access will be freed under us. - * Note that dentry is *not* protected from concurrent dentry_kill(), - * d_delete(), etc. - * - * Return false if dentry is busy. Otherwise, return true and have - * that dentry's inode locked. + * Prepare locking environment for killing a dentry. + * Called under dentry->d_lock. To proceed with eviction of a positive dentry + * we need to get ->i_lock of the inode of that dentry as well. + * However, ->i_lock nests outside of ->d_lock, so if trylock fails we might + * have to drop and regain the latter. Dentry state can change while its + * ->d_lock is not held - it might end up getting killed, becoming busy, + * negative, etc., so we need to be careful. + * + * For NORCU dentries memory safety relies upon having only one call of + * lock_for_kill() in the entire lifetime of dentry and dentry_free() being + * called only by the caller of lock_for_kill(). That this is NORCU-specific; + * the crucial part is that refcounts of NORCU dentries never grow once having + * dropped to zero. + * + * For normal dentries we can not assume that there won't be concurrent calls + * of dentry_free() - dentry might end up being evicted by another thread + * while we are dropping/retaking locks on the slow path. Memory safety is + * provided by keeping the RCU read-side critical area contiguous with + * an explicit rcu_read_lock() scope bridging over the break in spinlock scopes. + * + * Return false if dentry is busy (or busy dying, or already dead). Otherwise, + * return true and have that dentry's inode (if any) locked in addition to + * dentry itself. */ static bool lock_for_kill(struct dentry *dentry) @@ -763,15 +778,22 @@ static bool lock_for_kill(struct dentry *dentry) if (!inode || likely(spin_trylock(&inode->i_lock))) return true; + // Too bad - we need to drop ->d_lock and take locks in correct order. + // To avoid breaking RCU read-side critical area when we drop ->d_lock, + // take an explicit rcu_read_lock() while we are switching locks. + rcu_read_lock(); do { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); + // make sure we'd locked the right inode - ->d_inode might've + // changed while we were not holding ->d_lock if (likely(inode == dentry->d_inode)) break; spin_unlock(&inode->i_lock); inode = dentry->d_inode; } while (inode); + rcu_read_unlock(); if (likely(!dentry->d_lockref.count)) return true; if (inode) @@ -783,10 +805,8 @@ static struct dentry *dentry_kill(struct dentry *dentry) { if (unlikely(!lock_for_kill(dentry))) { spin_unlock(&dentry->d_lock); - rcu_read_unlock(); return NULL; } - rcu_read_unlock(); return __dentry_kill(dentry); } @@ -931,14 +951,12 @@ locked: static void finish_dput(struct dentry *dentry) __releases(dentry->d_lock) - __releases(RCU) { while ((dentry = dentry_kill(dentry)) != NULL) { if (retain_dentry(dentry, true)) { spin_unlock(&dentry->d_lock); return; } - rcu_read_lock(); } } @@ -978,6 +996,7 @@ void dput(struct dentry *dentry) rcu_read_unlock(); return; } + rcu_read_unlock(); finish_dput(dentry); } EXPORT_SYMBOL(dput); @@ -988,7 +1007,6 @@ void d_make_discardable(struct dentry *dentry) WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); dentry->d_flags &= ~DCACHE_PERSISTENT; dentry->d_lockref.count--; - rcu_read_lock(); finish_dput(dentry); } EXPORT_SYMBOL(d_make_discardable); @@ -1217,7 +1235,7 @@ EXPORT_SYMBOL(d_prune_aliases); static inline void shrink_kill(struct dentry *victim) { while ((victim = dentry_kill(victim)) != NULL) - rcu_read_lock(); + ; } void shrink_dentry_list(struct list_head *list) @@ -1233,7 +1251,6 @@ void shrink_dentry_list(struct list_head *list) dentry_free(dentry); continue; } - rcu_read_lock(); shrink_kill(dentry); } } @@ -1669,6 +1686,7 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) struct dentry *v = data.victim; spin_lock(&v->d_lock); + rcu_read_unlock(); if (v->d_lockref.count < 0 && !(v->d_flags & DCACHE_DENTRY_KILLED)) { struct completion_list wait; @@ -1676,7 +1694,6 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) // it becomes invisible to d_walk(). d_add_waiter(v, &wait); spin_unlock(&v->d_lock); - rcu_read_unlock(); if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); wait_for_completion(&wait.completion); -- cgit v1.2.3 From b4143e0e71cd660d05de8d2861e980e7c840eb39 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 03:56:42 -0400 Subject: Shift rcu_read_{,un}lock() inside fast_dput() Shrink rcu_read_lock() scopes surrounding fast_dput() calls. Both callers are immediately preceded and followed by rcu_read_lock()/rcu_read_unlock() resp. Shrink that down into fast_dput() itself; in case when fast_dput() ends up grabbing ->d_lock, we can pull rcu_read_unlock() up to right after spin_lock(). Signed-off-by: Al Viro --- fs/dcache.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9851d387349d..ae9b7151e6a4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -885,17 +885,17 @@ EXPORT_SYMBOL(d_mark_dontcache); * If unsuccessful, we return false, having already taken the dentry lock. * In that case refcount is guaranteed to be zero and we have already * decided that it's not worth keeping around. - * - * The caller needs to hold the RCU read lock, so that the dentry is - * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; /* - * try to decrement the lockref optimistically. + * Try to decrement the lockref optimistically. + * RCU read lock held so that dentry is guaranteed to stay around + * even if the refcount goes down to zero. */ + rcu_read_lock(); ret = lockref_put_return(&dentry->d_lockref); /* @@ -905,6 +905,7 @@ static inline bool fast_dput(struct dentry *dentry) */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); + rcu_read_unlock(); if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) { spin_unlock(&dentry->d_lock); return true; @@ -916,8 +917,10 @@ static inline bool fast_dput(struct dentry *dentry) /* * If we weren't the last ref, we're done. */ - if (ret) + if (ret) { + rcu_read_unlock(); return true; + } /* * Can we decide that decrement of refcount is all we needed without @@ -925,8 +928,10 @@ static inline bool fast_dput(struct dentry *dentry) * dentry looks like it ought to be retained and there's nothing else * to do. */ - if (retain_dentry(dentry, false)) + if (retain_dentry(dentry, false)) { + rcu_read_unlock(); return true; + } /* * Either not worth retaining or we can't tell without the lock. @@ -934,6 +939,7 @@ static inline bool fast_dput(struct dentry *dentry) * but we'll need to re-check the situation after getting the lock. */ spin_lock(&dentry->d_lock); + rcu_read_unlock(); /* * Did somebody else grab a reference to it in the meantime, and @@ -991,12 +997,8 @@ void dput(struct dentry *dentry) if (!dentry) return; might_sleep(); - rcu_read_lock(); - if (likely(fast_dput(dentry))) { - rcu_read_unlock(); + if (likely(fast_dput(dentry))) return; - } - rcu_read_unlock(); finish_dput(dentry); } EXPORT_SYMBOL(dput); @@ -1044,12 +1046,8 @@ EXPORT_SYMBOL(__move_to_shrink_list); void dput_to_list(struct dentry *dentry, struct list_head *list) { - rcu_read_lock(); - if (likely(fast_dput(dentry))) { - rcu_read_unlock(); + if (likely(fast_dput(dentry))) return; - } - rcu_read_unlock(); __move_to_shrink_list(dentry, list); spin_unlock(&dentry->d_lock); } -- cgit v1.2.3 From a5beeb64f22662a53facd71ca2843f9d649597d6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 04:01:28 -0400 Subject: Document rcu_read_lock() use in select_collect2() If select_collect2() finds something that is neither busy nor can be moved to shrink list, it needs to return that to caller's caller (shrink_dcache_tree()) ASAP and do so without grabbing references (among other things, it might be already dying, in which case refcount can't be incremented). We are called inside a ->d_lock scope, but that scope is going to be terminated as soon as we return to caller (d_walk()); ->d_lock will be retaken by shrink_dcache_tree(), but we need to bridge between these scopes, turning them into contiguous RCU read-side critical area. We do that with rcu_read_lock() scope - it spans from unbalanced rcu_read_lock() in select_collect2() to unbalanced rcu_read_unlock() in shrink_dcache_tree(). That works, but it really needs to be documented; it's rather unidiomatic and it had caused quite a bit of confusion - some of it in form of patches "fixing" the damn thing. Signed-off-by: Al Viro --- fs/dcache.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index ae9b7151e6a4..a65cb6451e63 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1638,6 +1638,15 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) if (dentry->d_lockref.count <= 0) { if (!__move_to_shrink_list(dentry, &data->dispose)) { + /* + * We need an enter RCU read-side critical area that + * would extend past the return from d_walk() and + * we are in the scope of ->d_lock that will terminate + * before that, so we use rcu_read_lock() to bridge + * over to the scope of ->d_lock in d_walk() caller. + * The scope of rcu_read_lock() spans from here to + * paired rcu_read_unlock() in shrink_dcache_tree(). + */ rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; @@ -1682,9 +1691,20 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) d_walk(parent, &data, select_collect2); if (data.victim) { struct dentry *v = data.victim; - + /* + * select_collect2() has picked a dentry that was + * either dying or on a shrink list and arranged + * for it to be returned to us. We are still in + * the RCU read-side critical area started there + * (rcu_read_lock() scope opened in select_collect2()), + * so dentry couldn't have been freed yet, but its + * state might've changed since we dropped ->d_lock + * on the way out. Switch over to ->d_lock scope + * and recheck the dentry state. + */ spin_lock(&v->d_lock); rcu_read_unlock(); + if (v->d_lockref.count < 0 && !(v->d_flags & DCACHE_DENTRY_KILLED)) { struct completion_list wait; -- cgit v1.2.3 From d197a9a6d44f23944d9096c5d955a7b1f75b0c89 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 04:17:02 -0400 Subject: adjust calling conventions of lock_for_kill(), fold __dentry_kill() into dentry_kill() Pull dropping ->d_lock on lock_for_kill() failure into lock_for_kill() itself. That reduces dentry_kill() to if (!lock_for_kill(dentry)) return NULL; return __dentry_kill(dentry); at which point it's easier to move that if (...) into the beginning of __dentry_kill() itself and rename it into dentry_kill(). Document the new calling conventions of lock_for_kill(). Signed-off-by: Al Viro --- fs/dcache.c | 112 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 54 insertions(+), 58 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index a65cb6451e63..3887d8b5f81a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -691,57 +691,6 @@ static inline void dentry_unlist(struct dentry *dentry) } } -static struct dentry *__dentry_kill(struct dentry *dentry) -{ - struct dentry *parent = NULL; - bool can_free = true; - - /* - * The dentry is now unrecoverably dead to the world. - */ - lockref_mark_dead(&dentry->d_lockref); - - /* - * inform the fs via d_prune that this dentry is about to be - * unhashed and destroyed. - */ - if (dentry->d_flags & DCACHE_OP_PRUNE) - dentry->d_op->d_prune(dentry); - - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) - d_lru_del(dentry); - } - /* if it was on the hash then remove it */ - __d_drop(dentry); - if (dentry->d_inode) - dentry_unlink_inode(dentry); - else - spin_unlock(&dentry->d_lock); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - - cond_resched(); - /* now that it's negative, ->d_parent is stable */ - if (!IS_ROOT(dentry)) { - parent = dentry->d_parent; - spin_lock(&parent->d_lock); - } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry_unlist(dentry); - if (dentry->d_flags & DCACHE_SHRINK_LIST) - can_free = false; - spin_unlock(&dentry->d_lock); - if (likely(can_free)) - dentry_free(dentry); - if (parent && --parent->d_lockref.count) { - spin_unlock(&parent->d_lock); - return NULL; - } - return parent; -} - /* * Prepare locking environment for killing a dentry. * Called under dentry->d_lock. To proceed with eviction of a positive dentry @@ -763,17 +712,18 @@ static struct dentry *__dentry_kill(struct dentry *dentry) * provided by keeping the RCU read-side critical area contiguous with * an explicit rcu_read_lock() scope bridging over the break in spinlock scopes. * - * Return false if dentry is busy (or busy dying, or already dead). Otherwise, - * return true and have that dentry's inode (if any) locked in addition to - * dentry itself. + * If dentry is busy (or busy dying, or already dead), unlock dentry + * and return false. Otherwise, return true and have that dentry's + * inode (if any) locked in addition to dentry itself. */ - static bool lock_for_kill(struct dentry *dentry) { struct inode *inode = dentry->d_inode; - if (unlikely(dentry->d_lockref.count)) + if (unlikely(dentry->d_lockref.count)) { + spin_unlock(&dentry->d_lock); return false; + } if (!inode || likely(spin_trylock(&inode->i_lock))) return true; @@ -798,16 +748,62 @@ static bool lock_for_kill(struct dentry *dentry) return true; if (inode) spin_unlock(&inode->i_lock); + spin_unlock(&dentry->d_lock); return false; } static struct dentry *dentry_kill(struct dentry *dentry) { - if (unlikely(!lock_for_kill(dentry))) { + struct dentry *parent = NULL; + bool can_free = true; + + if (unlikely(!lock_for_kill(dentry))) + return NULL; + + /* + * The dentry is now unrecoverably dead to the world. + */ + lockref_mark_dead(&dentry->d_lockref); + + /* + * inform the fs via d_prune that this dentry is about to be + * unhashed and destroyed. + */ + if (dentry->d_flags & DCACHE_OP_PRUNE) + dentry->d_op->d_prune(dentry); + + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + } + /* if it was on the hash then remove it */ + __d_drop(dentry); + if (dentry->d_inode) + dentry_unlink_inode(dentry); + else spin_unlock(&dentry->d_lock); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + cond_resched(); + /* now that it's negative, ->d_parent is stable */ + if (!IS_ROOT(dentry)) { + parent = dentry->d_parent; + spin_lock(&parent->d_lock); + } + spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + dentry_unlist(dentry); + if (dentry->d_flags & DCACHE_SHRINK_LIST) + can_free = false; + spin_unlock(&dentry->d_lock); + if (likely(can_free)) + dentry_free(dentry); + if (parent && --parent->d_lockref.count) { + spin_unlock(&parent->d_lock); return NULL; } - return __dentry_kill(dentry); + return parent; } /* -- cgit v1.2.3 From 4af0cdb5765ad15f4175bc94ca20b8deb2c7ebde Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 11 Apr 2026 16:17:19 -0400 Subject: document dentry_kill() Signed-off-by: Al Viro --- fs/dcache.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 3887d8b5f81a..cf4bd6c37d04 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -752,6 +752,44 @@ static bool lock_for_kill(struct dentry *dentry) return false; } +/** + * dentry_kill - evict a dentry + * @dentry: dentry to be evicted + * + * All dentry evictions are done by this function. The reference we are + * passed does not contribute to the refcount; the caller had either + * already decremented the refcount or it had never held one in the + * first place. @dentry->d_lock is held by the caller and dropped + * by dentry_kill(@dentry). + * + * We are guaranteed that nobody had called dentry_free(@dentry) + * prior to the beginning of RCU read-side critical area we are in. + * + * Caller must not access @dentry after the call. + * + * If eviction of @dentry drops the last reference to its parent, + * the reference to parent is returned to caller. In that case + * it is guaranteed to satisfy the requirements for dentry_kill() + * argument - its ->d_lock is held and we are guaranteed that nobody + * had passed it to dentry_free() prior to acquisition of its ->d_lock. + * Otherwise %NULL is returned. + * + * If @dentry is idle and remains such after we assemble the full + * locking environment for eviction (see lock_for_kill() for details) + * we mark it doomed (->d_lockref.count < 0) and proceed to detaching + * it from any filesystem objects. Otherwise we drop ->d_lock and + * return %NULL. + * + * Once @dentry is detached from the filesystem objects, we complete + * detaching it from dentry tree. The parent, if any, gets locked + * and its refcount is decremented; dentry is carefully removed from + * the tree (see dentry_unlist() for details) and marked killed + * (%DCACHE_DENTRY_KILLED set in ->d_flags). At that point it's just + * an inert chunk of memory, accessible only via RCU references + * and possibly via a shrink list. If it is not on any shrink lists, + * we call dentry_free(), which schedules actual freeing of memory. + * Othewise freeing is left to the owner of the shrink list in question. + */ static struct dentry *dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; -- cgit v1.2.3 From 3ef89feb387f2da13df290ce3e38fe8284160a86 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 21 Apr 2026 15:52:13 -0400 Subject: d_walk(): shrink rcu_read_lock() scope we only need it to bridge over from ->d_lock scope of child to ->d_lock scope of parent; dropping ->d_lock at rename_retry doesn't need to be in rcu_read_lock() scope. Signed-off-by: Al Viro --- fs/dcache.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index cf4bd6c37d04..94749442fa68 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1491,14 +1491,15 @@ resume: /* * All done at this level ... ascend and resume the search. */ - rcu_read_lock(); ascend: if (this_parent != parent) { dentry = this_parent; this_parent = dentry->d_parent; + rcu_read_lock(); spin_unlock(&dentry->d_lock); spin_lock(&this_parent->d_lock); + rcu_read_unlock(); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) @@ -1506,7 +1507,6 @@ ascend: /* go into the first sibling still alive */ hlist_for_each_entry_continue(dentry, d_sib) { if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) { - rcu_read_unlock(); goto resume; } } @@ -1514,7 +1514,6 @@ ascend: } if (need_seqretry(&rename_lock, seq)) goto rename_retry; - rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); @@ -1523,7 +1522,6 @@ out_unlock: rename_retry: spin_unlock(&this_parent->d_lock); - rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; -- cgit v1.2.3 From 92cba84470632f3b75487171ca86b351a212c3f4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 23 Apr 2026 19:29:18 +0100 Subject: shrinking rcu_read_lock() scope in d_alloc_parallel() The current use of rcu_read_lock() uses in d_alloc_parallel() is fairly opaque - the single large scope serves two purposes. We start with lookup in normal hash, and there rcu_read_lock() scope puts __d_lookup_rcu() and subsequent lockref_get_not_dead() into the same RCU read-side critical area. If no match is found, we proceed to lock the hash chain of in-lookup hash and scan that for a match. If we find a match, we want to grab it and wait for lookup in progress to finish. Since the bitlock we use for these hash chains has to nest inside ->d_lock, we need to unlock the chain first and use lockref_get_not_dead() on the match. That has to be done without breaking the RCU read-side critical area, and we use the same rcu_read_lock() scope to bridge over. The thing is, after having grabbed the reference (and it is very unlikely to fail) we proceed to grab ->d_lock - d_wait_lookup() and __d_lookup_unhash()/__d_wake_in_lookup_waiters() are using that for serialization. That makes lockref_get_not_dead() pointless - trying to avoid grabbing ->d_lock for refcount increment, only to grab it anyway immediately after that. If we grab ->d_lock first and replace lockref_get_not_dead() with direct check for sign and increment if non-negative we can move rcu_read_unlock() to immediately after grabbing ->d_lock. Moreover, we don't need the RCU read-side critical area to be contiguous since before earlier __d_lookup_rcu() - we can just as well terminate the earlier one ASAP and call rcu_read_lock() again only after having found a match (if any) in the in-lookup hash chain. That makes the entire thing easier to follow and the purpose of those rcu_read_lock() calls easier to describe - the first scope is for __d_lookup_rcu() + lockref_get_not_dead(), the second one bridges over from the bitlock scope to the ->d_lock scope on the match found in in-lookup hash. Signed-off-by: Al Viro --- fs/dcache.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 94749442fa68..beb0523c3304 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2743,38 +2743,33 @@ struct dentry *d_alloc_parallel(struct dentry *parent, spin_unlock(&parent->d_lock); retry: - rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); + rcu_read_lock(); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } + rcu_read_unlock(); if (read_seqcount_retry(&dentry->d_seq, d_seq)) { - rcu_read_unlock(); dput(dentry); goto retry; } - rcu_read_unlock(); dput(new); return dentry; } - if (unlikely(read_seqretry(&rename_lock, r_seq))) { - rcu_read_unlock(); + rcu_read_unlock(); + if (unlikely(read_seqretry(&rename_lock, r_seq))) goto retry; - } - if (unlikely(seq & 1)) { - rcu_read_unlock(); + if (unlikely(seq & 1)) goto retry; - } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); - rcu_read_unlock(); goto retry; } /* @@ -2791,19 +2786,20 @@ retry: continue; if (!d_same_name(dentry, parent, name)) continue; + rcu_read_lock(); hlist_bl_unlock(b); + spin_lock(&dentry->d_lock); + rcu_read_unlock(); /* now we can try to grab a reference */ - if (!lockref_get_not_dead(&dentry->d_lockref)) { - rcu_read_unlock(); + if (unlikely(dentry->d_lockref.count < 0)) { + spin_unlock(&dentry->d_lock); goto retry; } - - rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; - * wait for them to finish + * pin it and wait for them to finish */ - spin_lock(&dentry->d_lock); + dget_dlock(dentry); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat @@ -2824,7 +2820,6 @@ retry: dput(new); return dentry; } - rcu_read_unlock(); hlist_bl_add_head(&new->d_in_lookup_hash, b); hlist_bl_unlock(b); return new; -- cgit v1.2.3 From 58b366c2f5de1d5c5687f6ae049ca4639ce05eb4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 16 Apr 2026 11:50:41 -0400 Subject: shrink_dentry_tree(): unify the calls of shrink_dentry_list() Signed-off-by: Al Viro --- fs/dcache.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index beb0523c3304..1f4435b7f9ce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -630,12 +630,14 @@ struct completion_list { * Use ->waiters for a single-linked list of struct completion_list of * waiters. */ -static inline void d_add_waiter(struct dentry *dentry, struct completion_list *p) +static inline bool d_add_waiter(struct dentry *dentry, struct completion_list *p) { - struct completion_list *v = dentry->waiters; + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) + return false; init_completion(&p->completion); - p->next = v; + p->next = dentry->waiters; dentry->waiters = p; + return true; } static inline void d_complete_waiters(struct dentry *dentry) @@ -1705,7 +1707,9 @@ out: static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { - struct select_data data = {.start = parent}; + struct completion_list wait; + bool need_wait = false; + struct select_data data = { .start = parent }; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, @@ -1737,22 +1741,18 @@ static void shrink_dcache_tree(struct dentry *parent, bool for_umount) spin_lock(&v->d_lock); rcu_read_unlock(); - if (v->d_lockref.count < 0 && - !(v->d_flags & DCACHE_DENTRY_KILLED)) { - struct completion_list wait; - // It's busy dying; have it notify us once - // it becomes invisible to d_walk(). - d_add_waiter(v, &wait); + if (unlikely(v->d_lockref.count < 0)) { + // It's doomed; if it isn't dead yet, notify us + // once it becomes invisible to d_walk(). + need_wait = d_add_waiter(v, &wait); spin_unlock(&v->d_lock); - if (!list_empty(&data.dispose)) - shrink_dentry_list(&data.dispose); - wait_for_completion(&wait.completion); - continue; + } else { + shrink_kill(v); } - shrink_kill(v); } - if (!list_empty(&data.dispose)) - shrink_dentry_list(&data.dispose); + shrink_dentry_list(&data.dispose); + if (unlikely(need_wait)) + wait_for_completion(&wait.completion); } } -- cgit v1.2.3 From e9895609cb7f9d9712c5b20044edff5f196eec1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 18 Apr 2026 18:39:03 -0400 Subject: wind ->s_roots via ->d_sib instead of ->d_hash shrink_dcache_for_umount() is supposed to handle the possibility of some of the dentries to be evicted being in other threads shrink lists; it either kills them, leaving an empty husk to be freed by the owner of shrink list whenever it gets around to that, or it waits for the eviction in progress to get completed. That relies upon dentry remaining attached to the tree until the eviction reaches dentry_unlist() and its ->d_sib gets removed from the list. Unfortunately, the secondary roots are linked via ->d_hash, rather than ->d_sib and they become removed from that list before their inode references are dropped. If shrink_dentry_list() from another thread ends up evicting one of the secondary roots and gets to that point in dentry_kill() when shrink_dcache_for_umount() is looking for secondary roots, the latter will *not* notice anything, possibly leading to warnings about busy inodes at umount time and all kinds of breakage after that. Moreover, shrink_dcache_for_umount() walks the list of secondary roots with no protection whatsoever, so it might end up calling dget() on a dentry that already passed through lockref_mark_dead(&dentry->d_lockref); ending up with corrupted refcount and possible UAF. AFAICS, the most straightforward way to deal with that would be to have secondary roots linked via ->d_sib rather than ->d_hash; then they would remain on the list until killed, and we could use d_add_waiter() machinery to wait for eviction in progress. Changes: * secondary roots look the same as ->s_root from d_unhashed() and d_unlinked() POV now. * secondary roots are represented as "no parent, but on ->d_sib" instead of "no parent, but on ->d_hash". * since ->d_sib is a plain hlist, we protect it with per-superblock spinlock (sb->s_roots_lock) instead of the LSB of the head pointer (for non-root dentries it would be protected by ->d_lock of parent). * __d_obtain_alias() uses ->d_sib for linkage when allocating a secondary root. * d_splice_alias_ops() detects splicing of a secondary root and removes it from the list before calling __d_move(). * dentry_unlist() detects eviction of a secondary root and removes it from the list; no need to play the games for d_walk() sake, since the latter is not going to look for the next sibling of those anyway. * ___d_drop() doesn't care about ->s_roots anymore. * shrink_dcache_for_umount() uses proper locking for access to the list of secondary roots and if it runs into one that is in the middle of eviction waits for that to finish. Signed-off-by: Al Viro --- fs/dcache.c | 65 ++++++++++++++++++++++++++++++------------ fs/super.c | 1 + include/linux/fs/super_types.h | 3 +- 3 files changed, 50 insertions(+), 19 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 1f4435b7f9ce..257eefc46f5e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -43,8 +43,8 @@ * - i_dentry, d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table - * s_roots bl list spinlock protects: - * - the s_roots list (see __d_drop) + * s_roots_lock protects: + * - the s_roots list (see __d_move()/dentry_unlist()/d_obtain_root()) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: @@ -562,16 +562,7 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, static void ___d_drop(struct dentry *dentry) { - struct hlist_bl_head *b; - /* - * Hashed dentries are normally on the dentry hashtable, - * with the exception of those newly allocated by - * d_obtain_root, which are always IS_ROOT: - */ - if (unlikely(IS_ROOT(dentry))) - b = &dentry->d_sb->s_roots; - else - b = d_hash(dentry->d_name.hash); + struct hlist_bl_head *b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); @@ -654,6 +645,13 @@ static inline void d_complete_waiters(struct dentry *dentry) } } +static void unlink_secondary_root(struct dentry *dentry) +{ + spin_lock(&dentry->d_sb->s_roots_lock); + hlist_del_init(&dentry->d_sib); + spin_unlock(&dentry->d_sb->s_roots_lock); +} + static inline void dentry_unlist(struct dentry *dentry) { struct dentry *next; @@ -665,6 +663,10 @@ static inline void dentry_unlist(struct dentry *dentry) d_complete_waiters(dentry); if (unlikely(hlist_unhashed(&dentry->d_sib))) return; + if (unlikely(IS_ROOT(dentry))) { + unlink_secondary_root(dentry); // secondary root goes away + return; + } __hlist_del(&dentry->d_sib); /* * Cursors can move around the list of children. While we'd been @@ -1805,9 +1807,30 @@ void shrink_dcache_for_umount(struct super_block *sb) sb->s_root = NULL; do_one_tree(dentry); - while (!hlist_bl_empty(&sb->s_roots)) { - dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); - do_one_tree(dentry); + for (;;) { + spin_lock(&sb->s_roots_lock); + dentry = hlist_entry_safe(sb->s_roots.first, + struct dentry, d_sib); + if (!dentry) { + spin_unlock(&sb->s_roots_lock); + break; + } + rcu_read_lock(); + spin_unlock(&sb->s_roots_lock); + spin_lock(&dentry->d_lock); + rcu_read_unlock(); + if (unlikely(dentry->d_lockref.count < 0)) { + struct completion_list wait; + bool need_wait = d_add_waiter(dentry, &wait); + + spin_unlock(&dentry->d_lock); + if (need_wait) + wait_for_completion(&wait.completion); + } else { + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); + do_one_tree(dentry); + } } } @@ -2224,9 +2247,9 @@ static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) __d_set_inode_and_type(new, inode, add_flags); hlist_add_head(&new->d_alias, &inode->i_dentry); if (!disconnected) { - hlist_bl_lock(&sb->s_roots); - hlist_bl_add_head(&new->d_hash, &sb->s_roots); - hlist_bl_unlock(&sb->s_roots); + spin_lock(&sb->s_roots_lock); + hlist_add_head(&new->d_sib, &sb->s_roots); + spin_unlock(&sb->s_roots_lock); } spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); @@ -3238,6 +3261,12 @@ struct dentry *d_splice_alias_ops(struct inode *inode, struct dentry *dentry, } dput(old_parent); } else { + if (unlikely(!hlist_unhashed(&new->d_sib))) { + // secondary root getting spliced + spin_lock(&new->d_lock); + unlink_secondary_root(new); + spin_unlock(&new->d_lock); + } __d_move(new, dentry, false); write_sequnlock(&rename_lock); } diff --git a/fs/super.c b/fs/super.c index 378e81efe643..fb44ebadda82 100644 --- a/fs/super.c +++ b/fs/super.c @@ -359,6 +359,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); + spin_lock_init(&s->s_roots_lock); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..23d1c2612d0c 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -162,7 +162,8 @@ struct super_block { struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif - struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ + struct hlist_head s_roots; /* alternate root dentries for NFS */ + spinlock_t s_roots_lock; struct mount *s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; /* can go away once we use an accessor for @s_bdev_file */ struct file *s_bdev_file; -- cgit v1.2.3 From 144bef84e1cfadfe100c6728593ea9838853f347 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 15 Apr 2026 19:29:53 -0400 Subject: nfs: get rid of fake root dentries ... just grab the reference to the (real) root we are about to return for the first mount of this superblock and be done with that. Once upon a time dentry tree eviction at fs shutdown used to break if ->s_root had been spliced on top of something; that hadn't been the case for years now, and these fake root dentries violate a bunch of invariants. Let's get rid of them... Signed-off-by: Al Viro --- fs/dcache.c | 8 ++------ fs/nfs/getroot.c | 35 ++--------------------------------- 2 files changed, 4 insertions(+), 39 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 257eefc46f5e..e7487c92c21a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -455,14 +455,10 @@ static void dentry_unlink_inode(struct dentry * dentry) raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); - hlist_del_init(&dentry->d_alias); + __hlist_del(&dentry->d_alias); /* * dentry becomes negative, so the space occupied by ->d_alias - * belongs to ->waiters now; we could use __hlist_del() instead - * of hlist_del_init(), if not for the stunt pulled by nfs - * dummy root dentries - positive dentry *not* included into - * the alias list of its inode. Open-coding hlist_del_init() - * and removing zeroing would be too clumsy... + * belongs to ->waiters now. */ dentry->waiters = NULL; raw_write_seqcount_end(&dentry->d_seq); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index eef0736beb67..ff7424bc4bec 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -32,35 +32,6 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT -/* - * Set the superblock root dentry. - * Note that this function frees the inode in case of error. - */ -static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *inode) -{ - /* The mntroot acts as the dummy root dentry for this superblock */ - if (sb->s_root == NULL) { - sb->s_root = d_make_root(inode); - if (sb->s_root == NULL) - return -ENOMEM; - ihold(inode); - /* - * Ensure that this dentry is invisible to d_find_alias(). - * Otherwise, it may be spliced into the tree by - * d_splice_alias if a parent directory from the same - * filesystem gets mounted at a later time. - * This again causes shrink_dcache_for_umount_subtree() to - * Oops, since the test for IS_ROOT() will fail. - */ - spin_lock(&d_inode(sb->s_root)->i_lock); - spin_lock(&sb->s_root->d_lock); - hlist_del_init(&sb->s_root->d_alias); - spin_unlock(&sb->s_root->d_lock); - spin_unlock(&d_inode(sb->s_root)->i_lock); - } - return 0; -} - /* * get a root dentry from the root filehandle */ @@ -99,10 +70,6 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) goto out_fattr; } - error = nfs_superblock_set_dummy_root(s, inode); - if (error != 0) - goto out_fattr; - /* root dentries normally start off anonymous and get spliced in later * if the dentry tree reaches them; however if the dentry already * exists, we'll pick it up at this point and use it as the root @@ -123,6 +90,8 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) name = NULL; } spin_unlock(&root->d_lock); + if (!s->s_root) + s->s_root = dget(root); fc->root = root; if (server->caps & NFS_CAP_SECURITY_LABEL) kflags |= SECURITY_LSM_NATIVE_LABELS; -- cgit v1.2.3 From 3df5153c5f123d6018c82a24341ccd99c79d64a0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 May 2026 00:20:19 -0400 Subject: make cursors NORCU All it requires is making sure that d_walk() will skip *all* CURSOR dentries, even if somebody passes it one as an argument. Cursors are negative and unhashed all along, never get added to LRU or to shrink lists and no RCU references via ->d_sib are possible for those - dentry_unlist() makes sure that no killed dentry has ->d_sib.next left pointing to a cursor. Seeing that a cursor is allocated every time we open a directory on autofs, debugfs, devpts, etc., avoiding an RCU delay when such opened files get closed is attractive... Signed-off-by: Al Viro --- fs/dcache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index e7487c92c21a..1f50100bce49 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1443,6 +1443,8 @@ again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); + if (unlikely(this_parent->d_flags & DCACHE_DENTRY_CURSOR)) + goto out_unlock; ret = enter(data, this_parent); switch (ret) { @@ -1996,7 +1998,7 @@ struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { - dentry->d_flags |= DCACHE_DENTRY_CURSOR; + dentry->d_flags |= DCACHE_DENTRY_CURSOR | DCACHE_NORCU; dentry->d_parent = dget(parent); } return dentry; -- cgit v1.2.3