summary | refs | log | tree | commit | diff
path: root/fs
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 03:30:45 +0530
committer Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 03:30:45 +0530
commit c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b (patch)
tree 2cb320b5bc6f1c97da837e8cb43352a72e789267 /fs
parent 0793d39ec8bab2b2255e3a288894c39e88ce5a75 (diff)
parent 0275dc184aa007b260374af6d46fb15741c062a8 (diff)
download lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.tar.gz
download lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.zip
Merge tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs writeback updates from Christian Brauner:

 - Fix a race between cgroup_writeback_umount() and inode_switch_wbs()

   When a container exits, a race between cgroup_writeback_umount() and
   inode_switch_wbs()/cleanup_offline_cgwb() can trigger "VFS: Busy inodes
   after unmount" followed by a use-after-free on percpu counters.

   There is a window between inode_prepare_wbs_switch() returning true
   (having passed the SB_ACTIVE check and grabbed the inode) and the
   subsequent wb_queue_isw() call: if cgroup_writeback_umount() observes
   the global isw_nr_in_flight counter as non-zero but flush_workqueue()
   finds nothing queued yet, it returns early - leaving a held inode
   reference that blocks evict_inodes() and a later iput() that hits freed
   percpu counters.

   The race is closed by covering the window from
   inode_prepare_wbs_switch() through wb_queue_isw() with an RCU read-side
   critical section and synchronizing in the umount path. On top of that
   the now-dead rcu_barrier() left over from the queue_rcu_work() era is
   removed, and the global synchronize_rcu()/flush_workqueue() pair is
   replaced with a per-sb in-flight counter plus pin/unpin/drain helpers so
   umount no longer serializes against switch activity on unrelated
   superblocks.

   Under cgroup writeback churn on a 16 vCPU guest this takes umount
   latency from ~92-138ms p50 down to ~5-8ms p50 and the cumulative cost of
   cgroup_writeback_umount() from ~62ms to ~4us per call.

   The initial race fix is kept separate and minimal so it backports
   cleanly to stable trees that still queue switches via queue_rcu_work().

 - Improve write performance with RWF_DONTCACHE

   Dirty DONTCACHE pages are now tracked per bdi_writeback so that the
   writeback flusher can be kicked in a targeted fashion for IOCB_DONTCACHE
   writes instead of relying on global writeback, and the PG_dropbehind
   flag is preserved when a folio is split.
* tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
  mm: track DONTCACHE dirty pages per bdi_writeback
  mm: preserve PG_dropbehind flag during folio split
  writeback: use a per-sb counter to drain inode wb switches at umount
  writeback: drop now-unnecessary rcu_barrier() in cgroup_writeback_umount()
  writeback: fix race between cgroup_writeback_umount() and inode_switch_wbs()
Diffstat (limited to 'fs')
-rw-r--r-- fs/fs-writeback.c 138
1 files changed, 112 insertions, 26 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..fdb8766d275a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode,
long nr = folio_nr_pages(folio);
wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
+ if (folio_test_dropbehind(folio)) {
+ wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr);
+ wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr);
+ }
}
}
@@ -497,6 +501,23 @@ skip_switch:
return switched;
}
+static inline void cgroup_writeback_pin(struct super_block *sb)
+{
+ atomic_inc(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_unpin(struct super_block *sb)
+{
+ if (atomic_dec_and_test(&sb->s_isw_nr_in_flight))
+ wake_up_var(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_drain(struct super_block *sb)
+{
+ wait_var_event(&sb->s_isw_nr_in_flight,
+ !atomic_read(&sb->s_isw_nr_in_flight));
+}
+
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
struct inode_switch_wbs_context *isw)
{
@@ -554,8 +575,12 @@ relock:
wb_put_many(old_wb, nr_switched);
}
- for (inodep = isw->inodes; *inodep; inodep++)
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ struct super_block *sb = (*inodep)->i_sb;
+
iput(*inodep);
+ cgroup_writeback_unpin(sb);
+ }
wb_put(new_wb);
kfree(isw);
atomic_dec(&isw_nr_in_flight);
@@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
+ /* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return false;
+
/*
- * Paired with smp_mb() in cgroup_writeback_umount().
- * isw_nr_in_flight must be increased before checking SB_ACTIVE and
- * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
- * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+ * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either
+ * sees a non-zero counter and waits, or we see SB_ACTIVE clear below.
*/
+ cgroup_writeback_pin(inode->i_sb);
smp_mb();
if (IS_DAX(inode))
- return false;
+ goto out_unpin;
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
@@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
inode_to_wb(inode) == new_wb) {
spin_unlock(&inode->i_lock);
- return false;
+ goto out_unpin;
}
inode_state_set(inode, I_WB_SWITCH);
__iget(inode);
spin_unlock(&inode->i_lock);
return true;
+
+out_unpin:
+ cgroup_writeback_unpin(inode->i_sb);
+ return false;
}
static void wb_queue_isw(struct bdi_writeback *wb,
@@ -1198,36 +1230,27 @@ out_bdi_put:
}
/**
- * cgroup_writeback_umount - flush inode wb switches for umount
+ * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb
* @sb: target super_block
*
- * This function is called when a super_block is about to be destroyed and
- * flushes in-flight inode wb switches. An inode wb switch goes through
- * RCU and then workqueue, so the two need to be flushed in order to ensure
- * that all previously scheduled switches are finished. As wb switches are
- * rare occurrences and synchronize_rcu() can take a while, perform
- * flushing iff wb switches are in flight.
+ * Wait until every inode wb switch that already passed the SB_ACTIVE
+ * check on this superblock has been completed by the worker. Since
+ * SB_ACTIVE is cleared before this is called, no new switches can start
+ * for @sb, so s_isw_nr_in_flight will monotonically drop to zero.
*/
void cgroup_writeback_umount(struct super_block *sb)
{
-
if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
return;
/*
- * SB_ACTIVE should be reliably cleared before checking
- * isw_nr_in_flight, see generic_shutdown_super().
+ * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe
+ * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear
+ * (cleared by generic_shutdown_super()) and bails before grabbing the
+ * inode.
*/
smp_mb();
-
- if (atomic_read(&isw_nr_in_flight)) {
- /*
- * Use rcu_barrier() to wait for all pending callbacks to
- * ensure that all in-flight wb switches are in the workqueue.
- */
- rcu_barrier();
- flush_workqueue(isw_wq);
- }
+ cgroup_writeback_drain(sb);
}
static int __init cgroup_writeback_init(void)
@@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb)
return nr_pages;
}
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+ long nr_pages;
+
+ if (!test_and_clear_bit(WB_start_dontcache, &wb->state))
+ return 0;
+
+ nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY);
+ if (nr_pages) {
+ struct wb_writeback_work work = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ .reason = WB_REASON_DONTCACHE,
+ };
+
+ nr_pages = wb_writeback(wb, &work);
+ }
+
+ return nr_pages;
+}
/*
* Retrieve work items and do the writeback they describe
@@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
wrote += wb_check_start_all(wb);
/*
+ * Check for dontcache writeback request
+ */
+ wrote += wb_check_start_dontcache(wb);
+
+ /*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
@@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
rcu_read_unlock();
}
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping: address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache dirty
+ * pages. Queue writeback for the inode's wb for as many pages as there are
+ * dontcache pages, but don't restrict writeback to dontcache pages only.
+ *
+ * This significantly improves performance over either writing all wb's pages
+ * or writing only dontcache pages. Although it doesn't guarantee quick
+ * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages
+ * in check. Over longer term dontcache pages get written and reclaimed by
+ * background writeback even with this rough heuristic.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ bool need_wakeup = false;
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ if (wb_has_dirty_io(wb) &&
+ !test_bit(WB_start_dontcache, &wb->state) &&
+ !test_and_set_bit(WB_start_dontcache, &wb->state)) {
+ wb_get(wb);
+ need_wakeup = true;
+ }
+ unlocked_inode_to_wb_end(inode, &cookie);
+
+ if (need_wakeup) {
+ wb_wakeup(wb);
+ wb_put(wb);
+ }
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
/*
* Wakeup the flusher threads to start writeback of all currently dirty pages
*/