Merge tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs writeback updates from Christian Brauner:

 - Fix a race between cgroup_writeback_umount() and inode_switch_wbs()

   When a container exits, a race between cgroup_writeback_umount() and inode_switch_wbs()/cleanup_offline_cgwb() can trigger "VFS: Busy inodes after unmount" followed by a use-after-free on percpu counters.

   There is a window between inode_prepare_wbs_switch() returning true (having passed the SB_ACTIVE check and grabbed the inode) and the subsequent wb_queue_isw() call: if cgroup_writeback_umount() observes the global isw_nr_in_flight counter as non-zero but flush_workqueue() finds nothing queued yet, it returns early - leaving a held inode reference that blocks evict_inodes() and a later iput() that hits freed percpu counters.

   The race is closed by covering the window from inode_prepare_wbs_switch() through wb_queue_isw() with an RCU read-side critical section and synchronizing in the umount path.

   On top of that the now-dead rcu_barrier() left over from the queue_rcu_work() era is removed, and the global synchronize_rcu()/flush_workqueue() pair is replaced with a per-sb in-flight counter plus pin/unpin/drain helpers so umount no longer serializes against switch activity on unrelated superblocks.

   Under cgroup writeback churn on a 16 vCPU guest this takes umount latency from ~92-138ms p50 down to ~5-8ms p50 and the cumulative cost of cgroup_writeback_umount() from ~62ms to ~4us per call.

   The initial race fix is kept separate and minimal so it backports cleanly to stable trees that still queue switches via queue_rcu_work().

 - Improve write performance with RWF_DONTCACHE

   Dirty DONTCACHE pages are now tracked per bdi_writeback so that the writeback flusher can be kicked in a targeted fashion for IOCB_DONTCACHE writes instead of relying on global writeback, and the PG_dropbehind flag is preserved when a folio is split.

* tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
  mm: track DONTCACHE dirty pages per bdi_writeback
  mm: preserve PG_dropbehind flag during folio split
  writeback: use a per-sb counter to drain inode wb switches at umount
  writeback: drop now-unnecessary rcu_barrier() in cgroup_writeback_umount()
  writeback: fix race between cgroup_writeback_umount() and inode_switch_wbs()
author: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 03:30:45 +0530
committer: Linus Torvalds <torvalds@linux-foundation.org> 2026-06-15 03:30:45 +0530
commit: c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b (patch)
tree: 2cb320b5bc6f1c97da837e8cb43352a72e789267 /fs
parent: 0793d39ec8bab2b2255e3a288894c39e88ce5a75 (diff)
parent: 0275dc184aa007b260374af6d46fb15741c062a8 (diff)
download: lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.tar.gz
lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.zip
1 files changed, 112 insertions, 26 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..fdb8766d275a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode,
 			long nr = folio_nr_pages(folio);
 			wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
 			wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
+			if (folio_test_dropbehind(folio)) {
+				wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr);
+				wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr);
+			}
 		}
 	}
 
@@ -497,6 +501,23 @@ skip_switch:
 	return switched;
 }
 
+static inline void cgroup_writeback_pin(struct super_block *sb)
+{
+	atomic_inc(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_unpin(struct super_block *sb)
+{
+	if (atomic_dec_and_test(&sb->s_isw_nr_in_flight))
+		wake_up_var(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_drain(struct super_block *sb)
+{
+	wait_var_event(&sb->s_isw_nr_in_flight,
+		       !atomic_read(&sb->s_isw_nr_in_flight));
+}
+
 static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
 				     struct inode_switch_wbs_context *isw)
 {
@@ -554,8 +575,12 @@ relock:
 		wb_put_many(old_wb, nr_switched);
 	}
 
-	for (inodep = isw->inodes; *inodep; inodep++)
+	for (inodep = isw->inodes; *inodep; inodep++) {
+		struct super_block *sb = (*inodep)->i_sb;
+
 		iput(*inodep);
+		cgroup_writeback_unpin(sb);
+	}
 	wb_put(new_wb);
 	kfree(isw);
 	atomic_dec(&isw_nr_in_flight);
@@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
 static bool inode_prepare_wbs_switch(struct inode *inode,
 				     struct bdi_writeback *new_wb)
 {
+	/* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */
+	if (!(inode->i_sb->s_flags & SB_ACTIVE))
+		return false;
+
 	/*
-	 * Paired with smp_mb() in cgroup_writeback_umount().
-	 * isw_nr_in_flight must be increased before checking SB_ACTIVE and
-	 * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
-	 * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+	 * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either
+	 * sees a non-zero counter and waits, or we see SB_ACTIVE clear below.
 	 */
+	cgroup_writeback_pin(inode->i_sb);
 	smp_mb();
 
 	if (IS_DAX(inode))
-		return false;
+		goto out_unpin;
 
 	/* while holding I_WB_SWITCH, no one else can update the association */
 	spin_lock(&inode->i_lock);
@@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
 	    inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
 	    inode_to_wb(inode) == new_wb) {
 		spin_unlock(&inode->i_lock);
-		return false;
+		goto out_unpin;
 	}
 	inode_state_set(inode, I_WB_SWITCH);
 	__iget(inode);
 	spin_unlock(&inode->i_lock);
 
 	return true;
+
+out_unpin:
+	cgroup_writeback_unpin(inode->i_sb);
+	return false;
 }
 
 static void wb_queue_isw(struct bdi_writeback *wb,
@@ -1198,36 +1230,27 @@ out_bdi_put:
 }
 
 /**
- * cgroup_writeback_umount - flush inode wb switches for umount
+ * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb
  * @sb: target super_block
  *
- * This function is called when a super_block is about to be destroyed and
- * flushes in-flight inode wb switches.  An inode wb switch goes through
- * RCU and then workqueue, so the two need to be flushed in order to ensure
- * that all previously scheduled switches are finished.  As wb switches are
- * rare occurrences and synchronize_rcu() can take a while, perform
- * flushing iff wb switches are in flight.
+ * Wait until every inode wb switch that already passed the SB_ACTIVE
+ * check on this superblock has been completed by the worker.  Since
+ * SB_ACTIVE is cleared before this is called, no new switches can start
+ * for @sb, so s_isw_nr_in_flight will monotonically drop to zero.
  */
 void cgroup_writeback_umount(struct super_block *sb)
 {
-
 	if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
 		return;
 
 	/*
-	 * SB_ACTIVE should be reliably cleared before checking
-	 * isw_nr_in_flight, see generic_shutdown_super().
+	 * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe
+	 * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear
+	 * (cleared by generic_shutdown_super()) and bails before grabbing the
+	 * inode.
 	 */
 	smp_mb();
-
-	if (atomic_read(&isw_nr_in_flight)) {
-		/*
-		 * Use rcu_barrier() to wait for all pending callbacks to
-		 * ensure that all in-flight wb switches are in the workqueue.
-		 */
-		rcu_barrier();
-		flush_workqueue(isw_wq);
-	}
+	cgroup_writeback_drain(sb);
 }
 
 static int __init cgroup_writeback_init(void)
@@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb)
 	return nr_pages;
 }
 
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+	long nr_pages;
+
+	if (!test_and_clear_bit(WB_start_dontcache, &wb->state))
+		return 0;
+
+	nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY);
+	if (nr_pages) {
+		struct wb_writeback_work work = {
+			.nr_pages	= nr_pages,
+			.sync_mode	= WB_SYNC_NONE,
+			.range_cyclic	= 1,
+			.reason		= WB_REASON_DONTCACHE,
+		};
+
+		nr_pages = wb_writeback(wb, &work);
+	}
+
+	return nr_pages;
+}
 
 /*
  * Retrieve work items and do the writeback they describe
@@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 	wrote += wb_check_start_all(wb);
 
 	/*
+	 * Check for dontcache writeback request
+	 */
+	wrote += wb_check_start_dontcache(wb);
+
+	/*
 	 * Check for periodic writeback, kupdated() style
 	 */
 	wrote += wb_check_old_data_flush(wb);
@@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
 	rcu_read_unlock();
 }
 
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping:	address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache dirty
+ * pages. Queue writeback for the inode's wb for as many pages as there are
+ * dontcache pages, but don't restrict writeback to dontcache pages only.
+ *
+ * This significantly improves performance over either writing all wb's pages
+ * or writing only dontcache pages.  Although it doesn't guarantee quick
+ * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages
+ * in check. Over longer term dontcache pages get written and reclaimed by
+ * background writeback even with this rough heuristic.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+	struct inode *inode = mapping->host;
+	struct bdi_writeback *wb;
+	struct wb_lock_cookie cookie = {};
+	bool need_wakeup = false;
+
+	wb = unlocked_inode_to_wb_begin(inode, &cookie);
+	if (wb_has_dirty_io(wb) &&
+	    !test_bit(WB_start_dontcache, &wb->state) &&
+	    !test_and_set_bit(WB_start_dontcache, &wb->state)) {
+		wb_get(wb);
+		need_wakeup = true;
+	}
+	unlocked_inode_to_wb_end(inode, &cookie);
+
+	if (need_wakeup) {
+		wb_wakeup(wb);
+		wb_put(wb);
+	}
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
 /*
  * Wakeup the flusher threads to start writeback of all currently dirty pages
  */
author	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 03:30:45 +0530
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-06-15 03:30:45 +0530
commit	c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b (patch)
tree	2cb320b5bc6f1c97da837e8cb43352a72e789267 /fs
parent	0793d39ec8bab2b2255e3a288894c39e88ce5a75 (diff)
parent	0275dc184aa007b260374af6d46fb15741c062a8 (diff)
download	lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.tar.gz lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.zip