diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 03:30:45 +0530 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2026-06-15 03:30:45 +0530 |
| commit | c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b (patch) | |
| tree | 2cb320b5bc6f1c97da837e8cb43352a72e789267 /fs | |
| parent | 0793d39ec8bab2b2255e3a288894c39e88ce5a75 (diff) | |
| parent | 0275dc184aa007b260374af6d46fb15741c062a8 (diff) | |
| download | lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.tar.gz lwn-c17fdf62aeecbbaf2c2fd5c494e2089c02b0e75b.zip | |
Merge tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs writeback updates from Christian Brauner:
- Fix a race between cgroup_writeback_umount() and inode_switch_wbs()
When a container exits, a race between cgroup_writeback_umount() and
inode_switch_wbs()/cleanup_offline_cgwb() can trigger "VFS: Busy
inodes after unmount" followed by a use-after-free on percpu
counters.
There is a window between inode_prepare_wbs_switch() returning true
(having passed the SB_ACTIVE check and grabbed the inode) and the
subsequent wb_queue_isw() call: if cgroup_writeback_umount() observes
the global isw_nr_in_flight counter as non-zero but flush_workqueue()
finds nothing queued yet, it returns early - leaving a held inode
reference that blocks evict_inodes(), after which a later iput() hits
freed percpu counters.
The race is closed by covering the window from
inode_prepare_wbs_switch() through wb_queue_isw() with an RCU
read-side critical section and synchronizing in the umount path.
On top of that the now-dead rcu_barrier() left over from the
queue_rcu_work() era is removed, and the global
synchronize_rcu()/flush_workqueue() pair is replaced with a per-sb
in-flight counter plus pin/unpin/drain helpers so umount no longer
serializes against switch activity on unrelated superblocks.
Under cgroup writeback churn on a 16 vCPU guest this takes umount
latency from ~92-138ms p50 down to ~5-8ms p50 and the cumulative cost
of cgroup_writeback_umount() from ~62ms to ~4us per call.
The initial race fix is kept separate and minimal so it backports
cleanly to stable trees that still queue switches via
queue_rcu_work().
- Improve write performance with RWF_DONTCACHE
Dirty DONTCACHE pages are now tracked per bdi_writeback so that the
writeback flusher can be kicked in a targeted fashion for
IOCB_DONTCACHE writes instead of relying on global writeback, and the
PG_dropbehind flag is preserved when a folio is split.
* tag 'vfs-7.2-rc1.writeback' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
mm: track DONTCACHE dirty pages per bdi_writeback
mm: preserve PG_dropbehind flag during folio split
writeback: use a per-sb counter to drain inode wb switches at umount
writeback: drop now-unnecessary rcu_barrier() in cgroup_writeback_umount()
writeback: fix race between cgroup_writeback_umount() and inode_switch_wbs()
Diffstat (limited to 'fs')
| -rw-r--r-- | fs/fs-writeback.c | 138 |
1 files changed, 112 insertions, 26 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a65694cbfe68..fdb8766d275a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode, long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); + if (folio_test_dropbehind(folio)) { + wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr); + wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr); + } } } @@ -497,6 +501,23 @@ skip_switch: return switched; } +static inline void cgroup_writeback_pin(struct super_block *sb) +{ + atomic_inc(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_unpin(struct super_block *sb) +{ + if (atomic_dec_and_test(&sb->s_isw_nr_in_flight)) + wake_up_var(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_drain(struct super_block *sb) +{ + wait_var_event(&sb->s_isw_nr_in_flight, + !atomic_read(&sb->s_isw_nr_in_flight)); +} + static void process_inode_switch_wbs(struct bdi_writeback *new_wb, struct inode_switch_wbs_context *isw) { @@ -554,8 +575,12 @@ relock: wb_put_many(old_wb, nr_switched); } - for (inodep = isw->inodes; *inodep; inodep++) + for (inodep = isw->inodes; *inodep; inodep++) { + struct super_block *sb = (*inodep)->i_sb; + iput(*inodep); + cgroup_writeback_unpin(sb); + } wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); @@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work) static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { + /* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */ + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return false; + /* - * Paired with smp_mb() in cgroup_writeback_umount(). - * isw_nr_in_flight must be increased before checking SB_ACTIVE and - * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 - * in cgroup_writeback_umount() and the isw_wq will be not flushed. 
+ * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either + * sees a non-zero counter and waits, or we see SB_ACTIVE clear below. */ + cgroup_writeback_pin(inode->i_sb); smp_mb(); if (IS_DAX(inode)) - return false; + goto out_unpin; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); @@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode, inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); - return false; + goto out_unpin; } inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); return true; + +out_unpin: + cgroup_writeback_unpin(inode->i_sb); + return false; } static void wb_queue_isw(struct bdi_writeback *wb, @@ -1198,36 +1230,27 @@ out_bdi_put: } /** - * cgroup_writeback_umount - flush inode wb switches for umount + * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb * @sb: target super_block * - * This function is called when a super_block is about to be destroyed and - * flushes in-flight inode wb switches. An inode wb switch goes through - * RCU and then workqueue, so the two need to be flushed in order to ensure - * that all previously scheduled switches are finished. As wb switches are - * rare occurrences and synchronize_rcu() can take a while, perform - * flushing iff wb switches are in flight. + * Wait until every inode wb switch that already passed the SB_ACTIVE + * check on this superblock has been completed by the worker. Since + * SB_ACTIVE is cleared before this is called, no new switches can start + * for @sb, so s_isw_nr_in_flight will monotonically drop to zero. */ void cgroup_writeback_umount(struct super_block *sb) { - if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* - * SB_ACTIVE should be reliably cleared before checking - * isw_nr_in_flight, see generic_shutdown_super(). 
+ * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe + * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear + * (cleared by generic_shutdown_super()) and bails before grabbing the + * inode. */ smp_mb(); - - if (atomic_read(&isw_nr_in_flight)) { - /* - * Use rcu_barrier() to wait for all pending callbacks to - * ensure that all in-flight wb switches are in the workqueue. - */ - rcu_barrier(); - flush_workqueue(isw_wq); - } + cgroup_writeback_drain(sb); } static int __init cgroup_writeback_init(void) @@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb) return nr_pages; } +static long wb_check_start_dontcache(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_and_clear_bit(WB_start_dontcache, &wb->state)) + return 0; + + nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = WB_REASON_DONTCACHE, + }; + + nr_pages = wb_writeback(wb, &work); + } + + return nr_pages; +} /* * Retrieve work items and do the writeback they describe @@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) wrote += wb_check_start_all(wb); /* + * Check for dontcache writeback request + */ + wrote += wb_check_start_dontcache(wb); + + /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); @@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, rcu_read_unlock(); } +/** + * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes + * @mapping: address_space that was just written to + * + * Kick the writeback flusher thread to expedite writeback of dontcache dirty + * pages. Queue writeback for the inode's wb for as many pages as there are + * dontcache pages, but don't restrict writeback to dontcache pages only. 
+ * + * This significantly improves performance over either writing all wb's pages + * or writing only dontcache pages. Although it doesn't guarantee quick + * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages + * in check. Over longer term dontcache pages get written and reclaimed by + * background writeback even with this rough heuristic. + */ +void filemap_dontcache_kick_writeback(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + bool need_wakeup = false; + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + if (wb_has_dirty_io(wb) && + !test_bit(WB_start_dontcache, &wb->state) && + !test_and_set_bit(WB_start_dontcache, &wb->state)) { + wb_get(wb); + need_wakeup = true; + } + unlocked_inode_to_wb_end(inode, &cookie); + + if (need_wakeup) { + wb_wakeup(wb); + wb_put(wb); + } +} +EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback); + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ |
