diff options
| -rw-r--r-- | fs/fs-writeback.c | 138 | ||||
| -rw-r--r-- | include/linux/backing-dev-defs.h | 3 | ||||
| -rw-r--r-- | include/linux/fs.h | 6 | ||||
| -rw-r--r-- | include/linux/fs/super_types.h | 8 | ||||
| -rw-r--r-- | include/trace/events/writeback.h | 3 | ||||
| -rw-r--r-- | mm/filemap.c | 15 | ||||
| -rw-r--r-- | mm/huge_memory.c | 1 | ||||
| -rw-r--r-- | mm/page-writeback.c | 6 |
8 files changed, 147 insertions, 33 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a65694cbfe68..fdb8766d275a 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode, long nr = folio_nr_pages(folio); wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr); wb_stat_mod(new_wb, WB_RECLAIMABLE, nr); + if (folio_test_dropbehind(folio)) { + wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr); + wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr); + } } } @@ -497,6 +501,23 @@ skip_switch: return switched; } +static inline void cgroup_writeback_pin(struct super_block *sb) +{ + atomic_inc(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_unpin(struct super_block *sb) +{ + if (atomic_dec_and_test(&sb->s_isw_nr_in_flight)) + wake_up_var(&sb->s_isw_nr_in_flight); +} + +static inline void cgroup_writeback_drain(struct super_block *sb) +{ + wait_var_event(&sb->s_isw_nr_in_flight, + !atomic_read(&sb->s_isw_nr_in_flight)); +} + static void process_inode_switch_wbs(struct bdi_writeback *new_wb, struct inode_switch_wbs_context *isw) { @@ -554,8 +575,12 @@ relock: wb_put_many(old_wb, nr_switched); } - for (inodep = isw->inodes; *inodep; inodep++) + for (inodep = isw->inodes; *inodep; inodep++) { + struct super_block *sb = (*inodep)->i_sb; + iput(*inodep); + cgroup_writeback_unpin(sb); + } wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); @@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work) static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { + /* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */ + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return false; + /* - * Paired with smp_mb() in cgroup_writeback_umount(). - * isw_nr_in_flight must be increased before checking SB_ACTIVE and - * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 - * in cgroup_writeback_umount() and the isw_wq will be not flushed. + * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either + * sees a non-zero counter and waits, or we see SB_ACTIVE clear below. */ + cgroup_writeback_pin(inode->i_sb); smp_mb(); if (IS_DAX(inode)) - return false; + goto out_unpin; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); @@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode, inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); - return false; + goto out_unpin; } inode_state_set(inode, I_WB_SWITCH); __iget(inode); spin_unlock(&inode->i_lock); return true; + +out_unpin: + cgroup_writeback_unpin(inode->i_sb); + return false; } static void wb_queue_isw(struct bdi_writeback *wb, @@ -1198,36 +1230,27 @@ out_bdi_put: } /** - * cgroup_writeback_umount - flush inode wb switches for umount + * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb * @sb: target super_block * - * This function is called when a super_block is about to be destroyed and - * flushes in-flight inode wb switches. An inode wb switch goes through - * RCU and then workqueue, so the two need to be flushed in order to ensure - * that all previously scheduled switches are finished. As wb switches are - * rare occurrences and synchronize_rcu() can take a while, perform - * flushing iff wb switches are in flight. + * Wait until every inode wb switch that already passed the SB_ACTIVE + * check on this superblock has been completed by the worker. Since + * SB_ACTIVE is cleared before this is called, no new switches can start + * for @sb, so s_isw_nr_in_flight will monotonically drop to zero. */ void cgroup_writeback_umount(struct super_block *sb) { - if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK)) return; /* - * SB_ACTIVE should be reliably cleared before checking - * isw_nr_in_flight, see generic_shutdown_super(). + * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe + * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear + * (cleared by generic_shutdown_super()) and bails before grabbing the + * inode. */ smp_mb(); - - if (atomic_read(&isw_nr_in_flight)) { - /* - * Use rcu_barrier() to wait for all pending callbacks to - * ensure that all in-flight wb switches are in the workqueue. - */ - rcu_barrier(); - flush_workqueue(isw_wq); - } + cgroup_writeback_drain(sb); } static int __init cgroup_writeback_init(void) @@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb) return nr_pages; } +static long wb_check_start_dontcache(struct bdi_writeback *wb) +{ + long nr_pages; + + if (!test_and_clear_bit(WB_start_dontcache, &wb->state)) + return 0; + + nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY); + if (nr_pages) { + struct wb_writeback_work work = { + .nr_pages = nr_pages, + .sync_mode = WB_SYNC_NONE, + .range_cyclic = 1, + .reason = WB_REASON_DONTCACHE, + }; + + nr_pages = wb_writeback(wb, &work); + } + + return nr_pages; +} /* * Retrieve work items and do the writeback they describe @@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb) wrote += wb_check_start_all(wb); /* + * Check for dontcache writeback request + */ + wrote += wb_check_start_dontcache(wb); + + /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); @@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, rcu_read_unlock(); } +/** + * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes + * @mapping: address_space that was just written to + * + * Kick the writeback flusher thread to expedite writeback of dontcache dirty + * pages. Queue writeback for the inode's wb for as many pages as there are + * dontcache pages, but don't restrict writeback to dontcache pages only. + * + * This significantly improves performance over either writing all wb's pages + * or writing only dontcache pages. Although it doesn't guarantee quick + * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages + * in check. Over longer term dontcache pages get written and reclaimed by + * background writeback even with this rough heuristic. + */ +void filemap_dontcache_kick_writeback(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + bool need_wakeup = false; + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + if (wb_has_dirty_io(wb) && + !test_bit(WB_start_dontcache, &wb->state) && + !test_and_set_bit(WB_start_dontcache, &wb->state)) { + wb_get(wb); + need_wakeup = true; + } + unlocked_inode_to_wb_end(inode, &cookie); + + if (need_wakeup) { + wb_wakeup(wb); + wb_put(wb); + } +} +EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback); + /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a06b93446d10..4f1084937315 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -26,6 +26,7 @@ enum wb_state { WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ WB_start_all, /* nr_pages == 0 (all) work pending */ + WB_start_dontcache, /* dontcache writeback pending */ }; enum wb_stat_item { @@ -33,6 +34,7 @@ enum wb_stat_item { WB_WRITEBACK, WB_DIRTIED, WB_WRITTEN, + WB_DONTCACHE_DIRTY, NR_WB_STAT_ITEMS }; @@ -55,6 +57,7 @@ enum wb_reason { */ WB_REASON_FORKER_THREAD, WB_REASON_FOREIGN_FLUSH, + WB_REASON_DONTCACHE, WB_REASON_MAX, }; diff --git a/include/linux/fs.h b/include/linux/fs.h index a220d14b1f91..2a6d2cb674db 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2633,6 +2633,7 @@ extern int __must_check file_write_and_wait_range(struct file *file, loff_t start, loff_t end); int filemap_flush_range(struct address_space *mapping, loff_t start, loff_t end); +void filemap_dontcache_kick_writeback(struct address_space *mapping); static inline int file_write_and_wait(struct file *file) { @@ -2666,10 +2667,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) if (ret) return ret; } else if (iocb->ki_flags & IOCB_DONTCACHE) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - filemap_flush_range(mapping, iocb->ki_pos - count, - iocb->ki_pos - 1); + filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping); } return count; diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h index 383050e7fdf5..1ab4e2265129 100644 --- a/include/linux/fs/super_types.h +++ b/include/linux/fs/super_types.h @@ -274,6 +274,14 @@ struct super_block { /* number of fserrors that are being sent to fsnotify/filesystems */ refcount_t s_pending_errors; + +#ifdef CONFIG_CGROUP_WRITEBACK + /* + * Number of in-flight inode wb switches for this sb. Drained by + * cgroup_writeback_umount() before tear-down. + */ + atomic_t s_isw_nr_in_flight; +#endif } __randomize_layout; /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index bdac0d685a98..13ee076ccd16 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -44,7 +44,8 @@ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EM( WB_REASON_FORKER_THREAD, "forker_thread") \ - EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush") + EM( WB_REASON_FOREIGN_FLUSH, "foreign_flush") \ + EMe(WB_REASON_DONTCACHE, "dontcache") WB_WORK_REASON diff --git a/mm/filemap.c b/mm/filemap.c index 4e636647100c..179f2886f8c0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2052,8 +2052,19 @@ no_page: if (!folio) return ERR_PTR(-ENOENT); /* not an uncached lookup, clear uncached if set */ - if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE)) - folio_clear_dropbehind(folio); + if (!(fgp_flags & FGP_DONTCACHE) && folio_test_clear_dropbehind(folio)) { + if (folio_test_dirty(folio) && + mapping_can_writeback(mapping)) { + struct inode *inode = mapping->host; + struct bdi_writeback *wb; + struct wb_lock_cookie cookie = {}; + long nr = folio_nr_pages(folio); + + wb = unlocked_inode_to_wb_begin(inode, &cookie); + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); + unlocked_inode_to_wb_end(inode, &cookie); + } + } return folio; } EXPORT_SYMBOL(__filemap_get_folio_mpol); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b118bcd392cb..d29e85495091 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3644,6 +3644,7 @@ static void __split_folio_to_order(struct folio *folio, int old_order, (1L << PG_arch_3) | #endif (1L << PG_dirty) | + (1L << PG_dropbehind) | LRU_GEN_MASK | LRU_REFS_MASK)); if (handle_hwpoison && diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 833f743f309f..e98748112d1e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2626,6 +2626,8 @@ static void folio_account_dirtied(struct folio *folio, wb = inode_to_wb(inode); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, nr); __zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); __node_stat_mod_folio(folio, NR_DIRTIED, nr); wb_stat_mod(wb, WB_RECLAIMABLE, nr); @@ -2647,6 +2649,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); task_io_account_cancelled_write(nr * PAGE_SIZE); @@ -2916,6 +2920,8 @@ bool folio_clear_dirty_for_io(struct folio *folio) if (folio_test_clear_dirty(folio)) { long nr = folio_nr_pages(folio); lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr); + if (folio_test_dropbehind(folio)) + wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); wb_stat_mod(wb, WB_RECLAIMABLE, -nr); ret = true; |
