summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--fs/fs-writeback.c138
-rw-r--r--include/linux/backing-dev-defs.h3
-rw-r--r--include/linux/fs.h6
-rw-r--r--include/linux/fs/super_types.h8
-rw-r--r--include/trace/events/writeback.h3
-rw-r--r--mm/filemap.c15
-rw-r--r--mm/huge_memory.c1
-rw-r--r--mm/page-writeback.c6
8 files changed, 147 insertions, 33 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..fdb8766d275a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -432,6 +432,10 @@ static bool inode_do_switch_wbs(struct inode *inode,
long nr = folio_nr_pages(folio);
wb_stat_mod(old_wb, WB_RECLAIMABLE, -nr);
wb_stat_mod(new_wb, WB_RECLAIMABLE, nr);
+ if (folio_test_dropbehind(folio)) {
+ wb_stat_mod(old_wb, WB_DONTCACHE_DIRTY, -nr);
+ wb_stat_mod(new_wb, WB_DONTCACHE_DIRTY, nr);
+ }
}
}
@@ -497,6 +501,23 @@ skip_switch:
return switched;
}
+static inline void cgroup_writeback_pin(struct super_block *sb)
+{
+ atomic_inc(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_unpin(struct super_block *sb)
+{
+ if (atomic_dec_and_test(&sb->s_isw_nr_in_flight))
+ wake_up_var(&sb->s_isw_nr_in_flight);
+}
+
+static inline void cgroup_writeback_drain(struct super_block *sb)
+{
+ wait_var_event(&sb->s_isw_nr_in_flight,
+ !atomic_read(&sb->s_isw_nr_in_flight));
+}
+
static void process_inode_switch_wbs(struct bdi_writeback *new_wb,
struct inode_switch_wbs_context *isw)
{
@@ -554,8 +575,12 @@ relock:
wb_put_many(old_wb, nr_switched);
}
- for (inodep = isw->inodes; *inodep; inodep++)
+ for (inodep = isw->inodes; *inodep; inodep++) {
+ struct super_block *sb = (*inodep)->i_sb;
+
iput(*inodep);
+ cgroup_writeback_unpin(sb);
+ }
wb_put(new_wb);
kfree(isw);
atomic_dec(&isw_nr_in_flight);
@@ -598,16 +623,19 @@ void inode_switch_wbs_work_fn(struct work_struct *work)
static bool inode_prepare_wbs_switch(struct inode *inode,
struct bdi_writeback *new_wb)
{
+ /* Avoid the atomic_inc/smp_mb dance once SB_ACTIVE is gone. */
+ if (!(inode->i_sb->s_flags & SB_ACTIVE))
+ return false;
+
/*
- * Paired with smp_mb() in cgroup_writeback_umount().
- * isw_nr_in_flight must be increased before checking SB_ACTIVE and
- * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0
- * in cgroup_writeback_umount() and the isw_wq will be not flushed.
+ * Pairs with smp_mb() in cgroup_writeback_umount(): the umounter either
+ * sees a non-zero counter and waits, or we see SB_ACTIVE clear below.
*/
+ cgroup_writeback_pin(inode->i_sb);
smp_mb();
if (IS_DAX(inode))
- return false;
+ goto out_unpin;
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
@@ -615,13 +643,17 @@ static bool inode_prepare_wbs_switch(struct inode *inode,
inode_state_read(inode) & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) ||
inode_to_wb(inode) == new_wb) {
spin_unlock(&inode->i_lock);
- return false;
+ goto out_unpin;
}
inode_state_set(inode, I_WB_SWITCH);
__iget(inode);
spin_unlock(&inode->i_lock);
return true;
+
+out_unpin:
+ cgroup_writeback_unpin(inode->i_sb);
+ return false;
}
static void wb_queue_isw(struct bdi_writeback *wb,
@@ -1198,36 +1230,27 @@ out_bdi_put:
}
/**
- * cgroup_writeback_umount - flush inode wb switches for umount
+ * cgroup_writeback_umount - wait for in-flight inode wb switches on @sb
* @sb: target super_block
*
- * This function is called when a super_block is about to be destroyed and
- * flushes in-flight inode wb switches. An inode wb switch goes through
- * RCU and then workqueue, so the two need to be flushed in order to ensure
- * that all previously scheduled switches are finished. As wb switches are
- * rare occurrences and synchronize_rcu() can take a while, perform
- * flushing iff wb switches are in flight.
+ * Wait until every inode wb switch that already passed the SB_ACTIVE
+ * check on this superblock has been completed by the worker. Since
+ * SB_ACTIVE is cleared before this is called, no new switches can start
+ * for @sb, so s_isw_nr_in_flight will monotonically drop to zero.
*/
void cgroup_writeback_umount(struct super_block *sb)
{
-
if (!(sb->s_bdi->capabilities & BDI_CAP_WRITEBACK))
return;
/*
- * SB_ACTIVE should be reliably cleared before checking
- * isw_nr_in_flight, see generic_shutdown_super().
+ * Pairs with smp_mb() in inode_prepare_wbs_switch(): we either observe
+ * a non-zero counter and wait, or the switcher sees SB_ACTIVE clear
+ * (cleared by generic_shutdown_super()) and bails before grabbing the
+ * inode.
*/
smp_mb();
-
- if (atomic_read(&isw_nr_in_flight)) {
- /*
- * Use rcu_barrier() to wait for all pending callbacks to
- * ensure that all in-flight wb switches are in the workqueue.
- */
- rcu_barrier();
- flush_workqueue(isw_wq);
- }
+ cgroup_writeback_drain(sb);
}
static int __init cgroup_writeback_init(void)
@@ -2373,6 +2396,27 @@ static long wb_check_start_all(struct bdi_writeback *wb)
return nr_pages;
}
+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+ long nr_pages;
+
+ if (!test_and_clear_bit(WB_start_dontcache, &wb->state))
+ return 0;
+
+ nr_pages = wb_stat_sum(wb, WB_DONTCACHE_DIRTY);
+ if (nr_pages) {
+ struct wb_writeback_work work = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ .reason = WB_REASON_DONTCACHE,
+ };
+
+ nr_pages = wb_writeback(wb, &work);
+ }
+
+ return nr_pages;
+}
/*
* Retrieve work items and do the writeback they describe
@@ -2395,6 +2439,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
wrote += wb_check_start_all(wb);
/*
+ * Check for dontcache writeback request
+ */
+ wrote += wb_check_start_dontcache(wb);
+
+ /*
* Check for periodic writeback, kupdated() style
*/
wrote += wb_check_old_data_flush(wb);
@@ -2468,6 +2517,43 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
rcu_read_unlock();
}
+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping: address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache dirty
+ * pages. Queue writeback for the inode's wb for as many pages as there are
+ * dontcache pages, but don't restrict writeback to dontcache pages only.
+ *
+ * This significantly improves performance over either writing all wb's pages
+ * or writing only dontcache pages. Although it doesn't guarantee quick
+ * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages
+ * in check. Over longer term dontcache pages get written and reclaimed by
+ * background writeback even with this rough heuristic.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ bool need_wakeup = false;
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ if (wb_has_dirty_io(wb) &&
+ !test_bit(WB_start_dontcache, &wb->state) &&
+ !test_and_set_bit(WB_start_dontcache, &wb->state)) {
+ wb_get(wb);
+ need_wakeup = true;
+ }
+ unlocked_inode_to_wb_end(inode, &cookie);
+
+ if (need_wakeup) {
+ wb_wakeup(wb);
+ wb_put(wb);
+ }
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
/*
* Wakeup the flusher threads to start writeback of all currently dirty pages
*/
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index a06b93446d10..4f1084937315 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -26,6 +26,7 @@ enum wb_state {
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
WB_start_all, /* nr_pages == 0 (all) work pending */
+ WB_start_dontcache, /* dontcache writeback pending */
};
enum wb_stat_item {
@@ -33,6 +34,7 @@ enum wb_stat_item {
WB_WRITEBACK,
WB_DIRTIED,
WB_WRITTEN,
+ WB_DONTCACHE_DIRTY,
NR_WB_STAT_ITEMS
};
@@ -55,6 +57,7 @@ enum wb_reason {
*/
WB_REASON_FORKER_THREAD,
WB_REASON_FOREIGN_FLUSH,
+ WB_REASON_DONTCACHE,
WB_REASON_MAX,
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a220d14b1f91..2a6d2cb674db 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2633,6 +2633,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);
static inline int file_write_and_wait(struct file *file)
{
@@ -2666,10 +2667,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
if (ret)
return ret;
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- filemap_flush_range(mapping, iocb->ki_pos - count,
- iocb->ki_pos - 1);
+ filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
}
return count;
diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
index 383050e7fdf5..1ab4e2265129 100644
--- a/include/linux/fs/super_types.h
+++ b/include/linux/fs/super_types.h
@@ -274,6 +274,14 @@ struct super_block {
/* number of fserrors that are being sent to fsnotify/filesystems */
refcount_t s_pending_errors;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ /*
+ * Number of in-flight inode wb switches for this sb. Drained by
+ * cgroup_writeback_umount() before tear-down.
+ */
+ atomic_t s_isw_nr_in_flight;
+#endif
} __randomize_layout;
/*
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..13ee076ccd16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -44,7 +44,8 @@
EM( WB_REASON_PERIODIC, "periodic") \
EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \
EM( WB_REASON_FORKER_THREAD, "forker_thread") \
- EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush")
+ EM( WB_REASON_FOREIGN_FLUSH, "foreign_flush") \
+ EMe(WB_REASON_DONTCACHE, "dontcache")
WB_WORK_REASON
diff --git a/mm/filemap.c b/mm/filemap.c
index 4e636647100c..179f2886f8c0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2052,8 +2052,19 @@ no_page:
if (!folio)
return ERR_PTR(-ENOENT);
/* not an uncached lookup, clear uncached if set */
- if (folio_test_dropbehind(folio) && !(fgp_flags & FGP_DONTCACHE))
- folio_clear_dropbehind(folio);
+ if (!(fgp_flags & FGP_DONTCACHE) && folio_test_clear_dropbehind(folio)) {
+ if (folio_test_dirty(folio) &&
+ mapping_can_writeback(mapping)) {
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+ long nr = folio_nr_pages(folio);
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr);
+ unlocked_inode_to_wb_end(inode, &cookie);
+ }
+ }
return folio;
}
EXPORT_SYMBOL(__filemap_get_folio_mpol);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b118bcd392cb..d29e85495091 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3644,6 +3644,7 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
(1L << PG_arch_3) |
#endif
(1L << PG_dirty) |
+ (1L << PG_dropbehind) |
LRU_GEN_MASK | LRU_REFS_MASK));
if (handle_hwpoison &&
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 833f743f309f..e98748112d1e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2626,6 +2626,8 @@ static void folio_account_dirtied(struct folio *folio,
wb = inode_to_wb(inode);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
+ if (folio_test_dropbehind(folio))
+ wb_stat_mod(wb, WB_DONTCACHE_DIRTY, nr);
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
@@ -2647,6 +2649,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb)
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ if (folio_test_dropbehind(folio))
+ wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
task_io_account_cancelled_write(nr * PAGE_SIZE);
@@ -2916,6 +2920,8 @@ bool folio_clear_dirty_for_io(struct folio *folio)
if (folio_test_clear_dirty(folio)) {
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
+ if (folio_test_dropbehind(folio))
+ wb_stat_mod(wb, WB_DONTCACHE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
ret = true;