From ea6813be07dcdc072aa9ad18099115a74cecb5e1 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 23 Jun 2022 10:51:44 -0700
Subject: mm: Move starting of background writeback into the main balancing loop

We start background writeback if we are over the background threshold
after exiting the main loop in balance_dirty_pages(). This may base the
decision on already stale values (we may have slept for a significant
amount of time), and it is also inconvenient for the refactoring needed
for async dirty throttling. Move the check into the main waiting loop.

Signed-off-by: Jan Kara
Signed-off-by: Stefan Roesch
Link: https://lore.kernel.org/r/20220623175157.1715274-2-shr@fb.com
Signed-off-by: Jens Axboe
---
 mm/page-writeback.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 55c2776ae699..e59c523aed1a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1627,6 +1627,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 			}
 		}
 
+		/*
+		 * In laptop mode, we wait until hitting the higher threshold
+		 * before starting background writeout, and then write out all
+		 * the way down to the lower threshold. So slow writers cause
+		 * minimal disk activity.
+		 *
+		 * In normal mode, we start background writeout at the lower
+		 * background_thresh, to keep the amount of dirty memory low.
+		 */
+		if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
+		    !writeback_in_progress(wb))
+			wb_start_background_writeback(wb);
+
 		/*
 		 * Throttle it only when the background writeback cannot
 		 * catch-up. This avoids (excessively) small writeouts
@@ -1657,6 +1670,7 @@ free_running:
 			break;
 		}
 
+		/* Start writeback even when in laptop mode */
 		if (unlikely(!writeback_in_progress(wb)))
 			wb_start_background_writeback(wb);
 
@@ -1823,23 +1837,6 @@ pause:
 
 	if (!dirty_exceeded && wb->dirty_exceeded)
 		wb->dirty_exceeded = 0;
-
-	if (writeback_in_progress(wb))
-		return;
-
-	/*
-	 * In laptop mode, we wait until hitting the higher threshold before
-	 * starting background writeout, and then write out all the way down
-	 * to the lower threshold. So slow writers cause minimal disk activity.
-	 *
-	 * In normal mode, we start background writeout at the lower
-	 * background_thresh, to keep the amount of dirty memory low.
-	 */
-	if (laptop_mode)
-		return;
-
-	if (nr_reclaimable > gdtc->bg_thresh)
-		wb_start_background_writeback(wb);
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
--
cgit v1.2.3
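For orientation, the resulting shape of balance_dirty_pages() is sketched
below, condensed to the lines this patch touches; the bulk of the
function's accounting is elided and is not shown verbatim here:

static void balance_dirty_pages(struct bdi_writeback *wb,
				unsigned long pages_dirtied)
{
	/* ... setup ... */
	for (;;) {
		/*
		 * nr_reclaimable and the thresholds are recomputed at the
		 * top of each iteration, so this check always acts on
		 * fresh values, even after a long sleep.
		 */
		if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
		    !writeback_in_progress(wb))
			wb_start_background_writeback(wb);

		/* ... freerun check, pause computation, io_schedule_timeout() ... */
	}
	/* The post-loop kick based on possibly stale values is gone. */
}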
From e92eebbb09218e128e559cf12b65317721309324 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 23 Jun 2022 10:51:45 -0700
Subject: mm: Move updates of dirty_exceeded into one place

The transition of wb->dirty_exceeded from 0 to 1 happens before we go to
sleep in balance_dirty_pages(), while the transition from 1 to 0 happens
when exiting from balance_dirty_pages(), possibly based on old values.
This does not make a lot of sense, since wb->dirty_exceeded should simply
reflect whether wb is over the dirty limit, and so we should ratelimit
entries to balance_dirty_pages() less. Move the two updates together.

Signed-off-by: Jan Kara
Signed-off-by: Stefan Roesch
Link: https://lore.kernel.org/r/20220623175157.1715274-3-shr@fb.com
Signed-off-by: Jens Axboe
---
 mm/page-writeback.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e59c523aed1a..90b1998c16a1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1729,8 +1729,8 @@ free_running:
 			sdtc = mdtc;
 		}
 
-		if (dirty_exceeded && !wb->dirty_exceeded)
-			wb->dirty_exceeded = 1;
+		if (dirty_exceeded != wb->dirty_exceeded)
+			wb->dirty_exceeded = dirty_exceeded;
 
 		if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
 					   BANDWIDTH_INTERVAL))
@@ -1834,9 +1834,6 @@ pause:
 		if (fatal_signal_pending(current))
 			break;
 	}
-
-	if (!dirty_exceeded && wb->dirty_exceeded)
-		wb->dirty_exceeded = 0;
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
--
cgit v1.2.3
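The "ratelimit entries less" remark refers to how the flag is consumed:
roughly (this consumer is elsewhere in page-writeback.c and is untouched
by this patch), balance_dirty_pages_ratelimited() shrinks the per-task
ratelimit while a wb is over its limit, so dirtying tasks re-enter
balance_dirty_pages() much sooner:

	ratelimit = current->nr_dirtied_pause;
	if (wb->dirty_exceeded)
		/* re-check after at most 32KB worth of dirtied pages */
		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));

With the update done inside the loop, this flag now tracks the freshly
computed dirty state rather than a value sampled before a long sleep.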
From fe6c9c6e3e3e332b998393d214fba9d09ab0acb0 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Thu, 23 Jun 2022 10:51:46 -0700
Subject: mm: Add balance_dirty_pages_ratelimited_flags() function

Add the helper function balance_dirty_pages_ratelimited_flags(), which
takes a flags parameter in addition to the arguments of
balance_dirty_pages_ratelimited(). The flags parameter is passed down to
balance_dirty_pages(). For async buffered writes the flag value will be
BDP_ASYNC.

If balance_dirty_pages() gets called for an async buffered write, we
don't want to wait. Instead we need to indicate to the caller that
throttling is needed so that it can stop writing and offload the rest of
the write to a context that can block. The new helper function is also
used to implement balance_dirty_pages_ratelimited().

Signed-off-by: Jan Kara
Signed-off-by: Stefan Roesch
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20220623175157.1715274-4-shr@fb.com
[axboe: fix kerneltest bot 'ret' issue]
Signed-off-by: Jens Axboe
---
 include/linux/writeback.h |  7 +++++++
 mm/page-writeback.c       | 51 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index da21d63f70e2..b8c9610c2313 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
 void wb_update_bandwidth(struct bdi_writeback *wb);
+
+/* Invoke balance dirty pages in async mode. */
+#define BDP_ASYNC	0x0001
+
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+		unsigned int flags);
+
 bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 90b1998c16a1..d0d466a5c804 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
  * If we're over `background_thresh' then the writeback threads are woken to
  * perform some writeout.
  */
-static void balance_dirty_pages(struct bdi_writeback *wb,
-				unsigned long pages_dirtied)
+static int balance_dirty_pages(struct bdi_writeback *wb,
+			       unsigned long pages_dirtied, unsigned int flags)
 {
 	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
 	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
@@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
 	struct backing_dev_info *bdi = wb->bdi;
 	bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
 	unsigned long start_time = jiffies;
+	int ret = 0;
 
 	for (;;) {
 		unsigned long now = jiffies;
@@ -1803,6 +1804,10 @@ pause:
 					  period,
 					  pause,
 					  start_time);
+		if (flags & BDP_ASYNC) {
+			ret = -EAGAIN;
+			break;
+		}
 		__set_current_state(TASK_KILLABLE);
 		wb->dirty_sleep = now;
 		io_schedule_timeout(pause);
@@ -1834,6 +1839,7 @@ pause:
 		if (fatal_signal_pending(current))
 			break;
 	}
+	return ret;
 }
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1855,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
 DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
 
 /**
- * balance_dirty_pages_ratelimited - balance dirty memory state
- * @mapping: address_space which was dirtied
+ * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ * @flags: BDP flags.
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied. The function will periodically check the system's
  * dirty state and will initiate writeback if needed.
  *
- * Once we're over the dirty memory limit we decrease the ratelimiting
- * by a lot, to prevent individual processes from overshooting the limit
- * by (ratelimit_pages) each.
+ * See balance_dirty_pages_ratelimited() for details.
+ *
+ * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
+ * indicate that memory is out of balance and the caller must wait
+ * for I/O to complete. Otherwise, it will return 0 to indicate
+ * that either memory was already in balance, or it was able to sleep
+ * until the amount of dirty memory returned to balance.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
+					unsigned int flags)
 {
 	struct inode *inode = mapping->host;
 	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct bdi_writeback *wb = NULL;
 	int ratelimit;
+	int ret = 0;
 	int *p;
 
 	if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
-		return;
+		return ret;
 
 	if (inode_cgwb_enabled(inode))
 		wb = wb_get_create_current(bdi, GFP_KERNEL);
@@ -1915,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
-		balance_dirty_pages(wb, current->nr_dirtied);
+		ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
 
 	wb_put(wb);
+	return ret;
+}
+
+/**
+ * balance_dirty_pages_ratelimited - balance dirty memory state.
+ * @mapping: address_space which was dirtied.
+ *
+ * Processes which are dirtying memory should call in here once for each page
+ * which was newly dirtied. The function will periodically check the system's
+ * dirty state and will initiate writeback if needed.
+ *
+ * Once we're over the dirty memory limit we decrease the ratelimiting
+ * by a lot, to prevent individual processes from overshooting the limit
+ * by (ratelimit_pages) each.
+ */
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	balance_dirty_pages_ratelimited_flags(mapping, 0);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
--
cgit v1.2.3
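As a sketch of the intended calling convention (the helper below is
illustrative only and not part of this series), an async buffered write
path would map IOCB_NOWAIT to BDP_ASYNC and treat -EAGAIN as the signal
to stop and punt the rest of the write to a context that can block:

/* Illustrative only: not from this series. */
static ssize_t demo_buffered_write_end(struct kiocb *iocb, size_t copied)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	unsigned int bdp_flags = (iocb->ki_flags & IOCB_NOWAIT) ? BDP_ASYNC : 0;
	int ret;

	/* Pages were copied and dirtied above; now throttle if needed. */
	ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
	if (ret)
		return ret;	/* -EAGAIN: redo the write from a blocking context */

	return copied;
}

Without BDP_ASYNC the call behaves exactly as before, sleeping in
io_schedule_timeout() until the dirty state is back in balance.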
From 0dd316ba8692c2374fbb82cce57c0b23144f2977 Mon Sep 17 00:00:00 2001
From: Jens Axboe
Date: Fri, 1 Jul 2022 14:04:43 -0600
Subject: mm: honor FGP_NOWAIT for page cache page allocation

If we're creating a page cache page with FGP_CREAT but FGP_NOWAIT is
set, we should dial back the gfp flags to avoid frivolous blocking,
which is trivial to hit in low-memory conditions:

[ 10.117661]  __schedule+0x8c/0x550
[ 10.118305]  schedule+0x58/0xa0
[ 10.118897]  schedule_timeout+0x30/0xdc
[ 10.119610]  __wait_for_common+0x88/0x114
[ 10.120348]  wait_for_completion+0x1c/0x24
[ 10.121103]  __flush_work.isra.0+0x16c/0x19c
[ 10.121896]  flush_work+0xc/0x14
[ 10.122496]  __drain_all_pages+0x144/0x218
[ 10.123267]  drain_all_pages+0x10/0x18
[ 10.123941]  __alloc_pages+0x464/0x9e4
[ 10.124633]  __folio_alloc+0x18/0x3c
[ 10.125294]  __filemap_get_folio+0x17c/0x204
[ 10.126084]  iomap_write_begin+0xf8/0x428
[ 10.126829]  iomap_file_buffered_write+0x144/0x24c
[ 10.127710]  xfs_file_buffered_write+0xe8/0x248
[ 10.128553]  xfs_file_write_iter+0xa8/0x120
[ 10.129324]  io_write+0x16c/0x38c
[ 10.129940]  io_issue_sqe+0x70/0x1cc
[ 10.130617]  io_queue_sqe+0x18/0xfc
[ 10.131277]  io_submit_sqes+0x5d4/0x600
[ 10.131946]  __arm64_sys_io_uring_enter+0x224/0x600
[ 10.132752]  invoke_syscall.constprop.0+0x70/0xc0
[ 10.133616]  do_el0_svc+0xd0/0x118
[ 10.134238]  el0_svc+0x78/0xa0

Clear the IO, FS, and reclaim flags, mark the allocation as GFP_NOWAIT,
and add __GFP_NOWARN to avoid polluting dmesg with pointless allocation
failures. A caller with FGP_NOWAIT must be expected to handle the
resulting -EAGAIN return and retry from a suitable context without
NOWAIT set.

Reviewed-by: Shakeel Butt
Signed-off-by: Jens Axboe
---
 mm/filemap.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index ffdfbc8b0e3c..254931a6e3ed 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1988,6 +1988,10 @@ no_page:
 			gfp |= __GFP_WRITE;
 		if (fgp_flags & FGP_NOFS)
 			gfp &= ~__GFP_FS;
+		if (fgp_flags & FGP_NOWAIT) {
+			gfp &= ~GFP_KERNEL;
+			gfp |= GFP_NOWAIT | __GFP_NOWARN;
+		}
 
 		folio = filemap_alloc_folio(gfp, 0);
 		if (!folio)
--
cgit v1.2.3
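A plausible caller-side pattern (illustrative names; this sketch assumes
the __filemap_get_folio() signature of this kernel): set FGP_NOWAIT on
the lookup and translate a NULL return into -EAGAIN so the async
submitter can retry from a context where blocking is allowed:

/* Illustrative only: a nowait-aware folio lookup for a buffered write. */
static struct folio *demo_get_write_folio(struct address_space *mapping,
					  pgoff_t index, bool nowait)
{
	unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;

	if (nowait)
		fgp_flags |= FGP_NOWAIT;	/* fail fast instead of blocking */

	/*
	 * With FGP_NOWAIT, an allocation failure comes back as NULL rather
	 * than sleeping in reclaim; the caller maps that to -EAGAIN.
	 */
	return __filemap_get_folio(mapping, index, fgp_flags,
				   mapping_gfp_mask(mapping));
}

Together with BDP_ASYNC above, this lets an io_uring-style submitter run
the whole buffered-write fast path without ever blocking, falling back to
a worker context only when -EAGAIN is returned.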