diff options
author | Tejun Heo <tj@kernel.org> | 2013-04-01 19:08:06 -0700 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2013-04-01 19:08:06 -0700 |
commit | 839a8e8660b6777e7fe4e80af1a048aebe2b5977 (patch) | |
tree | 80398cd4dd8ebc4c51be20725c0cc427bfe321b3 /fs/fs-writeback.c | |
parent | 181387da2d64c3129e5b5186c4dd388bc5041d53 (diff) | |
download | lwn-839a8e8660b6777e7fe4e80af1a048aebe2b5977.tar.gz lwn-839a8e8660b6777e7fe4e80af1a048aebe2b5977.zip |
writeback: replace custom worker pool implementation with unbound workqueue
Writeback implements its own worker pool - each bdi can be associated
with a worker thread which is created and destroyed dynamically. The
worker thread for the default bdi is always present and serves as the
"forker" thread which forks off worker threads for other bdis.
there's no reason for writeback to implement its own worker pool when
using unbound workqueue instead is much simpler and more efficient.
This patch replaces custom worker pool implementation in writeback
with an unbound workqueue.
The conversion isn't too complicated but the followings are worth
mentioning.
* bdi_writeback->last_active, task and wakeup_timer are removed.
delayed_work ->dwork is added instead. Explicit timer handling is
no longer necessary. Everything works by either queueing / modding
/ flushing / canceling the delayed_work item.
* bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off
bdi_writeback->dwork. On each execution, it processes
bdi->work_list and reschedules itself if there are more things to
do.
The function also handles low-mem condition, which used to be
handled by the forker thread. If the function is running off a
rescuer thread, it only writes out limited number of pages so that
the rescuer can serve other bdis too. This preserves the flusher
creation failure behavior of the forker thread.
* INIT_LIST_HEAD(&bdi->bdi_list) is used to tell
bdi_writeback_workfn() about on-going bdi unregistration so that it
always drains work_list even if it's running off the rescuer. Note
that the original code was broken in this regard. Under memory
pressure, a bdi could finish unregistration with non-empty
work_list.
* The default bdi is no longer special. It now is treated the same as
any other bdi and bdi_cap_flush_forker() is removed.
* BDI_pending is no longer used. Removed.
* Some tracepoints become non-applicable. The following TPs are
removed - writeback_nothread, writeback_wake_thread,
writeback_wake_forker_thread, writeback_thread_start,
writeback_thread_stop.
Everything, including devices coming and going away and rescuer
operation under simulated memory pressure, seems to work fine in my
test setup.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Diffstat (limited to 'fs/fs-writeback.c')
-rw-r--r-- | fs/fs-writeback.c | 102 |
1 files changed, 32 insertions, 70 deletions
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> -#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. + */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. |