From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:26 -0400 Subject: writeback: move backing_dev_info->state into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->state into wb. * enum bdi_state is renamed to wb_state and the prefix of all enums is changed from BDI_ to WB_. * Explicit zeroing of bdi->state is removed without adding zeoring of wb->state as the whole data structure is zeroed on init anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: drbd-dev@lists.linbit.com Cc: Neil Brown Cc: Alasdair Kergon Cc: Mike Snitzer Signed-off-by: Jens Axboe --- mm/backing-dev.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 6dc4580df2af..b23cf0ea5912 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -96,7 +96,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_io, nr_more_io, nr_dirty_time, - !list_empty(&bdi->bdi_list), bdi->state); + !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; @@ -280,7 +280,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) + if (test_bit(WB_registered, &bdi->wb.state)) queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); spin_unlock_bh(&bdi->wb_lock); } @@ -315,7 +315,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); - set_bit(BDI_registered, &bdi->state); + set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); @@ -339,7 +339,7 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { /* Make sure nobody queues further work */ spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(BDI_registered, &bdi->state)) { + if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) { spin_unlock_bh(&bdi->wb_lock); return; } @@ -492,11 +492,11 @@ static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { - enum bdi_state bit; + enum wb_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (test_and_clear_bit(bit, &bdi->state)) + bit = sync ? WB_sync_congested : WB_async_congested; + if (test_and_clear_bit(bit, &bdi->wb.state)) atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) @@ -506,10 +506,10 @@ EXPORT_SYMBOL(clear_bdi_congested); void set_bdi_congested(struct backing_dev_info *bdi, int sync) { - enum bdi_state bit; + enum wb_state bit; - bit = sync ? BDI_sync_congested : BDI_async_congested; - if (!test_and_set_bit(bit, &bdi->state)) + bit = sync ? WB_sync_congested : WB_async_congested; + if (!test_and_set_bit(bit, &bdi->wb.state)) atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); -- cgit v1.2.3 From 93f78d882865cb90020d0f80a9523c99cf46924c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:27 -0400 Subject: writeback: move backing_dev_info->bdi_stat[] into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->bdi_stat[] into wb. * enum bdi_stat_item is renamed to wb_stat_item and the prefix of all enums is changed from BDI_ to WB_. * BDI_STAT_BATCH() -> WB_STAT_BATCH() * [__]{add|inc|dec|sum}_wb_stat(bdi, ...) -> [__]{add|inc}_wb_stat(wb, ...) * bdi_stat[_error]() -> wb_stat[_error]() * bdi_writeout_inc() -> wb_writeout_inc() * stat init is moved to bdi_wb_init() and bdi_wb_exit() is added and frees stat. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- fs/fuse/file.c | 12 ++++---- fs/nfs/internal.h | 2 +- fs/nfs/write.c | 3 +- include/linux/backing-dev.h | 68 +++++++++++++++++++++------------------------ mm/backing-dev.c | 60 ++++++++++++++++++++++++--------------- mm/page-writeback.c | 55 ++++++++++++++++++------------------ 7 files changed, 106 insertions(+), 96 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 983312cea245..8873ecd1578c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -840,7 +840,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi) global_page_state(NR_UNSTABLE_NFS) > background_thresh) return true; - if (bdi_stat(bdi, BDI_RECLAIMABLE) > + if (wb_stat(&bdi->wb, WB_RECLAIMABLE) > bdi_dirty_limit(bdi, background_thresh)) return true; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ef05b5c4cff..8c5e2fa68835 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } @@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); goto out; @@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9e6475bc5ba2..7e3c4604bea8 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page) struct inode *inode = page_file_mapping(page)->host; inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d12a4be613a5..94c7ce01dfb1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -853,7 +853,8 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, + WB_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index eb14f988a63e..fe7a907a4e16 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -36,15 +36,15 @@ enum wb_state { typedef int (congested_fn)(void *, int); -enum bdi_stat_item { - BDI_RECLAIMABLE, - BDI_WRITEBACK, - BDI_DIRTIED, - BDI_WRITTEN, - NR_BDI_STAT_ITEMS +enum wb_stat_item { + WB_RECLAIMABLE, + WB_WRITEBACK, + WB_DIRTIED, + WB_WRITTEN, + NR_WB_STAT_ITEMS }; -#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -58,6 +58,8 @@ struct bdi_writeback { struct list_head b_more_io; /* parked for more writeback */ struct list_head b_dirty_time; /* time stamps are dirty */ spinlock_t list_lock; /* protects the b_* lists */ + + struct percpu_counter stat[NR_WB_STAT_ITEMS]; }; struct backing_dev_info { @@ -69,8 +71,6 @@ struct backing_dev_info { char *name; - struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS]; - unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -137,78 +137,74 @@ static inline int wb_has_dirty_io(struct bdi_writeback *wb) !list_empty(&wb->b_more_io); } -static inline void __add_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item, s64 amount) +static inline void __add_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item, s64 amount) { - __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH); + __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH); } -static inline void __inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __inc_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, 1); + __add_wb_stat(wb, item, 1); } -static inline void inc_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __inc_bdi_stat(bdi, item); + __inc_wb_stat(wb, item); local_irq_restore(flags); } -static inline void __dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void __dec_wb_stat(struct bdi_writeback *wb, + enum wb_stat_item item) { - __add_bdi_stat(bdi, item, -1); + __add_wb_stat(wb, item, -1); } -static inline void dec_bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { unsigned long flags; local_irq_save(flags); - __dec_bdi_stat(bdi, item); + __dec_wb_stat(wb, item); local_irq_restore(flags); } -static inline s64 bdi_stat(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item) { - return percpu_counter_read_positive(&bdi->bdi_stat[item]); + return percpu_counter_read_positive(&wb->stat[item]); } -static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 __wb_stat_sum(struct bdi_writeback *wb, + enum wb_stat_item item) { - return percpu_counter_sum_positive(&bdi->bdi_stat[item]); + return percpu_counter_sum_positive(&wb->stat[item]); } -static inline s64 bdi_stat_sum(struct backing_dev_info *bdi, - enum bdi_stat_item item) +static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item) { s64 sum; unsigned long flags; local_irq_save(flags); - sum = __bdi_stat_sum(bdi, item); + sum = __wb_stat_sum(wb, item); local_irq_restore(flags); return sum; } -extern void bdi_writeout_inc(struct backing_dev_info *bdi); +extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi) +static inline unsigned long wb_stat_error(struct bdi_writeback *wb) { #ifdef CONFIG_SMP - return nr_cpu_ids * BDI_STAT_BATCH; + return nr_cpu_ids * WB_STAT_BATCH; #else return 1; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b23cf0ea5912..7b1d1917b658 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -84,13 +84,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", - (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), - (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), + (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), + (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), K(bdi_thresh), K(dirty_thresh), K(background_thresh), - (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)), - (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)), + (unsigned long) K(wb_stat(wb, WB_DIRTIED)), + (unsigned long) K(wb_stat(wb, WB_WRITTEN)), (unsigned long) K(bdi->write_bandwidth), nr_dirty, nr_io, @@ -376,8 +376,10 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); -static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) { + int i, err; + memset(wb, 0, sizeof(*wb)); wb->bdi = bdi; @@ -388,6 +390,27 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { + err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL); + if (err) { + while (--i) + percpu_counter_destroy(&wb->stat[i]); + return err; + } + } + + return 0; +} + +static void bdi_wb_exit(struct bdi_writeback *wb) +{ + int i; + + WARN_ON(delayed_work_pending(&wb->dwork)); + + for (i = 0; i < NR_WB_STAT_ITEMS; i++) + percpu_counter_destroy(&wb->stat[i]); } /* @@ -397,7 +420,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) int bdi_init(struct backing_dev_info *bdi) { - int i, err; + int err; bdi->dev = NULL; @@ -408,13 +431,9 @@ int bdi_init(struct backing_dev_info *bdi) INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->work_list); - bdi_wb_init(&bdi->wb, bdi); - - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { - err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); - if (err) - goto err; - } + err = bdi_wb_init(&bdi->wb, bdi); + if (err) + return err; bdi->dirty_exceeded = 0; @@ -427,25 +446,20 @@ int bdi_init(struct backing_dev_info *bdi) bdi->avg_write_bandwidth = INIT_BW; err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); - if (err) { -err: - while (i--) - percpu_counter_destroy(&bdi->bdi_stat[i]); + bdi_wb_exit(&bdi->wb); + return err; } - return err; + return 0; } EXPORT_SYMBOL(bdi_init); void bdi_destroy(struct backing_dev_info *bdi) { - int i; - bdi_wb_shutdown(bdi); WARN_ON(!list_empty(&bdi->work_list)); - WARN_ON(delayed_work_pending(&bdi->wb.dwork)); if (bdi->dev) { bdi_debug_unregister(bdi); @@ -453,8 +467,8 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi->dev = NULL; } - for (i = 0; i < NR_BDI_STAT_ITEMS; i++) - percpu_counter_destroy(&bdi->bdi_stat[i]); + bdi_wb_exit(&bdi->wb); + fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bdeecad00489..dc673a035413 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -396,11 +396,11 @@ static unsigned long wp_next_time(unsigned long cur_time) * Increment the BDI's writeout completion count and the global writeout * completion count. Called from test_clear_page_writeback(). */ -static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) +static inline void __wb_writeout_inc(struct bdi_writeback *wb) { - __inc_bdi_stat(bdi, BDI_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &bdi->completions, - bdi->max_prop_frac); + __inc_wb_stat(wb, WB_WRITTEN); + __fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions, + wb->bdi->max_prop_frac); /* First event after period switching was turned off? */ if (!unlikely(writeout_period_time)) { /* @@ -414,15 +414,15 @@ static inline void __bdi_writeout_inc(struct backing_dev_info *bdi) } } -void bdi_writeout_inc(struct backing_dev_info *bdi) +void wb_writeout_inc(struct bdi_writeback *wb) { unsigned long flags; local_irq_save(flags); - __bdi_writeout_inc(bdi); + __wb_writeout_inc(wb); local_irq_restore(flags); } -EXPORT_SYMBOL_GPL(bdi_writeout_inc); +EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * Obtain an accurate fraction of the BDI's portion. @@ -1130,8 +1130,8 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed < BANDWIDTH_INTERVAL) return; - dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]); - written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]); + dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]); + written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]); /* * Skip quiet periods when disk bandwidth is under-utilized. @@ -1288,7 +1288,8 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, unsigned long *bdi_thresh, unsigned long *bdi_bg_thresh) { - unsigned long bdi_reclaimable; + struct bdi_writeback *wb = &bdi->wb; + unsigned long wb_reclaimable; /* * bdi_thresh is not treated as some limiting factor as @@ -1320,14 +1321,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * actually dirty; with m+n sitting in the percpu * deltas. */ - if (*bdi_thresh < 2 * bdi_stat_error(bdi)) { - bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat_sum(bdi, BDI_WRITEBACK); + if (*bdi_thresh < 2 * wb_stat_error(wb)) { + wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); + *bdi_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { - bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - *bdi_dirty = bdi_reclaimable + - bdi_stat(bdi, BDI_WRITEBACK); + wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE); + *bdi_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK); } } @@ -1514,9 +1513,9 @@ pause: * In theory 1 page is enough to keep the comsumer-producer * pipe going: the flusher cleans 1 page => the task dirties 1 * more page. However bdi_dirty has accounting errors. So use - * the larger and more IO friendly bdi_stat_error. + * the larger and more IO friendly wb_stat_error. */ - if (bdi_dirty <= bdi_stat_error(bdi)) + if (bdi_dirty <= wb_stat_error(&bdi->wb)) break; if (fatal_signal_pending(current)) @@ -2106,8 +2105,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping, mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_bdi_stat(bdi, BDI_RECLAIMABLE); - __inc_bdi_stat(bdi, BDI_DIRTIED); + __inc_wb_stat(&bdi->wb, WB_RECLAIMABLE); + __inc_wb_stat(&bdi->wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); @@ -2126,7 +2125,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, if (mapping_cap_account_dirty(mapping)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(mapping->host)->wb, WB_RECLAIMABLE); task_io_account_cancelled_write(PAGE_CACHE_SIZE); } } @@ -2190,7 +2189,7 @@ void account_page_redirty(struct page *page) if (mapping && mapping_cap_account_dirty(mapping)) { current->nr_dirtied--; dec_zone_page_state(page, NR_DIRTIED); - dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED); + dec_wb_stat(&inode_to_bdi(mapping->host)->wb, WB_DIRTIED); } } EXPORT_SYMBOL(account_page_redirty); @@ -2369,8 +2368,8 @@ int clear_page_dirty_for_io(struct page *page) if (TestClearPageDirty(page)) { mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); dec_zone_page_state(page, NR_FILE_DIRTY); - dec_bdi_stat(inode_to_bdi(mapping->host), - BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(mapping->host)->wb, + WB_RECLAIMABLE); ret = 1; } mem_cgroup_end_page_stat(memcg); @@ -2398,8 +2397,8 @@ int test_clear_page_writeback(struct page *page) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) { - __dec_bdi_stat(bdi, BDI_WRITEBACK); - __bdi_writeout_inc(bdi); + __dec_wb_stat(&bdi->wb, WB_WRITEBACK); + __wb_writeout_inc(&bdi->wb); } } spin_unlock_irqrestore(&mapping->tree_lock, flags); @@ -2433,7 +2432,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write) page_index(page), PAGECACHE_TAG_WRITEBACK); if (bdi_cap_account_writeback(bdi)) - __inc_bdi_stat(bdi, BDI_WRITEBACK); + __inc_wb_stat(&bdi->wb, WB_WRITEBACK); } if (!PageDirty(page)) radix_tree_tag_clear(&mapping->page_tree, -- cgit v1.2.3 From a88a341a73be4ef035ca26170c849f002797da27 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:28 -0400 Subject: writeback: move bandwidth related fields from backing_dev_info into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bandwidth related fields from backing_dev_info into bdi_writeback. * The moved fields are: bw_time_stamp, dirtied_stamp, written_stamp, write_bandwidth, avg_write_bandwidth, dirty_ratelimit, balanced_dirty_ratelimit, completions and dirty_exceeded. * writeback_chunk_size() and over_bground_thresh() now take @wb instead of @bdi. * bdi_writeout_fraction(bdi, ...) -> wb_writeout_fraction(wb, ...) bdi_dirty_limit(bdi, ...) -> wb_dirty_limit(wb, ...) bdi_position_ration(bdi, ...) -> wb_position_ratio(wb, ...) bdi_update_writebandwidth(bdi, ...) -> wb_update_write_bandwidth(wb, ...) [__]bdi_update_bandwidth(bdi, ...) -> [__]wb_update_bandwidth(wb, ...) bdi_{max|min}_pause(bdi, ...) -> wb_{max|min}_pause(wb, ...) bdi_dirty_limits(bdi, ...) -> wb_dirty_limits(wb, ...) * Init/exits of the relocated fields are moved to bdi_wb_init/exit() respectively. Note that explicit zeroing is dropped in the process as wb's are cleared in entirety anyway. * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->stat[] are mechanically replaced with bdi->wb.stat[] introducing no behavior changes. v2: Typo in description fixed as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Cc: Jaegeuk Kim Cc: Steven Whitehouse Signed-off-by: Jens Axboe --- fs/f2fs/node.c | 4 +- fs/f2fs/segment.h | 2 +- fs/fs-writeback.c | 17 ++- fs/gfs2/super.c | 2 +- include/linux/backing-dev.h | 20 +-- include/linux/writeback.h | 19 ++- include/trace/events/writeback.h | 8 +- mm/backing-dev.c | 45 +++---- mm/page-writeback.c | 262 ++++++++++++++++++++------------------- 9 files changed, 187 insertions(+), 192 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ab0cf1930bd..d211602e0f86 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; } return res; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 85d7fa7514b2..6408989857e0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -713,7 +713,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8873ecd1578c..1945cb9f6b97 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -624,7 +624,7 @@ out: return ret; } -static long writeback_chunk_size(struct backing_dev_info *bdi, +static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -645,7 +645,7 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { - pages = min(bdi->avg_write_bandwidth / 2, + pages = min(wb->avg_write_bandwidth / 2, global_dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, @@ -743,7 +743,7 @@ static long writeback_sb_inodes(struct super_block *sb, inode->i_state |= I_SYNC; spin_unlock(&inode->i_lock); - write_chunk = writeback_chunk_size(wb->bdi, work); + write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -830,7 +830,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct backing_dev_info *bdi) +static bool over_bground_thresh(struct bdi_writeback *wb) { unsigned long background_thresh, dirty_thresh; @@ -840,8 +840,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi) global_page_state(NR_UNSTABLE_NFS) > background_thresh) return true; - if (wb_stat(&bdi->wb, WB_RECLAIMABLE) > - bdi_dirty_limit(bdi, background_thresh)) + if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh)) return true; return false; @@ -854,7 +853,7 @@ static bool over_bground_thresh(struct backing_dev_info *bdi) static void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time) { - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); + __wb_update_bandwidth(wb, 0, 0, 0, 0, 0, start_time); } /* @@ -906,7 +905,7 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) + if (work->for_background && !over_bground_thresh(wb)) break; /* @@ -998,7 +997,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (over_bground_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 859c6edbf81a..2982445947e1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); - if (bdi->dirty_exceeded) + if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index fe7a907a4e16..2ab06049d812 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -60,16 +60,6 @@ struct bdi_writeback { spinlock_t list_lock; /* protects the b_* lists */ struct percpu_counter stat[NR_WB_STAT_ITEMS]; -}; - -struct backing_dev_info { - struct list_head bdi_list; - unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ - unsigned int capabilities; /* Device capabilities */ - congested_fn *congested_fn; /* Function pointer if device is md/dm */ - void *congested_data; /* Pointer to aux data for congested func */ - - char *name; unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; @@ -88,6 +78,16 @@ struct backing_dev_info { struct fprop_local_percpu completions; int dirty_exceeded; +}; + +struct backing_dev_info { + struct list_head bdi_list; + unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */ + unsigned int capabilities; /* Device capabilities */ + congested_fn *congested_fn; /* Function pointer if device is md/dm */ + void *congested_data; /* Pointer to aux data for congested func */ + + char *name; unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b2dd371ec0ca..a6b9db7fcee8 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,16 +155,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty); - -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time); +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); + +void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time); void page_writeback_init(void); void balance_dirty_pages_ratelimited(struct address_space *mapping); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 880dd7437172..9b876f6cc81a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -400,13 +400,13 @@ TRACE_EVENT(bdi_dirty_ratelimit, TP_fast_assign( strlcpy(__entry->bdi, dev_name(bdi->dev), 32); - __entry->write_bw = KBps(bdi->write_bandwidth); - __entry->avg_write_bw = KBps(bdi->avg_write_bandwidth); + __entry->write_bw = KBps(bdi->wb.write_bandwidth); + __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); - __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit); + __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = - KBps(bdi->balanced_dirty_ratelimit); + KBps(bdi->wb.balanced_dirty_ratelimit); ), TP_printk("bdi %s: " diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7b1d1917b658..9a6c472ebd0b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -66,7 +66,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + bdi_thresh = wb_dirty_limit(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -91,7 +91,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) K(background_thresh), (unsigned long) K(wb_stat(wb, WB_DIRTIED)), (unsigned long) K(wb_stat(wb, WB_WRITTEN)), - (unsigned long) K(bdi->write_bandwidth), + (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, @@ -376,6 +376,11 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +/* + * Initial write bandwidth: 100 MB/s + */ +#define INIT_BW (100 << (20 - PAGE_SHIFT)) + static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) { int i, err; @@ -391,11 +396,22 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) spin_lock_init(&wb->list_lock); INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); + wb->bw_time_stamp = jiffies; + wb->balanced_dirty_ratelimit = INIT_BW; + wb->dirty_ratelimit = INIT_BW; + wb->write_bandwidth = INIT_BW; + wb->avg_write_bandwidth = INIT_BW; + + err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL); + if (err) + return err; + for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL); if (err) { while (--i) percpu_counter_destroy(&wb->stat[i]); + fprop_local_destroy_percpu(&wb->completions); return err; } } @@ -411,12 +427,9 @@ static void bdi_wb_exit(struct bdi_writeback *wb) for (i = 0; i < NR_WB_STAT_ITEMS; i++) percpu_counter_destroy(&wb->stat[i]); -} -/* - * Initial write bandwidth: 100 MB/s - */ -#define INIT_BW (100 << (20 - PAGE_SHIFT)) + fprop_local_destroy_percpu(&wb->completions); +} int bdi_init(struct backing_dev_info *bdi) { @@ -435,22 +448,6 @@ int bdi_init(struct backing_dev_info *bdi) if (err) return err; - bdi->dirty_exceeded = 0; - - bdi->bw_time_stamp = jiffies; - bdi->written_stamp = 0; - - bdi->balanced_dirty_ratelimit = INIT_BW; - bdi->dirty_ratelimit = INIT_BW; - bdi->write_bandwidth = INIT_BW; - bdi->avg_write_bandwidth = INIT_BW; - - err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); - if (err) { - bdi_wb_exit(&bdi->wb); - return err; - } - return 0; } EXPORT_SYMBOL(bdi_init); @@ -468,8 +465,6 @@ void bdi_destroy(struct backing_dev_info *bdi) } bdi_wb_exit(&bdi->wb); - - fprop_local_destroy_percpu(&bdi->completions); } EXPORT_SYMBOL(bdi_destroy); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index dc673a035413..cd39ee91b7bb 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -399,7 +399,7 @@ static unsigned long wp_next_time(unsigned long cur_time) static inline void __wb_writeout_inc(struct bdi_writeback *wb) { __inc_wb_stat(wb, WB_WRITTEN); - __fprop_inc_percpu_max(&writeout_completions, &wb->bdi->completions, + __fprop_inc_percpu_max(&writeout_completions, &wb->completions, wb->bdi->max_prop_frac); /* First event after period switching was turned off? */ if (!unlikely(writeout_period_time)) { @@ -427,10 +427,10 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); /* * Obtain an accurate fraction of the BDI's portion. */ -static void bdi_writeout_fraction(struct backing_dev_info *bdi, - long *numerator, long *denominator) +static void wb_writeout_fraction(struct bdi_writeback *wb, + long *numerator, long *denominator) { - fprop_fraction_percpu(&writeout_completions, &bdi->completions, + fprop_fraction_percpu(&writeout_completions, &wb->completions, numerator, denominator); } @@ -516,11 +516,11 @@ static unsigned long hard_dirty_limit(unsigned long thresh) } /** - * bdi_dirty_limit - @bdi's share of dirty throttling threshold - * @bdi: the backing_dev_info to query + * wb_dirty_limit - @wb's share of dirty throttling threshold + * @wb: bdi_writeback to query * @dirty: global dirty limit in pages * - * Returns @bdi's dirty limit in pages. The term "dirty" in the context of + * Returns @wb's dirty limit in pages. The term "dirty" in the context of * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. * * Note that balance_dirty_pages() will only seriously take it as a hard limit @@ -528,34 +528,35 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * control. For example, when the device is completely stalled due to some error * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key. * In the other normal situations, it acts more gently by throttling the tasks - * more (rather than completely block them) when the bdi dirty pages go high. + * more (rather than completely block them) when the wb dirty pages go high. * * It allocates high/low dirty limits to fast/slow devices, in order to prevent * - starving fast devices * - piling up dirty pages (that will take long time to sync) on slow devices * - * The bdi's share of dirty limit will be adapting to its throughput and + * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ -unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty) { - u64 bdi_dirty; + struct backing_dev_info *bdi = wb->bdi; + u64 wb_dirty; long numerator, denominator; /* * Calculate this BDI's share of the dirty ratio. */ - bdi_writeout_fraction(bdi, &numerator, &denominator); + wb_writeout_fraction(wb, &numerator, &denominator); - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); + wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + wb_dirty *= numerator; + do_div(wb_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; + wb_dirty += (dirty * bdi->min_ratio) / 100; + if (wb_dirty > (dirty * bdi->max_ratio) / 100) + wb_dirty = dirty * bdi->max_ratio / 100; - return bdi_dirty; + return wb_dirty; } /* @@ -664,14 +665,14 @@ static long long pos_ratio_polynom(unsigned long setpoint, * card's bdi_dirty may rush to many times higher than bdi_setpoint. * - the bdi dirty thresh drops quickly due to change of JBOD workload */ -static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty) +static unsigned long wb_position_ratio(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty) { - unsigned long write_bw = bdi->avg_write_bandwidth; + unsigned long write_bw = wb->avg_write_bandwidth; unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); unsigned long limit = hard_dirty_limit(thresh); unsigned long x_intercept; @@ -702,12 +703,12 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * consume arbitrary amount of RAM because it is accounted in * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty". * - * Here, in bdi_position_ratio(), we calculate pos_ratio based on + * Here, in wb_position_ratio(), we calculate pos_ratio based on * two values: bdi_dirty and bdi_thresh. Let's consider an example: * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is + * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. bdi_setpoint is * about ~6K pages (as the average of background and throttle bdi * limits). The 3rd order polynomial will provide positive feedback if * bdi_dirty is under bdi_setpoint and vice versa. @@ -717,7 +718,7 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB * in the example above). */ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { long long bdi_pos_ratio; unsigned long bdi_bg_thresh; @@ -842,13 +843,13 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi, return pos_ratio; } -static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, - unsigned long elapsed, - unsigned long written) +static void wb_update_write_bandwidth(struct bdi_writeback *wb, + unsigned long elapsed, + unsigned long written) { const unsigned long period = roundup_pow_of_two(3 * HZ); - unsigned long avg = bdi->avg_write_bandwidth; - unsigned long old = bdi->write_bandwidth; + unsigned long avg = wb->avg_write_bandwidth; + unsigned long old = wb->write_bandwidth; u64 bw; /* @@ -861,14 +862,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, * @written may have decreased due to account_page_redirty(). * Avoid underflowing @bw calculation. */ - bw = written - min(written, bdi->written_stamp); + bw = written - min(written, wb->written_stamp); bw *= HZ; if (unlikely(elapsed > period)) { do_div(bw, elapsed); avg = bw; goto out; } - bw += (u64)bdi->write_bandwidth * (period - elapsed); + bw += (u64)wb->write_bandwidth * (period - elapsed); bw >>= ilog2(period); /* @@ -881,8 +882,8 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi, avg += (old - avg) >> 3; out: - bdi->write_bandwidth = bw; - bdi->avg_write_bandwidth = avg; + wb->write_bandwidth = bw; + wb->avg_write_bandwidth = avg; } /* @@ -947,20 +948,20 @@ static void global_update_bandwidth(unsigned long thresh, * Normal bdi tasks will be curbed at or below it in long term. * Obviously it should be around (write_bw / N) when there are N dd tasks. */ -static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long dirtied, - unsigned long elapsed) +static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long dirtied, + unsigned long elapsed) { unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh); unsigned long limit = hard_dirty_limit(thresh); unsigned long setpoint = (freerun + limit) / 2; - unsigned long write_bw = bdi->avg_write_bandwidth; - unsigned long dirty_ratelimit = bdi->dirty_ratelimit; + unsigned long write_bw = wb->avg_write_bandwidth; + unsigned long dirty_ratelimit = wb->dirty_ratelimit; unsigned long dirty_rate; unsigned long task_ratelimit; unsigned long balanced_dirty_ratelimit; @@ -972,10 +973,10 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, * The dirty rate will match the writeout rate in long term, except * when dirty pages are truncated by userspace or re-dirtied by FS. */ - dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed; + dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed; - pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty); + pos_ratio = wb_position_ratio(wb, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty); /* * task_ratelimit reflects each dd's dirty rate for the past 200ms. */ @@ -1059,31 +1060,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, /* * For strictlimit case, calculations above were based on bdi counters - * and limits (starting from pos_ratio = bdi_position_ratio() and up to + * and limits (starting from pos_ratio = wb_position_ratio() and up to * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate). * Hence, to calculate "step" properly, we have to use bdi_dirty as * "dirty" and bdi_setpoint as "setpoint". * * We rampup dirty_ratelimit forcibly if bdi_dirty is low because * it's possible that bdi_thresh is close to zero due to inactivity - * of backing device (see the implementation of bdi_dirty_limit()). + * of backing device (see the implementation of wb_dirty_limit()). */ - if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = bdi_dirty; if (bdi_dirty < 8) setpoint = bdi_dirty + 1; else setpoint = (bdi_thresh + - bdi_dirty_limit(bdi, bg_thresh)) / 2; + wb_dirty_limit(wb, bg_thresh)) / 2; } if (dirty < setpoint) { - x = min3(bdi->balanced_dirty_ratelimit, + x = min3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit < x) step = x - dirty_ratelimit; } else { - x = max3(bdi->balanced_dirty_ratelimit, + x = max3(wb->balanced_dirty_ratelimit, balanced_dirty_ratelimit, task_ratelimit); if (dirty_ratelimit > x) step = dirty_ratelimit - x; @@ -1105,22 +1106,22 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi, else dirty_ratelimit -= step; - bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL); - bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit; + wb->dirty_ratelimit = max(dirty_ratelimit, 1UL); + wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit; - trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit); + trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit); } -void __bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +void __wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) { unsigned long now = jiffies; - unsigned long elapsed = now - bdi->bw_time_stamp; + unsigned long elapsed = now - wb->bw_time_stamp; unsigned long dirtied; unsigned long written; @@ -1130,44 +1131,44 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi, if (elapsed < BANDWIDTH_INTERVAL) return; - dirtied = percpu_counter_read(&bdi->wb.stat[WB_DIRTIED]); - written = percpu_counter_read(&bdi->wb.stat[WB_WRITTEN]); + dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]); + written = percpu_counter_read(&wb->stat[WB_WRITTEN]); /* * Skip quiet periods when disk bandwidth is under-utilized. * (at least 1s idle time between two flusher runs) */ - if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time)) + if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time)) goto snapshot; if (thresh) { global_update_bandwidth(thresh, dirty, now); - bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, - dirtied, elapsed); + wb_update_dirty_ratelimit(wb, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, + dirtied, elapsed); } - bdi_update_write_bandwidth(bdi, elapsed, written); + wb_update_write_bandwidth(wb, elapsed, written); snapshot: - bdi->dirtied_stamp = dirtied; - bdi->written_stamp = written; - bdi->bw_time_stamp = now; + wb->dirtied_stamp = dirtied; + wb->written_stamp = written; + wb->bw_time_stamp = now; } -static void bdi_update_bandwidth(struct backing_dev_info *bdi, - unsigned long thresh, - unsigned long bg_thresh, - unsigned long dirty, - unsigned long bdi_thresh, - unsigned long bdi_dirty, - unsigned long start_time) +static void wb_update_bandwidth(struct bdi_writeback *wb, + unsigned long thresh, + unsigned long bg_thresh, + unsigned long dirty, + unsigned long bdi_thresh, + unsigned long bdi_dirty, + unsigned long start_time) { - if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL)) + if (time_is_after_eq_jiffies(wb->bw_time_stamp + BANDWIDTH_INTERVAL)) return; - spin_lock(&bdi->wb.list_lock); - __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty, - bdi_thresh, bdi_dirty, start_time); - spin_unlock(&bdi->wb.list_lock); + spin_lock(&wb->list_lock); + __wb_update_bandwidth(wb, thresh, bg_thresh, dirty, + bdi_thresh, bdi_dirty, start_time); + spin_unlock(&wb->list_lock); } /* @@ -1187,10 +1188,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty, return 1; } -static unsigned long bdi_max_pause(struct backing_dev_info *bdi, - unsigned long bdi_dirty) +static unsigned long wb_max_pause(struct bdi_writeback *wb, + unsigned long bdi_dirty) { - unsigned long bw = bdi->avg_write_bandwidth; + unsigned long bw = wb->avg_write_bandwidth; unsigned long t; /* @@ -1206,14 +1207,14 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi, return min_t(unsigned long, t, MAX_PAUSE); } -static long bdi_min_pause(struct backing_dev_info *bdi, - long max_pause, - unsigned long task_ratelimit, - unsigned long dirty_ratelimit, - int *nr_dirtied_pause) +static long wb_min_pause(struct bdi_writeback *wb, + long max_pause, + unsigned long task_ratelimit, + unsigned long dirty_ratelimit, + int *nr_dirtied_pause) { - long hi = ilog2(bdi->avg_write_bandwidth); - long lo = ilog2(bdi->dirty_ratelimit); + long hi = ilog2(wb->avg_write_bandwidth); + long lo = ilog2(wb->dirty_ratelimit); long t; /* target pause */ long pause; /* estimated next pause */ int pages; /* target nr_dirtied_pause */ @@ -1281,14 +1282,13 @@ static long bdi_min_pause(struct backing_dev_info *bdi, return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t; } -static inline void bdi_dirty_limits(struct backing_dev_info *bdi, - unsigned long dirty_thresh, - unsigned long background_thresh, - unsigned long *bdi_dirty, - unsigned long *bdi_thresh, - unsigned long *bdi_bg_thresh) +static inline void wb_dirty_limits(struct bdi_writeback *wb, + unsigned long dirty_thresh, + unsigned long background_thresh, + unsigned long *bdi_dirty, + unsigned long *bdi_thresh, + unsigned long *bdi_bg_thresh) { - struct bdi_writeback *wb = &bdi->wb; unsigned long wb_reclaimable; /* @@ -1301,10 +1301,10 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, * In this case we don't want to hard throttle the USB key * dirtiers for 100 seconds until bdi_dirty drops under * bdi_thresh. Instead the auxiliary bdi control line in - * bdi_position_ratio() will let the dirtier task progress + * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down bdi_dirty. */ - *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + *bdi_thresh = wb_dirty_limit(wb, dirty_thresh); if (bdi_bg_thresh) *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * @@ -1354,6 +1354,7 @@ static void balance_dirty_pages(struct address_space *mapping, unsigned long dirty_ratelimit; unsigned long pos_ratio; struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb = &bdi->wb; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; unsigned long start_time = jiffies; @@ -1378,8 +1379,8 @@ static void balance_dirty_pages(struct address_space *mapping, global_dirty_limits(&background_thresh, &dirty_thresh); if (unlikely(strictlimit)) { - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, &bg_thresh); + wb_dirty_limits(wb, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, &bg_thresh); dirty = bdi_dirty; thresh = bdi_thresh; @@ -1410,28 +1411,28 @@ static void balance_dirty_pages(struct address_space *mapping, bdi_start_background_writeback(bdi); if (!strictlimit) - bdi_dirty_limits(bdi, dirty_thresh, background_thresh, - &bdi_dirty, &bdi_thresh, NULL); + wb_dirty_limits(wb, dirty_thresh, background_thresh, + &bdi_dirty, &bdi_thresh, NULL); dirty_exceeded = (bdi_dirty > bdi_thresh) && ((nr_dirty > dirty_thresh) || strictlimit); - if (dirty_exceeded && !bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; + if (dirty_exceeded && !wb->dirty_exceeded) + wb->dirty_exceeded = 1; - bdi_update_bandwidth(bdi, dirty_thresh, background_thresh, - nr_dirty, bdi_thresh, bdi_dirty, - start_time); + wb_update_bandwidth(wb, dirty_thresh, background_thresh, + nr_dirty, bdi_thresh, bdi_dirty, + start_time); - dirty_ratelimit = bdi->dirty_ratelimit; - pos_ratio = bdi_position_ratio(bdi, dirty_thresh, - background_thresh, nr_dirty, - bdi_thresh, bdi_dirty); + dirty_ratelimit = wb->dirty_ratelimit; + pos_ratio = wb_position_ratio(wb, dirty_thresh, + background_thresh, nr_dirty, + bdi_thresh, bdi_dirty); task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >> RATELIMIT_CALC_SHIFT; - max_pause = bdi_max_pause(bdi, bdi_dirty); - min_pause = bdi_min_pause(bdi, max_pause, - task_ratelimit, dirty_ratelimit, - &nr_dirtied_pause); + max_pause = wb_max_pause(wb, bdi_dirty); + min_pause = wb_min_pause(wb, max_pause, + task_ratelimit, dirty_ratelimit, + &nr_dirtied_pause); if (unlikely(task_ratelimit == 0)) { period = max_pause; @@ -1515,15 +1516,15 @@ pause: * more page. However bdi_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (bdi_dirty <= wb_stat_error(&bdi->wb)) + if (bdi_dirty <= wb_stat_error(wb)) break; if (fatal_signal_pending(current)) break; } - if (!dirty_exceeded && bdi->dirty_exceeded) - bdi->dirty_exceeded = 0; + if (!dirty_exceeded && wb->dirty_exceeded) + wb->dirty_exceeded = 0; if (writeback_in_progress(bdi)) return; @@ -1577,6 +1578,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb = &bdi->wb; int ratelimit; int *p; @@ -1584,7 +1586,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) return; ratelimit = current->nr_dirtied_pause; - if (bdi->dirty_exceeded) + if (wb->dirty_exceeded) ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10)); preempt_disable(); -- cgit v1.2.3 From f0054bb1e1f3be03cc33369df640db97f10f6172 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:30 -0400 Subject: writeback: move backing_dev_info->wb_lock and ->worklist into bdi_writeback Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback) and the role of the separation is unclear. For cgroup support for writeback IOs, a bdi will be updated to host multiple wb's where each wb serves writeback IOs of a different cgroup on the bdi. To achieve that, a wb should carry all states necessary for servicing writeback IOs for a cgroup independently. This patch moves bdi->wb_lock and ->worklist into wb. * The lock protects bdi->worklist and bdi->wb.dwork scheduling. While moving, rename it to wb->work_lock as wb->wb_lock is confusing. Also, move wb->dwork downwards so that it's colocated with the new ->work_lock and ->work_list fields. * bdi_writeback_workfn() -> wb_workfn() bdi_wakeup_thread_delayed(bdi) -> wb_wakeup_delayed(wb) bdi_wakeup_thread(bdi) -> wb_wakeup(wb) bdi_queue_work(bdi, ...) -> wb_queue_work(wb, ...) __bdi_start_writeback(bdi, ...) -> __wb_start_writeback(wb, ...) get_next_work_item(bdi) -> get_next_work_item(wb) * bdi_wb_shutdown() is renamed to wb_shutdown() and now takes @wb. The function contained parts which belong to the containing bdi rather than the wb itself - testing cap_writeback_dirty and bdi_remove_from_list() invocation. Those are moved to bdi_unregister(). * bdi_wb_{init|exit}() are renamed to wb_{init|exit}(). Initializations of the moved bdi->wb_lock and ->work_list are relocated from bdi_init() to wb_init(). * As there's still only one bdi_writeback per backing_dev_info, all uses of bdi->state are mechanically replaced with bdi->wb.state introducing no behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 86 +++++++++++++++++++++------------------------ include/linux/backing-dev.h | 12 +++---- mm/backing-dev.c | 59 +++++++++++++++---------------- 3 files changed, 75 insertions(+), 82 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 1945cb9f6b97..a69d2e10d514 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -109,34 +109,33 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); -static void bdi_wakeup_thread(struct backing_dev_info *bdi) +static void wb_wakeup(struct bdi_writeback *wb) { - spin_lock_bh(&bdi->wb_lock); - if (test_bit(WB_registered, &bdi->wb.state)) - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + spin_unlock_bh(&wb->work_lock); } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +static void wb_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) { - trace_writeback_queue(bdi, work); + trace_writeback_queue(wb->bdi, work); - spin_lock_bh(&bdi->wb_lock); - if (!test_bit(WB_registered, &bdi->wb.state)) { + spin_lock_bh(&wb->work_lock); + if (!test_bit(WB_registered, &wb->state)) { if (work->done) complete(work->done); goto out_unlock; } - list_add_tail(&work->list, &bdi->work_list); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); } -static void -__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, enum wb_reason reason) +static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason) { struct wb_writeback_work *work; @@ -146,8 +145,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_nowork(wb->bdi); + wb_wakeup(wb); return; } @@ -156,7 +155,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->range_cyclic = range_cyclic; work->reason = reason; - bdi_queue_work(bdi, work); + wb_queue_work(wb, work); } /** @@ -174,7 +173,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason) { - __bdi_start_writeback(bdi, nr_pages, true, reason); + __wb_start_writeback(&bdi->wb, nr_pages, true, reason); } /** @@ -194,7 +193,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - bdi_wakeup_thread(bdi); + wb_wakeup(&bdi->wb); } /* @@ -898,7 +897,7 @@ static long wb_writeback(struct bdi_writeback *wb, * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && - !list_empty(&wb->bdi->work_list)) + !list_empty(&wb->work_list)) break; /* @@ -969,18 +968,17 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi) +static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&bdi->wb_lock); - if (!list_empty(&bdi->work_list)) { - work = list_entry(bdi->work_list.next, + spin_lock_bh(&wb->work_lock); + if (!list_empty(&wb->work_list)) { + work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); return work; } @@ -1052,14 +1050,13 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) */ static long wb_do_writeback(struct bdi_writeback *wb) { - struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); - while ((work = get_next_work_item(bdi)) != NULL) { + while ((work = get_next_work_item(wb)) != NULL) { - trace_writeback_exec(bdi, work); + trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); @@ -1087,43 +1084,42 @@ static long wb_do_writeback(struct bdi_writeback *wb) * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ -void bdi_writeback_workfn(struct work_struct *work) +void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); - struct backing_dev_info *bdi = wb->bdi; long pages_written; - set_worker_desc("flush-%s", dev_name(bdi->dev)); + set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { /* - * The normal path. Keep writing back @bdi until its + * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken - * if @bdi is shutting down even when we're running off the + * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); - } while (!list_empty(&bdi->work_list)); + } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ - pages_written = writeback_inodes_wb(&bdi->wb, 1024, + pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list)) + if (!list_empty(&wb->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } @@ -1143,7 +1139,7 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, reason); + __wb_start_writeback(&bdi->wb, nr_pages, false, reason); } rcu_read_unlock(); } @@ -1174,7 +1170,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { if (list_empty(&bdi->wb.b_dirty_time)) continue; - bdi_wakeup_thread(bdi); + wb_wakeup(&bdi->wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -1347,7 +1343,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) trace_writeback_dirty_inode_enqueue(inode); if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(&bdi->wb); return; } } @@ -1437,7 +1433,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, if (sb->s_bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); + wb_queue_work(&sb->s_bdi->wb, &work); wait_for_completion(&done); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1521,7 +1517,7 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); + wb_queue_work(&sb->s_bdi->wb, &work); wait_for_completion(&done); wait_sb_inodes(sb); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 2ab06049d812..d796f49ce87a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -52,7 +52,6 @@ struct bdi_writeback { unsigned long state; /* Always use atomic bitops on this */ unsigned long last_old_flush; /* last old data flush */ - struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -78,6 +77,10 @@ struct bdi_writeback { struct fprop_local_percpu completions; int dirty_exceeded; + + spinlock_t work_lock; /* protects work_list & dwork scheduling */ + struct list_head work_list; + struct delayed_work dwork; /* work item used for writeback */ }; struct backing_dev_info { @@ -93,9 +96,6 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ - spinlock_t wb_lock; /* protects work_list & wb.dwork scheduling */ - - struct list_head work_list; struct device *dev; @@ -121,9 +121,9 @@ int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -void bdi_writeback_workfn(struct work_struct *work); +void wb_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); +void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; extern struct list_head bdi_list; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9a6c472ebd0b..597f0cec8405 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -261,7 +261,7 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) } /* - * This function is used when the first inode for this bdi is marked dirty. It + * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just @@ -274,15 +274,15 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). */ -void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - spin_lock_bh(&bdi->wb_lock); - if (test_bit(WB_registered, &bdi->wb.state)) - queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + queue_delayed_work(bdi_wq, &wb->dwork, timeout); + spin_unlock_bh(&wb->work_lock); } /* @@ -335,28 +335,24 @@ EXPORT_SYMBOL(bdi_register_dev); /* * Remove bdi from the global list and shutdown any threads we have running */ -static void bdi_wb_shutdown(struct backing_dev_info *bdi) +static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ - spin_lock_bh(&bdi->wb_lock); - if (!test_and_clear_bit(WB_registered, &bdi->wb.state)) { - spin_unlock_bh(&bdi->wb_lock); + spin_lock_bh(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_bh(&wb->work_lock); return; } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); /* - * Make sure nobody finds us on the bdi_list anymore + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. */ - bdi_remove_from_list(bdi); - - /* - * Drain work list and shutdown the delayed_work. At this point, - * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi - * is dying and its work_list needs to be drained no matter what. - */ - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - flush_delayed_work(&bdi->wb.dwork); + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); } /* @@ -381,7 +377,7 @@ EXPORT_SYMBOL(bdi_unregister); */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) -static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) { int i, err; @@ -394,7 +390,6 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); - INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; @@ -402,6 +397,10 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; + spin_lock_init(&wb->work_lock); + INIT_LIST_HEAD(&wb->work_list); + INIT_DELAYED_WORK(&wb->dwork, wb_workfn); + err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL); if (err) return err; @@ -419,7 +418,7 @@ static int bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) return 0; } -static void bdi_wb_exit(struct bdi_writeback *wb) +static void wb_exit(struct bdi_writeback *wb) { int i; @@ -440,11 +439,9 @@ int bdi_init(struct backing_dev_info *bdi) bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; - spin_lock_init(&bdi->wb_lock); INIT_LIST_HEAD(&bdi->bdi_list); - INIT_LIST_HEAD(&bdi->work_list); - err = bdi_wb_init(&bdi->wb, bdi); + err = wb_init(&bdi->wb, bdi); if (err) return err; @@ -454,9 +451,9 @@ EXPORT_SYMBOL(bdi_init); void bdi_destroy(struct backing_dev_info *bdi) { - bdi_wb_shutdown(bdi); - - WARN_ON(!list_empty(&bdi->work_list)); + /* make sure nobody finds us on the bdi_list anymore */ + bdi_remove_from_list(bdi); + wb_shutdown(&bdi->wb); if (bdi->dev) { bdi_debug_unregister(bdi); @@ -464,7 +461,7 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi->dev = NULL; } - bdi_wb_exit(&bdi->wb); + wb_exit(&bdi->wb); } EXPORT_SYMBOL(bdi_destroy); -- cgit v1.2.3 From 4610007142823307d930ac890d822633a05ce08c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:31 -0400 Subject: writeback: reorganize mm/backing-dev.c Move wb_shutdown(), bdi_register(), bdi_register_dev(), bdi_prune_sb(), bdi_remove_from_list() and bdi_unregister() so that init / exit functions are grouped together. This will make updating init / exit paths for cgroup writeback support easier. This is pure source file reorganization. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Jens Axboe --- mm/backing-dev.c | 174 +++++++++++++++++++++++++++---------------------------- 1 file changed, 87 insertions(+), 87 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 597f0cec8405..ff85ecb7d6a1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -285,93 +285,6 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) spin_unlock_bh(&wb->work_lock); } -/* - * Remove bdi from bdi_list, and ensure that it is no longer visible - */ -static void bdi_remove_from_list(struct backing_dev_info *bdi) -{ - spin_lock_bh(&bdi_lock); - list_del_rcu(&bdi->bdi_list); - spin_unlock_bh(&bdi_lock); - - synchronize_rcu_expedited(); -} - -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) -{ - va_list args; - struct device *dev; - - if (bdi->dev) /* The driver needs to use separate queues per device */ - return 0; - - va_start(args, fmt); - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); - if (IS_ERR(dev)) - return PTR_ERR(dev); - - bdi->dev = dev; - - bdi_debug_register(bdi, dev_name(dev)); - set_bit(WB_registered, &bdi->wb.state); - - spin_lock_bh(&bdi_lock); - list_add_tail_rcu(&bdi->bdi_list, &bdi_list); - spin_unlock_bh(&bdi_lock); - - trace_writeback_bdi_register(bdi); - return 0; -} -EXPORT_SYMBOL(bdi_register); - -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) -{ - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); -} -EXPORT_SYMBOL(bdi_register_dev); - -/* - * Remove bdi from the global list and shutdown any threads we have running - */ -static void wb_shutdown(struct bdi_writeback *wb) -{ - /* Make sure nobody queues further work */ - spin_lock_bh(&wb->work_lock); - if (!test_and_clear_bit(WB_registered, &wb->state)) { - spin_unlock_bh(&wb->work_lock); - return; - } - spin_unlock_bh(&wb->work_lock); - - /* - * Drain work list and shutdown the delayed_work. !WB_registered - * tells wb_workfn() that @wb is dying and its work_list needs to - * be drained no matter what. - */ - mod_delayed_work(bdi_wq, &wb->dwork, 0); - flush_delayed_work(&wb->dwork); - WARN_ON(!list_empty(&wb->work_list)); -} - -/* - * Called when the device behind @bdi has been removed or ejected. - * - * We can't really do much here except for reducing the dirty ratio at - * the moment. In the future we should be able to set a flag so that - * the filesystem can handle errors at mark_inode_dirty time instead - * of only at writeback time. - */ -void bdi_unregister(struct backing_dev_info *bdi) -{ - if (WARN_ON_ONCE(!bdi->dev)) - return; - - bdi_set_min_ratio(bdi, 0); -} -EXPORT_SYMBOL(bdi_unregister); - /* * Initial write bandwidth: 100 MB/s */ @@ -418,6 +331,29 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) return 0; } +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void wb_shutdown(struct bdi_writeback *wb) +{ + /* Make sure nobody queues further work */ + spin_lock_bh(&wb->work_lock); + if (!test_and_clear_bit(WB_registered, &wb->state)) { + spin_unlock_bh(&wb->work_lock); + return; + } + spin_unlock_bh(&wb->work_lock); + + /* + * Drain work list and shutdown the delayed_work. !WB_registered + * tells wb_workfn() that @wb is dying and its work_list needs to + * be drained no matter what. + */ + mod_delayed_work(bdi_wq, &wb->dwork, 0); + flush_delayed_work(&wb->dwork); + WARN_ON(!list_empty(&wb->work_list)); +} + static void wb_exit(struct bdi_writeback *wb) { int i; @@ -449,6 +385,70 @@ int bdi_init(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_init); +int bdi_register(struct backing_dev_info *bdi, struct device *parent, + const char *fmt, ...) +{ + va_list args; + struct device *dev; + + if (bdi->dev) /* The driver needs to use separate queues per device */ + return 0; + + va_start(args, fmt); + dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); + va_end(args); + if (IS_ERR(dev)) + return PTR_ERR(dev); + + bdi->dev = dev; + + bdi_debug_register(bdi, dev_name(dev)); + set_bit(WB_registered, &bdi->wb.state); + + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; +} +EXPORT_SYMBOL(bdi_register); + +int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +{ + return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); +} +EXPORT_SYMBOL(bdi_register_dev); + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu_expedited(); +} + +/* + * Called when the device behind @bdi has been removed or ejected. + * + * We can't really do much here except for reducing the dirty ratio at + * the moment. In the future we should be able to set a flag so that + * the filesystem can handle errors at mark_inode_dirty time instead + * of only at writeback time. + */ +void bdi_unregister(struct backing_dev_info *bdi) +{ + if (WARN_ON_ONCE(!bdi->dev)) + return; + + bdi_set_min_ratio(bdi, 0); +} +EXPORT_SYMBOL(bdi_unregister); + void bdi_destroy(struct backing_dev_info *bdi) { /* make sure nobody finds us on the bdi_list anymore */ -- cgit v1.2.3 From a212b105b07d75b48b1a166378282e8a77fbf53d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:33 -0400 Subject: bdi: make inode_to_bdi() inline Now that bdi definitions are moved to backing-dev-defs.h, backing-dev.h can include blkdev.h and inline inode_to_bdi() without worrying about introducing circular include dependency. The function gets called from hot paths and fairly trivial. This patch makes inode_to_bdi() and sb_is_blkdev_sb() that the function calls inline. blockdev_superblock and noop_backing_dev_info are EXPORT_GPL'd to allow the inline functions to be used from modules. While at it, make sb_is_blkdev_sb() return bool instead of int. v2: Fixed typo in description as suggested by Jan. Signed-off-by: Tejun Heo Reviewed-by: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/block_dev.c | 8 ++------ fs/fs-writeback.c | 16 ---------------- include/linux/backing-dev.h | 18 ++++++++++++++++-- include/linux/fs.h | 8 +++++++- mm/backing-dev.c | 1 + 5 files changed, 26 insertions(+), 25 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/block_dev.c b/fs/block_dev.c index e545cbfbe5b2..f04c873a7365 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -547,7 +547,8 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { @@ -688,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - /* Call when you free inode */ void bd_forget(struct inode *inode) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a69d2e10d514..34d1cb854fc1 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -78,22 +78,6 @@ int writeback_in_progress(struct backing_dev_info *bdi) } EXPORT_SYMBOL(writeback_in_progress); -struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); -#endif - return sb->s_bdi; -} -EXPORT_SYMBOL_GPL(inode_to_bdi); - static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5e39f7a8efed..785782034e86 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -11,11 +11,10 @@ #include #include #include +#include #include #include -struct backing_dev_info *inode_to_bdi(struct inode *inode); - int __must_check bdi_init(struct backing_dev_info *bdi); void bdi_destroy(struct backing_dev_info *bdi); @@ -149,6 +148,21 @@ extern struct backing_dev_info noop_backing_dev_info; int writeback_in_progress(struct backing_dev_info *bdi); +static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) +{ + struct super_block *sb; + + if (!inode) + return &noop_backing_dev_info; + + sb = inode->i_sb; +#ifdef CONFIG_BLOCK + if (sb_is_blkdev_sb(sb)) + return blk_get_backing_dev_info(I_BDEV(inode)); +#endif + return sb->s_bdi; +} + static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ef63900243c..ce100b87fba3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2240,7 +2240,13 @@ extern struct super_block *freeze_bdev(struct block_device *); extern void emergency_thaw_all(void); extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); extern int fsync_bdev(struct block_device *); -extern int sb_is_blkdev_sb(struct super_block *sb); + +extern struct super_block *blockdev_superblock; + +static inline bool sb_is_blkdev_sb(struct super_block *sb) +{ + return sb == blockdev_superblock; +} #else static inline void bd_forget(struct inode *inode) {} static inline int sync_blockdev(struct block_device *bdev) { return 0; } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ff85ecb7d6a1..b0707d1b1d38 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; -- cgit v1.2.3 From 8395cd9f813d5d7ed9870e642230acfcfc1e8a0a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:34 -0400 Subject: writeback: add @gfp to wb_init() wb_init() currently always uses GFP_KERNEL but the planned cgroup writeback support needs using other allocation masks. Add @gfp to wb_init(). This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- mm/backing-dev.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b0707d1b1d38..805b2870ca4e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -291,7 +291,8 @@ void wb_wakeup_delayed(struct bdi_writeback *wb) */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) -static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, + gfp_t gfp) { int i, err; @@ -315,12 +316,12 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); - err = fprop_local_init_percpu(&wb->completions, GFP_KERNEL); + err = fprop_local_init_percpu(&wb->completions, gfp); if (err) return err; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { - err = percpu_counter_init(&wb->stat[i], 0, GFP_KERNEL); + err = percpu_counter_init(&wb->stat[i], 0, gfp); if (err) { while (--i) percpu_counter_destroy(&wb->stat[i]); @@ -378,7 +379,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); - err = wb_init(&bdi->wb, bdi); + err = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (err) return err; -- cgit v1.2.3 From 4aa9c692e052cf6db99db62a8fe0543e5c455da7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:35 -0400 Subject: bdi: separate out congested state into a separate struct Currently, a wb's (bdi_writeback) congestion state is carried in its ->state field; however, cgroup writeback support will require multiple wb's sharing the same congestion state. This patch separates out congestion state into its own struct - struct bdi_writeback_congested. A new field wb field, wb_congested, points to its associated congested struct. The default wb, bdi->wb, always points to bdi->wb_congested. While this patch adds a layer of indirection, it doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 ++++++++++++-- include/linux/backing-dev.h | 2 +- mm/backing-dev.c | 7 +++++-- 3 files changed, 18 insertions(+), 5 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index aa18c4bd43c1..9e9eafa5f5aa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -16,12 +16,15 @@ struct dentry; * Bits in bdi_writeback.state */ enum wb_state { - WB_async_congested, /* The async (write) queue is getting full */ - WB_sync_congested, /* The sync queue is getting full */ WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ }; +enum wb_congested_state { + WB_async_congested, /* The async (write) queue is getting full */ + WB_sync_congested, /* The sync queue is getting full */ +}; + typedef int (congested_fn)(void *, int); enum wb_stat_item { @@ -34,6 +37,10 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +struct bdi_writeback_congested { + unsigned long state; /* WB_[a]sync_congested flags */ +}; + struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -48,6 +55,8 @@ struct bdi_writeback { struct percpu_counter stat[NR_WB_STAT_ITEMS]; + struct bdi_writeback_congested *congested; + unsigned long bw_time_stamp; /* last time write bw is updated */ unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ @@ -84,6 +93,7 @@ struct backing_dev_info { unsigned int max_ratio, max_prop_frac; struct bdi_writeback wb; /* default writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; struct device *dev; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 785782034e86..bfdaa18ba0a1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,7 +167,7 @@ static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) { if (bdi->congested_fn) return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.state & bdi_bits); + return (bdi->wb.congested->state & bdi_bits); } static inline int bdi_read_congested(struct backing_dev_info *bdi) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 805b2870ca4e..5ec7658cb984 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -383,6 +383,9 @@ int bdi_init(struct backing_dev_info *bdi) if (err) return err; + bdi->wb_congested.state = 0; + bdi->wb.congested = &bdi->wb_congested; + return 0; } EXPORT_SYMBOL(bdi_init); @@ -504,7 +507,7 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.state)) + if (test_and_clear_bit(bit, &bdi->wb.congested->state)) atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) @@ -517,7 +520,7 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum wb_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.state)) + if (!test_and_set_bit(bit, &bdi->wb.congested->state)) atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); -- cgit v1.2.3 From 52ebea749aaed195245701a8f90a23d672c7a933 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:37 -0400 Subject: writeback: make backing_dev_info host cgroup-specific bdi_writebacks For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo Cc: kbuild test robot Cc: Dan Carpenter Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 7 +- fs/fs-writeback.c | 8 +- fs/inode.c | 1 + include/linux/backing-dev-defs.h | 59 +++++- include/linux/backing-dev.h | 195 +++++++++++++++++++ include/linux/blk-cgroup.h | 4 + include/linux/fs.h | 4 + include/linux/memcontrol.h | 4 + mm/backing-dev.c | 397 +++++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 19 +- mm/page-writeback.c | 11 +- 11 files changed, 698 insertions(+), 11 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 54ec1721cd3e..979cfdbb94e0 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -797,6 +798,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) @@ -827,7 +830,9 @@ done: spin_lock_init(&blkcg->lock); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&blkcg->blkg_list); - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif return &blkcg->css; } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 34d1cb854fc1..99a2440c5588 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -185,11 +185,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = inode_to_wb(inode); - spin_lock(&bdi->wb.list_lock); + spin_lock(&wb->list_lock); list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + spin_unlock(&wb->list_lock); } /* @@ -1268,6 +1268,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; diff --git a/fs/inode.c b/fs/inode.c index ea37cd17b53f..efc9edacfb9b 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -223,6 +223,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 9e9eafa5f5aa..a1e9c407a59a 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -2,8 +2,11 @@ #define __LINUX_BACKING_DEV_DEFS_H #include +#include +#include #include #include +#include #include #include #include @@ -37,10 +40,43 @@ enum wb_stat_item { #define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids))) +/* + * For cgroup writeback, multiple wb's may map to the same blkcg. Those + * wb's can operate mostly independently but should share the congested + * state. To facilitate such sharing, the congested state is tracked using + * the following struct which is created on demand, indexed by blkcg ID on + * its bdi, and refcounted. + */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct backing_dev_info *bdi; /* the associated bdi */ + atomic_t refcnt; /* nr of attached wb's and blkg */ + int blkcg_id; /* ID of the associated blkcg */ + struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ +#endif }; +/* + * Each wb (bdi_writeback) can perform writeback operations, is measured + * and throttled, independently. Without cgroup writeback, each bdi + * (bdi_writeback) is served by its embedded bdi->wb. + * + * On the default hierarchy, blkcg implicitly enables memcg. This allows + * using memcg's page ownership for attributing writeback IOs, and every + * memcg - blkcg combination can be served by its own wb by assigning a + * dedicated wb to each memcg, which enables isolation across different + * cgroups and propagation of IO back pressure down from the IO layer upto + * the tasks which are generating the dirty pages to be written back. + * + * A cgroup wb is indexed on its bdi by the ID of the associated memcg, + * refcounted with the number of inodes attached to it, and pins the memcg + * and the corresponding blkcg. As the corresponding blkcg for a memcg may + * change as blkcg is disabled and enabled higher up in the hierarchy, a wb + * is tested for blkcg after lookup and removed from index on mismatch so + * that a new wb for the combination can be created. + */ struct bdi_writeback { struct backing_dev_info *bdi; /* our parent bdi */ @@ -78,6 +114,19 @@ struct bdi_writeback { spinlock_t work_lock; /* protects work_list & dwork scheduling */ struct list_head work_list; struct delayed_work dwork; /* work item used for writeback */ + +#ifdef CONFIG_CGROUP_WRITEBACK + struct percpu_ref refcnt; /* used only for !root wb's */ + struct cgroup_subsys_state *memcg_css; /* the associated memcg */ + struct cgroup_subsys_state *blkcg_css; /* and blkcg */ + struct list_head memcg_node; /* anchored at memcg->cgwb_list */ + struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */ + + union { + struct work_struct release_work; + struct rcu_head rcu; + }; +#endif }; struct backing_dev_info { @@ -92,9 +141,13 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - struct bdi_writeback wb; /* default writeback info for this bdi */ - struct bdi_writeback_congested wb_congested; - + struct bdi_writeback wb; /* the root writeback info for this bdi */ + struct bdi_writeback_congested wb_congested; /* its congested state */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ + struct rb_root cgwb_congested_tree; /* their congested states */ + atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ +#endif struct device *dev; struct timer_list laptop_mode_wb_timer; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6bb31234e6a9..8ae59df2e3d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -13,6 +13,7 @@ #include #include #include +#include #include int __must_check bdi_init(struct backing_dev_info *bdi); @@ -234,6 +235,16 @@ static inline int bdi_sched_wait(void *word) #ifdef CONFIG_CGROUP_WRITEBACK +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp); +void wb_congested_put(struct bdi_writeback_congested *congested); +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp); +void __inode_attach_wb(struct inode *inode, struct page *page); +void wb_memcg_offline(struct mem_cgroup *memcg); +void wb_blkcg_offline(struct blkcg *blkcg); + /** * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode * @inode: inode of interest @@ -250,6 +261,135 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +/** + * wb_find_current - find wb for %current on a bdi + * @bdi: bdi of interest + * + * Find the wb of @bdi which matches both the memcg and blkcg of %current. + * Must be called under rcu_read_lock() which protects the returend wb. + * NULL if not found. + */ +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + struct cgroup_subsys_state *memcg_css; + struct bdi_writeback *wb; + + memcg_css = task_css(current, memory_cgrp_id); + if (!memcg_css->parent) + return &bdi->wb; + + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + + /* + * %current's blkcg equals the effective blkcg of its memcg. No + * need to use the relatively expensive cgroup_get_e_css(). + */ + if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + return wb; + return NULL; +} + +/** + * wb_get_create_current - get or create wb for %current on a bdi + * @bdi: bdi of interest + * @gfp: allocation mask + * + * Equivalent to wb_get_create() on %current's memcg. This function is + * called from a relatively hot path and optimizes the common cases using + * wb_find_current(). + */ +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + struct bdi_writeback *wb; + + rcu_read_lock(); + wb = wb_find_current(bdi); + if (wb && unlikely(!wb_tryget(wb))) + wb = NULL; + rcu_read_unlock(); + + if (unlikely(!wb)) { + struct cgroup_subsys_state *memcg_css; + + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, gfp); + css_put(memcg_css); + } + return wb; +} + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +/** + * inode_to_wb - determine the wb of an inode + * @inode: inode of interest + * + * Returns the wb @inode is currently associated with. + */ +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return inode->i_wb; +} + #else /* CONFIG_CGROUP_WRITEBACK */ static inline bool inode_cgwb_enabled(struct inode *inode) @@ -257,6 +397,61 @@ static inline bool inode_cgwb_enabled(struct inode *inode) return false; } +static inline struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + return bdi->wb.congested; +} + +static inline void wb_congested_put(struct bdi_writeback_congested *congested) +{ +} + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) +{ + return &bdi->wb; +} + +static inline struct bdi_writeback * +wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) +{ + return &bdi->wb; +} + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +static inline struct bdi_writeback *inode_to_wb(struct inode *inode) +{ + return &inode_to_bdi(inode)->wb; +} + +static inline void wb_memcg_offline(struct mem_cgroup *memcg) +{ +} + +static inline void wb_blkcg_offline(struct blkcg *blkcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 4dc643f2046e..3033eb173eb4 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -53,6 +53,10 @@ struct blkcg { /* TODO: per-policy storage in blkcg */ unsigned int cfq_weight; /* belongs to cfq */ unsigned int cfq_leaf_weight; + +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif }; struct blkg_stat { diff --git a/include/linux/fs.h b/include/linux/fs.h index 74e0ae0626a8..67a42ec95065 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ #include struct backing_dev_info; +struct bdi_writeback; struct export_operations; struct hd_geometry; struct iovec; @@ -635,6 +636,9 @@ struct inode { struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ +#ifdef CONFIG_CGROUP_WRITEBACK + struct bdi_writeback *i_wb; /* the associated cgroup wb */ +#endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; union { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 637ef626008e..662a953ea8ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -388,6 +388,10 @@ enum { OVER_LIMIT, }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); +#endif + struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) void sock_update_memcg(struct sock *sk); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5ec7658cb984..4c9386c98ec1 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -368,6 +368,401 @@ static void wb_exit(struct bdi_writeback *wb) fprop_local_destroy_percpu(&wb->completions); } +#ifdef CONFIG_CGROUP_WRITEBACK + +#include + +/* + * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, + * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU + * protected. cgwb_release_wait is used to wait for the completion of cgwb + * releases from bdi destruction path. + */ +static DEFINE_SPINLOCK(cgwb_lock); +static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); + +/** + * wb_congested_get_create - get or create a wb_congested + * @bdi: associated bdi + * @blkcg_id: ID of the associated blkcg + * @gfp: allocation mask + * + * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one. + * The returned wb_congested has its reference count incremented. Returns + * NULL on failure. + */ +struct bdi_writeback_congested * +wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) +{ + struct bdi_writeback_congested *new_congested = NULL, *congested; + struct rb_node **node, *parent; + unsigned long flags; + + if (blkcg_id == 1) + return &bdi->wb_congested; +retry: + spin_lock_irqsave(&cgwb_lock, flags); + + node = &bdi->cgwb_congested_tree.rb_node; + parent = NULL; + + while (*node != NULL) { + parent = *node; + congested = container_of(parent, struct bdi_writeback_congested, + rb_node); + if (congested->blkcg_id < blkcg_id) + node = &parent->rb_left; + else if (congested->blkcg_id > blkcg_id) + node = &parent->rb_right; + else + goto found; + } + + if (new_congested) { + /* !found and storage for new one already allocated, insert */ + congested = new_congested; + new_congested = NULL; + rb_link_node(&congested->rb_node, parent, node); + rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); + atomic_inc(&bdi->usage_cnt); + goto found; + } + + spin_unlock_irqrestore(&cgwb_lock, flags); + + /* allocate storage for new one and retry */ + new_congested = kzalloc(sizeof(*new_congested), gfp); + if (!new_congested) + return NULL; + + atomic_set(&new_congested->refcnt, 0); + new_congested->bdi = bdi; + new_congested->blkcg_id = blkcg_id; + goto retry; + +found: + atomic_inc(&congested->refcnt); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(new_congested); + return congested; +} + +/** + * wb_congested_put - put a wb_congested + * @congested: wb_congested to put + * + * Put @congested and destroy it if the refcnt reaches zero. + */ +void wb_congested_put(struct bdi_writeback_congested *congested) +{ + struct backing_dev_info *bdi = congested->bdi; + unsigned long flags; + + if (congested->blkcg_id == 1) + return; + + local_irq_save(flags); + if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + local_irq_restore(flags); + return; + } + + rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree); + spin_unlock_irqrestore(&cgwb_lock, flags); + kfree(congested); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release_workfn(struct work_struct *work) +{ + struct bdi_writeback *wb = container_of(work, struct bdi_writeback, + release_work); + struct backing_dev_info *bdi = wb->bdi; + + wb_shutdown(wb); + + css_put(wb->memcg_css); + css_put(wb->blkcg_css); + wb_congested_put(wb->congested); + + percpu_ref_exit(&wb->refcnt); + wb_exit(wb); + kfree_rcu(wb, rcu); + + if (atomic_dec_and_test(&bdi->usage_cnt)) + wake_up_all(&cgwb_release_wait); +} + +static void cgwb_release(struct percpu_ref *refcnt) +{ + struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, + refcnt); + schedule_work(&wb->release_work); +} + +static void cgwb_kill(struct bdi_writeback *wb) +{ + lockdep_assert_held(&cgwb_lock); + + WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); + list_del(&wb->memcg_node); + list_del(&wb->blkcg_node); + percpu_ref_kill(&wb->refcnt); +} + +static int cgwb_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, gfp_t gfp) +{ + struct mem_cgroup *memcg; + struct cgroup_subsys_state *blkcg_css; + struct blkcg *blkcg; + struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; + struct bdi_writeback *wb; + unsigned long flags; + int ret = 0; + + memcg = mem_cgroup_from_css(memcg_css); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg = css_to_blkcg(blkcg_css); + memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + blkcg_cgwb_list = &blkcg->cgwb_list; + + /* look up again under lock and discard on blkcg mismatch */ + spin_lock_irqsave(&cgwb_lock, flags); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb && wb->blkcg_css != blkcg_css) { + cgwb_kill(wb); + wb = NULL; + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (wb) + goto out_put; + + /* need to create a new one */ + wb = kmalloc(sizeof(*wb), gfp); + if (!wb) + return -ENOMEM; + + ret = wb_init(wb, bdi, gfp); + if (ret) + goto err_free; + + ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); + if (ret) + goto err_wb_exit; + + wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); + if (!wb->congested) + goto err_ref_exit; + + wb->memcg_css = memcg_css; + wb->blkcg_css = blkcg_css; + INIT_WORK(&wb->release_work, cgwb_release_workfn); + set_bit(WB_registered, &wb->state); + + /* + * The root wb determines the registered state of the whole bdi and + * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate + * whether they're still online. Don't link @wb if any is dead. + * See wb_memcg_offline() and wb_blkcg_offline(). + */ + ret = -ENODEV; + spin_lock_irqsave(&cgwb_lock, flags); + if (test_bit(WB_registered, &bdi->wb.state) && + blkcg_cgwb_list->next && memcg_cgwb_list->next) { + /* we might have raced another instance of this function */ + ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); + if (!ret) { + atomic_inc(&bdi->usage_cnt); + list_add(&wb->memcg_node, memcg_cgwb_list); + list_add(&wb->blkcg_node, blkcg_cgwb_list); + css_get(memcg_css); + css_get(blkcg_css); + } + } + spin_unlock_irqrestore(&cgwb_lock, flags); + if (ret) { + if (ret == -EEXIST) + ret = 0; + goto err_put_congested; + } + goto out_put; + +err_put_congested: + wb_congested_put(wb->congested); +err_ref_exit: + percpu_ref_exit(&wb->refcnt); +err_wb_exit: + wb_exit(wb); +err_free: + kfree(wb); +out_put: + css_put(blkcg_css); + return ret; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. The returned wb has its refcount incremented. + * + * This function uses css_get() on @memcg_css and thus expects its refcnt + * to be positive on invocation. IOW, rcu_read_lock() protection on + * @memcg_css isn't enough. try_get it before calling this function. + * + * A wb is keyed by its associated memcg. As blkcg implicitly enables + * memcg on the default hierarchy, memcg association is guaranteed to be + * more specific (equal or descendant to the associated blkcg) and thus can + * identify both the memcg and blkcg associations. + * + * Because the blkcg associated with a memcg may change as blkcg is enabled + * and disabled closer to root in the hierarchy, each wb keeps track of + * both the memcg and blkcg associated with it and verifies the blkcg on + * each lookup. On mismatch, the existing wb is discarded and a new one is + * created. + */ +struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css, + gfp_t gfp) +{ + struct bdi_writeback *wb; + + might_sleep_if(gfp & __GFP_WAIT); + + if (!memcg_css->parent) + return &bdi->wb; + + do { + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, + &blkio_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || + !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); + + return wb; +} + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +static void cgwb_bdi_init(struct backing_dev_info *bdi) +{ + bdi->wb.memcg_css = mem_cgroup_root_css; + bdi->wb.blkcg_css = blkcg_root_css; + bdi->wb_congested.blkcg_id = 1; + INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); + bdi->cgwb_congested_tree = RB_ROOT; + atomic_set(&bdi->usage_cnt, 1); +} + +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + struct radix_tree_iter iter; + void **slot; + + WARN_ON(test_bit(WB_registered, &bdi->wb.state)); + + spin_lock_irq(&cgwb_lock); + radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) + cgwb_kill(*slot); + spin_unlock_irq(&cgwb_lock); + + /* + * All cgwb's and their congested states must be shutdown and + * released before returning. Drain the usage counter to wait for + * all cgwb's and cgwb_congested's ever created on @bdi. + */ + atomic_dec(&bdi->usage_cnt); + wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); +} + +/** + * wb_memcg_offline - kill all wb's associated with a memcg being offlined + * @memcg: memcg being offlined + * + * Also prevents creation of any new wb's associated with @memcg. + */ +void wb_memcg_offline(struct mem_cgroup *memcg) +{ + LIST_HEAD(to_destroy); + struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) + cgwb_kill(wb); + memcg_cgwb_list->next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +/** + * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined + * @blkcg: blkcg being offlined + * + * Also prevents creation of any new wb's associated with @blkcg. + */ +void wb_blkcg_offline(struct blkcg *blkcg) +{ + LIST_HEAD(to_destroy); + struct bdi_writeback *wb, *next; + + spin_lock_irq(&cgwb_lock); + list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + cgwb_kill(wb); + blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + spin_unlock_irq(&cgwb_lock); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static void cgwb_bdi_init(struct backing_dev_info *bdi) { } +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +#endif /* CONFIG_CGROUP_WRITEBACK */ + int bdi_init(struct backing_dev_info *bdi) { int err; @@ -386,6 +781,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->wb_congested.state = 0; bdi->wb.congested = &bdi->wb_congested; + cgwb_bdi_init(bdi); return 0; } EXPORT_SYMBOL(bdi_init); @@ -459,6 +855,7 @@ void bdi_destroy(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); + cgwb_bdi_destroy(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c270a03729c..49e5aa6abca6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -348,6 +348,10 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif + /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -4030,6 +4034,15 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg) } #endif +#ifdef CONFIG_CGROUP_WRITEBACK + +struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) +{ + return &memcg->cgwb_list; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * DO NOT USE IN NEW FILES. * @@ -4494,7 +4507,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; #endif - +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&memcg->cgwb_list); +#endif return &memcg->css; free_out: @@ -4582,6 +4597,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) vmpressure_cleanup(&memcg->vmpressure); memcg_deactivate_kmem(memcg); + + wb_memcg_offline(memcg); } static void mem_cgroup_css_free(struct cgroup_subsys_state *css) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 78ef5511a198..9b95cf80b407 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2097,16 +2097,21 @@ int __set_page_dirty_no_writeback(struct page *page) void account_page_dirtied(struct page *page, struct address_space *mapping, struct mem_cgroup *memcg) { + struct inode *inode = mapping->host; + trace_writeback_dirty_page(page, mapping); if (mapping_cap_account_dirty(mapping)) { - struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct bdi_writeback *wb; + + inode_attach_wb(inode, page); + wb = inode_to_wb(inode); mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY); __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_zone_page_state(page, NR_DIRTIED); - __inc_wb_stat(&bdi->wb, WB_RECLAIMABLE); - __inc_wb_stat(&bdi->wb, WB_DIRTIED); + __inc_wb_stat(wb, WB_RECLAIMABLE); + __inc_wb_stat(wb, WB_DIRTIED); task_io_account_write(PAGE_CACHE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); -- cgit v1.2.3 From ec8a6f2643923ee5b74d24fa8d134240379f436b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:41 -0400 Subject: writeback: make congestion functions per bdi_writeback Currently, all congestion functions take bdi (backing_dev_info) and always operate on the root wb (bdi->wb) and the congestion state from the block layer is propagated only for the root blkcg. This patch introduces {set|clear}_wb_congested() and wb_congested() which take a bdi_writeback_congested and bdi_writeback respectively. The bdi counteparts are now wrappers invoking the wb based functions on @bdi->wb. While converting clear_bdi_congested() to clear_wb_congested(), the local variable declaration order between @wqh and @bit is swapped for cosmetic reason. This patch just adds the new wb based functions. The following patches will apply them. v2: Updated for bdi_writeback_congested. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 14 +++++++++++-- include/linux/backing-dev.h | 45 +++++++++++++++++++++++----------------- mm/backing-dev.c | 22 ++++++++++---------- 3 files changed, 49 insertions(+), 32 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index a1e9c407a59a..eb386766b5f3 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -163,7 +163,17 @@ enum { BLK_RW_SYNC = 1, }; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync); -void set_bdi_congested(struct backing_dev_info *bdi, int sync); +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync); +void set_wb_congested(struct bdi_writeback_congested *congested, int sync); + +static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + clear_wb_congested(bdi->wb.congested, sync); +} + +static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) +{ + set_wb_congested(bdi->wb.congested, sync); +} #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8ae59df2e3d1..2c498a2a8268 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -167,27 +167,13 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) return sb->s_bdi; } -static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits) +static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) { - if (bdi->congested_fn) - return bdi->congested_fn(bdi->congested_data, bdi_bits); - return (bdi->wb.congested->state & bdi_bits); -} - -static inline int bdi_read_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_sync_congested); -} - -static inline int bdi_write_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, 1 << WB_async_congested); -} + struct backing_dev_info *bdi = wb->bdi; -static inline int bdi_rw_congested(struct backing_dev_info *bdi) -{ - return bdi_congested(bdi, (1 << WB_sync_congested) | - (1 << WB_async_congested)); + if (bdi->congested_fn) + return bdi->congested_fn(bdi->congested_data, cong_bits); + return wb->congested->state & cong_bits; } long congestion_wait(int sync, long timeout); @@ -454,4 +440,25 @@ static inline void wb_blkcg_offline(struct blkcg *blkcg) #endif /* CONFIG_CGROUP_WRITEBACK */ +static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits) +{ + return wb_congested(&bdi->wb, cong_bits); +} + +static inline int bdi_read_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_sync_congested); +} + +static inline int bdi_write_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, 1 << WB_async_congested); +} + +static inline int bdi_rw_congested(struct backing_dev_info *bdi) +{ + return bdi_congested(bdi, (1 << WB_sync_congested) | + (1 << WB_async_congested)); +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 4c9386c98ec1..5029c4ad2f36 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -896,31 +896,31 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; -static atomic_t nr_bdi_congested[2]; +static atomic_t nr_wb_congested[2]; -void clear_bdi_congested(struct backing_dev_info *bdi, int sync) +void clear_wb_congested(struct bdi_writeback_congested *congested, int sync) { - enum wb_state bit; wait_queue_head_t *wqh = &congestion_wqh[sync]; + enum wb_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (test_and_clear_bit(bit, &bdi->wb.congested->state)) - atomic_dec(&nr_bdi_congested[sync]); + if (test_and_clear_bit(bit, &congested->state)) + atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } -EXPORT_SYMBOL(clear_bdi_congested); +EXPORT_SYMBOL(clear_wb_congested); -void set_bdi_congested(struct backing_dev_info *bdi, int sync) +void set_wb_congested(struct bdi_writeback_congested *congested, int sync) { enum wb_state bit; bit = sync ? WB_sync_congested : WB_async_congested; - if (!test_and_set_bit(bit, &bdi->wb.congested->state)) - atomic_inc(&nr_bdi_congested[sync]); + if (!test_and_set_bit(bit, &congested->state)) + atomic_inc(&nr_wb_congested[sync]); } -EXPORT_SYMBOL(set_bdi_congested); +EXPORT_SYMBOL(set_wb_congested); /** * congestion_wait - wait for a backing_dev to become uncongested @@ -979,7 +979,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) * encountered in the current zone, yield if necessary instead * of sleeping on the congestion queue */ - if (atomic_read(&nr_bdi_congested[sync]) == 0 || + if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { cond_resched(); -- cgit v1.2.3 From d6c10f1fc8626dc55946f4768ae322b4c57b07dd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:45 -0400 Subject: writeback: implement WB_has_dirty_io wb_state flag Currently, wb_has_dirty_io() determines whether a wb (bdi_writeback) has any dirty inode by testing all three IO lists on each invocation without actively keeping track. For cgroup writeback support, a single bdi will host multiple wb's each of which will host dirty inodes separately and we'll need to make bdi_has_dirty_io(), which currently only represents the root wb, aggregate has_dirty_io from all member wb's, which requires tracking transitions in has_dirty_io state on each wb. This patch introduces inode_wb_list_{move|del}_locked() to consolidate IO list operations leaving queue_io() the only other function which directly manipulates IO lists (via move_expired_inodes()). All three functions are updated to call wb_io_lists_[de]populated() which keep track of whether the wb has dirty inodes or not and record it using the new WB_has_dirty_io flag. inode_wb_list_moved_locked()'s return value indicates whether the wb had no dirty inodes before. mark_inode_dirty() is restructured so that the return value of inode_wb_list_move_locked() can be used for deciding whether to wake up the wb. While at it, change {bdi|wb}_has_dirty_io()'s return values to bool. These functions were returning 0 and 1 before. Also, add a comment explaining the synchronization of wb_state flags. v2: Updated to accommodate b_dirty_time. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 110 ++++++++++++++++++++++++++++++--------- include/linux/backing-dev-defs.h | 1 + include/linux/backing-dev.h | 8 ++- mm/backing-dev.c | 2 +- 4 files changed, 91 insertions(+), 30 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7ec491b1be04..0a90dc557748 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -93,6 +93,66 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); +static bool wb_io_lists_populated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb)) { + return false; + } else { + set_bit(WB_has_dirty_io, &wb->state); + return true; + } +} + +static void wb_io_lists_depopulated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && + list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) + clear_bit(WB_has_dirty_io, &wb->state); +} + +/** + * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * @inode: inode to be moved + * @wb: target bdi_writeback + * @head: one of @wb->b_{dirty|io|more_io} + * + * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Returns %true if @inode is the first occupant of the !dirty_time IO + * lists; otherwise, %false. + */ +static bool inode_wb_list_move_locked(struct inode *inode, + struct bdi_writeback *wb, + struct list_head *head) +{ + assert_spin_locked(&wb->list_lock); + + list_move(&inode->i_wb_list, head); + + /* dirty_time doesn't count as dirty_io until expiration */ + if (head != &wb->b_dirty_time) + return wb_io_lists_populated(wb); + + wb_io_lists_depopulated(wb); + return false; +} + +/** + * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * @inode: inode to be removed + * @wb: bdi_writeback @inode is being removed from + * + * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and + * clear %WB_has_dirty_io if all are empty afterwards. + */ +static void inode_wb_list_del_locked(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + + list_del_init(&inode->i_wb_list); + wb_io_lists_depopulated(wb); +} + static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_bh(&wb->work_lock); @@ -217,7 +277,7 @@ void inode_wb_list_del(struct inode *inode) struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -232,7 +292,6 @@ void inode_wb_list_del(struct inode *inode) */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -240,7 +299,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_wb_list, &wb->b_dirty); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -248,8 +307,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, &wb->b_more_io); + inode_wb_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -358,6 +416,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); + if (moved) + wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, moved); } @@ -483,10 +543,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &wb->b_dirty_time); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); } } @@ -628,7 +688,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -1327,37 +1387,39 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + struct list_head *dirty_list; bool wakeup_bdi = false; bdi = inode_to_bdi(inode); spin_unlock(&inode->i_lock); spin_lock(&bdi->wb.list_lock); - if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(WB_registered, &bdi->wb.state), - "bdi-%s not registered\n", bdi->name); - /* - * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding - * bdi thread to make sure background - * write-back happens later. - */ - if (!wb_has_dirty_io(&bdi->wb)) - wakeup_bdi = true; - } + WARN(bdi_cap_writeback_dirty(bdi) && + !test_bit(WB_registered, &bdi->wb.state), + "bdi-%s not registered\n", bdi->name); inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + dirty_list = &bdi->wb.b_dirty; else - list_move(&inode->i_wb_list, - &bdi->wb.b_dirty_time); + dirty_list = &bdi->wb.b_dirty_time; + + wakeup_bdi = inode_wb_list_move_locked(inode, &bdi->wb, + dirty_list); + spin_unlock(&bdi->wb.list_lock); trace_writeback_dirty_inode_enqueue(inode); - if (wakeup_bdi) + /* + * If this is the first dirty inode for this bdi, + * we have to wake-up the corresponding bdi thread + * to make sure background write-back happens + * later. + */ + if (bdi_cap_writeback_dirty(bdi) && wakeup_bdi) wb_wakeup_delayed(&bdi->wb); return; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index eb386766b5f3..7a94b7850b7c 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; enum wb_state { WB_registered, /* bdi_register() was done */ WB_writeback_running, /* Writeback is in progress */ + WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; enum wb_congested_state { diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 6f0882105f95..3c8403c012ce 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -int bdi_has_dirty_io(struct backing_dev_info *bdi); +bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -37,11 +37,9 @@ extern struct list_head bdi_list; extern struct workqueue_struct *bdi_wq; -static inline int wb_has_dirty_io(struct bdi_writeback *wb) +static inline bool wb_has_dirty_io(struct bdi_writeback *wb) { - return !list_empty(&wb->b_dirty) || - !list_empty(&wb->b_io) || - !list_empty(&wb->b_more_io); + return test_bit(WB_has_dirty_io, &wb->state); } static inline void __add_wb_stat(struct bdi_writeback *wb, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5029c4ad2f36..161ddf12b862 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -256,7 +256,7 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -int bdi_has_dirty_io(struct backing_dev_info *bdi) +bool bdi_has_dirty_io(struct backing_dev_info *bdi) { return wb_has_dirty_io(&bdi->wb); } -- cgit v1.2.3 From 95a46c65e3c09edb9f17dabf2dc16670cd328739 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:47 -0400 Subject: writeback: make bdi_has_dirty_io() take multiple bdi_writeback's into account bdi_has_dirty_io() used to only reflect whether the root wb (bdi_writeback) has dirty inodes. For cgroup writeback support, it needs to take all active wb's into account. If any wb on the bdi has dirty inodes, bdi_has_dirty_io() should return true. To achieve that, as inode_wb_list_{move|del}_locked() now keep track of the dirty state transition of each wb, the number of dirty wbs can be counted in the bdi; however, bdi is already aggregating wb->avg_write_bandwidth which can easily be guaranteed to be > 0 when there are any dirty inodes by ensuring wb->avg_write_bandwidth can't dip below 1. bdi_has_dirty_io() can simply test whether bdi->tot_write_bandwidth is zero or not. While this bumps the value of wb->avg_write_bandwidth to one when it used to be zero, this shouldn't cause any meaningful behavior difference. bdi_has_dirty_io() is made an inline function which tests whether ->tot_write_bandwidth is non-zero. Also, WARN_ON_ONCE()'s on its value are added to inode_wb_list_{move|del}_locked(). Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 +++-- include/linux/backing-dev-defs.h | 8 ++++++-- include/linux/backing-dev.h | 10 +++++++++- mm/backing-dev.c | 5 ----- mm/page-writeback.c | 10 +++++++--- 5 files changed, 25 insertions(+), 13 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index bbccf68b38db..c98d3923d9a9 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -99,6 +99,7 @@ static bool wb_io_lists_populated(struct bdi_writeback *wb) return false; } else { set_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth); return true; @@ -110,8 +111,8 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { clear_bit(WB_has_dirty_io, &wb->state); - atomic_long_sub(wb->avg_write_bandwidth, - &wb->bdi->tot_write_bandwidth); + WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth) < 0); } } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index d631a61f4023..8c857d723023 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -98,7 +98,7 @@ struct bdi_writeback { unsigned long dirtied_stamp; unsigned long written_stamp; /* pages written at bw_time_stamp */ unsigned long write_bandwidth; /* the estimated write bandwidth */ - unsigned long avg_write_bandwidth; /* further smoothed write bw */ + unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */ /* * The base dirty throttle rate, re-calculated on every 200ms. @@ -142,7 +142,11 @@ struct backing_dev_info { unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; - atomic_long_t tot_write_bandwidth; /* sum of active avg_write_bw */ + /* + * Sum of avg_write_bw of wbs with dirty inodes. > 0 if there are + * any dirty wbs, which is depended upon by bdi_has_dirty(). + */ + atomic_long_t tot_write_bandwidth; struct bdi_writeback wb; /* the root writeback info for this bdi */ struct bdi_writeback_congested wb_congested; /* its congested state */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3c8403c012ce..0839e44105bd 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -29,7 +29,6 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); void wb_workfn(struct work_struct *work); -bool bdi_has_dirty_io(struct backing_dev_info *bdi); void wb_wakeup_delayed(struct bdi_writeback *wb); extern spinlock_t bdi_lock; @@ -42,6 +41,15 @@ static inline bool wb_has_dirty_io(struct bdi_writeback *wb) return test_bit(WB_has_dirty_io, &wb->state); } +static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + /* + * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are + * any dirty wbs. See wb_update_write_bandwidth(). + */ + return atomic_long_read(&bdi->tot_write_bandwidth); +} + static inline void __add_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item, s64 amount) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 161ddf12b862..d2f16fc9069a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -256,11 +256,6 @@ static int __init default_bdi_init(void) } subsys_initcall(default_bdi_init); -bool bdi_has_dirty_io(struct backing_dev_info *bdi) -{ - return wb_has_dirty_io(&bdi->wb); -} - /* * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c95eb24dd652..99b88465096e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -881,9 +881,13 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb, avg += (old - avg) >> 3; out: - if (wb_has_dirty_io(wb)) - atomic_long_add(avg - wb->avg_write_bandwidth, - &wb->bdi->tot_write_bandwidth); + /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */ + avg = max(avg, 1LU); + if (wb_has_dirty_io(wb)) { + long delta = avg - wb->avg_write_bandwidth; + WARN_ON_ONCE(atomic_long_add_return(delta, + &wb->bdi->tot_write_bandwidth) <= 0); + } wb->write_bandwidth = bw; wb->avg_write_bandwidth = avg; } -- cgit v1.2.3 From cc395d7f1f7b9c740ab6d367ef1f6eb248595dff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 17:13:58 -0400 Subject: writeback: implement bdi_wait_for_completion() If the completion of a wb_writeback_work can be waited upon by setting its ->done to a struct completion and waiting on it; however, for cgroup writeback support, it's necessary to issue multiple work items to multiple bdi_writebacks and wait for the completion of all. This patch implements wb_completion which can wait for multiple work items and replaces the struct completion with it. It can be defined using DEFINE_WB_COMPLETION_ONSTACK(), used for multiple work items and waited for by wb_wait_for_completion(). Nobody currently issues multiple work items and this patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 58 +++++++++++++++++++++++++++++++--------- include/linux/backing-dev-defs.h | 2 ++ mm/backing-dev.c | 1 + 3 files changed, 49 insertions(+), 12 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 22f1def4e63a..d7d4a1bcdd2f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -34,6 +34,10 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +struct wb_completion { + atomic_t cnt; +}; + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -51,9 +55,22 @@ struct wb_writeback_work { enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ + struct wb_completion *done; /* set if the caller waits */ }; +/* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. + */ +#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ + struct wb_completion cmpl = { \ + .cnt = ATOMIC_INIT(1), \ + } + + /* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's @@ -161,17 +178,34 @@ static void wb_queue_work(struct bdi_writeback *wb, trace_writeback_queue(wb->bdi, work); spin_lock_bh(&wb->work_lock); - if (!test_bit(WB_registered, &wb->state)) { - if (work->done) - complete(work->done); + if (!test_bit(WB_registered, &wb->state)) goto out_unlock; - } + if (work->done) + atomic_inc(&work->done->cnt); list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: spin_unlock_bh(&wb->work_lock); } +/** + * wb_wait_for_completion - wait for completion of bdi_writeback_works + * @bdi: bdi work items were issued to + * @done: target wb_completion + * + * Wait for one or more work items issued to @bdi with their ->done field + * set to @done, which should have been defined with + * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such + * work items are completed. Work items which are waited upon aren't freed + * automatically on completion. + */ +static void wb_wait_for_completion(struct backing_dev_info *bdi, + struct wb_completion *done) +{ + atomic_dec(&done->cnt); /* put down the initial count */ + wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); +} + #ifdef CONFIG_CGROUP_WRITEBACK /** @@ -1143,7 +1177,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { - struct completion *done = work->done; + struct wb_completion *done = work->done; trace_writeback_exec(wb->bdi, work); @@ -1151,8 +1185,8 @@ static long wb_do_writeback(struct bdi_writeback *wb) if (work->auto_free) kfree(work); - if (done) - complete(done); + if (done && atomic_dec_and_test(&done->cnt)) + wake_up_all(&wb->bdi->wb_waitq); } /* @@ -1518,7 +1552,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, @@ -1533,7 +1567,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); wb_queue_work(&bdi->wb, &work); - wait_for_completion(&done); + wb_wait_for_completion(bdi, &done); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1600,7 +1634,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -1618,7 +1652,7 @@ void sync_inodes_sb(struct super_block *sb) WARN_ON(!rwsem_is_locked(&sb->s_umount)); wb_queue_work(&bdi->wb, &work); - wait_for_completion(&done); + wb_wait_for_completion(bdi, &done); wait_sb_inodes(sb); } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8c857d723023..97a92fa0cdb5 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -155,6 +155,8 @@ struct backing_dev_info { struct rb_root cgwb_congested_tree; /* their congested states */ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #endif + wait_queue_head_t wb_waitq; + struct device *dev; struct timer_list laptop_mode_wb_timer; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d2f16fc9069a..ad5608d01e8c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -768,6 +768,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); + init_waitqueue_head(&bdi->wb_waitq); err = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (err) -- cgit v1.2.3 From 0d960a383ae7aa791b2833e122ba7519d264cf92 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:19 -0400 Subject: writeback: clean up wb_dirty_limit() The function name wb_dirty_limit(), its argument @dirty and the local variable @wb_dirty are mortally confusing given that the function calculates per-wb threshold value not dirty pages, especially given that @dirty and @wb_dirty are used elsewhere for dirty pages. Let's rename the function to wb_calc_thresh() and wb_dirty to wb_thresh. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 2 +- include/linux/writeback.h | 2 +- mm/backing-dev.c | 6 +++--- mm/page-writeback.c | 30 +++++++++++++++--------------- 4 files changed, 20 insertions(+), 20 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 881ea5d97c00..b1b3b8184500 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1081,7 +1081,7 @@ static bool over_bground_thresh(struct bdi_writeback *wb) global_page_state(NR_UNSTABLE_NFS) > background_thresh) return true; - if (wb_stat(wb, WB_RECLAIMABLE) > wb_dirty_limit(wb, background_thresh)) + if (wb_stat(wb, WB_RECLAIMABLE) > wb_calc_thresh(wb, background_thresh)) return true; return false; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 23af355d5471..0435c85d4cfa 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -155,7 +155,7 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty); +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void __wb_update_bandwidth(struct bdi_writeback *wb, unsigned long thresh, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index ad5608d01e8c..9c8b7b5a0eee 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -49,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; - unsigned long bdi_thresh; + unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; @@ -67,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); - bdi_thresh = wb_dirty_limit(wb, dirty_thresh); + wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -87,7 +87,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "state: %10lx\n", (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), - K(bdi_thresh), + K(wb_thresh), K(dirty_thresh), K(background_thresh), (unsigned long) K(wb_stat(wb, WB_DIRTIED)), diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 70cf98dc3423..c7745a7fe11e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -556,7 +556,7 @@ static unsigned long hard_dirty_limit(unsigned long thresh) } /** - * wb_dirty_limit - @wb's share of dirty throttling threshold + * wb_calc_thresh - @wb's share of dirty throttling threshold * @wb: bdi_writeback to query * @dirty: global dirty limit in pages * @@ -577,28 +577,28 @@ static unsigned long hard_dirty_limit(unsigned long thresh) * The wb's share of dirty limit will be adapting to its throughput and * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. */ -unsigned long wb_dirty_limit(struct bdi_writeback *wb, unsigned long dirty) +unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh) { - u64 wb_dirty; + u64 wb_thresh; long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; /* - * Calculate this BDI's share of the dirty ratio. + * Calculate this BDI's share of the thresh ratio. */ wb_writeout_fraction(wb, &numerator, &denominator); - wb_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - wb_dirty *= numerator; - do_div(wb_dirty, denominator); + wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh *= numerator; + do_div(wb_thresh, denominator); wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); - wb_dirty += (dirty * wb_min_ratio) / 100; - if (wb_dirty > (dirty * wb_max_ratio) / 100) - wb_dirty = dirty * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / 100; + if (wb_thresh > (thresh * wb_max_ratio) / 100) + wb_thresh = thresh * wb_max_ratio / 100; - return wb_dirty; + return wb_thresh; } /* @@ -750,7 +750,7 @@ static unsigned long wb_position_ratio(struct bdi_writeback *wb, * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global * limits are set by default to 10% and 20% (background and throttle). * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages. - * wb_dirty_limit(wb, bg_thresh) is about ~4K pages. wb_setpoint is + * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is * about ~6K pages (as the average of background and throttle wb * limits). The 3rd order polynomial will provide positive feedback if * wb_dirty is under wb_setpoint and vice versa. @@ -1115,7 +1115,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, * * We rampup dirty_ratelimit forcibly if wb_dirty is low because * it's possible that wb_thresh is close to zero due to inactivity - * of backing device (see the implementation of wb_dirty_limit()). + * of backing device (see the implementation of wb_calc_thresh()). */ if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { dirty = wb_dirty; @@ -1123,7 +1123,7 @@ static void wb_update_dirty_ratelimit(struct bdi_writeback *wb, setpoint = wb_dirty + 1; else setpoint = (wb_thresh + - wb_dirty_limit(wb, bg_thresh)) / 2; + wb_calc_thresh(wb, bg_thresh)) / 2; } if (dirty < setpoint) { @@ -1352,7 +1352,7 @@ static inline void wb_dirty_limits(struct bdi_writeback *wb, * wb_position_ratio() will let the dirtier task progress * at some rate <= (write_bw / 2) for bringing down wb_dirty. */ - *wb_thresh = wb_dirty_limit(wb, dirty_thresh); + *wb_thresh = wb_calc_thresh(wb, dirty_thresh); if (wb_bg_thresh) *wb_bg_thresh = dirty_thresh ? div_u64((u64)*wb_thresh * -- cgit v1.2.3 From 841710aa6e4acd066ab9fe8c8cb6f4e4e6709d83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 22 May 2015 18:23:33 -0400 Subject: writeback: implement memcg wb_domain Dirtyable memory is distributed to a wb (bdi_writeback) according to the relative bandwidth the wb is writing out in the whole system. This distribution is global - each wb is measured against all other wb's and gets the proportinately sized portion of the memory in the whole system. For cgroup writeback, the amount of dirtyable memory is scoped by memcg and thus each wb would need to be measured and controlled in its memcg. IOW, a wb will belong to two writeback domains - the global and memcg domains. The previous patches laid the groundwork to support the two wb_domains and this patch implements memcg wb_domain. memcg->cgwb_domain is initialized on css online and destroyed on css release, wb->memcg_completions is added, and __wb_writeout_inc() is updated to increment completions against both global and memcg wb_domains. The following patches will update balance_dirty_pages() and its subroutines to actually consider memcg wb_domain for throttling. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- include/linux/backing-dev-defs.h | 1 + include/linux/memcontrol.h | 12 +++++++++++- include/linux/writeback.h | 3 +++ mm/backing-dev.c | 9 ++++++++- mm/memcontrol.c | 39 +++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 25 +++++++++++++++++++++++++ 6 files changed, 87 insertions(+), 2 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 97a92fa0cdb5..8d470b73824f 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -118,6 +118,7 @@ struct bdi_writeback { #ifdef CONFIG_CGROUP_WRITEBACK struct percpu_ref refcnt; /* used only for !root wb's */ + struct fprop_local_percpu memcg_completions; struct cgroup_subsys_state *memcg_css; /* the associated memcg */ struct cgroup_subsys_state *blkcg_css; /* and blkcg */ struct list_head memcg_node; /* anchored at memcg->cgwb_list */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 662a953ea8ad..e3177bed23ea 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -389,8 +389,18 @@ enum { }; #ifdef CONFIG_CGROUP_WRITEBACK + struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); -#endif +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index b57c2786b5aa..04a3786c456f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -167,6 +167,9 @@ static inline void laptop_sync_completion(void) { } void throttle_vm_writeout(gfp_t gfp_mask); bool zone_dirty_ok(struct zone *zone); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom); +#endif extern struct wb_domain global_wb_domain; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 9c8b7b5a0eee..84ebf7c8d006 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -482,6 +482,7 @@ static void cgwb_release_workfn(struct work_struct *work) css_put(wb->blkcg_css); wb_congested_put(wb->congested); + fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); @@ -548,9 +549,13 @@ static int cgwb_create(struct backing_dev_info *bdi, if (ret) goto err_wb_exit; + ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); + if (ret) + goto err_ref_exit; + wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); if (!wb->congested) - goto err_ref_exit; + goto err_fprop_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; @@ -587,6 +592,8 @@ static int cgwb_create(struct backing_dev_info *bdi, err_put_congested: wb_congested_put(wb->congested); +err_fprop_exit: + fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 701cbee9acba..ce113ddf2fb5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -345,6 +345,7 @@ struct mem_cgroup { #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; + struct wb_domain cgwb_domain; #endif /* List of events which userspace want to receive */ @@ -3994,6 +3995,37 @@ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) return &memcg->cgwb_list; } +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return wb_domain_init(&memcg->cgwb_domain, gfp); +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ + wb_domain_exit(&memcg->cgwb_domain); +} + +struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + + if (!memcg->css.parent) + return NULL; + + return &memcg->cgwb_domain; +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) +{ + return 0; +} + +static void memcg_wb_domain_exit(struct mem_cgroup *memcg) +{ +} + #endif /* CONFIG_CGROUP_WRITEBACK */ /* @@ -4380,9 +4412,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void) memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!memcg->stat) goto out_free; + + if (memcg_wb_domain_init(memcg, GFP_KERNEL)) + goto out_free_stat; + spin_lock_init(&memcg->pcp_counter_lock); return memcg; +out_free_stat: + free_percpu(memcg->stat); out_free: kfree(memcg); return NULL; @@ -4409,6 +4447,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_zone_info(memcg, node); free_percpu(memcg->stat); + memcg_wb_domain_exit(memcg); kfree(memcg); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a7ba5cee950b..a146e3389e78 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -171,6 +171,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m return mdtc->gdtc; } +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return &wb->memcg_completions; +} + static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { @@ -213,6 +218,11 @@ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *m return NULL; } +static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb) +{ + return NULL; +} + static void wb_min_max_ratio(struct bdi_writeback *wb, unsigned long *minp, unsigned long *maxp) { @@ -530,9 +540,16 @@ static void wb_domain_writeout_inc(struct wb_domain *dom, */ static inline void __wb_writeout_inc(struct bdi_writeback *wb) { + struct wb_domain *cgdom; + __inc_wb_stat(wb, WB_WRITTEN); wb_domain_writeout_inc(&global_wb_domain, &wb->completions, wb->bdi->max_prop_frac); + + cgdom = mem_cgroup_wb_domain(wb); + if (cgdom) + wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb), + wb->bdi->max_prop_frac); } void wb_writeout_inc(struct bdi_writeback *wb) @@ -583,6 +600,14 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) return fprop_global_init(&dom->completions, gfp); } +#ifdef CONFIG_CGROUP_WRITEBACK +void wb_domain_exit(struct wb_domain *dom) +{ + del_timer_sync(&dom->period_timer); + fprop_global_destroy(&dom->completions); +} +#endif + /* * bdi_min_ratio keeps the sum of the minimum dirty shares of all * registered backing devices, which, for obvious reasons, can not -- cgit v1.2.3 From 21c6321fbb3a3787af07f1bc031d713a707fb69c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 28 May 2015 14:50:49 -0400 Subject: writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb() Currently, majority of cgroup writeback support including all the above functions are implemented in include/linux/backing-dev.h and mm/backing-dev.c; however, the portion closely related to writeback logic implemented in include/linux/writeback.h and mm/page-writeback.c will expand to support foreign writeback detection and correction. This patch moves wb[_try]_get() and wb_put() to include/linux/backing-dev-defs.h so that they can be used from writeback.h and inode_{attach|detach}_wb() to writeback.h and page-writeback.c. This is pure reorganization and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Jens Axboe Cc: Jan Kara Cc: Wu Fengguang Cc: Greg Thelen Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 31 +++++++++++++++ include/linux/backing-dev-defs.h | 50 ++++++++++++++++++++++++ include/linux/backing-dev.h | 82 ---------------------------------------- include/linux/writeback.h | 46 ++++++++++++++++++++++ mm/backing-dev.c | 30 --------------- 5 files changed, 127 insertions(+), 112 deletions(-) (limited to 'mm/backing-dev.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index da355879ba7c..cf6ccfb01e03 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "internal.h" /* @@ -213,6 +214,36 @@ static void wb_wait_for_completion(struct backing_dev_info *bdi, #ifdef CONFIG_CGROUP_WRITEBACK +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8d470b73824f..e047b496a0b9 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -186,4 +186,54 @@ static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync) set_wb_congested(bdi->wb.congested, sync); } +#ifdef CONFIG_CGROUP_WRITEBACK + +/** + * wb_tryget - try to increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + return percpu_ref_tryget(&wb->refcnt); + return true; +} + +/** + * wb_get - increment a wb's refcount + * @wb: bdi_writeback to get + */ +static inline void wb_get(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_get(&wb->refcnt); +} + +/** + * wb_put - decrement a wb's refcount + * @wb: bdi_writeback to put + */ +static inline void wb_put(struct bdi_writeback *wb) +{ + if (wb != &wb->bdi->wb) + percpu_ref_put(&wb->refcnt); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline bool wb_tryget(struct bdi_writeback *wb) +{ + return true; +} + +static inline void wb_get(struct bdi_writeback *wb) +{ +} + +static inline void wb_put(struct bdi_writeback *wb) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + #endif /* __LINUX_BACKING_DEV_DEFS_H */ diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e9d7373f5f93..5c978a924157 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -243,7 +243,6 @@ void wb_congested_put(struct bdi_writeback_congested *congested); struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp); -void __inode_attach_wb(struct inode *inode, struct page *page); void wb_memcg_offline(struct mem_cgroup *memcg); void wb_blkcg_offline(struct blkcg *blkcg); int inode_congested(struct inode *inode, int cong_bits); @@ -264,37 +263,6 @@ static inline bool inode_cgwb_enabled(struct inode *inode) (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK); } -/** - * wb_tryget - try to increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - return percpu_ref_tryget(&wb->refcnt); - return true; -} - -/** - * wb_get - increment a wb's refcount - * @wb: bdi_writeback to get - */ -static inline void wb_get(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_get(&wb->refcnt); -} - -/** - * wb_put - decrement a wb's refcount - * @wb: bdi_writeback to put - */ -static inline void wb_put(struct bdi_writeback *wb) -{ - if (wb != &wb->bdi->wb) - percpu_ref_put(&wb->refcnt); -} - /** * wb_find_current - find wb for %current on a bdi * @bdi: bdi of interest @@ -353,35 +321,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return wb; } -/** - * inode_attach_wb - associate an inode with its wb - * @inode: inode of interest - * @page: page being dirtied (may be NULL) - * - * If @inode doesn't have its wb, associate it with the wb matching the - * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o - * @inode->i_lock. - */ -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ - if (!inode->i_wb) - __inode_attach_wb(inode, page); -} - -/** - * inode_detach_wb - disassociate an inode from its wb - * @inode: inode of interest - * - * @inode is being freed. Detach from its wb. - */ -static inline void inode_detach_wb(struct inode *inode) -{ - if (inode->i_wb) { - wb_put(inode->i_wb); - inode->i_wb = NULL; - } -} - /** * inode_to_wb - determine the wb of an inode * @inode: inode of interest @@ -471,19 +410,6 @@ static inline void wb_congested_put(struct bdi_writeback_congested *congested) { } -static inline bool wb_tryget(struct bdi_writeback *wb) -{ - return true; -} - -static inline void wb_get(struct bdi_writeback *wb) -{ -} - -static inline void wb_put(struct bdi_writeback *wb) -{ -} - static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi) { return &bdi->wb; @@ -495,14 +421,6 @@ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp) return &bdi->wb; } -static inline void inode_attach_wb(struct inode *inode, struct page *page) -{ -} - -static inline void inode_detach_wb(struct inode *inode) -{ -} - static inline struct bdi_writeback *inode_to_wb(struct inode *inode) { return &inode_to_bdi(inode)->wb; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3b73e97ecfc7..6726b7e56beb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, dirty_throttle_leaks); @@ -173,6 +174,51 @@ static inline void wait_on_inode(struct inode *inode) wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } +#ifdef CONFIG_CGROUP_WRITEBACK + +void __inode_attach_wb(struct inode *inode, struct page *page); + +/** + * inode_attach_wb - associate an inode with its wb + * @inode: inode of interest + * @page: page being dirtied (may be NULL) + * + * If @inode doesn't have its wb, associate it with the wb matching the + * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o + * @inode->i_lock. + */ +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ + if (!inode->i_wb) + __inode_attach_wb(inode, page); +} + +/** + * inode_detach_wb - disassociate an inode from its wb + * @inode: inode of interest + * + * @inode is being freed. Detach from its wb. + */ +static inline void inode_detach_wb(struct inode *inode) +{ + if (inode->i_wb) { + wb_put(inode->i_wb); + inode->i_wb = NULL; + } +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static inline void inode_attach_wb(struct inode *inode, struct page *page) +{ +} + +static inline void inode_detach_wb(struct inode *inode) +{ +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + /* * mm/page-writeback.c */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 84ebf7c8d006..887d72a85b5e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -660,36 +660,6 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return wb; } -void __inode_attach_wb(struct inode *inode, struct page *page) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - struct bdi_writeback *wb = NULL; - - if (inode_cgwb_enabled(inode)) { - struct cgroup_subsys_state *memcg_css; - - if (page) { - memcg_css = mem_cgroup_css_from_page(page); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - } else { - /* must pin memcg_css, see wb_get_create() */ - memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - css_put(memcg_css); - } - } - - if (!wb) - wb = &bdi->wb; - - /* - * There may be multiple instances of this function racing to - * update the same inode. Use cmpxchg() to tell the winner. - */ - if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) - wb_put(wb); -} - static void cgwb_bdi_init(struct backing_dev_info *bdi) { bdi->wb.memcg_css = mem_cgroup_root_css; -- cgit v1.2.3 From 5857cd637bc0d60dc7e37af396b01324f199d89b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 5 Jun 2015 06:20:51 +0900 Subject: bdi: fix wrong error return value in cgwb_create() On wb_congested_get_create() failure, cgwb_create() forgot to set @ret to -ENOMEM ending up returning 0. Fix it so that it returns -ENOMEM. Signed-off-by: Tejun Heo Reported-by: Dan Carpenter Signed-off-by: Jens Axboe --- mm/backing-dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm/backing-dev.c') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 887d72a85b5e..436bb53dd383 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -554,8 +554,10 @@ static int cgwb_create(struct backing_dev_info *bdi, goto err_ref_exit; wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp); - if (!wb->congested) + if (!wb->congested) { + ret = -ENOMEM; goto err_fprop_exit; + } wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; -- cgit v1.2.3