summaryrefslogtreecommitdiff
path: root/include/linux/backing-dev-defs.h
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2015-05-22 17:13:37 -0400
committerJens Axboe <axboe@fb.com>2015-06-02 08:33:35 -0600
commit52ebea749aaed195245701a8f90a23d672c7a933 (patch)
tree5fac5f82f94b64a9cbcb65a99f4cddf4b13456e2 /include/linux/backing-dev-defs.h
parent89e9b9e07a390c50980d10aa37a04631db5a23ab (diff)
downloadlwn-52ebea749aaed195245701a8f90a23d672c7a933.tar.gz
lwn-52ebea749aaed195245701a8f90a23d672c7a933.zip
writeback: make backing_dev_info host cgroup-specific bdi_writebacks
For the planned cgroup writeback support, on each bdi (backing_dev_info), each memcg will be served by a separate wb (bdi_writeback). This patch updates bdi so that a bdi can host multiple wbs (bdi_writebacks). On the default hierarchy, blkcg implicitly enables memcg. This allows using memcg's page ownership for attributing writeback IOs, and every memcg - blkcg combination can be served by its own wb by assigning a dedicated wb to each memcg. This means that there may be multiple wb's of a bdi mapped to the same blkcg. As congested state is per blkcg - bdi combination, those wb's should share the same congested state. This is achieved by tracking congested state via bdi_writeback_congested structs which are keyed by blkcg. bdi->wb remains unchanged and will keep serving the root cgroup. cgwb's (cgroup wb's) for non-root cgroups are created on-demand or looked up while dirtying an inode according to the memcg of the page being dirtied or current task. Each cgwb is indexed on bdi->cgwb_tree by its memcg id. Once an inode is associated with its wb, it can be retrieved using inode_to_wb(). Currently, none of the filesystems has FS_CGROUP_WRITEBACK and all pages will keep being associated with bdi->wb. v3: inode_attach_wb() in account_page_dirtied() moved inside mapping_cap_account_dirty() block where it's known to be !NULL. Also, an unnecessary NULL check before kfree() removed. Both detected by the kbuild bot. v2: Updated so that wb association is per inode and wb is per memcg rather than blkcg. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: kbuild test robot <fengguang.wu@intel.com> Cc: Dan Carpenter <dan.carpenter@oracle.com> Cc: Jens Axboe <axboe@kernel.dk> Cc: Jan Kara <jack@suse.cz> Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'include/linux/backing-dev-defs.h')
-rw-r--r--include/linux/backing-dev-defs.h59
1 files changed, 56 insertions, 3 deletions
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index 9e9eafa5f5aa..a1e9c407a59a 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -2,8 +2,11 @@
#define __LINUX_BACKING_DEV_DEFS_H
#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
#include <linux/flex_proportions.h>
#include <linux/timer.h>
#include <linux/workqueue.h>
@@ -37,10 +40,43 @@ enum wb_stat_item {
#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg. Those
+ * wb's can operate mostly independently but should share the congested
+ * state. To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
struct bdi_writeback_congested {
unsigned long state; /* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct backing_dev_info *bdi; /* the associated bdi */
+ atomic_t refcnt; /* nr of attached wb's and blkg */
+ int blkcg_id; /* ID of the associated blkcg */
+ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */
+#endif
};
+/*
+ * Each wb (bdi_writeback) can perform writeback operations, is measured
+ * and throttled, independently. Without cgroup writeback, each bdi
+ * (bdi_writeback) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg. This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer upto
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg. As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
struct bdi_writeback {
struct backing_dev_info *bdi; /* our parent bdi */
@@ -78,6 +114,19 @@ struct bdi_writeback {
spinlock_t work_lock; /* protects work_list & dwork scheduling */
struct list_head work_list;
struct delayed_work dwork; /* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct percpu_ref refcnt; /* used only for !root wb's */
+ struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+ struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+ struct list_head memcg_node; /* anchored at memcg->cgwb_list */
+ struct list_head blkcg_node; /* anchored at blkcg->cgwb_list */
+
+ union {
+ struct work_struct release_work;
+ struct rcu_head rcu;
+ };
+#endif
};
struct backing_dev_info {
@@ -92,9 +141,13 @@ struct backing_dev_info {
unsigned int min_ratio;
unsigned int max_ratio, max_prop_frac;
- struct bdi_writeback wb; /* default writeback info for this bdi */
- struct bdi_writeback_congested wb_congested;
-
+ struct bdi_writeback wb; /* the root writeback info for this bdi */
+ struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+ struct rb_root cgwb_congested_tree; /* their congested states */
+ atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */
+#endif
struct device *dev;
struct timer_list laptop_mode_wb_timer;