blkcg: implement per-blkg request allocation

Currently, request_queue has one request_list to allocate requests from regardless of blkcg of the IO being issued. When the unified request pool is used up, cfq proportional IO limits become meaningless - whoever grabs the next request being freed wins the race regardless of the configured weights. This can be easily demonstrated by creating a blkio cgroup w/ very low weight, put a program which can issue a lot of random direct IOs there and running a sequential IO from a different cgroup. As soon as the request pool is used up, the sequential IO bandwidth crashes. This patch implements per-blkg request_list. Each blkg has its own request_list and any IO allocates its request from the matching blkg making blkcgs completely isolated in terms of request allocation. * Root blkcg uses the request_list embedded in each request_queue, which was renamed to @q->root_rl from @q->rq. While making blkcg rl handling a bit harier, this enables avoiding most overhead for root blkcg. * Queue fullness is properly per request_list but bdi isn't blkcg aware yet, so congestion state currently just follows the root blkcg. As writeback isn't aware of blkcg yet, this works okay for async congestion but readahead may get the wrong signals. It's better than blkcg completely collapsing with shared request_list but needs to be improved with future changes. * After this change, each block cgroup gets a full request pool making resource consumption of each cgroup higher. This makes allowing non-root users to create cgroups less desirable; however, note that allowing non-root users to directly manage cgroups is already severely broken regardless of this patch - each block cgroup consumes kernel memory and skews IO weight (IO weights are not hierarchical). v2: queue-sysfs.txt updated and patch description udpated as suggested by Vivek. v3: blk_get_rl() wasn't checking error return from blkg_lookup_create() and may cause oops on lookup failure. Fix it by falling back to root_rl on blkg lookup failures. This problem was spotted by Rakesh Iyer <rni@google.com>. v4: Updated to accomodate 458f27a982 "block: Avoid missed wakeup in request waitqueue". blk_drain_queue() now wakes up waiters on all blkg->rl on the target queue. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Vivek Goyal <vgoyal@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
author: Tejun Heo <tj@kernel.org> 2012-06-26 15:05:44 -0700
committer: Jens Axboe <axboe@kernel.dk> 2012-06-26 18:42:49 -0400
commit: a051661ca6d134c18599498b185b667859d4339b (patch)
tree: 9d840030874aed9b97a58051bf9568455126e8e8 /block/blk-core.c
parent: 5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)
download: lwn-a051661ca6d134c18599498b185b667859d4339b.tar.gz
lwn-a051661ca6d134c18599498b185b667859d4339b.zip
1 files changed, 32 insertions, 10 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index f392a2edf462..dd134d834d58 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -416,9 +416,14 @@ void blk_drain_queue(struct request_queue *q, bool drain_all)
 	 * left with hung waiters. We need to wake up those waiters.
 	 */
 	if (q->request_fn) {
+		struct request_list *rl;
+
 		spin_lock_irq(q->queue_lock);
-		for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++)
-			wake_up_all(&q->rq.wait[i]);
+
+		blk_queue_for_each_rl(rl, q)
+			for (i = 0; i < ARRAY_SIZE(rl->wait); i++)
+				wake_up_all(&rl->wait[i]);
+
 		spin_unlock_irq(q->queue_lock);
 	}
 }
@@ -685,7 +690,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 	if (!q)
 		return NULL;
 
-	if (blk_init_rl(&q->rq, q, GFP_KERNEL))
+	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
 		return NULL;
 
 	q->request_fn		= rfn;
@@ -776,7 +781,12 @@ static void __freed_request(struct request_list *rl, int sync)
 {
 	struct request_queue *q = rl->q;
 
-	if (rl->count[sync] < queue_congestion_off_threshold(q))
+	/*
+	 * bdi isn't aware of blkcg yet.  As all async IOs end up root
+	 * blkcg anyway, just use root blkcg state.
+	 */
+	if (rl == &q->root_rl &&
+	    rl->count[sync] < queue_congestion_off_threshold(q))
 		blk_clear_queue_congested(q, sync);
 
 	if (rl->count[sync] + 1 <= q->nr_requests) {
@@ -897,7 +907,12 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 				}
 			}
 		}
-		blk_set_queue_congested(q, is_sync);
+		/*
+		 * bdi isn't aware of blkcg yet.  As all async IOs end up
+		 * root blkcg anyway, just use root blkcg state.
+		 */
+		if (rl == &q->root_rl)
+			blk_set_queue_congested(q, is_sync);
 	}
 
 	/*
@@ -939,6 +954,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
 		goto fail_alloc;
 
 	blk_rq_init(q, rq);
+	blk_rq_set_rl(rq, rl);
 	rq->cmd_flags = rw_flags | REQ_ALLOCED;
 
 	/* init elvpriv */
@@ -1032,15 +1048,19 @@ static struct request *get_request(struct request_queue *q, int rw_flags,
 {
 	const bool is_sync = rw_is_sync(rw_flags) != 0;
 	DEFINE_WAIT(wait);
-	struct request_list *rl = &q->rq;
+	struct request_list *rl;
 	struct request *rq;
+
+	rl = blk_get_rl(q, bio);	/* transferred to @rq on success */
 retry:
-	rq = __get_request(&q->rq, rw_flags, bio, gfp_mask);
+	rq = __get_request(rl, rw_flags, bio, gfp_mask);
 	if (rq)
 		return rq;
 
-	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q)))
+	if (!(gfp_mask & __GFP_WAIT) || unlikely(blk_queue_dead(q))) {
+		blk_put_rl(rl);
 		return NULL;
+	}
 
 	/* wait on @rl and retry */
 	prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
@@ -1231,12 +1251,14 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	 */
 	if (req->cmd_flags & REQ_ALLOCED) {
 		unsigned int flags = req->cmd_flags;
+		struct request_list *rl = blk_rq_rl(req);
 
 		BUG_ON(!list_empty(&req->queuelist));
 		BUG_ON(!hlist_unhashed(&req->hash));
 
-		blk_free_request(&q->rq, req);
-		freed_request(&q->rq, flags);
+		blk_free_request(rl, req);
+		freed_request(rl, flags);
+		blk_put_rl(rl);
 	}
 }
 EXPORT_SYMBOL_GPL(__blk_put_request);
author	Tejun Heo <tj@kernel.org>	2012-06-26 15:05:44 -0700
committer	Jens Axboe <axboe@kernel.dk>	2012-06-26 18:42:49 -0400
commit	a051661ca6d134c18599498b185b667859d4339b (patch)
tree	9d840030874aed9b97a58051bf9568455126e8e8 /block/blk-core.c
parent	5b788ce3e2acac9bf109743b1281d77347cf2101 (diff)
download	lwn-a051661ca6d134c18599498b185b667859d4339b.tar.gz lwn-a051661ca6d134c18599498b185b667859d4339b.zip