From a50c3af910e06f35bc0c68f89d8fef98c0fec0ea Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paul.mckenney@linaro.org>
Date: Tue, 10 Jan 2012 17:52:31 -0800
Subject: rcu: Don't make callbacks go through second full grace period

RCU's current CPU-offline code path dumps all of the outgoing CPU's
callbacks onto the RCU_NEXT_TAIL portion of the surviving CPU's
callback list.  This means that all the ready-to-invoke callbacks from
the outgoing CPU must wait for another full RCU grace period.  This was
just fine when CPU-hotplug events were rare, but there is increasing
evidence that users are planning to make increasing use of CPU hotplug.

Therefore, this commit changes the callback-dumping procedure so that
callbacks that are ready to invoke are moved to the RCU_DONE_TAIL
portion of the surviving CPU's callback list.  This avoids running
these callbacks through a second unnecessary grace period.

Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcutree.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

(limited to 'kernel/rcutree.c')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ac3a810d2db7..7789e666394d 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1270,24 +1270,64 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
 	struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu);
 	struct rcu_node *rnp = rdp->mynode; /* For dying CPU. */
 
-	/* Move callbacks to some other CPU. */
+	/* First, adjust the counts. */
+	if (rdp->nxtlist != NULL) {
+		receive_rdp->qlen_lazy += rdp->qlen_lazy;
+		receive_rdp->qlen += rdp->qlen;
+		rdp->qlen_lazy = 0;
+		rdp->qlen = 0;
+	}
+
+	/*
+	 * Next, move ready-to-invoke callbacks to be invoked on some
+	 * other CPU.  These will not be required to pass through another
+	 * grace period:  They are done, regardless of CPU.
+	 */
+	if (rdp->nxtlist != NULL &&
+	    rdp->nxttail[RCU_DONE_TAIL] != &rdp->nxtlist) {
+		struct rcu_head *oldhead;
+		struct rcu_head **oldtail;
+		struct rcu_head **newtail;
+
+		oldhead = rdp->nxtlist;
+		oldtail = receive_rdp->nxttail[RCU_DONE_TAIL];
+		rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
+		*rdp->nxttail[RCU_DONE_TAIL] = *oldtail;
+		*receive_rdp->nxttail[RCU_DONE_TAIL] = oldhead;
+		newtail = rdp->nxttail[RCU_DONE_TAIL];
+		for (i = RCU_DONE_TAIL; i < RCU_NEXT_SIZE; i++) {
+			if (receive_rdp->nxttail[i] == oldtail)
+				receive_rdp->nxttail[i] = newtail;
+			if (rdp->nxttail[i] == newtail)
+				rdp->nxttail[i] = &rdp->nxtlist;
+		}
+	}
+
+	/*
+	 * Finally, put the rest of the callbacks at the end of the list.
+	 * The ones that made it partway through get to start over:  We
+	 * cannot assume that grace periods are synchronized across CPUs.
+	 * (We could splice RCU_WAIT_TAIL into RCU_NEXT_READY_TAIL, but
+	 * this does not seem compelling.  Not yet, anyway.)
+	 */
 	if (rdp->nxtlist != NULL) {
 		*receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
 		receive_rdp->nxttail[RCU_NEXT_TAIL] =
 				rdp->nxttail[RCU_NEXT_TAIL];
-		receive_rdp->qlen_lazy += rdp->qlen_lazy;
-		receive_rdp->qlen += rdp->qlen;
 		receive_rdp->n_cbs_adopted += rdp->qlen;
 		rdp->n_cbs_orphaned += rdp->qlen;
 
 		rdp->nxtlist = NULL;
 		for (i = 0; i < RCU_NEXT_SIZE; i++)
 			rdp->nxttail[i] = &rdp->nxtlist;
-		rdp->qlen_lazy = 0;
-		rdp->qlen = 0;
 	}
 
-	/* Record a quiescent state for the dying CPU. */
+	/*
+	 * Record a quiescent state for the dying CPU.  This is safe
+	 * only because we have already cleared out the callbacks.
+	 * (Otherwise, the RCU core might try to schedule the invocation
+	 * of callbacks on this now-offline CPU, which would be bad.)
+	 */
 	mask = rdp->grpmask;	/* rnp->grplo is constant. */
 	trace_rcu_grace_period(rsp->name,
 			       rnp->gpnum + 1 - !!(rnp->qsmask & mask),
-- 
cgit v1.2.3