rhashtable: don't hold lock on first table throughout insertion.

rhashtable_try_insert() currently holds a lock on the bucket in the first table, while also locking buckets in subsequent tables. This is unnecessary and looks like a hold-over from some earlier version of the implementation. As insert and remove always lock a bucket in each table in turn, and as insert only inserts in the final table, there cannot be any races that are not covered by simply locking a bucket in each table in turn. When an insert call reaches that last table it can be sure that there is no matchinf entry in any other table as it has searched them all, and insertion never happens anywhere but in the last table. The fact that code tests for the existence of future_tbl while holding a lock on the relevant bucket ensures that two threads inserting the same key will make compatible decisions about which is the "last" table. This simplifies the code and allows the ->rehash field to be discarded. We still need a way to ensure that a dead bucket_table is never re-linked by rhashtable_walk_stop(). This can be achieved by calling call_rcu() inside the locked region, and checking with rcu_head_after_call_rcu() in rhashtable_walk_stop() to see if the bucket table is empty and dead. Acked-by: Herbert Xu <herbert@gondor.apana.org.au> Reviewed-by: Paul E. McKenney <paulmck@linux.ibm.com> Signed-off-by: NeilBrown <neilb@suse.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: NeilBrown <neilb@suse.com> 2019-03-21 14:42:40 +1100
committer: David S. Miller <davem@davemloft.net> 2019-03-21 14:01:10 -0700
commit: 4feb7c7a4fbb8f63371be31cda79433c7cf3da86 (patch)
tree: 634aa8dd79c50984306f29738f2a19287a97013d /lib/rhashtable.c
parent: 83b038db255d83a32c15fd003f9a921204b9215a (diff)
download: lwn-4feb7c7a4fbb8f63371be31cda79433c7cf3da86.tar.gz
lwn-4feb7c7a4fbb8f63371be31cda79433c7cf3da86.zip
1 files changed, 16 insertions, 36 deletions
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index 0a105d4af166..776b3a82d3a1 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -197,6 +197,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
 		return NULL;
 	}
 
+	rcu_head_init(&tbl->rcu);
 	INIT_LIST_HEAD(&tbl->walkers);
 
 	tbl->hash_rnd = get_random_u32();
@@ -280,10 +281,9 @@ static int rhashtable_rehash_chain(struct rhashtable *ht,
 	while (!(err = rhashtable_rehash_one(ht, old_hash)))
 		;
 
-	if (err == -ENOENT) {
-		old_tbl->rehash++;
+	if (err == -ENOENT)
 		err = 0;
-	}
+
 	spin_unlock_bh(old_bucket_lock);
 
 	return err;
@@ -330,13 +330,16 @@ static int rhashtable_rehash_table(struct rhashtable *ht)
 	spin_lock(&ht->lock);
 	list_for_each_entry(walker, &old_tbl->walkers, list)
 		walker->tbl = NULL;
-	spin_unlock(&ht->lock);
 
 	/* Wait for readers. All new readers will see the new
 	 * table, and thus no references to the old table will
 	 * remain.
+	 * We do this inside the locked region so that
+	 * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
+	 * to check if it should not re-link the table.
 	 */
 	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
+	spin_unlock(&ht->lock);
 
 	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
 }
@@ -578,46 +581,22 @@ static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
 	struct bucket_table *new_tbl;
 	struct bucket_table *tbl;
 	unsigned int hash;
-	spinlock_t *lock;
 	void *data;
 
-	tbl = rcu_dereference(ht->tbl);
-
-	/* All insertions must grab the oldest table containing
-	 * the hashed bucket that is yet to be rehashed.
-	 */
-	for (;;) {
-		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		lock = rht_bucket_lock(tbl, hash);
-		spin_lock_bh(lock);
-
-		if (tbl->rehash <= hash)
-			break;
-
-		spin_unlock_bh(lock);
-		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
-	}
-
-	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
-	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
-	if (PTR_ERR(new_tbl) != -EEXIST)
-		data = ERR_CAST(new_tbl);
+	new_tbl = rcu_dereference(ht->tbl);
 
-	while (!IS_ERR_OR_NULL(new_tbl)) {
+	do {
 		tbl = new_tbl;
 		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
-		spin_lock_nested(rht_bucket_lock(tbl, hash),
-				 SINGLE_DEPTH_NESTING);
+		spin_lock_bh(rht_bucket_lock(tbl, hash));
 
 		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
 		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
 		if (PTR_ERR(new_tbl) != -EEXIST)
 			data = ERR_CAST(new_tbl);
 
-		spin_unlock(rht_bucket_lock(tbl, hash));
-	}
-
-	spin_unlock_bh(lock);
+		spin_unlock_bh(rht_bucket_lock(tbl, hash));
+	} while (!IS_ERR_OR_NULL(new_tbl));
 
 	if (PTR_ERR(data) == -EAGAIN)
 		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
@@ -939,10 +918,11 @@ void rhashtable_walk_stop(struct rhashtable_iter *iter)
 	ht = iter->ht;
 
 	spin_lock(&ht->lock);
-	if (tbl->rehash < tbl->size)
-		list_add(&iter->walker.list, &tbl->walkers);
-	else
+	if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
+		/* This bucket table is being freed, don't re-link it. */
 		iter->walker.tbl = NULL;
+	else
+		list_add(&iter->walker.list, &tbl->walkers);
 	spin_unlock(&ht->lock);
 
 out:
author	NeilBrown <neilb@suse.com>	2019-03-21 14:42:40 +1100
committer	David S. Miller <davem@davemloft.net>	2019-03-21 14:01:10 -0700
commit	4feb7c7a4fbb8f63371be31cda79433c7cf3da86 (patch)
tree	634aa8dd79c50984306f29738f2a19287a97013d /lib/rhashtable.c
parent	83b038db255d83a32c15fd003f9a921204b9215a (diff)
download	lwn-4feb7c7a4fbb8f63371be31cda79433c7cf3da86.tar.gz lwn-4feb7c7a4fbb8f63371be31cda79433c7cf3da86.zip