summaryrefslogtreecommitdiff
path: root/fs/bcachefs/btree_locking.h
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-08-22 13:23:47 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:41 -0400
commit33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d (patch)
tree6fff6e218b381e0fa2cd4580da3a2e919d18ccd8 /fs/bcachefs/btree_locking.h
parent62448afee714354a26db8a0f3c644f58628f0792 (diff)
downloadlwn-33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d.tar.gz
lwn-33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d.zip
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy. The btree iterator API provides an interface where the user doesn't need to concern themselves with lock ordering - different btree iterators can be traversed in any order. Without special care, this will lead to deadlocks. Our previous strategy was to define a lock ordering internally, and whenever we attempt to take a lock and trylock() fails, we'd check if the current btree transaction is holding any locks that cause a lock ordering violation. If so, we'd issue a transaction restart, and then bch2_trans_begin() would re-traverse all previously used iterators, but in the correct order. That approach had some issues, though. - Sometimes we'd issue transaction restarts unnecessarily, when no deadlock would have actually occured. Lock ordering restarts have become our primary cause of transaction restarts, on some workloads totally 20% of actual transaction commits. - To avoid deadlock or livelock, we'd often have to take intent locks when we only wanted a read lock: with the lock ordering approach, it is actually illegal to hold _any_ read lock while blocking on an intent lock, and this has been causing us unnecessary lock contention. - It was getting fragile - the various lock ordering rules are not trivial, and we'd been seeing occasional livelock issues related to this machinery. So, since bcachefs is already a relational database masquerading as a filesystem, we're stealing the next traditional database technique and switching to a cycle detector for avoiding deadlocks. When we block taking a btree lock, after adding ourself to the waitlist but before sleeping, we do a DFS of btree transactions waiting on other btree transactions, starting with the current transaction and walking our held locks, and transactions blocking on our held locks. If we find a cycle, we emit a transaction restart. Occasionally (e.g. the btree split path) we can not allow the lock() operation to fail, so if necessary we'll tell another transaction that it has to fail. Result: trans_restart_would_deadlock events are reduced by a factor of 10 to 100, and we'll be able to delete a whole bunch of grotty, fragile code. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Diffstat (limited to 'fs/bcachefs/btree_locking.h')
-rw-r--r--fs/bcachefs/btree_locking.h65
1 files changed, 48 insertions, 17 deletions
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index aea2ebafffd8..874dd4428b3a 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
void bch2_btree_node_unlock_write(struct btree_trans *,
struct btree_path *, struct btree *);
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
/* lock: */
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail)
+{
+ int ret;
+
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->locking = b;
+ trans->lock_must_abort = false;
+
+ ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+ return ret;
+}
+
static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- six_lock_type(&b->lock, type, NULL, NULL);
- return 0;
+ return __btree_node_lock_nopath(trans, b, type, false);
}
static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- int ret = btree_node_lock_nopath(trans, b, type);
+ int ret = __btree_node_lock_nopath(trans, b, type, true);
BUG_ON(ret);
}
@@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- int ret;
-
if (six_trylock_type(&b->lock, type))
return 0;
@@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
- trans->locking_lock_type = type;
+ trans->lock_may_not_fail = false;
trans->locking = b;
- ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
- trans->locking = NULL;
- return ret;
+ return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
}
/*
@@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
return ret;
}
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
+ int ret;
+
EBUG_ON(&path->l[b->level].b->c != b);
EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
*/
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
- if (unlikely(!six_trylock_write(&b->lock)))
- __bch2_btree_node_lock_write(trans, b);
+ ret = likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+ return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ int ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
}
static inline int __must_check
@@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- bch2_btree_node_lock_write_nofail(trans, path, b);
- return 0;
+ return __btree_node_lock_write(trans, path, b, false);
}
/* relock: */