author     Kent Overstreet <kent.overstreet@gmail.com>   2022-08-22 13:23:47 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>   2023-10-22 17:09:41 -0400
commit     33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d (patch)
tree       6fff6e218b381e0fa2cd4580da3a2e919d18ccd8 /fs/bcachefs
parent     62448afee714354a26db8a0f3c644f58628f0792 (diff)
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy.
The btree iterator API provides an interface where the user doesn't need
to concern themselves with lock ordering - different btree iterators can
be traversed in any order. Without special care, this will lead to
deadlocks.
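
To make the hazard concrete, here's a hypothetical interleaving (an
illustration, not taken from this patch) of two transactions that end
up traversing the same two nodes in opposite orders:

  transaction A                  transaction B
  -------------                  -------------
  lock node X
                                 lock node Y
  lock node Y   <- blocks
                                 lock node X   <- blocks: ABBA deadlock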
Our previous strategy was to define a lock ordering internally, and
whenever we attempt to take a lock and trylock() fails, we'd check if
the current btree transaction is holding any locks that cause a lock
ordering violation. If so, we'd issue a transaction restart, and then
bch2_trans_begin() would re-traverse all previously used iterators, but
in the correct order.
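
Schematically, the old scheme looked something like this (a rough
sketch with hypothetical helper names, not the actual bcachefs code):

  /*
   * Old approach, roughly: locks have a global rank, and blocking is
   * only legal if we hold no lock that ranks after the one we want.
   */
  static int lock_ordered(struct btree_trans *trans, struct lock *l)
  {
          if (trylock(l))
                  return 0;
          if (trans_holds_lock_ranked_after(trans, l))
                  /* ordering violation: restart, re-traverse in order */
                  return trans_restart(trans);
          return lock_blocking(trans, l);
  }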
That approach had some issues, though.
- Sometimes we'd issue transaction restarts unnecessarily, when no
deadlock would have actually occurred. Lock ordering restarts have
become our primary cause of transaction restarts, on some workloads
totaling 20% of actual transaction commits.
- To avoid deadlock or livelock, we'd often have to take intent locks
when we only wanted a read lock: with the lock ordering approach, it
is actually illegal to hold _any_ read lock while blocking on an intent
lock, and this has been causing us unnecessary lock contention.
- It was getting fragile - the various lock ordering rules are not
trivial, and we'd been seeing occasional livelock issues related to
this machinery.
So, since bcachefs is already a relational database masquerading as a
filesystem, we're stealing the next traditional database technique and
switching to a cycle detector for avoiding deadlocks.
When we block taking a btree lock, after adding ourselves to the waitlist
but before sleeping, we do a DFS of btree transactions waiting on other
btree transactions, starting with the current transaction and walking
our held locks, and transactions blocking on our held locks.
If we find a cycle, we emit a transaction restart. Occasionally (e.g.
the btree split path) we cannot allow the lock() operation to fail, so
if necessary we'll tell another transaction that it has to fail.
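
Stripped of the btree-locking details, the walk implemented below by
check_for_deadlock() and lock_graph_descend() reduces to a DFS over a
wait-for graph. A self-contained sketch (simplified: no lock types, no
depth limit, no victim selection):

  #include <stdbool.h>
  #include <stddef.h>

  struct txn {
          /* transactions blocked on locks this transaction holds */
          struct txn      **waiters;
          size_t          nr_waiters;
  };

  /* true if some chain of waiters leads back to @start */
  static bool wait_graph_has_cycle(struct txn *start, struct txn *t)
  {
          size_t i;

          for (i = 0; i < t->nr_waiters; i++) {
                  struct txn *w = t->waiters[i];

                  if (w == start || wait_graph_has_cycle(start, w))
                          return true;    /* deadlock: someone must restart */
          }
          return false;
  }

The real implementation walks with an explicit stack (struct lock_graph,
bounded at 8 entries), takes a ref on each transaction it visits, and
picks a victim via break_cycle() rather than always restarting itself.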
Result: trans_restart_would_deadlock events are reduced by a factor of
10 to 100, and we'll be able to delete a whole bunch of grotty, fragile
code.
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Diffstat (limited to 'fs/bcachefs')
 fs/bcachefs/bcachefs_format.h |   3
 fs/bcachefs/btree_iter.c      |  11
 fs/bcachefs/btree_iter.h      |   7
 fs/bcachefs/btree_locking.c   | 246
 fs/bcachefs/btree_locking.h   |  65
 fs/bcachefs/btree_types.h     |  10
 fs/bcachefs/debug.c           |   6
 fs/bcachefs/errcode.h         |   1
 fs/bcachefs/trace.h           |   6
 9 files changed, 322 insertions, 33 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0e80fe2568f2..5471b797be93 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1400,7 +1400,8 @@ struct bch_sb_field_disk_groups {
 	x(trans_restart_key_cache_upgrade,	70)	\
 	x(trans_traverse_all,			71)	\
 	x(transaction_commit,			72)	\
-	x(write_super,				73)
+	x(write_super,				73)	\
+	x(trans_restart_would_deadlock_recursion_limit,	74)
 
 enum bch_persistent_counters {
 #define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index df9949cef907..5773b00e69ac 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2848,9 +2848,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
 	trans->fn		= fn;
 	trans->last_begin_time	= ktime_get_ns();
 	trans->fn_idx		= bch2_trans_get_fn_idx(trans, c, fn);
-	trans->task		= current;
+	trans->locking_wait.task = current;
 	trans->journal_replay_not_finished =
 		!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+	closure_init_stack(&trans->ref);
 
 	bch2_trans_alloc_paths(trans, c);
 
@@ -2877,7 +2878,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
 		mutex_lock(&c->btree_trans_lock);
 		list_for_each_entry(pos, &c->btree_trans_list, list) {
-			if (trans->task->pid < pos->task->pid) {
+			if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
 				list_add_tail(&trans->list, &pos->list);
 				goto list_add_done;
 			}
@@ -2919,6 +2920,8 @@ void bch2_trans_exit(struct btree_trans *trans)
 	bch2_trans_unlock(trans);
 
+	closure_sync(&trans->ref);
+
 	if (s)
 		s->max_mem = max(s->max_mem, trans->mem_max);
 
@@ -2997,7 +3000,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 	static char lock_types[] = { 'r', 'i', 'w' };
 	unsigned l;
 
-	prt_printf(out, "%i %s\n", trans->task->pid, trans->fn);
+	prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
 
 	trans_for_each_path(trans, path) {
 		if (!path->nodes_locked)
@@ -3029,7 +3032,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
 			   trans->locking_path_idx,
 			   path->cached ? 'c' : 'b',
 			   trans->locking_level,
-			   lock_types[trans->locking_lock_type],
+			   lock_types[trans->locking_wait.lock_want],
 			   bch2_btree_ids[trans->locking_btree_id]);
 		bch2_bpos_to_text(out, trans->locking_pos);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 2f47889c688a..04b6773d6e10 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -81,11 +81,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
 	return &trans->paths[idx];
 }
 
-#define trans_for_each_path(_trans, _path)				\
-	for (_path = __trans_next_path((_trans), 0);			\
+#define trans_for_each_path_from(_trans, _path, _start)			\
+	for (_path = __trans_next_path((_trans), _start);		\
 	     (_path);							\
 	     _path = __trans_next_path((_trans), (_path)->idx + 1))
 
+#define trans_for_each_path(_trans, _path)				\
+	trans_for_each_path_from(_trans, _path, 0)
+
 static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
 {
 	unsigned idx = path ? path->sorted_idx + 1 : 0;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 5b6d8184ea45..869f4163a3c6 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,
 
 /* lock */
 
-void __bch2_btree_node_lock_write(struct btree_trans *trans,
-				  struct btree_bkey_cached_common *b)
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+	struct btree_trans		*trans;
+	struct btree_bkey_cached_common	*node_want;
+	enum six_lock_type		lock_want;
+
+	/* for iterating over held locks :*/
+	u8				path_idx;
+	u8				level;
+	u64				lock_start_time;
+};
+
+struct lock_graph {
+	struct trans_waiting_for_lock	g[8];
+	unsigned			nr;
+};
+
+static void lock_graph_pop(struct lock_graph *g)
+{
+	closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+	int ret;
+
+	if (i == g->g) {
+		ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+	} else {
+		i->trans->lock_must_abort = true;
+		ret = 0;
+	}
+
+	for (i = g->g + 1; i < g->g + g->nr; i++)
+		wake_up_process(i->trans->locking_wait.task);
+	return ret;
+}
+
+static noinline int break_cycle(struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail ||
+		    i->trans->locking_wait.lock_want == SIX_LOCK_write)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail ||
+		    !i->trans->in_traverse_all)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->lock_may_not_fail)
+			continue;
+
+		return abort_lock(g, i);
+	}
+
+	BUG();
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
+{
+	struct btree_trans *orig_trans = g->g->trans;
+	struct trans_waiting_for_lock *i;
+	int ret = 0;
+
+	for (i = g->g; i < g->g + g->nr; i++) {
+		if (i->trans->locking != i->node_want)
+			while (g->g + g->nr >= i) {
+				lock_graph_pop(g);
+				return 0;
+			}
+
+		if (i->trans == trans) {
+			ret = break_cycle(g);
+			if (ret)
+				goto deadlock;
+			/*
+			 * If we didn't abort (instead telling another
+			 * transaction to abort), keep checking:
+			 */
+		}
+	}
+
+	if (g->nr == ARRAY_SIZE(g->g)) {
+		if (orig_trans->lock_may_not_fail)
+			return 0;
+
+		trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+		ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+		goto deadlock;
+	}
+
+	closure_get(&trans->ref);
+
+	g->g[g->nr++] = (struct trans_waiting_for_lock) {
+		.trans		= trans,
+		.node_want	= trans->locking,
+		.lock_want	= trans->locking_wait.lock_want,
+	};
+
+	return 0;
+deadlock:
+	while (g->nr)
+		lock_graph_pop(g);
+	return ret;
+}
+
+#if 0
+static void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	prt_str(out, "Found lock cycle:");
+	prt_newline(out);
+
+	for (i = g->g; i < g->g + g->nr; i++)
+		bch2_btree_trans_to_text(out, i->trans);
+}
+#endif
+
+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+	struct trans_waiting_for_lock *i;
+
+	for (i = g->g + 1; i < g->g + g->nr; i++)
+		if (i->trans->locking != i->node_want ||
+		    i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+			while (g->g + g->nr >= i)
+				lock_graph_pop(g);
+			return;
+		}
+	BUG();
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+	return t1 + t2 > 1;
+}
+
+static int check_for_deadlock(struct btree_trans *trans)
+{
+	struct lock_graph g;
+	struct trans_waiting_for_lock *top;
+	struct btree_bkey_cached_common *b;
+	struct btree_path *path;
+	int ret;
+
+	if (trans->lock_must_abort)
+		return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+
+	g.nr = 0;
+	ret = lock_graph_descend(&g, trans);
+	BUG_ON(ret);
+next:
+	if (!g.nr)
+		return 0;
+
+	top = &g.g[g.nr - 1];
+
+	trans_for_each_path_from(top->trans, path, top->path_idx) {
+		if (!path->nodes_locked)
+			continue;
+
+		if (top->path_idx != path->idx) {
+			top->path_idx		= path->idx;
+			top->level		= 0;
+			top->lock_start_time	= 0;
+		}
+
+		for (;
+		     top->level < BTREE_MAX_DEPTH;
+		     top->level++, top->lock_start_time = 0) {
+			int lock_held = btree_node_locked_type(path, top->level);
+
+			if (lock_held == BTREE_NODE_UNLOCKED)
+				continue;
+
+			b = &READ_ONCE(path->l[top->level].b)->c;
+
+			if (unlikely(IS_ERR_OR_NULL(b))) {
+				lock_graph_remove_non_waiters(&g);
+				goto next;
+			}
+
+			if (list_empty_careful(&b->lock.wait_list))
+				continue;
+
+			raw_spin_lock(&b->lock.wait_lock);
+			list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+				BUG_ON(b != trans->locking);
+
+				if (top->lock_start_time &&
+				    time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+					continue;
+
+				top->lock_start_time = trans->locking_wait.start_time;
+
+				/* Don't check for self deadlock: */
+				if (trans == top->trans ||
+				    !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+					continue;
+
+				ret = lock_graph_descend(&g, trans);
+				raw_spin_unlock(&b->lock.wait_lock);
+
+				if (ret)
+					return ret < 0 ? ret : 0;
+				goto next;
+
+			}
+			raw_spin_unlock(&b->lock.wait_lock);
+		}
+	}
+
+	lock_graph_pop(&g);
+	goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+	struct btree_trans *trans = p;
+
+	return check_for_deadlock(trans);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans,
+				 struct btree_bkey_cached_common *b,
+				 bool lock_may_not_fail)
 {
 	int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+	int ret;
 
 	/*
 	 * Must drop our read locks before calling six_lock_write() -
@@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
 	 * locked:
 	 */
 	six_lock_readers_add(&b->lock, -readers);
-	btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+	ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
 	six_lock_readers_add(&b->lock, readers);
+
+	return ret;
 }
 
 static inline bool path_has_read_locks(struct btree_path *path)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index aea2ebafffd8..874dd4428b3a 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
 void bch2_btree_node_unlock_write(struct btree_trans *, struct btree_path *,
 				  struct btree *);
 
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
 /* lock: */
 
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+					   struct btree_bkey_cached_common *b,
+					   enum six_lock_type type,
+					   bool lock_may_not_fail)
+{
+	int ret;
+
+	trans->lock_may_not_fail = lock_may_not_fail;
+	trans->locking = b;
+	trans->lock_must_abort = false;
+
+	ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+				   bch2_six_check_for_deadlock, trans);
+	WRITE_ONCE(trans->locking, NULL);
+	WRITE_ONCE(trans->locking_wait.start_time, 0);
+	return ret;
+}
+
 static inline int __must_check
 btree_node_lock_nopath(struct btree_trans *trans,
 		       struct btree_bkey_cached_common *b,
 		       enum six_lock_type type)
 {
-	six_lock_type(&b->lock, type, NULL, NULL);
-	return 0;
+	return __btree_node_lock_nopath(trans, b, type, false);
 }
 
 static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
 						 struct btree_bkey_cached_common *b,
 						 enum six_lock_type type)
 {
-	int ret = btree_node_lock_nopath(trans, b, type);
+	int ret = __btree_node_lock_nopath(trans, b, type, true);
 
 	BUG_ON(ret);
 }
@@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
 			       enum six_lock_type type,
 			       six_lock_should_sleep_fn should_sleep_fn, void *p)
 {
-	int ret;
-
 	if (six_trylock_type(&b->lock, type))
 		return 0;
 
@@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
 	trans->locking_pos	= pos;
 	trans->locking_btree_id	= path->btree_id;
 	trans->locking_level	= level;
-	trans->locking_lock_type = type;
+	trans->lock_may_not_fail = false;
 	trans->locking		= b;
-	ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
-	trans->locking = NULL;
-	return ret;
+	return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+				    bch2_six_check_for_deadlock, trans);
 }
 
 /*
@@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
 	return ret;
 }
 
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
 
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
-						     struct btree_path *path,
-						     struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+					  struct btree_path *path,
+					  struct btree_bkey_cached_common *b,
+					  bool lock_may_not_fail)
 {
+	int ret;
+
 	EBUG_ON(&path->l[b->level].b->c != b);
 	EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
 	EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 	 */
 	mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
 
-	if (unlikely(!six_trylock_write(&b->lock)))
-		__bch2_btree_node_lock_write(trans, b);
+	ret = likely(six_trylock_write(&b->lock))
+		? 0
+		: __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+	if (ret)
+		mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+	return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+						     struct btree_path *path,
+						     struct btree_bkey_cached_common *b)
+{
+	int ret = __btree_node_lock_write(trans, path, b, true);
+
+	BUG_ON(ret);
 }
 
 static inline int __must_check
@@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
 			   struct btree_path *path,
 			   struct btree_bkey_cached_common *b)
 {
-	bch2_btree_node_lock_write_nofail(trans, path, b);
-	return 0;
+	return __btree_node_lock_write(trans, path, b, false);
 }
 
 /* relock: */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 0a3854b614e0..578cf8fa3d2f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -387,15 +387,19 @@ struct btree_trans_commit_hook {
 struct btree_trans {
 	struct bch_fs		*c;
 	const char		*fn;
+	struct closure		ref;
 	struct list_head	list;
 	u64			last_begin_time;
 
-	struct btree_bkey_cached_common *locking;
+	unsigned		locking_path_idx;
 	struct bpos		locking_pos;
 	u8			locking_btree_id;
 	u8			locking_level;
-	u8			locking_lock_type;
-	struct task_struct	*task;
+	u8			lock_may_not_fail;
+	u8			lock_must_abort;
+	struct btree_bkey_cached_common *locking;
+	struct six_lock_waiter	locking_wait;
+
 	int			srcu_idx;
 
 	u8			fn_idx;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 4fe20d36212e..6944dfef5bcb 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 	mutex_lock(&c->btree_trans_lock);
 	list_for_each_entry(trans, &c->btree_trans_list, list) {
-		if (trans->task->pid <= i->iter)
+		if (trans->locking_wait.task->pid <= i->iter)
 			continue;
 
 		ret = flush_buf(i);
@@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
 		prt_printf(&i->buf, "backtrace:");
 		prt_newline(&i->buf);
 		printbuf_indent_add(&i->buf, 2);
-		prt_backtrace(&i->buf, trans->task);
+		prt_backtrace(&i->buf, trans->locking_wait.task);
 		printbuf_indent_sub(&i->buf, 2);
 		prt_newline(&i->buf);
 
-		i->iter = trans->task->pid;
+		i->iter = trans->locking_wait.task->pid;
 	}
 
 	mutex_unlock(&c->btree_trans_lock);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 3dc477eb3600..1ea004f1adbb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -35,6 +35,7 @@
 	x(BCH_ERR_transaction_restart,	transaction_restart_in_traverse_all)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_would_deadlock_write)\
+	x(BCH_ERR_transaction_restart,	transaction_restart_deadlock_recursion_limit)\
 	x(BCH_ERR_transaction_restart,	transaction_restart_upgrade)		\
 	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_upgrade)	\
 	x(BCH_ERR_transaction_restart,	transaction_restart_key_cache_fill)	\
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 62de89fcb74b..35c40678f4b5 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1065,6 +1065,12 @@ TRACE_EVENT(trans_restart_would_deadlock,
 		  __entry->want_pos_snapshot)
 );
 
+DEFINE_EVENT(transaction_event,	trans_restart_would_deadlock_recursion_limit,
+	TP_PROTO(struct btree_trans *trans,
+		 unsigned long caller_ip),
+	TP_ARGS(trans, caller_ip)
+);
+
 TRACE_EVENT(trans_restart_would_deadlock_write,
 	TP_PROTO(struct btree_trans *trans),
 	TP_ARGS(trans),