summaryrefslogtreecommitdiff
path: root/fs/bcachefs
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2022-08-22 13:23:47 -0400
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:41 -0400
commit33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d (patch)
tree6fff6e218b381e0fa2cd4580da3a2e919d18ccd8 /fs/bcachefs
parent62448afee714354a26db8a0f3c644f58628f0792 (diff)
downloadlwn-33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d.tar.gz
lwn-33bd5d068603f9e81e0b73dbe50e9b88b2e56d0d.zip
bcachefs: Deadlock cycle detector
We've outgrown our own deadlock avoidance strategy. The btree iterator API provides an interface where the user doesn't need to concern themselves with lock ordering - different btree iterators can be traversed in any order. Without special care, this will lead to deadlocks. Our previous strategy was to define a lock ordering internally, and whenever we attempt to take a lock and trylock() fails, we'd check if the current btree transaction is holding any locks that cause a lock ordering violation. If so, we'd issue a transaction restart, and then bch2_trans_begin() would re-traverse all previously used iterators, but in the correct order. That approach had some issues, though. - Sometimes we'd issue transaction restarts unnecessarily, when no deadlock would have actually occured. Lock ordering restarts have become our primary cause of transaction restarts, on some workloads totally 20% of actual transaction commits. - To avoid deadlock or livelock, we'd often have to take intent locks when we only wanted a read lock: with the lock ordering approach, it is actually illegal to hold _any_ read lock while blocking on an intent lock, and this has been causing us unnecessary lock contention. - It was getting fragile - the various lock ordering rules are not trivial, and we'd been seeing occasional livelock issues related to this machinery. So, since bcachefs is already a relational database masquerading as a filesystem, we're stealing the next traditional database technique and switching to a cycle detector for avoiding deadlocks. When we block taking a btree lock, after adding ourself to the waitlist but before sleeping, we do a DFS of btree transactions waiting on other btree transactions, starting with the current transaction and walking our held locks, and transactions blocking on our held locks. If we find a cycle, we emit a transaction restart. Occasionally (e.g. the btree split path) we can not allow the lock() operation to fail, so if necessary we'll tell another transaction that it has to fail. Result: trans_restart_would_deadlock events are reduced by a factor of 10 to 100, and we'll be able to delete a whole bunch of grotty, fragile code. Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--fs/bcachefs/bcachefs_format.h3
-rw-r--r--fs/bcachefs/btree_iter.c11
-rw-r--r--fs/bcachefs/btree_iter.h7
-rw-r--r--fs/bcachefs/btree_locking.c246
-rw-r--r--fs/bcachefs/btree_locking.h65
-rw-r--r--fs/bcachefs/btree_types.h10
-rw-r--r--fs/bcachefs/debug.c6
-rw-r--r--fs/bcachefs/errcode.h1
-rw-r--r--fs/bcachefs/trace.h6
9 files changed, 322 insertions, 33 deletions
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0e80fe2568f2..5471b797be93 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -1400,7 +1400,8 @@ struct bch_sb_field_disk_groups {
x(trans_restart_key_cache_upgrade, 70) \
x(trans_traverse_all, 71) \
x(transaction_commit, 72) \
- x(write_super, 73)
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74)
enum bch_persistent_counters {
#define x(t, n, ...) BCH_COUNTER_##t,
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index df9949cef907..5773b00e69ac 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -2848,9 +2848,10 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
trans->fn = fn;
trans->last_begin_time = ktime_get_ns();
trans->fn_idx = bch2_trans_get_fn_idx(trans, c, fn);
- trans->task = current;
+ trans->locking_wait.task = current;
trans->journal_replay_not_finished =
!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
+ closure_init_stack(&trans->ref);
bch2_trans_alloc_paths(trans, c);
@@ -2877,7 +2878,7 @@ void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, const char *
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) {
- if (trans->task->pid < pos->task->pid) {
+ if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
list_add_tail(&trans->list, &pos->list);
goto list_add_done;
}
@@ -2919,6 +2920,8 @@ void bch2_trans_exit(struct btree_trans *trans)
bch2_trans_unlock(trans);
+ closure_sync(&trans->ref);
+
if (s)
s->max_mem = max(s->max_mem, trans->mem_max);
@@ -2997,7 +3000,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
static char lock_types[] = { 'r', 'i', 'w' };
unsigned l;
- prt_printf(out, "%i %s\n", trans->task->pid, trans->fn);
+ prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
trans_for_each_path(trans, path) {
if (!path->nodes_locked)
@@ -3029,7 +3032,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
trans->locking_path_idx,
path->cached ? 'c' : 'b',
trans->locking_level,
- lock_types[trans->locking_lock_type],
+ lock_types[trans->locking_wait.lock_want],
bch2_btree_ids[trans->locking_btree_id]);
bch2_bpos_to_text(out, trans->locking_pos);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 2f47889c688a..04b6773d6e10 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -81,11 +81,14 @@ __trans_next_path(struct btree_trans *trans, unsigned idx)
return &trans->paths[idx];
}
-#define trans_for_each_path(_trans, _path) \
- for (_path = __trans_next_path((_trans), 0); \
+#define trans_for_each_path_from(_trans, _path, _start) \
+ for (_path = __trans_next_path((_trans), _start); \
(_path); \
_path = __trans_next_path((_trans), (_path)->idx + 1))
+#define trans_for_each_path(_trans, _path) \
+ trans_for_each_path_from(_trans, _path, 0)
+
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
unsigned idx = path ? path->sorted_idx + 1 : 0;
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 5b6d8184ea45..869f4163a3c6 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -52,10 +52,248 @@ void bch2_btree_node_unlock_write(struct btree_trans *trans,
/* lock */
-void __bch2_btree_node_lock_write(struct btree_trans *trans,
- struct btree_bkey_cached_common *b)
+/*
+ * @trans wants to lock @b with type @type
+ */
+struct trans_waiting_for_lock {
+ struct btree_trans *trans;
+ struct btree_bkey_cached_common *node_want;
+ enum six_lock_type lock_want;
+
+ /* for iterating over held locks :*/
+ u8 path_idx;
+ u8 level;
+ u64 lock_start_time;
+};
+
+struct lock_graph {
+ struct trans_waiting_for_lock g[8];
+ unsigned nr;
+};
+
+static void lock_graph_pop(struct lock_graph *g)
+{
+ closure_put(&g->g[--g->nr].trans->ref);
+}
+
+static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
+{
+ int ret;
+
+ if (i == g->g) {
+ ret = btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
+ } else {
+ i->trans->lock_must_abort = true;
+ ret = 0;
+ }
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ wake_up_process(i->trans->locking_wait.task);
+ return ret;
+}
+
+static noinline int break_cycle(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ if (i->trans->lock_may_not_fail ||
+ i->trans->locking_wait.lock_want == SIX_LOCK_write)
+ continue;
+
+ return abort_lock(g, i);
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ if (i->trans->lock_may_not_fail ||
+ !i->trans->in_traverse_all)
+ continue;
+
+ return abort_lock(g, i);
+ }
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ if (i->trans->lock_may_not_fail)
+ continue;
+
+ return abort_lock(g, i);
+ }
+
+ BUG();
+}
+
+static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct btree_trans *orig_trans = g->g->trans;
+ struct trans_waiting_for_lock *i;
+ int ret = 0;
+
+ for (i = g->g; i < g->g + g->nr; i++) {
+ if (i->trans->locking != i->node_want)
+ while (g->g + g->nr >= i) {
+ lock_graph_pop(g);
+ return 0;
+ }
+
+ if (i->trans == trans) {
+ ret = break_cycle(g);
+ if (ret)
+ goto deadlock;
+ /*
+ * If we didn't abort (instead telling another
+ * transaction to abort), keep checking:
+ */
+ }
+ }
+
+ if (g->nr == ARRAY_SIZE(g->g)) {
+ if (orig_trans->lock_may_not_fail)
+ return 0;
+
+ trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_);
+ ret = btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit);
+ goto deadlock;
+ }
+
+ closure_get(&trans->ref);
+
+ g->g[g->nr++] = (struct trans_waiting_for_lock) {
+ .trans = trans,
+ .node_want = trans->locking,
+ .lock_want = trans->locking_wait.lock_want,
+ };
+
+ return 0;
+deadlock:
+ while (g->nr)
+ lock_graph_pop(g);
+ return ret;
+}
+
+#if 0
+static void print_cycle(struct printbuf *out, struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ prt_str(out, "Found lock cycle:");
+ prt_newline(out);
+
+ for (i = g->g; i < g->g + g->nr; i++)
+ bch2_btree_trans_to_text(out, i->trans);
+}
+#endif
+
+static noinline void lock_graph_remove_non_waiters(struct lock_graph *g)
+{
+ struct trans_waiting_for_lock *i;
+
+ for (i = g->g + 1; i < g->g + g->nr; i++)
+ if (i->trans->locking != i->node_want ||
+ i->trans->locking_wait.start_time != i[-1].lock_start_time) {
+ while (g->g + g->nr >= i)
+ lock_graph_pop(g);
+ return;
+ }
+ BUG();
+}
+
+static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2)
+{
+ return t1 + t2 > 1;
+}
+
+static int check_for_deadlock(struct btree_trans *trans)
+{
+ struct lock_graph g;
+ struct trans_waiting_for_lock *top;
+ struct btree_bkey_cached_common *b;
+ struct btree_path *path;
+ int ret;
+
+ if (trans->lock_must_abort)
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
+
+ g.nr = 0;
+ ret = lock_graph_descend(&g, trans);
+ BUG_ON(ret);
+next:
+ if (!g.nr)
+ return 0;
+
+ top = &g.g[g.nr - 1];
+
+ trans_for_each_path_from(top->trans, path, top->path_idx) {
+ if (!path->nodes_locked)
+ continue;
+
+ if (top->path_idx != path->idx) {
+ top->path_idx = path->idx;
+ top->level = 0;
+ top->lock_start_time = 0;
+ }
+
+ for (;
+ top->level < BTREE_MAX_DEPTH;
+ top->level++, top->lock_start_time = 0) {
+ int lock_held = btree_node_locked_type(path, top->level);
+
+ if (lock_held == BTREE_NODE_UNLOCKED)
+ continue;
+
+ b = &READ_ONCE(path->l[top->level].b)->c;
+
+ if (unlikely(IS_ERR_OR_NULL(b))) {
+ lock_graph_remove_non_waiters(&g);
+ goto next;
+ }
+
+ if (list_empty_careful(&b->lock.wait_list))
+ continue;
+
+ raw_spin_lock(&b->lock.wait_lock);
+ list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) {
+ BUG_ON(b != trans->locking);
+
+ if (top->lock_start_time &&
+ time_after_eq64(top->lock_start_time, trans->locking_wait.start_time))
+ continue;
+
+ top->lock_start_time = trans->locking_wait.start_time;
+
+ /* Don't check for self deadlock: */
+ if (trans == top->trans ||
+ !lock_type_conflicts(lock_held, trans->locking_wait.lock_want))
+ continue;
+
+ ret = lock_graph_descend(&g, trans);
+ raw_spin_unlock(&b->lock.wait_lock);
+
+ if (ret)
+ return ret < 0 ? ret : 0;
+ goto next;
+
+ }
+ raw_spin_unlock(&b->lock.wait_lock);
+ }
+ }
+
+ lock_graph_pop(&g);
+ goto next;
+}
+
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
+{
+ struct btree_trans *trans = p;
+
+ return check_for_deadlock(trans);
+}
+
+int __bch2_btree_node_lock_write(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read];
+ int ret;
/*
* Must drop our read locks before calling six_lock_write() -
@@ -64,8 +302,10 @@ void __bch2_btree_node_lock_write(struct btree_trans *trans,
* locked:
*/
six_lock_readers_add(&b->lock, -readers);
- btree_node_lock_nopath_nofail(trans, b, SIX_LOCK_write);
+ ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, lock_may_not_fail);
six_lock_readers_add(&b->lock, readers);
+
+ return ret;
}
static inline bool path_has_read_locks(struct btree_path *path)
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index aea2ebafffd8..874dd4428b3a 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -183,22 +183,41 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
void bch2_btree_node_unlock_write(struct btree_trans *,
struct btree_path *, struct btree *);
+int bch2_six_check_for_deadlock(struct six_lock *lock, void *p);
+
/* lock: */
+static inline int __btree_node_lock_nopath(struct btree_trans *trans,
+ struct btree_bkey_cached_common *b,
+ enum six_lock_type type,
+ bool lock_may_not_fail)
+{
+ int ret;
+
+ trans->lock_may_not_fail = lock_may_not_fail;
+ trans->locking = b;
+ trans->lock_must_abort = false;
+
+ ret = six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
+ WRITE_ONCE(trans->locking, NULL);
+ WRITE_ONCE(trans->locking_wait.start_time, 0);
+ return ret;
+}
+
static inline int __must_check
btree_node_lock_nopath(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- six_lock_type(&b->lock, type, NULL, NULL);
- return 0;
+ return __btree_node_lock_nopath(trans, b, type, false);
}
static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b,
enum six_lock_type type)
{
- int ret = btree_node_lock_nopath(trans, b, type);
+ int ret = __btree_node_lock_nopath(trans, b, type, true);
BUG_ON(ret);
}
@@ -210,8 +229,6 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
enum six_lock_type type,
six_lock_should_sleep_fn should_sleep_fn, void *p)
{
- int ret;
-
if (six_trylock_type(&b->lock, type))
return 0;
@@ -219,11 +236,10 @@ static inline int btree_node_lock_type(struct btree_trans *trans,
trans->locking_pos = pos;
trans->locking_btree_id = path->btree_id;
trans->locking_level = level;
- trans->locking_lock_type = type;
+ trans->lock_may_not_fail = false;
trans->locking = b;
- ret = six_lock_type(&b->lock, type, should_sleep_fn, p);
- trans->locking = NULL;
- return ret;
+ return six_lock_type_waiter(&b->lock, type, &trans->locking_wait,
+ bch2_six_check_for_deadlock, trans);
}
/*
@@ -279,12 +295,15 @@ static inline int btree_node_lock(struct btree_trans *trans,
return ret;
}
-void __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *);
+int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_bkey_cached_common *, bool);
-static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
- struct btree_path *path,
- struct btree_bkey_cached_common *b)
+static inline int __btree_node_lock_write(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b,
+ bool lock_may_not_fail)
{
+ int ret;
+
EBUG_ON(&path->l[b->level].b->c != b);
EBUG_ON(path->l[b->level].lock_seq != b->lock.state.seq);
EBUG_ON(!btree_node_intent_locked(path, b->level));
@@ -296,8 +315,21 @@ static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
*/
mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write);
- if (unlikely(!six_trylock_write(&b->lock)))
- __bch2_btree_node_lock_write(trans, b);
+ ret = likely(six_trylock_write(&b->lock))
+ ? 0
+ : __bch2_btree_node_lock_write(trans, b, lock_may_not_fail);
+ if (ret)
+ mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent);
+
+ return ret;
+}
+
+static inline void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree_bkey_cached_common *b)
+{
+ int ret = __btree_node_lock_write(trans, path, b, true);
+ BUG_ON(ret);
}
static inline int __must_check
@@ -305,8 +337,7 @@ bch2_btree_node_lock_write(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
- bch2_btree_node_lock_write_nofail(trans, path, b);
- return 0;
+ return __btree_node_lock_write(trans, path, b, false);
}
/* relock: */
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 0a3854b614e0..578cf8fa3d2f 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -387,15 +387,19 @@ struct btree_trans_commit_hook {
struct btree_trans {
struct bch_fs *c;
const char *fn;
+ struct closure ref;
struct list_head list;
u64 last_begin_time;
- struct btree_bkey_cached_common *locking;
+
unsigned locking_path_idx;
struct bpos locking_pos;
u8 locking_btree_id;
u8 locking_level;
- u8 locking_lock_type;
- struct task_struct *task;
+ u8 lock_may_not_fail;
+ u8 lock_must_abort;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+
int srcu_idx;
u8 fn_idx;
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 4fe20d36212e..6944dfef5bcb 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -534,7 +534,7 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
mutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->task->pid <= i->iter)
+ if (trans->locking_wait.task->pid <= i->iter)
continue;
ret = flush_buf(i);
@@ -546,11 +546,11 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- prt_backtrace(&i->buf, trans->task);
+ prt_backtrace(&i->buf, trans->locking_wait.task);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = trans->task->pid;
+ i->iter = trans->locking_wait.task->pid;
}
mutex_unlock(&c->btree_trans_lock);
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 3dc477eb3600..1ea004f1adbb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -35,6 +35,7 @@
x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \
x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock_write)\
+ x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\
x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \
x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 62de89fcb74b..35c40678f4b5 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1065,6 +1065,12 @@ TRACE_EVENT(trans_restart_would_deadlock,
__entry->want_pos_snapshot)
);
+DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
+ TP_PROTO(struct btree_trans *trans,
+ unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip)
+);
+
TRACE_EVENT(trans_restart_would_deadlock_write,
TP_PROTO(struct btree_trans *trans),
TP_ARGS(trans),