diff options
author | Kent Overstreet <kent.overstreet@gmail.com> | 2022-03-14 21:48:42 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2023-10-22 17:09:29 -0400 |
commit | 31f63fd1244d9609265eb5cfc522c142b35cdacc (patch) | |
tree | ba7e780ac34fe7ce6f6647b0a47f4da301b26d83 | |
parent | d905f67ec89fda758bcfa70d0b5c3d3006bbdb3e (diff) | |
download | lwn-31f63fd1244d9609265eb5cfc522c142b35cdacc.tar.gz lwn-31f63fd1244d9609265eb5cfc522c142b35cdacc.zip |
bcachefs: Introduce a separate journal watermark for copygc
Journal reclaim flushes the btree key cache, and that flushing may require
allocating new btree nodes, so journal reclaim implicitly depends on copygc
to make forward progress. We should therefore avoid blocking copygc
unless the journal is really close to full.
This replaces the journal's single MAY_GET_UNRESERVED bit with watermarks,
adds a watermark for copygc, and plumbs it through.
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- | fs/bcachefs/alloc_background.c | 1 | ||||
-rw-r--r-- | fs/bcachefs/btree_key_cache.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/btree_update.h | 7 | ||||
-rw-r--r-- | fs/bcachefs/btree_update_interior.c | 9 | ||||
-rw-r--r-- | fs/bcachefs/btree_update_leaf.c | 12 | ||||
-rw-r--r-- | fs/bcachefs/journal.c | 51 | ||||
-rw-r--r-- | fs/bcachefs/journal.h | 53 | ||||
-rw-r--r-- | fs/bcachefs/journal_reclaim.c | 8 | ||||
-rw-r--r-- | fs/bcachefs/journal_types.h | 41 | ||||
-rw-r--r-- | fs/bcachefs/movinggc.c | 2 | ||||
-rw-r--r-- | fs/bcachefs/recovery.c | 5 |
11 files changed, 108 insertions, 83 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index a53aeb4ee648..33b2e4d7da3b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -670,7 +670,6 @@ static int bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, ret = bch2_trans_do(c, NULL, &commit_seq, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| flags, bucket_invalidate_btree(&trans, ca, b, &u)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 7e41552a57df..f856dee0c3aa 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -421,7 +421,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE| (ck->journal.seq == journal_last_seq(j) - ? BTREE_INSERT_JOURNAL_RESERVED + ? JOURNAL_WATERMARK_reserved : 0)| commit_flags); if (ret) { diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 3cf4cc4f2350..ad13b0739a68 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -16,12 +16,12 @@ bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); enum btree_insert_flags { - __BTREE_INSERT_NOFAIL, + /* First two bits for journal watermark: */ + __BTREE_INSERT_NOFAIL = 2, __BTREE_INSERT_NOCHECK_RW, __BTREE_INSERT_LAZY_RW, __BTREE_INSERT_USE_RESERVE, __BTREE_INSERT_JOURNAL_REPLAY, - __BTREE_INSERT_JOURNAL_RESERVED, __BTREE_INSERT_JOURNAL_RECLAIM, __BTREE_INSERT_NOWAIT, __BTREE_INSERT_GC_LOCK_HELD, @@ -41,9 +41,6 @@ enum btree_insert_flags { /* Insert is for journal replay - don't get journal reservations: */ #define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -/* Indicates that we have pre-reserved space in the journal: */ -#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) - /* Insert is being called from journal reclaim path: */ 
#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 1c53f965539d..cd4332f891dc 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -599,7 +599,7 @@ static void btree_update_nodes_written(struct btree_update *as) BTREE_INSERT_NOFAIL| BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED, + JOURNAL_WATERMARK_reserved, btree_update_nodes_written_trans(&trans, as)); bch2_trans_exit(&trans); @@ -964,14 +964,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ? BCH_DISK_RESERVATION_NOFAIL : 0; unsigned nr_nodes[2]; unsigned update_level = level; - int journal_flags = 0; + int journal_flags = flags & JOURNAL_WATERMARK_MASK; int ret = 0; BUG_ON(!path->should_be_locked); - if (flags & BTREE_INSERT_JOURNAL_RESERVED) - journal_flags |= JOURNAL_RES_GET_RESERVED; - closure_init_stack(&cl); retry: nr_nodes[0] = nr_nodes[1] = 0; @@ -1972,7 +1969,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, BTREE_INSERT_NOCHECK_RW| BTREE_INSERT_USE_RESERVE| BTREE_INSERT_JOURNAL_RECLAIM| - BTREE_INSERT_JOURNAL_RESERVED); + JOURNAL_WATERMARK_reserved); if (ret) goto err; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index f534d7e649fd..90e6e5130672 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -296,11 +296,10 @@ static inline int bch2_trans_journal_res_get(struct btree_trans *trans, struct bch_fs *c = trans->c; int ret; - if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - flags |= JOURNAL_RES_GET_RESERVED; - ret = bch2_journal_res_get(&c->journal, &trans->journal_res, - trans->journal_u64s, flags); + trans->journal_u64s, + flags| + (trans->flags & JOURNAL_WATERMARK_MASK)); return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; } @@ -902,8 +901,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, ret = bch2_journal_preres_get(&c->journal, &trans->journal_preres, trans->journal_preres_u64s, JOURNAL_RES_GET_NONBLOCK| - ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED) - ? JOURNAL_RES_GET_RESERVED : 0)); + (trans->flags & JOURNAL_WATERMARK_MASK)); if (unlikely(ret == -EAGAIN)) ret = bch2_trans_journal_preres_get_cold(trans, trans->journal_preres_u64s, trace_ip); @@ -988,7 +986,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, bch2_trans_unlock(trans); if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) && - !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED)) { + !(trans->flags & JOURNAL_WATERMARK_reserved)) { trans->restarted = true; ret = -EAGAIN; break; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 750509661d79..c7f1674ed596 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -19,6 +19,18 @@ #include "super-io.h" #include "trace.h" +#define x(n) #n, +static const char * const bch2_journal_watermarks[] = { + JOURNAL_WATERMARKS() + NULL +}; + +static const char * const bch2_journal_errors[] = { + JOURNAL_ERRORS() + NULL +}; +#undef x + static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -207,19 +219,19 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return cur_entry_blocked; + return JOURNAL_ERR_blocked; if (j->cur_entry_error) return j->cur_entry_error; if (bch2_journal_error(j)) - return cur_entry_insufficient_devices; /* -EROFS */ + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ if (!fifo_free(&j->pin)) - return cur_entry_journal_pin_full; + return JOURNAL_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return cur_entry_max_in_flight; + return JOURNAL_ERR_max_in_flight; BUG_ON(!j->cur_entry_sectors); @@ -238,7 +250,7 @@ static int 
journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= 0) - return cur_entry_journal_full; + return JOURNAL_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -354,13 +366,12 @@ retry: return 0; } - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) { /* * Don't want to close current journal entry, just need to * invoke reclaim: */ - ret = cur_entry_journal_full; + ret = JOURNAL_ERR_journal_full; goto unlock; } @@ -378,10 +389,10 @@ retry: __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); ret = journal_entry_open(j); - if (ret == cur_entry_max_in_flight) + if (ret == JOURNAL_ERR_max_in_flight) trace_journal_entry_full(c); unlock: - if ((ret && ret != cur_entry_insufficient_devices) && + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && !j->res_get_blocked_start) { j->res_get_blocked_start = local_clock() ?: 1; trace_journal_full(c); @@ -393,14 +404,15 @@ unlock: if (!ret) goto retry; - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !can_discard && !nr_unwritten_journal_entries(j) && - (flags & JOURNAL_RES_GET_RESERVED)) { + (flags & JOURNAL_WATERMARK_MASK) == JOURNAL_WATERMARK_reserved) { struct printbuf buf = PRINTBUF; - bch_err(c, "Journal stuck! Hava a pre-reservation but journal full"); + bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (ret %s)", + bch2_journal_errors[ret]); bch2_journal_debug_to_text(&buf, j); bch_err(c, "%s", buf.buf); @@ -418,8 +430,8 @@ unlock: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == cur_entry_journal_full || - ret == cur_entry_journal_pin_full) && + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -432,7 +444,7 @@ unlock: } } - return ret == cur_entry_insufficient_devices ? -EROFS : -EAGAIN; + return ret == JOURNAL_ERR_insufficient_devices ? -EROFS : -EAGAIN; } /* @@ -1187,13 +1199,14 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) rcu_read_lock(); s = READ_ONCE(j->reservations); - pr_buf(out, "dirty journal entries:\t%llu\n", fifo_used(&j->pin)); + pr_buf(out, "dirty journal entries:\t%llu/%llu\n",fifo_used(&j->pin), j->pin.size); pr_buf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); pr_buf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); pr_buf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); pr_buf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); pr_buf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); pr_buf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + pr_buf(out, "watermark:\t\t%s\n", bch2_journal_watermarks[j->watermark]); pr_buf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); pr_buf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); pr_buf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); @@ -1203,7 +1216,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) pr_buf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? 
jiffies_to_msecs(j->next_reclaim - jiffies) : 0); pr_buf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - pr_buf(out, "current entry error:\t%u\n", j->cur_entry_error); + pr_buf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); pr_buf(out, "current entry:\t\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 243349f4ac1c..c287ecf643aa 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -293,9 +293,9 @@ static inline void bch2_journal_res_put(struct journal *j, int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, unsigned); -#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -#define JOURNAL_RES_GET_CHECK (1 << 1) -#define JOURNAL_RES_GET_RESERVED (1 << 2) +/* First two bits for JOURNAL_WATERMARK: */ +#define JOURNAL_RES_GET_NONBLOCK (1 << 2) +#define JOURNAL_RES_GET_CHECK (1 << 3) static inline int journal_res_get_fast(struct journal *j, struct journal_res *res, @@ -316,8 +316,7 @@ static inline int journal_res_get_fast(struct journal *j, EBUG_ON(!journal_state_count(new, new.idx)); - if (!(flags & JOURNAL_RES_GET_RESERVED) && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) + if ((flags & JOURNAL_WATERMARK_MASK) < j->watermark) return 0; new.cur_entry_offset += res->u64s; @@ -370,23 +369,27 @@ out: /* journal_preres: */ -static inline bool journal_check_may_get_unreserved(struct journal *j) +static inline void journal_set_watermark(struct journal *j) { union journal_preres_state s = READ_ONCE(j->prereserved); - bool ret = s.reserved < s.remaining && - fifo_free(&j->pin) > j->pin.size / 4; - - lockdep_assert_held(&j->lock); - - if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - if (ret) { - set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - journal_wake(j); - } else { - clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); - } - } - return ret; + unsigned watermark = JOURNAL_WATERMARK_any; + + if (fifo_free(&j->pin) < j->pin.size / 4) + 
watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (fifo_free(&j->pin) < j->pin.size / 8) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (s.reserved > s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_copygc); + if (!s.remaining) + watermark = max_t(unsigned, watermark, JOURNAL_WATERMARK_reserved); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); } static inline void bch2_journal_preres_put(struct journal *j, @@ -406,12 +409,8 @@ static inline void bch2_journal_preres_put(struct journal *j, closure_wake_up(&j->preres_wait); } - if (s.reserved <= s.remaining && - !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { - spin_lock(&j->lock); - journal_check_may_get_unreserved(j); - spin_unlock(&j->lock); - } + if (s.reserved <= s.remaining && j->watermark) + journal_set_watermark(j); } int __bch2_journal_preres_get(struct journal *, @@ -432,7 +431,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j, old.v = new.v = v; ret = 0; - if ((flags & JOURNAL_RES_GET_RESERVED) || + if ((flags & JOURNAL_WATERMARK_reserved) || new.reserved + d < new.remaining) { new.reserved += d; ret = 1; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index f55fc0b11977..e99a01e3b5fb 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -195,7 +195,7 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < c->opts.metadata_replicas_required) { - ret = cur_entry_insufficient_devices; + ret = JOURNAL_ERR_insufficient_devices; goto out; } @@ -224,9 +224,9 @@ void bch2_journal_space_available(struct journal *j) bch2_fatal_error(c); spin_lock(&j->lock); - ret = cur_entry_journal_stuck; + ret = JOURNAL_ERR_journal_stuck; } else if (!j->space[journal_space_discarded].next_entry) - ret = cur_entry_journal_full; + ret = 
JOURNAL_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -245,7 +245,7 @@ out: j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; journal_set_remaining(j, u64s_remaining); - journal_check_may_get_unreserved(j); + journal_set_watermark(j); if (!ret) journal_wake(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 91f829adf862..a41b915b3ac6 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -144,16 +144,45 @@ enum journal_space_from { enum { JOURNAL_REPLAY_DONE, JOURNAL_STARTED, - JOURNAL_MAY_GET_UNRESERVED, JOURNAL_MAY_SKIP_FLUSH, }; +#define JOURNAL_WATERMARKS() \ + x(any) \ + x(copygc) \ + x(reserved) + +enum journal_watermark { +#define x(n) JOURNAL_WATERMARK_##n, + JOURNAL_WATERMARKS() +#undef x +}; + +#define JOURNAL_WATERMARK_MASK 3 + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ struct { union journal_res_state reservations; + enum journal_watermark watermark; union journal_preres_state prereserved; @@ -173,15 +202,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum { - cur_entry_ok, - cur_entry_blocked, - cur_entry_max_in_flight, - cur_entry_journal_full, - cur_entry_journal_pin_full, - cur_entry_journal_stuck, - cur_entry_insufficient_devices, - } cur_entry_error; + enum journal_errors cur_entry_error; unsigned buf_size_want; /* diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index b43e54133b15..a54a83d3247b 100644 --- a/fs/bcachefs/movinggc.c +++ 
b/fs/bcachefs/movinggc.c @@ -91,7 +91,7 @@ static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, data_opts->target = io_opts->background_target; data_opts->nr_replicas = 1; data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED; + JOURNAL_WATERMARK_copygc; data_opts->rewrite_dev = p.ptr.dev; if (p.has_ec) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 887971559214..93882e6a2ae4 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -562,8 +562,9 @@ static int bch2_journal_replay(struct bch_fs *c) ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW| BTREE_INSERT_NOFAIL| - BTREE_INSERT_JOURNAL_RESERVED| - (!k->allocated ? BTREE_INSERT_JOURNAL_REPLAY : 0), + (!k->allocated + ? BTREE_INSERT_JOURNAL_REPLAY|JOURNAL_WATERMARK_reserved + : 0), bch2_journal_replay_key(&trans, k)); if (ret) { bch_err(c, "journal replay: error %d while replaying key at btree %s level %u", |