author     Kent Overstreet <kent.overstreet@gmail.com>   2019-10-05 12:54:53 -0400
committer  Kent Overstreet <kent.overstreet@linux.dev>   2023-10-22 17:08:41 -0400
commit     5d20ba48f00050d8e6498cfbbb93b2914bd97114
tree       35f462b72e5f47e70c81996b2f92a3798e0b17c7 /fs/bcachefs
parent     2ca88e5ad9b29624ea1467ef7fcc583c928fd783
bcachefs: Use cached iterators for alloc btree
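
Updates to the alloc btree now go through the btree key cache
(BTREE_ITER_CACHED), and journal replay is reordered to match: alloc
keys are replayed first and touch only the key cache, then interior
btree node updates, then journal reclaim is allowed to start (it
flushes key cache entries back to the btree), and finally the
remaining leaf node updates. In outline - this is a simplified sketch
of the new bch2_journal_replay() below, with the replay_now_at()
sequence bookkeeping and error reporting trimmed; "replay_outline" is
a made-up name, not a function added by this patch:

	static int replay_outline(struct bch_fs *c, struct journal_keys keys)
	{
		struct journal_key *i;
		int ret;

		/* 1) Alloc keys: these only update the btree key cache */
		for_each_journal_key(keys, i)
			if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
				ret = bch2_alloc_replay_key(c, i->k);
				if (ret)
					return ret;
			}

		/* 2) Interior btree node updates */
		for_each_journal_key(keys, i)
			if (i->level) {
				ret = bch2_journal_replay_key(c, i->btree_id,
							      i->level, i->k);
				if (ret)
					return ret;
			}

		/* 3) Btree now consistent: journal reclaim may run */
		set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
		set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);

		/* 4) Remaining leaf node updates */
		for_each_journal_key(keys, i)
			if (!i->level && i->btree_id != BTREE_ID_ALLOC) {
				ret = i->k->k.size
					? bch2_extent_replay_key(c, i->btree_id, i->k)
					: bch2_journal_replay_key(c, i->btree_id,
								  i->level, i->k);
				if (ret)
					return ret;
			}
		return 0;
	}
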
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs/bcachefs')
-rw-r--r--   fs/bcachefs/alloc_background.c  |  51
-rw-r--r--   fs/bcachefs/alloc_background.h  |   1
-rw-r--r--   fs/bcachefs/btree_update_leaf.c |   3
-rw-r--r--   fs/bcachefs/buckets.c           |  85
-rw-r--r--   fs/bcachefs/journal_reclaim.c   |   3
-rw-r--r--   fs/bcachefs/journal_types.h     |   1
-rw-r--r--   fs/bcachefs/recovery.c          | 117
-rw-r--r--   fs/bcachefs/super.c             |  30
-rw-r--r--   fs/bcachefs/sysfs.c             |   8
9 files changed, 184 insertions(+), 115 deletions(-)
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 44ad9821c807..678218ca0feb 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -4,6 +4,7 @@
 #include "alloc_foreground.h"
 #include "btree_cache.h"
 #include "btree_io.h"
+#include "btree_key_cache.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_gc.h"
@@ -276,6 +277,13 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	struct bkey_i_alloc *a;
 	int ret;
 retry:
+	bch2_trans_begin(trans);
+
+	ret = bch2_btree_key_cache_flush(trans,
+			BTREE_ID_ALLOC, iter->pos);
+	if (ret)
+		goto err;
+
 	k = bch2_btree_iter_peek_slot(iter);
 	ret = bkey_err(k);
 	if (ret)
@@ -330,7 +338,7 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
 
 	BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8);
 
-	bch2_trans_init(&trans, c, 0, 0);
+	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN,
 				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
@@ -364,25 +372,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote)
 	return ret < 0 ? ret : 0;
 }
 
-int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
-{
-	struct btree_trans trans;
-	struct btree_iter *iter;
-	int ret;
-
-	bch2_trans_init(&trans, c, 0, 0);
-
-	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, k->k.p,
-				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
-
-	ret = bch2_alloc_write_key(&trans, iter,
-				   BTREE_INSERT_NOFAIL|
-				   BTREE_INSERT_LAZY_RW|
-				   BTREE_INSERT_JOURNAL_REPLAY);
-	bch2_trans_exit(&trans);
-	return ret < 0 ? ret : 0;
-}
-
 /* Bucket IO clocks: */
 
 static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw)
@@ -840,7 +829,6 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 	struct bkey_alloc_unpacked u;
 	struct bucket *g;
 	struct bucket_mark m;
-	struct bkey_s_c k;
 	bool invalidating_cached_data;
 	size_t b;
 	int ret = 0;
@@ -892,27 +880,14 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans,
 	bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b));
 retry:
-	k = bch2_btree_iter_peek_slot(iter);
-	ret = bkey_err(k);
+	ret = bch2_btree_iter_traverse(iter);
 	if (ret)
 		return ret;
 
 	percpu_down_read(&c->mark_lock);
 	g = bucket(ca, iter->pos.offset);
 	m = READ_ONCE(g->mark);
-
-	if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags))) {
-		/*
-		 * During journal replay, and if gc repairs alloc info at
-		 * runtime, the alloc info in the btree might not be up to date
-		 * yet - so, trust the in memory mark:
-		 */
-		u = alloc_mem_to_key(g, m);
-	} else {
-		u = bch2_alloc_unpack(k);
-		u.read_time = g->io_time[READ];
-		u.write_time = g->io_time[WRITE];
-	}
+	u = alloc_mem_to_key(g, m);
 
 	percpu_up_read(&c->mark_lock);
@@ -1000,7 +975,9 @@ static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca)
 
 	iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC,
 				   POS(ca->dev_idx, 0),
-				   BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+				   BTREE_ITER_CACHED|
+				   BTREE_ITER_CACHED_NOFILL|
+				   BTREE_ITER_INTENT);
 
 	/* Only use nowait if we've already invalidated at least one bucket: */
 	while (!ret &&
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index b53a27450889..f6b9f27f0713 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -54,7 +54,6 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
 
 struct journal_keys;
 int bch2_alloc_read(struct bch_fs *, struct journal_keys *);
-int bch2_alloc_replay_key(struct bch_fs *, struct bkey_i *);
 
 static inline void bch2_wake_allocator(struct bch_dev *ca)
 {
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 2d0f101a6303..30839ccbf517 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -178,6 +178,9 @@ static bool btree_insert_key_leaf(struct btree_trans *trans,
 	int old_live_u64s = b->nr.live_u64s;
 	int live_u64s_added, u64s_added;
 
+	EBUG_ON(!iter->level &&
+		!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags));
+
 	if (unlikely(!bch2_btree_bset_insert_key(iter, b,
 					&iter_l(iter)->iter, insert)))
 		return false;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 36c62888f80a..1683833568a7 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1455,13 +1455,11 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans,
 
 /* trans_mark: */
 
-static int trans_get_key(struct btree_trans *trans,
-			 enum btree_id btree_id, struct bpos pos,
-			 struct btree_iter **iter,
-			 struct bkey_s_c *k)
+static struct btree_iter *trans_get_update(struct btree_trans *trans,
+			enum btree_id btree_id, struct bpos pos,
+			struct bkey_s_c *k)
 {
 	struct btree_insert_entry *i;
-	int ret;
 
 	trans_for_each_update(trans, i)
 		if (i->iter->btree_id == btree_id &&
@@ -1469,17 +1467,33 @@ static int trans_get_key(struct btree_trans *trans,
 		     ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 &&
 		       bkey_cmp(pos, i->k->k.p) < 0
 		     : !bkey_cmp(pos, i->iter->pos))) {
-			*iter = i->iter;
-			*k = bkey_i_to_s_c(i->k);
-			return 1;
+			*k = bkey_i_to_s_c(i->k);
+			return i->iter;
 		}
 
+	return NULL;
+}
+
+static int trans_get_key(struct btree_trans *trans,
+			 enum btree_id btree_id, struct bpos pos,
+			 struct btree_iter **iter,
+			 struct bkey_s_c *k)
+{
+	unsigned flags = btree_id != BTREE_ID_ALLOC
+		? BTREE_ITER_SLOTS
+		: BTREE_ITER_CACHED;
+	int ret;
+
+	*iter = trans_get_update(trans, btree_id, pos, k);
+	if (*iter)
+		return 1;
+
 	*iter = bch2_trans_get_iter(trans, btree_id, pos,
-				    BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
+				    flags|BTREE_ITER_INTENT);
 	if (IS_ERR(*iter))
 		return PTR_ERR(*iter);
 
-	*k = bch2_btree_iter_peek_slot(*iter);
+	*k = __bch2_btree_iter_peek(*iter, flags);
 	ret = bkey_err(*k);
 	if (ret)
 		bch2_trans_iter_put(trans, *iter);
@@ -1492,45 +1506,34 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+	struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr));
 	struct btree_iter *iter;
 	struct bkey_s_c k_a;
 	struct bkey_alloc_unpacked u;
 	struct bkey_i_alloc *a;
 	struct bucket *g;
-	struct bucket_mark m;
 	int ret;
 
-	ret = trans_get_key(trans, BTREE_ID_ALLOC,
-			    POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)),
-			    &iter, &k_a);
-	if (ret < 0)
-		return ret;
-
-	percpu_down_read(&c->mark_lock);
-	g = bucket(ca, iter->pos.offset);
-	m = READ_ONCE(g->mark);
-
-	if (unlikely(!test_bit(BCH_FS_ALLOC_WRITTEN, &c->flags) && !ret)) {
-		/*
-		 * During journal replay, and if gc repairs alloc info at
-		 * runtime, the alloc info in the btree might not be up to date
-		 * yet - so, trust the in memory mark - unless we're already
-		 * updating that key:
-		 */
-		u = alloc_mem_to_key(g, m);
+	iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a);
+	if (iter) {
+		u = bch2_alloc_unpack(k_a);
 	} else {
-		u = bch2_alloc_unpack(k_a);
-		u.read_time = g->io_time[READ];
-		u.write_time = g->io_time[WRITE];
-	}
-
-	percpu_up_read(&c->mark_lock);
+		iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos,
+					   BTREE_ITER_CACHED|
+					   BTREE_ITER_CACHED_NOFILL|
+					   BTREE_ITER_INTENT);
+		if (IS_ERR(iter))
+			return PTR_ERR(iter);
+
+		ret = bch2_btree_iter_traverse(iter);
+		if (ret)
+			goto out;
 
-	/*
-	 * Incrementing the bucket gen can be done lazily:
-	 */
-	if (gen_after(m.gen, u.gen) && !u.data_type)
-		u.gen = m.gen;
+		percpu_down_read(&c->mark_lock);
+		g = bucket(ca, pos.offset);
+		u = alloc_mem_to_key(g, READ_ONCE(g->mark));
+		percpu_up_read(&c->mark_lock);
+	}
 
 	ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type,
 			     &u.dirty_sectors, &u.cached_sectors);
@@ -1543,7 +1546,7 @@ static int bch2_trans_mark_pointer(struct btree_trans *trans,
 		goto out;
 
 	bkey_alloc_init(&a->k_i);
-	a->k.p = iter->pos;
+	a->k.p = pos;
 	bch2_alloc_pack(a, u);
 	bch2_trans_update(trans, iter, &a->k_i, 0);
 out:
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index 62e322e959d3..1162acffdf45 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -416,6 +416,9 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *ret = NULL;
 
+	if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+		return NULL;
+
 	spin_lock(&j->lock);
 
 	fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 0585e9b6e230..6d0ee8e42da1 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -125,6 +125,7 @@ union journal_preres_state {
 enum {
 	JOURNAL_REPLAY_DONE,
 	JOURNAL_STARTED,
+	JOURNAL_RECLAIM_STARTED,
 	JOURNAL_NEED_WRITE,
 	JOURNAL_NOT_EMPTY,
 	JOURNAL_MAY_GET_UNRESERVED,
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 26e5767aa5de..41b864dcdc39 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -292,17 +292,6 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 		cmp_int(l->journal_offset, r->journal_offset);
 }
 
-static int journal_sort_seq_cmp(const void *_l, const void *_r)
-{
-	const struct journal_key *l = _l;
-	const struct journal_key *r = _r;
-
-	return  cmp_int(r->level,	l->level) ?:
-		cmp_int(l->journal_seq, r->journal_seq) ?:
-		cmp_int(l->btree_id,	r->btree_id) ?:
-		bkey_cmp(l->k->k.p, r->k->k.p);
-}
-
 void bch2_journal_keys_free(struct journal_keys *keys)
 {
 	kvfree(keys->d);
@@ -518,11 +507,48 @@ static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id,
 			     __bch2_journal_replay_key(&trans, id, level, k));
 }
 
+static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k)
+{
+	struct btree_iter *iter;
+	int ret;
+
+	iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
+				   BTREE_ITER_CACHED|
+				   BTREE_ITER_CACHED_NOFILL|
+				   BTREE_ITER_INTENT);
+	ret =   PTR_ERR_OR_ZERO(iter) ?:
+		bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN);
+	bch2_trans_iter_put(trans, iter);
+	return ret;
+}
+
+static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k)
+{
+	return bch2_trans_do(c, NULL, NULL,
+			     BTREE_INSERT_NOFAIL|
+			     BTREE_INSERT_USE_RESERVE|
+			     BTREE_INSERT_LAZY_RW|
+			     BTREE_INSERT_JOURNAL_REPLAY,
+			__bch2_alloc_replay_key(&trans, k));
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *l = _l;
+	const struct journal_key *r = _r;
+
+	return  cmp_int(r->level,	l->level) ?:
+		cmp_int(l->journal_seq, r->journal_seq) ?:
+		cmp_int(l->btree_id,	r->btree_id) ?:
+		bkey_cmp(l->k->k.p, r->k->k.p);
+}
+
 static int bch2_journal_replay(struct bch_fs *c,
 			       struct journal_keys keys)
 {
 	struct journal *j = &c->journal;
 	struct journal_key *i;
+	u64 seq;
 	int ret;
 
 	sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL);
@@ -530,26 +556,63 @@ static int bch2_journal_replay(struct bch_fs *c,
 	if (keys.nr)
 		replay_now_at(j, keys.journal_seq_base);
 
+	seq = j->replay_journal_seq;
+
+	/*
+	 * First replay updates to the alloc btree - these will only update the
+	 * btree key cache:
+	 */
 	for_each_journal_key(keys, i) {
-		if (!i->level)
-			replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+		cond_resched();
 
-		if (i->level)
-			ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
-		if (i->btree_id == BTREE_ID_ALLOC)
+		if (!i->level && i->btree_id == BTREE_ID_ALLOC) {
+			j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
 			ret = bch2_alloc_replay_key(c, i->k);
-		else if (i->k->k.size)
-			ret = bch2_extent_replay_key(c, i->btree_id, i->k);
-		else
-			ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+			if (ret)
+				goto err;
+		}
+	}
 
-		if (ret) {
-			bch_err(c, "journal replay: error %d while replaying key",
-				ret);
-			return ret;
+	/*
+	 * Next replay updates to interior btree nodes:
+	 */
+	for_each_journal_key(keys, i) {
+		cond_resched();
+
+		if (i->level) {
+			j->replay_journal_seq = keys.journal_seq_base + i->journal_seq;
+			ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+			if (ret)
+				goto err;
 		}
+	}
 
+	/*
+	 * Now that the btree is in a consistent state, we can start journal
+	 * reclaim (which will be flushing entries from the btree key cache back
+	 * to the btree:
+	 */
+	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
+	set_bit(JOURNAL_RECLAIM_STARTED, &j->flags);
+
+	j->replay_journal_seq = seq;
+
+	/*
+	 * Now replay leaf node updates:
+	 */
+	for_each_journal_key(keys, i) {
 		cond_resched();
+
+		if (i->level || i->btree_id == BTREE_ID_ALLOC)
+			continue;
+
+		replay_now_at(j, keys.journal_seq_base + i->journal_seq);
+
+		ret = i->k->k.size
+			? bch2_extent_replay_key(c, i->btree_id, i->k)
+			: bch2_journal_replay_key(c, i->btree_id, i->level, i->k);
+		if (ret)
+			goto err;
 	}
 
 	replay_now_at(j, j->replay_journal_seq_end);
@@ -558,6 +621,9 @@ static int bch2_journal_replay(struct bch_fs *c,
 	bch2_journal_set_replay_done(j);
 	bch2_journal_flush_all_pins(j);
 	return bch2_journal_error(j);
+err:
+	bch_err(c, "journal replay: error %d while replaying key", ret);
+	return ret;
 }
 
 static bool journal_empty(struct list_head *journal)
@@ -1183,6 +1249,9 @@ int bch2_fs_initialize(struct bch_fs *c)
 	for (i = 0; i < BTREE_ID_NR; i++)
 		bch2_btree_root_alloc(c, i);
 
+	set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags);
+	set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags);
+
 	err = "unable to allocate journal buckets";
 	for_each_online_member(ca, c, i) {
 		ret = bch2_dev_journal_alloc(ca);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index a680bf8d95f1..9bc470e68cc9 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -1389,6 +1389,31 @@ int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca,
 
 /* Device add/removal: */
 
+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
+{
+	struct btree_trans trans;
+	size_t i;
+	int ret;
+
+	bch2_trans_init(&trans, c, 0, 0);
+
+	for (i = 0; i < ca->mi.nbuckets; i++) {
+		ret = bch2_btree_key_cache_flush(&trans,
+				BTREE_ID_ALLOC, POS(ca->dev_idx, i));
+		if (ret)
+			break;
+	}
+	bch2_trans_exit(&trans);
+
+	if (ret)
+		return ret;
+
+	return bch2_btree_delete_range(c, BTREE_ID_ALLOC,
+				       POS(ca->dev_idx, 0),
+				       POS(ca->dev_idx + 1, 0),
+				       NULL);
+}
+
 int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 {
 	struct bch_sb_field_members *mi;
@@ -1422,10 +1447,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 		goto err;
 	}
 
-	ret = bch2_btree_delete_range(c, BTREE_ID_ALLOC,
-				      POS(ca->dev_idx, 0),
-				      POS(ca->dev_idx + 1, 0),
-				      NULL);
+	ret = bch2_dev_remove_alloc(c, ca);
 	if (ret) {
 		bch_err(ca, "Remove failed, error deleting alloc info");
 		goto err;
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 1800e0f7f81e..bda9eb1598b8 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -134,7 +134,6 @@ do {									\
 write_attribute(trigger_journal_flush);
 write_attribute(trigger_btree_coalesce);
 write_attribute(trigger_gc);
-write_attribute(trigger_alloc_write);
 write_attribute(prune_cache);
 
 rw_attribute(btree_gc_periodic);
@@ -498,12 +497,6 @@ STORE(bch2_fs)
 #endif
 	}
 
-	if (attr == &sysfs_trigger_alloc_write) {
-		bool wrote;
-
-		bch2_alloc_write(c, 0, &wrote);
-	}
-
 	if (attr == &sysfs_prune_cache) {
 		struct shrink_control sc;
 
@@ -587,7 +580,6 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_trigger_journal_flush,
 	&sysfs_trigger_btree_coalesce,
 	&sysfs_trigger_gc,
-	&sysfs_trigger_alloc_write,
 	&sysfs_prune_cache,
 
 	&sysfs_copy_gc_enabled,
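
Note on the new update pattern: with BTREE_ITER_CACHED, alloc keys are
read and written through the btree key cache rather than through btree
leaf iterators, and BTREE_ITER_CACHED_NOFILL appears to skip filling
the cache entry from the btree (the callers above construct the key
from the in-memory bucket mark via alloc_mem_to_key() instead). A
minimal sketch of the cached update pattern, mirroring
__bch2_alloc_replay_key() above; the helper name "alloc_update_cached"
is made up for illustration:

	static int alloc_update_cached(struct btree_trans *trans,
				       struct bkey_i *k)
	{
		struct btree_iter *iter;
		int ret;

		/* operate on the key cache, not a btree leaf */
		iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p,
					   BTREE_ITER_CACHED|
					   BTREE_ITER_INTENT);
		ret = PTR_ERR_OR_ZERO(iter) ?:
			bch2_trans_update(trans, iter, k, 0);
		bch2_trans_iter_put(trans, iter);
		return ret;
	}

Journal reclaim later flushes cached entries back to the btree (per the
comment in bch2_journal_replay() above); bch2_dev_remove_alloc() forces
the same kind of flush explicitly via bch2_btree_key_cache_flush()
before deleting a device's alloc keys.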