summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@gmail.com>2021-12-25 20:07:00 -0500
committerKent Overstreet <kent.overstreet@linux.dev>2023-10-22 17:09:21 -0400
commit5222a4607cd8b9d8882e81796917c10193d10be0 (patch)
treeb68fb2d883efdb75eaf1376c9f2a0baf30fb30fc /fs
parentf28620c108a9476c7b4b25b8e36b94b6b2b29295 (diff)
downloadlwn-5222a4607cd8b9d8882e81796917c10193d10be0.tar.gz
lwn-5222a4607cd8b9d8882e81796917c10193d10be0.zip
bcachefs: BTREE_ITER_WITH_JOURNAL
This adds a new btree iterator flag, BTREE_ITER_WITH_JOURNAL, that is automatically enabled when initializing a btree iterator before journal replay has completed - it overlays the contents of the journal on top of the btree.

This lets us delete bch2_btree_and_journal_walk() and just use the normal btree iterator interface instead - which also lets us delete a significant amount of duplicated code.

Note that BTREE_ITER_WITH_JOURNAL is still unoptimized in this patch - we're redoing the binary search over keys in the journal every time we call bch2_btree_iter_peek().

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Diffstat (limited to 'fs')
-rw-r--r--fs/bcachefs/alloc_background.c60
-rw-r--r--fs/bcachefs/bcachefs.h1
-rw-r--r--fs/bcachefs/btree_gc.c185
-rw-r--r--fs/bcachefs/btree_iter.c194
-rw-r--r--fs/bcachefs/btree_types.h10
-rw-r--r--fs/bcachefs/btree_update_interior.c4
-rw-r--r--fs/bcachefs/btree_update_leaf.c2
-rw-r--r--fs/bcachefs/ec.c60
-rw-r--r--fs/bcachefs/recovery.c158
-rw-r--r--fs/bcachefs/recovery.h10
10 files changed, 331 insertions, 353 deletions
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 30bf363d2ff3..cb4b059e796c 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -340,46 +340,46 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c,
#undef x
}
-static int bch2_alloc_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_alloc_read(struct bch_fs *c)
{
- struct bch_fs *c = trans->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct bch_dev *ca;
struct bucket *g;
struct bkey_alloc_unpacked u;
-
- if (!bkey_is_alloc(k.k))
- return 0;
-
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = bucket(ca, k.k->p.offset);
- u = bch2_alloc_unpack(k);
-
- *bucket_gen(ca, k.k->p.offset) = u.gen;
- g->_mark.gen = u.gen;
- g->_mark.data_type = u.data_type;
- g->_mark.dirty_sectors = u.dirty_sectors;
- g->_mark.cached_sectors = u.cached_sectors;
- g->_mark.stripe = u.stripe != 0;
- g->stripe = u.stripe;
- g->stripe_redundancy = u.stripe_redundancy;
- g->io_time[READ] = u.read_time;
- g->io_time[WRITE] = u.write_time;
- g->oldest_gen = u.oldest_gen;
- g->gen_valid = 1;
-
- return 0;
-}
-
-int bch2_alloc_read(struct bch_fs *c)
-{
- struct btree_trans trans;
int ret;
bch2_trans_init(&trans, c, 0, 0);
down_read(&c->gc_lock);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_alloc, bch2_alloc_read_fn);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (!bkey_is_alloc(k.k))
+ continue;
+
+ ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ g = bucket(ca, k.k->p.offset);
+ u = bch2_alloc_unpack(k);
+
+ *bucket_gen(ca, k.k->p.offset) = u.gen;
+ g->_mark.gen = u.gen;
+ g->_mark.data_type = u.data_type;
+ g->_mark.dirty_sectors = u.dirty_sectors;
+ g->_mark.cached_sectors = u.cached_sectors;
+ g->_mark.stripe = u.stripe != 0;
+ g->stripe = u.stripe;
+ g->stripe_redundancy = u.stripe_redundancy;
+ g->io_time[READ] = u.read_time;
+ g->io_time[WRITE] = u.write_time;
+ g->oldest_gen = u.oldest_gen;
+ g->gen_valid = 1;
+ }
+ bch2_trans_iter_exit(&trans, &iter);
+
up_read(&c->gc_lock);
bch2_trans_exit(&trans);
+
if (ret) {
bch_err(c, "error reading alloc info: %i", ret);
return ret;
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 9452b6cf04a5..431cf25b38db 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -860,7 +860,6 @@ mempool_t bio_bounce_pages;
u64 reflink_hint;
reflink_gc_table reflink_gc_table;
size_t reflink_gc_nr;
- size_t reflink_gc_idx;
/* VFS IO PATH - fs-io.c */
struct bio_set writepage_bioset;
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 77c30157792b..d7de00af81c9 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -1342,59 +1342,6 @@ static int bch2_gc_start(struct bch_fs *c,
return 0;
}
-static int bch2_gc_reflink_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
- char buf[200];
- int ret = 0;
-
- if (!refcount)
- return 0;
-
- r = genradix_ptr(&c->reflink_gc_table, c->reflink_gc_idx++);
- if (!r)
- return -ENOMEM;
-
- if (!r ||
- r->offset != k.k->p.offset ||
- r->size != k.k->size) {
- bch_err(c, "unexpected inconsistency walking reflink table at gc finish");
- return -EINVAL;
- }
-
- if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c,
- "reflink key has wrong refcount:\n"
- " %s\n"
- " should be %u",
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- r->refcount)) {
- struct bkey_i *new;
-
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto fsck_err;
- }
-
- bkey_reassemble(new, k);
-
- if (!r->refcount) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- } else {
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
- }
-
- ret = bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new);
- kfree(new);
- }
-fsck_err:
- return ret;
-}
-
static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bool metadata_only)
{
@@ -1411,14 +1358,6 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0);
- if (initial) {
- c->reflink_gc_idx = 0;
-
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_done_initial_fn);
- goto out;
- }
-
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
@@ -1426,7 +1365,7 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
if (!refcount)
continue;
- r = genradix_ptr(&c->reflink_gc_table, idx);
+ r = genradix_ptr(&c->reflink_gc_table, idx++);
if (!r ||
r->offset != k.k->p.offset ||
r->size != k.k->size) {
@@ -1456,7 +1395,9 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
else
*bkey_refcount(new) = cpu_to_le64(r->refcount);
- ret = __bch2_trans_do(&trans, NULL, NULL, 0,
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_reflink, 0, new) /* key belongs to the reflink btree being walked */
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
__bch2_btree_insert(&trans, BTREE_ID_reflink, new));
kfree(new);
@@ -1466,104 +1407,74 @@ static int bch2_gc_reflink_done(struct bch_fs *c, bool initial,
}
fsck_err:
bch2_trans_iter_exit(&trans, &iter);
-out:
c->reflink_gc_nr = 0;
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_stripes_done_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
+static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
+ bool metadata_only)
{
- struct bch_fs *c = trans->c;
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
struct gc_stripe *m;
const struct bch_stripe *s;
char buf[200];
unsigned i;
int ret = 0;
- if (k.k->type != KEY_TYPE_stripe)
+ if (metadata_only)
return 0;
- s = bkey_s_c_to_stripe(k).v;
+ bch2_trans_init(&trans, c, 0, 0);
+
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
+ s = bkey_s_c_to_stripe(k).v;
+ m = genradix_ptr(&c->gc_stripes, k.k->p.offset);
- for (i = 0; i < s->nr_blocks; i++)
- if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
- goto inconsistent;
- return 0;
+ for (i = 0; i < s->nr_blocks; i++)
+ if (stripe_blockcount_get(s, i) != (m ? m->block_sectors[i] : 0))
+ goto inconsistent;
+ continue;
inconsistent:
- if (fsck_err_on(true, c,
- "stripe has wrong block sector count %u:\n"
- " %s\n"
- " should be %u", i,
- (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
- m ? m->block_sectors[i] : 0)) {
- struct bkey_i_stripe *new;
+ if (fsck_err_on(true, c,
+ "stripe has wrong block sector count %u:\n"
+ " %s\n"
+ " should be %u", i,
+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf),
+ m ? m->block_sectors[i] : 0)) {
+ struct bkey_i_stripe *new;
- new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
- if (!new) {
- ret = -ENOMEM;
- goto fsck_err;
- }
+ new = kmalloc(bkey_bytes(k.k), GFP_KERNEL);
+ if (!new) {
+ ret = -ENOMEM;
+ break;
+ }
- bkey_reassemble(&new->k_i, k);
+ bkey_reassemble(&new->k_i, k);
- for (i = 0; i < new->v.nr_blocks; i++)
- stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
+ for (i = 0; i < new->v.nr_blocks; i++)
+ stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0);
- ret = bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i);
- kfree(new);
+ ret = initial
+ ? bch2_journal_key_insert(c, BTREE_ID_stripes, 0, &new->k_i)
+ : __bch2_trans_do(&trans, NULL, NULL, 0,
+ __bch2_btree_insert(&trans, BTREE_ID_stripes, &new->k_i)); /* stripe key: same btree as the iterator */
+ kfree(new);
+ }
}
fsck_err:
- return ret;
-}
-
-static int bch2_gc_stripes_done(struct bch_fs *c, bool initial,
- bool metadata_only)
-{
- struct btree_trans trans;
- int ret = 0;
-
- if (metadata_only)
- return 0;
-
- bch2_trans_init(&trans, c, 0, 0);
-
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_gc_stripes_done_initial_fn);
- } else {
- BUG();
- }
+ bch2_trans_iter_exit(&trans, &iter);
bch2_trans_exit(&trans);
return ret;
}
-static int bch2_gc_reflink_start_initial_fn(struct btree_trans *trans,
- struct bkey_s_c k)
-{
-
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- const __le64 *refcount = bkey_refcount_c(k);
-
- if (!refcount)
- return 0;
-
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r)
- return -ENOMEM;
-
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- return 0;
-}
-
static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bool metadata_only)
{
@@ -1579,12 +1490,6 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
bch2_trans_init(&trans, c, 0, 0);
c->reflink_gc_nr = 0;
- if (initial) {
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_reflink,
- bch2_gc_reflink_start_initial_fn);
- goto out;
- }
-
for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN,
BTREE_ITER_PREFETCH, k, ret) {
const __le64 *refcount = bkey_refcount_c(k);
@@ -1604,7 +1509,7 @@ static int bch2_gc_reflink_start(struct bch_fs *c, bool initial,
r->refcount = 0;
}
bch2_trans_iter_exit(&trans, &iter);
-out:
+
bch2_trans_exit(&trans);
return ret;
}
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 0b5bf75fbf89..01c130a3ce8d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "recovery.h"
#include "replicas.h"
#include "subvolume.h"
#include "trace.h"
@@ -1064,6 +1065,7 @@ static inline bool btree_path_advance_to_pos(struct btree_path *path,
static void btree_path_verify_new_node(struct btree_trans *trans,
struct btree_path *path, struct btree *b)
{
+ struct bch_fs *c = trans->c;
struct btree_path_level *l;
unsigned plevel;
bool parent_locked;
@@ -1072,6 +1074,9 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
return;
+ if (trans->journal_replay_not_finished)
+ return;
+
plevel = b->c.level + 1;
if (!btree_path_node(path, plevel))
return;
@@ -1092,7 +1097,7 @@ static void btree_path_verify_new_node(struct btree_trans *trans,
char buf4[100];
struct bkey uk = bkey_unpack_key(b, k);
- bch2_dump_btree_node(trans->c, l->b);
+ bch2_dump_btree_node(c, l->b);
bch2_bpos_to_text(&PBUF(buf1), path->pos);
bch2_bkey_to_text(&PBUF(buf2), &uk);
bch2_bpos_to_text(&PBUF(buf3), b->data->min_key);
@@ -1283,6 +1288,41 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
return ret;
}
+static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path,
+ struct btree_and_journal_iter *jiter)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ struct bkey_buf tmp;
+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) /* prefetch budget: max child nodes to request */
+ ? (path->level > 1 ? 0 : 2)
+ : (path->level > 1 ? 1 : 16);
+ bool was_locked = btree_node_locked(path, path->level);
+ int ret = 0;
+
+ bch2_bkey_buf_init(&tmp);
+
+ while (nr-- && !ret) { /* consume the budget so at most nr children are prefetched */
+ if (!bch2_btree_node_relock(trans, path, path->level))
+ break;
+
+ bch2_btree_and_journal_iter_advance(jiter);
+ k = bch2_btree_and_journal_iter_peek(jiter);
+ if (!k.k)
+ break;
+
+ bch2_bkey_buf_reassemble(&tmp, c, k);
+ ret = bch2_btree_node_prefetch(c, trans, path, tmp.k, path->btree_id,
+ path->level - 1);
+ }
+
+ if (!was_locked) /* restore the lock state the caller had on entry */
+ btree_node_unlock(path, path->level);
+
+ bch2_bkey_buf_exit(&tmp, c);
+ return ret;
+}
+
static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
struct btree_path *path,
unsigned plevel, struct btree *b)
@@ -1305,6 +1345,30 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans,
btree_node_unlock(path, plevel);
}
+static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
+ struct btree_path *path,
+ unsigned flags,
+ struct bkey_buf *out)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_path_level *l = path_l(path);
+ struct btree_and_journal_iter jiter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+
+ k = bch2_btree_and_journal_iter_peek(&jiter);
+
+ bch2_bkey_buf_reassemble(out, c, k);
+
+ if (flags & BTREE_ITER_PREFETCH)
+ ret = btree_path_prefetch_j(trans, path, &jiter);
+
+ bch2_btree_and_journal_iter_exit(&jiter);
+ return ret;
+}
+
static __always_inline int btree_path_down(struct btree_trans *trans,
struct btree_path *path,
unsigned flags,
@@ -1321,8 +1385,21 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
EBUG_ON(!btree_node_locked(path, path->level));
bch2_bkey_buf_init(&tmp);
- bch2_bkey_buf_unpack(&tmp, c, l->b,
- bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (unlikely(trans->journal_replay_not_finished)) {
+ ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp);
+ if (ret)
+ goto err;
+ } else {
+ bch2_bkey_buf_unpack(&tmp, c, l->b,
+ bch2_btree_node_iter_peek(&l->iter, l->b));
+
+ if (flags & BTREE_ITER_PREFETCH) {
+ ret = btree_path_prefetch(trans, path);
+ if (ret)
+ goto err;
+ }
+ }
b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip);
ret = PTR_ERR_OR_ZERO(b);
@@ -1332,13 +1409,11 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
mark_btree_node_locked(path, level, lock_type);
btree_path_level_init(trans, path, b);
- if (tmp.k->k.type == KEY_TYPE_btree_ptr_v2 &&
+ if (likely(!trans->journal_replay_not_finished &&
+ tmp.k->k.type == KEY_TYPE_btree_ptr_v2) &&
unlikely(b != btree_node_mem_ptr(tmp.k)))
btree_node_mem_ptr_set(trans, path, level + 1, b);
- if (flags & BTREE_ITER_PREFETCH)
- ret = btree_path_prefetch(trans, path);
-
if (btree_node_read_locked(path, level + 1))
btree_node_unlock(path, level + 1);
path->level = level;
@@ -2113,6 +2188,55 @@ struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
return ret;
}
+static struct bkey_i *__btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_path *path)
+{
+ struct journal_keys *keys = &trans->c->journal_keys;
+ size_t idx = bch2_journal_key_search(keys, path->btree_id,
+ path->level, path->pos);
+
+ while (idx < keys->nr && keys->d[idx].overwritten)
+ idx++;
+
+ return (idx < keys->nr &&
+ keys->d[idx].btree_id == path->btree_id &&
+ keys->d[idx].level == path->level)
+ ? keys->d[idx].k
+ : NULL;
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
+ struct btree_iter *iter)
+{
+ struct bkey_i *k = __btree_trans_peek_journal(trans, iter->path);
+
+ if (k && !bpos_cmp(k->k.p, iter->pos)) {
+ iter->k = k->k;
+ return bkey_i_to_s_c(k);
+ } else {
+ return bkey_s_c_null;
+ }
+}
+
+static noinline
+struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct bkey_i *next_journal =
+ __btree_trans_peek_journal(trans, iter->path);
+
+ if (next_journal &&
+ bpos_cmp(next_journal->k.p,
+ k.k ? k.k->p : iter->path->l[0].b->key.k.p) <= 0) {
+ iter->k = next_journal->k;
+ k = bkey_i_to_s_c(next_journal);
+ }
+
+ return k;
+}
+
/**
* bch2_btree_iter_peek: returns first key greater than or equal to iterator's
* current position
@@ -2141,16 +2265,12 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
goto out;
}
- next_update = btree_trans_peek_updates(iter);
k = btree_path_level_peek_all(trans->c, &iter->path->l[0], &iter->k);
- /* * In the btree, deleted keys sort before non deleted: */
- if (k.k && bkey_deleted(k.k) &&
- (!next_update ||
- bpos_cmp(k.k->p, next_update->k.p) <= 0)) {
- search_key = k.k->p;
- continue;
- }
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
+ k = btree_trans_peek_journal(trans, iter, k);
+
+ next_update = btree_trans_peek_updates(iter);
if (next_update &&
bpos_cmp(next_update->k.p,
@@ -2159,6 +2279,20 @@ struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
k = bkey_i_to_s_c(next_update);
}
+ if (k.k && bkey_deleted(k.k)) {
+ /*
+ * If we've got a whiteout, and it's after the search
+ * key, advance the search key to the whiteout instead
+ * of just after the whiteout - it might be a btree
+ * whiteout, with a real key at the same position, since
+ * in the btree deleted keys sort before non deleted.
+ */
+ search_key = bpos_cmp(search_key, k.k->p)
+ ? k.k->p
+ : bpos_successor(k.k->p);
+ continue;
+ }
+
if (likely(k.k)) {
/*
* We can never have a key in a leaf node at POS_MAX, so
@@ -2249,6 +2383,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
EBUG_ON(iter->path->cached || iter->path->level);
EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+
+ if (iter->flags & BTREE_ITER_WITH_JOURNAL)
+ return bkey_s_c_err(-EIO);
+
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
@@ -2395,23 +2533,18 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
struct bkey_i *next_update;
- next_update = btree_trans_peek_updates(iter);
- if (next_update &&
+ if ((next_update = btree_trans_peek_updates(iter)) &&
!bpos_cmp(next_update->k.p, iter->pos)) {
iter->k = next_update->k;
k = bkey_i_to_s_c(next_update);
- } else {
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ goto out;
}
- if (!k.k ||
- ((iter->flags & BTREE_ITER_ALL_SNAPSHOTS)
- ? bpos_cmp(iter->pos, k.k->p)
- : bkey_cmp(iter->pos, k.k->p))) {
- bkey_init(&iter->k);
- iter->k.p = iter->pos;
- k = (struct bkey_s_c) { &iter->k, NULL };
- }
+ if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
+ (k = btree_trans_peek_slot_journal(trans, iter)).k)
+ goto out;
+
+ k = bch2_btree_path_peek_slot(iter->path, &iter->k);
} else {
struct bpos next;
@@ -2455,7 +2588,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
k = (struct bkey_s_c) { &iter->k, NULL };
}
}
-
+out:
iter->path->should_be_locked = true;
bch2_btree_iter_verify_entry_exit(iter);
@@ -2635,6 +2768,9 @@ static void __bch2_trans_iter_init(struct btree_trans *trans,
btree_type_has_snapshots(btree_id))
flags |= BTREE_ITER_FILTER_SNAPSHOTS;
+ if (trans->journal_replay_not_finished)
+ flags |= BTREE_ITER_WITH_JOURNAL;
+
iter->trans = trans;
iter->path = NULL;
iter->btree_id = btree_id;
@@ -2801,6 +2937,8 @@ void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
memset(trans, 0, sizeof(*trans));
trans->c = c;
trans->ip = _RET_IP_;
+ trans->journal_replay_not_finished =
+ !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags);
bch2_trans_alloc_paths(trans, c);
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 2c8b30949e6f..1fd0cebe30ac 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -207,10 +207,11 @@ struct btree_node_iter {
#define BTREE_ITER_CACHED_NOFILL (1 << 8)
#define BTREE_ITER_CACHED_NOCREATE (1 << 9)
#define BTREE_ITER_WITH_UPDATES (1 << 10)
-#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 11)
-#define BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
-#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 13)
-#define BTREE_ITER_NOPRESERVE (1 << 14)
+#define BTREE_ITER_WITH_JOURNAL (1 << 11)
+#define __BTREE_ITER_ALL_SNAPSHOTS (1 << 12)
+#define BTREE_ITER_ALL_SNAPSHOTS (1 << 13)
+#define BTREE_ITER_FILTER_SNAPSHOTS (1 << 14)
+#define BTREE_ITER_NOPRESERVE (1 << 15)
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -381,6 +382,7 @@ struct btree_trans {
bool restarted:1;
bool paths_sorted:1;
bool journal_transaction_names:1;
+ bool journal_replay_not_finished:1;
/*
* For when bch2_trans_update notices we'll be splitting a compressed
* extent:
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 6ef0711431a1..17111c4228bd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -16,6 +16,7 @@
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
+#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"
@@ -1146,6 +1147,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
!btree_ptr_sectors_written(insert));
+ if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);
+
invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
if (invalid) {
diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c
index 8af9ba464b25..e95940ffad6b 100644
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
@@ -711,7 +711,7 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
ret = bch2_trans_commit_write_locked(trans, stopped_at, trace_ip);
- if (!ret && unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
+ if (!ret && unlikely(trans->journal_replay_not_finished))
bch2_drop_overwrites_from_journal(trans);
trans_for_each_update(trans, i)
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index e18d2ecf7f07..86421f65d139 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -1558,50 +1558,48 @@ void bch2_stripes_heap_start(struct bch_fs *c)
bch2_stripes_heap_insert(c, m, iter.pos);
}
-static int bch2_stripes_read_fn(struct btree_trans *trans, struct bkey_s_c k)
+int bch2_stripes_read(struct bch_fs *c)
{
+ struct btree_trans trans;
+ struct btree_iter iter;
+ struct bkey_s_c k;
const struct bch_stripe *s;
- struct bch_fs *c = trans->c;
struct stripe *m;
unsigned i;
- int ret = 0;
+ int ret;
- if (k.k->type != KEY_TYPE_stripe)
- return 0;
+ bch2_trans_init(&trans, c, 0, 0);
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- return ret;
+ for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ret) {
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- s = bkey_s_c_to_stripe(k).v;
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->alive = true;
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ s = bkey_s_c_to_stripe(k).v;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->alive = true;
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- spin_lock(&c->ec_stripes_heap_lock);
- bch2_stripes_heap_update(c, m, k.k->p.offset);
- spin_unlock(&c->ec_stripes_heap_lock);
-
- return ret;
-}
+ for (i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
-int bch2_stripes_read(struct bch_fs *c)
-{
- struct btree_trans trans;
- int ret;
+ spin_lock(&c->ec_stripes_heap_lock);
+ bch2_stripes_heap_update(c, m, k.k->p.offset);
+ spin_unlock(&c->ec_stripes_heap_lock);
+ }
+ bch2_trans_iter_exit(&trans, &iter);
- bch2_trans_init(&trans, c, 0, 0);
- ret = bch2_btree_and_journal_walk(&trans, BTREE_ID_stripes,
- bch2_stripes_read_fn);
bch2_trans_exit(&trans);
+
if (ret)
bch_err(c, "error reading stripes: %i", ret);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 219351654564..57311ad283c7 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -59,23 +59,21 @@ static void zero_out_btree_mem_ptr(struct journal_keys *keys)
static int __journal_key_cmp(enum btree_id l_btree_id,
unsigned l_level,
struct bpos l_pos,
- struct journal_key *r)
+ const struct journal_key *r)
{
return (cmp_int(l_btree_id, r->btree_id) ?:
cmp_int(l_level, r->level) ?:
bpos_cmp(l_pos, r->k->k.p));
}
-static int journal_key_cmp(struct journal_key *l, struct journal_key *r)
+static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r)
{
- return (cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p));
+ return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r);
}
-static size_t journal_key_search(struct journal_keys *journal_keys,
- enum btree_id id, unsigned level,
- struct bpos pos)
+size_t bch2_journal_key_search(struct journal_keys *journal_keys,
+ enum btree_id id, unsigned level,
+ struct bpos pos)
{
size_t l = 0, r = journal_keys->nr, m;
@@ -125,7 +123,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
};
struct journal_keys *keys = &c->journal_keys;
struct journal_iter *iter;
- unsigned idx = journal_key_search(keys, id, level, k->k.p);
+ size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
BUG_ON(test_bit(BCH_FS_RW, &c->flags));
@@ -164,6 +162,11 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
return 0;
}
+/*
+ * Can only be used from the recovery thread while we're still RO - can't be
+ * used once we've got RW, as journal_keys is at that point used by multiple
+ * threads:
+ */
int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id,
unsigned level, struct bkey_i *k)
{
@@ -196,7 +199,7 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos pos)
{
struct journal_keys *keys = &c->journal_keys;
- size_t idx = journal_key_search(keys, btree, level, pos);
+ size_t idx = bch2_journal_key_search(keys, btree, level, pos);
if (idx < keys->nr &&
keys->d[idx].btree_id == btree &&
@@ -207,15 +210,18 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter)
{
- struct journal_key *k = iter->idx - iter->keys->nr
- ? iter->keys->d + iter->idx : NULL;
+ struct journal_key *k = iter->keys->d + iter->idx;
- if (k &&
- k->btree_id == iter->btree_id &&
- k->level == iter->level)
- return k->k;
+ while (k < iter->keys->d + iter->keys->nr &&
+ k->btree_id == iter->btree_id &&
+ k->level == iter->level) {
+ if (!k->overwritten)
+ return k->k;
+
+ iter->idx++;
+ k = iter->keys->d + iter->idx;
+ }
- iter->idx = iter->keys->nr;
return NULL;
}
@@ -238,8 +244,7 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->btree_id = id;
iter->level = level;
iter->keys = &c->journal_keys;
- iter->idx = journal_key_search(&c->journal_keys, id, level, pos);
- list_add(&iter->list, &c->journal_iters);
+ iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -325,106 +330,33 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
bch2_journal_iter_exit(&iter->journal);
}
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
- struct bch_fs *c,
- struct btree *b)
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b,
+ struct btree_node_iter node_iter,
+ struct bpos pos)
{
memset(iter, 0, sizeof(*iter));
iter->b = b;
- bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b);
- bch2_journal_iter_init(c, &iter->journal,
- b->c.btree_id, b->c.level, b->data->min_key);
-}
-
-/* Walk btree, overlaying keys from the journal: */
-
-static void btree_and_journal_iter_prefetch(struct bch_fs *c, struct btree *b,
- struct btree_and_journal_iter iter)
-{
- unsigned i = 0, nr = b->c.level > 1 ? 2 : 16;
- struct bkey_s_c k;
- struct bkey_buf tmp;
-
- BUG_ON(!b->c.level);
-
- bch2_bkey_buf_init(&tmp);
-
- while (i < nr &&
- (k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- bch2_btree_node_prefetch(c, NULL, NULL, tmp.k,
- b->c.btree_id, b->c.level - 1);
-
- bch2_btree_and_journal_iter_advance(&iter);
- i++;
- }
-
- bch2_bkey_buf_exit(&tmp, c);
-}
-
-static int bch2_btree_and_journal_walk_recurse(struct btree_trans *trans, struct btree *b,
- enum btree_id btree_id,
- btree_walk_key_fn key_fn)
-{
- struct bch_fs *c = trans->c;
- struct btree_and_journal_iter iter;
- struct bkey_s_c k;
- struct bkey_buf tmp;
- struct btree *child;
- int ret = 0;
-
- bch2_bkey_buf_init(&tmp);
- bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-
- while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
- if (b->c.level) {
- bch2_bkey_buf_reassemble(&tmp, c, k);
-
- child = bch2_btree_node_get_noiter(c, tmp.k,
- b->c.btree_id, b->c.level - 1,
- false);
-
- ret = PTR_ERR_OR_ZERO(child);
- if (ret)
- break;
-
- btree_and_journal_iter_prefetch(c, b, iter);
-
- ret = bch2_btree_and_journal_walk_recurse(trans, child,
- btree_id, key_fn);
- six_unlock_read(&child->c.lock);
- } else {
- ret = key_fn(trans, k);
- }
-
- if (ret)
- break;
-
- bch2_btree_and_journal_iter_advance(&iter);
- }
-
- bch2_btree_and_journal_iter_exit(&iter);
- bch2_bkey_buf_exit(&tmp, c);
- return ret;
+ iter->node_iter = node_iter;
+ bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
+ INIT_LIST_HEAD(&iter->journal.list);
}
-int bch2_btree_and_journal_walk(struct btree_trans *trans, enum btree_id btree_id,
- btree_walk_key_fn key_fn)
+/*
+ * this version is used by btree_gc before filesystem has gone RW and
+ * multithreaded, so uses the journal_iters list:
+ */
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
+ struct bch_fs *c,
+ struct btree *b)
{
- struct bch_fs *c = trans->c;
- struct btree *b = c->btree_roots[btree_id].b;
- int ret = 0;
-
- if (btree_node_fake(b))
- return 0;
-
- six_lock_read(&b->c.lock, NULL, NULL);
- ret = bch2_btree_and_journal_walk_recurse(trans, b, btree_id, key_fn);
- six_unlock_read(&b->c.lock);
+ struct btree_node_iter node_iter;
- return ret;
+ bch2_btree_node_iter_init_from_start(&node_iter, b);
+ __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
+ list_add(&iter->journal.list, &c->journal_iters);
}
/* sort and dedup all keys in the journal: */
@@ -449,9 +381,7 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
const struct journal_key *l = _l;
const struct journal_key *r = _r;
- return cmp_int(l->btree_id, r->btree_id) ?:
- cmp_int(l->level, r->level) ?:
- bpos_cmp(l->k->k.p, r->k->k.p) ?:
+ return journal_key_cmp(l, r) ?:
cmp_int(l->journal_seq, r->journal_seq) ?:
cmp_int(l->journal_offset, r->journal_offset);
}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index a7a9496afb95..21bdad9db249 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,6 +31,9 @@ struct btree_and_journal_iter {
} last;
};
+size_t bch2_journal_key_search(struct journal_keys *, enum btree_id,
+ unsigned, struct bpos);
+
int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
unsigned, struct bkey_i *);
int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
@@ -45,14 +48,13 @@ struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *
struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *);
void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
+ struct bch_fs *, struct btree *,
+ struct btree_node_iter, struct bpos);
void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
struct bch_fs *,
struct btree *);
-typedef int (*btree_walk_key_fn)(struct btree_trans *, struct bkey_s_c);
-
-int bch2_btree_and_journal_walk(struct btree_trans *, enum btree_id, btree_walk_key_fn);
-
void bch2_journal_keys_free(struct journal_keys *);
void bch2_journal_entries_free(struct list_head *);