author     Linus Torvalds <torvalds@linux-foundation.org>   2024-03-15 09:00:09 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2024-03-15 09:00:09 -0700
commit     32a50540c3d26341698505998dfca5b0e8fb4fd4 (patch)
tree       d50aae41b90ae2d24c7790af7f2e27e7ec05939e
parent     e5eb28f6d1afebed4bb7d740a797d0390bd3a357 (diff)
parent     be28368b2ccb328b207c9f66c35bb088d91e6a03 (diff)
Merge tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs
Pull bcachefs updates from Kent Overstreet:
- Subvolume children btree; this is needed for providing a userspace
interface for walking subvolumes, which will come later
- Lots of improvements to directory structure checking
- Improved journal pipelining, significantly improving performance on
high iodepth write workloads
- Discard path improvements: the discard path is more efficient, and no
longer flushes the journal unnecessarily
- Buffered write path can now avoid taking the inode lock
- New mm helper: memalloc_flags_{save|restore}
- Mempools can now be backed by kvmalloc() (see the sketch after this list)
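The last two items are core-kernel additions consumed by bcachefs. A rough sketch of how they are intended to be used follows; the exact signatures of memalloc_flags_save()/memalloc_flags_restore() and the name of the kvmalloc mempool initializer are assumptions here, not copied from the merged headers:

```c
/*
 * Sketch only. Assumes memalloc_flags_save() takes and returns a PF_* flags
 * word for the current task, and that mempool_init_kvmalloc_pool() is the
 * kvmalloc-backed analogue of mempool_init_kmalloc_pool().
 */
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/mempool.h>

static void *alloc_in_nofs_scope(size_t size)
{
	/* Allocations inside this scope implicitly behave as GFP_NOFS. */
	unsigned flags = memalloc_flags_save(PF_MEMALLOC_NOFS);
	void *p = kvmalloc(size, GFP_KERNEL);

	memalloc_flags_restore(flags);
	return p;
}

static mempool_t bounce_pool;

static int init_bounce_pool(size_t buf_size)
{
	/* A mempool whose elements come from kvmalloc()/kvfree(). */
	return mempool_init_kvmalloc_pool(&bounce_pool, 1, buf_size);
}
```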
* tag 'bcachefs-2024-03-13' of https://evilpiepirate.org/git/bcachefs: (128 commits)
bcachefs: time_stats: shrink time_stat_buffer for better alignment
bcachefs: time_stats: split stats-with-quantiles into a separate structure
bcachefs: mean_and_variance: put struct mean_and_variance_weighted on a diet
bcachefs: time_stats: add larger units
bcachefs: pull out time_stats.[ch]
bcachefs: reconstruct_alloc cleanup
bcachefs: fix bch_folio_sector padding
bcachefs: Fix btree key cache coherency during replay
bcachefs: Always flush write buffer in delete_dead_inodes()
bcachefs: Fix order of gc_done passes
bcachefs: fix deletion of indirect extents in btree_gc
bcachefs: Prefer struct_size over open coded arithmetic
bcachefs: Kill unused flags argument to btree_split()
bcachefs: Check for writing superblocks with nonsense member seq fields
bcachefs: fix bch2_journal_buf_to_text()
lib/generic-radix-tree.c: Make nodes more reasonably sized
bcachefs: copy_(to|from)_user_errcode()
bcachefs: Split out bkey_types.h
bcachefs: fix lost journal buf wakeup due to improved pipelining
bcachefs: intercept mountoption value for bool type
...
95 files changed, 3770 insertions, 2253 deletions
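The first file added in the diff below, Documentation/filesystems/bcachefs/errorcodes.rst, describes the private error code scheme. A cut-down sketch of that pattern is shown here; the x-macro layout, the EXAMPLE_* names and the sample codes are illustrative assumptions, not the contents of fs/bcachefs/errcode.h:

```c
/* Illustrative only: the private-error-code pattern the documentation describes. */
#include <linux/errno.h>

#define EXAMPLE_ERRCODES()					\
	x(ENOSPC,	ENOSPC_disk_reservation)		\
	x(ENOSPC,	ENOSPC_bucket_alloc)			\
	x(EIO,		btree_node_read_error)

enum example_errcode {
	EXAMPLE_ERR_START = 2048,	/* keep clear of standard errnos */
#define x(class, err)	EXAMPLE_ERR_##err,
	EXAMPLE_ERRCODES()
#undef x
	EXAMPLE_ERR_MAX
};

/* At the module boundary, map a private code back to its standard class. */
static inline int example_err_class(int err)
{
	switch (-err) {
#define x(class, err)	case EXAMPLE_ERR_##err: return -class;
	EXAMPLE_ERRCODES()
#undef x
	default:
		return err;	/* already a standard errno, or success */
	}
}
```

Because each private code is thrown from exactly one place, seeing "-EXAMPLE_ERR_btree_node_read_error" in a log pins down the file and line that produced it, while callers outside the module still see a plain -EIO.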
diff --git a/Documentation/filesystems/bcachefs/errorcodes.rst b/Documentation/filesystems/bcachefs/errorcodes.rst new file mode 100644 index 000000000000..2cccaa0ba7cd --- /dev/null +++ b/Documentation/filesystems/bcachefs/errorcodes.rst @@ -0,0 +1,30 @@ +.. SPDX-License-Identifier: GPL-2.0 + +bcachefs private error codes +---------------------------- + +In bcachefs, as a hard rule we do not throw or directly use standard error +codes (-EINVAL, -EBUSY, etc.). Instead, we define private error codes as needed +in fs/bcachefs/errcode.h. + +This gives us much better error messages and makes debugging much easier. Any +direct uses of standard error codes you see in the source code are simply old +code that has yet to be converted - feel free to clean it up! + +Private error codes may subtype another error code, this allows for grouping of +related errors that should be handled similarly (e.g. transaction restart +errors), as well as specifying which standard error code should be returned at +the bcachefs module boundary. + +At the module boundary, we use bch2_err_class() to convert to a standard error +code; this also emits a trace event so that the original error code be +recovered even if it wasn't logged. + +Do not reuse error codes! Generally speaking, a private error code should only +be thrown in one place. That means that when we see it in a log message we can +see, unambiguously, exactly which file and line number it was returned from. + +Try to give error codes names that are as reasonably descriptive of the error +as possible. Frequently, the error will be logged at a place far removed from +where the error was generated; good names for error codes mean much more +descriptive and useful error messages. diff --git a/MAINTAINERS b/MAINTAINERS index f4d7f7cb7577..e58171cb32fd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3555,6 +3555,7 @@ R: Brian Foster <bfoster@redhat.com> L: linux-bcachefs@vger.kernel.org S: Supported C: irc://irc.oftc.net/bcache +T: git https://evilpiepirate.org/git/bcachefs.git F: fs/bcachefs/ BDISP ST MEDIA DRIVER diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 1a05cecda7cc..b02796c8a595 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -82,6 +82,7 @@ bcachefs-y := \ super-io.o \ sysfs.o \ tests.o \ + time_stats.o \ thread_with_file.o \ trace.o \ two_state_shared_lock.o \ @@ -90,3 +91,6 @@ bcachefs-y := \ xattr.o obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o + +# Silence "note: xyz changed in GCC X.X" messages +subdir-ccflags-y += $(call cc-disable-warning, psabi) diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index fd3e175d8342..c47f72f2bd58 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -29,6 +29,8 @@ #include <linux/sched/task.h> #include <linux/sort.h> +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket); + /* Persistent alloc info: */ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { @@ -860,23 +862,28 @@ int bch2_trigger_alloc(struct btree_trans *trans, *bucket_gen(ca, new.k->p.offset) = new_a->gen; bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false); + percpu_up_read(&c->mark_lock); + +#define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) +#define statechange(expr) !eval_state(old_a, expr) && eval_state(new_a, expr) +#define bucket_flushed(a) (!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk) - if (new_a->data_type == BCH_DATA_free && - (!new_a->journal_seq 
|| new_a->journal_seq < c->journal.flushed_seq_ondisk)) + if (statechange(a->data_type == BCH_DATA_free) && + bucket_flushed(new_a)) closure_wake_up(&c->freelist_wait); - if (new_a->data_type == BCH_DATA_need_discard && - (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) - bch2_do_discards(c); + if (statechange(a->data_type == BCH_DATA_need_discard) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && + bucket_flushed(new_a)) + bch2_discard_one_bucket_fast(c, new.k->p); - if (old_a->data_type != BCH_DATA_cached && - new_a->data_type == BCH_DATA_cached && + if (statechange(a->data_type == BCH_DATA_cached) && + !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) && should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) bch2_do_invalidates(c); - if (new_a->data_type == BCH_DATA_need_gc_gens) + if (statechange(a->data_type == BCH_DATA_need_gc_gens)) bch2_do_gc_gens(c); - percpu_up_read(&c->mark_lock); } if ((flags & BTREE_TRIGGER_GC) && @@ -1045,14 +1052,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != discard_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, need_discard_key_wrong, - "incorrect key in need_discard btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[discard_key_type], - (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != discard_key_type, + c, need_discard_key_wrong, + "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1076,15 +1082,14 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (k.k->type != freespace_key_type && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_key_wrong, - "incorrect key in freespace btree (got %s should be %s)\n" - " %s", - bch2_bkey_types[k.k->type], - bch2_bkey_types[freespace_key_type], - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(k.k->type != freespace_key_type, + c, freespace_key_wrong, + "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1108,14 +1113,13 @@ int bch2_check_alloc_key(struct btree_trans *trans, if (ret) goto err; - if (a->gen != alloc_gen(k, gens_offset) && - (c->opts.reconstruct_alloc || - fsck_err(c, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", - alloc_gen(k, gens_offset), a->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), + c, bucket_gens_key_wrong, + "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_bucket_gens *g = bch2_trans_kmalloc(trans, sizeof(*g)); @@ -1167,14 +1171,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, *end = bkey_min(k.k->p, *end); - if (k.k->type != KEY_TYPE_set && - (c->opts.reconstruct_alloc || - fsck_err(c, freespace_hole_missing, - "hole in alloc btree missing in freespace btree\n" - " 
device %llu buckets %llu-%llu", - freespace_iter->pos.inode, - freespace_iter->pos.offset, - end->offset))) { + if (fsck_err_on(k.k->type != KEY_TYPE_set, + c, freespace_hole_missing, + "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset)) { struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(*update)); @@ -1604,6 +1607,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) return ret; } +static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket) +{ + int ret; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + ret = -EEXIST; + goto out; + } + + ret = darray_push(&c->discard_buckets_in_flight, bucket); +out: + mutex_unlock(&c->discard_buckets_in_flight_lock); + return ret; +} + +static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket) +{ + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) + if (bkey_eq(*i, bucket)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + goto found; + } + BUG(); +found: + mutex_unlock(&c->discard_buckets_in_flight_lock); +} + struct discard_buckets_state { u64 seen; u64 open; @@ -1642,6 +1675,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, struct bch_dev *ca; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; + bool discard_locked = false; int ret = 0; ca = bch_dev_bkey_exists(c, pos.inode); @@ -1709,6 +1743,11 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, goto out; } + if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true))) + goto out; + + discard_locked = true; + if (!bkey_eq(*discard_pos_done, iter.pos) && ca->mi.discard && !c->opts.nochanges) { /* @@ -1740,6 +1779,8 @@ write: count_event(c, bucket_discard); s->discarded++; out: + if (discard_locked) + discard_in_flight_remove(c, iter.pos); s->seen++; bch2_trans_iter_exit(trans, &iter); percpu_ref_put(&ca->io_ref); @@ -1779,6 +1820,93 @@ void bch2_do_discards(struct bch_fs *c) bch2_write_ref_put(c, BCH_WRITE_REF_discard); } +static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket) +{ + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT); + struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter); + int ret = bkey_err(k); + if (ret) + goto err; + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto err; + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void bch2_do_discards_fast_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work); + + while (1) { + bool got_bucket = false; + struct bpos bucket; + struct bch_dev *ca; + + mutex_lock(&c->discard_buckets_in_flight_lock); + darray_for_each(c->discard_buckets_in_flight, i) { + if (i->snapshot) + continue; + + ca = bch_dev_bkey_exists(c, i->inode); + + if (!percpu_ref_tryget(&ca->io_ref)) { + darray_remove_item(&c->discard_buckets_in_flight, i); + continue; + } + + got_bucket = true; + bucket = *i; + i->snapshot = true; + break; + } + mutex_unlock(&c->discard_buckets_in_flight_lock); + + if (!got_bucket) + break; + + if (ca->mi.discard && 
!c->opts.nochanges) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + + int ret = bch2_trans_do(c, NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, + bch2_clear_bucket_needs_discard(trans, bucket)); + bch_err_fn(c, ret); + + percpu_ref_put(&ca->io_ref); + discard_in_flight_remove(c, bucket); + + if (ret) + break; + } + + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + +static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + if (!percpu_ref_is_dying(&ca->io_ref) && + !discard_in_flight_add(c, bucket) && + bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) && + !queue_work(c->write_ref_wq, &c->discard_fast_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); +} + static int invalidate_one_bucket(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, @@ -2210,9 +2338,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) set_bit(ca->dev_idx, c->rw_devs[i].d); } +void bch2_fs_allocator_background_exit(struct bch_fs *c) +{ + darray_exit(&c->discard_buckets_in_flight); +} + void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); + mutex_init(&c->discard_buckets_in_flight_lock); INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work); INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index e7f7e842ee1b..052b2fac25d6 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); +void bch2_fs_allocator_background_exit(struct bch_fs *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 633d3223b353..ca58193dd902 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -236,8 +236,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * if (cl) closure_wait(&c->open_buckets_wait, cl); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); return ERR_PTR(-BCH_ERR_open_buckets_empty); } @@ -263,11 +262,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * ca->nr_open_buckets++; bch2_open_bucket_hash_add(c, ob); - track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], - &c->blocked_allocate_open_bucket, false); - - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, false); + track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false); + track_event_change(&c->times[BCH_TIME_blocked_allocate], false); spin_unlock(&c->freelist_lock); return ob; @@ -555,8 +551,7 @@ again: goto again; } - track_event_change(&c->times[BCH_TIME_blocked_allocate], - &c->blocked_allocate, true); + track_event_change(&c->times[BCH_TIME_blocked_allocate], true); ob = ERR_PTR(-BCH_ERR_freelist_empty); goto err; diff --git a/fs/bcachefs/backpointers.c 
b/fs/bcachefs/backpointers.c index 569b97904da4..8cb35ea572cb 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -131,8 +131,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, printbuf_exit(&buf); if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - bch2_inconsistent_error(c); - return -EIO; + return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; } else { return 0; } @@ -478,8 +477,7 @@ missing: prt_printf(&buf, "\nbp pos "); bch2_bpos_to_text(&buf, bp_iter.pos); - if (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) + if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); goto out; @@ -555,60 +553,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) }; } -static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; - u64 mem_bytes; - si_meminfo(&i); - mem_bytes = i.totalram * i.mem_unit; - return div_u64(mem_bytes >> 1, c->opts.btree_node_size); + + u64 mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, - unsigned btree_leaf_mask, - unsigned btree_interior_mask, + u64 btree_leaf_mask, + u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { - struct btree_iter iter; - struct bkey_s_c k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - enum btree_id btree; + struct bch_fs *c = trans->c; + s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; - for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { - unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + btree_interior_mask |= btree_leaf_mask; + + c->btree_cache.pinned_nodes_leaf_mask = btree_leaf_mask; + c->btree_cache.pinned_nodes_interior_mask = btree_interior_mask; + c->btree_cache.pinned_nodes_start = start; + c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; + + for (enum btree_id btree = start.btree; + btree < BTREE_ID_NR && !ret; + btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1; + struct btree_iter iter; + struct btree *b; if (!((1U << btree) & btree_leaf_mask) && !((1U << btree) & btree_interior_mask)) continue; - bch2_trans_node_iter_init(trans, &iter, btree, - btree == start.btree ? start.pos : POS_MIN, - 0, depth, 0); - /* - * for_each_btree_key_contineu() doesn't check the return value - * from bch2_btree_iter_advance(), which is needed when - * iterating over interior nodes where we'll see keys at - * SPOS_MAX: - */ - do { - k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); - ret = bkey_err(k); - if (!k.k || ret) - break; - - --btree_nodes; - if (!btree_nodes) { - *end = BBPOS(btree, k.k->p); + __for_each_btree_node(trans, iter, btree, + btree == start.btree ? 
start.pos : POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + mem_may_pin -= btree_buf_bytes(b); + if (mem_may_pin <= 0) { + c->btree_cache.pinned_nodes_end = *end = + BBPOS(btree, b->key.k.p); bch2_trans_iter_exit(trans, &iter); return 0; } - } while (bch2_btree_iter_advance(&iter)); + } bch2_trans_iter_exit(trans, &iter); } - *end = BBPOS_MAX; return ret; } @@ -666,62 +665,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, return 0; } -static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, - struct bpos bucket) -{ - return bch2_dev_exists2(c, bucket.inode) - ? bucket_pos_to_bp(c, bucket, 0) - : bucket; -} - -static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, - struct bpos start, struct bpos *end) -{ - struct btree_iter alloc_iter; - struct btree_iter bp_iter; - struct bkey_s_c alloc_k, bp_k; - size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); - bool alloc_end = false, bp_end = false; - int ret = 0; - - bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, - start, 0, 1, 0); - bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); - while (1) { - alloc_k = !alloc_end - ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) - : bkey_s_c_null; - bp_k = !bp_end - ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) - : bkey_s_c_null; - - ret = bkey_err(alloc_k) ?: bkey_err(bp_k); - if ((!alloc_k.k && !bp_k.k) || ret) { - *end = SPOS_MAX; - break; - } - - --btree_nodes; - if (!btree_nodes) { - *end = alloc_k.k ? alloc_k.k->p : SPOS_MAX; - break; - } - - if (bpos_lt(alloc_iter.pos, SPOS_MAX) && - bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { - if (!bch2_btree_iter_advance(&alloc_iter)) - alloc_end = true; - } else { - if (!bch2_btree_iter_advance(&bp_iter)) - bp_end = true; - } - } - bch2_trans_iter_exit(trans, &bp_iter); - bch2_trans_iter_exit(trans, &alloc_iter); - return ret; -} - int bch2_check_extents_to_backpointers(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); @@ -732,10 +675,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bkey_init(&s.last_flushed.k->k); while (1) { - ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end); + struct bbpos end; + ret = bch2_get_btree_in_memory_pos(trans, + BIT_ULL(BTREE_ID_backpointers), + BIT_ULL(BTREE_ID_backpointers), + BBPOS(BTREE_ID_backpointers, s.bucket_start), &end); if (ret) break; + s.bucket_end = end.pos; + if ( bpos_eq(s.bucket_start, POS_MIN) && !bpos_eq(s.bucket_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", @@ -763,6 +712,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } @@ -868,6 +820,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) } bch2_trans_put(trans); + c->btree_cache.pinned_nodes_leaf_mask = 0; + c->btree_cache.pinned_nodes_interior_mask = 0; + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h index 5198e94cf3b8..f63893344f80 100644 --- a/fs/bcachefs/bbpos_types.h +++ b/fs/bcachefs/bbpos_types.h @@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) } #define BBPOS_MIN BBPOS(0, POS_MIN) -#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) 
+#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, SPOS_MAX) #endif /* _BCACHEFS_BBPOS_TYPES_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 69d0d60d50e3..339dc3e1dcd3 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -212,6 +212,7 @@ #include "recovery_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "time_stats.h" #include "util.h" #ifdef CONFIG_BCACHEFS_DEBUG @@ -266,6 +267,9 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") __printf(2, 3) +void bch2_print_opts(struct bch_opts *, const char *, ...); + +__printf(2, 3) void __bch2_print(struct bch_fs *c, const char *fmt, ...); #define maybe_dev_to_fs(_c) _Generic((_c), \ @@ -504,6 +508,7 @@ enum gc_phase { GC_PHASE_BTREE_deleted_inodes, GC_PHASE_BTREE_logged_ops, GC_PHASE_BTREE_rebalance_work, + GC_PHASE_BTREE_subvolume_children, GC_PHASE_PENDING_DELETE, }; @@ -593,7 +598,7 @@ struct bch_dev { /* The rest of this all shows up in sysfs */ atomic64_t cur_latency[2]; - struct bch2_time_stats io_latency[2]; + struct bch2_time_stats_quantiles io_latency[2]; #define CONGESTED_MAX 1024 atomic_t congested; @@ -663,6 +668,8 @@ struct journal_seq_blacklist_table { }; struct journal_keys { + /* must match layout in darray_types.h */ + size_t nr, size; struct journal_key { u64 journal_seq; u32 journal_offset; @@ -671,15 +678,13 @@ struct journal_keys { bool allocated; bool overwritten; struct bkey_i *k; - } *d; + } *data; /* * Gap buffer: instead of all the empty space in the array being at the * end of the buffer - from @nr to @size - the empty space is at @gap. * This means that sequential insertions are O(n) instead of O(n^2). */ size_t gap; - size_t nr; - size_t size; atomic_t ref; bool initial_ref_held; }; @@ -703,6 +708,7 @@ struct btree_trans_buf { x(reflink) \ x(fallocate) \ x(discard) \ + x(discard_fast) \ x(invalidate) \ x(delete_dead_snapshots) \ x(snapshot_delete_pagecache) \ @@ -919,8 +925,6 @@ struct bch_fs { /* ALLOCATOR */ spinlock_t freelist_lock; struct closure_waitlist freelist_wait; - u64 blocked_allocate; - u64 blocked_allocate_open_bucket; open_bucket_idx_t open_buckets_freelist; open_bucket_idx_t open_buckets_nr_free; @@ -940,8 +944,11 @@ struct bch_fs { unsigned write_points_nr; struct buckets_waiting_for_journal buckets_waiting_for_journal; - struct work_struct discard_work; struct work_struct invalidate_work; + struct work_struct discard_work; + struct mutex discard_buckets_in_flight_lock; + DARRAY(struct bpos) discard_buckets_in_flight; + struct work_struct discard_fast_work; /* GARBAGE COLLECTION */ struct task_struct *gc_thread; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 0668b682a21c..bff8750ac0d7 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -189,7 +189,11 @@ struct bversion { __u32 hi; __u64 lo; #endif -} __packed __aligned(4); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +__aligned(4) +#endif +; struct bkey { /* Size of combined key and value, in u64s */ @@ -222,7 +226,36 @@ struct bkey { __u8 pad[1]; #endif -} __packed __aligned(8); +} __packed +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +/* + * The big-endian version of bkey can't be compiled by rustc with the "aligned" + * attr since it doesn't allow types to have both "packed" and "aligned" attrs. + * So for Rust compatibility, don't include this. It can be included in the LE + * version because the "packed" attr is redundant in that case. 
+ * + * History: (quoting Kent) + * + * Specifically, when i was designing bkey, I wanted the header to be no + * bigger than necessary so that bkey_packed could use the rest. That means that + * decently offten extent keys will fit into only 8 bytes, instead of spilling over + * to 16. + * + * But packed_bkey treats the part after the header - the packed section - + * as a single multi word, variable length integer. And bkey, the unpacked + * version, is just a special case version of a bkey_packed; all the packed + * bkey code will work on keys in any packed format, the in-memory + * representation of an unpacked key also is just one type of packed key... + * + * So that constrains the key part of a bkig endian bkey to start right + * after the header. + * + * If we ever do a bkey_v2 and need to expand the hedaer by another byte for + * some reason - that will clean up this wart. + */ +__aligned(8) +#endif +; struct bkey_packed { __u64 _data[0]; @@ -840,7 +873,9 @@ struct bch_sb_field_downgrade { x(snapshot_skiplists, BCH_VERSION(1, 1)) \ x(deleted_inodes, BCH_VERSION(1, 2)) \ x(rebalance_work, BCH_VERSION(1, 3)) \ - x(member_seq, BCH_VERSION(1, 4)) + x(member_seq, BCH_VERSION(1, 4)) \ + x(subvolume_fs_parent, BCH_VERSION(1, 5)) \ + x(btree_subvolume_children, BCH_VERSION(1, 6)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1275,7 +1310,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(dev_usage, 8) \ x(log, 9) \ x(overwrite, 10) \ - x(write_buffer_keys, 11) + x(write_buffer_keys, 11) \ + x(datetime, 12) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1376,6 +1412,11 @@ struct jset_entry_log { u8 d[]; } __packed __aligned(8); +struct jset_entry_datetime { + struct jset_entry entry; + __le64 seconds; +} __packed __aligned(8); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1482,7 +1523,9 @@ enum btree_id_flags { BIT_ULL(KEY_TYPE_logged_op_truncate)| \ BIT_ULL(KEY_TYPE_logged_op_finsert)) \ x(rebalance_work, 18, BTREE_ID_SNAPSHOT_FIELD, \ - BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) + BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie)) \ + x(subvolume_children, 19, 0, \ + BIT_ULL(KEY_TYPE_set)) enum btree_id { #define x(name, nr, ...) 
BTREE_ID_##name = nr, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 831be01809f2..cf23ff47bed8 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -4,7 +4,7 @@ #include <linux/bug.h> #include "bcachefs_format.h" - +#include "bkey_types.h" #include "btree_types.h" #include "util.h" #include "vstructs.h" @@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *, const struct bkey_format *, const struct bkey_packed *); -/* bkey with split value, const */ -struct bkey_s_c { - const struct bkey *k; - const struct bch_val *v; -}; - -/* bkey with split value */ -struct bkey_s { - union { - struct { - struct bkey *k; - struct bch_val *v; - }; - struct bkey_s_c s_c; - }; -}; - -#define bkey_p_next(_k) vstruct_next(_k) - -static inline struct bkey_i *bkey_next(struct bkey_i *k) -{ - return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); -} - -#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) - -static inline size_t bkey_val_bytes(const struct bkey *k) -{ - return bkey_val_u64s(k) * sizeof(u64); -} - -static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -{ - unsigned u64s = BKEY_U64s + val_u64s; - - BUG_ON(u64s > U8_MAX); - k->u64s = u64s; -} - -static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -{ - set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); -} - -#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) - -#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) - -#define bkey_whiteout(_k) \ - ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) - enum bkey_lr_packed { BKEY_PACKED_BOTH, BKEY_PACKED_RIGHT, @@ -362,10 +311,7 @@ static inline struct bpos bkey_start_pos(const struct bkey *k) static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, const struct bkey_packed *k) { - unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; - - EBUG_ON(k->u64s < ret); - return ret; + return bkey_packed(k) ? format->key_u64s : BKEY_U64s; } static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, @@ -553,155 +499,6 @@ static inline void bkey_reassemble(struct bkey_i *dst, memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); } -#define bkey_s_null ((struct bkey_s) { .k = NULL }) -#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) - -#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) - -static inline struct bkey_s bkey_to_s(struct bkey *k) -{ - return (struct bkey_s) { .k = k, .v = NULL }; -} - -static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -{ - return (struct bkey_s_c) { .k = k, .v = NULL }; -} - -static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -{ - return (struct bkey_s) { .k = &k->k, .v = &k->v }; -} - -static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -{ - return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -} - -/* - * For a given type of value (e.g. struct bch_extent), generates the types for - * bkey + bch_extent - inline, split, split const - and also all the conversion - * functions, which also check that the value is of the correct type. - * - * We use anonymous unions for upcasting - e.g. converting from e.g. a - * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion - * functions. - */ -#define x(name, ...) 
\ -struct bkey_i_##name { \ - union { \ - struct bkey k; \ - struct bkey_i k_i; \ - }; \ - struct bch_##name v; \ -}; \ - \ -struct bkey_s_c_##name { \ - union { \ - struct { \ - const struct bkey *k; \ - const struct bch_##name *v; \ - }; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -struct bkey_s_##name { \ - union { \ - struct { \ - struct bkey *k; \ - struct bch_##name *v; \ - }; \ - struct bkey_s_c_##name c; \ - struct bkey_s s; \ - struct bkey_s_c s_c; \ - }; \ -}; \ - \ -static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline const struct bkey_i_##name * \ -bkey_i_to_##name##_c(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return container_of(&k->k, struct bkey_i_##name, k); \ -} \ - \ -static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = k.k, \ - .v = container_of(k.v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -{ \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -name##_i_to_s_c(const struct bkey_i_##name *k) \ -{ \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = &k->v, \ - }; \ -} \ - \ -static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_s_c_##name \ -bkey_i_to_s_c_##name(const struct bkey_i *k) \ -{ \ - EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ - return (struct bkey_s_c_##name) { \ - .k = &k->k, \ - .v = container_of(&k->v, struct bch_##name, v), \ - }; \ -} \ - \ -static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -{ \ - struct bkey_i_##name *k = \ - container_of(&_k->k, struct bkey_i_##name, k); \ - \ - bkey_init(&k->k); \ - memset(&k->v, 0, sizeof(k->v)); \ - k->k.type = KEY_TYPE_##name; \ - set_bkey_val_bytes(&k->k, sizeof(k->v)); \ - \ - return k; \ -} - -BCH_BKEY_TYPES(); -#undef x - /* byte order helpers */ #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h new file mode 100644 index 000000000000..c9ae9e42b385 --- /dev/null +++ b/fs/bcachefs/bkey_types.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_TYPES_H +#define _BCACHEFS_BKEY_TYPES_H + +#include "bcachefs_format.h" + +/* + * bkey_i - bkey with inline value + * bkey_s - bkey with split value + * bkey_s_c - bkey with split value, const + */ + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static 
inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) 
\ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +#endif /* _BCACHEFS_BKEY_TYPES_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index d7c81beac14a..562561a9a510 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_io.h" @@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b) clear_btree_node_just_written(b); - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; #ifdef __KERNEL__ kvfree(b->aux_data); @@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) { BUG_ON(b->data || b->aux_data); - b->data = kvpmalloc(btree_buf_bytes(b), gfp); + b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) return 
-BCH_ERR_ENOMEM_btree_node_mem_alloc; #ifdef __KERNEL__ @@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->aux_data = NULL; #endif if (!b->aux_data) { - kvpfree(b->data, btree_buf_bytes(b)); + kvfree(b->data); b->data = NULL; return -BCH_ERR_ENOMEM_btree_node_mem_alloc; } @@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) int ret = 0; lockdep_assert_held(&bc->lock); + + struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p); + + u64 mask = b->c.level + ? bc->pinned_nodes_interior_mask + : bc->pinned_nodes_leaf_mask; + + if ((mask & BIT_ULL(b->c.btree_id)) && + bbpos_cmp(bc->pinned_nodes_start, pos) < 0 && + bbpos_cmp(bc->pinned_nodes_end, pos) >= 0) + return -BCH_ERR_ENOMEM_btree_node_reclaim; + wait_on_io: if (b->flags & ((1U << BTREE_NODE_dirty)| (1U << BTREE_NODE_read_in_flight)| @@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) if (c->verify_data) list_move(&c->verify_data->list, &bc->live); - kvpfree(c->verify_ondisk, c->opts.btree_node_size); + kvfree(c->verify_ondisk); for (i = 0; i < btree_id_nr_alive(c); i++) { struct btree_root *r = bch2_btree_id_root(c, i); @@ -711,6 +724,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, b = bch2_btree_node_mem_alloc(trans, level != 0); if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + if (!path) + return b; + trans->memory_allocation_failure = true; trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); @@ -760,8 +776,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, } if (!six_relock_type(&b->c.lock, lock_type, seq)) { - if (path) - trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + BUG_ON(!path); + + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); } @@ -901,7 +918,7 @@ retry: if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -992,7 +1009,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path * if (unlikely(btree_node_read_error(b))) { six_unlock_type(&b->c.lock, lock_type); - return ERR_PTR(-EIO); + return ERR_PTR(-BCH_ERR_btree_node_read_error); } EBUG_ON(b->c.btree_id != path->btree_id); @@ -1075,7 +1092,7 @@ lock_node: if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); - b = ERR_PTR(-EIO); + b = ERR_PTR(-BCH_ERR_btree_node_read_error); goto out; } @@ -1096,7 +1113,7 @@ int bch2_btree_node_prefetch(struct btree_trans *trans, struct btree_cache *bc = &c->btree_cache; struct btree *b; - BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(path && !btree_node_locked(path, level + 1)); BUG_ON(level >= BTREE_MAX_DEPTH); b = btree_cache_find(bc, k); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 1102995643b1..584aee7010de 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -389,7 +389,8 @@ again: have_child = dropped_children = false; bch2_bkey_buf_init(&prev_k); bch2_bkey_buf_init(&cur_k); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = 
bch2_btree_and_journal_iter_peek(&iter)).k) { BUG_ON(bpos_lt(k.k->p, b->data->min_key)); @@ -406,7 +407,7 @@ again: printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(ret == -EIO, c, + if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c, btree_node_unreadable, "Topology repair: unreadable btree node at btree %s level %u:\n" " %s", @@ -478,7 +479,8 @@ again: goto err; bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { bch2_bkey_buf_reassemble(&cur_k, c, k); @@ -591,16 +593,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr); - if (!g->gen_valid && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_to_missing_alloc_key, - "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(!g->gen_valid, + c, ptr_to_missing_alloc_key, + "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -609,16 +610,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(p.ptr.gen, g->gen) > 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { if (!p.ptr.cached) { g->gen_valid = true; g->gen = p.ptr.gen; @@ -631,28 +631,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id } } - if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && - (c->opts.reconstruct_alloc || - fsck_err(c, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, + c, ptr_gen_newer_than_bucket_gen, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; - if (!p.ptr.cached && 
gen_cmp(p.ptr.gen, g->gen) < 0 && - (c->opts.reconstruct_alloc || - fsck_err(c, stale_dirty_ptr, - "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" - "while marking %s", - p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), - bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), - p.ptr.gen, g->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0, + c, stale_dirty_ptr, + "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_type_str(ptr_data_type(k->k, &p.ptr)), + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) do_update = true; if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) @@ -931,7 +929,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b struct printbuf buf = PRINTBUF; int ret = 0; - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); bch2_bkey_buf_init(&prev); bch2_bkey_buf_init(&cur); bkey_init(&prev.k->k); @@ -963,7 +961,8 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b if (b->c.level > target_depth) { bch2_btree_and_journal_iter_exit(&iter); - bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b); + iter.prefetch = true; while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { struct btree *child; @@ -976,7 +975,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b false); ret = PTR_ERR_OR_ZERO(child); - if (ret == -EIO) { + if (bch2_err_matches(ret, EIO)) { bch2_topology_error(c); if (__fsck_err(c, @@ -1190,9 +1189,7 @@ static void bch2_gc_free(struct bch_fs *c) genradix_free(&c->gc_stripes); for_each_member_device(c, ca) { - kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), - sizeof(struct bucket_array) + - ca->mi.nbuckets * sizeof(struct bucket)); + kvfree(rcu_dereference_protected(ca->buckets_gc, 1)); ca->buckets_gc = NULL; free_percpu(ca->usage_gc); @@ -1365,11 +1362,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); - struct bucket gc, *b; + struct bucket old_gc, gc, *b; struct bkey_i_alloc_v4 *a; struct bch_alloc_v4 old_convert, new; const struct bch_alloc_v4 *old; - enum bch_data_type type; int ret; old = bch2_alloc_to_v4(k, &old_convert); @@ -1377,30 +1373,31 @@ static int bch2_alloc_write_key(struct btree_trans *trans, percpu_down_read(&c->mark_lock); b = gc_bucket(ca, iter->pos.offset); + old_gc = *b; + + if ((old->data_type == BCH_DATA_sb || + old->data_type == BCH_DATA_journal) && + !bch2_dev_is_online(ca)) { + b->data_type = old->data_type; + b->dirty_sectors = old->dirty_sectors; + } /* * b->data_type doesn't yet include need_discard & need_gc_gen states - * fix that here: */ - type = __alloc_data_type(b->dirty_sectors, - b->cached_sectors, - b->stripe, - *old, - b->data_type); - if (b->data_type != type) { - struct bch_dev_usage *u; - - preempt_disable(); - u = this_cpu_ptr(ca->usage_gc); - u->d[b->data_type].buckets--; - b->data_type = type; - u->d[b->data_type].buckets++; - preempt_enable(); - } - + b->data_type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); gc = *b; percpu_up_read(&c->mark_lock); + if (gc.data_type != old_gc.data_type || + gc.dirty_sectors != 
old_gc.dirty_sectors) + bch2_dev_usage_update_m(c, ca, &old_gc, &gc); + if (metadata_only && gc.data_type != BCH_DATA_sb && gc.data_type != BCH_DATA_journal && @@ -1410,8 +1407,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, if (gen_after(old->gen, gc.gen)) return 0; - if (c->opts.reconstruct_alloc || - fsck_err_on(new.data_type != gc.data_type, c, + if (fsck_err_on(new.data_type != gc.data_type, c, alloc_key_data_type_wrong, "bucket %llu:%llu gen %u has wrong data_type" ": got %s, should be %s", @@ -1422,8 +1418,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans, new.data_type = gc.data_type; #define copy_bucket_field(_errtype, _f) \ - if (c->opts.reconstruct_alloc || \ - fsck_err_on(new._f != gc._f, c, _errtype, \ + if (fsck_err_on(new._f != gc._f, c, _errtype, \ "bucket %llu:%llu gen %u data type %s has wrong " #_f \ ": got %u, should be %u", \ iter->pos.inode, iter->pos.offset, \ @@ -1491,7 +1486,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) { for_each_member_device(c, ca) { - struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) + ca->mi.nbuckets * sizeof(struct bucket), GFP_KERNEL|__GFP_ZERO); if (!buckets) { @@ -1585,8 +1580,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, " should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { - struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); - + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -1595,6 +1589,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, new->k.type = KEY_TYPE_deleted; else *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount); + ret = bch2_trans_update(trans, iter, new, 0); } fsck_err: printbuf_exit(&buf); @@ -1817,10 +1812,10 @@ out: if (!ret) { bch2_journal_block(&c->journal); - ret = bch2_gc_stripes_done(c, metadata_only) ?: - bch2_gc_reflink_done(c, metadata_only) ?: - bch2_gc_alloc_done(c, metadata_only) ?: - bch2_gc_done(c, initial, metadata_only); + ret = bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only) ?: + bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only); bch2_journal_unblock(&c->journal); } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index aa9b6cbe3226..624c8287deb4 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size, if (used_mempool) mempool_free(p, &c->btree_bounce_pool); else - vpfree(p, size); + kvfree(p); } static void *btree_bounce_alloc(struct bch_fs *c, size_t size, @@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size, BUG_ON(size > c->opts.btree_node_size); *used_mempool = false; - p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); + p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT); if (!p) { *used_mempool = true; p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); @@ -581,8 +581,7 @@ static int __btree_err(int ret, break; case -BCH_ERR_btree_node_read_err_bad_node: bch2_print_string_as_lines(KERN_ERR, out.buf); - bch2_topology_error(c); - ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO; + ret = bch2_topology_error(c); break; case -BCH_ERR_btree_node_read_err_incompatible: bch2_print_string_as_lines(KERN_ERR, out.buf); @@ -840,6 
+839,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b, if (k->format > KEY_FORMAT_CURRENT) return false; + if (k->u64s < bkeyp_key_u64s(&b->format, k)) + return false; + struct printbuf buf = PRINTBUF; struct bkey tmp; struct bkey_s u = __bkey_disassemble(b, k, &tmp); @@ -881,7 +883,13 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b, "invalid bkey format %u", k->format)) goto drop_this_key; - /* XXX: validate k->u64s */ + if (btree_err_on(k->u64s < bkeyp_key_u64s(&b->format, k), + -BCH_ERR_btree_node_read_err_fixable, + c, NULL, b, i, + btree_node_bkey_bad_u64s, + "k->u64s too small (%u < %u)", k->u64s, bkeyp_key_u64s(&b->format, k))) + goto drop_this_key; + if (!write) bch2_bkey_compat(b->c.level, b->c.btree_id, version, BSET_BIG_ENDIAN(i), write, @@ -1737,7 +1745,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, list_move(&b->list, &c->btree_cache.freeable); mutex_unlock(&c->btree_cache.lock); - ret = -EIO; + ret = -BCH_ERR_btree_node_read_error; goto err; } @@ -1841,7 +1849,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_write_all_failed; + ret = -BCH_ERR_btree_node_write_all_failed; goto err; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 3ef338df82f5..51bcdc6c6d1c 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct bkey_s_c k; int ret = 0; - __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + __bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos); k = bch2_btree_and_journal_iter_peek(&jiter); @@ -1146,7 +1146,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, path = &trans->paths[path_idx]; if (unlikely(path->level >= BTREE_MAX_DEPTH)) - goto out; + goto out_uptodate; path->level = btree_path_up_until_good_node(trans, path, 0); @@ -1179,7 +1179,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, goto out; } } - +out_uptodate: path->uptodate = BTREE_ITER_UPTODATE; out: if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) @@ -1520,7 +1520,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans) { unsigned nr = trans->nr_paths * 2; - void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + + void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) + sizeof(struct btree_trans_paths) + nr * sizeof(struct btree_path) + nr * sizeof(btree_path_idx_t) + 8 + @@ -1729,7 +1729,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter) if (ret) return ret; - btree_path_set_should_be_locked(trans->paths + iter->path); + struct btree_path *path = btree_iter_path(trans, iter); + if (btree_path_node(path, path->level)) + btree_path_set_should_be_locked(path); return 0; } @@ -2305,7 +2307,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) btree_iter_path(trans, iter)->level); if (iter->flags & BTREE_ITER_WITH_JOURNAL) - return bkey_s_c_err(-EIO); + return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported); bch2_btree_iter_verify(iter); bch2_btree_iter_verify_entry_exit(iter); @@ -2503,6 +2505,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) k = bch2_btree_iter_peek_upto(&iter2, end); if (k.k && !bkey_err(k)) { + swap(iter->key_cache_path, 
iter2.key_cache_path); iter->k = iter2.k; k.k = &iter->k; } @@ -2762,6 +2765,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) struct btree_trans *trans = src->trans; *dst = *src; +#ifdef TRACK_PATH_ALLOCATED + dst->ip_allocated = _RET_IP_; +#endif if (src->path) __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT); if (src->update_path) @@ -3085,7 +3091,7 @@ void bch2_trans_put(struct btree_trans *trans) trans->paths = NULL; if (paths_allocated != trans->_paths_allocated) - kfree_rcu_mightsleep(paths_allocated); + kvfree_rcu_mightsleep(paths_allocated); if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) mempool_free(trans->mem, &c->btree_trans_mem_pool); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 719a94a84950..50e04356d72c 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -1,7 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bkey_buf.h" #include "bset.h" +#include "btree_cache.h" #include "btree_journal_iter.h" #include "journal_io.h" @@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) { - return keys->d + idx_to_pos(keys, idx); + return keys->data + idx_to_pos(keys, idx); } static size_t __bch2_journal_key_search(struct journal_keys *keys, @@ -180,10 +182,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, BUG_ON(test_bit(BCH_FS_rw, &c->flags)); if (idx < keys->size && - journal_key_cmp(&n, &keys->d[idx]) == 0) { - if (keys->d[idx].allocated) - kfree(keys->d[idx].k); - keys->d[idx] = n; + journal_key_cmp(&n, &keys->data[idx]) == 0) { + if (keys->data[idx].allocated) + kfree(keys->data[idx].k); + keys->data[idx] = n; return 0; } @@ -196,17 +198,17 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); - if (!new_keys.d) { + new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); return -BCH_ERR_ENOMEM_journal_key_insert; } /* Since @keys was full, there was no gap: */ - memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); - kvfree(keys->d); - keys->d = new_keys.d; + memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr); + kvfree(keys->data); + keys->data = new_keys.data; keys->nr = new_keys.nr; keys->size = new_keys.size; @@ -216,11 +218,10 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, journal_iters_move_gap(c, keys->gap, idx); - move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); - keys->gap = idx; + move_gap(keys, idx); keys->nr++; - keys->d[keys->gap++] = n; + keys->data[keys->gap++] = n; journal_iters_fix(c); @@ -267,10 +268,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, size_t idx = bch2_journal_key_search(keys, btree, level, pos); if (idx < keys->size && - keys->d[idx].btree_id == btree && - keys->d[idx].level == level && - bpos_eq(keys->d[idx].k->k.p, pos)) - keys->d[idx].overwritten = true; + keys->data[idx].btree_id == btree && + keys->data[idx].level == level && + bpos_eq(keys->data[idx].k->k.p, pos)) + keys->data[idx].overwritten = true; } static void bch2_journal_iter_advance(struct journal_iter *iter) @@ -284,16 +285,16 @@ 
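A note for readers following the journal_keys changes in this file: the keys gathered from the journal are held in a gap buffer, a single array whose unused slack is slid to the insertion point before each insert, so in-order insertions during replay stay cheap; move_gap() now takes the keys structure directly rather than its individual fields. Below is a minimal, self-contained sketch of the gap-move step over a plain array, written for illustration here (the data/nr/size/gap fields mirror struct journal_keys, but this is not the kernel helper):

#include <stddef.h>
#include <string.h>

struct journal_key { unsigned btree_id, level; void *k; };	/* trimmed-down element */

struct gap_buf {
	struct journal_key *data;
	size_t nr;	/* live elements */
	size_t size;	/* allocated capacity; size - nr is the gap width */
	size_t gap;	/* logical index where the gap currently sits */
};

/* Slide the gap so it starts at @dst; live elements keep their logical order. */
static void gap_move(struct gap_buf *b, size_t dst)
{
	size_t gap_len = b->size - b->nr;

	if (dst < b->gap)
		/* elements with logical indices [dst, gap) move up, past the gap */
		memmove(b->data + dst + gap_len, b->data + dst,
			(b->gap - dst) * sizeof(b->data[0]));
	else if (dst > b->gap)
		/* elements with logical indices [gap, dst) move down, into the old gap */
		memmove(b->data + b->gap, b->data + b->gap + gap_len,
			(dst - b->gap) * sizeof(b->data[0]));

	b->gap = dst;
}
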
static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct journal_key *k = iter->keys->d + iter->idx; + struct journal_key *k = iter->keys->data + iter->idx; - while (k < iter->keys->d + iter->keys->size && + while (k < iter->keys->data + iter->keys->size && k->btree_id == iter->btree_id && k->level == iter->level) { if (!k->overwritten) return bkey_i_to_s_c(k->k); bch2_journal_iter_advance(iter); - k = iter->keys->d + iter->idx; + k = iter->keys->data + iter->idx; } return bkey_s_c_null; @@ -334,9 +335,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) iter->pos = bpos_successor(iter->pos); } +static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter) +{ + struct btree_and_journal_iter iter = *_iter; + struct bch_fs *c = iter.trans->c; + unsigned level = iter.journal.level; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_started, &c->flags) + ? (level > 1 ? 0 : 2) + : (level > 1 ? 1 : 16); + + iter.prefetch = false; + bch2_bkey_buf_init(&tmp); + + while (nr--) { + bch2_btree_and_journal_iter_advance(&iter); + struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1); + } + + bch2_bkey_buf_exit(&tmp, c); +} + struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) { struct bkey_s_c btree_k, journal_k, ret; + + if (iter->prefetch && iter->journal.level) + btree_and_journal_iter_prefetch(iter); again: if (iter->at_end) return bkey_s_c_null; @@ -376,17 +406,18 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) bch2_journal_iter_exit(&iter->journal); } -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b, struct btree_node_iter node_iter, struct bpos pos) { memset(iter, 0, sizeof(*iter)); + iter->trans = trans; iter->b = b; iter->node_iter = node_iter; - bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos); INIT_LIST_HEAD(&iter->journal.list); iter->pos = b->data->min_key; iter->at_end = false; @@ -396,15 +427,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter * this version is used by btree_gc before filesystem has gone RW and * multithreaded, so uses the journal_iters list: */ -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, - struct bch_fs *c, +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans, + struct btree_and_journal_iter *iter, struct btree *b) { struct btree_node_iter node_iter; bch2_btree_node_iter_init_from_start(&node_iter, b); - __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); - list_add(&iter->journal.list, &c->journal_iters); + __bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &trans->c->journal_iters); } /* sort and dedup all keys in the journal: */ @@ -415,9 +446,7 @@ void bch2_journal_entries_free(struct bch_fs *c) struct genradix_iter iter; genradix_for_each(&c->journal_entries, iter, i) - if (*i) - kvpfree(*i, offsetof(struct 
journal_replay, j) + - vstruct_bytes(&(*i)->j)); + kvfree(*i); genradix_free(&c->journal_entries); } @@ -437,22 +466,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r) void bch2_journal_keys_put(struct bch_fs *c) { struct journal_keys *keys = &c->journal_keys; - struct journal_key *i; BUG_ON(atomic_read(&keys->ref) <= 0); if (!atomic_dec_and_test(&keys->ref)) return; - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); - for (i = keys->d; i < keys->d + keys->nr; i++) + darray_for_each(*keys, i) if (i->allocated) kfree(i->k); - kvfree(keys->d); - keys->d = NULL; + kvfree(keys->data); + keys->data = NULL; keys->nr = keys->gap = keys->size = 0; bch2_journal_entries_free(c); @@ -460,83 +487,38 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - struct journal_key *src, *dst; + sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); - sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + struct journal_key *dst = keys->data; - src = dst = keys->d; - while (src < keys->d + keys->nr) { - while (src + 1 < keys->d + keys->nr && - !journal_key_cmp(src, src + 1)) - src++; + darray_for_each(*keys, src) { + if (src + 1 < &darray_top(*keys) && + !journal_key_cmp(src, src + 1)) + continue; - *dst++ = *src++; + *dst++ = *src; } - keys->nr = dst - keys->d; + keys->nr = dst - keys->data; } int bch2_journal_keys_sort(struct bch_fs *c) { struct genradix_iter iter; struct journal_replay *i, **_i; - struct jset_entry *entry; - struct bkey_i *k; struct journal_keys *keys = &c->journal_keys; - size_t nr_keys = 0, nr_read = 0; - - genradix_for_each(&c->journal_entries, iter, _i) { - i = *_i; - - if (!i || i->ignore) - continue; - - for_each_jset_key(k, entry, &i->j) - nr_keys++; - } - - if (!nr_keys) - return 0; - - keys->size = roundup_pow_of_two(nr_keys); - - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - if (!keys->d) { - bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", - nr_keys); - - do { - keys->size >>= 1; - keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); - } while (!keys->d && keys->size > nr_keys / 8); - - if (!keys->d) { - bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", - keys->size); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } + size_t nr_read = 0; genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; cond_resched(); for_each_jset_key(k, entry, &i->j) { - if (keys->nr == keys->size) { - __journal_keys_sort(keys); - - if (keys->nr > keys->size * 7 / 8) { - bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", - keys->nr, keys->size, nr_read, nr_keys); - return -BCH_ERR_ENOMEM_journal_keys_sort; - } - } - - keys->d[keys->nr++] = (struct journal_key) { + struct journal_key n = (struct journal_key) { .btree_id = entry->btree_id, .level = entry->level, .k = k, @@ -544,6 +526,18 @@ int bch2_journal_keys_sort(struct bch_fs *c) .journal_offset = k->_data - i->j._data, }; + if (darray_push(keys, n)) { + __journal_keys_sort(keys); + + if (keys->nr * 8 > keys->size * 7) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", + keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); + return -BCH_ERR_ENOMEM_journal_keys_sort; 
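The allocation and compaction logic above is what replaces the old two-pass scheme: journal keys now live in a darray, bcachefs's resizable array (a data pointer plus nr and size, with helpers such as darray_push(), darray_for_each() and darray_top()), so the buffer simply grows on demand and is sorted and deduplicated whenever a push fails. A rough userspace approximation of that pattern follows, including the keep-the-last-duplicate compaction used by __journal_keys_sort(); it is a sketch written for this page, not the kernel darray.h:

#include <stdlib.h>

struct darray_int {
	int   *data;
	size_t nr;	/* elements in use */
	size_t size;	/* allocated capacity */
};

/* Grow-on-demand push; returns 0 on success, -1 if the allocation fails. */
static int darray_push(struct darray_int *d, int v)
{
	if (d->nr == d->size) {
		size_t new_size = d->size ? d->size * 2 : 8;
		int *p = realloc(d->data, new_size * sizeof(*p));

		if (!p)
			return -1;
		d->data = p;
		d->size = new_size;
	}
	d->data[d->nr++] = v;
	return 0;
}

/* slot just past the last live element; take its address for an end pointer */
#define darray_top(d)		((d).data[(d).nr])
#define darray_for_each(d, i)	for (i = (d).data; i < (d).data + (d).nr; i++)

/*
 * In-place dedup of a sorted array that keeps the last element of each run of
 * equal values, the same shape as the journal-key compaction above.
 */
static void dedup_keep_last(struct darray_int *d)
{
	int *dst = d->data, *src;

	darray_for_each(*d, src) {
		if (src + 1 < &darray_top(*d) && src[0] == src[1])
			continue;
		*dst++ = *src;
	}

	d->nr = dst - d->data;
}
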
+ } + + BUG_ON(darray_push(keys, n)); + } + nr_read++; } } @@ -551,6 +545,6 @@ int bch2_journal_keys_sort(struct bch_fs *c) __journal_keys_sort(keys); keys->gap = keys->nr; - bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr); return 0; } diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h index 8ca4c100b2e3..c9d19da3ea04 100644 --- a/fs/bcachefs/btree_journal_iter.h +++ b/fs/bcachefs/btree_journal_iter.h @@ -15,6 +15,7 @@ struct journal_iter { */ struct btree_and_journal_iter { + struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; @@ -22,6 +23,7 @@ struct btree_and_journal_iter { struct journal_iter journal; struct bpos pos; bool at_end; + bool prefetch; }; struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, @@ -29,6 +31,9 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); +int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, + struct btree_and_journal_iter *); + int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, @@ -42,12 +47,11 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); -void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, struct btree *, +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); -void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, - struct bch_fs *, - struct btree *); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, + struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 74e52fd28abe..8a71d43444b9 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -380,9 +380,11 @@ static int btree_key_cache_fill(struct btree_trans *trans, struct bkey_i *new_k = NULL; int ret; - k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, - BTREE_ITER_KEY_CACHE_FILL| - BTREE_ITER_CACHED_NOFILL); + bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + iter.flags &= ~BTREE_ITER_WITH_JOURNAL; + k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 684397442338..b9b151e693ed 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -747,7 +747,8 @@ void bch2_trans_downgrade(struct btree_trans *trans) return; trans_for_each_path(trans, path, i) - bch2_btree_path_downgrade(trans, path); + if (path->ref) + bch2_btree_path_downgrade(trans, path); } int bch2_trans_relock(struct btree_trans *trans) diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4a5a64499eb7..9404d96c38f3 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -5,6 
+5,7 @@ #include <linux/list.h> #include <linux/rhashtable.h> +#include "bbpos_types.h" #include "btree_key_cache_types.h" #include "buckets_types.h" #include "darray.h" @@ -173,6 +174,11 @@ struct btree_cache { */ struct task_struct *alloc_lock; struct closure_waitlist alloc_wait; + + struct bbpos pinned_nodes_start; + struct bbpos pinned_nodes_end; + u64 pinned_nodes_leaf_mask; + u64 pinned_nodes_interior_mask; }; struct btree_node_iter { @@ -654,6 +660,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type); BIT_ULL(BKEY_TYPE_inodes)| \ BIT_ULL(BKEY_TYPE_stripes)| \ BIT_ULL(BKEY_TYPE_reflink)| \ + BIT_ULL(BKEY_TYPE_subvolumes)| \ BIT_ULL(BKEY_TYPE_btree)) #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \ @@ -727,7 +734,7 @@ struct btree_root { __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); u8 level; u8 alive; - s8 error; + s16 error; }; enum btree_gc_coalesce_fail_reason { diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index c3ff365acce9..a4b40c1656a5 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -452,7 +452,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, * the key cache - but the key has to exist in the btree for that to * work: */ - if (path->cached && bkey_deleted(&i->old_k)) + if (path->cached && !i->old_btree_u64s) return flush_new_cached_update(trans, i, flags, ip); return 0; @@ -788,6 +788,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct bpos pos, bool set) { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k)); + int ret = PTR_ERR_OR_ZERO(k); + if (ret) + return ret; + + bkey_init(&k->k); + k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + k->k.p = pos; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT); + + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, + struct bpos pos, bool set) +{ struct bkey_i k; bkey_init(&k.k); diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index b9382b7b288b..cc7c53e83f89 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id, struct bpos, struct bpos, unsigned, u64 *); int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool); +int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool); static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, enum btree_id btree, struct bpos pos) { - return bch2_btree_bit_mod(trans, btree, pos, false); + return bch2_btree_bit_mod_buffered(trans, btree, pos, false); } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 4530b14ff2c3..642213ef9f79 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -25,8 +25,7 @@ #include <linux/random.h> static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, - btree_path_idx_t, struct btree *, - struct keylist *, unsigned); + btree_path_idx_t, struct btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); static btree_path_idx_t get_unlocked_mut_path(struct 
btree_trans *trans, @@ -1208,10 +1207,6 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) mutex_unlock(&c->btree_cache.lock); mutex_lock(&c->btree_root_lock); - BUG_ON(btree_node_root(c, b) && - (b->c.level < btree_node_root(c, b)->c.level || - !btree_node_dying(btree_node_root(c, b)))); - bch2_btree_id_root(c, b->c.btree_id)->b = b; mutex_unlock(&c->btree_root_lock); @@ -1477,7 +1472,7 @@ static void btree_split_insert_keys(struct btree_update *as, static int btree_split(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree *parent = btree_node_parent(trans->paths + path, b); @@ -1578,7 +1573,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, if (parent) { /* Split a non root node */ - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err; } else if (n3) { @@ -1673,7 +1668,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as, * @path_idx: path that points to current node * @b: node to insert keys into * @keys: list of keys to insert - * @flags: transaction commit flags * * Returns: 0 on success, typically transaction restart error on failure * @@ -1683,7 +1677,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as, */ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, btree_path_idx_t path_idx, struct btree *b, - struct keylist *keys, unsigned flags) + struct keylist *keys) { struct bch_fs *c = as->c; struct btree_path *path = trans->paths + path_idx; @@ -1739,7 +1733,7 @@ split: return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); } - return btree_split(as, trans, path_idx, b, keys, flags); + return btree_split(as, trans, path_idx, b, keys); } int bch2_btree_split_leaf(struct btree_trans *trans, @@ -1747,7 +1741,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans, unsigned flags) { /* btree_split & merge may both cause paths array to be reallocated */ - struct btree *b = path_l(trans->paths + path)->b; struct btree_update *as; unsigned l; @@ -1759,7 +1752,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, if (IS_ERR(as)) return PTR_ERR(as); - ret = btree_split(as, trans, path, b, NULL, flags); + ret = btree_split(as, trans, path, b, NULL); if (ret) { bch2_btree_update_free(as, trans); return ret; @@ -1775,6 +1768,60 @@ int bch2_btree_split_leaf(struct btree_trans *trans, return ret; } +static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans, + btree_path_idx_t path_idx) +{ + struct bch_fs *c = as->c; + struct btree_path *path = trans->paths + path_idx; + struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b; + + BUG_ON(!btree_node_locked(path, b->c.level)); + + n = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + path->locks_want++; + BUG_ON(btree_node_locked(path, n->c.level)); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED); + bch2_btree_path_level_init(trans, path, n); + + n->sib_u64s[0] = U16_MAX; + n->sib_u64s[1] = U16_MAX; + + bch2_keylist_add(&as->parent_keys, &b->key); + btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys); + + bch2_btree_set_root(as, trans, path, n); + 
bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + bch2_trans_node_add(trans, path, n); + six_unlock_intent(&n->c.lock); + + mutex_lock(&c->btree_cache.lock); + list_add_tail(&b->list, &c->btree_cache.live); + mutex_unlock(&c->btree_cache.lock); + + bch2_trans_verify_locks(trans); +} + +int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b; + struct btree_update *as = + bch2_btree_update_start(trans, trans->paths + path, + b->c.level, true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + __btree_increase_depth(as, trans, path); + bch2_btree_update_done(as, trans); + return 0; +} + int __bch2_foreground_maybe_merge(struct btree_trans *trans, btree_path_idx_t path, unsigned level, @@ -1845,8 +1892,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, __func__, buf1.buf, buf2.buf); printbuf_exit(&buf1); printbuf_exit(&buf2); - bch2_topology_error(c); - ret = -EIO; + ret = bch2_topology_error(c); goto err; } @@ -1916,7 +1962,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, bch2_trans_verify_paths(trans); - ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys); if (ret) goto err_free_update; @@ -1987,8 +2033,7 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, if (parent) { bch2_keylist_add(&as->parent_keys, &n->key); - ret = bch2_btree_insert_node(as, trans, iter->path, - parent, &as->parent_keys, flags); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys); if (ret) goto err; } else { @@ -2485,7 +2530,7 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) int bch2_fs_btree_interior_update_init(struct bch_fs *c) { c->btree_interior_update_worker = - alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index c593c925d1e3..3439b03719c7 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -119,6 +119,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned); +int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned); + int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t, unsigned, unsigned, enum btree_node_sibling); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index ac7844861966..b77e7b382b66 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -574,8 +574,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf) { struct journal_keys_to_wb dst; - struct jset_entry *entry; - struct bkey_i *k; int ret = 0; bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq)); @@ -590,7 +588,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu entry->type = BCH_JSET_ENTRY_btree_keys; } + spin_lock(&c->journal.lock); buf->need_flush_to_write_buffer = false; + 
spin_unlock(&c->journal.lock); out: bch2_journal_keys_to_write_buffer_end(c, &dst); return ret; diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 54f7826ac498..c2f46b267b3a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1053,7 +1053,8 @@ int bch2_trigger_extent(struct btree_trans *trans, (int) bch2_bkey_needs_rebalance(c, old); if (mod) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + new.k->p, mod > 0); if (ret) return ret; } @@ -1335,7 +1336,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu) struct bucket_gens *buckets = container_of(rcu, struct bucket_gens, rcu); - kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); + kvfree(buckets); } int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) @@ -1345,16 +1346,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) bool resize = ca->bucket_gens != NULL; int ret; - if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, - GFP_KERNEL|__GFP_ZERO))) { + if (!(bucket_gens = kvmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { ret = -BCH_ERR_ENOMEM_bucket_gens; goto err; } if ((c->opts.buckets_nouse && - !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * - sizeof(unsigned long), - GFP_KERNEL|__GFP_ZERO)))) { + !(buckets_nouse = kvmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { ret = -BCH_ERR_ENOMEM_buckets_nouse; goto err; } @@ -1397,8 +1398,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ret = 0; err: - kvpfree(buckets_nouse, - BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + kvfree(buckets_nouse); if (bucket_gens) call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); @@ -1407,27 +1407,21 @@ err: void bch2_dev_buckets_free(struct bch_dev *ca) { - unsigned i; - - kvpfree(ca->buckets_nouse, - BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); - kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), - sizeof(struct bucket_gens) + ca->mi.nbuckets); + kvfree(ca->buckets_nouse); + kvfree(rcu_dereference_protected(ca->bucket_gens, 1)); - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) free_percpu(ca->usage[i]); kfree(ca->usage_base); } int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - unsigned i; - ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); if (!ca->usage_base) return -BCH_ERR_ENOMEM_usage_init; - for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) { ca->usage[i] = alloc_percpu(struct bch_dev_usage); if (!ca->usage[i]) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 226b39c17667..38defa19d52d 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -22,12 +22,6 @@ #include <linux/slab.h> #include <linux/uaccess.h> -__must_check -static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) -{ - return copy_to_user(to, from, n) ? 
-EFAULT : 0; -} - /* returns with ref on ca->ref */ static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, unsigned flags) @@ -155,19 +149,35 @@ static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr) kfree(thr); } -static int bch2_fsck_offline_thread_fn(void *arg) +static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts); - thr->thr.thr.ret = PTR_ERR_OR_ZERO(c); - if (!thr->thr.thr.ret) - bch2_fs_stop(c); + if (IS_ERR(c)) + return PTR_ERR(c); - thread_with_stdio_done(&thr->thr); - return 0; + int ret = 0; + if (test_bit(BCH_FS_errors_fixed, &c->flags)) + ret |= 1; + if (test_bit(BCH_FS_error, &c->flags)) + ret |= 4; + + bch2_fs_stop(c); + + if (ret & 1) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name); + if (ret & 4) + bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name); + + return ret; } +static const struct thread_with_stdio_ops bch2_offline_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_offline_thread_fn, +}; + static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; @@ -220,9 +230,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio); - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_offline_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops); err: if (ret < 0) { if (thr) @@ -763,9 +771,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, return ret; } -static int bch2_fsck_online_thread_fn(void *arg) +static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) { - struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr); + struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr); struct bch_fs *c = thr->c; c->stdio_filter = current; @@ -793,13 +801,16 @@ static int bch2_fsck_online_thread_fn(void *arg) c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - thread_with_stdio_done(&thr->thr); - up(&c->online_fsck_mutex); bch2_ro_ref_put(c); - return 0; + return ret; } +static const struct thread_with_stdio_ops bch2_online_fsck_ops = { + .exit = bch2_fsck_thread_exit, + .fn = bch2_fsck_online_thread_fn, +}; + static long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) { @@ -840,9 +851,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c, goto err; } - ret = bch2_run_thread_with_stdio(&thr->thr, - bch2_fsck_thread_exit, - bch2_fsck_online_thread_fn); + ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops); err: if (ret < 0) { bch_err_fn(c, ret); diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 3c761ad6b1c8..4701457f6381 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -558,7 +558,7 @@ got_key: return 0; } -#include "../crypto.h" +#include "crypto.h" #endif int bch2_request_key(struct bch_sb *sb, struct bch_key *key) diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 33df8cf86bd8..1410365a8891 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) return 0; if 
(!mempool_initialized(&c->compression_bounce[READ]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_read_init; if (!mempool_initialized(&c->compression_bounce[WRITE]) && - mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], - 1, c->opts.encoded_extent_max)) + mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) return -BCH_ERR_ENOMEM_compression_bounce_write_init; for (i = compression_types; @@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_initialized(&c->compress_workspace[i->type])) continue; - if (mempool_init_kvpmalloc_pool( + if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) return -BCH_ERR_ENOMEM_compression_workspace_init; } if (!mempool_initialized(&c->decompress_workspace) && - mempool_init_kvpmalloc_pool(&c->decompress_workspace, - 1, decompress_workspace_size)) + mempool_init_kvmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) return -BCH_ERR_ENOMEM_decompression_workspace_init; return 0; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 7bdba8507fc9..b1f147e6be4d 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b) mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { - c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } @@ -199,7 +199,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, return; } - n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL); + n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; @@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, out: if (bio) bio_put(bio); - kvpfree(n_ondisk, btree_buf_bytes(b)); + kvfree(n_ondisk); percpu_ref_put(&ca->io_ref); } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 4ae1e9f002a0..d37bd07afbfe 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -144,19 +144,21 @@ fsck_err: return ret; } -void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, - struct bkey_s_c k) +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> %llu type %s", - d_name.len, - d_name.name, - d.v->d_type != DT_SUBVOL - ? 
le64_to_cpu(d.v->d_inum) - : le32_to_cpu(d.v->d_child_subvol), - bch2_d_type_str(d.v->d_type)); + prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + + if (d.v->d_type != DT_SUBVOL) + prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + else + prt_printf(out, "%u -> %u", + le32_to_cpu(d.v->d_parent_subvol), + le32_to_cpu(d.v->d_child_subvol)); + + prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, @@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } int bch2_dirent_create_snapshot(struct btree_trans *trans, - u64 dir, u32 snapshot, + u32 dir_subvol, u64 dir, u32 snapshot, const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, bch_str_hash_flags_t str_hash_flags) { - subvol_inum zero_inum = { 0 }; + subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir }; struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.inode = dir; dirent->k.p.snapshot = snapshot; - ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info, - zero_inum, snapshot, - &dirent->k_i, str_hash_flags, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, + dir_inum, snapshot, + &dirent->k_i, str_hash_flags, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); *dir_offset = dirent->k.p.offset; return ret; @@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans, struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); - unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + unsigned src_update_flags = 0; + bool delete_src, delete_dst; int ret = 0; - if (src_dir.subvol != dst_dir.subvol) - return -EXDEV; - memset(src_inum, 0, sizeof(*src_inum)); memset(dst_inum, 0, sizeof(*dst_inum)); @@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - src_type = bkey_s_c_to_dirent(old_src).v->d_type; - - if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) - return -EOPNOTSUPP; - - /* Lookup dst: */ if (mode == BCH_RENAME) { /* @@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans, bkey_s_c_to_dirent(old_dst), dst_inum); if (ret) goto out; - - dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; - - if (dst_type == DT_SUBVOL) - return -EOPNOTSUPP; } if (mode != BCH_RENAME_EXCHANGE) @@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans, } } + if (new_dst->v.d_type == DT_SUBVOL) + new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol); + + if ((mode == BCH_RENAME_EXCHANGE) && + new_src->v.d_type == DT_SUBVOL) + new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; out_set_src: - /* - * If we're deleting a subvolume, we need to really delete the dirent, - * not just emit a whiteout in the current snapshot: + * If we're deleting a subvolume we need to really delete the dirent, + * not just emit a whiteout in the current snapshot - there can only be + * single dirent that points to a given subvolume. 
+ * + * IOW, we don't maintain multiple versions in different snapshots of + * dirents that point to subvolumes - dirents that point to subvolumes + * are only visible in one particular subvolume so it's not necessary, + * and it would be particularly confusing for fsck to have to deal with. */ - if (src_type == DT_SUBVOL) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter); + delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL && + new_src->k.p.snapshot != old_src.k->p.snapshot; + + delete_dst = old_dst.k && + bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL && + new_dst->k.p.snapshot != old_dst.k->p.snapshot; + + if (!delete_src || !bkey_deleted(&new_src->k)) { + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); if (ret) goto out; + } - new_src->k.p = src_iter.pos; - src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + if (delete_src) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; } - ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); - if (ret) - goto out; + if (delete_dst) { + bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto out; + } if (mode == BCH_RENAME_EXCHANGE) *src_offset = new_src->k.p.offset; @@ -456,41 +472,29 @@ out: return ret; } -int __bch2_dirent_lookup_trans(struct btree_trans *trans, - struct btree_iter *iter, - subvol_inum dir, - const struct bch_hash_info *hash_info, - const struct qstr *name, subvol_inum *inum, - unsigned flags) +int bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) { - struct bkey_s_c k; - struct bkey_s_c_dirent d; - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); - if (ret) - return ret; - - ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); + int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); if (ret) return ret; - k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) goto err; - d = bkey_s_c_to_dirent(k); - - ret = bch2_dirent_read_target(trans, dir, d, inum); + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum); if (ret > 0) ret = -ENOENT; err: if (ret) bch2_trans_iter_exit(trans, iter); - return ret; } @@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, struct btree_iter iter = { NULL }; int ret = lockrestart_do(trans, - __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); + bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); bch2_trans_iter_exit(trans, &iter); bch2_trans_put(trans); return ret; } -int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) +int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot) SPOS(dir, 0, snapshot), POS(dir, U64_MAX), 0, k, ret) if 
(k.k->type == KEY_TYPE_dirent) { - ret = -ENOTEMPTY; + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) + continue; + ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; break; } bch2_trans_iter_exit(trans, &iter); @@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) u32 snapshot; return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?: - bch2_empty_dir_snapshot(trans, dir.inum, snapshot); + bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot); } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 21ffeb78f02e..bee55cca2aa0 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len) int bch2_dirent_read_target(struct btree_trans *, subvol_inum, struct bkey_s_c_dirent, subvol_inum *); -int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32, +int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, bch_str_hash_flags_t); @@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, +int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *, unsigned); u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, const struct bch_hash_info *, const struct qstr *, subvol_inum *); -int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32); +int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d503af270024..b98e2c2b8bf0 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -504,7 +504,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) unsigned i; for (i = 0; i < s->v.nr_blocks; i++) { - kvpfree(buf->data[i], buf->size << 9); + kvfree(buf->data[i]); buf->data[i] = NULL; } } @@ -531,7 +531,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, memset(buf->valid, 0xFF, sizeof(buf->valid)); for (i = 0; i < v->nr_blocks; i++) { - buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL); if (!buf->data[i]) goto err; } diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index d260ff9bbfeb..43557bebd0f8 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -2,6 +2,7 @@ #include "bcachefs.h" #include "errcode.h" +#include "trace.h" #include <linux/errname.h> @@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class) return err == class; } -int __bch2_err_class(int err) +int __bch2_err_class(int bch_err) { - err = -err; - BUG_ON((unsigned) err >= BCH_ERR_MAX); + int std_err = -bch_err; + BUG_ON((unsigned) std_err >= BCH_ERR_MAX); - while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) - err = bch2_errcode_parents[err - BCH_ERR_START]; + while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START]) + std_err = bch2_errcode_parents[std_err - BCH_ERR_START]; + + trace_error_downcast(bch_err, std_err, _RET_IP_); - return -err; + return -std_err; } const char 
*bch2_blk_status_to_str(blk_status_t status) diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 8c40c2067a04..af25d8ec60f2 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,10 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, mount_option) \ + x(BCH_ERR_mount_option, option_name) \ + x(BCH_ERR_mount_option, option_value) \ + x(BCH_ERR_mount_option, option_not_bool) \ x(ENOMEM, ENOMEM_stripe_buf) \ x(ENOMEM, ENOMEM_replicas_table) \ x(ENOMEM, ENOMEM_cpu_replicas) \ @@ -78,6 +82,7 @@ x(ENOMEM, ENOMEM_fs_name_alloc) \ x(ENOMEM, ENOMEM_fs_other_alloc) \ x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOMEM, ENOMEM_disk_accounting) \ x(ENOSPC, ENOSPC_disk_reservation) \ x(ENOSPC, ENOSPC_bucket_alloc) \ x(ENOSPC, ENOSPC_disk_label_add) \ @@ -109,6 +114,8 @@ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ + x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ + x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(0, open_buckets_empty) \ x(0, freelist_empty) \ x(BCH_ERR_freelist_empty, no_buckets_found) \ @@ -176,6 +183,9 @@ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ x(EINVAL, opt_parse_error) \ + x(EINVAL, remove_with_metadata_missing_unimplemented)\ + x(EINVAL, remove_would_lose_data) \ + x(EINVAL, btree_iter_with_journal_not_supported) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -225,7 +235,10 @@ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ x(EIO, btree_node_read_err) \ x(EIO, sb_not_downgraded) \ - x(EIO, btree_write_all_failed) \ + x(EIO, btree_node_write_all_failed) \ + x(EIO, btree_node_read_error) \ + x(EIO, btree_node_read_validate_error) \ + x(EIO, btree_need_topology_repair) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ @@ -238,7 +251,8 @@ x(BCH_ERR_nopromote, nopromote_congested) \ x(BCH_ERR_nopromote, nopromote_in_flight) \ x(BCH_ERR_nopromote, nopromote_no_writes) \ - x(BCH_ERR_nopromote, nopromote_enomem) + x(BCH_ERR_nopromote, nopromote_enomem) \ + x(0, need_inode_lock) enum bch_errcode { BCH_ERR_START = 2048, diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index d32c8bebe46c..043431206799 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "error.h" +#include "recovery.h" #include "super.h" #include "thread_with_file.h" @@ -25,11 +26,16 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -void bch2_topology_error(struct bch_fs *c) +int bch2_topology_error(struct bch_fs *c) { set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_fsck_running, &c->flags)) + if (!test_bit(BCH_FS_fsck_running, &c->flags)) { bch2_inconsistent_error(c); + return -BCH_ERR_btree_need_topology_repair; + } else { + return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + -BCH_ERR_btree_node_read_validate_error; + } } void bch2_fatal_error(struct bch_fs *c) diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index fec17d1353d1..94491190e09e 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -30,7 +30,7 @@ struct work_struct; bool bch2_inconsistent_error(struct bch_fs *); -void bch2_topology_error(struct bch_fs *); +int bch2_topology_error(struct bch_fs *); #define bch2_fs_inconsistent(c, ...) 
\ ({ \ diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 6bf839d69e84..6219f2c08e4c 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -43,6 +43,11 @@ enum bkey_invalid_flags; #define extent_entry_next(_entry) \ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) +#define extent_entry_next_safe(_entry, _end) \ + (likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX) \ + ? extent_entry_next(_entry) \ + : _end) + static inline unsigned __extent_entry_type(const union bch_extent_entry *e) { @@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) #define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ for ((_entry) = (_start); \ (_entry) < (_end); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define __bkey_ptr_next(_ptr, _end) \ ({ \ @@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) (_ptr).has_ec = false; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ - switch (extent_entry_type(_entry)) { \ + switch (__extent_entry_type(_entry)) { \ case BCH_EXTENT_ENTRY_ptr: \ (_ptr).ptr = _entry->ptr; \ goto out; \ @@ -344,7 +349,7 @@ out: \ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ (_entry) = _start; \ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ - (_entry) = extent_entry_next(_entry)) + (_entry) = extent_entry_next_safe(_entry, _end)) #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h index 66b945be10c2..d8153fe27037 100644 --- a/fs/bcachefs/fifo.h +++ b/fs/bcachefs/fifo.h @@ -24,12 +24,12 @@ struct { \ (fifo)->mask = (fifo)->size \ ? roundup_pow_of_two((fifo)->size) - 1 \ : 0; \ - (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ + (fifo)->data = kvmalloc(fifo_buf_size(fifo), (_gfp)); \ }) #define free_fifo(fifo) \ do { \ - kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + kvfree((fifo)->data); \ (fifo)->data = NULL; \ } while (0) diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 1c1ea0f0c692..624e6f963240 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans, u32 new_subvol, dir_snapshot; ret = bch2_subvolume_create(trans, new_inode->bi_inum, + dir.subvol, snapshot_src.subvol, &new_subvol, &snapshot, (flags & BCH_CREATE_SNAPSHOT_RO) != 0); @@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - bool deleting_snapshot) + bool deleting_subvol) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, - name, &inum, BTREE_ITER_INTENT); + ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot && !inode_u->bi_subvol) { + if (deleting_subvol && !inode_u->bi_subvol) { ret = -BCH_ERR_ENOENT_not_subvol; goto err; } - if (deleting_snapshot || 
inode_u->bi_subvol) { + if (inode_u->bi_subvol) { + /* Recursive subvolume destroy not allowed (yet?) */ + ret = bch2_subvol_has_children(trans, inode_u->bi_subvol); + if (ret) + goto err; + } + + if (deleting_subvol || inode_u->bi_subvol) { ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; @@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, return ret; } +static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent) +{ + struct btree_iter iter; + struct bkey_i_subvolume *s = + bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED, subvolume); + int ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.fs_path_parent = cpu_to_le32(new_parent); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + int bch2_rename_trans(struct btree_trans *trans, subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, @@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } + if (src_inode_u->bi_subvol && + dst_dir.subvol != src_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + dst_inode_u->bi_subvol && + src_dir.subvol != dst_inode_u->bi_parent_subvol) { + ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol); + if (ret) + goto err; + } + + /* Can't move across subvolumes, unless it's a subvolume root: */ + if (src_dir.subvol != dst_dir.subvol && + (!src_inode_u->bi_subvol || + (dst_inum.inum && !dst_inode_u->bi_subvol))) { + ret = -EXDEV; + goto err; + } + + if (src_inode_u->bi_parent_subvol) + src_inode_u->bi_parent_subvol = dst_dir.subvol; + + if ((mode == BCH_RENAME_EXCHANGE) && + dst_inode_u->bi_parent_subvol) + dst_inode_u->bi_parent_subvol = src_dir.subvol; + src_inode_u->bi_dir = dst_dir_u->bi_inum; src_inode_u->bi_dir_offset = dst_offset; @@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inum)) { - ret = -ENOTEMPTY; - goto err; + if (S_ISDIR(dst_inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, dst_inum); + if (ret) + goto err; } } diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index 27710cdd5710..39292e7ef342 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi) static int __bch2_buffered_write(struct bch_inode_info *inode, struct address_space *mapping, struct iov_iter *iter, - loff_t pos, unsigned len) + loff_t pos, unsigned len, + bool inode_locked) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; @@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, BUG_ON(!fs.nr); + /* + * If we're not using the inode lock, we need to lock all the folios for + * atomiticity of writes vs. 
other writes: + */ + if (!inode_locked && folio_end_pos(darray_last(fs)) < end) { + ret = -BCH_ERR_need_inode_lock; + goto out; + } + f = darray_first(fs); if (pos != folio_pos(f) && !folio_test_uptodate(f)) { ret = bch2_read_single_folio(f, mapping); @@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode, end = pos + copied; spin_lock(&inode->v.i_lock); - if (end > inode->v.i_size) + if (end > inode->v.i_size) { + BUG_ON(!inode_locked); i_size_write(&inode->v, end); + } spin_unlock(&inode->v.i_lock); f_pos = pos; @@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct bch_inode_info *inode = file_bch_inode(file); - loff_t pos = iocb->ki_pos; - ssize_t written = 0; - int ret = 0; + loff_t pos; + bool inode_locked = false; + ssize_t written = 0, written2 = 0, ret = 0; + + /* + * We don't take the inode lock unless i_size will be changing. Folio + * locks provide exclusion with other writes, and the pagecache add lock + * provides exclusion with truncate and hole punching. + * + * There is one nasty corner case where atomicity would be broken + * without great care: when copying data from userspace to the page + * cache, we do that with faults disable - a page fault would recurse + * back into the filesystem, taking filesystem locks again, and + * deadlock; so it's done with faults disabled, and we fault in the user + * buffer when we aren't holding locks. + * + * If we do part of the write, but we then race and in the userspace + * buffer have been evicted and are no longer resident, then we have to + * drop our folio locks to re-fault them in, breaking write atomicity. + * + * To fix this, we restart the write from the start, if we weren't + * holding the inode lock. + * + * There is another wrinkle after that; if we restart the write from the + * start, and then get an unrecoverable error, we _cannot_ claim to + * userspace that we did not write data we actually did - so we must + * track (written2) the most we ever wrote. + */ + + if ((iocb->ki_flags & IOCB_APPEND) || + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) { + inode_lock(&inode->v); + inode_locked = true; + } + + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs_flags(file, !inode_locked ? 
IOCB_NOWAIT : 0); + if (ret) { + if (!inode_locked) { + inode_lock(&inode->v); + inode_locked = true; + ret = file_remove_privs_flags(file, 0); + } + if (ret) + goto unlock; + } + + ret = file_update_time(file); + if (ret) + goto unlock; + + pos = iocb->ki_pos; bch2_pagecache_add_get(inode); + if (!inode_locked && + (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) + goto get_inode_lock; + do { unsigned offset = pos & (PAGE_SIZE - 1); unsigned bytes = iov_iter_count(iter); @@ -1004,12 +1072,17 @@ again: } } + if (unlikely(bytes != iov_iter_count(iter) && !inode_locked)) + goto get_inode_lock; + if (unlikely(fatal_signal_pending(current))) { ret = -EINTR; break; } - ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked); + if (ret == -BCH_ERR_need_inode_lock) + goto get_inode_lock; if (unlikely(ret < 0)) break; @@ -1030,50 +1103,46 @@ again: } pos += ret; written += ret; + written2 = max(written, written2); + + if (ret != bytes && !inode_locked) + goto get_inode_lock; ret = 0; balance_dirty_pages_ratelimited(mapping); - } while (iov_iter_count(iter)); + if (0) { +get_inode_lock: + bch2_pagecache_add_put(inode); + inode_lock(&inode->v); + inode_locked = true; + bch2_pagecache_add_get(inode); + + iov_iter_revert(iter, written); + pos -= written; + written = 0; + ret = 0; + } + } while (iov_iter_count(iter)); bch2_pagecache_add_put(inode); +unlock: + if (inode_locked) + inode_unlock(&inode->v); + + iocb->ki_pos += written; - return written ? written : ret; + ret = max(written, written2) ?: ret; + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; } -ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter) { - struct file *file = iocb->ki_filp; - struct bch_inode_info *inode = file_bch_inode(file); - ssize_t ret; - - if (iocb->ki_flags & IOCB_DIRECT) { - ret = bch2_direct_write(iocb, from); - goto out; - } - - inode_lock(&inode->v); - - ret = generic_write_checks(iocb, from); - if (ret <= 0) - goto unlock; - - ret = file_remove_privs(file); - if (ret) - goto unlock; - - ret = file_update_time(file); - if (ret) - goto unlock; - - ret = bch2_buffered_write(iocb, from); - if (likely(ret > 0)) - iocb->ki_pos += ret; -unlock: - inode_unlock(&inode->v); + ssize_t ret = iocb->ki_flags & IOCB_DIRECT + ? 
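
The rewritten bch2_buffered_write() above relies on the "if (0) { get_inode_lock: ... }" construct: the block is unreachable in normal flow, but a goto from inside the loop jumps into it to take the inode lock, revert the iov_iter, and rewind pos/written before the loop condition re-runs. The following is a small standalone sketch of just that control-flow idiom; all names and the toy fast-path condition are invented.

#include <stdio.h>
#include <stdbool.h>

/* Minimal sketch of the "if (0) { label: ... }" restart idiom used in
 * bch2_buffered_write(): the block is skipped in normal flow, but a goto from
 * inside the loop jumps into it, performs the lock upgrade and rewind, then
 * falls through to the loop condition to retry. */

static bool fast_path_ok(bool have_lock)
{
	/* pretend the lockless fast path only succeeds once the lock is held */
	return have_lock;
}

int main(void)
{
	bool locked = false;
	int progress = 0;

	do {
		if (!fast_path_ok(locked))
			goto get_lock;

		progress++;
		printf("made progress %d (locked=%d)\n", progress, locked);

		if (0) {
get_lock:
			/* like the real code: take the lock, rewind, retry */
			printf("restarting with the lock held\n");
			locked = true;
			progress = 0;
		}
	} while (progress < 3);

	return 0;
}
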
bch2_direct_write(iocb, iter) + : bch2_buffered_write(iocb, iter); - if (ret > 0) - ret = generic_write_sync(iocb, ret); -out: return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h index 8cbaba6565b4..828c3d7c8f19 100644 --- a/fs/bcachefs/fs-io-pagecache.h +++ b/fs/bcachefs/fs-io-pagecache.h @@ -51,13 +51,10 @@ enum bch_folio_sector_state { struct bch_folio_sector { /* Uncompressed, fully allocated replicas (or on disk reservation): */ - unsigned nr_replicas:4; - + u8 nr_replicas:4, /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ - unsigned replicas_reserved:4; - - /* i_sectors: */ - enum bch_folio_sector_state state:8; + replicas_reserved:4; + u8 state; }; struct bch_folio { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 77ae65542db9..3f073845bbd7 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -176,45 +176,88 @@ static unsigned bch2_inode_hash(subvol_inum inum) return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode) { - struct bch_inode_unpacked inode_u; - struct bch_inode_info *inode; - struct btree_trans *trans; - struct bch_subvolume subvol; - int ret; + subvol_inum inum = inode_inum(inode); + struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); - inode = to_bch_ei(iget5_locked(c->vfs_sb, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - if (unlikely(!inode)) - return ERR_PTR(-ENOMEM); - if (!(inode->v.i_state & I_NEW)) - return &inode->v; + if (unlikely(old != inode)) { + discard_new_inode(&inode->v); + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... + */ + unlock_new_inode(&inode->v); + } - trans = bch2_trans_get(c); - ret = lockrestart_do(trans, - bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: - bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); + return inode; +} - if (!ret) - bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); - bch2_trans_put(trans); +#define memalloc_flags_do(_flags, _do) \ +({ \ + unsigned _saved_flags = memalloc_flags_save(_flags); \ + typeof(_do) _ret = _do; \ + memalloc_noreclaim_restore(_saved_flags); \ + _ret; \ +}) - if (ret) { - iget_failed(&inode->v); - return ERR_PTR(bch2_err_class(ret)); +/* + * Allocate a new inode, dropping/retaking btree locks if necessary: + */ +static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_info *inode = + memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN, + to_bch_ei(new_inode(c->vfs_sb))); + + if (unlikely(!inode)) { + int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 
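
memalloc_flags_do() above is a GNU C statement-expression macro: it saves the task's allocation flags, evaluates an arbitrary expression with the extra flags set, restores the old flags, and yields the expression's value. Here is a userspace sketch of the same shape, where save_flags()/restore_flags() are stand-ins for the real memalloc helpers.

#include <stdio.h>

/* Userspace sketch of a memalloc_flags_do()-style wrapper: a statement
 * expression saves some per-task flags, evaluates the expression, restores
 * the flags, and yields the expression's value. */

static unsigned current_flags;

static unsigned save_flags(unsigned set)
{
	unsigned old = current_flags;

	current_flags |= set;
	return old;
}

static void restore_flags(unsigned old)
{
	current_flags = old;
}

#define with_flags_do(_flags, _do)		\
({						\
	unsigned _saved = save_flags(_flags);	\
	typeof(_do) _ret = (_do);		\
	restore_flags(_saved);			\
	_ret;					\
})

static int try_alloc(void)
{
	printf("allocating with flags 0x%x\n", current_flags);
	return 42;
}

int main(void)
{
	int v = with_flags_do(0x3, try_alloc());

	printf("result %d, flags back to 0x%x\n", v, current_flags);
	return 0;
}
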
0 : -ENOMEM); + if (ret && inode) + discard_new_inode(&inode->v); + if (ret) + return ERR_PTR(ret); } - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); + return inode; +} - unlock_new_inode(&inode->v); +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + return &inode->v; - return &inode->v; + struct btree_trans *trans = bch2_trans_get(c); + + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + int ret = lockrestart_do(trans, + bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (!ret) { + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); + } + bch2_trans_put(trans); + + return ret ? ERR_PTR(ret) : &inode->v; } struct bch_inode_info * @@ -226,7 +269,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans *trans; struct bch_inode_unpacked dir_u; - struct bch_inode_info *inode, *old; + struct bch_inode_info *inode; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; @@ -293,7 +336,6 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_iget5_set(&inode->v, &inum); bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -304,36 +346,7 @@ err_before_quota: * bch2_trans_exit() and dropping locks, else we could race with another * thread pulling the inode in and modifying it: */ - - inode->v.i_state |= I_CREATING; - - old = to_bch_ei(inode_insert5(&inode->v, - bch2_inode_hash(inum), - bch2_iget5_test, - bch2_iget5_set, - &inum)); - BUG_ON(!old); - - if (unlikely(old != inode)) { - /* - * We raced, another process pulled the new inode into cache - * before us: - */ - make_bad_inode(&inode->v); - iput(&inode->v); - - inode = old; - } else { - mutex_lock(&c->vfs_inodes_lock); - list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); - mutex_unlock(&c->vfs_inodes_lock); - /* - * we really don't want insert_inode_locked2() to be setting - * I_NEW... 
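
bch2_inode_insert() above follows a publish-or-adopt pattern: the freshly built inode is offered to the inode hash, and if another thread raced and inserted one first, ours is discarded and the existing inode is used instead. A toy single-slot model of that pattern follows (no locking, names invented; the real code uses the VFS inode hash via inode_insert5()).

#include <stdio.h>
#include <stdlib.h>

struct obj {
	unsigned long key;
};

static struct obj *index_slot;	/* stand-in for the shared inode hash */

static struct obj *insert_or_reuse(struct obj *candidate)
{
	if (!index_slot) {
		index_slot = candidate;	/* we won the race: publish ours */
		return candidate;
	}

	free(candidate);		/* someone beat us: discard ours... */
	return index_slot;		/* ...and adopt the existing object */
}

int main(void)
{
	struct obj *a = malloc(sizeof(*a));
	struct obj *b = malloc(sizeof(*b));

	if (!a || !b)
		return 1;

	a->key = b->key = 1;

	struct obj *first  = insert_or_reuse(a);
	struct obj *second = insert_or_reuse(b);

	printf("same object reused: %s\n", first == second ? "yes" : "no");

	free(index_slot);
	return 0;
}
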
- */ - unlock_new_inode(&inode->v); - } - + inode = bch2_inode_insert(c, inode); bch2_trans_put(trans); err: posix_acl_release(default_acl); @@ -352,23 +365,78 @@ err_trans: /* methods */ +static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_hash_info *dir_hash_info, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dirent_iter = {}; + subvol_inum inum = {}; + + int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, + dir_hash_info, dir, name, 0); + if (ret) + return ERR_PTR(ret); + + struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + if (ret > 0) + ret = -ENOENT; + if (ret) + goto err; + + struct bch_inode_info *inode = + to_bch_ei(ilookup5_nowait(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + &inum)); + if (inode) + goto out; + + struct bch_subvolume subvol; + struct bch_inode_unpacked inode_u; + ret = bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans)); + if (bch2_err_matches(ret, ENOENT)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch_err(c, "%s points to missing inode", buf.buf); + printbuf_exit(&buf); + } + if (ret) + goto err; + + bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol); + inode = bch2_inode_insert(c, inode); +out: + bch2_trans_iter_exit(trans, &dirent_iter); + return inode; +err: + inode = ERR_PTR(ret); + goto out; +} + static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, unsigned int flags) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); - struct inode *vinode = NULL; - subvol_inum inum = { .subvol = 1 }; - int ret; - ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, - &dentry->d_name, &inum); - - if (!ret) - vinode = bch2_vfs_inode_get(c, inum); + struct bch_inode_info *inode; + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir), + &hash, &dentry->d_name))); + if (IS_ERR(inode)) + inode = NULL; - return d_splice_alias(vinode, dentry); + return d_splice_alias(&inode->v, dentry); } static int bch2_mknod(struct mnt_idmap *idmap, @@ -1372,6 +1440,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *bi, struct bch_subvolume *subvol) { + bch2_iget5_set(&inode->v, &inum); bch2_inode_update_after_write(trans, inode, bi, ~0); if (BCH_SUBVOLUME_SNAP(subvol)) @@ -1572,7 +1641,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) * number: */ u64 avail_inodes = ((usage.capacity - usage.used) << 3); - u64 fsid; buf->f_type = BCACHEFS_STATFS_MAGIC; buf->f_bsize = sb->s_blocksize; @@ -1583,10 +1651,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = usage.nr_inodes + avail_inodes; buf->f_ffree = avail_inodes; - fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ - le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); - buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; - buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_fsid = uuid_to_fsid(c->sb.user_uuid.b); buf->f_namelen = BCH_NAME_MAX; return 0; @@ -1805,8 +1870,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type, 
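
The statfs change above replaces an open-coded f_fsid calculation with the common uuid_to_fsid() helper; the removed code folded the 128-bit filesystem UUID into 64 bits by XORing its two halves and then split the result into the two 32-bit f_fsid words. Below is a userspace sketch of that folding; note the kernel code reads the halves as little-endian via le64_to_cpup(), which the sketch omits.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct fsid {
	uint32_t val[2];
};

/* Fold a 16-byte UUID into a 64-bit value and split it into two 32-bit
 * words, mirroring the arithmetic removed from bch2_statfs(). */
static struct fsid uuid_fold_to_fsid(const unsigned char uuid[16])
{
	uint64_t lo, hi;
	struct fsid f;

	memcpy(&lo, uuid, sizeof(lo));
	memcpy(&hi, uuid + 8, sizeof(hi));

	uint64_t folded = lo ^ hi;

	f.val[0] = (uint32_t)(folded & 0xFFFFFFFFu);
	f.val[1] = (uint32_t)(folded >> 32);
	return f;
}

int main(void)
{
	unsigned char uuid[16] = {
		0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc, 0xde, 0xf0,
		0x0f, 0xed, 0xcb, 0xa9, 0x87, 0x65, 0x43, 0x21,
	};
	struct fsid f = uuid_fold_to_fsid(uuid);

	printf("f_fsid = { 0x%08x, 0x%08x }\n", f.val[0], f.val[1]);
	return 0;
}
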
opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ret = bch2_parse_mount_opts(NULL, &opts, data); - if (ret) + if (ret) { + ret = bch2_err_class(ret); return ERR_PTR(ret); + } if (!dev_name || strlen(dev_name) == 0) return ERR_PTR(-EINVAL); @@ -1882,6 +1949,7 @@ got_sb: sb->s_time_gran = c->sb.nsec_per_time_unit; sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + sb->s_uuid = c->sb.user_uuid; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 6a760777bafb..f48033be3f6b 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -100,8 +100,8 @@ err: } static int lookup_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode, - u32 *snapshot) + struct bch_inode_unpacked *inode, + u32 *snapshot) { struct btree_iter iter; struct bkey_s_c k; @@ -142,34 +142,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - struct bkey_inode_buf *inode_p = - bch2_trans_kmalloc(trans, sizeof(*inode_p)); - - if (IS_ERR(inode_p)) - return PTR_ERR(inode_p); - - bch2_inode_pack(inode_p, inode); - inode_p->inode.k.p.snapshot = snapshot; - - return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, - &inode_p->inode.k_i, - BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); -} - -static int fsck_write_inode(struct btree_trans *trans, - struct bch_inode_unpacked *inode, - u32 snapshot) -{ - int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - __write_inode(trans, inode, snapshot)); - bch_err_fn(trans->c, ret); - return ret; -} - static int __remove_dirent(struct btree_trans *trans, struct bpos pos) { struct bch_fs *c = trans->c; @@ -280,7 +252,7 @@ create_lostfound: goto err; ret = bch2_dirent_create_snapshot(trans, - root_inode.bi_inum, snapshot, &root_hash_info, + 0, root_inode.bi_inum, snapshot, &root_hash_info, mode_to_type(lostfound->bi_mode), &lostfound_str, lostfound->bi_inum, @@ -303,30 +275,47 @@ static int reattach_inode(struct btree_trans *trans, char name_buf[20]; struct qstr name; u64 dir_offset = 0; + u32 dirent_snapshot = inode_snapshot; int ret; - ret = lookup_lostfound(trans, inode_snapshot, &lostfound); + if (inode->bi_subvol) { + inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL; + + u64 root_inum; + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &dirent_snapshot, &root_inum); + if (ret) + return ret; + + snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol); + } else { + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + } + + ret = lookup_lostfound(trans, dirent_snapshot, &lostfound); if (ret) return ret; if (S_ISDIR(inode->bi_mode)) { lostfound.bi_nlink++; - ret = __write_inode(trans, &lostfound, U32_MAX); + ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX); if (ret) return ret; } dir_hash = bch2_hash_info_init(trans->c, &lostfound); - snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); name = (struct qstr) QSTR(name_buf); ret = bch2_dirent_create_snapshot(trans, - lostfound.bi_inum, inode_snapshot, + inode->bi_parent_subvol, lostfound.bi_inum, + dirent_snapshot, &dir_hash, inode_d_type(inode), - &name, inode->bi_inum, &dir_offset, + &name, + inode->bi_subvol ?: inode->bi_inum, + &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) return ret; @@ -334,7 +323,7 @@ static int reattach_inode(struct btree_trans *trans, inode->bi_dir = 
lostfound.bi_inum; inode->bi_dir_offset = dir_offset; - return __write_inode(trans, inode, inode_snapshot); + return __bch2_fsck_write_inode(trans, inode, inode_snapshot); } static int remove_backpointer(struct btree_trans *trans, @@ -353,6 +342,27 @@ static int remove_backpointer(struct btree_trans *trans, return ret; } +static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_fs *c = trans->c; + + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + ret = remove_backpointer(trans, &inode); + bch_err_msg(c, ret, "removing dirent"); + if (ret) + return ret; + + ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot)); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + return ret; +} + struct snapshots_seen_entry { u32 id; u32 equiv; @@ -592,13 +602,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, } static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, - u32 snapshot, bool is_whiteout) +lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) { - struct inode_walker_entry *i; - - snapshot = bch2_snapshot_equiv(c, snapshot); + bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + struct inode_walker_entry *i; __darray_for_each(w->inodes, i) if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot)) goto found; @@ -609,20 +618,24 @@ found: if (snapshot != i->snapshot && !is_whiteout) { struct inode_walker_entry new = *i; - size_t pos; - int ret; new.snapshot = snapshot; new.count = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", - w->last_pos.inode, snapshot, i->snapshot); + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, k); + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + "unexpected because we should always update the inode when we update a key in that inode\n" + "%s", + w->last_pos.inode, snapshot, i->snapshot, buf.buf); + printbuf_exit(&buf); while (i > w->inodes.data && i[-1].snapshot > snapshot) --i; - pos = i - w->inodes.data; - ret = darray_insert_item(&w->inodes, pos, new); + size_t pos = i - w->inodes.data; + int ret = darray_insert_item(&w->inodes, pos, new); if (ret) return ERR_PTR(ret); @@ -633,21 +646,21 @@ found: } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, - struct inode_walker *w, struct bpos pos, - bool is_whiteout) + struct inode_walker *w, + struct bkey_s_c k) { - if (w->last_pos.inode != pos.inode) { - int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (w->last_pos.inode != k.k->p.inode) { + int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode); if (ret) return ERR_PTR(ret); - } else if (bkey_cmp(w->last_pos, pos)) { + } else if (bkey_cmp(w->last_pos, k.k->p)) { darray_for_each(w->inodes, i) i->seen_this_pos = false; } - w->last_pos = pos; + w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout); + return lookup_inode_for_snapshot(trans->c, w, k); } static int __get_visible_inodes(struct btree_trans *trans, @@ -722,7 +735,7 @@ static int hash_redo_key(struct btree_trans *trans, delete->k.p = k_iter->pos; return bch2_btree_iter_traverse(k_iter) ?: bch2_trans_update(trans, k_iter, delete, 0) ?: - bch2_hash_set_snapshot(trans, desc, hash_info, + 
bch2_hash_set_in_snapshot(trans, desc, hash_info, (subvol_inum) { 0, k.k->p.inode }, k.k->p.snapshot, tmp, BCH_HASH_SET_MUST_CREATE, @@ -795,16 +808,93 @@ fsck_err: goto out; } +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + if (inode->bi_subvol) { + u64 inum; + int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum); + if (ret) + return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) }); + } + + return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot)); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p) { struct btree_iter iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0); - int ret = bkey_err(k); - if (ret) + int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set; + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k, + struct bch_inode_unpacked *inode, + u32 inode_snapshot, bool *write_inode) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + struct btree_iter dirent_iter = {}; + struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot); + int ret = bkey_err(d); + if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - bch2_trans_iter_exit(trans, &iter); - return k.k->type == KEY_TYPE_set; + if (fsck_err_on(ret, + c, inode_points_to_missing_dirent, + "inode points to missing dirent\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) || + fsck_err_on(!ret && !dirent_points_to_inode(d, inode), + c, inode_points_to_wrong_dirent, + "inode points to dirent that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, inode_k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + /* + * We just clear the backpointer fields for now. If we find a + * dirent that points to this inode in check_dirents(), we'll + * update it then; then when we get to check_path() if the + * backpointer is still 0 we'll reattach it. 
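
check_inode_dirent_inode() above validates the inode's backpointer: bi_dir/bi_dir_offset must name a dirent that in turn points back at this inode, and on any mismatch the backpointer is cleared so later fsck passes can repair or reattach it. Here is a simplified standalone model of that check, using flat arrays instead of btrees; all names are invented.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct dirent_rec {
	uint64_t pos;		/* "location" of the dirent */
	uint64_t target;	/* inode number it points to */
};

struct inode_rec {
	uint64_t inum;
	uint64_t bi_dir_offset;	/* backpointer: dirent location, 0 = none */
};

static const struct dirent_rec *find_dirent(const struct dirent_rec *d,
					    int nr, uint64_t pos)
{
	for (int i = 0; i < nr; i++)
		if (d[i].pos == pos)
			return &d[i];
	return NULL;
}

/* Returns true if the inode was modified and should be rewritten. */
static bool check_backpointer(struct inode_rec *ino,
			      const struct dirent_rec *dirents, int nr)
{
	if (!ino->bi_dir_offset)
		return false;

	const struct dirent_rec *d = find_dirent(dirents, nr, ino->bi_dir_offset);

	if (d && d->target == ino->inum)
		return false;		/* consistent, nothing to do */

	printf("inode %llu: backpointer %llu is %s - clearing it\n",
	       (unsigned long long) ino->inum,
	       (unsigned long long) ino->bi_dir_offset,
	       d ? "pointing elsewhere" : "missing");
	ino->bi_dir_offset = 0;
	return true;
}

int main(void)
{
	struct dirent_rec dirents[] = { { .pos = 100, .target = 7 } };
	struct inode_rec good = { .inum = 7, .bi_dir_offset = 100 };
	struct inode_rec bad  = { .inum = 8, .bi_dir_offset = 100 };

	printf("good: write needed = %d\n", check_backpointer(&good, dirents, 1));
	printf("bad:  write needed = %d\n", check_backpointer(&bad, dirents, 1));
	return 0;
}
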
+ */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + inode->bi_flags &= ~BCH_INODE_backptr_untrusted; + *write_inode = true; + } + + ret = 0; +fsck_err: + bch2_trans_iter_exit(trans, &dirent_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; } static int check_inode(struct btree_trans *trans, @@ -861,7 +951,8 @@ static int check_inode(struct btree_trans *trans, u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked; - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); + bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -876,7 +967,7 @@ static int check_inode(struct btree_trans *trans, if (ret < 0) return ret; - fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list, + fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list, "inode %llu:%u unlinked, but not on deleted list", u.bi_inum, k.k->p.snapshot); ret = 0; @@ -950,8 +1041,49 @@ static int check_inode(struct btree_trans *trans, do_update = true; } + if (u.bi_dir || u.bi_dir_offset) { + ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update); + if (ret) + goto err; + } + + if (fsck_err_on(u.bi_parent_subvol && + (u.bi_subvol == 0 || + u.bi_subvol == BCACHEFS_ROOT_SUBVOL), + c, inode_bi_parent_nonzero, + "inode %llu:%u has subvol %u but nonzero parent subvol %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) { + u.bi_parent_subvol = 0; + do_update = true; + } + + if (u.bi_subvol) { + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, + c, inode_bi_subvol_missing, + "inode %llu:%u bi_subvol points to missing subvolume %u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol) || + fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum || + !bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot), + k.k->p.snapshot), + c, inode_bi_subvol_wrong, + "inode %llu:%u points to subvol %u, but subvol points to %llu:%u", + u.bi_inum, k.k->p.snapshot, u.bi_subvol, + le64_to_cpu(s.inode), + le32_to_cpu(s.snapshot))) { + u.bi_subvol = 0; + u.bi_parent_subvol = 0; + do_update = true; + } + } + if (do_update) { - ret = __write_inode(trans, &u, iter->pos.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot); bch_err_msg(c, ret, "in fsck updating inode"); if (ret) return ret; @@ -982,28 +1114,6 @@ int bch2_check_inodes(struct bch_fs *c) return ret; } -static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos) -{ - return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); -} - -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - -static bool dirent_points_to_inode(struct bkey_s_c_dirent d, - struct bch_inode_unpacked *inode) -{ - return d.v->d_type == DT_SUBVOL - ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol - : le64_to_cpu(d.v->d_inum) == inode->bi_inum; -} - static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) { struct bch_fs *c = trans->c; @@ -1032,7 +1142,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) w->last_pos.inode, i->snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1312,7 +1422,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) goto err; @@ -1481,7 +1591,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) "directory %llu:%u with wrong i_nlink: got %u, should be %llu", w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; - ret = fsck_write_inode(trans, &i->inode, i->snapshot); + ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot); if (ret) break; } @@ -1491,86 +1601,106 @@ fsck_err: return ret ?: trans_was_restarted(trans, restart_count); } -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target, - u32 target_snapshot) +static int check_dirent_inode_dirent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) { struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; int ret = 0; + if (inode_points_to_dirent(target, d)) + return 0; + if (!target->bi_dir && !target->bi_dir_offset) { target->bi_dir = d.k->p.inode; target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; + return __bch2_fsck_write_inode(trans, target, target_snapshot); } - if (!inode_points_to_dirent(target, d)) { - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; + struct btree_iter bp_iter = { NULL }; + struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, + SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot)); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; - bool backpointer_exists = !ret; - ret = 0; + bool backpointer_exists = !ret; + ret = 0; + + if (fsck_err_on(!backpointer_exists, + c, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + goto out; + } - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + if (backpointer_exists) + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (fsck_err_on(backpointer_exists && + (S_ISDIR(target->bi_mode) || + target->bi_subvol), + c, 
inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target_snapshot, buf.buf)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } - if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists, - c, inode_dir_multiple_links, - "directory %llu:%u with multiple links\n%s", - target->bi_inum, target_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(backpointer_exists && !target->bi_nlink, + c, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - c, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + struct printbuf buf = PRINTBUF; + int ret = 0; - if (fsck_err_on(!backpointer_exists, - c, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - - ret = __write_inode(trans, target, target_snapshot); - if (ret) - goto err; - } - } + ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot); + if (ret) + goto err; if (fsck_err_on(d.v->d_type != inode_d_type(target), c, dirent_d_type_wrong, @@ -1586,6 +1716,12 @@ static int check_dirent_target(struct btree_trans *trans, bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } ret = bch2_trans_update(trans, iter, &n->k_i, 0); if (ret) @@ -1593,33 +1729,134 @@ static int check_dirent_target(struct btree_trans *trans, d = dirent_i_to_s_c(n); } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} - if (fsck_err_on(d.v->d_type == DT_SUBVOL && - target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol), - c, dirent_d_parent_subvol_wrong, - "dirent has wrong d_parent_subvol field: got %u, should be %u", - le32_to_cpu(d.v->d_parent_subvol), - target->bi_parent_subvol)) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); +/* find a 
subvolume that's a descendent of @snapshot: */ +static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) { + bch2_trans_iter_exit(trans, &iter); + *subvolid = k.k->p.offset; + goto found; + } + } + if (!ret) + ret = -ENOENT; +found: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c_dirent d) +{ + struct bch_fs *c = trans->c; + struct btree_iter subvol_iter = {}; + struct bch_inode_unpacked subvol_root; + u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol); + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 parent_snapshot; + u64 parent_inum; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol, + "dirent parent_subvol points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) || + fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot), + c, dirent_not_visible_in_parent_subvol, + "dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s", + parent_snapshot, + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + u32 new_parent_subvol; + ret = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol); if (ret) goto err; - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); + ret = PTR_ERR_OR_ZERO(new_dirent); + if (ret) + goto err; - ret = bch2_trans_update(trans, iter, &n->k_i, 0); + new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol); + } + + struct bkey_s_c_subvolume s = + bch2_bkey_get_iter_typed(trans, &subvol_iter, + BTREE_ID_subvolumes, POS(0, target_subvol), + 0, subvolume); + ret = bkey_err(s.s_c); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret) { + if (fsck_err(c, dirent_to_missing_subvol, + "dirent points to missing subvolume\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) + return __remove_dirent(trans, d.k->p); + ret = 0; + goto out; + } + + if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, + c, subvol_fs_path_parent_wrong, + "subvol with wrong fs_path_parent, should be be %u\n%s", + parent_subvol, + (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); if (ret) goto err; - d = dirent_i_to_s_c(n); + n->v.fs_path_parent = cpu_to_le32(parent_subvol); } + + u64 target_inum = le64_to_cpu(s.v->inode); + u32 target_snapshot = le32_to_cpu(s.v->snapshot); + + ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(parent_subvol != subvol_root.bi_parent_subvol, + c, inode_bi_parent_wrong, + "subvol root %llu has wrong bi_parent_subvol: got %u, should be %u", + target_inum, + subvol_root.bi_parent_subvol, 
parent_subvol)) { + subvol_root.bi_parent_subvol = parent_subvol; + ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot); + if (ret) + return ret; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + return ret; out: err: fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &subvol_iter); printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -1661,7 +1898,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(!btree_iter_path(trans, iter)->should_be_locked); - i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, dir, k); ret = PTR_ERR_OR_ZERO(i); if (ret < 0) goto err; @@ -1707,50 +1944,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL) { - struct bch_inode_unpacked subvol_root; - u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); - u32 target_snapshot; - u64 target_inum; - - ret = subvol_lookup(trans, target_subvol, - &target_snapshot, &target_inum); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, dirent_to_missing_subvol, - "dirent points to missing subvolume %u", - le32_to_cpu(d.v->d_child_subvol))) { - ret = __remove_dirent(trans, d.k->p); - goto err; - } - - ret = lookup_inode(trans, target_inum, - &subvol_root, &target_snapshot); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - if (fsck_err_on(ret, c, subvol_to_missing_root, - "subvolume %u points to missing subvolume root %llu", - target_subvol, - target_inum)) { - bch_err(c, "repair not implemented yet"); - ret = -EINVAL; - goto err; - } - - if (fsck_err_on(subvol_root.bi_subvol != target_subvol, - c, subvol_root_wrong_bi_subvol, - "subvol root %llu has wrong bi_subvol field: got %u, should be %u", - target_inum, - subvol_root.bi_subvol, target_subvol)) { - subvol_root.bi_subvol = target_subvol; - ret = __write_inode(trans, &subvol_root, target_snapshot); - if (ret) - goto err; - } - - ret = check_dirent_target(trans, iter, d, &subvol_root, - target_snapshot); + ret = check_dirent_to_subvol(trans, iter, d); if (ret) goto err; } else { @@ -1776,12 +1970,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; } - } - - if (d.v->d_type == DT_DIR) - for_each_visible_inode(c, s, dir, equiv.snapshot, i) - i->count++; + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + } out: err: fsck_err: @@ -1832,7 +2025,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout); + i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -1919,7 +2112,7 @@ static int check_root_trans(struct btree_trans *trans) 0, NULL); root_inode.bi_inum = inum; - ret = __write_inode(trans, &root_inode, snapshot); + ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot); bch_err_msg(c, ret, "writing root inode"); } err: @@ -1936,6 +2129,107 @@ int bch2_check_root(struct bch_fs *c) return ret; } +typedef DARRAY(u32) darray_u32; + +static bool darray_u32_has(darray_u32 *d, u32 v) +{ + darray_for_each(*d, i) + if (*i == v) + return true; + return false; +} + +/* + * We've checked that inode backpointers point to valid dirents; here, it's + * sufficient to check that the subvolume root has a dirent: + */ +static int subvol_has_dirent(struct 
btree_trans *trans, struct bkey_s_c_subvolume s) +{ + struct bch_inode_unpacked inode; + int ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) }, + &inode); + if (ret) + return ret; + + return inode.bi_dir != 0; +} + +static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter parent_iter = {}; + darray_u32 subvol_path = {}; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) { + ret = darray_push(&subvol_path, k.k->p.offset); + if (ret) + goto err; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + ret = subvol_has_dirent(trans, s); + if (ret < 0) + break; + + if (fsck_err_on(!ret, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + + u32 parent = le32_to_cpu(s.v->fs_path_parent); + + if (darray_u32_has(&subvol_path, parent)) { + if (fsck_err(c, subvol_loop, "subvolume loop")) + ret = reattach_subvol(trans, s); + break; + } + + bch2_trans_iter_exit(trans, &parent_iter); + bch2_trans_iter_init(trans, &parent_iter, + BTREE_ID_subvolumes, POS(0, parent), 0); + k = bch2_btree_iter_peek_slot(&parent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, + c, subvol_unreachable, + "unreachable subvolume %s", + (bch2_bkey_val_to_text(&buf, c, s.s_c), + buf.buf))) { + ret = reattach_subvol(trans, s); + break; + } + } +fsck_err: +err: + printbuf_exit(&buf); + darray_exit(&subvol_path); + bch2_trans_iter_exit(trans, &parent_iter); + return ret; +} + +int bch2_check_subvolume_structure(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_path(trans, &iter, k))); + bch_err_fn(c, ret); + return ret; +} + struct pathbuf_entry { u64 inum; u32 snapshot; @@ -1952,89 +2246,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, pathbuf *p, - u64 inum, u32 snapshot) -{ - int ret = darray_push(p, ((struct pathbuf_entry) { - .inum = inum, - .snapshot = snapshot, - })); - - if (ret) - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - p->size); - return ret; -} - /* - * Check that a given inode is reachable from the root: + * Check that a given inode is reachable from its subvolume root - we already + * verified subvolume connectivity: * * XXX: we should also be verifying that inodes are in the right subvolumes */ -static int check_path(struct btree_trans *trans, - pathbuf *p, - struct bch_inode_unpacked *inode, - u32 snapshot) +static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k) { struct bch_fs *c = trans->c; + struct btree_iter inode_iter = {}; + struct bch_inode_unpacked inode; + struct printbuf buf = PRINTBUF; + u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot); int ret = 0; - snapshot = bch2_snapshot_equiv(c, snapshot); p->nr = 0; - while (!(inode->bi_inum == BCACHEFS_ROOT_INO && - inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + BUG_ON(bch2_inode_unpack(inode_k, &inode)); + + while (!inode.bi_subvol) { struct btree_iter dirent_iter; struct bkey_s_c_dirent d; u32 parent_snapshot = snapshot; - if (inode->bi_subvol) { - u64 
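
check_subvol_path() above (and check_path() after it) detects loops by walking parent pointers towards the root while recording every visited ID in a growable array (a darray in bcachefs), flagging a structure loop as soon as a parent is already on the recorded path. The following is a self-contained sketch of the technique; the fixed parent table stands in for the subvolumes btree and is made up for the example.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define NR	6

static const uint32_t parent[NR] = {
	/* 0 is the root; 4 and 5 form a cycle 4 -> 5 -> 4 */
	[1] = 0, [2] = 1, [3] = 1, [4] = 5, [5] = 4,
};

static bool path_has(const uint32_t *path, int nr, uint32_t id)
{
	for (int i = 0; i < nr; i++)
		if (path[i] == id)
			return true;
	return false;
}

static bool is_reachable(uint32_t id)
{
	uint32_t path[NR];
	int nr = 0;

	while (id != 0) {
		path[nr++] = id;	/* remember what we've visited */

		uint32_t p = parent[id];

		if (path_has(path, nr, p)) {
			printf("loop detected at %u -> %u\n", id, p);
			return false;
		}
		id = p;
	}
	return true;
}

int main(void)
{
	printf("3 reachable: %d\n", is_reachable(3));
	printf("4 reachable: %d\n", is_reachable(4));
	return 0;
}
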
inum; - - ret = subvol_lookup(trans, inode->bi_parent_subvol, - &parent_snapshot, &inum); - if (ret) - break; - } - - d = dirent_get_by_pos(trans, &dirent_iter, - SPOS(inode->bi_dir, inode->bi_dir_offset, - parent_snapshot)); + d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot); ret = bkey_err(d.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) break; - if (!ret && !dirent_points_to_inode(d, inode)) { + if (!ret && !dirent_points_to_inode(d, &inode)) { bch2_trans_iter_exit(trans, &dirent_iter); ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; } if (bch2_err_matches(ret, ENOENT)) { - if (fsck_err(c, inode_unreachable, - "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", - inode->bi_inum, snapshot, - bch2_d_type_str(inode_d_type(inode)), - inode->bi_nlink, - inode->bi_dir, - inode->bi_dir_offset)) - ret = reattach_inode(trans, inode, snapshot); - break; + ret = 0; + if (fsck_err(c, inode_unreachable, + "unreachable inode\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, inode_k), + buf.buf))) + ret = reattach_inode(trans, &inode, snapshot); + goto out; } bch2_trans_iter_exit(trans, &dirent_iter); - if (!S_ISDIR(inode->bi_mode)) + if (!S_ISDIR(inode.bi_mode)) break; - ret = path_down(c, p, inode->bi_inum, snapshot); - if (ret) { - bch_err(c, "memory allocation failure"); + ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inode.bi_inum, + .snapshot = snapshot, + })); + if (ret) return ret; - } snapshot = parent_snapshot; - ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + bch2_trans_iter_exit(trans, &inode_iter); + inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, + SPOS(0, inode.bi_dir, snapshot), 0); + ret = bkey_err(inode_k) ?: + !bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode + : bch2_inode_unpack(inode_k, &inode); if (ret) { /* Should have been caught in dirents pass */ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2042,30 +2318,32 @@ static int check_path(struct btree_trans *trans, break; } - if (path_is_dup(p, inode->bi_inum, snapshot)) { + snapshot = inode_k.k->p.snapshot; + + if (path_is_dup(p, inode.bi_inum, snapshot)) { /* XXX print path */ bch_err(c, "directory structure loop"); darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode->bi_inum, snapshot); - - if (!fsck_err(c, dir_loop, "directory structure loop")) - return 0; + pr_err("%llu:%u", inode.bi_inum, snapshot); - ret = remove_backpointer(trans, inode); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + if (fsck_err(c, dir_loop, "directory structure loop")) { + ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); - if (ret) - break; + if (ret) + break; - ret = reattach_inode(trans, inode, snapshot); - if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) - bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum); + ret = reattach_inode(trans, &inode, snapshot); + bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum); + } break; } } +out: fsck_err: + bch2_trans_iter_exit(trans, &inode_iter); + printbuf_exit(&buf); bch_err_fn(c, ret); return ret; } @@ -2077,7 +2355,6 @@ fsck_err: */ int bch2_check_directory_structure(struct bch_fs *c) { - struct bch_inode_unpacked u; pathbuf path = { 0, }; int ret; @@ -2090,12 +2367,10 @@ int bch2_check_directory_structure(struct bch_fs *c) if (!bkey_is_inode(k.k)) continue; - BUG_ON(bch2_inode_unpack(k, &u)); - - if (u.bi_flags & BCH_INODE_unlinked) + if (bch2_inode_flags(k) & BCH_INODE_unlinked) 
continue; - check_path(trans, &path, &u, iter.pos.snapshot); + check_path(trans, &path, k); }))); darray_exit(&path); @@ -2291,7 +2566,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], bch2_inode_nlink_get(&u), link->count)) { bch2_inode_nlink_set(&u, link->count); - ret = __write_inode(trans, &u, k.k->p.snapshot); + ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot); } fsck_err: return ret; diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index da991e8cf27e..a4ef94271784 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *); int bch2_check_dirents(struct bch_fs *); int bch2_check_xattrs(struct bch_fs *); int bch2_check_root(struct bch_fs *); +int bch2_check_subvolume_structure(struct bch_fs *); int bch2_check_directory_structure(struct bch_fs *); int bch2_check_nlinks(struct bch_fs *); int bch2_fix_reflink_p(struct bch_fs *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 086f0090b03a..2b5e06770ab3 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k, return bch2_inode_unpack_slowpath(k, unpacked); } -static int bch2_inode_peek_nowarn(struct btree_trans *trans, +int bch2_inode_peek_nowarn(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, subvol_inum inum, unsigned flags) @@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans, return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags); } +int __bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +int bch2_fsck_write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + __bch2_fsck_write_inode(trans, inode, snapshot)); + bch_err_fn(trans->c, ret); + return ret; +} + struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) { struct bch_inode_unpacked u; @@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans, bool old_deleted = bkey_is_deleted_inode(old); bool new_deleted = bkey_is_deleted_inode(new.s_c); if (old_deleted != new_deleted) { - int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted); + int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, + new.k->p, new_deleted); if (ret) return ret; } @@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; if (S_ISDIR(inode.bi_mode)) { - ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot); - if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir, + ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); + if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + c, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; @@ -1141,7 +1171,7 @@ fsck_err: bch2_trans_iter_exit(trans, &inode_iter); return ret; delete: - ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false); + ret = 
bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false); goto out; } @@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c) bool need_another_pass; int ret; again: + /* + * if we ran check_inodes() unlinked inodes will have already been + * cleaned up but the write buffer will be out of sync; therefore we + * alway need a write buffer flush + */ + ret = bch2_btree_write_buffer_flush_sync(trans); + if (ret) + goto err; + need_another_pass = false; /* @@ -1183,12 +1222,8 @@ again: ret; })); - if (!ret && need_another_pass) { - ret = bch2_btree_write_buffer_flush_sync(trans); - if (ret) - goto err; + if (!ret && need_another_pass) goto again; - } err: bch2_trans_put(trans); return ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index b63f312581cf..056298050550 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); +int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, subvol_inum, unsigned); @@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans, return bch2_inode_write_flags(trans, iter, inode, 0); } +int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); +int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32); + void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); void bch2_inode_init_late(struct bch_inode_unpacked *, u64, @@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode) return inode->bi_subvol ? 
DT_SUBVOL : mode_to_type(inode->bi_mode); } +static inline u32 bch2_inode_flags(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags); + case KEY_TYPE_inode_v2: + return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags); + case KEY_TYPE_inode_v3: + return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags); + default: + return 0; + } +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 3c574d8873a1..8a556e6d1ab6 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); - op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL); + op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err; diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 2c098ac017b3..f137252bccc5 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) bch2_congested_acct(ca, io_latency, now, rw); - __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); + __bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now); } #endif @@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1067,7 +1068,8 @@ do_write: *_dst = dst; return more; csum_err: - bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)", + op->flags & BCH_WRITE_MOVE ? "move" : "user"); ret = -EIO; err: if (to_wbio(dst)->bounce) @@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) bch_err_inum_offset_ratelimited(c, insert->k.p.inode, insert->k.p.offset << 9, - "write error while doing btree update: %s", + "%s write error while doing btree update: %s", + op->flags & BCH_WRITE_MOVE ? "move" : "user", bch2_err_str(ret)); } @@ -1449,7 +1452,9 @@ err: bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "%s(): error: %s", __func__, bch2_err_str(ret)); + "%s(): %s error: %s", __func__, + op->flags & BCH_WRITE_MOVE ? "move" : "user", + bch2_err_str(ret)); op->error = ret; break; } @@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write) bch_err_inum_offset_ratelimited(c, op->pos.inode, op->pos.offset << 9, - "misaligned write"); + "%s write error: misaligned write", + op->flags & BCH_WRITE_MOVE ? 
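
The __promote_alloc() change above switches an open-coded "header + n * element" allocation size to struct_size(), which performs the same arithmetic with overflow checking for structs that end in a flexible array. Below is a userspace sketch that only mimics the saturating behaviour; sized_alloc() is an invented stand-in, not the kernel helper.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct promote {
	int flags;
	struct { void *page; unsigned len; } vecs[];	/* flexible array */
};

/* Compute hdr + elem * n, saturating at SIZE_MAX instead of wrapping, which
 * is the property struct_size() provides in the kernel. */
static size_t sized_alloc(size_t hdr, size_t elem, size_t n)
{
	if (n && elem > (SIZE_MAX - hdr) / n)
		return SIZE_MAX;
	return hdr + elem * n;
}

int main(void)
{
	size_t pages = 4;
	size_t bytes = sized_alloc(sizeof(struct promote),
				   sizeof(((struct promote *) 0)->vecs[0]),
				   pages);

	struct promote *op = calloc(1, bytes);
	if (!op)
		return 1;

	printf("allocated %zu bytes for %zu inline vecs\n", bytes, pages);
	free(op);
	return 0;
}
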
"move" : "user"); op->error = -EIO; goto err; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index bc890776eb57..f314b2e78ec3 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = { NULL }; +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq) { union journal_res_state s = READ_ONCE(j->reservations); unsigned i = seq & JOURNAL_BUF_MASK; struct journal_buf *buf = j->buf + i; - prt_printf(out, "seq:"); + prt_str(out, "seq:"); prt_tab(out); prt_printf(out, "%llu", seq); prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:"); + prt_str(out, "refcount:"); prt_tab(out); prt_printf(out, "%u", journal_state_count(s, i)); prt_newline(out); - prt_printf(out, "size:"); + prt_str(out, "size:"); prt_tab(out); prt_human_readable_u64(out, vstruct_bytes(buf->data)); prt_newline(out); - prt_printf(out, "expires"); + prt_str(out, "expires:"); prt_tab(out); prt_printf(out, "%li jiffies", buf->expires - jiffies); prt_newline(out); + prt_str(out, "flags:"); + prt_tab(out); + if (buf->noflush) + prt_str(out, "noflush "); + if (buf->must_flush) + prt_str(out, "must_flush "); + if (buf->separate_flush) + prt_str(out, "separate_flush "); + if (buf->need_flush_to_write_buffer) + prt_str(out, "need_flush_to_write_buffer "); + if (buf->write_started) + prt_str(out, "write_started "); + if (buf->write_allocated) + prt_str(out, "write allocated "); + if (buf->write_done) + prt_str(out, "write done"); + prt_newline(out); + printbuf_indent_sub(out, 2); } @@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq <= journal_cur_seq(j); seq++) bch2_journal_buf_to_text(out, j, seq); -} - -static inline bool journal_seq_unwritten(struct journal *j, u64 seq) -{ - return seq > j->seq_ondisk; -} - -static bool __journal_entry_is_open(union journal_res_state state) -{ - return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -} - -static inline unsigned nr_unwritten_journal_entries(struct journal *j) -{ - return atomic64_read(&j->seq) - j->seq_ondisk; -} - -static bool journal_entry_is_open(struct journal *j) -{ - return __journal_entry_is_open(j->reservations); + prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); } static inline struct journal_buf * @@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } +void bch2_journal_do_writes(struct journal *j) +{ + for (u64 seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unsigned idx = seq & JOURNAL_BUF_MASK; + struct journal_buf *w = j->buf + idx; + + if (w->write_started && !w->write_allocated) + break; + if (w->write_started) + continue; + + if (!journal_state_count(j->reservations, idx)) { + w->write_started = true; + closure_call(&w->io, bch2_journal_write, j->wq, NULL); + } + + break; + } +} + /* * Final processing when the last reference of a journal buffer has been * dropped. 
Drop the pin list reference acquired at journal entry open and write * the buffer, if requested. */ -void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write) +void bch2_journal_buf_put_final(struct journal *j, u64 seq) { - struct bch_fs *c = container_of(j, struct bch_fs, journal); - lockdep_assert_held(&j->lock); if (__bch2_journal_pin_put(j, seq)) bch2_journal_reclaim_fast(j); - if (write) - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + bch2_journal_do_writes(j); } /* @@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j) BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); bkey_extent_init(&buf->key); - buf->noflush = false; - buf->must_flush = false; - buf->separate_flush = false; - buf->flush_time = 0; + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; buf->need_flush_to_write_buffer = true; + buf->write_started = false; + buf->write_allocated = false; + buf->write_done = false; memset(buf->data, 0, sizeof(*buf->data)); buf->data->seq = cpu_to_le64(journal_cur_seq(j)); @@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j) } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - mod_delayed_work(c->io_complete_wq, - &j->write_work, - msecs_to_jiffies(c->opts.journal_flush_delay)); + if (nr_unwritten_journal_entries(j) == 1) + mod_delayed_work(j->wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); journal_wake(j); if (j->early_journal_entries.nr) @@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j) static void journal_write_work(struct work_struct *work) { struct journal *j = container_of(work, struct journal, write_work.work); - struct bch_fs *c = container_of(j, struct bch_fs, journal); - long delta; spin_lock(&j->lock); - if (!__journal_entry_is_open(j->reservations)) - goto unlock; - - delta = journal_cur_buf(j)->expires - jiffies; + if (__journal_entry_is_open(j->reservations)) { + long delta = journal_cur_buf(j)->expires - jiffies; - if (delta > 0) - mod_delayed_work(c->io_complete_wq, &j->write_work, delta); - else - __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); -unlock: + if (delta > 0) + mod_delayed_work(j->wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true); + } spin_unlock(&j->lock); } @@ -473,33 +511,32 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + ret = JOURNAL_ERR_journal_full; + can_discard = j->can_discard; + goto out; + } - spin_lock(&j->lock); + if (j->blocked) + return -BCH_ERR_journal_res_get_blocked; - /* check once more in case somebody else shut things down... 
*/ - if (bch2_journal_error(j)) { - spin_unlock(&j->lock); + if (bch2_journal_error(j)) return -BCH_ERR_erofs_journal_err; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { + ret = JOURNAL_ERR_max_in_flight; + goto out; } + spin_lock(&j->lock); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() * unnecessarily */ if (journal_res_get_fast(j, res, flags)) { - spin_unlock(&j->lock); - return 0; - } - - if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - /* - * Don't want to close current journal entry, just need to - * invoke reclaim: - */ - ret = JOURNAL_ERR_journal_full; + ret = 0; goto unlock; } @@ -515,30 +552,30 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j); - - if (ret == JOURNAL_ERR_max_in_flight) { - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, true); - if (trace_journal_entry_full_enabled()) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - bch2_journal_bufs_to_text(&buf, j); - trace_journal_entry_full(c, buf.buf); - printbuf_exit(&buf); - } - count_event(c, journal_entry_full); - } + ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); - - if (!ret) +out: + if (ret == JOURNAL_ERR_retry) goto retry; + if (!ret) + return 0; + if (journal_error_check_stuck(j, ret, flags)) ret = -BCH_ERR_journal_res_get_blocked; + if (ret == JOURNAL_ERR_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + /* * Journal is full - can't rely on reclaim from work item due to * freezing: @@ -674,7 +711,7 @@ recheck_need_open: return ret; seq = res.seq; - buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf = journal_seq_to_buf(j, seq); buf->must_flush = true; if (!buf->flush_time) { @@ -692,8 +729,8 @@ recheck_need_open: } /* - * if write was kicked off without a flush, flush the next sequence - * number instead + * if write was kicked off without a flush, or if we promised it + * wouldn't be a flush, flush the next sequence number instead */ buf = journal_seq_to_buf(j, seq); if (buf->noflush) { @@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq) unwritten_seq++) { struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); - /* journal write is already in flight, and was a flush write: */ - if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + /* journal flush already in flight, or flush requseted */ + if (buf->must_flush) goto out; buf->noflush = true; @@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) struct journal_replay *i, **_i; struct genradix_iter iter; bool had_entries = false; - unsigned ptr; u64 last_seq = cur_seq, nr, seq; genradix_for_each_reverse(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; last_seq = le64_to_cpu(i->j.last_seq); @@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) 
continue; seq = le64_to_cpu(i->j.seq); @@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) p = journal_seq_pin(j, seq); p->devs.nr = 0; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) + bch2_dev_list_add_dev(&p->devs, ptr->dev); had_entries = true; } @@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) void bch2_dev_journal_exit(struct bch_dev *ca) { - kfree(ca->journal.bio); - kfree(ca->journal.buckets); - kfree(ca->journal.bucket_seq); + struct journal_device *ja = &ca->journal; + + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + kfree(ja->bio[i]); + ja->bio[i] = NULL; + } - ca->journal.bio = NULL; - ca->journal.buckets = NULL; - ca->journal.bucket_seq = NULL; + kfree(ja->buckets); + kfree(ja->bucket_seq); + ja->buckets = NULL; + ja->bucket_seq = NULL; } int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) @@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) bch2_sb_field_get(sb, journal); struct bch_sb_field_journal_v2 *journal_buckets_v2 = bch2_sb_field_get(sb, journal_v2); - unsigned i, nr_bvecs; ja->nr = 0; if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - for (i = 0; i < nr; i++) + for (unsigned i = 0; i < nr; i++) ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); } else if (journal_buckets) { ja->nr = bch2_nr_journal_buckets(journal_buckets); @@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (!ja->bucket_seq) return -BCH_ERR_ENOMEM_dev_journal_init; - nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); - ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); - if (!ca->journal.bio) - return -BCH_ERR_ENOMEM_dev_journal_init; + for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { + ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + nr_bvecs), GFP_KERNEL); + if (!ja->bio[i]) + return -BCH_ERR_ENOMEM_dev_journal_init; - bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + ja->bio[i]->ca = ca; + ja->bio[i]->buf_idx = i; + bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0); + } ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) @@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); - unsigned j, dst = 0; + unsigned dst = 0; - for (i = 0; i < nr; i++) - for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + for (unsigned i = 0; i < nr; i++) + for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) ja->buckets[dst++] = le64_to_cpu(journal_buckets_v2->d[i].start) + j; } else if (journal_buckets) { - for (i = 0; i < ja->nr; i++) + for (unsigned i = 0; i < ja->nr; i++) ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); } @@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) void bch2_fs_journal_exit(struct journal *j) { - unsigned i; + if (j->wq) + destroy_workqueue(j->wq); darray_exit(&j->early_journal_entries); - for (i = 0; i < ARRAY_SIZE(j->buf); i++) - kvpfree(j->buf[i].data, j->buf[i].buf_size); + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) + kvfree(j->buf[i].data); free_fifo(&j->pin); } int bch2_fs_journal_init(struct journal *j) { static 
struct lock_class_key res_key; - unsigned i; mutex_init(&j->buf_lock); spin_lock_init(&j->lock); @@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j) if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) return -BCH_ERR_ENOMEM_journal_pin_fifo; - for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); if (!j->buf[i].data) return -BCH_ERR_ENOMEM_journal_buf; + j->buf[i].idx = i; } j->pin.front = j->pin.back = 1; + + j->wq = alloc_workqueue("bcachefs_journal", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); + if (!j->wq) + return -BCH_ERR_ENOMEM_fs_other_alloc; return 0; } @@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "blocked:\t\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); prt_printf(out, "current entry:\t\t"); @@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 { struct journal_entry_pin_list *pin_list; struct journal_entry_pin *pin; - unsigned i; spin_lock(&j->lock); *seq = max(*seq, j->pin.front); @@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 prt_newline(out); printbuf_indent_add(out, 2); - for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++) list_for_each_entry(pin, &pin_list->list[i], list) { prt_printf(out, "\t%px %ps", pin, pin->flush); prt_newline(out); diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 4544ce24bb8a..7c7528f839c5 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u } bool bch2_journal_entry_close(struct journal *); -void bch2_journal_buf_put_final(struct journal *, u64, bool); +void bch2_journal_do_writes(struct journal *); +void bch2_journal_buf_put_final(struct journal *, u64); static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) { @@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); } static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) @@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq s = journal_state_buf_put(j, idx); if (!journal_state_count(s, idx)) { spin_lock(&j->lock); - bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx); + bch2_journal_buf_put_final(j, seq); spin_unlock(&j->lock); } } diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 47805193f18c..d76c3c0c203f 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -17,6 +17,37 @@ #include "sb-clean.h" #include "trace.h" +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + 
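/* journal_replay now keeps its pointers in a darray (see journal_io.h below) instead of a fixed BCH_REPLICAS_MAX array; each copy is printed as dev:bucket:bucket_offset (sector) */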
darray_for_each(j->ptrs, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev); + u64 offset; + + div64_u64_rem(i->sector, ca->mi.bucket_size, &offset); + + if (i != j->ptrs.data) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + i->dev, i->bucket, i->bucket_offset, i->sector); + } +} + +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + + bch2_journal_ptrs_to_text(out, c, j); + + for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); + break; + } +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c, BUG_ON(*p != i); *p = NULL; - kvpfree(i, offsetof(struct journal_replay, j) + - vstruct_bytes(&i->j)); + kvfree(i); } -static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted) { - i->ignore = true; + if (blacklisted) + i->ignore_blacklisted = true; + else + i->ignore_not_dirty = true; if (!c->opts.read_entire_journal) __journal_replay_free(c, i); @@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, { struct genradix_iter iter; struct journal_replay **_i, *i, *dup; - struct journal_ptr *ptr; size_t bytes = vstruct_bytes(j); u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + struct printbuf buf = PRINTBUF; int ret = JOURNAL_ENTRY_ADD_OK; /* Is this entry older than the range we need? 
*/ @@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, jlist->last_seq)) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (le64_to_cpu(i->j.seq) >= last_seq) break; - journal_replay_free(c, i); + + journal_replay_free(c, i, false); } } @@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, */ dup = *_i; if (dup) { - if (bytes == vstruct_bytes(&dup->j) && - !memcmp(j, &dup->j, bytes)) { - i = dup; - goto found; - } + bool identical = bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes); + bool not_identical = !identical && + entry_ptr.csum_good && + dup->csum_good; + + bool same_device = false; + darray_for_each(dup->ptrs, ptr) + if (ptr->dev == ca->dev_idx) + same_device = true; + + ret = darray_push(&dup->ptrs, entry_ptr); + if (ret) + goto out; - if (!entry_ptr.csum_good) { - i = dup; - goto found; - } + bch2_journal_replay_to_text(&buf, c, dup); - if (!dup->csum_good) + fsck_err_on(same_device, + c, journal_entry_dup_same_device, + "duplicate journal entry on same device\n %s", + buf.buf); + + fsck_err_on(not_identical, + c, journal_entry_replicas_data_mismatch, + "found duplicate but non identical journal entries\n %s", + buf.buf); + + if (entry_ptr.csum_good && !identical) goto replace; - fsck_err(c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries (seq %llu)", - le64_to_cpu(j->seq)); - i = dup; - goto found; + goto out; } replace: - i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) return -BCH_ERR_ENOMEM_journal_entry_add; - i->nr_ptrs = 0; - i->csum_good = entry_ptr.csum_good; - i->ignore = false; + darray_init(&i->ptrs); + i->csum_good = entry_ptr.csum_good; + i->ignore_blacklisted = false; + i->ignore_not_dirty = false; unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); - i->ptrs[i->nr_ptrs++] = entry_ptr; if (dup) { - if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; - } - /* The first ptr should represent the jset we kept: */ - memcpy(i->ptrs + i->nr_ptrs, - dup->ptrs, - sizeof(dup->ptrs[0]) * dup->nr_ptrs); - i->nr_ptrs += dup->nr_ptrs; + darray_for_each(dup->ptrs, ptr) + darray_push(&i->ptrs, *ptr); __journal_replay_free(c, dup); + } else { + darray_push(&i->ptrs, entry_ptr); } *_i = i; - return 0; -found: - for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { - if (ptr->dev == ca->dev_idx) { - bch_err(c, "duplicate journal entry %llu on same device", - le64_to_cpu(i->j.seq)); - goto out; - } - } - - if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { - bch_err(c, "found too many copies of journal entry %llu", - le64_to_cpu(i->j.seq)); - goto out; - } - - i->ptrs[i->nr_ptrs++] = entry_ptr; out: fsck_err: + printbuf_exit(&buf); return ret; } @@ -374,7 +398,6 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c, static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, struct jset_entry *entry) { - struct bkey_i *k; bool first = true; jset_entry_for_each_key(entry, k) { @@ -741,6 +764,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_datetime_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + 
unsigned version, int big_endian, + enum bkey_invalid_flags flags) +{ + unsigned bytes = vstruct_bytes(entry); + unsigned expected = 16; + int ret = 0; + + if (journal_entry_err_on(vstruct_bytes(entry) < expected, + c, version, jset, entry, + journal_entry_dev_usage_bad_size, + "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } +fsck_err: + return ret; +} + +static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_datetime *datetime = + container_of(entry, struct jset_entry_datetime, entry); + + bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, unsigned, int, @@ -913,11 +967,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b, return -BCH_ERR_ENOMEM_journal_read_buf_realloc; new_size = roundup_pow_of_two(new_size); - n = kvpmalloc(new_size, GFP_KERNEL); + n = kvmalloc(new_size, GFP_KERNEL); if (!n) return -BCH_ERR_ENOMEM_journal_read_buf_realloc; - kvpfree(b->data, b->size); + kvfree(b->data); b->data = n; b->size = new_size; return 0; @@ -1102,16 +1156,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!r) continue; - for (i = 0; i < r->nr_ptrs; i++) { - if (r->ptrs[i].dev == ca->dev_idx) { - unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + darray_for_each(r->ptrs, i) + if (i->dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, i->sector) + vstruct_sectors(&r->j, c->block_bits); - ja->cur_idx = r->ptrs[i].bucket; + ja->cur_idx = i->bucket; ja->sectors_free = ca->mi.bucket_size - wrote; goto found; } - } } found: mutex_unlock(&jlist->lock); @@ -1144,7 +1197,7 @@ found: ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); - kvpfree(buf.data, buf.size); + kvfree(buf.data); percpu_ref_put(&ca->io_ref); closure_return(cl); return; @@ -1155,27 +1208,6 @@ err: goto out; } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) -{ - unsigned i; - - for (i = 0; i < j->nr_ptrs; i++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); - u64 offset; - - div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); - - if (i) - prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - j->ptrs[i].dev, - j->ptrs[i].bucket, - j->ptrs[i].bucket_offset, - j->ptrs[i].sector); - } -} - int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, @@ -1228,20 +1260,20 @@ int bch2_journal_read(struct bch_fs *c, i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; if (!*start_seq) *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; if (JSET_NO_FLUSH(&i->j)) { - i->ignore = true; + i->ignore_blacklisted = true; continue; } if (!last_write_torn && !i->csum_good) { last_write_torn = true; - i->ignore = true; + i->ignore_blacklisted = true; continue; } @@ -1280,12 +1312,12 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; seq = le64_to_cpu(i->j.seq); if (seq < *last_seq) { - journal_replay_free(c, i); + journal_replay_free(c, i, false); continue; } @@ -1293,7 +1325,7 @@ int bch2_journal_read(struct bch_fs *c, fsck_err_on(!JSET_NO_FLUSH(&i->j), c, jset_seq_blacklisted, "found blacklisted journal entry %llu", seq); - i->ignore = 
true; + i->ignore_blacklisted = true; } } @@ -1302,7 +1334,7 @@ int bch2_journal_read(struct bch_fs *c, genradix_for_each(&c->journal_entries, radix_iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; BUG_ON(seq > le64_to_cpu(i->j.seq)); @@ -1353,32 +1385,31 @@ int bch2_journal_read(struct bch_fs *c, .e.data_type = BCH_DATA_journal, .e.nr_required = 1, }; - unsigned ptr; i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) { - struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + darray_for_each(i->ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); - if (!i->ptrs[ptr].csum_good) - bch_err_dev_offset(ca, i->ptrs[ptr].sector, + if (!ptr->csum_good) + bch_err_dev_offset(ca, ptr->sector, "invalid journal checksum, seq %llu%s", le64_to_cpu(i->j.seq), i->csum_good ? " (had good copy on another device)" : ""); } ret = jset_validate(c, - bch_dev_bkey_exists(c, i->ptrs[0].dev), + bch_dev_bkey_exists(c, i->ptrs.data[0].dev), &i->j, - i->ptrs[0].sector, + i->ptrs.data[0].sector, READ); if (ret) goto err; - for (ptr = 0; ptr < i->nr_ptrs; ptr++) - replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + darray_for_each(i->ptrs, ptr) + replicas.e.devs[replicas.e.nr_devs++] = ptr->dev; bch2_replicas_entry_sort(&replicas.e); @@ -1547,7 +1578,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size)) return; - new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN); if (!new_buf) return; @@ -1558,7 +1589,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) swap(buf->buf_size, new_size); spin_unlock(&j->lock); - kvpfree(new_buf, new_size); + kvfree(new_buf); } static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) @@ -1568,12 +1599,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) static CLOSURE_CALLBACK(journal_write_done) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; union journal_res_state old, new; - u64 v, seq; + u64 v, seq = le64_to_cpu(w->data->seq); int err = 0; bch2_time_stats_update(!JSET_NO_FLUSH(w->data) @@ -1593,63 +1624,68 @@ static CLOSURE_CALLBACK(journal_write_done) if (err) bch2_fatal_error(c); - spin_lock(&j->lock); - seq = le64_to_cpu(w->data->seq); + closure_debug_destroy(cl); + spin_lock(&j->lock); if (seq >= j->pin.front) journal_seq_pin(j, seq)->devs = w->devs_written; + if (err && (!j->err_seq || seq < j->err_seq)) + j->err_seq = seq; + w->write_done = true; - if (!err) { - if (!JSET_NO_FLUSH(w->data)) { + bool completed = false; + + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + w = j->buf + (seq & JOURNAL_BUF_MASK); + if (!w->write_done) + break; + + if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; bch2_do_discards(c); closure_wake_up(&c->freelist_wait); - bch2_reset_alloc_cursors(c); } - } else if (!j->err_seq || seq < j->err_seq) - j->err_seq = seq; - j->seq_ondisk = seq; + j->seq_ondisk = seq; - /* - * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard - * 
more buckets: - * - * Must come before signaling write completion, for - * bch2_fs_journal_stop(): - */ - if (j->watermark != BCH_WATERMARK_stripe) - journal_reclaim_kick(&c->journal); + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); - /* also must come before signalling write completion: */ - closure_debug_destroy(cl); + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - v = atomic64_read(&j->reservations.counter); - do { - old.v = new.v = v; - BUG_ON(journal_state_count(new, new.unwritten_idx)); + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v); - new.unwritten_idx++; - } while ((v = atomic64_cmpxchg(&j->reservations.counter, - old.v, new.v)) != old.v); + closure_wake_up(&w->wait); + completed = true; + } - bch2_journal_reclaim_fast(j); - bch2_journal_space_available(j); + if (completed) { + bch2_journal_reclaim_fast(j); + bch2_journal_space_available(j); - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], - &j->max_in_flight_start, false); + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false); - closure_wake_up(&w->wait); - journal_wake(j); + journal_wake(j); + } - if (!journal_state_count(new, new.unwritten_idx) && - journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { - spin_unlock(&j->lock); - closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); - } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1659,46 +1695,46 @@ static CLOSURE_CALLBACK(journal_write_done) * previous entries still in flight - the current journal entry * might want to be written now: */ - - spin_unlock(&j->lock); - mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); - } else { - spin_unlock(&j->lock); + mod_delayed_work(j->wq, &j->write_work, max(0L, delta)); } + + spin_unlock(&j->lock); } static void journal_write_endio(struct bio *bio) { - struct bch_dev *ca = bio->bi_private; + struct journal_bio *jbio = container_of(bio, struct journal_bio, bio); + struct bch_dev *ca = jbio->ca; struct journal *j = &ca->fs->journal; - struct journal_buf *w = journal_last_unwritten_buf(j); - unsigned long flags; + struct journal_buf *w = j->buf + jbio->buf_idx; if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), bch2_blk_status_to_str(bio->bi_status)) || bch2_meta_write_fault("journal")) { + unsigned long flags; + spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } - closure_put(&j->io); + closure_put(&w->io); percpu_ref_put(&ca->io_ref); } static CLOSURE_CALLBACK(do_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_dev *ca; - struct journal_buf *w = journal_last_unwritten_buf(j); - 
struct bio *bio; unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - ca = bch_dev_bkey_exists(c, ptr->dev); + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct journal_device *ja = &ca->journal; + if (!percpu_ref_tryget(&ca->io_ref)) { /* XXX: fix this */ bch_err(c, "missing device for journal write\n"); @@ -1708,7 +1744,7 @@ static CLOSURE_CALLBACK(do_journal_write) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); - bio = ca->journal.bio; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1727,11 +1763,10 @@ static CLOSURE_CALLBACK(do_journal_write) trace_and_count(c, journal_write, bio); closure_bio_submit(bio, cl); - ca->journal.bucket_seq[ca->journal.cur_idx] = - le64_to_cpu(w->data->seq); + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); } - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) @@ -1782,7 +1817,6 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (!wb.wb) bch2_journal_keys_to_write_buffer_start(c, &wb, seq); - struct bkey_i *k; jset_entry_for_each_key(i, k) { ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k); if (ret) { @@ -1798,12 +1832,20 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) if (wb.wb) bch2_journal_keys_to_write_buffer_end(c, &wb); + + spin_lock(&c->journal.lock); w->need_flush_to_write_buffer = false; + spin_unlock(&c->journal.lock); start = end = vstruct_last(jset); end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have); + struct jset_entry_datetime *d = + container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry); + d->entry.type = BCH_JSET_ENTRY_datetime; + d->seconds = cpu_to_le64(ktime_get_real_seconds()); + bch2_journal_super_entries_add_common(c, &end, seq); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1893,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * j->nr_noflush_writes++; } else { + w->must_flush = true; j->last_flush_write = jiffies; j->nr_flush_writes++; clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); @@ -1903,20 +1946,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * CLOSURE_CALLBACK(bch2_journal_write) { - closure_type(j, struct journal, io); + closure_type(w, struct journal_buf, io); + struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct journal_buf *w = journal_last_unwritten_buf(j); struct bch_replicas_padded replicas; - struct bio *bio; struct printbuf journal_debug_buf = PRINTBUF; unsigned nr_rw_members = 0; int ret; + for_each_rw_member(c, ca) + nr_rw_members++; + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + BUG_ON(!w->write_started); + BUG_ON(w->write_allocated); + BUG_ON(w->write_done); j->write_start_time = local_clock(); spin_lock(&j->lock); + if (nr_rw_members > 1) + w->separate_flush = true; + ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); if (ret) @@ -1956,12 +2007,14 @@ CLOSURE_CALLBACK(bch2_journal_write) * bch2_journal_space_available(): */ w->sectors = 0; + w->write_allocated = true; /* * journal entry has been compacted and allocated, 
recalculate space * available: */ bch2_journal_space_available(j); + bch2_journal_do_writes(j); spin_unlock(&j->lock); w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); @@ -1969,12 +2022,6 @@ CLOSURE_CALLBACK(bch2_journal_write) if (c->opts.nochanges) goto no_io; - for_each_rw_member(c, ca) - nr_rw_members++; - - if (nr_rw_members > 1) - w->separate_flush = true; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. @@ -1985,25 +2032,29 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (!JSET_NO_FLUSH(w->data)) + closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq)); + if (!JSET_NO_FLUSH(w->data) && w->separate_flush) { for_each_rw_member(c, ca) { percpu_ref_get(&ca->io_ref); - bio = ca->journal.bio; + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio[w->idx]->bio; bio_reset(bio, ca->disk_sb.bdev, - REQ_OP_WRITE|REQ_PREFLUSH); + REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH); bio->bi_end_io = journal_write_endio; bio->bi_private = ca; closure_bio_submit(bio, cl); } } - continue_at(cl, do_journal_write, c->io_complete_wq); + continue_at(cl, do_journal_write, j->wq); return; no_io: - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); return; err: bch2_fatal_error(c); - continue_at(cl, journal_write_done, c->io_complete_wq); + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index c035e7c108e1..4f1e763ab506 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -2,26 +2,35 @@ #ifndef _BCACHEFS_JOURNAL_IO_H #define _BCACHEFS_JOURNAL_IO_H +#include "darray.h" + +struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; +}; + /* * Only used for holding the journal entries we read in btree_journal_read() * during cache_registration */ struct journal_replay { - struct journal_ptr { - bool csum_good; - u8 dev; - u32 bucket; - u32 bucket_offset; - u64 sector; - } ptrs[BCH_REPLICAS_MAX]; - unsigned nr_ptrs; + DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs; bool csum_good; - bool ignore; + bool ignore_blacklisted; + bool ignore_not_dirty; /* must be last: */ struct jset j; }; +static inline bool journal_replay_ignore(struct journal_replay *i) +{ + return !i || i->ignore_blacklisted || i->ignore_not_dirty; +} + static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, struct jset_entry *entry, unsigned type) { @@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, } #define for_each_jset_entry_type(entry, jset, type) \ - for (entry = (jset)->start; \ + for (struct jset_entry *entry = (jset)->start; \ (entry = __jset_entry_type_next(jset, entry, type)); \ entry = vstruct_next(entry)) #define jset_entry_for_each_key(_e, _k) \ - for (_k = (_e)->start; \ + for (struct bkey_i *_k = (_e)->start; \ _k < vstruct_last(_e); \ _k = bkey_next(_k)) @@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); CLOSURE_CALLBACK(bch2_journal_write); +static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields. 
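(For example, the 16-byte jset_entry_datetime added in this patch takes DIV_ROUND_UP(16, 8) == 2 u64s of space in the jset, so its u64s field is stored as 1.)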
+ */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + #endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index c33dca641575..ab811c0dad26 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -62,12 +62,9 @@ void bch2_journal_set_watermark(struct journal *j) ? BCH_WATERMARK_reclaim : BCH_WATERMARK_stripe; - if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], - &j->low_on_space_start, low_on_space) || - track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], - &j->low_on_pin_start, low_on_pin) || - track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], - &j->write_buffer_full_start, low_on_wb)) + if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) || + track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) || + track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb)) trace_and_count(c, journal_full, c); swap(watermark, j->watermark); @@ -394,8 +391,6 @@ void bch2_journal_pin_copy(struct journal *j, struct journal_entry_pin *src, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); u64 seq = READ_ONCE(src->seq); @@ -411,44 +406,44 @@ void bch2_journal_pin_copy(struct journal *j, return; } - reclaim = __journal_pin_drop(j, dst); + bool reclaim = __journal_pin_drop(j, dst); bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + spin_unlock(&j->lock); } void bch2_journal_pin_set(struct journal *j, u64 seq, struct journal_entry_pin *pin, journal_pin_flush_fn flush_fn) { - bool reclaim; - spin_lock(&j->lock); BUG_ON(seq < journal_last_seq(j)); - reclaim = __journal_pin_drop(j, pin); + bool reclaim = __journal_pin_drop(j, pin); bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); - spin_unlock(&j->lock); - /* * If the journal is currently full, we might want to call flush_fn * immediately: */ - journal_wake(j); + if (seq == journal_last_seq(j)) + journal_wake(j); + + spin_unlock(&j->lock); } /** diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0200e299cfbb..b5303874fc35 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr) return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); } -static struct bch_sb_field_journal_seq_blacklist * -blacklist_entry_try_merge(struct bch_fs *c, - struct bch_sb_field_journal_seq_blacklist *bl, - unsigned i) -{ - unsigned nr = blacklist_nr_entries(bl); - - if (le64_to_cpu(bl->start[i].end) >= - le64_to_cpu(bl->start[i + 1].start)) { - bl->start[i].end = bl->start[i + 1].end; - --nr; - memmove(&bl->start[i], - &bl->start[i + 1], - sizeof(bl->start[0]) * (nr - i)); - - bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, - sb_blacklist_u64s(nr)); - BUG_ON(!bl); - } - - return bl; -} - -static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, - u64 start, u64 end) -{ - return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); -} - int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 
start, u64 end) { struct bch_sb_field_journal_seq_blacklist *bl; - unsigned i, nr; + unsigned i = 0, nr; int ret = 0; mutex_lock(&c->sb_lock); bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist); nr = blacklist_nr_entries(bl); - for (i = 0; i < nr; i++) { + while (i < nr) { struct journal_seq_blacklist_entry *e = bl->start + i; - if (bl_entry_contig_or_overlaps(e, start, end)) { - e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); - e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); - - if (i + 1 < nr) - bl = blacklist_entry_try_merge(c, - bl, i); - if (i) - bl = blacklist_entry_try_merge(c, - bl, i - 1); - goto out_write_sb; + if (end < le64_to_cpu(e->start)) + break; + + if (start > le64_to_cpu(e->end)) { + i++; + continue; } + + /* + * Entry is contiguous or overlapping with new entry: merge it + * with new entry, and delete: + */ + + start = min(start, le64_to_cpu(e->start)); + end = max(end, le64_to_cpu(e->end)); + array_remove_item(bl->start, nr, i); } bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, @@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) goto out; } - bl->start[nr].start = cpu_to_le64(start); - bl->start[nr].end = cpu_to_le64(end); -out_write_sb: + array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) { + .start = cpu_to_le64(start), + .end = cpu_to_le64(end), + })); c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ret = bch2_write_super(c); @@ -165,8 +141,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) if (!bl) return 0; - t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, - GFP_KERNEL); + t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) return -BCH_ERR_ENOMEM_blacklist_table_init; diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 38817c7a0851..8c053cb64ca5 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -18,6 +18,7 @@ * the journal that are being staged or in flight. 
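With pipelined writes each buffer now owns its own closure and journal bio slot (idx), and the write_started/write_allocated/write_done flags let completions be processed strictly in sequence order.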
*/ struct journal_buf { + struct closure io; struct jset *data; __BKEY_PADDED(key, BCH_REPLICAS_MAX); @@ -33,10 +34,14 @@ struct journal_buf { unsigned disk_sectors; /* maximum size entry could have been, if buf_size was bigger */ unsigned u64s_reserved; - bool noflush; /* write has already been kicked off, and was noflush */ - bool must_flush; /* something wants a flush */ - bool separate_flush; - bool need_flush_to_write_buffer; + bool noflush:1; /* write has already been kicked off, and was noflush */ + bool must_flush:1; /* something wants a flush */ + bool separate_flush:1; + bool need_flush_to_write_buffer:1; + bool write_started:1; + bool write_allocated:1; + bool write_done:1; + u8 idx; }; /* @@ -134,6 +139,7 @@ enum journal_flags { /* Reasons we may fail to get a journal reservation: */ #define JOURNAL_ERRORS() \ x(ok) \ + x(retry) \ x(blocked) \ x(max_in_flight) \ x(journal_full) \ @@ -149,6 +155,13 @@ enum journal_errors { typedef DARRAY(u64) darray_u64; +struct journal_bio { + struct bch_dev *ca; + unsigned buf_idx; + + struct bio bio; +}; + /* Embedded in struct bch_fs */ struct journal { /* Fastpath stuff up front: */ @@ -203,8 +216,8 @@ struct journal { wait_queue_head_t wait; struct closure_waitlist async_wait; - struct closure io; struct delayed_work write_work; + struct workqueue_struct *wq; /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; @@ -274,11 +287,6 @@ struct journal { u64 nr_noflush_writes; u64 entry_bytes_written; - u64 low_on_space_start; - u64 low_on_pin_start; - u64 max_in_flight_start; - u64 write_buffer_full_start; - struct bch2_time_stats *flush_write_time; struct bch2_time_stats *noflush_write_time; struct bch2_time_stats *flush_seq_time; @@ -313,7 +321,7 @@ struct journal_device { u64 *buckets; /* Bio for journal reads/writes to this device */ - struct bio *bio; + struct journal_bio *bio[JOURNAL_BUF_NR]; /* for bch_journal_read_device */ struct closure read; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 7a4ca5a28b3e..26569043e368 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time, bool set) { return time - ? bch2_btree_bit_mod(trans, BTREE_ID_lru, - lru_pos(lru_id, dev_bucket, time), set) + ? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, + lru_pos(lru_id, dev_bucket, time), set) : 0; } @@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans, goto out; } - if (c->opts.reconstruct_alloc || - fsck_err(c, lru_entry_bad, + if (fsck_err(c, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" " %s\n" " for %s", diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c index bf0ef668fd38..0ea9f30803a2 100644 --- a/fs/bcachefs/mean_and_variance.c +++ b/fs/bcachefs/mean_and_variance.c @@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() * @s: mean and variance number of samples and their sums * @x: new value to include in the &mean_and_variance_weighted + * @initted: caller must track whether this is the first use or not + * @weight: ewma weight * * see linked pdf: function derived from equations 140-143 where alpha = 2^w. * values are stored bitshifted for performance and added precision. 
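(In plain terms the mean update is approximately mean_new = mean_old + (x - mean_old) / 2^weight; the stored mean and variance are kept shifted left by @weight bits, which is why the getters below divide by 2^weight again.)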
*/ -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 x, bool initted, u8 weight) { // previous weighted variance. - u8 w = s->weight; + u8 w = weight; u64 var_w0 = s->variance; // new value weighted. s64 x_w = x << w; @@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 // new mean weighted. s64 u_w1 = s->mean + diff; - if (!s->init) { + if (!initted) { s->mean = x_w; s->variance = 0; } else { s->mean = u_w1; s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; } - s->init = true; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); /** * mean_and_variance_weighted_get_mean() - get mean from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight) { - return fast_divpow2(s.mean, s.weight); + return fast_divpow2(s.mean, weight); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); /** * mean_and_variance_weighted_get_variance() -- get variance from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight) { // always positive don't need fast divpow2 - return s.variance >> s.weight; + return s.variance >> weight; } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); /** * mean_and_variance_weighted_get_stddev() - get standard deviation from @s * @s: mean and variance number of samples and their sums + * @weight: ewma weight */ -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 weight) { - return int_sqrt64(mean_and_variance_weighted_get_variance(s)); + return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight)); } EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h index 64df11ab422b..4fcf062dd22c 100644 --- a/fs/bcachefs/mean_and_variance.h +++ b/fs/bcachefs/mean_and_variance.h @@ -154,8 +154,6 @@ struct mean_and_variance { /* expontentially weighted variant */ struct mean_and_variance_weighted { - bool init; - u8 weight; /* base 2 logarithim */ s64 mean; u64 variance; }; @@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s); u64 mean_and_variance_get_variance(struct mean_and_variance s1); u32 mean_and_variance_get_stddev(struct mean_and_variance s); -void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, + s64 v, bool initted, u8 weight); -s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); -u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); -u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s, + u8 weight); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s, + u8 weight); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s, + u8 
weight); #endif // MEAN_AND_VAIRANCE_H_ diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c index 019583c3ca0e..db63b3f3b338 100644 --- a/fs/bcachefs/mean_and_variance_test.c +++ b/fs/bcachefs/mean_and_variance_test.c @@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test) static void mean_and_variance_weighted_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 2 }; + struct mean_and_variance_weighted s = { }; - mean_and_variance_weighted_update(&s, 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, 10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, 20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, 20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, 30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, 30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); - s = (struct mean_and_variance_weighted) { .weight = 2 }; + s = (struct mean_and_variance_weighted) { }; - mean_and_variance_weighted_update(&s, -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + mean_and_variance_weighted_update(&s, -10, false, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0); - mean_and_variance_weighted_update(&s, -20); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + mean_and_variance_weighted_update(&s, -20, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18); - mean_and_variance_weighted_update(&s, -30); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + mean_and_variance_weighted_update(&s, -30, true, 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72); } static void mean_and_variance_weighted_advanced_test(struct kunit *test) { - struct mean_and_variance_weighted s = { .weight = 8 }; + struct mean_and_variance_weighted s = { }; + bool initted = false; s64 i; - for (i = 10; i <= 100; i += 10) - mean_and_variance_weighted_update(&s, i); + for (i = 10; i <= 100; i += 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, 
mean_and_variance_weighted_get_mean(s, 8), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); - s = (struct mean_and_variance_weighted) { .weight = 8 }; + s = (struct mean_and_variance_weighted) { }; + initted = false; - for (i = -10; i >= -100; i -= 10) - mean_and_variance_weighted_update(&s, i); + for (i = -10; i >= -100; i -= 10) { + mean_and_variance_weighted_update(&s, i, initted, 8); + initted = true; + } - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107); } static void do_mean_and_variance_test(struct kunit *test, @@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test, s64 *weighted_stddev) { struct mean_and_variance mv = {}; - struct mean_and_variance_weighted vw = { .weight = weight }; + struct mean_and_variance_weighted vw = { }; for (unsigned i = 0; i < initial_n; i++) { mean_and_variance_update(&mv, initial_value); - mean_and_variance_weighted_update(&vw, initial_value); + mean_and_variance_weighted_update(&vw, initial_value, false, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0); } for (unsigned i = 0; i < n; i++) { mean_and_variance_update(&mv, data[i]); - mean_and_variance_weighted_update(&vw, data[i]); + mean_and_variance_weighted_update(&vw, data[i], true, weight); KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); - KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]); } KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 5623cee3ef86..69098eeb5d48 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -EINVAL; + return -BCH_ERR_remove_would_lose_data; return 0; } @@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -EINVAL; + return -BCH_ERR_remove_with_metadata_missing_unimplemented; trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); @@ -132,10 +132,8 @@ retry: ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true); - if (ret) { - bch_err(c, "Cannot drop device without losing data"); + if (ret) break; - } ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 
b1ed0b9a20d3..08ea0cfc4aef 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -314,7 +314,7 @@ int bch2_opt_parse(struct bch_fs *c, if (ret < 0 || (*res != 0 && *res != 1)) { if (err) prt_printf(err, "%s: must be bool", opt->attr.name); - return ret; + return ret < 0 ? ret : -BCH_ERR_option_not_bool; } break; case BCH_OPT_UINT: @@ -456,7 +456,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, copied_opts = kstrdup(options, GFP_KERNEL); if (!copied_opts) - return -1; + return -ENOMEM; copied_opts_start = copied_opts; while ((opt = strsep(&copied_opts, ",")) != NULL) { @@ -501,11 +501,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, bad_opt: pr_err("Bad mount option %s", name); - ret = -1; + ret = -BCH_ERR_option_name; goto out; bad_val: pr_err("Invalid mount option %s", err.buf); - ret = -1; + ret = -BCH_ERR_option_value; goto out; out: kfree(copied_opts_start); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9a4b7faa3765..136083c11f3a 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -290,6 +290,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Allow mounting in when data will be missing") \ + x(no_splitbrain_check, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't kick drives out when splitbrain detected")\ x(discard, u8, \ OPT_FS|OPT_MOUNT|OPT_DEVICE, \ OPT_BOOL(), \ @@ -332,6 +337,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Run fsck on mount") \ + x(fsck_memory_usage_percent, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(20, 70), \ + BCH2_NO_SB_OPT, 50, \ + NULL, "Maximum percentage of system ram fsck is allowed to pin")\ x(fix_errors, u8, \ OPT_FS|OPT_MOUNT, \ OPT_FN(bch2_opt_fix_errors), \ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 22d1017aa49b..56336f3dd1d0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) u64 now = atomic64_read(&c->io_clock[WRITE].now); prt_str(out, "io wait duration: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); prt_str(out, "io wait remaining: "); - bch2_prt_human_readable_s64(out, r->wait_iotime_end - now); + bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); prt_str(out, "duration waited: "); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 21e13bb4335b..2af219aedfdb 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -52,14 +52,47 @@ static bool btree_id_is_alloc(enum btree_id id) } /* for -o reconstruct_alloc: */ -static void drop_alloc_keys(struct journal_keys *keys) +static void do_reconstruct_alloc(struct bch_fs *c) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required); + __set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required); + 
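/* -o reconstruct_alloc is now implemented by recording, in the superblock ext section, which recovery passes must run and which fsck errors are expected while alloc info is rebuilt, rather than by special-casing it throughout recovery */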
__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required); + + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent); + __set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent); + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + + struct journal_keys *keys = &c->journal_keys; size_t src, dst; for (src = 0, dst = 0; src < keys->nr; src++) - if (!btree_id_is_alloc(keys->d[src].btree_id)) - keys->d[dst++] = keys->d[src]; - + if (!btree_id_is_alloc(keys->data[src].btree_id)) + keys->data[dst++] = keys->data[src]; keys->nr = dst; } @@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys) */ static void zero_out_btree_mem_ptr(struct journal_keys *keys) { - struct journal_key *i; - - for (i = keys->d; i < keys->d + keys->nr; i++) + darray_for_each(*keys, i) if (i->k->k.type == KEY_TYPE_btree_ptr_v2) bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; } @@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans, if (ret) goto out; + struct btree_path *path = btree_iter_path(trans, &iter); + if (unlikely(!btree_path_node(path, k->level))) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, + BTREE_MAX_DEPTH, 0, iter_flags); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_btree_increase_depth(trans, iter.path, 0) ?: + -BCH_ERR_transaction_restart_nested; + goto out; + } + /* Must be checked with btree locked: */ if (k->overwritten) goto out; @@ -166,11 +208,9 @@ static int bch2_journal_replay(struct bch_fs *c) * efficient - better locality of btree access - but some might fail if * that would cause a journal deadlock. */ - for (size_t i = 0; i < keys->nr; i++) { + darray_for_each(*keys, k) { cond_resched(); - struct journal_key *k = keys->d + i; - /* Skip fastpath if we're low on space in the journal */ ret = c->journal.watermark ? 
-1 : commit_do(trans, NULL, NULL, @@ -264,7 +304,7 @@ static int journal_replay_entry_early(struct bch_fs *c, bkey_copy(&r->key, (struct bkey_i *) entry->start); r->error = 0; } else { - r->error = -EIO; + r->error = -BCH_ERR_btree_node_read_error; } r->alive = true; break; @@ -359,7 +399,7 @@ static int journal_replay_early(struct bch_fs *c, genradix_for_each(&c->journal_entries, iter, _i) { i = *_i; - if (!i || i->ignore) + if (journal_replay_ignore(i)) continue; vstruct_for_each(&i->j, entry) { @@ -388,11 +428,8 @@ static int read_btree_roots(struct bch_fs *c) if (!r->alive) continue; - if (btree_id_is_alloc(i) && - c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc) continue; - } if (r->error) { __fsck_err(c, @@ -524,8 +561,7 @@ static int bch2_set_may_go_rw(struct bch_fs *c) * setting journal_key->overwritten: it will be accessed by multiple * threads */ - move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); - keys->gap = keys->nr; + move_gap(keys, keys->nr); set_bit(BCH_FS_may_go_rw, &c->flags); @@ -862,7 +898,7 @@ int bch2_fs_recovery(struct bch_fs *c) goto out; genradix_for_each_reverse(&c->journal_entries, iter, i) - if (*i && !(*i)->ignore) { + if (!journal_replay_ignore(*i)) { last_journal_entry = &(*i)->j; break; } @@ -887,7 +923,8 @@ int bch2_fs_recovery(struct bch_fs *c) genradix_for_each_reverse(&c->journal_entries, iter, i) if (*i) { last_journal_entry = &(*i)->j; - (*i)->ignore = false; + (*i)->ignore_blacklisted = false; + (*i)->ignore_not_dirty= false; /* * This was probably a NO_FLUSH entry, * so last_seq was garbage - but we know @@ -923,10 +960,8 @@ use_clean: c->journal_replay_seq_start = last_seq; c->journal_replay_seq_end = blacklist_seq - 1; - if (c->opts.reconstruct_alloc) { - c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); - drop_alloc_keys(&c->journal_keys); - } + if (c->opts.reconstruct_alloc) + do_reconstruct_alloc(c); zero_out_btree_mem_ptr(&c->journal_keys); @@ -950,7 +985,7 @@ use_clean: bch2_journal_seq_blacklist_add(c, blacklist_seq, journal_seq); if (ret) { - bch_err(c, "error creating new journal seq blacklist entry"); + bch_err_msg(c, ret, "error creating new journal seq blacklist entry"); goto err; } } @@ -961,9 +996,6 @@ use_clean: if (ret) goto err; - if (c->opts.reconstruct_alloc) - bch2_journal_log_msg(c, "dropping alloc info"); - /* * Skip past versions that might have possibly been used (as nonces), * but hadn't had their pointers written: diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h index fa0c8efd2a1b..1361e34d4e64 100644 --- a/fs/bcachefs/recovery_types.h +++ b/fs/bcachefs/recovery_types.h @@ -34,6 +34,7 @@ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ x(fs_upgrade_for_subvolumes, 22, 0) \ x(resume_logged_ops, 23, PASS_ALWAYS) \ @@ -43,6 +44,7 @@ x(check_dirents, 27, PASS_FSCK) \ x(check_xattrs, 28, PASS_FSCK) \ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ x(check_nlinks, 31, PASS_FSCK) \ x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \ diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c index b6bf0ebe7e84..5980ba2563fe 100644 --- a/fs/bcachefs/sb-clean.c +++ b/fs/bcachefs/sb-clean.c @@ 
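/*
 * Illustrative sketch (not from this patch): the recovery pass table in
 * recovery_types.h below is an x-macro list of the form
 * x(name, stable_nr, flags). The new passes (check_subvol_children = 35,
 * check_subvolume_structure = 36) can sit at their logical position in the
 * list because it is the stable number, not the list order, that is
 * persisted in the superblock. A minimal, self-contained sketch of the
 * expansion pattern (these macro names are invented, not the real bcachefs
 * ones):
 */
#define SKETCH_RECOVERY_PASSES()			\
	x(check_subvols,		20)		\
	x(check_subvol_children,	35)		\
	x(check_subvolume_structure,	36)

enum sketch_recovery_pass_stable {
#define x(name, stable_nr)	SKETCH_PASS_STABLE_##name = stable_nr,
	SKETCH_RECOVERY_PASSES()
#undef x
};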
-171,22 +171,6 @@ fsck_err: return ERR_PTR(ret); } -static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) -{ - struct jset_entry *entry = *end; - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - - memset(entry, 0, u64s * sizeof(u64)); - /* - * The u64s field counts from the start of data, ignoring the shared - * fields. - */ - entry->u64s = cpu_to_le16(u64s - 1); - - *end = vstruct_next(*end); - return entry; -} - void bch2_journal_super_entries_add_common(struct bch_fs *c, struct jset_entry **end, u64 journal_seq) diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 441dcb1bf160..e4396cb0bacb 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -45,7 +45,13 @@ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \ x(rebalance_work, \ - BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) + BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance)) \ + x(subvolume_fs_parent, \ + BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ + BCH_FSCK_ERR_subvol_fs_path_parent_wrong) \ + x(btree_subvolume_children, \ + BIT_ULL(BCH_RECOVERY_PASS_check_subvols), \ + BCH_FSCK_ERR_subvol_children_not_set) #define DOWNGRADE_TABLE() @@ -253,7 +259,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi if (e < BCH_SB_ERR_MAX) __set_bit(e, c->sb.errors_silent); if (e < sizeof(ext->errors_silent) * 8) - ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64)); + __set_bit_le64(e, ext->errors_silent); } } } diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h index c08aacdfd073..5178bf579f7c 100644 --- a/fs/bcachefs/sb-errors_types.h +++ b/fs/bcachefs/sb-errors_types.h @@ -231,7 +231,7 @@ x(dirent_name_dot_or_dotdot, 223) \ x(dirent_name_has_slash, 224) \ x(dirent_d_type_wrong, 225) \ - x(dirent_d_parent_subvol_wrong, 226) \ + x(inode_bi_parent_wrong, 226) \ x(dirent_in_missing_dir_inode, 227) \ x(dirent_in_non_dir_inode, 228) \ x(dirent_to_missing_inode, 229) \ @@ -250,7 +250,22 @@ x(hash_table_key_duplicate, 242) \ x(hash_table_key_wrong_offset, 243) \ x(unlinked_inode_not_on_deleted_list, 244) \ - x(reflink_p_front_pad_bad, 245) + x(reflink_p_front_pad_bad, 245) \ + x(journal_entry_dup_same_device, 246) \ + x(inode_bi_subvol_missing, 247) \ + x(inode_bi_subvol_wrong, 248) \ + x(inode_points_to_missing_dirent, 249) \ + x(inode_points_to_wrong_dirent, 250) \ + x(inode_bi_parent_nonzero, 251) \ + x(dirent_to_missing_parent_subvol, 252) \ + x(dirent_not_visible_in_parent_subvol, 253) \ + x(subvol_fs_path_parent_wrong, 254) \ + x(subvol_root_fs_path_parent_nonzero, 255) \ + x(subvol_children_not_set, 256) \ + x(subvol_children_bad, 257) \ + x(subvol_loop, 258) \ + x(subvol_unreachable, 259) \ + x(btree_node_bkey_bad_u64s, 260) enum bch_sb_error_id { #define x(t, n) BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index fcaa5a888744..3976f80721bf 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, } static __always_inline -int bch2_hash_set_snapshot(struct btree_trans *trans, +int bch2_hash_set_in_snapshot(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, subvol_inum inum, u32 snapshot, @@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans, struct bkey_i *insert, bch_str_hash_flags_t str_hash_flags) { - u32 snapshot; - int ret; - - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, 
&snapshot); - if (ret) - return ret; - insert->k.p.inode = inum.inum; - return bch2_hash_set_snapshot(trans, desc, info, inum, - snapshot, insert, str_hash_flags, 0); + u32 snapshot; + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + bch2_hash_set_in_snapshot(trans, desc, info, inum, + snapshot, insert, str_hash_flags, 0); } static __always_inline diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 7c67c28d3ef8..ce7aed121942 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -13,13 +13,26 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static struct bpos subvolume_children_pos(struct bkey_s_c k) +{ + if (k.k->type != KEY_TYPE_subvolume) + return POS_MIN; + + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + if (!s.v->fs_path_parent) + return POS_MIN; + return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset); +} + static int check_subvol(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_s_c_subvolume subvol; + struct btree_iter subvol_children_iter = {}; struct bch_snapshot snapshot; + struct printbuf buf = PRINTBUF; unsigned snapid; int ret = 0; @@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans, return ret ?: -BCH_ERR_transaction_restart_nested; } + if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL && + subvol.v->fs_path_parent, + c, subvol_root_fs_path_parent_nonzero, + "root subvolume has nonzero fs_path_parent\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct bkey_i_subvolume *n = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = 0; + } + + if (subvol.v->fs_path_parent) { + struct bpos pos = subvolume_children_pos(k); + + struct bkey_s_c subvol_children_k = + bch2_bkey_get_iter(trans, &subvol_children_iter, + BTREE_ID_subvolume_children, pos, 0); + ret = bkey_err(subvol_children_k); + if (ret) + goto err; + + if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set, + c, subvol_children_not_set, + "subvolume not set in subvolume_children btree at %llu:%llu\n%s", + pos.inode, pos.offset, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true); + if (ret) + goto err; + } + } + + struct bch_inode_unpacked inode; + struct btree_iter inode_iter = {}; + ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode, + (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) }, + 0); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret, c, subvol_to_missing_root, + "subvolume %llu points to missing subvolume root %llu:%u", + k.k->p.offset, le64_to_cpu(subvol.v->inode), + le32_to_cpu(subvol.v->snapshot))) { + ret = bch2_subvolume_delete(trans, iter->pos.offset); + bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset, + c, subvol_root_wrong_bi_subvol, + "subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu", + inode.bi_inum, inode_iter.k.p.snapshot, + inode.bi_subvol, subvol.k->p.offset)) { + inode.bi_subvol = subvol.k->p.offset; + ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot)); + if (ret) + goto err; + } + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = 
bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); u32 snapshot_tree; @@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans, SET_BCH_SUBVOLUME_SNAP(&s->v, true); } } - +err: fsck_err: + bch2_trans_iter_exit(trans, &subvol_children_iter); + printbuf_exit(&buf); return ret; } @@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c) return ret; } +static int check_subvol_child(struct btree_trans *trans, + struct btree_iter *child_iter, + struct bkey_s_c child_k) +{ + struct bch_fs *c = trans->c; + struct bch_subvolume s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset), + 0, subvolume, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (fsck_err_on(ret || + le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode, + c, subvol_children_bad, + "incorrect entry in subvolume_children btree %llu:%llu", + child_k.k->p.inode, child_k.k->p.offset)) { + ret = bch2_btree_delete_at(trans, child_iter, 0); + if (ret) + goto err; + } +err: +fsck_err: + return ret; +} + +int bch2_check_subvol_children(struct bch_fs *c) +{ + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_subvol_child(trans, &iter, k))); + bch_err_fn(c, ret); + return 0; +} + /* Subvolumes: */ int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k, @@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, le64_to_cpu(s.v->inode), le32_to_cpu(s.v->snapshot)); - if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) - prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) { + prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); + prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); + } +} + +static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) +{ + return !bpos_eq(pos, POS_MIN) + ? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set) + : 0; +} + +int bch2_subvolume_trigger(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s new, + unsigned flags) +{ + if (flags & BTREE_TRIGGER_TRANSACTIONAL) { + struct bpos children_pos_old = subvolume_children_pos(old); + struct bpos children_pos_new = subvolume_children_pos(new.s_c); + + if (!bpos_eq(children_pos_old, children_pos_new)) { + int ret = subvolume_children_mod(trans, children_pos_old, false) ?: + subvolume_children_mod(trans, children_pos_new, true); + if (ret) + return ret; + } + } + + return 0; +} + +int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) +{ + struct btree_iter iter; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); + struct bkey_s_c k = bch2_btree_iter_peek(&iter); + bch2_trans_iter_exit(trans, &iter); + + return bkey_err(k) ?: k.k && k.k->p.inode == subvol + ? 
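/*
 * Illustrative sketch (not from this patch): the new subvolume_children
 * btree is an index keyed by POS(fs_path_parent, child subvolume id), with
 * KEY_TYPE_set marking membership (see subvolume_children_pos() above), so
 * "does subvolume X have children" reduces to "does any key exist with
 * inode == X", which is what bch2_subvol_has_children() checks by peeking
 * from POS(X, 0). A toy userspace model of the same lookup (all names here
 * are invented):
 */
struct subvol_child_key_sketch { unsigned parent; unsigned child; };

static bool subvol_has_children_sketch(const struct subvol_child_key_sketch *keys,
				       size_t nr, unsigned subvol)
{
	for (size_t i = 0; i < nr; i++)
		if (keys[i].parent == subvol)	/* any key indexed under this parent */
			return true;
	return false;
}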
-BCH_ERR_ENOTEMPTY_subvol_not_empty + : 0; } static __always_inline int @@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (k.k->type != KEY_TYPE_subvolume) return 0; - if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && - le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent) return 0; s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); @@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans, if (ret) return ret; - s->v.parent = cpu_to_le32(new_parent); + s->v.creation_parent = cpu_to_le32(new_parent); return 0; } @@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_subvolume_reparent(trans, &iter, k, - subvolid_to_delete, le32_to_cpu(s.parent))); + subvolid_to_delete, le32_to_cpu(s.creation_parent))); } /* @@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) } int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 parent_subvolid, u32 src_subvolid, u32 *new_subvolid, u32 *new_snapshotid, @@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, if (ret) goto err; - new_subvol->v.flags = 0; - new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); - new_subvol->v.inode = cpu_to_le64(inode); - new_subvol->v.parent = cpu_to_le32(src_subvolid); - new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); - new_subvol->v.otime.hi = 0; + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.creation_parent = cpu_to_le32(src_subvolid); + new_subvol->v.fs_path_parent = cpu_to_le32(parent_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index a6f56f66e27c..903c05162c06 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -8,17 +8,22 @@ enum bkey_invalid_flags; int bch2_check_subvols(struct bch_fs *); +int bch2_check_subvol_children(struct bch_fs *); int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c, enum bkey_invalid_flags, struct printbuf *); void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s, unsigned); #define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ .key_invalid = bch2_subvolume_invalid, \ .val_to_text = bch2_subvolume_to_text, \ + .trigger = bch2_subvolume_trigger, \ .min_val_size = 16, \ }) +int bch2_subvol_has_children(struct btree_trans *, u32); int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); @@ -30,8 +35,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *); void bch2_delete_dead_snapshots_async(struct bch_fs *); int bch2_subvolume_unlink(struct btree_trans *, u32); -int bch2_subvolume_create(struct btree_trans *, u64, u32, - u32 *, u32 *, bool); +int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int 
bch2_fs_subvolumes_init(struct bch_fs *); diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h index af79134b07d6..e029df7ba89f 100644 --- a/fs/bcachefs/subvolume_format.h +++ b/fs/bcachefs/subvolume_format.h @@ -19,8 +19,8 @@ struct bch_subvolume { * This is _not_ necessarily the subvolume of the directory containing * this subvolume: */ - __le32 parent; - __le32 pad; + __le32 creation_parent; + __le32 fs_path_parent; bch_le128 otime; }; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index bd64eb68e84a..bceac29f3d86 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, return ret; } + if (rw == WRITE && + bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) { + prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu", + le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq), + le64_to_cpu(sb->seq)); + return -BCH_ERR_invalid_sb_members_missing; + } + return 0; } @@ -717,6 +725,7 @@ retry: if (IS_ERR(sb->s_bdev_file)) { ret = PTR_ERR(sb->s_bdev_file); + prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret)); goto err; } sb->bdev = file_bdev(sb->s_bdev_file); @@ -743,9 +752,9 @@ retry: prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n", path, err.buf); if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg) - printk(KERN_INFO "%s", err2.buf); + bch2_print_opts(opts, KERN_INFO "%s", err2.buf); else - printk(KERN_ERR "%s", err2.buf); + bch2_print_opts(opts, KERN_ERR "%s", err2.buf); printbuf_exit(&err2); printbuf_reset(&err); @@ -803,21 +812,20 @@ got_super: goto err; } - ret = 0; sb->have_layout = true; ret = bch2_sb_validate(sb, &err, READ); if (ret) { - printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", + path, err.buf); goto err_no_print; } out: printbuf_exit(&err); return ret; err: - printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n", - path, err.buf); + bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n", + path, err.buf); err_no_print: bch2_free_super(sb); goto out; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6b23e11825e6..233f864ed8b0 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -56,6 +56,7 @@ #include "super.h" #include "super-io.h" #include "sysfs.h" +#include "thread_with_file.h" #include "trace.h" #include <linux/backing-dev.h> @@ -86,6 +87,23 @@ const char * const bch2_fs_flag_strs[] = { NULL }; +void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...) +{ + struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio; + + va_list args; + va_start(args, fmt); + if (likely(!stdio)) { + vprintk(fmt, args); + } else { + if (fmt[0] == KERN_SOH[0]) + fmt += 2; + + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); + } + va_end(args); +} + void __bch2_print(struct bch_fs *c, const char *fmt, ...) { struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -95,16 +113,10 @@ void __bch2_print(struct bch_fs *c, const char *fmt, ...) 
if (likely(!stdio)) { vprintk(fmt, args); } else { - unsigned long flags; - if (fmt[0] == KERN_SOH[0]) fmt += 2; - spin_lock_irqsave(&stdio->output_lock, flags); - prt_vprintf(&stdio->output_buf, fmt, args); - spin_unlock_irqrestore(&stdio->output_lock, flags); - - wake_up(&stdio->output_wait); + bch2_stdio_redirect_vprintf(stdio, true, fmt, args); } va_end(args); } @@ -576,7 +588,7 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_update_wq); bch2_free_super(&c->disk_sb); - kvpfree(c, sizeof(*c)); + kvfree(c); module_put(THIS_MODULE); } @@ -715,7 +727,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) unsigned i, iter_size; int ret = 0; - c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); if (!c) { c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); goto out; @@ -818,13 +830,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; pr_uuid(&name, c->sb.user_uuid.b); - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; if (ret) goto err; + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -862,13 +874,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || !(c->io_complete_wq = alloc_workqueue("bcachefs_io", - WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) || + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0)) || #ifndef BCH_WRITE_REF_DEBUG @@ -882,8 +894,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) BIOSET_NEED_BVECS) || !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || !(c->online_reserved = alloc_percpu(u64)) || - mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, - c->opts.btree_node_size) || + mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, + c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, sizeof(u64), GFP_KERNEL))) { @@ -1061,7 +1073,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) } static int bch2_dev_in_fs(struct bch_sb_handle *fs, - struct bch_sb_handle *sb) + struct bch_sb_handle *sb, + struct bch_opts *opts) { if (fs == sb) return 0; @@ -1102,11 +1115,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));; prt_newline(&buf); - prt_printf(&buf, "Not using older sb"); + if (!opts->no_splitbrain_check) + prt_printf(&buf, "Not using older sb"); pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } struct bch_member m = 
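/*
 * Illustrative sketch (not from this patch): the btree bounce pool above now
 * uses mempool_init_kvmalloc_pool(), one of the new kvmalloc mempool helpers
 * mentioned in the merge description. Presumably it is a thin wrapper around
 * mempool_init() with callbacks roughly like these (the wrapper's exact
 * wiring is an assumption; the callback signatures match mempool_alloc_t and
 * mempool_free_t):
 */
static void *kvmalloc_pool_alloc_sketch(gfp_t gfp_mask, void *pool_data)
{
	/* pool_data carries the element size, as with mempool_init_kmalloc_pool() */
	return kvmalloc((size_t) pool_data, gfp_mask);
}

static void kvmalloc_pool_free_sketch(void *element, void *pool_data)
{
	kvfree(element);
}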
bch2_sb_member_get(fs->sb, sb->sb->dev_idx); @@ -1124,17 +1140,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, prt_newline(&buf); prt_bdevname(&buf, fs->bdev); - prt_str(&buf, "believes seq of "); + prt_str(&buf, " believes seq of "); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " to be %llu, but ", seq_from_fs); prt_bdevname(&buf, sb->bdev); prt_printf(&buf, " has %llu\n", seq_from_member); - prt_str(&buf, "Not using "); - prt_bdevname(&buf, sb->bdev); + + if (!opts->no_splitbrain_check) { + prt_str(&buf, "Not using "); + prt_bdevname(&buf, sb->bdev); + } pr_err("%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_device_splitbrain; + + if (!opts->no_splitbrain_check) + return -BCH_ERR_device_splitbrain; } return 0; @@ -1168,8 +1189,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_dev_buckets_free(ca); free_page((unsigned long) ca->sb_read_scratch); - bch2_time_stats_exit(&ca->io_latency[WRITE]); - bch2_time_stats_exit(&ca->io_latency[READ]); + bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); percpu_ref_exit(&ca->io_ref); percpu_ref_exit(&ca->ref); @@ -1260,8 +1281,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, INIT_WORK(&ca->io_error_work, bch2_io_error_work); - bch2_time_stats_init(&ca->io_latency[READ]); - bch2_time_stats_init(&ca->io_latency[WRITE]); + bch2_time_stats_quantiles_init(&ca->io_latency[READ]); + bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]); ca->mi = bch2_mi_to_cpu(member); @@ -1597,27 +1618,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "dropping data"); + bch_err_msg(ca, ret, "bch2_dev_data_drop()"); if (ret) goto err; ret = bch2_dev_remove_alloc(c, ca); - bch_err_msg(ca, ret, "deleting alloc info"); + bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) goto err; ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); - bch_err_msg(ca, ret, "flushing journal"); + bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()"); if (ret) goto err; ret = bch2_journal_flush(&c->journal); - bch_err(ca, "journal error"); + bch_err_msg(ca, ret, "bch2_journal_flush()"); if (ret) goto err; ret = bch2_replicas_gc2(c); - bch_err_msg(ca, ret, "in replicas_gc2()"); + bch_err_msg(ca, ret, "bch2_replicas_gc2()"); if (ret) goto err; @@ -1835,7 +1856,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path) dev_idx = sb.sb->dev_idx; - ret = bch2_dev_in_fs(&c->disk_sb, &sb); + ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts); bch_err_msg(c, ret, "bringing %s online", path); if (ret) goto err; @@ -2023,7 +2044,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb); + ret = bch2_dev_in_fs(best, sb, &opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index cee80c47feea..c86a93a8d8fc 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -930,10 +930,10 @@ SHOW(bch2_dev) sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); if (attr == &sysfs_io_latency_stats_read) - bch2_time_stats_to_text(out, &ca->io_latency[READ]); + bch2_time_stats_to_text(out, &ca->io_latency[READ].stats); if (attr == &sysfs_io_latency_stats_write) - bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + bch2_time_stats_to_text(out, 
&ca->io_latency[WRITE].stats); sysfs_printf(congested, "%u%%", clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index 9220d7de10db..940db15d6a93 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -2,7 +2,6 @@ #ifndef NO_BCACHEFS_FS #include "bcachefs.h" -#include "printbuf.h" #include "thread_with_file.h" #include <linux/anon_inodes.h> @@ -10,6 +9,7 @@ #include <linux/kthread.h> #include <linux/pagemap.h> #include <linux/poll.h> +#include <linux/sched/sysctl.h> void bch2_thread_with_file_exit(struct thread_with_file *thr) { @@ -65,68 +65,82 @@ err: return ret; } -static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr) +/* stdio_redirect */ + +static bool stdio_redirect_has_input(struct stdio_redirect *stdio) { - return thr->stdio.output_buf.pos || - thr->output2.nr || - thr->thr.done; + return stdio->input.buf.nr || stdio->done; } -static ssize_t thread_with_stdio_read(struct file *file, char __user *buf, - size_t len, loff_t *ppos) +static bool stdio_redirect_has_output(struct stdio_redirect *stdio) { - struct thread_with_stdio *thr = - container_of(file->private_data, struct thread_with_stdio, thr); - size_t copied = 0, b; - int ret = 0; + return stdio->output.buf.nr || stdio->done; +} - if ((file->f_flags & O_NONBLOCK) && - !thread_with_stdio_has_output(thr)) - return -EAGAIN; +#define STDIO_REDIRECT_BUFSIZE 4096 - ret = wait_event_interruptible(thr->stdio.output_wait, - thread_with_stdio_has_output(thr)); - if (ret) - return ret; +static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio) +{ + return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} - if (thr->thr.done) - return 0; +static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio) +{ + return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done; +} - while (len) { - ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos); - if (ret) - break; +static void stdio_buf_init(struct stdio_buf *buf) +{ + spin_lock_init(&buf->lock); + init_waitqueue_head(&buf->wait); + darray_init(&buf->buf); +} - spin_lock_irq(&thr->stdio.output_lock); - b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos); +/* thread_with_stdio */ - memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b); - memmove(thr->stdio.output_buf.buf, - thr->stdio.output_buf.buf + b, - thr->stdio.output_buf.pos - b); +static void thread_with_stdio_done(struct thread_with_stdio *thr) +{ + thr->thr.done = true; + thr->stdio.done = true; + wake_up(&thr->stdio.input.wait); + wake_up(&thr->stdio.output.wait); +} - thr->output2.nr += b; - thr->stdio.output_buf.pos -= b; - spin_unlock_irq(&thr->stdio.output_lock); +static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf, + size_t len, loff_t *ppos) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + struct stdio_buf *buf = &thr->stdio.output; + size_t copied = 0, b; + int ret = 0; - b = min(len, thr->output2.nr); - if (!b) - break; + if (!(file->f_flags & O_NONBLOCK)) { + ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio)); + if (ret) + return ret; + } else if (!stdio_redirect_has_output(&thr->stdio)) + return -EAGAIN; - b -= copy_to_user(buf, thr->output2.data, b); - if (!b) { + while (len && buf->buf.nr) { + if (fault_in_writeable(ubuf, len) == len) { ret = -EFAULT; break; } - copied += b; - buf += b; - len -= 
b; - - memmove(thr->output2.data, - thr->output2.data + b, - thr->output2.nr - b); - thr->output2.nr -= b; + spin_lock_irq(&buf->lock); + b = min_t(size_t, len, buf->buf.nr); + + if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) { + ubuf += b; + len -= b; + copied += b; + buf->buf.nr -= b; + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + } + spin_unlock_irq(&buf->lock); } return copied ?: ret; @@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file) struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); + thread_with_stdio_done(thr); bch2_thread_with_file_exit(&thr->thr); - printbuf_exit(&thr->stdio.input_buf); - printbuf_exit(&thr->stdio.output_buf); - darray_exit(&thr->output2); - thr->exit(thr); + darray_exit(&thr->stdio.input.buf); + darray_exit(&thr->stdio.output.buf); + thr->ops->exit(thr); return 0; } -#define WRITE_BUFFER 4096 - -static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr) -{ - return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done; -} - static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf, size_t len, loff_t *ppos) { struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - struct printbuf *buf = &thr->stdio.input_buf; + struct stdio_buf *buf = &thr->stdio.input; size_t copied = 0; ssize_t ret = 0; @@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu break; } - spin_lock(&thr->stdio.input_lock); - if (buf->pos < WRITE_BUFFER) - bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos)); - b = min(len, printbuf_remaining_size(buf)); - - if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) { - ubuf += b; - len -= b; - copied += b; - buf->pos += b; + spin_lock(&buf->lock); + if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE) + darray_make_room_gfp(&buf->buf, + min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT); + b = min(len, darray_room(buf->buf)); + + if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) { + buf->buf.nr += b; + ubuf += b; + len -= b; + copied += b; } - spin_unlock(&thr->stdio.input_lock); + spin_unlock(&buf->lock); if (b) { - wake_up(&thr->stdio.input_wait); + wake_up(&buf->wait); } else { if ((file->f_flags & O_NONBLOCK)) { ret = -EAGAIN; break; } - ret = wait_event_interruptible(thr->stdio.input_wait, - thread_with_stdio_has_input_space(thr)); + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_input_space(&thr->stdio)); if (ret) break; } @@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru struct thread_with_stdio *thr = container_of(file->private_data, struct thread_with_stdio, thr); - poll_wait(file, &thr->stdio.output_wait, wait); - poll_wait(file, &thr->stdio.input_wait, wait); + poll_wait(file, &thr->stdio.output.wait, wait); + poll_wait(file, &thr->stdio.input.wait, wait); __poll_t mask = 0; - if (thread_with_stdio_has_output(thr)) + if (stdio_redirect_has_output(&thr->stdio)) mask |= EPOLLIN; - if (thread_with_stdio_has_input_space(thr)) + if (stdio_redirect_has_input_space(&thr->stdio)) mask |= EPOLLOUT; if (thr->thr.done) mask |= EPOLLHUP|EPOLLERR; return mask; } +static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + poll_wait(file, 
&thr->stdio.output.wait, wait); + + __poll_t mask = 0; + + if (stdio_redirect_has_output(&thr->stdio)) + mask |= EPOLLIN; + if (thr->thr.done) + mask |= EPOLLHUP|EPOLLERR; + return mask; +} + +static int thread_with_stdio_flush(struct file *file, fl_owner_t id) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + return thr->thr.ret; +} + +static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p) +{ + struct thread_with_stdio *thr = + container_of(file->private_data, struct thread_with_stdio, thr); + + if (thr->ops->unlocked_ioctl) + return thr->ops->unlocked_ioctl(thr, cmd, p); + return -ENOTTY; +} + static const struct file_operations thread_with_stdio_fops = { - .release = thread_with_stdio_release, + .llseek = no_llseek, .read = thread_with_stdio_read, .write = thread_with_stdio_write, .poll = thread_with_stdio_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, +}; + +static const struct file_operations thread_with_stdout_fops = { .llseek = no_llseek, + .read = thread_with_stdio_read, + .poll = thread_with_stdout_poll, + .flush = thread_with_stdio_flush, + .release = thread_with_stdio_release, + .unlocked_ioctl = thread_with_stdio_ioctl, }; +static int thread_with_stdio_fn(void *arg) +{ + struct thread_with_stdio *thr = arg; + + thr->thr.ret = thr->ops->fn(thr); + + thread_with_stdio_done(thr); + return 0; +} + int bch2_run_thread_with_stdio(struct thread_with_stdio *thr, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)) + const struct thread_with_stdio_ops *ops) { - thr->stdio.input_buf = PRINTBUF; - thr->stdio.input_buf.atomic++; - spin_lock_init(&thr->stdio.input_lock); - init_waitqueue_head(&thr->stdio.input_wait); + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; - thr->stdio.output_buf = PRINTBUF; - thr->stdio.output_buf.atomic++; - spin_lock_init(&thr->stdio.output_lock); - init_waitqueue_head(&thr->stdio.output_wait); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn); +} - darray_init(&thr->output2); - thr->exit = exit; +int bch2_run_thread_with_stdout(struct thread_with_stdio *thr, + const struct thread_with_stdio_ops *ops) +{ + stdio_buf_init(&thr->stdio.input); + stdio_buf_init(&thr->stdio.output); + thr->ops = ops; - return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn); + return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn); } +EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout); -int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); + struct stdio_buf *buf = &stdio->input; + + /* + * we're waiting on user input (or for the file descriptor to be + * closed), don't want a hung task warning: + */ + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); if (stdio->done) return -1; - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); + spin_lock(&buf->lock); + int 
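/*
 * Illustrative sketch (not from this patch): callers of the reworked
 * thread_with_stdio interface now supply a thread_with_stdio_ops table
 * instead of separate function pointers. A minimal consumer might look
 * roughly like this (struct and function names are invented):
 */
struct sketch_ctx {
	struct thread_with_stdio	thr;
	/* caller-private state ... */
};

static int sketch_fn(struct thread_with_stdio *thr)
{
	bch2_stdio_redirect_printf(&thr->stdio, false, "hello from kthread\n");
	return 0;
}

static void sketch_exit(struct thread_with_stdio *thr)
{
	kfree(container_of(thr, struct sketch_ctx, thr));
}

static const struct thread_with_stdio_ops sketch_ops = {
	.fn	= sketch_fn,
	.exit	= sketch_exit,
};
/* then bch2_run_thread_with_stdio(&ctx->thr, &sketch_ops) wires the kthread
 * up to an anonymous file descriptor, per the header comment below. */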
ret = min(len, buf->buf.nr); + buf->buf.nr -= ret; + memcpy(ubuf, buf->buf.data, ret); + memmove(buf->buf.data, + buf->buf.data + ret, + buf->buf.nr); + spin_unlock(&buf->lock); - wake_up(&stdio->input_wait); + wake_up(&buf->wait); return ret; } -int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len) +int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len) { - wait_event(stdio->input_wait, - stdio->input_buf.pos || stdio->done); - - if (stdio->done) - return -1; + struct stdio_buf *buf = &stdio->input; + size_t copied = 0; + ssize_t ret = 0; +again: + do { + wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio), + sysctl_hung_task_timeout_secs * HZ / 2); + } while (!stdio_redirect_has_input(stdio)); + + if (stdio->done) { + ret = -1; + goto out; + } - spin_lock(&stdio->input_lock); - int ret = min(len, stdio->input_buf.pos); - char *n = memchr(stdio->input_buf.buf, '\n', ret); + spin_lock(&buf->lock); + size_t b = min(len, buf->buf.nr); + char *n = memchr(buf->buf.data, '\n', b); if (n) - ret = min(ret, n + 1 - stdio->input_buf.buf); - stdio->input_buf.pos -= ret; - memcpy(buf, stdio->input_buf.buf, ret); - memmove(stdio->input_buf.buf, - stdio->input_buf.buf + ret, - stdio->input_buf.pos); - spin_unlock(&stdio->input_lock); - - wake_up(&stdio->input_wait); + b = min_t(size_t, b, n + 1 - buf->buf.data); + buf->buf.nr -= b; + memcpy(ubuf, buf->buf.data, b); + memmove(buf->buf.data, + buf->buf.data + b, + buf->buf.nr); + ubuf += b; + len -= b; + copied += b; + spin_unlock(&buf->lock); + + wake_up(&buf->wait); + + if (!n && len) + goto again; +out: + return copied ?: ret; +} + +__printf(3, 0) +static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args) +{ + ssize_t ret; + + do { + va_list args2; + size_t len; + + va_copy(args2, args); + len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2); + va_end(args2); + + if (len + 1 <= darray_room(*out)) { + out->nr += len; + return len; + } + + ret = darray_make_room_gfp(out, len + 1, gfp); + } while (ret == 0); + + return ret; +} + +ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, va_list args) +{ + struct stdio_buf *buf = &stdio->output; + unsigned long flags; + ssize_t ret; + +again: + spin_lock_irqsave(&buf->lock, flags); + ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); + spin_unlock_irqrestore(&buf->lock, flags); + + if (ret < 0) { + if (nonblocking) + return -EAGAIN; + + ret = wait_event_interruptible(buf->wait, + stdio_redirect_has_output_space(stdio)); + if (ret) + return ret; + goto again; + } + + wake_up(&buf->wait); + return ret; +} + +ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking, + const char *fmt, ...) +{ + va_list args; + ssize_t ret; + + va_start(args, fmt); + ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args); + va_end(args); + return ret; } diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h index 05879c5048c8..af54ea8f5b0f 100644 --- a/fs/bcachefs/thread_with_file.h +++ b/fs/bcachefs/thread_with_file.h @@ -4,6 +4,38 @@ #include "thread_with_file_types.h" +/* + * Thread with file: Run a kthread and connect it to a file descriptor, so that + * it can be interacted with via fd read/write methods and closing the file + * descriptor stops the kthread. + * + * We have two different APIs: + * + * thread_with_file, the low level version. 
+ * You get to define the full file_operations, including your release function, + * which means that you must call bch2_thread_with_file_exit() from your + * .release method + * + * thread_with_stdio, the higher level version + * This implements full piping of input and output, including .poll. + * + * Notes on behaviour: + * - kthread shutdown behaves like writing or reading from a pipe that has been + * closed + * - Input and output buffers are 4096 bytes, although buffers may in some + * situations slightly exceed that limit so as to avoid chopping off a + * message in the middle in nonblocking mode. + * - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations - + * should be fine but might change in future revisions. + * - Output buffer may grow past 4096 bytes to deal with messages that are + * bigger than 4096 bytes + * - Writing may be done blocking or nonblocking; in nonblocking mode, we only + * drop entire messages. + * + * To write, use stdio_redirect_printf() + * To read, use stdio_redirect_read() or stdio_redirect_readline() + */ + struct task_struct; struct thread_with_file { @@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *, const struct file_operations *, int (*fn)(void *)); +struct thread_with_stdio; + +struct thread_with_stdio_ops { + void (*exit)(struct thread_with_stdio *); + int (*fn)(struct thread_with_stdio *); + long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long); +}; + struct thread_with_stdio { struct thread_with_file thr; struct stdio_redirect stdio; - DARRAY(char) output2; - void (*exit)(struct thread_with_stdio *); + const struct thread_with_stdio_ops *ops; }; -static inline void thread_with_stdio_done(struct thread_with_stdio *thr) -{ - thr->thr.done = true; - thr->stdio.done = true; - wake_up(&thr->stdio.input_wait); - wake_up(&thr->stdio.output_wait); -} - int bch2_run_thread_with_stdio(struct thread_with_stdio *, - void (*exit)(struct thread_with_stdio *), - int (*fn)(void *)); + const struct thread_with_stdio_ops *); +int bch2_run_thread_with_stdout(struct thread_with_stdio *, + const struct thread_with_stdio_ops *); int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t); int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t); +__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list); +__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...); + #endif /* _BCACHEFS_THREAD_WITH_FILE_H */ diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h index 90b5e645e98c..e0daf4eec341 100644 --- a/fs/bcachefs/thread_with_file_types.h +++ b/fs/bcachefs/thread_with_file_types.h @@ -2,14 +2,21 @@ #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H +#include "darray.h" + +struct stdio_buf { + spinlock_t lock; + wait_queue_head_t wait; + darray_char buf; +}; + struct stdio_redirect { - spinlock_t output_lock; - wait_queue_head_t output_wait; - struct printbuf output_buf; + struct stdio_buf input; + struct stdio_buf output; spinlock_t input_lock; wait_queue_head_t input_wait; - struct printbuf input_buf; + darray_char input_buf; bool done; }; diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c new file mode 100644 index 000000000000..4508e9dcbee2 --- /dev/null +++ b/fs/bcachefs/time_stats.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/module.h> 
+#include <linux/percpu.h> +#include <linux/preempt.h> +#include <linux/time.h> +#include <linux/spinlock.h> + +#include "eytzinger.h" +#include "time_stats.h" + +static const struct time_unit time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "d", (u64) NSEC_PER_SEC * 3600 * 24}, + { "w", (u64) NSEC_PER_SEC * 3600 * 24 * 7}, + { "y", (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */ + { "eon", U64_MAX }, +}; + +const struct time_unit *bch2_pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +static void quantiles_update(struct quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + bool initted = stats->last_event != 0; + + if (time_after64(end, start)) { + struct quantiles *quantiles = time_stats_to_quantiles(stats); + + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, + duration, initted, TIME_STATS_MV_WEIGHT); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + stats->total_duration += duration; + + if (quantiles) + quantiles_update(quantiles, duration); + } + + if (stats->last_event && time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, + freq, initted, TIME_STATS_MV_WEIGHT); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + } + + stats->last_event = end; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + for (struct time_stat_buffer_entry *i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + time_stats_update_one(stats, i->start, i->end); + b->nr = 0; +} + +static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats, + struct time_stat_buffer *b) +{ + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + __bch2_time_stats_clear_buffer(stats, b); + spin_unlock_irqrestore(&stats->lock, flags); +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct time_stat_buffer *b; + + 
preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h new file mode 100644 index 000000000000..5df61403744b --- /dev/null +++ b/fs/bcachefs/time_stats.h @@ -0,0 +1,159 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * bch2_time_stats - collect statistics on events that have a duration, with nicely + * formatted textual output on demand + * + * - percpu buffering of event collection: cheap enough to shotgun + * everywhere without worrying about overhead + * + * tracks: + * - number of events + * - maximum event duration ever seen + * - sum of all event durations + * - average event duration, standard and weighted + * - standard deviation of event durations, standard and weighted + * and analagous statistics for the frequency of events + * + * We provide both mean and weighted mean (exponentially weighted), and standard + * deviation and weighted standard deviation, to give an efficient-to-compute + * view of current behaviour versus. average behaviour - "did this event source + * just become wonky, or is this typical?". + * + * Particularly useful for tracking down latency issues. + */ +#ifndef _BCACHEFS_TIME_STATS_H +#define _BCACHEFS_TIME_STATS_H + +#include <linux/sched/clock.h> +#include <linux/spinlock_types.h> +#include <linux/string.h> + +#include "mean_and_variance.h" + +struct time_unit { + const char *name; + u64 nsecs; +}; + +/* + * given a nanosecond value, pick the preferred time units for printing: + */ +const struct time_unit *bch2_pick_time_units(u64 ns); + +/* + * quantiles - do not use: + * + * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't + * use in new code. + */ + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct quantiles { + struct quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct time_stat_buffer { + unsigned nr; + struct time_stat_buffer_entry { + u64 start; + u64 end; + } entries[31]; +}; + +struct bch2_time_stats { + spinlock_t lock; + bool have_quantiles; + /* all fields are in nanoseconds */ + u64 min_duration; + u64 max_duration; + u64 total_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + u64 last_event_start; + + struct mean_and_variance duration_stats; + struct mean_and_variance freq_stats; + +/* default weight for weighted mean and variance calculations */ +#define TIME_STATS_MV_WEIGHT 8 + + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance_weighted freq_stats_weighted; + struct time_stat_buffer __percpu *buffer; +}; + +struct bch2_time_stats_quantiles { + struct bch2_time_stats stats; + struct quantiles quantiles; +}; + +static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats) +{ + return stats->have_quantiles + ? 
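/*
 * Illustrative sketch (not from this patch): quantiles now live in a
 * separate bch2_time_stats_quantiles wrapper, and time_stats_to_quantiles()
 * gets back to it from the embedded stats via the have_quantiles flag plus
 * container_of(). The generic pattern, with invented names:
 */
struct inner_stats_sketch { bool have_extra; };

struct outer_stats_sketch {
	struct inner_stats_sketch	base;	/* embedded, flag set when wrapped */
	int				extra;
};

static int *extra_from_stats_sketch(struct inner_stats_sketch *s)
{
	return s->have_extra
		? &container_of(s, struct outer_stats_sketch, base)->extra
		: NULL;
}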
&container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles + : NULL; +} + +void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *); +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); + +/** + * time_stats_update - collect a new event being tracked + * + * @stats - bch2_time_stats to update + * @start - start time of event, recorded with local_clock() + * + * The end duration of the event will be the current time + */ +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +/** + * track_event_change - track state change events + * + * @stats - bch2_time_stats to update + * @v - new state, true or false + * + * Use this when tracking time stats for state changes, i.e. resource X becoming + * blocked/unblocked. + */ +static inline bool track_event_change(struct bch2_time_stats *stats, bool v) +{ + if (v != !!stats->last_event_start) { + if (!v) { + bch2_time_stats_update(stats, stats->last_event_start); + stats->last_event_start = 0; + } else { + stats->last_event_start = local_clock() ?: 1; + return true; + } + } + + return false; +} + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_exit(&statq->stats); +} +static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq) +{ + bch2_time_stats_init(&statq->stats); + statq->stats.have_quantiles = true; + memset(&statq->quantiles, 0, sizeof(statq->quantiles)); +} + +#endif /* _BCACHEFS_TIME_STATS_H */ diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 293b90d704fb..6aa81d1e6d36 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +); + #endif /* _TRACE_BCACHEFS_H */ /* This part must be outside protection */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index 3a32faa86b5c..216fadf16928 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec) } #endif -static const struct time_unit { - const char *name; - u64 nsecs; -} time_units[] = { - { "ns", 1 }, - { "us", NSEC_PER_USEC }, - { "ms", NSEC_PER_MSEC }, - { "s", NSEC_PER_SEC }, - { "m", (u64) NSEC_PER_SEC * 60}, - { "h", (u64) NSEC_PER_SEC * 3600}, - { "eon", U64_MAX }, -}; - -static const struct time_unit *pick_time_units(u64 ns) -{ - const struct time_unit *u; - - for (u = time_units; - u + 1 < time_units + ARRAY_SIZE(time_units) && - ns >= u[1].nsecs << 1; - u++) - ; - - return u; -} - void bch2_pr_time_units(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); } -/* 
time stats: */ - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) -{ - unsigned i = 0; - - while (i < ARRAY_SIZE(q->entries)) { - struct bch2_quantile_entry *e = q->entries + i; - - if (unlikely(!e->step)) { - e->m = v; - e->step = max_t(unsigned, v / 2, 1024); - } else if (e->m > v) { - e->m = e->m >= e->step - ? e->m - e->step - : 0; - } else if (e->m < v) { - e->m = e->m + e->step > e->m - ? e->m + e->step - : U32_MAX; - } - - if ((e->m > v ? e->m - v : v - e->m) < e->step) - e->step = max_t(unsigned, e->step / 2, 1); - - if (v >= e->m) - break; - - i = eytzinger0_child(i, v > e->m); - } -} - -static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, - u64 start, u64 end) -{ - u64 duration, freq; - - if (time_after64(end, start)) { - duration = end - start; - mean_and_variance_update(&stats->duration_stats, duration); - mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); - stats->max_duration = max(stats->max_duration, duration); - stats->min_duration = min(stats->min_duration, duration); - stats->total_duration += duration; - bch2_quantiles_update(&stats->quantiles, duration); - } - - if (stats->last_event && time_after64(end, stats->last_event)) { - freq = end - stats->last_event; - mean_and_variance_update(&stats->freq_stats, freq); - mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); - stats->max_freq = max(stats->max_freq, freq); - stats->min_freq = min(stats->min_freq, freq); - } - - stats->last_event = end; -} - -static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - for (struct bch2_time_stat_buffer_entry *i = b->entries; - i < b->entries + ARRAY_SIZE(b->entries); - i++) - bch2_time_stats_update_one(stats, i->start, i->end); - b->nr = 0; -} - -static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, - struct bch2_time_stat_buffer *b) -{ - unsigned long flags; - - spin_lock_irqsave(&stats->lock, flags); - __bch2_time_stats_clear_buffer(stats, b); - spin_unlock_irqrestore(&stats->lock, flags); -} - -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) -{ - unsigned long flags; - - WARN_ONCE(!stats->duration_stats_weighted.weight || - !stats->freq_stats_weighted.weight, - "uninitialized time_stats"); - - if (!stats->buffer) { - spin_lock_irqsave(&stats->lock, flags); - bch2_time_stats_update_one(stats, start, end); - - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && - stats->duration_stats.n > 1024) - stats->buffer = - alloc_percpu_gfp(struct bch2_time_stat_buffer, - GFP_ATOMIC); - spin_unlock_irqrestore(&stats->lock, flags); - } else { - struct bch2_time_stat_buffer *b; - - preempt_disable(); - b = this_cpu_ptr(stats->buffer); - - BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); - b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { - .start = start, - .end = end - }; - - if (unlikely(b->nr == ARRAY_SIZE(b->entries))) - bch2_time_stats_clear_buffer(stats, b); - preempt_enable(); - } -} - static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) { - const struct time_unit *u = pick_time_units(ns); + const struct time_unit *u = bch2_pick_time_units(ns); prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); prt_tab_rjust(out); @@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) { - const 
struct time_unit *u; + struct quantiles *quantiles = time_stats_to_quantiles(stats); s64 f_mean = 0, d_mean = 0; - u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; - int i; + u64 f_stddev = 0, d_stddev = 0; if (stats->buffer) { int cpu; @@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, d_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, d_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); @@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats prt_tab(out); bch2_pr_time_units_aligned(out, f_mean); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); prt_newline(out); prt_printf(out, "stddev:"); prt_tab(out); bch2_pr_time_units_aligned(out, f_stddev); prt_tab(out); - bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT)); printbuf_indent_sub(out, 2); prt_newline(out); printbuf_tabstops_reset(out); - i = eytzinger0_first(NR_QUANTILES); - u = pick_time_units(stats->quantiles.entries[i].m); - - prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; - - q = max(stats->quantiles.entries[i].m, last_q); - prt_printf(out, "%llu ", - div_u64(q, u->nsecs)); - if (is_last) - prt_newline(out); - last_q = q; + if (quantiles) { + int i = eytzinger0_first(NR_QUANTILES); + const struct time_unit *u = + bch2_pick_time_units(quantiles->entries[i].m); + u64 last_q = 0; + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + u64 q = max(quantiles->entries[i].m, last_q); + prt_printf(out, "%llu ", div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } } } -#else -void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {} -#endif - -void bch2_time_stats_exit(struct bch2_time_stats *stats) -{ - free_percpu(stats->buffer); -} - -void bch2_time_stats_init(struct bch2_time_stats *stats) -{ - memset(stats, 0, sizeof(*stats)); - stats->duration_stats_weighted.weight = 8; - stats->freq_stats_weighted.weight = 8; - stats->min_duration = U64_MAX; - stats->min_freq = U64_MAX; - spin_lock_init(&stats->lock); -} /* ratelimit: */ @@ -1007,28 +850,6 @@ void sort_cmp_size(void *base, size_t num, size_t size, } } -static void mempool_free_vp(void *element, void *pool_data) -{ - size_t size = (size_t) pool_data; - - vpfree(element, size); -} - -static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - - return vpmalloc(size, 
gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return size < PAGE_SIZE - ? mempool_init_kmalloc_pool(pool, min_nr, size) - : mempool_init(pool, min_nr, mempool_alloc_vp, - mempool_free_vp, (void *) size); -} - #if 0 void eytzinger1_test(void) { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index b414736d59a5..7ffbddb80400 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -21,6 +21,7 @@ #include "mean_and_variance.h" #include "darray.h" +#include "time_stats.h" struct closure; @@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void vpfree(void *p, size_t size) -{ - if (is_vmalloc_addr(p)) - vfree(p); - else - free_pages((unsigned long) p, get_order(size)); -} - -static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -{ - return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, - get_order(size)) ?: - __vmalloc(size, gfp_mask); -} - -static inline void kvpfree(void *p, size_t size) -{ - if (size < PAGE_SIZE) - kfree(p); - else - vpfree(p, size); -} - -static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -{ - return size < PAGE_SIZE - ? kmalloc(size, gfp_mask) - : vpmalloc(size, gfp_mask); -} - -int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); - #define HEAP(type) \ struct { \ size_t size, used; \ @@ -97,13 +66,13 @@ struct { \ ({ \ (heap)->used = 0; \ (heap)->size = (_size); \ - (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\ (gfp)); \ }) #define free_heap(heap) \ do { \ - kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + kvfree((heap)->data); \ (heap)->data = NULL; \ } while (0) @@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev) #endif } -#define NR_QUANTILES 15 -#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) - -struct bch2_quantiles { - struct bch2_quantile_entry { - u64 m; - u64 step; - } entries[NR_QUANTILES]; -}; - -struct bch2_time_stat_buffer { - unsigned nr; - struct bch2_time_stat_buffer_entry { - u64 start; - u64 end; - } entries[32]; -}; - -struct bch2_time_stats { - spinlock_t lock; - /* all fields are in nanoseconds */ - u64 min_duration; - u64 max_duration; - u64 total_duration; - u64 max_freq; - u64 min_freq; - u64 last_event; - struct bch2_quantiles quantiles; - - struct mean_and_variance duration_stats; - struct mean_and_variance_weighted duration_stats_weighted; - struct mean_and_variance freq_stats; - struct mean_and_variance_weighted freq_stats_weighted; - struct bch2_time_stat_buffer __percpu *buffer; -}; - -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); - -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) -{ - __bch2_time_stats_update(stats, start, local_clock()); -} - -static inline bool track_event_change(struct bch2_time_stats *stats, - u64 *start, bool v) -{ - if (v != !!*start) { - if (!v) { - bch2_time_stats_update(stats, *start); - *start = 0; - } else { - *start = local_clock() ?: 1; - return true; - } - } - - return false; -} -#else -static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} -static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {} -static inline bool track_event_change(struct 
bch2_time_stats *stats, - u64 *start, bool v) -{ - bool ret = v && !*start; - *start = v; - return ret; -} -#endif - void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); -void bch2_time_stats_exit(struct bch2_time_stats *); -void bch2_time_stats_init(struct bch2_time_stats *); - #define ewma_add(ewma, val, weight) \ ({ \ typeof(ewma) _ewma = (ewma); \ @@ -788,8 +681,12 @@ static inline void __move_gap(void *array, size_t element_size, } /* Move the gap in a gap buffer: */ -#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ - __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) +#define move_gap(_d, _new_gap) \ +do { \ + __move_gap((_d)->data, sizeof((_d)->data[0]), \ + (_d)->nr, (_d)->size, (_d)->gap, _new_gap); \ + (_d)->gap = _new_gap; \ +} while (0) #define bubble_sort(_base, _nr, _cmp) \ do { \ @@ -876,4 +773,25 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) void bch2_darray_str_exit(darray_str *); int bch2_split_devs(const char *, darray_str *); +#ifdef __KERNEL__ + +__must_check +static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n) +{ + return copy_to_user(to, from, n) ? -EFAULT : 0; +} + +__must_check +static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n) +{ + return copy_from_user(to, from, n) ? -EFAULT : 0; +} + +#endif + +static inline void __set_bit_le64(size_t bit, __le64 *addr) +{ + addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64)); +} + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 9c0d2316031b..754f17bba68e 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, kfree(buf); if (ret < 0) - return ret; + goto err_class_exit; ret = bch2_opt_check_may_set(c, opt_id, v); if (ret < 0) - return ret; + goto err_class_exit; s.v = v + 1; s.defined = true; @@ -595,6 +595,7 @@ err: (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression)))) bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum); +err_class_exit: return bch2_err_class(ret); } diff --git a/fs/inode.c b/fs/inode.c index d290f007b3d1..3a41f83a4ba5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2033,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -static int __file_remove_privs(struct file *file, unsigned int flags) +int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2058,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } +EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2070,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags) */ int file_remove_privs(struct file *file) { - return __file_remove_privs(file, 0); + return file_remove_privs_flags(file, 0); } EXPORT_SYMBOL(file_remove_privs); @@ -2163,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags) * Clear the security bits if the process is not being run by root. * This keeps people from modifying setuid and setgid binaries. 
*/ - ret = __file_remove_privs(file, flags); + ret = file_remove_privs_flags(file, flags); if (ret) return ret; diff --git a/include/linux/fs.h b/include/linux/fs.h index d5d5a4ee24f0..00fc429b0af0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3074,6 +3074,7 @@ extern struct inode *new_inode_pseudo(struct super_block *sb); extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); +extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 847413164738..f3512fddf3d7 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -5,7 +5,7 @@ * DOC: Generic radix trees/sparse arrays * * Very simple and minimalistic, supporting arbitrary size entries up to - * PAGE_SIZE. + * GENRADIX_NODE_SIZE. * * A genradix is defined with the type it will store, like so: * @@ -45,12 +45,15 @@ struct genradix_root; +#define GENRADIX_NODE_SHIFT 9 +#define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) + struct __genradix { struct genradix_root *root; }; /* - * NOTE: currently, sizeof(_type) must not be larger than PAGE_SIZE: + * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: */ #define __GENRADIX_INITIALIZER \ @@ -101,14 +104,14 @@ void __genradix_free(struct __genradix *); static inline size_t __idx_to_offset(size_t idx, size_t obj_size) { if (__builtin_constant_p(obj_size)) - BUILD_BUG_ON(obj_size > PAGE_SIZE); + BUILD_BUG_ON(obj_size > GENRADIX_NODE_SIZE); else - BUG_ON(obj_size > PAGE_SIZE); + BUG_ON(obj_size > GENRADIX_NODE_SIZE); if (!is_power_of_2(obj_size)) { - size_t objs_per_page = PAGE_SIZE / obj_size; + size_t objs_per_page = GENRADIX_NODE_SIZE / obj_size; - return (idx / objs_per_page) * PAGE_SIZE + + return (idx / objs_per_page) * GENRADIX_NODE_SIZE + (idx % objs_per_page) * obj_size; } else { return idx * obj_size; @@ -118,9 +121,9 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) #define __genradix_objs_per_page(_radix) \ - (PAGE_SIZE / sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE / sizeof((_radix)->type[0])) #define __genradix_page_remainder(_radix) \ - (PAGE_SIZE % sizeof((_radix)->type[0])) + (GENRADIX_NODE_SIZE % sizeof((_radix)->type[0])) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -217,8 +220,8 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, iter->offset += obj_size; if (!is_power_of_2(obj_size) && - (iter->offset & (PAGE_SIZE - 1)) + obj_size > PAGE_SIZE) - iter->offset = round_up(iter->offset, PAGE_SIZE); + (iter->offset & (GENRADIX_NODE_SIZE - 1)) + obj_size > GENRADIX_NODE_SIZE) + iter->offset = round_up(iter->offset, GENRADIX_NODE_SIZE); iter->pos++; } @@ -235,8 +238,8 @@ static inline void __genradix_iter_rewind(struct genradix_iter *iter, return; } - if ((iter->offset & (PAGE_SIZE - 1)) == 0) - iter->offset -= PAGE_SIZE % obj_size; + if ((iter->offset & (GENRADIX_NODE_SIZE - 1)) == 0) + iter->offset -= GENRADIX_NODE_SIZE % obj_size; iter->offset -= obj_size; iter->pos--; @@ -263,7 +266,7 @@ static inline void 
__genradix_iter_rewind(struct genradix_iter *iter, genradix_for_each_from(_radix, _iter, _p, 0) #define genradix_last_pos(_radix) \ - (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + (SIZE_MAX / GENRADIX_NODE_SIZE * __genradix_objs_per_page(_radix) - 1) /** * genradix_for_each_reverse - iterate over entry in a genradix, reverse order diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 7be1e32e6d42..16c5cc807ff6 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -95,6 +95,19 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) (void *) size); } +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data); +void mempool_kvfree(void *element, void *pool_data); + +static inline int mempool_init_kvmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return mempool_init(pool, min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + +static inline mempool_t *mempool_create_kvmalloc_pool(int min_nr, size_t size) +{ + return mempool_create(min_nr, mempool_kvmalloc, mempool_kvfree, (void *) size); +} + /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data diff --git a/include/linux/sched.h b/include/linux/sched.h index 94f0e618865b..3c2abbc587b4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1639,8 +1639,8 @@ extern struct pid *cad_pid; * I am cleaning dirty pages from some other bdi. */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ -#define PF__HOLE__00800000 0x00800000 -#define PF__HOLE__01000000 0x01000000 +#define PF_MEMALLOC_NORECLAIM 0x00800000 /* All allocation requests will clear __GFP_DIRECT_RECLAIM */ +#define PF_MEMALLOC_NOWARN 0x01000000 /* All allocation requests will inherit __GFP_NOWARN */ #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 7a4066d22883..b6543f9d78d6 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -236,16 +236,25 @@ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); - if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS | PF_MEMALLOC_PIN))) { + if (unlikely(pflags & (PF_MEMALLOC_NOIO | + PF_MEMALLOC_NOFS | + PF_MEMALLOC_NORECLAIM | + PF_MEMALLOC_NOWARN | + PF_MEMALLOC_PIN))) { /* - * NOIO implies both NOIO and NOFS and it is a weaker context - * so always make sure it makes precedence + * Stronger flags before weaker flags: + * NORECLAIM implies NOIO, which in turn implies NOFS */ - if (pflags & PF_MEMALLOC_NOIO) + if (pflags & PF_MEMALLOC_NORECLAIM) + flags &= ~__GFP_DIRECT_RECLAIM; + else if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; + if (pflags & PF_MEMALLOC_NOWARN) + flags |= __GFP_NOWARN; + if (pflags & PF_MEMALLOC_PIN) flags &= ~__GFP_MOVABLE; } @@ -307,6 +316,24 @@ static inline void might_alloc(gfp_t gfp_mask) } /** + * memalloc_flags_save - Add a PF_* flag to current->flags, save old value + * + * This allows PF_* flags to be conveniently added, irrespective of current + * value, and then the old version restored with memalloc_flags_restore(). 
+ */ +static inline unsigned memalloc_flags_save(unsigned flags) +{ + unsigned oldflags = ~current->flags & flags; + current->flags |= flags; + return oldflags; +} + +static inline void memalloc_flags_restore(unsigned flags) +{ + current->flags &= ~flags; +} + +/** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. @@ -320,9 +347,7 @@ static inline void might_alloc(gfp_t gfp_mask) */ static inline unsigned int memalloc_noio_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOIO; - current->flags |= PF_MEMALLOC_NOIO; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOIO); } /** @@ -335,7 +360,7 @@ static inline unsigned int memalloc_noio_save(void) */ static inline void memalloc_noio_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; + memalloc_flags_restore(flags); } /** @@ -352,9 +377,7 @@ static inline void memalloc_noio_restore(unsigned int flags) */ static inline unsigned int memalloc_nofs_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_NOFS; - current->flags |= PF_MEMALLOC_NOFS; - return flags; + return memalloc_flags_save(PF_MEMALLOC_NOFS); } /** @@ -367,7 +390,7 @@ static inline unsigned int memalloc_nofs_save(void) */ static inline void memalloc_nofs_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; + memalloc_flags_restore(flags); } /** @@ -395,9 +418,7 @@ static inline void memalloc_nofs_restore(unsigned int flags) */ static inline unsigned int memalloc_noreclaim_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC; - current->flags |= PF_MEMALLOC; - return flags; + return memalloc_flags_save(PF_MEMALLOC); } /** @@ -410,7 +431,7 @@ static inline unsigned int memalloc_noreclaim_save(void) */ static inline void memalloc_noreclaim_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC) | flags; + memalloc_flags_restore(flags); } /** @@ -425,10 +446,7 @@ static inline void memalloc_noreclaim_restore(unsigned int flags) */ static inline unsigned int memalloc_pin_save(void) { - unsigned int flags = current->flags & PF_MEMALLOC_PIN; - - current->flags |= PF_MEMALLOC_PIN; - return flags; + return memalloc_flags_save(PF_MEMALLOC_PIN); } /** @@ -441,7 +459,7 @@ static inline unsigned int memalloc_pin_save(void) */ static inline void memalloc_pin_restore(unsigned int flags) { - current->flags = (current->flags & ~PF_MEMALLOC_PIN) | flags; + memalloc_flags_restore(flags); } #ifdef CONFIG_MEMCG diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 9a24574988d2..b2fc2727d654 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ static int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * Zero means infinite timeout - no checking done: */ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; +EXPORT_SYMBOL_GPL(sysctl_hung_task_timeout_secs); /* * Zero (default value) means use sysctl_hung_task_timeout_secs: diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index 41f1bcdc4488..aaefb9b678c8 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -5,7 +5,7 @@ #include <linux/gfp.h> #include <linux/kmemleak.h> -#define GENRADIX_ARY (PAGE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) #define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) struct genradix_node { @@ -14,13 +14,13 @@ 
struct genradix_node { struct genradix_node *children[GENRADIX_ARY]; /* Leaf: */ - u8 data[PAGE_SIZE]; + u8 data[GENRADIX_NODE_SIZE]; }; }; static inline int genradix_depth_shift(unsigned depth) { - return PAGE_SHIFT + GENRADIX_ARY_SHIFT * depth; + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; } /* @@ -33,7 +33,7 @@ static inline size_t genradix_depth_size(unsigned depth) /* depth that's needed for a genradix that can address up to ULONG_MAX: */ #define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - PAGE_SHIFT, GENRADIX_ARY_SHIFT) + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) #define GENRADIX_DEPTH_MASK \ ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) @@ -79,23 +79,12 @@ EXPORT_SYMBOL(__genradix_ptr); static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) { - struct genradix_node *node; - - node = (struct genradix_node *)__get_free_page(gfp_mask|__GFP_ZERO); - - /* - * We're using pages (not slab allocations) directly for kernel data - * structures, so we need to explicitly inform kmemleak of them in order - * to avoid false positive memory leak reports. - */ - kmemleak_alloc(node, PAGE_SIZE, 1, gfp_mask); - return node; + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); } static inline void genradix_free_node(struct genradix_node *node) { - kmemleak_free(node); - free_page((unsigned long)node); + kfree(node); } /* @@ -200,7 +189,7 @@ restart: i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; @@ -209,7 +198,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); @@ -235,7 +224,7 @@ restart: if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; @@ -251,7 +240,7 @@ restart: size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); - iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; @@ -267,7 +256,7 @@ restart: n = n->children[i]; } - return &n->data[iter->offset & (PAGE_SIZE - 1)]; + return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); @@ -289,7 +278,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, { size_t offset; - for (offset = 0; offset < size; offset += PAGE_SIZE) + for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) return -ENOMEM; diff --git a/mm/mempool.c b/mm/mempool.c index dbbf0e9fb424..076c736f5f1f 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -590,6 +590,19 @@ void mempool_kfree(void *element, void *pool_data) } EXPORT_SYMBOL(mempool_kfree); +void *mempool_kvmalloc(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t)pool_data; + return kvmalloc(size, gfp_mask); +} +EXPORT_SYMBOL(mempool_kvmalloc); + +void mempool_kvfree(void *element, void *pool_data) +{ + kvfree(element); +} +EXPORT_SYMBOL(mempool_kvfree); + /* * A simple mempool-backed page allocator that allocates pages * of the 
order specified by pool_data.
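
For readers following the mempool changes above, here is a minimal usage sketch of the new kvmalloc-backed mempool helpers (mempool_init_kvmalloc_pool() and the mempool_kvmalloc()/mempool_kvfree() callbacks). The pool name, element size and reserve count are illustrative assumptions, not taken from the patch; the point is that elements too large for a plain kmalloc pool can now be pooled, with kvmalloc()/kvfree() choosing slab or vmalloc memory per element.

#include <linux/gfp.h>
#include <linux/mempool.h>

#define EXAMPLE_ELEM_SIZE	(64 * 1024)	/* hypothetical element size */

static mempool_t example_pool;

static int example_pool_init(void)
{
	/* Keep two pre-allocated elements in reserve for forward progress. */
	return mempool_init_kvmalloc_pool(&example_pool, 2, EXAMPLE_ELEM_SIZE);
}

static void example_pool_use(void)
{
	void *buf = mempool_alloc(&example_pool, GFP_KERNEL);

	if (!buf)
		return;
	/* ... fill and submit buf ... */
	mempool_free(buf, &example_pool);
}

static void example_pool_exit(void)
{
	mempool_exit(&example_pool);
}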
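
Similarly, a short sketch of how the new memalloc_flags_{save,restore} helpers and the PF_MEMALLOC_NORECLAIM / PF_MEMALLOC_NOWARN task flags from the sched/mm hunks earlier in this diff might be used. The wrapper function is hypothetical; the behaviour noted in the comments follows the current_gfp_context() change above.

#include <linux/sched/mm.h>
#include <linux/slab.h>

static void *example_nowait_alloc(size_t size)
{
	/*
	 * Enter a scope in which this task's allocations avoid direct
	 * reclaim and suppress allocation-failure warnings.
	 */
	unsigned flags = memalloc_flags_save(PF_MEMALLOC_NORECLAIM |
					     PF_MEMALLOC_NOWARN);
	void *p;

	/*
	 * In paths that consult current_gfp_context(), __GFP_DIRECT_RECLAIM
	 * is now cleared and __GFP_NOWARN added for this task's allocations.
	 */
	p = kmalloc(size, GFP_KERNEL);

	/*
	 * restore() clears only the bits that save() actually set, so any
	 * of these flags already held before the save are preserved.
	 */
	memalloc_flags_restore(flags);
	return p;
}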